[Beignet] [PATCH 2/2] Add vload_half and vstore_half build in.

2013-11-22 Thread Yang Rong

Signed-off-by: Yang Rong rong.r.y...@intel.com
---
 backend/src/ocl_stdlib.tmpl.h | 145 +-
 1 file changed, 143 insertions(+), 2 deletions(-)

diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
index 62f5f78..e5a6da5 100644
--- a/backend/src/ocl_stdlib.tmpl.h
+++ b/backend/src/ocl_stdlib.tmpl.h
@@ -1977,7 +1977,6 @@ INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
   return *(SPACE TYPE##3 *) (p + 3 * offset); \
 }
 
-
 #define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \
   DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \
   DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
@@ -2011,7 +2010,149 @@ DECL_UNTYPED_RW_ALL(double)
 
 #undef DECL_UNTYPED_RW_ALL
 #undef DECL_UNTYPED_RW_ALL_SPACE
+#undef DECL_UNTYPED_RD_ALL_SPACE
 #undef DECL_UNTYPED_RW_SPACE_N
+#undef DECL_UNTYPED_RD_SPACE_N
+#undef DECL_UNTYPED_V3_SPACE
+#undef DECL_UNTYPED_RDV3_SPACE
+
+PURE CONST float __gen_ocl_f16to32(short h);
+PURE CONST short __gen_ocl_f32to16(float f);
+
+INLINE_OVERLOADABLE short f32to16_rtp(float f) {
+  short s = __gen_ocl_f32to16(f);
+  float con = __gen_ocl_f16to32(s);
+  //if(isinf(con)) return s;
+  if (f > con)
+return s - signbit(f) * 2 + 1;
+  else
+return s;
+}
+
+INLINE_OVERLOADABLE short f32to16_rtn(float f) {
+  short s = __gen_ocl_f32to16(f);
+  float con = __gen_ocl_f16to32(s);
+  //if(isinf(con)) return s;
+  if (con > f)
+return s + signbit(f) * 2 - 1;
+  else
+return s;
+}
+
+INLINE_OVERLOADABLE short f32to16_rtz(float f) {
+  short s = __gen_ocl_f32to16(f);
+  float con = __gen_ocl_f16to32(s);
+  //if(isinf(con)) return s;
+  if (((con > f) && !signbit(f)) ||
+  ((con < f) && signbit(f)))
+return s - 1;
+  else
+return s;
+}
+
+#define DECL_HALF_LD_SPACE(SPACE) \
+INLINE_OVERLOADABLE float vload_half(size_t offset, const SPACE half *p) { \
+  return __gen_ocl_f16to32(*(SPACE short *)(p + offset)); \
+} \
+INLINE_OVERLOADABLE float2 vload_half2(size_t offset, const SPACE half *p) { \
+  return (float2)(vload_half(offset*2, p), \
+  vload_half(offset*2 + 1, p)); \
+} \
+INLINE_OVERLOADABLE float3 vload_half3(size_t offset, const SPACE half *p) { \
+  return (float3)(vload_half(offset*3, p), \
+  vload_half(offset*3 + 1, p), \
+  vload_half(offset*3 + 2, p)); \
+} \
+INLINE_OVERLOADABLE float3 vloada_half3(size_t offset, const SPACE half *p) { \
+  return (float3)(vload_half(offset*4, p), \
+  vload_half(offset*4 + 1, p), \
+  vload_half(offset*4 + 2, p)); \
+} \
+INLINE_OVERLOADABLE float4 vload_half4(size_t offset, const SPACE half *p) { \
+  return (float4)(vload_half2(offset*2, p), \
+  vload_half2(offset*2 + 1, p)); \
+} \
+INLINE_OVERLOADABLE float8 vload_half8(size_t offset, const SPACE half *p) { \
+  return (float8)(vload_half4(offset*2, p), \
+  vload_half4(offset*2 + 1, p)); \
+} \
+INLINE_OVERLOADABLE float16 vload_half16(size_t offset, const SPACE half *p) { \
+  return (float16)(vload_half8(offset*2, p), \
+   vload_half8(offset*2 + 1, p)); \
+}
+
+#define DECL_HALF_ST_SPACE_ROUND(SPACE, ROUND, FUNC) \
+INLINE_OVERLOADABLE void vstore_half##ROUND(float data, size_t offset, SPACE half *p) { \
+  *(SPACE short *)(p + offset) = FUNC(data); \
+} \
+INLINE_OVERLOADABLE void vstorea_half##ROUND(float data, size_t offset, SPACE half *p) { \
+  vstore_half##ROUND(data, offset, p); \
+} \
+INLINE_OVERLOADABLE void vstore_half2##ROUND(float2 data, size_t offset, SPACE half *p) { \
+  vstore_half##ROUND(data.lo, offset*2, p); \
+  vstore_half##ROUND(data.hi, offset*2 + 1, p); \
+} \
+INLINE_OVERLOADABLE void vstorea_half2##ROUND(float2 data, size_t offset, SPACE half *p) { \
+  vstore_half2##ROUND(data, offset, p); \
+} \
+INLINE_OVERLOADABLE void vstore_half3##ROUND(float3 data, size_t offset, SPACE half *p) { \
+  vstore_half##ROUND(data.s0, offset*3, p); \
+  vstore_half##ROUND(data.s1, offset*3 + 1, p); \
+  vstore_half##ROUND(data.s2, offset*3 + 2, p); \
+} \
+INLINE_OVERLOADABLE void vstorea_half3##ROUND(float3 data, size_t offset, SPACE half *p) { \
+  vstore_half##ROUND(data.s0, offset*4, p); \
+  vstore_half##ROUND(data.s1, offset*4 + 1, p); \
+  vstore_half##ROUND(data.s2, offset*4 + 2, p); \
+} \
+INLINE_OVERLOADABLE void vstore_half4##ROUND(float4 data, size_t offset, SPACE half *p) { \
+  vstore_half2##ROUND(data.lo, offset*2, p); \
+  vstore_half2##ROUND(data.hi, offset*2 + 1, p); \
+} \
+INLINE_OVERLOADABLE void vstorea_half4##ROUND(float4 data, size_t offset, SPACE half *p) { \
+  vstore_half4##ROUND(data, offset, p); \
+} \
+INLINE_OVERLOADABLE void vstore_half8##ROUND(float8 data, size_t offset, SPACE half *p) { \
+  vstore_half4##ROUND(data.lo, offset*2, p); \
+  vstore_half4##ROUND(data.hi, offset*2 + 1, p); \
+} \
+INLINE_OVERLOADABLE void vstorea_half8##ROUND(float8 data, size_t offset, SPACE half *p) { \
+  vstore_half8##ROUND(data, offset, p); \
+} \

[Beignet] Beignet vs LuxRays, again

2013-11-22 Thread Simon Richter
Hi,

I've just rendered the first picture using LuxRays and Beignet.
Observations:

 - During rendering, colored dots appeared all over the display at
random locations
 - In the final result, there are black pixels
 - Fonts in some applications appear garbled
 - Rendering was slow (AMD's CPU-based OpenCL implementation was
significantly faster on the same machine, an i7-3612QM)

I think there may be something wrong with the buffer management here, if
the on-screen buffer and off-screen textures owned by different apps can
be overwritten by OpenCL code.

   Simon
___
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet