[EGIT] [core/efl] master 02/04: evas: add support for BUILD_NEON_INTRINSICS to evas_convert_rgb_32.c

2015-05-13 Thread Yury Usishchev
cedric pushed a commit to branch master.

http://git.enlightenment.org/core/efl.git/commit/?id=835c393d17a333675aa91ffcac801483fcbd5a35

commit 835c393d17a333675aa91ffcac801483fcbd5a35
Author: Yury Usishchev y.usishc...@samsung.com
Date:   Wed May 13 15:33:15 2015 +0200

evas: add support for BUILD_NEON_INTRINSICS to evas_convert_rgb_32.c

Summary: This fixes the build for aarch64 when TILE_ROTATE is disabled and 
BUILD_NEON is enabled (it is enabled by default for aarch64 since 
https://phab.enlightenment.org/D2309).

Reviewers: cedric, raster

Subscribers: cedric

Projects: #efl

Differential Revision: https://phab.enlightenment.org/D2498

Signed-off-by: Cedric BAIL ced...@osg.samsung.com
---
 src/lib/evas/common/evas_convert_rgb_32.c | 31 ++-
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/src/lib/evas/common/evas_convert_rgb_32.c 
b/src/lib/evas/common/evas_convert_rgb_32.c
index 0cc3315..89789b2 100644
--- a/src/lib/evas/common/evas_convert_rgb_32.c
+++ b/src/lib/evas/common/evas_convert_rgb_32.c
@@ -308,27 +308,32 @@ evas_common_convert_rgba_to_32bpp_rgb__rot_270 
(DATA32 *src, DATA8 *dst, int
 void
 evas_common_convert_rgba_to_32bpp_rgb__rot_90 (DATA32 *src, DATA8 *dst, 
int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y 
EINA_UNUSED, DATA8 *pal EINA_UNUSED)
 {
-# ifndef BUILD_NEON
-#  ifdef TILE_ROTATE
+#ifdef TILE_ROTATE
blt_rotated_90_((DATA8 *)dst,  dst_jump+w, (const DATA8 *)src, 
src_jump+h, w, h) ;
-#  else
+#else
+# ifndef BUILD_NEON
DATA32 *src_ptr;
DATA32 *dst_ptr;
int x, y;
-   
+
+   dst_ptr = (DATA32 *)dst;
+   CONVERT_LOOP_START_ROT_90();
+
+   *dst_ptr = *src_ptr;
+
+   CONVERT_LOOP_END_ROT_90();
+# elif defined BUILD_NEON_INTRINSICS
+   DATA32 *src_ptr;
+   DATA32 *dst_ptr;
+   int x, y;
+
dst_ptr = (DATA32 *)dst;
CONVERT_LOOP_START_ROT_90();
 
*dst_ptr = *src_ptr;
 
CONVERT_LOOP_END_ROT_90();
-#  endif
-   
 # else
-   
-#  ifdef TILE_ROTATE
-   blt_rotated_90_((DATA8 *)dst,  dst_jump+w, (const DATA8 *)src, 
src_jump+h, w, h) ;
-#  else
if ((w & 1) || (h & 1))
  {
 /* Rarely (if ever) if ever: so slow path is fine */
@@ -345,7 +350,7 @@ evas_common_convert_rgba_to_32bpp_rgb__rot_90 (DATA32 
*src, DATA8 *dst, int
  }
else
  {
-#   define AP  convert_rgba32_rot_90_
+#  define AP  convert_rgba32_rot_90_
 asm volatile (
 .fpu neon  \n\t
mov %[s1],  %[src]  \n\t
@@ -417,9 +422,9 @@ evas_common_convert_rgba_to_32bpp_rgb__rot_90 (DATA32 
*src, DATA8 *dst, int
 
 );
  }
-#   undef AP
-#  endif
+#  undef AP
 # endif
+#endif
return;
 }
 

-- 




[EGIT] [core/efl] master 11/55: evas: implement _op_blend_p_dp_neon and _op_blend_pas_dp_neon in NEON intrinsics.

2015-05-07 Thread Yury Usishchev
cedric pushed a commit to branch master.

http://git.enlightenment.org/core/efl.git/commit/?id=9caa6a3597ce5d6cfcb8c99cc4ada4a88f8ad37c

commit 9caa6a3597ce5d6cfcb8c99cc4ada4a88f8ad37c
Author: Yury Usishchev y.usishc...@samsung.com
Date:   Wed Apr 15 17:24:03 2015 +0200

evas: implement _op_blend_p_dp_neon and _op_blend_pas_dp_neon in NEON 
intrinsics.

Reviewers: raster, cedric

Reviewed By: cedric

Subscribers: cedric

Projects: #efl

Differential Revision: https://phab.enlightenment.org/D2311
---
 .../common/evas_op_blend/op_blend_pixel_neon.c | 245 ++---
 1 file changed, 219 insertions(+), 26 deletions(-)

diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c 
b/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c
index 3c32790..e81466c 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c
@@ -1,17 +1,121 @@
+#ifdef BUILD_NEON
+#ifdef BUILD_NEON_INTRINSICS
+#include <arm_neon.h>
+#endif
+#endif
 /* blend pixel -- dst */
 
 #ifdef BUILD_NEON
 static void
 _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
 #ifdef BUILD_NEON_INTRINSICS
-   DATA32 *e;
-   int alpha;
-   UNROLL8_PLD_WHILE(d, l, e,
- {
-alpha = 256 - (*s >> 24);
-*d = *s++ + MUL_256(alpha, *d);
-d++;
- });
+   uint16x8_t alpha00_16x8;
+   uint16x8_t alpha01_16x8;
+   uint16x8_t alpha10_16x8;
+   uint16x8_t alpha11_16x8;
+   uint16x8_t d00_16x8;
+   uint16x8_t d01_16x8;
+   uint16x8_t d10_16x8;
+   uint16x8_t d11_16x8;
+   uint32x4_t alpha0_32x4;
+   uint32x4_t alpha1_32x4;
+   uint32x4_t d0_32x4;
+   uint32x4_t d1_32x4;
+   uint32x4_t s0_32x4;
+   uint32x4_t s1_32x4;
+   uint32x4_t x1_32x4;
+   uint8x16_t alpha0_8x16;
+   uint8x16_t alpha1_8x16;
+   uint8x16_t d0_8x16;
+   uint8x16_t d1_8x16;
+   uint8x16_t s0_8x16;
+   uint8x16_t s1_8x16;
+   uint8x16_t x1_8x16;
+   uint8x16_t x255_8x16;
+   uint8x8_t alpha00_8x8;
+   uint8x8_t alpha01_8x8;
+   uint8x8_t alpha10_8x8;
+   uint8x8_t alpha11_8x8;
+   uint8x8_t d00_8x8;
+   uint8x8_t d01_8x8;
+   uint8x8_t d10_8x8;
+   uint8x8_t d11_8x8;
+
+   x1_8x16 = vdupq_n_u8(0x1);
+   x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
+   x255_8x16 = vdupq_n_u8(0xff);
+
+   DATA32 *start = d;
+   int size = l;
+   DATA32 *end = start + (size & ~7);
+   while (start < end)
+   {
+  s0_32x4 = vld1q_u32(s);
+  s1_32x4 = vld1q_u32(s+4);
+
+  d0_32x4 = vld1q_u32(start);
+  d1_32x4 = vld1q_u32(start+4);
+
+  alpha0_32x4 = vshrq_n_u32(s0_32x4, 24);
+  alpha1_32x4 = vshrq_n_u32(s1_32x4, 24);
+
+  alpha0_32x4 = vmulq_u32(x1_32x4, alpha0_32x4);
+  alpha1_32x4 = vmulq_u32(x1_32x4, alpha1_32x4);
+
+  alpha0_8x16 = vreinterpretq_u8_u32(alpha0_32x4);
+  alpha1_8x16 = vreinterpretq_u8_u32(alpha1_32x4);
+
+  alpha0_8x16 = vsubq_u8(x255_8x16, alpha0_8x16);
+  alpha1_8x16 = vsubq_u8(x255_8x16, alpha1_8x16);
+
+  alpha10_8x8 = vget_low_u8(alpha1_8x16);
+  alpha11_8x8 = vget_high_u8(alpha1_8x16);
+  alpha00_8x8 = vget_low_u8(alpha0_8x16);
+  alpha01_8x8 = vget_high_u8(alpha0_8x16);
+  d0_8x16 = vreinterpretq_u8_u32(d0_32x4);
+  d1_8x16 = vreinterpretq_u8_u32(d1_32x4);
+  d00_8x8 = vget_low_u8(d0_8x16);
+  d01_8x8 = vget_high_u8(d0_8x16);
+  d10_8x8 = vget_low_u8(d1_8x16);
+  d11_8x8 = vget_high_u8(d1_8x16);
+  alpha00_16x8 = vmull_u8(alpha00_8x8, d00_8x8);
+  alpha01_16x8 = vmull_u8(alpha01_8x8, d01_8x8);
+  alpha10_16x8 = vmull_u8(alpha10_8x8, d10_8x8);
+  alpha11_16x8 = vmull_u8(alpha11_8x8, d11_8x8);
+  d00_16x8 = vmovl_u8(d00_8x8);
+  d01_16x8 = vmovl_u8(d01_8x8);
+  d10_16x8 = vmovl_u8(d10_8x8);
+  d11_16x8 = vmovl_u8(d11_8x8);
+  alpha00_16x8 = vaddq_u16(alpha00_16x8, d00_16x8);
+  alpha01_16x8 = vaddq_u16(alpha01_16x8, d01_16x8);
+  alpha10_16x8 = vaddq_u16(alpha10_16x8, d10_16x8);
+  alpha11_16x8 = vaddq_u16(alpha11_16x8, d11_16x8);
+  alpha00_8x8 = vshrn_n_u16(alpha00_16x8,8);
+  alpha01_8x8 = vshrn_n_u16(alpha01_16x8,8);
+  alpha10_8x8 = vshrn_n_u16(alpha10_16x8,8);
+  alpha11_8x8 = vshrn_n_u16(alpha11_16x8,8);
+  alpha0_8x16 = vcombine_u8(alpha00_8x8, alpha01_8x8);
+  alpha1_8x16 = vcombine_u8(alpha10_8x8, alpha11_8x8);
+  s0_8x16 = vreinterpretq_u8_u32(s0_32x4);
+  s1_8x16 = vreinterpretq_u8_u32(s1_32x4);
+  d0_8x16 = vaddq_u8(s0_8x16, alpha0_8x16);
+  d1_8x16 = vaddq_u8(s1_8x16, alpha1_8x16);
+  d0_32x4 = vreinterpretq_u32_u8(d0_8x16);
+  d1_32x4 = vreinterpretq_u32_u8(d1_8x16);
+
+  vst1q_u32(start, d0_32x4);
+  vst1q_u32(start+4, d1_32x4);
+  s+=8;
+  start+=8;
+   }
+   end += (size & 7);
+   while (start < end)
+   {
+      int alpha;
+      alpha = 256 - (*s >> 24);
+  *start = *s++ + MUL_256(alpha, *start);
+  start++;
+   }
 #else

[EGIT] [core/efl] master 12/55: evas: implement _op_blend_mas_c_dp_neon in NEON intrinsics.

2015-05-07 Thread Yury Usishchev
cedric pushed a commit to branch master.

http://git.enlightenment.org/core/efl.git/commit/?id=d2c5730b812f32b1e0a193e0011afead5110fc08

commit d2c5730b812f32b1e0a193e0011afead5110fc08
Author: Yury Usishchev y.usishc...@samsung.com
Date:   Wed Apr 15 17:27:58 2015 +0200

evas: implement _op_blend_mas_c_dp_neon in NEON intrinsics.

Reviewers: raster

Subscribers: cedric

Projects: #efl

Differential Revision: https://phab.enlightenment.org/D2312
---
 .../evas_op_blend/op_blend_mask_color_neon.c   | 150 ++---
 1 file changed, 128 insertions(+), 22 deletions(-)

diff --git a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c 
b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
index dbeb063..0bc8c5c 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c
@@ -1,3 +1,8 @@
+#ifdef BUILD_NEON
+#ifdef BUILD_NEON_INTRINSICS
+#include <arm_neon.h>
+#endif
+#endif
 #define NEONDEBUG 0
 
 
@@ -20,28 +25,129 @@
 static void
 _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, 
int l) {
 #ifdef BUILD_NEON_INTRINSICS
-   DATA32 *e;
-   int alpha = 256 - (c >> 24);
-   UNROLL8_PLD_WHILE(d, l, e,
- {
-DATA32 a = *m;
-switch(a)
-  {
-  case 0:
- break;
-  case 255:
- *d = c + MUL_256(alpha, *d);
- break;
-  default:
-   {
-  DATA32 mc = MUL_SYM(a, c);
-  a = 256 - (mc >> 24);
-  *d = mc + MUL_256(a, *d);
-   }
- break;
-  }
-m++;  d++;
- });
+   uint16x8_t d0_16x8;
+   uint16x8_t d1_16x8;
+   uint16x8_t m_16x8;
+   uint16x8_t mc0_16x8;
+   uint16x8_t mc1_16x8;
+   uint16x8_t temp0_16x8;
+   uint16x8_t temp1_16x8;
+   uint16x8_t x255_16x8;
+   uint32x2_t c_32x2;
+   uint32x2_t m_32x2;
+   uint32x4_t a_32x4;
+   uint32x4_t d_32x4;
+   uint32x4_t m_32x4;
+   uint32x4_t x1_32x4;
+   uint8x16_t a_8x16;
+   uint8x16_t d_8x16;
+   uint8x16_t m_8x16;
+   uint8x16_t mc_8x16;
+   uint8x16_t temp_8x16;
+   uint8x16_t x1_8x16;
+   uint8x8_t a0_8x8;
+   uint8x8_t a1_8x8;
+   uint8x8_t c_8x8;
+   uint8x8_t d0_8x8;
+   uint8x8_t d1_8x8;
+   uint8x8_t m0_8x8;
+   uint8x8_t m1_8x8;
+   uint8x8_t m_8x8;
+   uint8x8_t mc0_8x8;
+   uint8x8_t mc1_8x8;
+   uint8x8_t temp0_8x8;
+   uint8x8_t temp1_8x8;
+
+   x1_8x16 = vdupq_n_u8(0x1);
+   x255_16x8 = vdupq_n_u16(0xff);
+   x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
+   c_32x2 = vdup_n_u32(c);
+   c_8x8 = vreinterpret_u8_u32(c_32x2);
+
+   DATA32 *start = d;
+   int size = l;
+   DATA32 *end = start + (size & ~7);
+   while (start < end) {
+  int k = *((int *)m);
+  if (k == 0)
+  {
+ m+=4;
+ start+=4;
+ continue;
+  }
+
+  m_32x2 = vld1_lane_u32((DATA32*)m, m_32x2, 0);
+
+  d_32x4 = vld1q_u32(start);
+
+  m_8x8 = vreinterpret_u8_u32(m_32x2);
+  m_16x8 = vmovl_u8(m_8x8);
+  m_8x16 = vreinterpretq_u8_u16(m_16x8);
+  m_8x8 = vget_low_u8(m_8x16);
+  m_16x8 = vmovl_u8(m_8x8);
+  m_32x4 = vreinterpretq_u32_u16(m_16x8);
+
+  m_32x4 = vmulq_u32(m_32x4, x1_32x4);
+  m_8x16 = vreinterpretq_u8_u32(m_32x4);
+  m0_8x8 = vget_low_u8(m_8x16);
+  m1_8x8 = vget_high_u8(m_8x16);
+
+  mc0_16x8 = vmull_u8(m0_8x8, c_8x8);
+  mc1_16x8 = vmull_u8(m1_8x8, c_8x8);
+
+  mc0_16x8 = vaddq_u16(mc0_16x8, x255_16x8);
+  mc1_16x8 = vaddq_u16(mc1_16x8, x255_16x8);
+
+  mc0_8x8 = vshrn_n_u16(mc0_16x8, 8);
+  mc1_8x8 = vshrn_n_u16(mc1_16x8, 8);
+
+  mc_8x16 = vcombine_u8(mc0_8x8, mc1_8x8);
+  a_8x16 = vmvnq_u8(mc_8x16);
+  a_32x4 = vreinterpretq_u32_u8(a_8x16);
+  a_32x4 = vshrq_n_u32(a_32x4, 24);
+  a_32x4 = vmulq_u32(a_32x4, x1_32x4);
+
+  a_8x16 = vreinterpretq_u8_u32(a_32x4);
+  a0_8x8 = vget_low_u8(a_8x16);
+  a1_8x8 = vget_high_u8(a_8x16);
+
+  d_8x16 = vreinterpretq_u8_u32(d_32x4);
+
+  d0_8x8 = vget_low_u8(d_8x16);
+  d1_8x8 = vget_high_u8(d_8x16);
+
+  d0_16x8 = vmovl_u8(d0_8x8);
+  d1_16x8 = vmovl_u8(d1_8x8);
+
+  temp0_16x8 = vmull_u8(a0_8x8, d0_8x8);
+  temp1_16x8 = vmull_u8(a1_8x8, d1_8x8);
+
+  temp0_16x8 = vaddq_u16(temp0_16x8, d0_16x8);
+  temp1_16x8 = vaddq_u16(temp1_16x8, d1_16x8);
+
+  temp0_8x8 = vshrn_n_u16(temp0_16x8,8);
+  temp1_8x8 = vshrn_n_u16(temp1_16x8,8);
+
+  temp_8x16 = vcombine_u8(temp0_8x8, temp1_8x8);
+
+  d_8x16 = vaddq_u8(mc_8x16, temp_8x16);
+
+  d_32x4 = vreinterpretq_u32_u8(d_8x16);
+
+  vst1q_u32(start, d_32x4);
+
+  start+=4

[EGIT] [core/efl] master 10/55: evas: implement _op_blend_c_dp_neon in NEON intrinsics.

2015-05-07 Thread Yury Usishchev
cedric pushed a commit to branch master.

http://git.enlightenment.org/core/efl.git/commit/?id=a30481d27ba5e2dd5ad84cef9f6c55a9c89880a1

commit a30481d27ba5e2dd5ad84cef9f6c55a9c89880a1
Author: Yury Usishchev y.usishc...@samsung.com
Date:   Wed Apr 15 17:22:54 2015 +0200

evas: implement _op_blend_c_dp_neon in NEON intrinsics.

Reviewers: raster, cedric

@feature

Reviewed By: cedric

Subscribers: jpeg, cedric

Projects: #efl

Differential Revision: https://phab.enlightenment.org/D2310

Signed-off-by: Cedric BAIL ced...@osg.samsung.com
---
 .../common/evas_op_blend/op_blend_color_neon.c | 92 --
 1 file changed, 86 insertions(+), 6 deletions(-)

diff --git a/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c 
b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c
index 2bf14c1..7ba2ffd 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c
@@ -1,15 +1,95 @@
+#ifdef BUILD_NEON
+#ifdef BUILD_NEON_INTRINSICS
+#include <arm_neon.h>
+#endif
+#endif
 /* blend color -- dst */
 
 #ifdef BUILD_NEON
 static void
 _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, 
DATA32 *d, int l) {
 #ifdef BUILD_NEON_INTRINSICS
-DATA32 *e, a = 256 - (c >> 24);
-UNROLL8_PLD_WHILE(d, l, e,
-  {
- *d = c + MUL_256(a, *d);
- d++;
-  });
+   uint16x8_t temp00_16x8;
+   uint16x8_t temp01_16x8;
+   uint16x8_t temp10_16x8;
+   uint16x8_t temp11_16x8;
+   uint32x4_t temp0_32x4;
+   uint32x4_t temp1_32x4;
+   uint32x4_t c_32x4;
+   uint32x4_t d0_32x4;
+   uint32x4_t d1_32x4;
+   uint8x16_t d0_8x16;
+   uint8x16_t d1_8x16;
+   uint8x16_t temp0_8x16;
+   uint8x16_t temp1_8x16;
+   uint8x8_t alpha_8x8;
+   uint8x8_t d00_8x8;
+   uint8x8_t d01_8x8;
+   uint8x8_t d10_8x8;
+   uint8x8_t d11_8x8;
+   uint8x8_t temp00_8x8;
+   uint8x8_t temp01_8x8;
+   uint8x8_t temp10_8x8;
+   uint8x8_t temp11_8x8;
+
+   // alpha can only be 0 if color is 0x0. In that case we can just return.
+   // Otherwise we can assume alpha != 0. This allows more optimization in
+   // NEON code.
+
+   if(!c)
+  return;
+
+   DATA32 *start = d;
+   int size = l;
+   DATA32 *end = start + (size & ~7);
+
+   unsigned char alpha;
+   alpha = ~(c >> 24) + 1; // 256 - (c >> 24)
+   alpha_8x8 = vdup_n_u8(alpha);
+
+   c_32x4 = vdupq_n_u32(c);
+
+   while (start < end)
+   {
+  d0_32x4 = vld1q_u32(start);
+  d1_32x4 = vld1q_u32(start+4);
+  d0_8x16 = vreinterpretq_u8_u32(d0_32x4);
+  d1_8x16 = vreinterpretq_u8_u32(d1_32x4);
+
+  d00_8x8 = vget_low_u8(d0_8x16);
+  d01_8x8 = vget_high_u8(d0_8x16);
+  d10_8x8 = vget_low_u8(d1_8x16);
+  d11_8x8 = vget_high_u8(d1_8x16);
+
+  temp00_16x8 = vmull_u8(alpha_8x8, d00_8x8);
+  temp01_16x8 = vmull_u8(alpha_8x8, d01_8x8);
+  temp10_16x8 = vmull_u8(alpha_8x8, d10_8x8);
+  temp11_16x8 = vmull_u8(alpha_8x8, d11_8x8);
+
+  temp00_8x8 = vshrn_n_u16(temp00_16x8,8);
+  temp01_8x8 = vshrn_n_u16(temp01_16x8,8);
+  temp10_8x8 = vshrn_n_u16(temp10_16x8,8);
+  temp11_8x8 = vshrn_n_u16(temp11_16x8,8);
+
+  temp0_8x16 = vcombine_u8(temp00_8x8, temp01_8x8);
+  temp1_8x16 = vcombine_u8(temp10_8x8, temp11_8x8);
+
+  temp0_32x4 = vreinterpretq_u32_u8(temp0_8x16);
+  temp1_32x4 = vreinterpretq_u32_u8(temp1_8x16);
+
+  d0_32x4 = vaddq_u32(c_32x4, temp0_32x4);
+  d1_32x4 = vaddq_u32(c_32x4, temp1_32x4);
+
+  vst1q_u32(start, d0_32x4);
+  vst1q_u32(start+4, d1_32x4);
+  start+=8;
+   }
+   end += (size & 7);
+   while (start < end)
+   {
+  *start = c + MUL_256(alpha, *start);
+  start++;
+   }
 #else
DATA32 *e, *tmp = 0;
 #define AP B_C_DP

-- 




[EGIT] [core/efl] master 09/55: evas: enable NEON-optimized code for aarch64.

2015-05-07 Thread Yury Usishchev
cedric pushed a commit to branch master.

http://git.enlightenment.org/core/efl.git/commit/?id=71eec44ccc9ab43e728ba986fadce6c6cfd2ff7c

commit 71eec44ccc9ab43e728ba986fadce6c6cfd2ff7c
Author: Yury Usishchev y.usishc...@samsung.com
Date:   Wed Apr 15 17:21:33 2015 +0200

evas: enable NEON-optimized code for aarch64.

Summary:
Add new define, BUILD_NEON_INTRINSICS to control whether NEON inline code or
NEON intrinsics should be built.

GCC NEON intrinsics can be built both for armv7 and armv8. However NEON 
inline
code can be built only for armv7.

@feature

Reviewers: raster, stefan_schmidt, cedric

Subscribers: cedric, stefan_schmidt

Projects: #efl

Differential Revision: https://phab.enlightenment.org/D2309

Signed-off-by: Cedric BAIL ced...@osg.samsung.com
---
 configure.ac   | 18 +
 src/lib/evas/common/evas_blit_main.c   |  8 
 src/lib/evas/common/evas_cpu.c |  9 +
 .../common/evas_op_blend/op_blend_color_neon.c | 10 -
 .../evas_op_blend/op_blend_mask_color_neon.c   | 47 ++
 .../evas_op_blend/op_blend_pixel_color_neon.c  | 14 ++-
 .../common/evas_op_blend/op_blend_pixel_neon.c | 33 ++-
 .../evas/common/evas_op_copy/op_copy_color_neon.c  |  9 +
 8 files changed, 145 insertions(+), 3 deletions(-)

diff --git a/configure.ac b/configure.ac
index 9eed98c..63cc54d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -576,6 +576,21 @@ case $host_cpu in
CFLAGS=${CFLAGS_save}
 fi
 ;;
+  aarch64*)
+if test x${want_neon} = xyes; then
+   build_cpu_neon=yes
+   AC_MSG_CHECKING([whether to use NEON instructions])
+   AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <arm_neon.h>]], 
[[volatile uint32x4_t test = vdupq_n_u32(0x1);]])],[
+   AC_MSG_RESULT([yes])
+   AC_DEFINE([BUILD_NEON], [1], [Build NEON Code])
+   AC_DEFINE([BUILD_NEON_INTRINSICS], [1], [Build NEON Intrinsics])
+   build_cpu_neon=yes
+],[
+  AC_MSG_RESULT([no])
+   build_cpu_neon=no
+])
+fi
+;;
 esac
 
 AC_SUBST([ALTIVEC_CFLAGS])
@@ -4741,6 +4756,9 @@ case $host_cpu in
   arm*)
 EFL_ADD_FEATURE([cpu], [neon], [${build_cpu_neon}])
 ;;
+  aarch64*)
+EFL_ADD_FEATURE([cpu], [neon], [${build_cpu_neon}])
+;;
 esac
 
 if test ${have_linux} = yes; then
diff --git a/src/lib/evas/common/evas_blit_main.c 
b/src/lib/evas/common/evas_blit_main.c
index 7f8faa1..4da4034 100644
--- a/src/lib/evas/common/evas_blit_main.c
+++ b/src/lib/evas/common/evas_blit_main.c
@@ -132,6 +132,9 @@ evas_common_copy_rev_pixels_c(DATA32 *src, DATA32 *dst, int 
len)
 static void
 evas_common_copy_pixels_rev_neon(DATA32 *src, DATA32 *dst, int len)
 {
+#ifdef BUILD_NEON_INTRINSICS
+evas_common_copy_pixels_rev_c(src, dst, len);
+#else
uint32_t *tmp = (void *)37;
 #define AP evas_common_copy_rev_pixels_neon_
asm volatile (
@@ -228,6 +231,7 @@ evas_common_copy_pixels_rev_neon(DATA32 *src, DATA32 *dst, 
int len)
);
 #undef AP
 
+#endif
 }
 #endif
 
@@ -324,6 +328,9 @@ evas_common_copy_pixels_mmx2(DATA32 *src, DATA32 *dst, int 
len)
 #ifdef BUILD_NEON
 static void
 evas_common_copy_pixels_neon(DATA32 *src, DATA32 *dst, int len){
+#ifdef BUILD_NEON_INTRINSICS
+evas_common_copy_pixels_c(src, dst, len);
+#else
uint32_t *e,*tmp = (void *)37;
e = dst + len;
 #define AP evas_common_copy_pixels_neon_
@@ -410,6 +417,7 @@ evas_common_copy_pixels_neon(DATA32 *src, DATA32 *dst, int 
len){
);
 #undef AP
 
+#endif
 }
 #endif /* BUILD_NEON */
 
diff --git a/src/lib/evas/common/evas_cpu.c b/src/lib/evas/common/evas_cpu.c
index 4139098..0f83258 100644
--- a/src/lib/evas/common/evas_cpu.c
+++ b/src/lib/evas/common/evas_cpu.c
@@ -2,6 +2,11 @@
 #ifdef BUILD_MMX
 #include "evas_mmx.h"
 #endif
+#ifdef BUILD_NEON
+#ifdef BUILD_NEON_INTRINSICS
+#include <arm_neon.h>
+#endif
+#endif
 #if defined BUILD_SSE3
 #include <immintrin.h>
 #endif
@@ -92,6 +97,9 @@ evas_common_cpu_neon_test(void)
 {
 //#if defined(__ARM_ARCH__) && (__ARM_ARCH__ >= 70)
 #ifdef BUILD_NEON
+#ifdef BUILD_NEON_INTRINSICS
+   volatile uint32x4_t temp = vdupq_n_u32(0x1);
+#else
asm volatile (
.fpu neon   \n\t
  vqadd.u8 d0, d1, d0\n
@@ -101,6 +109,7 @@ evas_common_cpu_neon_test(void)
  d0, d1
  );
 #endif
+#endif
 //#endif
 }
 
diff --git a/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c 
b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c
index 9e94298..2bf14c1 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c
@@ -3,6 +3,14 @@
 #ifdef BUILD_NEON
 static void
 _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, 
DATA32 *d, int l) {
+#ifdef BUILD_NEON_INTRINSICS
+DATA32 *e, a = 256 - (c

[EGIT] [core/efl] master 15/55: evas: implement _op_blend_p_c_dp_neon in NEON intrinsics.

2015-05-07 Thread Yury Usishchev
cedric pushed a commit to branch master.

http://git.enlightenment.org/core/efl.git/commit/?id=be7c7c2c77c7b61f569532be7abb07858490bae6

commit be7c7c2c77c7b61f569532be7abb07858490bae6
Author: Yury Usishchev y.usishc...@samsung.com
Date:   Thu Apr 16 19:25:29 2015 +0200

evas: implement _op_blend_p_c_dp_neon in NEON intrinsics.

Reviewers: cedric, raster

Projects: #efl

Differential Revision: https://phab.enlightenment.org/D2366

Signed-off-by: Cedric BAIL ced...@osg.samsung.com
---
 .../evas_op_blend/op_blend_pixel_color_neon.c  | 116 +++--
 1 file changed, 106 insertions(+), 10 deletions(-)

diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c 
b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c
index c47ec7c..b1bfc25 100644
--- a/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c
+++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c
@@ -1,3 +1,8 @@
+#ifdef BUILD_NEON
+#ifdef BUILD_NEON_INTRINSICS
+#include <arm_neon.h>
+#endif
+#endif
 /* blend pixel x color -- dst */
 #ifdef BUILD_NEON
 
@@ -8,16 +13,107 @@
 static void
 _op_blend_p_c_dp_neon(DATA32 * __restrict s, DATA8 *m EINA_UNUSED, DATA32 c, 
DATA32 * __restrict d, int l) {
 #ifdef BUILD_NEON_INTRINSICS
-   DATA32 *e;
-   int alpha;
-   UNROLL8_PLD_WHILE(d, l, e,
- {
-DATA32 sc = MUL4_SYM(c, *s);
-alpha = 256 - (sc >> 24);
-*d = sc + MUL_256(alpha, *d);
-d++;
-s++;
- });
+   uint16x8_t ad0_16x8;
+   uint16x8_t ad1_16x8;
+   uint16x8_t sc0_16x8;
+   uint16x8_t sc1_16x8;
+   uint16x8_t x255_16x8;
+   uint32x2_t c_32x2;
+   uint32x4_t ad_32x4;
+   uint32x4_t alpha_32x4;
+   uint32x4_t cond_32x4;
+   uint32x4_t d_32x4;
+   uint32x4_t s_32x4;
+   uint32x4_t sc_32x4;
+   uint32x4_t x0_32x4;
+   uint32x4_t x1_32x4;
+   uint8x16_t ad_8x16;
+   uint8x16_t alpha_8x16;
+   uint8x16_t d_8x16;
+   uint8x16_t s_8x16;
+   uint8x16_t sc_8x16;
+   uint8x16_t x0_8x16;
+   uint8x16_t x1_8x16;
+   uint8x8_t ad0_8x8;
+   uint8x8_t ad1_8x8;
+   uint8x8_t alpha0_8x8;
+   uint8x8_t alpha1_8x8;
+   uint8x8_t c_8x8;
+   uint8x8_t d0_8x8;
+   uint8x8_t d1_8x8;
+   uint8x8_t s0_8x8;
+   uint8x8_t s1_8x8;
+   uint8x8_t sc0_8x8;
+   uint8x8_t sc1_8x8;
+
+   c_32x2 = vdup_n_u32(c);
+   c_8x8 = vreinterpret_u8_u32(c_32x2);
+   x255_16x8 = vdupq_n_u16(0xff);
+   x0_8x16 = vdupq_n_u8(0x0);
+   x0_32x4 = vreinterpretq_u32_u8(x0_8x16);
+   x1_8x16 = vdupq_n_u8(0x1);
+   x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
+   DATA32 *start = d;
+   int size = l;
+   DATA32 *end = start + (size & ~3);
+   while (start < end)
+   {
+
+  s_32x4 = vld1q_u32(s);
+  s_8x16 = vreinterpretq_u8_u32(s_32x4);
+
+  d_32x4 = vld1q_u32(start);
+  d_8x16 = vreinterpretq_u8_u32(d_32x4);
+  d0_8x8 = vget_low_u8(d_8x16);
+  d1_8x8 = vget_high_u8(d_8x16);
+
+  s0_8x8 = vget_low_u8(s_8x16);
+  s1_8x8 = vget_high_u8(s_8x16);
+
+  sc0_16x8 = vmull_u8(s0_8x8, c_8x8);
+  sc1_16x8 = vmull_u8(s1_8x8, c_8x8);
+  sc0_16x8 = vaddq_u16(sc0_16x8, x255_16x8);
+  sc1_16x8 = vaddq_u16(sc1_16x8, x255_16x8);
+  sc0_8x8 = vshrn_n_u16(sc0_16x8, 8);
+  sc1_8x8 = vshrn_n_u16(sc1_16x8, 8);
+  sc_8x16 = vcombine_u8(sc0_8x8, sc1_8x8);
+
+  alpha_32x4 = vreinterpretq_u32_u8(sc_8x16);
+  alpha_32x4 = vshrq_n_u32(alpha_32x4, 24);
+  alpha_32x4 = vmulq_u32(x1_32x4, alpha_32x4);
+  alpha_8x16 = vreinterpretq_u8_u32(alpha_32x4);
+  alpha_8x16 = vsubq_u8(x0_8x16, alpha_8x16);
+  alpha0_8x8 = vget_low_u8(alpha_8x16);
+  alpha1_8x8 = vget_high_u8(alpha_8x16);
+
+  ad0_16x8 = vmull_u8(alpha0_8x8, d0_8x8);
+  ad1_16x8 = vmull_u8(alpha1_8x8, d1_8x8);
+  ad0_8x8 = vshrn_n_u16(ad0_16x8,8);
+  ad1_8x8 = vshrn_n_u16(ad1_16x8,8);
+  ad_8x16 = vcombine_u8(ad0_8x8, ad1_8x8);
+  ad_32x4 = vreinterpretq_u32_u8(ad_8x16);
+
+  alpha_32x4 = vreinterpretq_u32_u8(alpha_8x16);
+  cond_32x4 = vceqq_u32(alpha_32x4, x0_32x4);
+  ad_32x4 = vbslq_u32(cond_32x4, d_32x4 , ad_32x4);
+
+  sc_32x4 = vreinterpretq_u32_u8(sc_8x16);
+  d_32x4 = vaddq_u32(sc_32x4, ad_32x4);
+
+  vst1q_u32(start, d_32x4);
+
+  s+=4;
+  start+=4;
+   }
+   end += (size & 3);
+   while (start < end)
+   {
+      DATA32 sc = MUL4_SYM(c, *s);
+      DATA32 alpha = 256 - (sc >> 24);
+  *start = sc + MUL_256(alpha, *start);
+  start++;
+  s++;
+   }
 #else
 #define AP blend_p_c_dp_
asm volatile (

--