[EGIT] [core/efl] master 02/04: evas: add support for BUILD_NEON_INTRINSICS to evas_convert_rgb_32.c
cedric pushed a commit to branch master. http://git.enlightenment.org/core/efl.git/commit/?id=835c393d17a333675aa91ffcac801483fcbd5a35 commit 835c393d17a333675aa91ffcac801483fcbd5a35 Author: Yury Usishchev y.usishc...@samsung.com Date: Wed May 13 15:33:15 2015 +0200 evas: add support for BUILD_NEON_INTRINSICS to evas_convert_rgb_32.c Summary: This fixes build for aarch64 when TILE_ROTATE is disabled and BUILD_NEON is enabled(it is enabled by default for aarch64 since https://phab.enlightenment.org/D2309). Reviewers: cedric, raster Subscribers: cedric Projects: #efl Differential Revision: https://phab.enlightenment.org/D2498 Signed-off-by: Cedric BAIL ced...@osg.samsung.com --- src/lib/evas/common/evas_convert_rgb_32.c | 31 ++- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/src/lib/evas/common/evas_convert_rgb_32.c b/src/lib/evas/common/evas_convert_rgb_32.c index 0cc3315..89789b2 100644 --- a/src/lib/evas/common/evas_convert_rgb_32.c +++ b/src/lib/evas/common/evas_convert_rgb_32.c @@ -308,27 +308,32 @@ evas_common_convert_rgba_to_32bpp_rgb__rot_270 (DATA32 *src, DATA8 *dst, int void evas_common_convert_rgba_to_32bpp_rgb__rot_90 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x EINA_UNUSED, int dith_y EINA_UNUSED, DATA8 *pal EINA_UNUSED) { -# ifndef BUILD_NEON -# ifdef TILE_ROTATE +#ifdef TILE_ROTATE blt_rotated_90_((DATA8 *)dst, dst_jump+w, (const DATA8 *)src, src_jump+h, w, h) ; -# else +#else +# ifndef BUILD_NEON DATA32 *src_ptr; DATA32 *dst_ptr; int x, y; - + + dst_ptr = (DATA32 *)dst; + CONVERT_LOOP_START_ROT_90(); + + *dst_ptr = *src_ptr; + + CONVERT_LOOP_END_ROT_90(); +# elif defined BUILD_NEON_INTRINSICS + DATA32 *src_ptr; + DATA32 *dst_ptr; + int x, y; + dst_ptr = (DATA32 *)dst; CONVERT_LOOP_START_ROT_90(); *dst_ptr = *src_ptr; CONVERT_LOOP_END_ROT_90(); -# endif - # else - -# ifdef TILE_ROTATE - blt_rotated_90_((DATA8 *)dst, dst_jump+w, (const DATA8 *)src, src_jump+h, w, h) ; -# else if ((w 1) || (h 1)) { /* 
Rarely (if ever) if ever: so slow path is fine */ @@ -345,7 +350,7 @@ evas_common_convert_rgba_to_32bpp_rgb__rot_90 (DATA32 *src, DATA8 *dst, int } else { -# define AP convert_rgba32_rot_90_ +# define AP convert_rgba32_rot_90_ asm volatile ( .fpu neon \n\t mov %[s1], %[src] \n\t @@ -417,9 +422,9 @@ evas_common_convert_rgba_to_32bpp_rgb__rot_90 (DATA32 *src, DATA8 *dst, int ); } -# undef AP -# endif +# undef AP # endif +#endif return; } --
[EGIT] [core/efl] master 11/55: evas: implement _op_blend_p_dp_neon and _op_blend_pas_dp_neon in NEON intrinsics.
cedric pushed a commit to branch master. http://git.enlightenment.org/core/efl.git/commit/?id=9caa6a3597ce5d6cfcb8c99cc4ada4a88f8ad37c commit 9caa6a3597ce5d6cfcb8c99cc4ada4a88f8ad37c Author: Yury Usishchev y.usishc...@samsung.com Date: Wed Apr 15 17:24:03 2015 +0200 evas: implement _op_blend_p_dp_neon and _op_blend_pas_dp_neon in NEON intrinsics. Reviewers: raster, cedric Reviewed By: cedric Subscribers: cedric Projects: #efl Differential Revision: https://phab.enlightenment.org/D2311 --- .../common/evas_op_blend/op_blend_pixel_neon.c | 245 ++--- 1 file changed, 219 insertions(+), 26 deletions(-) diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c index 3c32790..e81466c 100644 --- a/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c +++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_neon.c @@ -1,17 +1,121 @@ +#ifdef BUILD_NEON +#ifdef BUILD_NEON_INTRINSICS +#include arm_neon.h +#endif +#endif /* blend pixel -- dst */ #ifdef BUILD_NEON static void _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { #ifdef BUILD_NEON_INTRINSICS - DATA32 *e; - int alpha; - UNROLL8_PLD_WHILE(d, l, e, - { -alpha = 256 - (*s 24); -*d = *s++ + MUL_256(alpha, *d); -d++; - }); + uint16x8_t alpha00_16x8; + uint16x8_t alpha01_16x8; + uint16x8_t alpha10_16x8; + uint16x8_t alpha11_16x8; + uint16x8_t d00_16x8; + uint16x8_t d01_16x8; + uint16x8_t d10_16x8; + uint16x8_t d11_16x8; + uint32x4_t alpha0_32x4; + uint32x4_t alpha1_32x4; + uint32x4_t d0_32x4; + uint32x4_t d1_32x4; + uint32x4_t s0_32x4; + uint32x4_t s1_32x4; + uint32x4_t x1_32x4; + uint8x16_t alpha0_8x16; + uint8x16_t alpha1_8x16; + uint8x16_t d0_8x16; + uint8x16_t d1_8x16; + uint8x16_t s0_8x16; + uint8x16_t s1_8x16; + uint8x16_t x1_8x16; + uint8x16_t x255_8x16; + uint8x8_t alpha00_8x8; + uint8x8_t alpha01_8x8; + uint8x8_t alpha10_8x8; + uint8x8_t alpha11_8x8; + uint8x8_t d00_8x8; + uint8x8_t d01_8x8; + uint8x8_t d10_8x8; + uint8x8_t 
d11_8x8; + + x1_8x16 = vdupq_n_u8(0x1); + x1_32x4 = vreinterpretq_u32_u8(x1_8x16); + x255_8x16 = vdupq_n_u8(0xff); + + DATA32 *start = d; + int size = l; + DATA32 *end = start + (size ~7); + while (start end) + { + s0_32x4 = vld1q_u32(s); + s1_32x4 = vld1q_u32(s+4); + + d0_32x4 = vld1q_u32(start); + d1_32x4 = vld1q_u32(start+4); + + alpha0_32x4 = vshrq_n_u32(s0_32x4, 24); + alpha1_32x4 = vshrq_n_u32(s1_32x4, 24); + + alpha0_32x4 = vmulq_u32(x1_32x4, alpha0_32x4); + alpha1_32x4 = vmulq_u32(x1_32x4, alpha1_32x4); + + alpha0_8x16 = vreinterpretq_u8_u32(alpha0_32x4); + alpha1_8x16 = vreinterpretq_u8_u32(alpha1_32x4); + + alpha0_8x16 = vsubq_u8(x255_8x16, alpha0_8x16); + alpha1_8x16 = vsubq_u8(x255_8x16, alpha1_8x16); + + alpha10_8x8 = vget_low_u8(alpha1_8x16); + alpha11_8x8 = vget_high_u8(alpha1_8x16); + alpha00_8x8 = vget_low_u8(alpha0_8x16); + alpha01_8x8 = vget_high_u8(alpha0_8x16); + d0_8x16 = vreinterpretq_u8_u32(d0_32x4); + d1_8x16 = vreinterpretq_u8_u32(d1_32x4); + d00_8x8 = vget_low_u8(d0_8x16); + d01_8x8 = vget_high_u8(d0_8x16); + d10_8x8 = vget_low_u8(d1_8x16); + d11_8x8 = vget_high_u8(d1_8x16); + alpha00_16x8 = vmull_u8(alpha00_8x8, d00_8x8); + alpha01_16x8 = vmull_u8(alpha01_8x8, d01_8x8); + alpha10_16x8 = vmull_u8(alpha10_8x8, d10_8x8); + alpha11_16x8 = vmull_u8(alpha11_8x8, d11_8x8); + d00_16x8 = vmovl_u8(d00_8x8); + d01_16x8 = vmovl_u8(d01_8x8); + d10_16x8 = vmovl_u8(d10_8x8); + d11_16x8 = vmovl_u8(d11_8x8); + alpha00_16x8 = vaddq_u16(alpha00_16x8, d00_16x8); + alpha01_16x8 = vaddq_u16(alpha01_16x8, d01_16x8); + alpha10_16x8 = vaddq_u16(alpha10_16x8, d10_16x8); + alpha11_16x8 = vaddq_u16(alpha11_16x8, d11_16x8); + alpha00_8x8 = vshrn_n_u16(alpha00_16x8,8); + alpha01_8x8 = vshrn_n_u16(alpha01_16x8,8); + alpha10_8x8 = vshrn_n_u16(alpha10_16x8,8); + alpha11_8x8 = vshrn_n_u16(alpha11_16x8,8); + alpha0_8x16 = vcombine_u8(alpha00_8x8, alpha01_8x8); + alpha1_8x16 = vcombine_u8(alpha10_8x8, alpha11_8x8); + s0_8x16 = vreinterpretq_u8_u32(s0_32x4); + s1_8x16 = 
vreinterpretq_u8_u32(s1_32x4); + d0_8x16 = vaddq_u8(s0_8x16, alpha0_8x16); + d1_8x16 = vaddq_u8(s1_8x16, alpha1_8x16); + d0_32x4 = vreinterpretq_u32_u8(d0_8x16); + d1_32x4 = vreinterpretq_u32_u8(d1_8x16); + + vst1q_u32(start, d0_32x4); + vst1q_u32(start+4, d1_32x4); + s+=8; + start+=8; + } + end += (size & 7); + while (start < end) + { + int alpha; + alpha = 256 - (*s >> 24); + *start = *s++ + MUL_256(alpha, *start); + start++; + } #else
[EGIT] [core/efl] master 12/55: evas: implement _op_blend_mas_c_dp_neon in NEON intrinsics.
cedric pushed a commit to branch master. http://git.enlightenment.org/core/efl.git/commit/?id=d2c5730b812f32b1e0a193e0011afead5110fc08 commit d2c5730b812f32b1e0a193e0011afead5110fc08 Author: Yury Usishchev y.usishc...@samsung.com Date: Wed Apr 15 17:27:58 2015 +0200 evas: implement _op_blend_mas_c_dp_neon in NEON intrinsics. Reviewers: raster Subscribers: cedric Projects: #efl Differential Revision: https://phab.enlightenment.org/D2312 --- .../evas_op_blend/op_blend_mask_color_neon.c | 150 ++--- 1 file changed, 128 insertions(+), 22 deletions(-) diff --git a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c index dbeb063..0bc8c5c 100644 --- a/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c +++ b/src/lib/evas/common/evas_op_blend/op_blend_mask_color_neon.c @@ -1,3 +1,8 @@ +#ifdef BUILD_NEON +#ifdef BUILD_NEON_INTRINSICS +#include arm_neon.h +#endif +#endif #define NEONDEBUG 0 @@ -20,28 +25,129 @@ static void _op_blend_mas_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m, DATA32 c, DATA32 *d, int l) { #ifdef BUILD_NEON_INTRINSICS - DATA32 *e; - int alpha = 256 - (c 24); - UNROLL8_PLD_WHILE(d, l, e, - { -DATA32 a = *m; -switch(a) - { - case 0: - break; - case 255: - *d = c + MUL_256(alpha, *d); - break; - default: - { - DATA32 mc = MUL_SYM(a, c); - a = 256 - (mc 24); - *d = mc + MUL_256(a, *d); - } - break; - } -m++; d++; - }); + uint16x8_t d0_16x8; + uint16x8_t d1_16x8; + uint16x8_t m_16x8; + uint16x8_t mc0_16x8; + uint16x8_t mc1_16x8; + uint16x8_t temp0_16x8; + uint16x8_t temp1_16x8; + uint16x8_t x255_16x8; + uint32x2_t c_32x2; + uint32x2_t m_32x2; + uint32x4_t a_32x4; + uint32x4_t d_32x4; + uint32x4_t m_32x4; + uint32x4_t x1_32x4; + uint8x16_t a_8x16; + uint8x16_t d_8x16; + uint8x16_t m_8x16; + uint8x16_t mc_8x16; + uint8x16_t temp_8x16; + uint8x16_t x1_8x16; + uint8x8_t a0_8x8; + uint8x8_t a1_8x8; + uint8x8_t c_8x8; + uint8x8_t d0_8x8; + uint8x8_t d1_8x8; + uint8x8_t m0_8x8; + 
uint8x8_t m1_8x8; + uint8x8_t m_8x8; + uint8x8_t mc0_8x8; + uint8x8_t mc1_8x8; + uint8x8_t temp0_8x8; + uint8x8_t temp1_8x8; + + x1_8x16 = vdupq_n_u8(0x1); + x255_16x8 = vdupq_n_u16(0xff); + x1_32x4 = vreinterpretq_u32_u8(x1_8x16); + c_32x2 = vdup_n_u32(c); + c_8x8 = vreinterpret_u8_u32(c_32x2); + + DATA32 *start = d; + int size = l; + DATA32 *end = start + (size ~7); + while (start end) { + int k = *((int *)m); + if (k == 0) + { + m+=4; + start+=4; + continue; + } + + m_32x2 = vld1_lane_u32((DATA32*)m, m_32x2, 0); + + d_32x4 = vld1q_u32(start); + + m_8x8 = vreinterpret_u8_u32(m_32x2); + m_16x8 = vmovl_u8(m_8x8); + m_8x16 = vreinterpretq_u8_u16(m_16x8); + m_8x8 = vget_low_u8(m_8x16); + m_16x8 = vmovl_u8(m_8x8); + m_32x4 = vreinterpretq_u32_u16(m_16x8); + + m_32x4 = vmulq_u32(m_32x4, x1_32x4); + m_8x16 = vreinterpretq_u8_u32(m_32x4); + m0_8x8 = vget_low_u8(m_8x16); + m1_8x8 = vget_high_u8(m_8x16); + + mc0_16x8 = vmull_u8(m0_8x8, c_8x8); + mc1_16x8 = vmull_u8(m1_8x8, c_8x8); + + mc0_16x8 = vaddq_u16(mc0_16x8, x255_16x8); + mc1_16x8 = vaddq_u16(mc1_16x8, x255_16x8); + + mc0_8x8 = vshrn_n_u16(mc0_16x8, 8); + mc1_8x8 = vshrn_n_u16(mc1_16x8, 8); + + mc_8x16 = vcombine_u8(mc0_8x8, mc1_8x8); + a_8x16 = vmvnq_u8(mc_8x16); + a_32x4 = vreinterpretq_u32_u8(a_8x16); + a_32x4 = vshrq_n_u32(a_32x4, 24); + a_32x4 = vmulq_u32(a_32x4, x1_32x4); + + a_8x16 = vreinterpretq_u8_u32(a_32x4); + a0_8x8 = vget_low_u8(a_8x16); + a1_8x8 = vget_high_u8(a_8x16); + + d_8x16 = vreinterpretq_u8_u32(d_32x4); + + d0_8x8 = vget_low_u8(d_8x16); + d1_8x8 = vget_high_u8(d_8x16); + + d0_16x8 = vmovl_u8(d0_8x8); + d1_16x8 = vmovl_u8(d1_8x8); + + temp0_16x8 = vmull_u8(a0_8x8, d0_8x8); + temp1_16x8 = vmull_u8(a1_8x8, d1_8x8); + + temp0_16x8 = vaddq_u16(temp0_16x8, d0_16x8); + temp1_16x8 = vaddq_u16(temp1_16x8, d1_16x8); + + temp0_8x8 = vshrn_n_u16(temp0_16x8,8); + temp1_8x8 = vshrn_n_u16(temp1_16x8,8); + + temp_8x16 = vcombine_u8(temp0_8x8, temp1_8x8); + + d_8x16 = vaddq_u8(mc_8x16, temp_8x16); + + d_32x4 = 
vreinterpretq_u32_u8(d_8x16); + + vst1q_u32(start, d_32x4); + + start+=4
[EGIT] [core/efl] master 10/55: evas: implement _op_blend_c_dp_neon in NEON intrinsics.
cedric pushed a commit to branch master. http://git.enlightenment.org/core/efl.git/commit/?id=a30481d27ba5e2dd5ad84cef9f6c55a9c89880a1 commit a30481d27ba5e2dd5ad84cef9f6c55a9c89880a1 Author: Yury Usishchev y.usishc...@samsung.com Date: Wed Apr 15 17:22:54 2015 +0200 evas: implement _op_blend_c_dp_neon in NEON intrinsics. Reviewers: raster, cedric @feature Reviewed By: cedric Subscribers: jpeg, cedric Projects: #efl Differential Revision: https://phab.enlightenment.org/D2310 Signed-off-by: Cedric BAIL ced...@osg.samsung.com --- .../common/evas_op_blend/op_blend_color_neon.c | 92 -- 1 file changed, 86 insertions(+), 6 deletions(-) diff --git a/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c index 2bf14c1..7ba2ffd 100644 --- a/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c +++ b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c @@ -1,15 +1,95 @@ +#ifdef BUILD_NEON +#ifdef BUILD_NEON_INTRINSICS +#include arm_neon.h +#endif +#endif /* blend color -- dst */ #ifdef BUILD_NEON static void _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) { #ifdef BUILD_NEON_INTRINSICS -DATA32 *e, a = 256 - (c 24); -UNROLL8_PLD_WHILE(d, l, e, - { - *d = c + MUL_256(a, *d); - d++; - }); + uint16x8_t temp00_16x8; + uint16x8_t temp01_16x8; + uint16x8_t temp10_16x8; + uint16x8_t temp11_16x8; + uint32x4_t temp0_32x4; + uint32x4_t temp1_32x4; + uint32x4_t c_32x4; + uint32x4_t d0_32x4; + uint32x4_t d1_32x4; + uint8x16_t d0_8x16; + uint8x16_t d1_8x16; + uint8x16_t temp0_8x16; + uint8x16_t temp1_8x16; + uint8x8_t alpha_8x8; + uint8x8_t d00_8x8; + uint8x8_t d01_8x8; + uint8x8_t d10_8x8; + uint8x8_t d11_8x8; + uint8x8_t temp00_8x8; + uint8x8_t temp01_8x8; + uint8x8_t temp10_8x8; + uint8x8_t temp11_8x8; + + // alpha can only be 0 if color is 0x0. In that case we can just return. + // Otherwise we can assume alpha != 0. This allows more optimization in + // NEON code. 
+ + if(!c) + return; + + DATA32 *start = d; + int size = l; + DATA32 *end = start + (size & ~7); + + unsigned char alpha; + alpha = ~(c >> 24) + 1; // 256 - (c >> 24) + alpha_8x8 = vdup_n_u8(alpha); + + c_32x4 = vdupq_n_u32(c); + + while (start < end) + { + d0_32x4 = vld1q_u32(start); + d1_32x4 = vld1q_u32(start+4); + d0_8x16 = vreinterpretq_u8_u32(d0_32x4); + d1_8x16 = vreinterpretq_u8_u32(d1_32x4); + + d00_8x8 = vget_low_u8(d0_8x16); + d01_8x8 = vget_high_u8(d0_8x16); + d10_8x8 = vget_low_u8(d1_8x16); + d11_8x8 = vget_high_u8(d1_8x16); + + temp00_16x8 = vmull_u8(alpha_8x8, d00_8x8); + temp01_16x8 = vmull_u8(alpha_8x8, d01_8x8); + temp10_16x8 = vmull_u8(alpha_8x8, d10_8x8); + temp11_16x8 = vmull_u8(alpha_8x8, d11_8x8); + + temp00_8x8 = vshrn_n_u16(temp00_16x8,8); + temp01_8x8 = vshrn_n_u16(temp01_16x8,8); + temp10_8x8 = vshrn_n_u16(temp10_16x8,8); + temp11_8x8 = vshrn_n_u16(temp11_16x8,8); + + temp0_8x16 = vcombine_u8(temp00_8x8, temp01_8x8); + temp1_8x16 = vcombine_u8(temp10_8x8, temp11_8x8); + + temp0_32x4 = vreinterpretq_u32_u8(temp0_8x16); + temp1_32x4 = vreinterpretq_u32_u8(temp1_8x16); + + d0_32x4 = vaddq_u32(c_32x4, temp0_32x4); + d1_32x4 = vaddq_u32(c_32x4, temp1_32x4); + + vst1q_u32(start, d0_32x4); + vst1q_u32(start+4, d1_32x4); + start+=8; + } + end += (size & 7); + while (start < end) + { + *start = c + MUL_256(alpha, *start); + start++; + } #else DATA32 *e, *tmp = 0; #define AP B_C_DP --
[EGIT] [core/efl] master 09/55: evas: enable NEON-optimized code for aarch64.
cedric pushed a commit to branch master. http://git.enlightenment.org/core/efl.git/commit/?id=71eec44ccc9ab43e728ba986fadce6c6cfd2ff7c commit 71eec44ccc9ab43e728ba986fadce6c6cfd2ff7c Author: Yury Usishchev y.usishc...@samsung.com Date: Wed Apr 15 17:21:33 2015 +0200 evas: enable NEON-optimized code for aarch64. Summary: Add new define, BUILD_NEON_INTRINSICS to control whether NEON inline code or NEON intrinsics should be built. GCC NEON intrinsics can be built both for armv7 and armv8. However NEON inline code can be built only for armv7. @feature Reviewers: raster, stefan_schmidt, cedric Subscribers: cedric, stefan_schmidt Projects: #efl Differential Revision: https://phab.enlightenment.org/D2309 Signed-off-by: Cedric BAIL ced...@osg.samsung.com --- configure.ac | 18 + src/lib/evas/common/evas_blit_main.c | 8 src/lib/evas/common/evas_cpu.c | 9 + .../common/evas_op_blend/op_blend_color_neon.c | 10 - .../evas_op_blend/op_blend_mask_color_neon.c | 47 ++ .../evas_op_blend/op_blend_pixel_color_neon.c | 14 ++- .../common/evas_op_blend/op_blend_pixel_neon.c | 33 ++- .../evas/common/evas_op_copy/op_copy_color_neon.c | 9 + 8 files changed, 145 insertions(+), 3 deletions(-) diff --git a/configure.ac b/configure.ac index 9eed98c..63cc54d 100644 --- a/configure.ac +++ b/configure.ac @@ -576,6 +576,21 @@ case $host_cpu in CFLAGS=${CFLAGS_save} fi ;; + aarch64*) +if test x${want_neon} = xyes; then + build_cpu_neon=yes + AC_MSG_CHECKING([whether to use NEON instructions]) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include arm_neon.h]], [[volatile uint32x4_t test = vdupq_n_u32(0x1);]])],[ + AC_MSG_RESULT([yes]) + AC_DEFINE([BUILD_NEON], [1], [Build NEON Code]) + AC_DEFINE([BUILD_NEON_INTRINSICS], [1], [Build NEON Intrinsics]) + build_cpu_neon=yes +],[ + AC_MSG_RESULT([no]) + build_cpu_neon=no +]) +fi +;; esac AC_SUBST([ALTIVEC_CFLAGS]) @@ -4741,6 +4756,9 @@ case $host_cpu in arm*) EFL_ADD_FEATURE([cpu], [neon], [${build_cpu_neon}]) ;; + aarch64*) +EFL_ADD_FEATURE([cpu], [neon], 
[${build_cpu_neon}]) +;; esac if test ${have_linux} = yes; then diff --git a/src/lib/evas/common/evas_blit_main.c b/src/lib/evas/common/evas_blit_main.c index 7f8faa1..4da4034 100644 --- a/src/lib/evas/common/evas_blit_main.c +++ b/src/lib/evas/common/evas_blit_main.c @@ -132,6 +132,9 @@ evas_common_copy_rev_pixels_c(DATA32 *src, DATA32 *dst, int len) static void evas_common_copy_pixels_rev_neon(DATA32 *src, DATA32 *dst, int len) { +#ifdef BUILD_NEON_INTRINSICS +evas_common_copy_pixels_rev_c(src, dst, len); +#else uint32_t *tmp = (void *)37; #define AP evas_common_copy_rev_pixels_neon_ asm volatile ( @@ -228,6 +231,7 @@ evas_common_copy_pixels_rev_neon(DATA32 *src, DATA32 *dst, int len) ); #undef AP +#endif } #endif @@ -324,6 +328,9 @@ evas_common_copy_pixels_mmx2(DATA32 *src, DATA32 *dst, int len) #ifdef BUILD_NEON static void evas_common_copy_pixels_neon(DATA32 *src, DATA32 *dst, int len){ +#ifdef BUILD_NEON_INTRINSICS +evas_common_copy_pixels_c(src, dst, len); +#else uint32_t *e,*tmp = (void *)37; e = dst + len; #define AP evas_common_copy_pixels_neon_ @@ -410,6 +417,7 @@ evas_common_copy_pixels_neon(DATA32 *src, DATA32 *dst, int len){ ); #undef AP +#endif } #endif /* BUILD_NEON */ diff --git a/src/lib/evas/common/evas_cpu.c b/src/lib/evas/common/evas_cpu.c index 4139098..0f83258 100644 --- a/src/lib/evas/common/evas_cpu.c +++ b/src/lib/evas/common/evas_cpu.c @@ -2,6 +2,11 @@ #ifdef BUILD_MMX #include evas_mmx.h #endif +#ifdef BUILD_NEON +#ifdef BUILD_NEON_INTRINSICS +#include arm_neon.h +#endif +#endif #if defined BUILD_SSE3 #include immintrin.h #endif @@ -92,6 +97,9 @@ evas_common_cpu_neon_test(void) { //#if defined(__ARM_ARCH__) (__ARM_ARCH__ = 70) #ifdef BUILD_NEON +#ifdef BUILD_NEON_INTRINSICS + volatile uint32x4_t temp = vdupq_n_u32(0x1); +#else asm volatile ( .fpu neon \n\t vqadd.u8 d0, d1, d0\n @@ -101,6 +109,7 @@ evas_common_cpu_neon_test(void) d0, d1 ); #endif +#endif //#endif } diff --git a/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c 
b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c index 9e94298..2bf14c1 100644 --- a/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c +++ b/src/lib/evas/common/evas_op_blend/op_blend_color_neon.c @@ -3,6 +3,14 @@ #ifdef BUILD_NEON static void _op_blend_c_dp_neon(DATA32 *s EINA_UNUSED, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 *d, int l) { +#ifdef BUILD_NEON_INTRINSICS +DATA32 *e, a = 256 - (c
[EGIT] [core/efl] master 15/55: evas: implement _op_blend_p_c_dp_neon in NEON intrinsics.
cedric pushed a commit to branch master. http://git.enlightenment.org/core/efl.git/commit/?id=be7c7c2c77c7b61f569532be7abb07858490bae6 commit be7c7c2c77c7b61f569532be7abb07858490bae6 Author: Yury Usishchev y.usishc...@samsung.com Date: Thu Apr 16 19:25:29 2015 +0200 evas: implement _op_blend_p_c_dp_neon in NEON intrinsics. Reviewers: cedric, raster Projects: #efl Differential Revision: https://phab.enlightenment.org/D2366 Signed-off-by: Cedric BAIL ced...@osg.samsung.com --- .../evas_op_blend/op_blend_pixel_color_neon.c | 116 +++-- 1 file changed, 106 insertions(+), 10 deletions(-) diff --git a/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c index c47ec7c..b1bfc25 100644 --- a/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c +++ b/src/lib/evas/common/evas_op_blend/op_blend_pixel_color_neon.c @@ -1,3 +1,8 @@ +#ifdef BUILD_NEON +#ifdef BUILD_NEON_INTRINSICS +#include arm_neon.h +#endif +#endif /* blend pixel x color -- dst */ #ifdef BUILD_NEON @@ -8,16 +13,107 @@ static void _op_blend_p_c_dp_neon(DATA32 * __restrict s, DATA8 *m EINA_UNUSED, DATA32 c, DATA32 * __restrict d, int l) { #ifdef BUILD_NEON_INTRINSICS - DATA32 *e; - int alpha; - UNROLL8_PLD_WHILE(d, l, e, - { -DATA32 sc = MUL4_SYM(c, *s); -alpha = 256 - (sc 24); -*d = sc + MUL_256(alpha, *d); -d++; -s++; - }); + uint16x8_t ad0_16x8; + uint16x8_t ad1_16x8; + uint16x8_t sc0_16x8; + uint16x8_t sc1_16x8; + uint16x8_t x255_16x8; + uint32x2_t c_32x2; + uint32x4_t ad_32x4; + uint32x4_t alpha_32x4; + uint32x4_t cond_32x4; + uint32x4_t d_32x4; + uint32x4_t s_32x4; + uint32x4_t sc_32x4; + uint32x4_t x0_32x4; + uint32x4_t x1_32x4; + uint8x16_t ad_8x16; + uint8x16_t alpha_8x16; + uint8x16_t d_8x16; + uint8x16_t s_8x16; + uint8x16_t sc_8x16; + uint8x16_t x0_8x16; + uint8x16_t x1_8x16; + uint8x8_t ad0_8x8; + uint8x8_t ad1_8x8; + uint8x8_t alpha0_8x8; + uint8x8_t alpha1_8x8; + uint8x8_t c_8x8; + uint8x8_t d0_8x8; + uint8x8_t d1_8x8; 
+ uint8x8_t s0_8x8; + uint8x8_t s1_8x8; + uint8x8_t sc0_8x8; + uint8x8_t sc1_8x8; + + c_32x2 = vdup_n_u32(c); + c_8x8 = vreinterpret_u8_u32(c_32x2); + x255_16x8 = vdupq_n_u16(0xff); + x0_8x16 = vdupq_n_u8(0x0); + x0_32x4 = vreinterpretq_u32_u8(x0_8x16); + x1_8x16 = vdupq_n_u8(0x1); + x1_32x4 = vreinterpretq_u32_u8(x1_8x16); + DATA32 *start = d; + int size = l; + DATA32 *end = start + (size & ~3); + while (start < end) + { + + s_32x4 = vld1q_u32(s); + s_8x16 = vreinterpretq_u8_u32(s_32x4); + + d_32x4 = vld1q_u32(start); + d_8x16 = vreinterpretq_u8_u32(d_32x4); + d0_8x8 = vget_low_u8(d_8x16); + d1_8x8 = vget_high_u8(d_8x16); + + s0_8x8 = vget_low_u8(s_8x16); + s1_8x8 = vget_high_u8(s_8x16); + + sc0_16x8 = vmull_u8(s0_8x8, c_8x8); + sc1_16x8 = vmull_u8(s1_8x8, c_8x8); + sc0_16x8 = vaddq_u16(sc0_16x8, x255_16x8); + sc1_16x8 = vaddq_u16(sc1_16x8, x255_16x8); + sc0_8x8 = vshrn_n_u16(sc0_16x8, 8); + sc1_8x8 = vshrn_n_u16(sc1_16x8, 8); + sc_8x16 = vcombine_u8(sc0_8x8, sc1_8x8); + + alpha_32x4 = vreinterpretq_u32_u8(sc_8x16); + alpha_32x4 = vshrq_n_u32(alpha_32x4, 24); + alpha_32x4 = vmulq_u32(x1_32x4, alpha_32x4); + alpha_8x16 = vreinterpretq_u8_u32(alpha_32x4); + alpha_8x16 = vsubq_u8(x0_8x16, alpha_8x16); + alpha0_8x8 = vget_low_u8(alpha_8x16); + alpha1_8x8 = vget_high_u8(alpha_8x16); + + ad0_16x8 = vmull_u8(alpha0_8x8, d0_8x8); + ad1_16x8 = vmull_u8(alpha1_8x8, d1_8x8); + ad0_8x8 = vshrn_n_u16(ad0_16x8,8); + ad1_8x8 = vshrn_n_u16(ad1_16x8,8); + ad_8x16 = vcombine_u8(ad0_8x8, ad1_8x8); + ad_32x4 = vreinterpretq_u32_u8(ad_8x16); + + alpha_32x4 = vreinterpretq_u32_u8(alpha_8x16); + cond_32x4 = vceqq_u32(alpha_32x4, x0_32x4); + ad_32x4 = vbslq_u32(cond_32x4, d_32x4 , ad_32x4); + + sc_32x4 = vreinterpretq_u32_u8(sc_8x16); + d_32x4 = vaddq_u32(sc_32x4, ad_32x4); + + vst1q_u32(start, d_32x4); + + s+=4; + start+=4; + } + end += (size & 3); + while (start < end) + { + DATA32 sc = MUL4_SYM(c, *s); + DATA32 alpha = 256 - (sc >> 24); + *start = sc + MUL_256(alpha, *start); + start++; + s++; + 
} #else #define AP blend_p_c_dp_ asm volatile ( --