On 17 November 2014 06:58, Yangfei (Felix) felix.y...@huawei.com wrote:
PING?
BTW: It seems that Alan's way of improving the vld1(q?)_dup intrinsics is more
elegant.
So, is the improvement of vcls(q?) and vcnt(q?) OK for trunk? Thanks.
Please rebase over Alan's patch and repost, thank you /Marcus
I rebased the patch on the latest trunk.
Regtested for aarch64-linux-gnu with qemu.
OK for the trunk?
Index: gcc/ChangeLog
===
--- gcc/ChangeLog (revision 217717)
+++ gcc/ChangeLog (working copy)
@@ -1,3 +1,14 @@
+2014-11-13 Felix Yang felix.y...@huawei.com
+ Shanyao Chen chenshan...@huawei.com
+
+ * config/aarch64/aarch64-simd.md (clrsb<mode>2, popcount<mode>2): New
+ patterns.
+ * config/aarch64/aarch64-simd-builtins.def (clrsb, popcount): New
+ builtins.
+ * config/aarch64/arm_neon.h (vcls_s8, vcls_s16, vcls_s32, vclsq_s8,
+ vclsq_s16, vclsq_s32, vcnt_p8, vcnt_s8, vcnt_u8, vcntq_p8, vcntq_s8,
+ vcntq_u8): Rewrite using builtin functions.
+
2014-11-18 Felix Yang felix.y...@huawei.com
* config/aarch64/aarch64.c (doloop_end): New pattern.
Index: gcc/config/aarch64/arm_neon.h
===
--- gcc/config/aarch64/arm_neon.h (revision 217717)
+++ gcc/config/aarch64/arm_neon.h (working copy)
@@ -5317,138 +5317,6 @@ vaddlvq_u32 (uint32x4_t a)
return result;
}
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vcls_s8 (int8x8_t a)
-{
- int8x8_t result;
- __asm__ (cls %0.8b,%1.8b
- : =w(result)
- : w(a)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vcls_s16 (int16x4_t a)
-{
- int16x4_t result;
- __asm__ (cls %0.4h,%1.4h
- : =w(result)
- : w(a)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vcls_s32 (int32x2_t a)
-{
- int32x2_t result;
- __asm__ (cls %0.2s,%1.2s
- : =w(result)
- : w(a)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vclsq_s8 (int8x16_t a)
-{
- int8x16_t result;
- __asm__ (cls %0.16b,%1.16b
- : =w(result)
- : w(a)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vclsq_s16 (int16x8_t a)
-{
- int16x8_t result;
- __asm__ (cls %0.8h,%1.8h
- : =w(result)
- : w(a)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vclsq_s32 (int32x4_t a)
-{
- int32x4_t result;
- __asm__ (cls %0.4s,%1.4s
- : =w(result)
- : w(a)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vcnt_p8 (poly8x8_t a)
-{
- poly8x8_t result;
- __asm__ (cnt %0.8b,%1.8b
- : =w(result)
- : w(a)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vcnt_s8 (int8x8_t a)
-{
- int8x8_t result;
- __asm__ (cnt %0.8b,%1.8b
- : =w(result)
- : w(a)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcnt_u8 (uint8x8_t a)
-{
- uint8x8_t result;
- __asm__ (cnt %0.8b,%1.8b
- : =w(result)
- : w(a)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vcntq_p8 (poly8x16_t a)
-{
- poly8x16_t result;
- __asm__ (cnt %0.16b,%1.16b
- : =w(result)
- : w(a)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vcntq_s8 (int8x16_t a)
-{
- int8x16_t result;
- __asm__ (cnt %0.16b,%1.16b
- : =w(result)
- : w(a)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcntq_u8 (uint8x16_t a)
-{
- uint8x16_t result;
- __asm__ (cnt %0.16b,%1.16b
- : =w(result)
- : w(a)
- : /* No clobbers */);
- return result;
-}
-
#define vcopyq_lane_f32(a, b, c, d) \
__extension__ \
({ \
@@ -14082,6 +13950,44 @@ vcltzd_f64 (float64_t __a)
return __a < 0.0 ? -1ll : 0ll;
}
+/* vcls. */
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vcls_s8 (int8x8_t __a)
+{
+ return