On Mon, 11 Jun 2012 16:46:27 +0100
Ramana Radhakrishnan <ramana.radhakrish...@linaro.org> wrote:
> Hi,
>
> I don't like the ML bits of the patch as it stands today and before
> committing I would like to clean up the ML bits quite a bit further
> especially in areas where I've put FIXMEs [...]
I had a go at this, see attached. Untested. Note there are some
semantic differences in output:
vzipq_p8 (poly8x16_t __a, poly8x16_t __b)
{
poly8x16x2_t __rv;
- uint8x16_t __mask1 = {0, 2};
- uint8x16_t __mask2 = {1, 3};
- __rv.val[0] = (poly8x16_t)__builtin_shuffle (__a, __b, __mask1);
- __rv.val[1] = (poly8x16_t)__builtin_shuffle (__a, __b, __mask2);
+ uint8x16_t __mask1 = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };
+ uint8x16_t __mask2 = { 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 };
+ __rv.val[0] = (poly8x16_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly8x16_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
I wasn't quite sure which version was correct -- but your version
doesn't seem to have enough elements for these cases?
HTH,
Julian
Index: neon.ml
===================================================================
--- neon.ml (revision 188392)
+++ neon.ml (working copy)
@@ -201,6 +201,42 @@
(* Reinterpret casts. *)
| Vreinterp
+let rev_elems revsize elsize nelts _ =
+ let mask = (revsize / elsize) - 1 in
+ let arr = Array.init nelts
+ (fun i -> i lxor mask) in
+ Array.to_list arr
+
+let permute_range i stride nelts increment =
+ let rec build i = function
+ 0 -> []
+ | nelts -> i :: (i + stride) :: build (i + increment) (pred nelts) in
+ build i nelts
+
+(* Generate a list of integers suitable for vzip. *)
+let zip_range i stride nelts = permute_range i stride nelts 1
+
+(* Generate a list of integers suitable for vunzip. *)
+let uzip_range i stride nelts = permute_range i stride nelts 4
+
+(* Generate a list of integers suitable for trn. *)
+let trn_range i stride nelts = permute_range i stride nelts 2
+
+let zip_elems _ nelts part =
+ match part with
+ `lo -> zip_range 0 nelts (nelts / 2)
+ | `hi -> zip_range (nelts / 2) nelts (nelts / 2)
+
+let uzip_elems _ nelts part =
+ match part with
+ `lo -> uzip_range 0 2 (nelts / 2)
+ | `hi -> uzip_range 1 2 (nelts / 2)
+
+let trn_elems _ nelts part =
+ match part with
+ `lo -> trn_range 0 nelts (nelts / 2)
+ | `hi -> trn_range 1 nelts (nelts / 2)
+
(* Features used for documentation, to distinguish between some instruction
variants, and to signal special requirements (e.g. swapping arguments). *)
@@ -214,7 +250,10 @@
| Flipped of string (* Builtin name to use with flipped arguments. *)
| InfoWord (* Pass an extra word for signage/rounding etc. (always passed
for All _, Long, Wide, Narrow shape_forms. *)
- | ReturnPtr (* Pass explicit pointer to return value as first argument. *)
+ (* Implement builtin as shuffle. The parameter is a function which returns
+ masks suitable for __builtin_shuffle: arguments are (element size,
+ number of elements, high/low part selector). *)
+ | Use_shuffle of (int -> int -> [`lo|`hi] -> int list)
(* A specification as to the shape of instruction expected upon
disassembly, used if it differs from the shape used to build the
intrinsic prototype. Multiple entries in the constructor's argument
@@ -1317,12 +1356,18 @@
pf_su_8_64;
(* Reverse elements. *)
- Vrev64, [], All (2, Dreg), "vrev64", bits_1, P8 :: P16 :: F32 :: su_8_32;
- Vrev64, [], All (2, Qreg), "vrev64Q", bits_1, P8 :: P16 :: F32 :: su_8_32;
- Vrev32, [], All (2, Dreg), "vrev32", bits_1, [P8; P16; S8; U8; S16; U16];
- Vrev32, [], All (2, Qreg), "vrev32Q", bits_1, [P8; P16; S8; U8; S16; U16];
- Vrev16, [], All (2, Dreg), "vrev16", bits_1, [P8; S8; U8];
- Vrev16, [], All (2, Qreg), "vrev16Q", bits_1, [P8; S8; U8];
+ Vrev64, [Use_shuffle (rev_elems 64)], All (2, Dreg), "vrev64", bits_1,
+ P8 :: P16 :: F32 :: su_8_32;
+ Vrev64, [Use_shuffle (rev_elems 64)], All (2, Qreg), "vrev64Q", bits_1,
+ P8 :: P16 :: F32 :: su_8_32;
+ Vrev32, [Use_shuffle (rev_elems 32)], All (2, Dreg), "vrev32", bits_1,
+ [P8; P16; S8; U8; S16; U16];
+ Vrev32, [Use_shuffle (rev_elems 32)], All (2, Qreg), "vrev32Q", bits_1,
+ [P8; P16; S8; U8; S16; U16];
+ Vrev16, [Use_shuffle (rev_elems 16)], All (2, Dreg), "vrev16", bits_1,
+ [P8; S8; U8];
+ Vrev16, [Use_shuffle (rev_elems 16)], All (2, Qreg), "vrev16Q", bits_1,
+ [P8; S8; U8];
(* Bit selection. *)
Vbsl,
@@ -1336,25 +1381,19 @@
Use_operands [| Qreg; Qreg; Qreg; Qreg |], "vbslQ", bit_select,
pf_su_8_64;
- (* Transpose elements. **NOTE** ReturnPtr goes some of the way towards
- generating good code for intrinsics which return structure types --
- builtins work well by themselves (and understand that the values being
- stored on e.g. the stack also reside in registers, so can optimise the
- stores away entirely if the results are used immediately), but
- intrinsics are very much less efficient. Maybe something can be improved
- re: inlining, or tweaking the ABI used for intrinsics (a special call
- attribute?).
- *)
- Vtrn, [ReturnPtr], Pair_result Dreg, "vtrn", bits_2, pf_su_8_32;
- Vtrn, [ReturnPtr], Pair_result Qreg, "vtrnQ", bits_2, pf_su_8_32;
-
+ Vtrn, [Use_shuffle trn_elems], Pair_result Dreg, "vtrn", bits_2, pf_su_8_32;
+ Vtrn, [Use_shuffle trn_elems], Pair_result Qreg, "vtrnQ", bits_2,
+ pf_su_8_32;
(* Zip elements. *)
- Vzip, [ReturnPtr], Pair_result Dreg, "vzip", bits_2, pf_su_8_32;
- Vzip, [ReturnPtr], Pair_result Qreg, "vzipQ", bits_2, pf_su_8_32;
+ Vzip, [Use_shuffle zip_elems], Pair_result Dreg, "vzip", bits_2, pf_su_8_32;
+ Vzip, [Use_shuffle zip_elems], Pair_result Qreg, "vzipQ", bits_2,
+ pf_su_8_32;
(* Unzip elements. *)
- Vuzp, [ReturnPtr], Pair_result Dreg, "vuzp", bits_2, pf_su_8_32;
- Vuzp, [ReturnPtr], Pair_result Qreg, "vuzpQ", bits_2, pf_su_8_32;
+ Vuzp, [Use_shuffle uzip_elems], Pair_result Dreg, "vuzp", bits_2,
+ pf_su_8_32;
+ Vuzp, [Use_shuffle uzip_elems], Pair_result Qreg, "vuzpQ", bits_2,
+ pf_su_8_32;
(* Element/structure loads. VLD1 variants. *)
Vldx 1,
Index: arm_neon.h
===================================================================
--- arm_neon.h (revision 188392)
+++ arm_neon.h (working copy)
@@ -7047,217 +7047,253 @@
__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
vrev64_s8 (int8x8_t __a)
{
- return (int8x8_t)__builtin_neon_vrev64v8qi (__a, 1);
+ uint8x8_t __mask = { 7, 6, 5, 4, 3, 2, 1, 0 };
+ return (int8x8_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
vrev64_s16 (int16x4_t __a)
{
- return (int16x4_t)__builtin_neon_vrev64v4hi (__a, 1);
+ uint16x4_t __mask = { 3, 2, 1, 0 };
+ return (int16x4_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
vrev64_s32 (int32x2_t __a)
{
- return (int32x2_t)__builtin_neon_vrev64v2si (__a, 1);
+ uint32x2_t __mask = { 1, 0 };
+ return (int32x2_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vrev64_f32 (float32x2_t __a)
{
- return (float32x2_t)__builtin_neon_vrev64v2sf (__a, 3);
+ uint32x2_t __mask = { 1, 0 };
+ return (float32x2_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
vrev64_u8 (uint8x8_t __a)
{
- return (uint8x8_t)__builtin_neon_vrev64v8qi ((int8x8_t) __a, 0);
+ uint8x8_t __mask = { 7, 6, 5, 4, 3, 2, 1, 0 };
+ return (uint8x8_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
vrev64_u16 (uint16x4_t __a)
{
- return (uint16x4_t)__builtin_neon_vrev64v4hi ((int16x4_t) __a, 0);
+ uint16x4_t __mask = { 3, 2, 1, 0 };
+ return (uint16x4_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
vrev64_u32 (uint32x2_t __a)
{
- return (uint32x2_t)__builtin_neon_vrev64v2si ((int32x2_t) __a, 0);
+ uint32x2_t __mask = { 1, 0 };
+ return (uint32x2_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
vrev64_p8 (poly8x8_t __a)
{
- return (poly8x8_t)__builtin_neon_vrev64v8qi ((int8x8_t) __a, 2);
+ uint8x8_t __mask = { 7, 6, 5, 4, 3, 2, 1, 0 };
+ return (poly8x8_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
vrev64_p16 (poly16x4_t __a)
{
- return (poly16x4_t)__builtin_neon_vrev64v4hi ((int16x4_t) __a, 2);
+ uint16x4_t __mask = { 3, 2, 1, 0 };
+ return (poly16x4_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
vrev64q_s8 (int8x16_t __a)
{
- return (int8x16_t)__builtin_neon_vrev64v16qi (__a, 1);
+ uint8x16_t __mask = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+ return (int8x16_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
vrev64q_s16 (int16x8_t __a)
{
- return (int16x8_t)__builtin_neon_vrev64v8hi (__a, 1);
+ uint16x8_t __mask = { 3, 2, 1, 0, 7, 6, 5, 4 };
+ return (int16x8_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vrev64q_s32 (int32x4_t __a)
{
- return (int32x4_t)__builtin_neon_vrev64v4si (__a, 1);
+ uint32x4_t __mask = { 1, 0, 3, 2 };
+ return (int32x4_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vrev64q_f32 (float32x4_t __a)
{
- return (float32x4_t)__builtin_neon_vrev64v4sf (__a, 3);
+ uint32x4_t __mask = { 1, 0, 3, 2 };
+ return (float32x4_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
vrev64q_u8 (uint8x16_t __a)
{
- return (uint8x16_t)__builtin_neon_vrev64v16qi ((int8x16_t) __a, 0);
+ uint8x16_t __mask = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+ return (uint8x16_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vrev64q_u16 (uint16x8_t __a)
{
- return (uint16x8_t)__builtin_neon_vrev64v8hi ((int16x8_t) __a, 0);
+ uint16x8_t __mask = { 3, 2, 1, 0, 7, 6, 5, 4 };
+ return (uint16x8_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
vrev64q_u32 (uint32x4_t __a)
{
- return (uint32x4_t)__builtin_neon_vrev64v4si ((int32x4_t) __a, 0);
+ uint32x4_t __mask = { 1, 0, 3, 2 };
+ return (uint32x4_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
vrev64q_p8 (poly8x16_t __a)
{
- return (poly8x16_t)__builtin_neon_vrev64v16qi ((int8x16_t) __a, 2);
+ uint8x16_t __mask = { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 };
+ return (poly8x16_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
vrev64q_p16 (poly16x8_t __a)
{
- return (poly16x8_t)__builtin_neon_vrev64v8hi ((int16x8_t) __a, 2);
+ uint16x8_t __mask = { 3, 2, 1, 0, 7, 6, 5, 4 };
+ return (poly16x8_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
vrev32_s8 (int8x8_t __a)
{
- return (int8x8_t)__builtin_neon_vrev32v8qi (__a, 1);
+ uint8x8_t __mask = { 3, 2, 1, 0, 7, 6, 5, 4 };
+ return (int8x8_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
vrev32_s16 (int16x4_t __a)
{
- return (int16x4_t)__builtin_neon_vrev32v4hi (__a, 1);
+ uint16x4_t __mask = { 1, 0, 3, 2 };
+ return (int16x4_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
vrev32_u8 (uint8x8_t __a)
{
- return (uint8x8_t)__builtin_neon_vrev32v8qi ((int8x8_t) __a, 0);
+ uint8x8_t __mask = { 3, 2, 1, 0, 7, 6, 5, 4 };
+ return (uint8x8_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
vrev32_u16 (uint16x4_t __a)
{
- return (uint16x4_t)__builtin_neon_vrev32v4hi ((int16x4_t) __a, 0);
+ uint16x4_t __mask = { 1, 0, 3, 2 };
+ return (uint16x4_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
vrev32_p8 (poly8x8_t __a)
{
- return (poly8x8_t)__builtin_neon_vrev32v8qi ((int8x8_t) __a, 2);
+ uint8x8_t __mask = { 3, 2, 1, 0, 7, 6, 5, 4 };
+ return (poly8x8_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
vrev32_p16 (poly16x4_t __a)
{
- return (poly16x4_t)__builtin_neon_vrev32v4hi ((int16x4_t) __a, 2);
+ uint16x4_t __mask = { 1, 0, 3, 2 };
+ return (poly16x4_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
vrev32q_s8 (int8x16_t __a)
{
- return (int8x16_t)__builtin_neon_vrev32v16qi (__a, 1);
+ uint8x16_t __mask = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+ return (int8x16_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
vrev32q_s16 (int16x8_t __a)
{
- return (int16x8_t)__builtin_neon_vrev32v8hi (__a, 1);
+ uint16x8_t __mask = { 1, 0, 3, 2, 5, 4, 7, 6 };
+ return (int16x8_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
vrev32q_u8 (uint8x16_t __a)
{
- return (uint8x16_t)__builtin_neon_vrev32v16qi ((int8x16_t) __a, 0);
+ uint8x16_t __mask = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+ return (uint8x16_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vrev32q_u16 (uint16x8_t __a)
{
- return (uint16x8_t)__builtin_neon_vrev32v8hi ((int16x8_t) __a, 0);
+ uint16x8_t __mask = { 1, 0, 3, 2, 5, 4, 7, 6 };
+ return (uint16x8_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
vrev32q_p8 (poly8x16_t __a)
{
- return (poly8x16_t)__builtin_neon_vrev32v16qi ((int8x16_t) __a, 2);
+ uint8x16_t __mask = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+ return (poly8x16_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
vrev32q_p16 (poly16x8_t __a)
{
- return (poly16x8_t)__builtin_neon_vrev32v8hi ((int16x8_t) __a, 2);
+ uint16x8_t __mask = { 1, 0, 3, 2, 5, 4, 7, 6 };
+ return (poly16x8_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
vrev16_s8 (int8x8_t __a)
{
- return (int8x8_t)__builtin_neon_vrev16v8qi (__a, 1);
+ uint8x8_t __mask = { 1, 0, 3, 2, 5, 4, 7, 6 };
+ return (int8x8_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
vrev16_u8 (uint8x8_t __a)
{
- return (uint8x8_t)__builtin_neon_vrev16v8qi ((int8x8_t) __a, 0);
+ uint8x8_t __mask = { 1, 0, 3, 2, 5, 4, 7, 6 };
+ return (uint8x8_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
vrev16_p8 (poly8x8_t __a)
{
- return (poly8x8_t)__builtin_neon_vrev16v8qi ((int8x8_t) __a, 2);
+ uint8x8_t __mask = { 1, 0, 3, 2, 5, 4, 7, 6 };
+ return (poly8x8_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
vrev16q_s8 (int8x16_t __a)
{
- return (int8x16_t)__builtin_neon_vrev16v16qi (__a, 1);
+ uint8x16_t __mask = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+ return (int8x16_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
vrev16q_u8 (uint8x16_t __a)
{
- return (uint8x16_t)__builtin_neon_vrev16v16qi ((int8x16_t) __a, 0);
+ uint8x16_t __mask = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+ return (uint8x16_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
vrev16q_p8 (poly8x16_t __a)
{
- return (poly8x16_t)__builtin_neon_vrev16v16qi ((int8x16_t) __a, 2);
+ uint8x16_t __mask = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 };
+ return (poly8x16_t) __builtin_shuffle (__a, __mask);
}
__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
@@ -7396,7 +7432,10 @@
vtrn_s8 (int8x8_t __a, int8x8_t __b)
{
int8x8x2_t __rv;
- __builtin_neon_vtrnv8qi (&__rv.val[0], __a, __b);
+ uint8x8_t __mask1 = { 0, 8, 2, 10, 4, 12, 6, 14 };
+ uint8x8_t __mask2 = { 1, 9, 3, 11, 5, 13, 7, 15 };
+ __rv.val[0] = (int8x8_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int8x8_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7404,7 +7443,10 @@
vtrn_s16 (int16x4_t __a, int16x4_t __b)
{
int16x4x2_t __rv;
- __builtin_neon_vtrnv4hi (&__rv.val[0], __a, __b);
+ uint16x4_t __mask1 = { 0, 4, 2, 6 };
+ uint16x4_t __mask2 = { 1, 5, 3, 7 };
+ __rv.val[0] = (int16x4_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int16x4_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7412,7 +7454,10 @@
vtrn_s32 (int32x2_t __a, int32x2_t __b)
{
int32x2x2_t __rv;
- __builtin_neon_vtrnv2si (&__rv.val[0], __a, __b);
+ uint32x2_t __mask1 = { 0, 2 };
+ uint32x2_t __mask2 = { 1, 3 };
+ __rv.val[0] = (int32x2_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int32x2_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7420,7 +7465,10 @@
vtrn_f32 (float32x2_t __a, float32x2_t __b)
{
float32x2x2_t __rv;
- __builtin_neon_vtrnv2sf (&__rv.val[0], __a, __b);
+ uint32x2_t __mask1 = { 0, 2 };
+ uint32x2_t __mask2 = { 1, 3 };
+ __rv.val[0] = (float32x2_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (float32x2_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7428,7 +7476,10 @@
vtrn_u8 (uint8x8_t __a, uint8x8_t __b)
{
uint8x8x2_t __rv;
- __builtin_neon_vtrnv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+ uint8x8_t __mask1 = { 0, 8, 2, 10, 4, 12, 6, 14 };
+ uint8x8_t __mask2 = { 1, 9, 3, 11, 5, 13, 7, 15 };
+ __rv.val[0] = (uint8x8_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint8x8_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7436,7 +7487,10 @@
vtrn_u16 (uint16x4_t __a, uint16x4_t __b)
{
uint16x4x2_t __rv;
- __builtin_neon_vtrnv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+ uint16x4_t __mask1 = { 0, 4, 2, 6 };
+ uint16x4_t __mask2 = { 1, 5, 3, 7 };
+ __rv.val[0] = (uint16x4_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint16x4_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7444,7 +7498,10 @@
vtrn_u32 (uint32x2_t __a, uint32x2_t __b)
{
uint32x2x2_t __rv;
- __builtin_neon_vtrnv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b);
+ uint32x2_t __mask1 = { 0, 2 };
+ uint32x2_t __mask2 = { 1, 3 };
+ __rv.val[0] = (uint32x2_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint32x2_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7452,7 +7509,10 @@
vtrn_p8 (poly8x8_t __a, poly8x8_t __b)
{
poly8x8x2_t __rv;
- __builtin_neon_vtrnv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+ uint8x8_t __mask1 = { 0, 8, 2, 10, 4, 12, 6, 14 };
+ uint8x8_t __mask2 = { 1, 9, 3, 11, 5, 13, 7, 15 };
+ __rv.val[0] = (poly8x8_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly8x8_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7460,7 +7520,10 @@
vtrn_p16 (poly16x4_t __a, poly16x4_t __b)
{
poly16x4x2_t __rv;
- __builtin_neon_vtrnv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+ uint16x4_t __mask1 = { 0, 4, 2, 6 };
+ uint16x4_t __mask2 = { 1, 5, 3, 7 };
+ __rv.val[0] = (poly16x4_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly16x4_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7468,7 +7531,10 @@
vtrnq_s8 (int8x16_t __a, int8x16_t __b)
{
int8x16x2_t __rv;
- __builtin_neon_vtrnv16qi (&__rv.val[0], __a, __b);
+ uint8x16_t __mask1 = { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 };
+ uint8x16_t __mask2 = { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 };
+ __rv.val[0] = (int8x16_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int8x16_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7476,7 +7542,10 @@
vtrnq_s16 (int16x8_t __a, int16x8_t __b)
{
int16x8x2_t __rv;
- __builtin_neon_vtrnv8hi (&__rv.val[0], __a, __b);
+ uint16x8_t __mask1 = { 0, 8, 2, 10, 4, 12, 6, 14 };
+ uint16x8_t __mask2 = { 1, 9, 3, 11, 5, 13, 7, 15 };
+ __rv.val[0] = (int16x8_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int16x8_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7484,7 +7553,10 @@
vtrnq_s32 (int32x4_t __a, int32x4_t __b)
{
int32x4x2_t __rv;
- __builtin_neon_vtrnv4si (&__rv.val[0], __a, __b);
+ uint32x4_t __mask1 = { 0, 4, 2, 6 };
+ uint32x4_t __mask2 = { 1, 5, 3, 7 };
+ __rv.val[0] = (int32x4_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int32x4_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7492,7 +7564,10 @@
vtrnq_f32 (float32x4_t __a, float32x4_t __b)
{
float32x4x2_t __rv;
- __builtin_neon_vtrnv4sf (&__rv.val[0], __a, __b);
+ uint32x4_t __mask1 = { 0, 4, 2, 6 };
+ uint32x4_t __mask2 = { 1, 5, 3, 7 };
+ __rv.val[0] = (float32x4_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (float32x4_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7500,7 +7575,10 @@
vtrnq_u8 (uint8x16_t __a, uint8x16_t __b)
{
uint8x16x2_t __rv;
- __builtin_neon_vtrnv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+ uint8x16_t __mask1 = { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 };
+ uint8x16_t __mask2 = { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 };
+ __rv.val[0] = (uint8x16_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint8x16_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7508,7 +7586,10 @@
vtrnq_u16 (uint16x8_t __a, uint16x8_t __b)
{
uint16x8x2_t __rv;
- __builtin_neon_vtrnv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+ uint16x8_t __mask1 = { 0, 8, 2, 10, 4, 12, 6, 14 };
+ uint16x8_t __mask2 = { 1, 9, 3, 11, 5, 13, 7, 15 };
+ __rv.val[0] = (uint16x8_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint16x8_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7516,7 +7597,10 @@
vtrnq_u32 (uint32x4_t __a, uint32x4_t __b)
{
uint32x4x2_t __rv;
- __builtin_neon_vtrnv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b);
+ uint32x4_t __mask1 = { 0, 4, 2, 6 };
+ uint32x4_t __mask2 = { 1, 5, 3, 7 };
+ __rv.val[0] = (uint32x4_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint32x4_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7524,7 +7608,10 @@
vtrnq_p8 (poly8x16_t __a, poly8x16_t __b)
{
poly8x16x2_t __rv;
- __builtin_neon_vtrnv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+ uint8x16_t __mask1 = { 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30 };
+ uint8x16_t __mask2 = { 1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31 };
+ __rv.val[0] = (poly8x16_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly8x16_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7532,7 +7619,10 @@
vtrnq_p16 (poly16x8_t __a, poly16x8_t __b)
{
poly16x8x2_t __rv;
- __builtin_neon_vtrnv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+ uint16x8_t __mask1 = { 0, 8, 2, 10, 4, 12, 6, 14 };
+ uint16x8_t __mask2 = { 1, 9, 3, 11, 5, 13, 7, 15 };
+ __rv.val[0] = (poly16x8_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly16x8_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7540,7 +7630,10 @@
vzip_s8 (int8x8_t __a, int8x8_t __b)
{
int8x8x2_t __rv;
- __builtin_neon_vzipv8qi (&__rv.val[0], __a, __b);
+ uint8x8_t __mask1 = { 0, 8, 1, 9, 2, 10, 3, 11 };
+ uint8x8_t __mask2 = { 4, 12, 5, 13, 6, 14, 7, 15 };
+ __rv.val[0] = (int8x8_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int8x8_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7548,7 +7641,10 @@
vzip_s16 (int16x4_t __a, int16x4_t __b)
{
int16x4x2_t __rv;
- __builtin_neon_vzipv4hi (&__rv.val[0], __a, __b);
+ uint16x4_t __mask1 = { 0, 4, 1, 5 };
+ uint16x4_t __mask2 = { 2, 6, 3, 7 };
+ __rv.val[0] = (int16x4_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int16x4_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7556,7 +7652,10 @@
vzip_s32 (int32x2_t __a, int32x2_t __b)
{
int32x2x2_t __rv;
- __builtin_neon_vzipv2si (&__rv.val[0], __a, __b);
+ uint32x2_t __mask1 = { 0, 2 };
+ uint32x2_t __mask2 = { 1, 3 };
+ __rv.val[0] = (int32x2_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int32x2_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7564,7 +7663,10 @@
vzip_f32 (float32x2_t __a, float32x2_t __b)
{
float32x2x2_t __rv;
- __builtin_neon_vzipv2sf (&__rv.val[0], __a, __b);
+ uint32x2_t __mask1 = { 0, 2 };
+ uint32x2_t __mask2 = { 1, 3 };
+ __rv.val[0] = (float32x2_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (float32x2_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7572,7 +7674,10 @@
vzip_u8 (uint8x8_t __a, uint8x8_t __b)
{
uint8x8x2_t __rv;
- __builtin_neon_vzipv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+ uint8x8_t __mask1 = { 0, 8, 1, 9, 2, 10, 3, 11 };
+ uint8x8_t __mask2 = { 4, 12, 5, 13, 6, 14, 7, 15 };
+ __rv.val[0] = (uint8x8_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint8x8_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7580,7 +7685,10 @@
vzip_u16 (uint16x4_t __a, uint16x4_t __b)
{
uint16x4x2_t __rv;
- __builtin_neon_vzipv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+ uint16x4_t __mask1 = { 0, 4, 1, 5 };
+ uint16x4_t __mask2 = { 2, 6, 3, 7 };
+ __rv.val[0] = (uint16x4_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint16x4_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7588,7 +7696,10 @@
vzip_u32 (uint32x2_t __a, uint32x2_t __b)
{
uint32x2x2_t __rv;
- __builtin_neon_vzipv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b);
+ uint32x2_t __mask1 = { 0, 2 };
+ uint32x2_t __mask2 = { 1, 3 };
+ __rv.val[0] = (uint32x2_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint32x2_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7596,7 +7707,10 @@
vzip_p8 (poly8x8_t __a, poly8x8_t __b)
{
poly8x8x2_t __rv;
- __builtin_neon_vzipv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+ uint8x8_t __mask1 = { 0, 8, 1, 9, 2, 10, 3, 11 };
+ uint8x8_t __mask2 = { 4, 12, 5, 13, 6, 14, 7, 15 };
+ __rv.val[0] = (poly8x8_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly8x8_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7604,7 +7718,10 @@
vzip_p16 (poly16x4_t __a, poly16x4_t __b)
{
poly16x4x2_t __rv;
- __builtin_neon_vzipv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+ uint16x4_t __mask1 = { 0, 4, 1, 5 };
+ uint16x4_t __mask2 = { 2, 6, 3, 7 };
+ __rv.val[0] = (poly16x4_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly16x4_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7612,7 +7729,10 @@
vzipq_s8 (int8x16_t __a, int8x16_t __b)
{
int8x16x2_t __rv;
- __builtin_neon_vzipv16qi (&__rv.val[0], __a, __b);
+ uint8x16_t __mask1 = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };
+ uint8x16_t __mask2 = { 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 };
+ __rv.val[0] = (int8x16_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int8x16_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7620,7 +7740,10 @@
vzipq_s16 (int16x8_t __a, int16x8_t __b)
{
int16x8x2_t __rv;
- __builtin_neon_vzipv8hi (&__rv.val[0], __a, __b);
+ uint16x8_t __mask1 = { 0, 8, 1, 9, 2, 10, 3, 11 };
+ uint16x8_t __mask2 = { 4, 12, 5, 13, 6, 14, 7, 15 };
+ __rv.val[0] = (int16x8_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int16x8_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7628,7 +7751,10 @@
vzipq_s32 (int32x4_t __a, int32x4_t __b)
{
int32x4x2_t __rv;
- __builtin_neon_vzipv4si (&__rv.val[0], __a, __b);
+ uint32x4_t __mask1 = { 0, 4, 1, 5 };
+ uint32x4_t __mask2 = { 2, 6, 3, 7 };
+ __rv.val[0] = (int32x4_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int32x4_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7636,7 +7762,10 @@
vzipq_f32 (float32x4_t __a, float32x4_t __b)
{
float32x4x2_t __rv;
- __builtin_neon_vzipv4sf (&__rv.val[0], __a, __b);
+ uint32x4_t __mask1 = { 0, 4, 1, 5 };
+ uint32x4_t __mask2 = { 2, 6, 3, 7 };
+ __rv.val[0] = (float32x4_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (float32x4_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7644,7 +7773,10 @@
vzipq_u8 (uint8x16_t __a, uint8x16_t __b)
{
uint8x16x2_t __rv;
- __builtin_neon_vzipv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+ uint8x16_t __mask1 = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };
+ uint8x16_t __mask2 = { 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 };
+ __rv.val[0] = (uint8x16_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint8x16_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7652,7 +7784,10 @@
vzipq_u16 (uint16x8_t __a, uint16x8_t __b)
{
uint16x8x2_t __rv;
- __builtin_neon_vzipv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+ uint16x8_t __mask1 = { 0, 8, 1, 9, 2, 10, 3, 11 };
+ uint16x8_t __mask2 = { 4, 12, 5, 13, 6, 14, 7, 15 };
+ __rv.val[0] = (uint16x8_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint16x8_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7660,7 +7795,10 @@
vzipq_u32 (uint32x4_t __a, uint32x4_t __b)
{
uint32x4x2_t __rv;
- __builtin_neon_vzipv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b);
+ uint32x4_t __mask1 = { 0, 4, 1, 5 };
+ uint32x4_t __mask2 = { 2, 6, 3, 7 };
+ __rv.val[0] = (uint32x4_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint32x4_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7668,7 +7806,10 @@
vzipq_p8 (poly8x16_t __a, poly8x16_t __b)
{
poly8x16x2_t __rv;
- __builtin_neon_vzipv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+ uint8x16_t __mask1 = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };
+ uint8x16_t __mask2 = { 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 };
+ __rv.val[0] = (poly8x16_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly8x16_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7676,7 +7817,10 @@
vzipq_p16 (poly16x8_t __a, poly16x8_t __b)
{
poly16x8x2_t __rv;
- __builtin_neon_vzipv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+ uint16x8_t __mask1 = { 0, 8, 1, 9, 2, 10, 3, 11 };
+ uint16x8_t __mask2 = { 4, 12, 5, 13, 6, 14, 7, 15 };
+ __rv.val[0] = (poly16x8_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly16x8_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7684,7 +7828,10 @@
vuzp_s8 (int8x8_t __a, int8x8_t __b)
{
int8x8x2_t __rv;
- __builtin_neon_vuzpv8qi (&__rv.val[0], __a, __b);
+ uint8x8_t __mask1 = { 0, 2, 4, 6, 8, 10, 12, 14 };
+ uint8x8_t __mask2 = { 1, 3, 5, 7, 9, 11, 13, 15 };
+ __rv.val[0] = (int8x8_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int8x8_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7692,7 +7839,10 @@
vuzp_s16 (int16x4_t __a, int16x4_t __b)
{
int16x4x2_t __rv;
- __builtin_neon_vuzpv4hi (&__rv.val[0], __a, __b);
+ uint16x4_t __mask1 = { 0, 2, 4, 6 };
+ uint16x4_t __mask2 = { 1, 3, 5, 7 };
+ __rv.val[0] = (int16x4_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int16x4_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7700,7 +7850,10 @@
vuzp_s32 (int32x2_t __a, int32x2_t __b)
{
int32x2x2_t __rv;
- __builtin_neon_vuzpv2si (&__rv.val[0], __a, __b);
+ uint32x2_t __mask1 = { 0, 2 };
+ uint32x2_t __mask2 = { 1, 3 };
+ __rv.val[0] = (int32x2_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int32x2_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7708,7 +7861,10 @@
vuzp_f32 (float32x2_t __a, float32x2_t __b)
{
float32x2x2_t __rv;
- __builtin_neon_vuzpv2sf (&__rv.val[0], __a, __b);
+ uint32x2_t __mask1 = { 0, 2 };
+ uint32x2_t __mask2 = { 1, 3 };
+ __rv.val[0] = (float32x2_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (float32x2_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7716,7 +7872,10 @@
vuzp_u8 (uint8x8_t __a, uint8x8_t __b)
{
uint8x8x2_t __rv;
- __builtin_neon_vuzpv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+ uint8x8_t __mask1 = { 0, 2, 4, 6, 8, 10, 12, 14 };
+ uint8x8_t __mask2 = { 1, 3, 5, 7, 9, 11, 13, 15 };
+ __rv.val[0] = (uint8x8_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint8x8_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7724,7 +7883,10 @@
vuzp_u16 (uint16x4_t __a, uint16x4_t __b)
{
uint16x4x2_t __rv;
- __builtin_neon_vuzpv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+ uint16x4_t __mask1 = { 0, 2, 4, 6 };
+ uint16x4_t __mask2 = { 1, 3, 5, 7 };
+ __rv.val[0] = (uint16x4_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint16x4_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7732,7 +7894,10 @@
vuzp_u32 (uint32x2_t __a, uint32x2_t __b)
{
uint32x2x2_t __rv;
- __builtin_neon_vuzpv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b);
+ uint32x2_t __mask1 = { 0, 2 };
+ uint32x2_t __mask2 = { 1, 3 };
+ __rv.val[0] = (uint32x2_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint32x2_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7740,7 +7905,10 @@
vuzp_p8 (poly8x8_t __a, poly8x8_t __b)
{
poly8x8x2_t __rv;
- __builtin_neon_vuzpv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+ uint8x8_t __mask1 = { 0, 2, 4, 6, 8, 10, 12, 14 };
+ uint8x8_t __mask2 = { 1, 3, 5, 7, 9, 11, 13, 15 };
+ __rv.val[0] = (poly8x8_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly8x8_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7748,7 +7916,10 @@
vuzp_p16 (poly16x4_t __a, poly16x4_t __b)
{
poly16x4x2_t __rv;
- __builtin_neon_vuzpv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+ uint16x4_t __mask1 = { 0, 2, 4, 6 };
+ uint16x4_t __mask2 = { 1, 3, 5, 7 };
+ __rv.val[0] = (poly16x4_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly16x4_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7756,7 +7927,10 @@
vuzpq_s8 (int8x16_t __a, int8x16_t __b)
{
int8x16x2_t __rv;
- __builtin_neon_vuzpv16qi (&__rv.val[0], __a, __b);
+ uint8x16_t __mask1 = { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 };
+ uint8x16_t __mask2 = { 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 };
+ __rv.val[0] = (int8x16_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int8x16_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7764,7 +7938,10 @@
vuzpq_s16 (int16x8_t __a, int16x8_t __b)
{
int16x8x2_t __rv;
- __builtin_neon_vuzpv8hi (&__rv.val[0], __a, __b);
+ uint16x8_t __mask1 = { 0, 2, 4, 6, 8, 10, 12, 14 };
+ uint16x8_t __mask2 = { 1, 3, 5, 7, 9, 11, 13, 15 };
+ __rv.val[0] = (int16x8_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int16x8_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7772,7 +7949,10 @@
vuzpq_s32 (int32x4_t __a, int32x4_t __b)
{
int32x4x2_t __rv;
- __builtin_neon_vuzpv4si (&__rv.val[0], __a, __b);
+ uint32x4_t __mask1 = { 0, 2, 4, 6 };
+ uint32x4_t __mask2 = { 1, 3, 5, 7 };
+ __rv.val[0] = (int32x4_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (int32x4_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7780,7 +7960,10 @@
vuzpq_f32 (float32x4_t __a, float32x4_t __b)
{
float32x4x2_t __rv;
- __builtin_neon_vuzpv4sf (&__rv.val[0], __a, __b);
+ uint32x4_t __mask1 = { 0, 2, 4, 6 };
+ uint32x4_t __mask2 = { 1, 3, 5, 7 };
+ __rv.val[0] = (float32x4_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (float32x4_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7788,7 +7971,10 @@
vuzpq_u8 (uint8x16_t __a, uint8x16_t __b)
{
uint8x16x2_t __rv;
- __builtin_neon_vuzpv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+ uint8x16_t __mask1 = { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 };
+ uint8x16_t __mask2 = { 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 };
+ __rv.val[0] = (uint8x16_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint8x16_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7796,7 +7982,10 @@
vuzpq_u16 (uint16x8_t __a, uint16x8_t __b)
{
uint16x8x2_t __rv;
- __builtin_neon_vuzpv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+ uint16x8_t __mask1 = { 0, 2, 4, 6, 8, 10, 12, 14 };
+ uint16x8_t __mask2 = { 1, 3, 5, 7, 9, 11, 13, 15 };
+ __rv.val[0] = (uint16x8_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint16x8_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7804,7 +7993,10 @@
vuzpq_u32 (uint32x4_t __a, uint32x4_t __b)
{
uint32x4x2_t __rv;
- __builtin_neon_vuzpv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b);
+ uint32x4_t __mask1 = { 0, 2, 4, 6 };
+ uint32x4_t __mask2 = { 1, 3, 5, 7 };
+ __rv.val[0] = (uint32x4_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (uint32x4_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7812,7 +8004,10 @@
vuzpq_p8 (poly8x16_t __a, poly8x16_t __b)
{
poly8x16x2_t __rv;
- __builtin_neon_vuzpv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+ uint8x16_t __mask1 = { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 };
+ uint8x16_t __mask2 = { 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 };
+ __rv.val[0] = (poly8x16_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly8x16_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
@@ -7820,7 +8015,10 @@
vuzpq_p16 (poly16x8_t __a, poly16x8_t __b)
{
poly16x8x2_t __rv;
- __builtin_neon_vuzpv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+ uint16x8_t __mask1 = { 0, 2, 4, 6, 8, 10, 12, 14 };
+ uint16x8_t __mask2 = { 1, 3, 5, 7, 9, 11, 13, 15 };
+ __rv.val[0] = (poly16x8_t) __builtin_shuffle (__a, __b, __mask1);
+ __rv.val[1] = (poly16x8_t) __builtin_shuffle (__a, __b, __mask2);
return __rv;
}
Index: neon-gen.ml
===================================================================
--- neon-gen.ml (revision 188392)
+++ neon-gen.ml (working copy)
@@ -98,8 +98,6 @@
close_braceblock ffmt;
end_function ffmt
-let return_by_ptr features = List.mem ReturnPtr features
-
let union_string num elts base =
let itype = inttype_for_array num elts in
let iname = string_of_inttype itype
@@ -141,29 +139,78 @@
(* Return a tuple of a list of declarations to go at the start of the function,
and a list of statements needed to return THING. *)
-let return arity return_by_ptr thing =
+let return arity thing =
match arity with
Arity0 (ret) | Arity1 (ret, _) | Arity2 (ret, _, _) | Arity3 (ret, _, _, _)
| Arity4 (ret, _, _, _, _) ->
- match ret with
- T_arrayof (num, vec) ->
- if return_by_ptr then
- let sname = string_of_vectype ret in
- [Printf.sprintf "%s __rv;" sname],
- [thing ^ ";"; "return __rv;"]
- else
+ begin match ret with
+ T_arrayof (num, vec) ->
let uname = union_string num vec "__rv" in
[uname ^ ";"], ["__rv.__o = " ^ thing ^ ";"; "return __rv.__i;"]
- | T_void -> [], [thing ^ ";"]
- | _ ->
- [], ["return " ^ (cast_for_return ret) ^ thing ^ ";"]
+ | T_void ->
+ [], [thing ^ ";"]
+ | _ ->
+ [], ["return " ^ (cast_for_return ret) ^ thing ^ ";"]
+ end
+let mask_shape_for_shuffle = function
+ All (num, reg) -> All (num, reg)
+ | Pair_result reg -> All (2, reg)
+ | _ -> failwith "mask_for_shuffle"
+
+let mask_elems shuffle shape elttype part =
+ let elem_size = elt_width elttype in
+ let num_elems =
+ match regmap shape 0 with
+ Dreg -> 64 / elem_size
+ | Qreg -> 128 / elem_size
+ | _ -> failwith "mask_elems" in
+ shuffle elem_size num_elems part
+
+(* Return a tuple of a list of declarations and a list of statements needed
+ to implement an intrinsic using __builtin_shuffle. SHUFFLE is a function
+ which returns a list of elements suitable for using as a mask. *)
+
+let shuffle_fn shuffle shape arity elttype =
+ let mshape = mask_shape_for_shuffle shape in
+ let masktype = type_for_elt mshape (unsigned_of_elt elttype) 0 in
+ let masktype_str = string_of_vectype masktype in
+ let shuffle_res = type_for_elt mshape elttype 0 in
+ let shuffle_res_str = string_of_vectype shuffle_res in
+ match arity with
+ Arity0 (ret) | Arity1 (ret, _) | Arity2 (ret, _, _) | Arity3 (ret, _, _, _)
+ | Arity4 (ret, _, _, _, _) ->
+ begin match ret with
+ T_arrayof (num, vec) ->
+ let elems1 = mask_elems shuffle mshape elttype `lo
+ and elems2 = mask_elems shuffle mshape elttype `hi in
+ let mask1 = Printf.sprintf "%s __mask1 = { %s };" masktype_str
+ (String.concat ", " (List.map string_of_int elems1))
+ and mask2 = Printf.sprintf "%s __mask2 = { %s };" masktype_str
+ (String.concat ", " (List.map string_of_int elems2)) in
+ let shuf1 = Printf.sprintf
+ "__rv.val[0] = (%s) __builtin_shuffle (__a, __b, __mask1);"
+ shuffle_res_str
+ and shuf2 = Printf.sprintf
+ "__rv.val[1] = (%s) __builtin_shuffle (__a, __b, __mask2);"
+ shuffle_res_str in
+ [Printf.sprintf "%s __rv;" (string_of_vectype ret); mask1; mask2],
+ [shuf1; shuf2; "return __rv;"]
+ | _ ->
+ let elems = mask_elems shuffle mshape elttype `lo in
+ let mask = Printf.sprintf "%s __mask = { %s };" masktype_str
+ (String.concat ", " (List.map string_of_int elems)) in
+ let shuf = Printf.sprintf
+ "return (%s) __builtin_shuffle (__a, __mask);" shuffle_res_str in
+ [mask], [shuf]
+ end
+
let rec element_type ctype =
match ctype with
T_arrayof (_, v) -> element_type v
| _ -> ctype
-let params return_by_ptr ps =
+let params ps =
let pdecls = ref [] in
let ptype t p =
match t with
@@ -180,13 +227,7 @@
| Arity3 (_, t1, t2, t3) -> [ptype t1 "__a"; ptype t2 "__b"; ptype t3 "__c"]
| Arity4 (_, t1, t2, t3, t4) ->
[ptype t1 "__a"; ptype t2 "__b"; ptype t3 "__c"; ptype t4 "__d"] in
- match ps with
- Arity0 ret | Arity1 (ret, _) | Arity2 (ret, _, _) | Arity3 (ret, _, _, _)
- | Arity4 (ret, _, _, _, _) ->
- if return_by_ptr then
- !pdecls, add_cast (T_ptrto (element_type ret)) "&__rv.val[0]" :: plist
- else
- !pdecls, plist
+ !pdecls, plist
let modify_params features plist =
let is_flipped =
@@ -239,17 +280,27 @@
and srcmode = mode_of_elt src shape in
string_of_mode dstmode ^ string_of_mode srcmode
+let get_shuffle features =
+ try
+ match List.find (function Use_shuffle _ -> true | _ -> false) features with
+ Use_shuffle fn -> Some fn
+ | _ -> None
+ with Not_found -> None
+
let print_variant opcode features shape name (ctype, asmtype, elttype) =
let bits = infoword_value elttype features in
let modesuf = mode_suffix elttype shape in
- let return_by_ptr = return_by_ptr features in
- let pdecls, paramlist = params return_by_ptr ctype in
- let paramlist' = modify_params features paramlist in
- let paramlist'' = extra_word shape features paramlist' bits in
- let parstr = String.concat ", " paramlist'' in
- let builtin = Printf.sprintf "__builtin_neon_%s%s (%s)"
- (builtin_name features name) modesuf parstr in
- let rdecls, stmts = return ctype return_by_ptr builtin in
+ let pdecls, paramlist = params ctype in
+ let rdecls, stmts =
+ match get_shuffle features with
+ Some shuffle -> shuffle_fn shuffle shape ctype elttype
+ | None ->
+ let paramlist' = modify_params features paramlist in
+ let paramlist'' = extra_word shape features paramlist' bits in
+ let parstr = String.concat ", " paramlist'' in
+ let builtin = Printf.sprintf "__builtin_neon_%s%s (%s)"
+ (builtin_name features name) modesuf parstr in
+ return ctype builtin in
let body = pdecls @ rdecls @ stmts
and fnname = (intrinsic_name name) ^ "_" ^ (string_of_elt elttype) in
print_function ctype fnname body