This allows reduction of non-(plus|min|max) operations using log_2(N) shifts
rather than N vec_extracts; e.g. for the example code
int
main (unsigned char argc, char **argv)
{
unsigned char in[16] = { 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 };
unsigned char i = 0;
unsigned char sum = 1;
/* Prevent constant propagation of the entire loop below. */
asm volatile ("" : : : "memory");
for (i = 0; i < 16; i++)
sum *= in[i];
if (sum != 33)
__builtin_printf("Failed %d\n", sum);
}
(a simplified, less-general version of vect-reduc-mul_1.c) this gives
main:
ldr q0, .LC0
sub sp, sp, #16
str q0, [sp]
ldr q1, [sp]
movi v0.4s, 0
ext v2.16b, v1.16b, v0.16b, #8
mul v1.16b, v1.16b, v2.16b
ext v2.16b, v1.16b, v0.16b, #4
mul v1.16b, v2.16b, v1.16b
ext v2.16b, v1.16b, v0.16b, #2
mul v1.16b, v2.16b, v1.16b
ext v0.16b, v1.16b, v0.16b, #1
mul v0.16b, v0.16b, v1.16b
umov w1, v0.b[0]
cmp w1, 33
beq .L2
...
rather than previously:
main:
ldr q0, .LC0
sub sp, sp, #16
str q0, [sp]
ldr d1, [sp]
ldr d0, [sp, 8]
mul v0.8b, v0.8b, v1.8b
umov w0, v0.b[1]
umov w3, v0.b[0]
umov w2, v0.b[2]
umov w7, v0.b[3]
umov w6, v0.b[4]
mul w3, w0, w3
umov w5, v0.b[5]
umov w4, v0.b[6]
umov w1, v0.b[7]
mul w3, w3, w2
mul w2, w3, w7
mul w2, w2, w6
mul w0, w2, w5
mul w0, w0, w4
mul w1, w0, w1
uxtb w1, w1
cmp w1, 33
beq .L2
...
Tested check-gcc on aarch64-none-elf and aarch64_be-none-elf. (Including new
tests from previous patches.)
gcc/ChangeLog:
* config/aarch64/aarch64-simd.md (vec_shr_<mode>): New (*2).
gcc/testsuite/ChangeLog:
* lib/target-supports.exp (check_effective_target_whole_vector_shift):
Add aarch64*-*-*.
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index d4a745be59897b4cb2a0de23adb56b5d79203592..3fcf809113d73b37a95653b8c2be432478d2bc1e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -770,6 +770,45 @@
}
)
+;; For 64-bit modes we use ushl/r, as this does not require a SIMD zero.
+(define_insn "vec_shr_<mode>"
+ [(set (match_operand:VD 0 "register_operand" "=w")
+ (lshiftrt:VD (match_operand:VD 1 "register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")))]
+ "TARGET_SIMD"
+ "ushr %d0, %d1, %2"
+ [(set_attr "type" "neon_shift_imm")]
+)
+
+(define_expand "vec_shr_<mode>"
+ [(set (match_operand:VQ 0 "register_operand" "=w")
+ (lshiftrt:VQ (match_operand:VQ 1 "register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")))]
+ "TARGET_SIMD"
+{
+ HOST_WIDE_INT num_bits = INTVAL (operands[2]);
+ HOST_WIDE_INT elem_bits = GET_MODE_BITSIZE (GET_MODE_INNER (<MODE>mode));
+ rtx zero_reg = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+
+ gcc_assert (GET_MODE_BITSIZE (<MODE>mode) == 128);
+ gcc_assert (num_bits % elem_bits == 0);
+
+ if (num_bits == 0)
+ {
+ emit_move_insn (operands[0], operands[1]);
+ DONE;
+ }
+ else if (num_bits == 128)
+ {
+ emit_move_insn (operands[0], CONST0_RTX (<MODE>mode));
+ DONE;
+ }
+
+ emit_insn (gen_aarch64_ext<mode> (operands[0], operands[1], zero_reg,
+ GEN_INT (num_bits / elem_bits)));
+ DONE;
+})
+
(define_insn "aarch64_simd_vec_setv2di"
[(set (match_operand:V2DI 0 "register_operand" "=w,w")
(vec_merge:V2DI
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index 5e40f5fcdfc95e41e804075bb5daa7030eb9bc66..720cc345bf6a76470cc85116d7b3365be07caa97 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -3323,6 +3323,7 @@ proc check_effective_target_vect_shift { } {
proc check_effective_target_whole_vector_shift { } {
if { [istarget x86_64-*-*]
|| [istarget ia64-*-*]
+ || [istarget aarch64*-*-*]
|| ([check_effective_target_arm32]
&& [check_effective_target_arm_little_endian])
|| ([istarget mips*-*-*]