Hi all,
This patch implements the ACLE intrinsics vget_low_bf16 and vget_high_bf16, which
extract the lower or upper half of a bfloat16x8_t vector.
vget_high_bf16 is implemented with a 'dup' instruction. vget_low_bf16 could be
implemented with a 'dup' or a 'mov', but it is usually optimized away because the
lower half of the vector register can be used directly.
The test for vget_low_bf16 only checks that the interface can be compiled but
no instruction is checked since none is generated in the test case.
The Arm ACLE documentation is available at:
https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
Regtested and bootstrapped.
Is it OK for trunk please?
Thanks
Dennis
gcc/ChangeLog:
2020-10-29 Dennis Zhang <dennis.zh...@arm.com>
* config/aarch64/aarch64-simd-builtins.def (vget_half): New entry.
* config/aarch64/aarch64-simd.md (aarch64_vget_halfv8bf): New entry.
* config/aarch64/arm_neon.h (vget_low_bf16): New intrinsic.
(vget_high_bf16): Likewise.
* config/aarch64/predicates.md (aarch64_zero_or_1): New predicate
for zero or one immediate to indicate the lower or higher half.
gcc/testsuite/ChangeLog:
2020-10-29 Dennis Zhang <dennis.zh...@arm.com>
* gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c
(test_vget_low_bf16, test_vget_high_bf16): New tests.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 332a0b6b1ea..39ebb776d1d 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -719,6 +719,9 @@
VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, ALL, v4sf)
VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, ALL, v4sf)
+ /* Implemented by aarch64_vget_halfv8bf. */
+ VAR1 (GETREG, vget_half, 0, ALL, v8bf)
+
/* Implemented by aarch64_simd_<sur>mmlav16qi. */
VAR1 (TERNOP, simd_smmla, 0, NONE, v16qi)
VAR1 (TERNOPU, simd_ummla, 0, NONE, v16qi)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 9f0e2bd1e6f..f62c52ca327 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7159,6 +7159,19 @@
[(set_attr "type" "neon_dot<VDQSF:q>")]
)
+;; vget_low/high_bf16
+(define_expand "aarch64_vget_halfv8bf"
+ [(match_operand:V4BF 0 "register_operand")
+ (match_operand:V8BF 1 "register_operand")
+ (match_operand:SI 2 "aarch64_zero_or_1")]
+ "TARGET_BF16_SIMD"
+{
+ int hbase = INTVAL (operands[2]);
+ rtx sel = aarch64_gen_stepped_int_parallel (4, hbase * 4, 1);
+ emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], sel));
+ DONE;
+})
+
;; bfmmla
(define_insn "aarch64_bfmmlaqv4sf"
[(set (match_operand:V4SF 0 "register_operand" "=w")
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 50f8b23bc17..c6ac0b8dd17 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -35530,6 +35530,20 @@ vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index);
}
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_low_bf16 (bfloat16x8_t __a)
+{
+ return __builtin_aarch64_vget_halfv8bf (__a, 0);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_high_bf16 (bfloat16x8_t __a)
+{
+ return __builtin_aarch64_vget_halfv8bf (__a, 1);
+}
+
__extension__ extern __inline bfloat16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvt_bf16_f32 (float32x4_t __a)
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 215fcec5955..0c8bc2b0c73 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -84,6 +84,10 @@
(ior (match_test "op == constm1_rtx")
(match_test "op == const1_rtx"))))))
+(define_predicate "aarch64_zero_or_1"
+ (and (match_code "const_int")
+ (match_test "op == const0_rtx || op == const1_rtx")))
+
(define_predicate "aarch64_reg_or_orr_imm"
(ior (match_operand 0 "register_operand")
(and (match_code "const_vector")
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c
index c42c7acbbe9..35f4cb864f2 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c
@@ -83,3 +83,14 @@ bfloat16_t test_vduph_laneq_bf16 (bfloat16x8_t a)
return vduph_laneq_bf16 (a, 7);
}
/* { dg-final { scan-assembler-times "dup\\th\[0-9\]+, v\[0-9\]+\.h\\\[7\\\]" 2 } } */
+
+bfloat16x4_t test_vget_low_bf16 (bfloat16x8_t a)
+{
+ return vget_low_bf16 (a);
+}
+
+bfloat16x4_t test_vget_high_bf16 (bfloat16x8_t a)
+{
+ return vget_high_bf16 (a);
+}
+/* { dg-final { scan-assembler-times "dup\\td\[0-9\]+, v\[0-9\]+\.d\\\[1\\\]" 1 } } */