Hi all,

This patch implements the ACLE intrinsics vget_low_bf16 and vget_high_bf16,
which extract the lower or upper half of a bfloat16x8_t vector.
vget_high_bf16 is implemented with a 'dup' instruction. vget_low_bf16 could be
implemented with a 'dup' or a 'mov', but it is usually optimized away because
the lower half of the vector register can be used directly.
The test for vget_low_bf16 therefore only checks that the intrinsic compiles;
no instruction is checked because none is generated in the test case.

The Arm ACLE documentation for these intrinsics is at:
https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics

Regtested and bootstrapped.

Is it OK for trunk please?

Thanks
Dennis

gcc/ChangeLog:

2020-10-29  Dennis Zhang  <dennis.zh...@arm.com>

        * config/aarch64/aarch64-simd-builtins.def (vget_half): New entry.
        * config/aarch64/aarch64-simd.md (aarch64_vget_halfv8bf): New entry.
        * config/aarch64/arm_neon.h (vget_low_bf16): New intrinsic.
        (vget_high_bf16): Likewise.
        * config/aarch64/predicates.md (aarch64_zero_or_1): New predicate
        for zero or one immediate to indicate the lower or higher half.

gcc/testsuite/ChangeLog:

2020-10-29  Dennis Zhang  <dennis.zh...@arm.com>

        * gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c
        (test_vget_low_bf16, test_vget_high_bf16): New tests.
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 332a0b6b1ea..39ebb776d1d 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -719,6 +719,9 @@
   VAR1 (QUADOP_LANE, bfmlalb_lane_q, 0, ALL, v4sf)
   VAR1 (QUADOP_LANE, bfmlalt_lane_q, 0, ALL, v4sf)
 
+  /* Implemented by aarch64_vget_halfv8bf.  */
+  VAR1 (GETREG, vget_half, 0, ALL, v8bf)
+
   /* Implemented by aarch64_simd_<sur>mmlav16qi.  */
   VAR1 (TERNOP, simd_smmla, 0, NONE, v16qi)
   VAR1 (TERNOPU, simd_ummla, 0, NONE, v16qi)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 9f0e2bd1e6f..f62c52ca327 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -7159,6 +7159,19 @@
   [(set_attr "type" "neon_dot<VDQSF:q>")]
 )
 
+;; vget_low/high_bf16
+(define_expand "aarch64_vget_halfv8bf"
+  [(match_operand:V4BF 0 "register_operand")
+   (match_operand:V8BF 1 "register_operand")
+   (match_operand:SI 2 "aarch64_zero_or_1")]
+  "TARGET_BF16_SIMD"
+{
+  int hbase = INTVAL (operands[2]);
+  rtx sel = aarch64_gen_stepped_int_parallel (4, hbase * 4, 1);
+  emit_insn (gen_aarch64_get_halfv8bf (operands[0], operands[1], sel));
+  DONE;
+})
+
 ;; bfmmla
 (define_insn "aarch64_bfmmlaqv4sf"
   [(set (match_operand:V4SF 0 "register_operand" "=w")
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 50f8b23bc17..c6ac0b8dd17 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -35530,6 +35530,20 @@ vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
   return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index);
 }
 
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_low_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_aarch64_vget_halfv8bf (__a, 0);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_high_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_aarch64_vget_halfv8bf (__a, 1);
+}
+
 __extension__ extern __inline bfloat16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcvt_bf16_f32 (float32x4_t __a)
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index 215fcec5955..0c8bc2b0c73 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -84,6 +84,10 @@
 		 (ior (match_test "op == constm1_rtx")
 		      (match_test "op == const1_rtx"))))))
 
+(define_predicate "aarch64_zero_or_1"
+  (and (match_code "const_int")
+       (match_test "op == const0_rtx || op == const1_rtx")))
+
 (define_predicate "aarch64_reg_or_orr_imm"
    (ior (match_operand 0 "register_operand")
 	(and (match_code "const_vector")
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c
index c42c7acbbe9..35f4cb864f2 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c
@@ -83,3 +83,14 @@ bfloat16_t test_vduph_laneq_bf16 (bfloat16x8_t a)
   return vduph_laneq_bf16 (a, 7);
 }
 /* { dg-final { scan-assembler-times "dup\\th\[0-9\]+, v\[0-9\]+\.h\\\[7\\\]" 2 } } */
+
+bfloat16x4_t test_vget_low_bf16 (bfloat16x8_t a)
+{
+  return vget_low_bf16 (a);
+}
+
+bfloat16x4_t test_vget_high_bf16 (bfloat16x8_t a)
+{
+  return vget_high_bf16 (a);
+}
+/* { dg-final { scan-assembler-times "dup\\td\[0-9\]+, v\[0-9\]+\.d\\\[1\\\]" 1 } } */

Reply via email to