Re: [Ping][PATCH][Arm] ACLE intrinsics for AdvSIMD bfloat16 dot product

Dennis Zhang Tue, 25 Feb 2020 09:19:30 -0800

Hi Kyrill,

On 25/02/2020 12:18, Kyrill Tkachov wrote:

Hi Dennis,


On 2/25/20 11:54 AM, Dennis Zhang wrote:

Hi all,

On 07/01/2020 12:12, Dennis Zhang wrote:
> Hi all,
>
> This patch is part of a series adding support for Armv8.6-A features.
> It depends on the patch enabling Arm BFmode
> https://gcc.gnu.org/ml/gcc-patches/2019-12/msg01448.html
>
> This patch adds intrinsics for brain half-precision float-point dot
> product.
> ACLE documents are at https://developer.arm.com/docs/101028/latest
> ISA documents are at https://developer.arm.com/docs/ddi0596/latest
>
> Regression tested for arm-none-linux-gnueabi-armv8-a.
>
> Is it OK for trunk please?
>
> Thanks,
> Dennis
>
> gcc/ChangeLog:
>
> 2020-01-03  Dennis Zhang  <dennis.zh...@arm.com>
>
>      * config/arm/arm_neon.h (vbfdot_f32, vbfdotq_f32): New
>      (vbfdot_lane_f32, vbfdotq_laneq_f32): New.
>      (vbfdot_laneq_f32, vbfdotq_lane_f32): New.
>      * config/arm/arm_neon_builtins.def (vbfdot): New.
>      (vbfdot_lanev4bf, vbfdot_lanev8bf): New.
>      * config/arm/iterators.md (VSF2BF): New mode attribute.
>      * config/arm/neon.md (neon_vbfdot<VCVTF:mode>): New.
>      (neon_vbfdot_lanev4bf<VCVTF:mode>): New.
>      (neon_vbfdot_lanev8bf<VCVTF:mode>): New.
>
> gcc/testsuite/ChangeLog:
>
> 2020-01-03  Dennis Zhang  <dennis.zh...@arm.com>
>
>      * gcc.target/arm/simd/bf16_dot_1.c: New test.
>      * gcc.target/arm/simd/bf16_dot_2.c: New test.
>

This patch updates tests in bf16_dot_1.c to make proper assembly check.
Is it OK for trunk, please?

Cheers
Dennis


Looks ok but...


new file mode 100644
index 00000000000..c533f9d0b2f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_2.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include "arm_neon.h"
+
+float32x2_t
+test_vbfdot_lane_f32 (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b)
+{

+ return __builtin_neon_vbfdot_lanev4bfv2sf (r, a, b, 2); /* { dg-error{out of range 0 - 1} } */

+}
+
+float32x4_t
+test_vbfdotq_lane_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{

+ return __builtin_neon_vbfdot_lanev4bfv4sf (r, a, b, 2); /* { dg-error{out of range 0 - 1} } */

+}
+
+float32x2_t
+test_vbfdot_laneq_f32 (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
+{

+ return __builtin_neon_vbfdot_lanev8bfv2sf (r, a, b, 4); /* { dg-error{out of range 0 - 3} } */

+}
+
+float32x4_t
+test_vbfdotq_laneq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{

+ return __builtin_neon_vbfdot_lanev8bfv4sf (r, a, b, 4); /* { dg-error{out of range 0 - 3} } */

+}

These tests shouldn't be calling the __builtin* directly, they are justan implementation detail.

What we want to test is the intrinsic itself.
Thanks,
Kyrill


Many thanks for the review.
The issue is fixed in the updated patch.
Is it ready please?

Dennis
Cheers

gcc/ChangeLog:

2020-02-25  Dennis Zhang  <dennis.zh...@arm.com>

        * config/arm/arm_neon.h (vbfdot_f32, vbfdotq_f32): New
        (vbfdot_lane_f32, vbfdotq_laneq_f32): New.
        (vbfdot_laneq_f32, vbfdotq_lane_f32): New.
        * config/arm/arm_neon_builtins.def (vbfdot): New entry.
        (vbfdot_lanev4bf, vbfdot_lanev8bf): Likewise.
        * config/arm/iterators.md (VSF2BF): New attribute.
        * config/arm/neon.md (neon_vbfdot<VCVTF:mode>): New entry.
        (neon_vbfdot_lanev4bf<VCVTF:mode>): Likewise.
        (neon_vbfdot_lanev8bf<VCVTF:mode>): Likewise.

gcc/testsuite/ChangeLog:

2020-02-25  Dennis Zhang  <dennis.zh...@arm.com>

        * gcc.target/arm/simd/bf16_dot_1.c: New test.
        * gcc.target/arm/simd/bf16_dot_2.c: New test.
        * gcc.target/arm/simd/bf16_dot_3.c: New test.

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index e81681aa415..d2ebee40538 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18819,6 +18819,58 @@ vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
 
 #pragma GCC pop_options
 
+/* AdvSIMD Brain half-precision float-point (Bfloat16) intrinsics.  */
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)
+{
+  return __builtin_neon_vbfdotv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+  return __builtin_neon_vbfdotv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_lane_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b,
+		 const int __index)
+{
+  return __builtin_neon_vbfdot_lanev4bfv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+		   const int __index)
+{
+  return __builtin_neon_vbfdot_lanev8bfv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_laneq_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b,
+		  const int __index)
+{
+  return __builtin_neon_vbfdot_lanev8bfv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+		  const int __index)
+{
+  return __builtin_neon_vbfdot_lanev4bfv4sf (__r, __a, __b, __index);
+}
+
+#pragma GCC pop_options
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index f4a97fd764c..4a6f4cfc44e 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -381,3 +381,7 @@ VAR2 (MAC_LANE_PAIR, vcmlaq_lane270, v4sf, v8hf)
 VAR1 (TERNOP, smmla, v16qi)
 VAR1 (UTERNOP, ummla, v16qi)
 VAR1 (USTERNOP, usmmla, v16qi)
+
+VAR2 (TERNOP, vbfdot, v2sf, v4sf)
+VAR2 (MAC_LANE_PAIR, vbfdot_lanev4bf, v2sf, v4sf)
+VAR2 (MAC_LANE_PAIR, vbfdot_lanev8bf, v2sf, v4sf)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 136c45274ae..b435a05d219 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -835,6 +835,8 @@
 (define_mode_attr VSI2QI [(V2SI "V8QI") (V4SI "V16QI")])
 (define_mode_attr vsi2qi [(V2SI "v8qi") (V4SI "v16qi")])
 
+(define_mode_attr VSF2BF [(V2SF "V4BF") (V4SF "V8BF")])
+
 ;;----------------------------------------------------------------------------
 ;; Code attributes
 ;;----------------------------------------------------------------------------
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index 039cd90c3da..80e94de4b84 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -6596,3 +6596,51 @@ if (BYTES_BIG_ENDIAN)
   "v<sup>mmla.<mmla_sfx>\t%q0, %q2, %q3"
   [(set_attr "type" "neon_mla_s_q")]
 )
+
+(define_insn "neon_vbfdot<VCVTF:mode>"
+  [(set (match_operand:VCVTF 0 "register_operand" "=w")
+	(plus:VCVTF (match_operand:VCVTF 1 "register_operand" "0")
+		    (unspec:VCVTF [
+			    (match_operand:<VSF2BF> 2 "register_operand" "w")
+			    (match_operand:<VSF2BF> 3 "register_operand" "w")]
+		     UNSPEC_DOT_S)))]
+  "TARGET_BF16_SIMD"
+  "vdot.bf16\\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
+  [(set_attr "type" "neon_dot<q>")]
+)
+
+(define_insn "neon_vbfdot_lanev4bf<VCVTF:mode>"
+  [(set (match_operand:VCVTF 0 "register_operand" "=w")
+	(plus:VCVTF (match_operand:VCVTF 1 "register_operand" "0")
+		    (unspec:VCVTF [
+			    (match_operand:<VSF2BF> 2 "register_operand" "w")
+			    (match_operand:V4BF 3 "register_operand" "x")
+			    (match_operand:SI 4 "immediate_operand" "i")]
+		     UNSPEC_DOT_S)))]
+  "TARGET_BF16_SIMD"
+  "vdot.bf16\\t%<V_reg>0, %<V_reg>2, %P3[%c4]"
+  [(set_attr "type" "neon_dot<q>")]
+)
+
+(define_insn "neon_vbfdot_lanev8bf<VCVTF:mode>"
+  [(set (match_operand:VCVTF 0 "register_operand" "=w")
+	(plus:VCVTF (match_operand:VCVTF 1 "register_operand" "0")
+		    (unspec:VCVTF [
+			    (match_operand:<VSF2BF> 2 "register_operand" "w")
+			    (match_operand:V8BF 3 "register_operand" "x")
+			    (match_operand:SI 4 "immediate_operand" "i")]
+		     UNSPEC_DOT_S)))]
+  "TARGET_BF16_SIMD"
+  {
+    int lane = INTVAL (operands[4]);
+    int half = GET_MODE_NUNITS (GET_MODE (operands[3])) / 4;
+    if (lane < half)
+      return "vdot.bf16\\t%<V_reg>0, %<V_reg>2, %e3[%c4]";
+    else
+      {
+	operands[4] = GEN_INT (lane - half);
+	return "vdot.bf16\\t%<V_reg>0, %<V_reg>2, %f3[%c4]";
+      }
+  }
+  [(set_attr "type" "neon_dot<q>")]
+)
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_dot_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_1.c
new file mode 100644
index 00000000000..4487152d6cb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_1.c
@@ -0,0 +1,100 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-options "-save-temps -O2" } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include "arm_neon.h"
+
+/* BF16 DOT without lane.  */
+float32x2_t
+test_vbfdot_f32 (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b)
+{
+  /* vdot.bf16 d, d, d */
+  return vbfdot_f32 (r, a, b);
+}
+
+float32x4_t
+test_vbfdotq_f32 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  /* vdot.bf16 q, q, q */
+  return vbfdotq_f32 (r, a, b);
+}
+
+/* 64-bit BF16 DOT with lane.  */
+float32x2_t
+test_vbfdot_lane_f32_0 (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b)
+{
+  /* vdot.bf16 d, d, d[0] */
+  return vbfdot_lane_f32 (r, a, b, 0);
+}
+
+float32x2_t
+test_vbfdot_lane_f32_1 (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b)
+{
+  /* vdot.bf16 d, d, d[1] */
+  return vbfdot_lane_f32 (r, a, b, 1);
+}
+
+float32x2_t
+test_vbfdot_laneq_f32_0 (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
+{
+  /* vdot.bf16 d, d, d[0] */
+  return vbfdot_laneq_f32 (r, a, b, 0);
+}
+
+float32x2_t
+test_vbfdot_laneq_f32_1 (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
+{
+  /* vdot.bf16 d, d, d[1] */
+  return vbfdot_laneq_f32 (r, a, b, 1);
+}
+
+float32x2_t
+test_vbfdot_laneq_f32_2 (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
+{
+  /* vdot.bf16 d, d, d[0] */
+  return vbfdot_laneq_f32 (r, a, b, 2);
+}
+
+float32x2_t
+test_vbfdot_laneq_f32_3 (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
+{
+  /* vdot.bf16 d, d, d[1] */
+  return vbfdot_laneq_f32 (r, a, b, 3);
+}
+
+/* 128-bit BF16 DOT with lane.  */
+float32x4_t
+test_vbfdotq_lane_f32_0 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  /* vdot.bf16 q, q, d[0] */
+  return vbfdotq_lane_f32 (r, a, b, 0);
+}
+
+float32x4_t
+test_vbfdotq_lane_f32_1 (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  /* vdot.bf16 q, q, d[1] */
+  return vbfdotq_lane_f32 (r, a, b, 1);
+}
+
+float32x4_t
+test_vbfdotq_laneq_f32_0 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  /* vdot.bf16 q, q, d[0] */
+  return vbfdotq_laneq_f32 (r, a, b, 0);
+}
+
+float32x4_t
+test_vbfdotq_laneq_f32_3 (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  /* vdot.bf16 q, q, d[1] */
+  return vbfdotq_laneq_f32 (r, a, b, 3);
+}
+
+/* { dg-final { scan-assembler-times {\tvdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvdot.bf16\tq[0-9]+, q[0-9]+, q[0-9]+\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tvdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+\[0\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tvdot.bf16\td[0-9]+, d[0-9]+, d[0-9]+\[1\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tvdot.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[0\]\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tvdot.bf16\tq[0-9]+, q[0-9]+, d[0-9]+\[1\]\n} 2 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_dot_2.c b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_2.c
new file mode 100644
index 00000000000..d2ef344c68b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_2.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include "arm_neon.h"
+
+float32x2_t
+test_vbfdot_lane_f32_a (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b)
+{
+  /* { dg-error "lane -1 out of range 0 - 1" "" {target *-*-*} 0 } */
+  return vbfdot_lane_f32 (r, a, b, -1);
+}
+
+float32x2_t
+test_vbfdot_lane_f32_b (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b)
+{
+  /* { dg-error "lane 2 out of range 0 - 1" "" {target *-*-*} 0 } */
+  return vbfdot_lane_f32 (r, a, b, 2);
+}
+
+float32x2_t
+test_vbfdot_laneq_f32_a (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
+{
+  /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */
+  return vbfdot_laneq_f32 (r, a, b, -1);
+}
+
+float32x2_t
+test_vbfdot_laneq_f32_b (float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
+{
+  /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */
+  return vbfdot_laneq_f32 (r, a, b, 4);
+}
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_dot_3.c b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_3.c
new file mode 100644
index 00000000000..93f08f02bc7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/bf16_dot_3.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include "arm_neon.h"
+
+float32x4_t
+test_vbfdotq_lane_f32_a (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  /* { dg-error "lane -1 out of range 0 - 1" "" {target *-*-*} 0 } */
+  return vbfdotq_lane_f32 (r, a, b, -1);
+}
+
+float32x4_t
+test_vbfdotq_lane_f32_b (float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
+{
+  /* { dg-error "lane 2 out of range 0 - 1" "" {target *-*-*} 0 } */
+  return vbfdotq_lane_f32 (r, a, b, 2);
+}
+
+float32x4_t
+test_vbfdotq_laneq_f32_a (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */
+  return vbfdotq_laneq_f32 (r, a, b, -1);
+}
+
+float32x4_t
+test_vbfdotq_laneq_f32_b (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
+{
+  /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */
+  return vbfdotq_laneq_f32 (r, a, b, 4);
+}

Re: [Ping][PATCH][Arm] ACLE intrinsics for AdvSIMD bfloat16 dot product

Reply via email to