[PATCH] D80928: [BFloat] Add convert/copy intrinsic support

Alexandros Lamprineas via Phabricator via cfe-commits Mon, 01 Jun 2020 08:34:44 -0700

labrinea created this revision.
labrinea added reviewers: fpetrogalli, LukeGeeson, stuij, momchil.velikov, 
SjoerdMeijer, miyuki.
Herald added subscribers: hiraditya, kristof.beyls.
Herald added projects: clang, LLVM.


This patch is part of a series implementing the Bfloat16 extension of the 
Armv8.6-a architecture, as detailed here:

https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/arm-architecture-developments-armv8-6-a

Specifically, it adds intrinsic support in Clang and LLVM for Arm and AArch64.

The bfloat type and its properties are specified in the Arm Architecture 
Reference Manual:

https://developer.arm.com/docs/ddi0487/latest/arm-architecture-reference-manual-armv8-for-armv8-a-architecture-profile

The following people contributed to this patch:

- Alexandros Lamprineas
- Luke Cheeseman
- Mikhail Maltsev
- Momchil Velikov


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D80928

Files:
  clang/include/clang/Basic/arm_neon.td
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/aarch64-bf16-lane-intrinsics.c
  clang/test/CodeGen/arm-bf16-conv-copy-intrinsics.c
  clang/test/Sema/aarch64-neon-bf16-ranges.c
  clang/utils/TableGen/NeonEmitter.cpp
  llvm/include/llvm/IR/IntrinsicsAArch64.td
  llvm/include/llvm/IR/IntrinsicsARM.td
  llvm/lib/Target/AArch64/AArch64InstrFormats.td
  llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
  llvm/test/CodeGen/AArch64/bf16-intrinsics.ll
  llvm/test/CodeGen/ARM/bf16-intrinsics-nofp16.ll
  llvm/test/CodeGen/ARM/bf16-intrinsics.ll

Index: llvm/test/CodeGen/ARM/bf16-intrinsics.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/ARM/bf16-intrinsics.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=armv8.6a-arm-none-eabi -mattr=+fullfp16 -mattr=+neon -mattr=+bf16 | FileCheck %s
+
+declare bfloat @llvm.arm.neon.vcvtbfp2bf.bf16.f32(float)
+declare <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16.v4f32(<4 x float>)
+
+; CHECK-LABEL: test_vcvth_bf16_f32
+; CHECK: vcvtb.bf16.f32  s0, s0
+define arm_aapcs_vfpcc float @test_vcvth_bf16_f32(float %a) {
+entry:
+  %vcvtbfp2bf.i = tail call bfloat @llvm.arm.neon.vcvtbfp2bf.bf16.f32(float %a)
+  %0 = bitcast bfloat %vcvtbfp2bf.i to i16
+  %tmp.0.insert.ext.i = zext i16 %0 to i32
+  %1 = bitcast i32 %tmp.0.insert.ext.i to float
+  ret float %1
+}
+
+; CHECK-LABEL: test_vcvt_bf16_f32
+; CHECK: vcvt.bf16.f32   d0, q0
+define arm_aapcs_vfpcc <4 x bfloat> @test_vcvt_bf16_f32(<4 x float> %a) {
+entry:
+  %vcvtfp2bf1.i.i = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16.v4f32(<4 x float> %a)
+  ret <4 x bfloat> %vcvtfp2bf1.i.i
+}
+
Index: llvm/test/CodeGen/ARM/bf16-intrinsics-nofp16.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/ARM/bf16-intrinsics-nofp16.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=armv8.6a-arm-none-eabi -mattr=+neon -mattr=+bf16 | FileCheck %s
+
+declare i32 @llvm.arm.neon.vcvtbfp2bf.i32.f32(float)
+declare <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16.v4f32(<4 x float>)
+
+; CHECK-LABEL: test_vcvth_bf16_f32
+; CHECK: vcvtb.bf16.f32  s0, s0
+define arm_aapcs_vfpcc float @test_vcvth_bf16_f32(float %a) {
+entry:
+  %vcvtbfp2bf = tail call i32 @llvm.arm.neon.vcvtbfp2bf.i32.f32(float %a)
+  %tmp.0.insert.ext = and i32 %vcvtbfp2bf, 65535
+  %0 = bitcast i32 %tmp.0.insert.ext to float
+  ret float %0
+}
+
+; CHECK-LABEL: test_vcvt_bf16_f32
+; CHECK: vcvt.bf16.f32   d0, q0
+define arm_aapcs_vfpcc <2 x i32> @test_vcvt_bf16_f32(<4 x float> %a) {
+entry:
+  %vcvtfp2bf1.i.i = tail call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16.v4f32(<4 x float> %a)
+  %0 = bitcast <4 x i16> %vcvtfp2bf1.i.i to <2 x i32>
+  ret <2 x i32> %0
+}
Index: llvm/test/CodeGen/AArch64/bf16-intrinsics.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/bf16-intrinsics.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-arm-none-eabi -mattr=+neon -mattr=+bf16 | FileCheck %s
+
+declare bfloat @llvm.aarch64.neon.bfcvt.bf16.f32(float)
+declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn.v8bf16.v4f32(<4 x float>)
+declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn2.v8bf16.v8bf16.v4f32(<8 x bfloat>, <4 x float>)
+
+; CHECK-LABEL: test_vcvth_bf16_f32
+; CHECK:      bfcvt h0, s0
+; CHECK-NEXT: ret
+define bfloat @test_vcvth_bf16_f32(float %a) {
+entry:
+  %vcvth_bf16_f32 = call bfloat @llvm.aarch64.neon.bfcvt.bf16.f32(float %a)
+  ret bfloat %vcvth_bf16_f32
+}
+
+; CHECK-LABEL: test_vcvtq_low_bf16_f32
+; CHECK:      bfcvtn v0.4h, v0.4s
+; CHECK-NEXT: ret
+define <8 x bfloat> @test_vcvtq_low_bf16_f32(<4 x float> %a) {
+entry:
+  %cvt = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn.v8bf16.v4f32(<4 x float> %a)
+  ret <8 x bfloat> %cvt
+}
+
+; CHECK-LABEL: test_vcvtq_high_bf16_f32
+; CHECK:      bfcvtn2 v1.8h, v0.4s
+; CHECK-NEXT: mov v0.16b, v1.16b
+; CHECK-NEXT: ret
+define <8 x bfloat> @test_vcvtq_high_bf16_f32(<4 x float> %a, <8 x bfloat> %inactive) {
+entry:
+  %cvt = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2.v8bf16.v8bf16.v4f32(<8 x bfloat> %inactive, <4 x float> %a)
+  ret <8 x bfloat> %cvt
+}
+
Index: llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -4653,6 +4653,29 @@
     default:
       break;
 
+    // Scalar f32 -> bf16
+    case Intrinsic::arm_neon_vcvtbfp2bf: {
+      SDLoc dl(N);
+      const SDValue &Src = N->getOperand(1);
+      llvm::EVT DestTy = N->getValueType(0);
+      SDValue Pred = getAL(CurDAG, dl);
+      SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+      SDValue Ops[] = { Src, Src, Pred, Reg0 };
+      CurDAG->SelectNodeTo(N, ARM::BF16_VCVTB, DestTy, Ops);
+      return;
+    }
+
+    // Vector v4f32 -> v4bf16 (result VT is overloaded: v4bf16 or v4i16)
+    case Intrinsic::arm_neon_vcvtfp2bf: {
+      SDLoc dl(N);
+      const SDValue &Src = N->getOperand(1);
+      SDValue Pred = getAL(CurDAG, dl);
+      SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+      SDValue Ops[] = { Src, Pred, Reg0 };
+      CurDAG->SelectNodeTo(N, ARM::BF16_VCVT, N->getValueType(0), Ops);
+      return;
+    }
+
     case Intrinsic::arm_mve_urshrl:
       SelectMVE_LongShift(N, ARM::MVE_URSHRL, true, false);
       return;
Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -7911,15 +7911,18 @@
 class SIMD_BFCVTN
   : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128,
                            "bfcvtn", ".4h", ".4s",
-    []>;
+    [(set (v8bf16 V128:$Rd),
+          (int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>;
 
 class SIMD_BFCVTN2
   : BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128,
                            "bfcvtn2", ".8h", ".4s",
-    []>;
+    [(set (v8bf16 V128:$dst),
+          (int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>;
 
 class BF16ToSinglePrecision<string asm>
-  : I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "", []>,
+  : I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "",
+    [(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>,
     Sched<[WriteFCvt]> {
   bits<5> Rd;
   bits<5> Rn;
Index: llvm/include/llvm/IR/IntrinsicsARM.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsARM.td
+++ llvm/include/llvm/IR/IntrinsicsARM.td
@@ -786,6 +786,12 @@
 
 // v8.6-A Bfloat Intrinsics
 
+def int_arm_neon_vcvtfp2bf
+    : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+
+def int_arm_neon_vcvtbfp2bf
+    : Intrinsic<[llvm_any_ty], [llvm_anyfloat_ty], [IntrNoMem]>;
+
 def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
 def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>;
 
Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -471,6 +471,16 @@
   def int_aarch64_neon_bfmlalt : AdvSIMD_FML_Intrinsic;
 
 
+  // v8.6-A Bfloat Intrinsics
+  def int_aarch64_neon_bfcvt
+    : Intrinsic<[llvm_anyfloat_ty], [llvm_anyfloat_ty], [IntrNoMem]>;
+  def int_aarch64_neon_bfcvtn
+    : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>;
+  def int_aarch64_neon_bfcvtn2
+    : Intrinsic<[llvm_anyvector_ty],
+                [llvm_anyvector_ty, llvm_anyvector_ty],
+                [IntrNoMem]>;
+
   // v8.2-A FP16 Fused Multiply-Add Long
   def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic;
   def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic;
Index: clang/utils/TableGen/NeonEmitter.cpp
===================================================================
--- clang/utils/TableGen/NeonEmitter.cpp
+++ clang/utils/TableGen/NeonEmitter.cpp
@@ -1064,7 +1064,8 @@
   std::string S = Name;
 
   if (Name == "vcvt_f16_f32" || Name == "vcvt_f32_f16" ||
-      Name == "vcvt_f32_f64" || Name == "vcvt_f64_f32")
+      Name == "vcvt_f32_f64" || Name == "vcvt_f64_f32" ||
+      Name == "vcvt_f32_bf16")
     return Name;
 
   if (!typeCode.empty()) {
Index: clang/test/Sema/aarch64-neon-bf16-ranges.c
===================================================================
--- /dev/null
+++ clang/test/Sema/aarch64-neon-bf16-ranges.c
@@ -0,0 +1,47 @@
+// RUN: %clang_cc1 -fsyntax-only -verify \
+// RUN: -triple aarch64-arm-none-eabi -target-feature +neon \
+// RUN: -target-feature +bf16 %s
+
+#include <arm_neon.h>
+
+int x;
+
+void test_vcopy_lane_bf16(bfloat16x4_t a, bfloat16x8_t b) {
+  // 0 <= lane1 <= 3; 0 <= lane2 <= 3
+  (void)vcopy_lane_bf16(a, 3, a, 3);
+  (void)vcopy_lane_bf16(a, 0, a, 4);    // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  (void)vcopy_lane_bf16(a, 1, a, -1);   // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  (void)vcopy_lane_bf16(a, 4, a, 0);    // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  (void)vcopy_lane_bf16(a, -1, a, 1);   // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  (void)vcopy_lane_bf16(a, 0, a, x);    // expected-error-re {{argument {{.*}} must be a constant integer}}
+  (void)vcopy_lane_bf16(a, x, a, 0);    // expected-error-re {{argument {{.*}} must be a constant integer}}
+
+  // 0 <= lane1 <= 7; 0 <= lane2 <= 3
+  (void)vcopyq_lane_bf16(b, 7, a, 3);
+  (void)vcopyq_lane_bf16(b, 0, a, 4);   // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  (void)vcopyq_lane_bf16(b, 1, a, -1);  // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  (void)vcopyq_lane_bf16(b, 8, a, 0);   // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  (void)vcopyq_lane_bf16(b, -1, a, 1);  // expected-error {{argument value -1 is outside the valid range [0, 7]}}
+  (void)vcopyq_lane_bf16(b, 0, a, x);   // expected-error-re {{argument {{.*}} must be a constant integer}}
+  (void)vcopyq_lane_bf16(b, x, a, 0);   // expected-error-re {{argument {{.*}} must be a constant integer}}
+
+  // 0 <= lane1 <= 3; 0 <= lane2 <= 7
+  (void)vcopy_laneq_bf16(a, 3, b, 7);
+  (void)vcopy_laneq_bf16(a, 0, b, 8);   // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  (void)vcopy_laneq_bf16(a, 1, b, -1);  // expected-error {{argument value -1 is outside the valid range [0, 7]}}
+  (void)vcopy_laneq_bf16(a, 4, b, 0);   // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  (void)vcopy_laneq_bf16(a, -1, b, 1);  // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  (void)vcopy_laneq_bf16(a, 0, b, x);   // expected-error-re {{argument {{.*}} must be a constant integer}}
+  (void)vcopy_laneq_bf16(a, x, b, 0);   // expected-error-re {{argument {{.*}} must be a constant integer}}
+
+
+  // 0 <= lane1 <= 7; 0 <= lane2 <= 7
+  (void)vcopyq_laneq_bf16(b, 7, b, 7);
+  (void)vcopyq_laneq_bf16(b, 0, b, 8);  // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  (void)vcopyq_laneq_bf16(b, 1, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 7]}}
+  (void)vcopyq_laneq_bf16(b, 8, b, 0);  // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  (void)vcopyq_laneq_bf16(b, -1, b, 1); // expected-error {{argument value -1 is outside the valid range [0, 7]}}
+  (void)vcopyq_laneq_bf16(b, 0, b, x);  // expected-error-re {{argument {{.*}} must be a constant integer}}
+  (void)vcopyq_laneq_bf16(b, x, b, 0);  // expected-error-re {{argument {{.*}} must be a constant integer}}
+}
+
Index: clang/test/CodeGen/arm-bf16-conv-copy-intrinsics.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/arm-bf16-conv-copy-intrinsics.c
@@ -0,0 +1,88 @@
+// RUN: %clang_cc1 \
+// RUN:   -triple aarch64-arm-none-eabi -target-feature +neon -target-feature +bf16 \
+// RUN:   -disable-O0-optnone -emit-llvm -o - %s \
+// RUN:   | opt -S -mem2reg -instcombine \
+// RUN:   | FileCheck --check-prefixes=CHECK,CHECK-A64 %s
+// RUN: %clang_cc1 \
+// RUN:   -triple armv8.6a-arm-none-eabi -target-feature +neon \
+// RUN:   -target-feature +bf16 -mfloat-abi hard \
+// RUN:   -disable-O0-optnone -emit-llvm -o - %s \
+// RUN:   | opt -S -mem2reg -instcombine \
+// RUN:   | FileCheck --check-prefixes=CHECK,CHECK-A32-FP16 %s
+// RUN: %clang_cc1 \
+// RUN:   -triple armv8.6a-arm-none-eabi -target-feature +neon \
+// RUN:   -target-feature +bf16 -mfloat-abi softfp \
+// RUN:   -disable-O0-optnone -emit-llvm -o - %s \
+// RUN:   | opt -S -mem2reg -instcombine \
+// RUN:   | FileCheck --check-prefixes=CHECK,CHECK-A32-NOFP16 %s
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: test_vcvt_f32_bf16
+// CHECK: %[[EXT:.*]] = zext <4 x i16> %{{.*}} to <4 x i32>
+// CHECK: shl nuw <4 x i32> %[[EXT]], <i32 16, i32 16, i32 16, i32 16>
+float32x4_t test_vcvt_f32_bf16(bfloat16x4_t a) {
+  return vcvt_f32_bf16(a);
+}
+
+// CHECK-LABEL: test_vcvtq_low_f32_bf16
+// CHECK: shufflevector <8 x bfloat> %{{.*}}, <8 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: %[[EXT:.*]] = zext <4 x i16> %{{.*}} to <4 x i32>
+// CHECK: shl nuw <4 x i32> %[[EXT]], <i32 16, i32 16, i32 16, i32 16>
+float32x4_t test_vcvtq_low_f32_bf16(bfloat16x8_t a) {
+  return vcvtq_low_f32_bf16(a);
+}
+
+// CHECK-LABEL: test_vcvtq_high_f32_bf16
+// CHECK: shufflevector <8 x bfloat> %{{.*}}, <8 x bfloat> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: %[[EXT:.*]] = zext <4 x i16> %{{.*}} to <4 x i32>
+// CHECK: shl nuw <4 x i32> %[[EXT]], <i32 16, i32 16, i32 16, i32 16>
+float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
+  return vcvtq_high_f32_bf16(a);
+}
+
+// CHECK-LABEL: test_vcvt_bf16_f32
+// CHECK-A64: %[[CVT:.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn.v8bf16.v4f32(<4 x float> %a)
+// CHECK-A64: shufflevector <8 x bfloat> %[[CVT]], <8 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A32-FP16: call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16.v4f32(<4 x float> %a)
+// CHECK-A32-NOFP16: call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16.v4f32(<4 x float> %a)
+bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
+  return vcvt_bf16_f32(a);
+}
+
+// CHECK-LABEL: test_vcvtq_low_bf16_f32
+// CHECK-A64: call <8 x bfloat> @llvm.aarch64.neon.bfcvtn.v8bf16.v4f32(<4 x float> %a)
+// CHECK-A32-FP16: %[[CVT:.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16.v4f32
+// CHECK-A32-FP16: shufflevector <4 x bfloat> zeroinitializer, <4 x bfloat> %[[CVT]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A32-NOFP16: call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16.v4f32
+// CHECK-A32-NOFP16: shufflevector <4 x bfloat> zeroinitializer, <4 x bfloat> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
+  return vcvtq_low_bf16_f32(a);
+}
+
+// CHECK-LABEL: test_vcvtq_high_bf16_f32
+// CHECK-A64: call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2.v8bf16.v8bf16.v4f32(<8 x bfloat> %inactive, <4 x float> %a)
+// CHECK-A32-FP16: %[[CVT:.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16.v4f32(<4 x float> %a)
+// CHECK-A32-FP16: %[[INACT:.*]] = shufflevector <8 x bfloat> %inactive, <8 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A32-FP16: shufflevector <4 x bfloat> %[[CVT]], <4 x bfloat> %[[INACT]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-A32-NOFP16: call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16.v4f32(<4 x float> %a)
+// CHECK-A32-NOFP16: shufflevector <8 x bfloat> %{{.*}}, <8 x bfloat> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK-A32-NOFP16: shufflevector <4 x bfloat> %{{.*}}, <4 x bfloat> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) {
+  return vcvtq_high_bf16_f32(inactive, a);
+}
+
+// CHECK-LABEL: test_vcvth_bf16_f32
+// CHECK-A64: call bfloat @llvm.aarch64.neon.bfcvt.bf16.f32
+// CHECK-A32-FP16: call bfloat @llvm.arm.neon.vcvtbfp2bf.bf16.f32(float %a)
+// CHECK-A32-NOFP16: call i32 @llvm.arm.neon.vcvtbfp2bf.i32.f32(float %a)
+bfloat16_t test_vcvth_bf16_f32(float32_t a) {
+  return vcvth_bf16_f32(a);
+}
+
+// CHECK-LABEL: test_vcvtah_f32_bf16
+// CHECK: shl i32 %{{.*}}, 16
+float32_t test_vcvtah_f32_bf16(bfloat16_t a) {
+  return vcvtah_f32_bf16(a);
+}
+
Index: clang/test/CodeGen/aarch64-bf16-lane-intrinsics.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-bf16-lane-intrinsics.c
@@ -0,0 +1,78 @@
+// RUN: %clang_cc1 \
+// RUN:   -triple aarch64-arm-none-eabi -target-cpu cortex-a75 \
+// RUN:   -target-feature +neon -target-feature +bf16 -ffreestanding \
+// RUN:   -disable-O0-optnone -emit-llvm -o - %s \
+// RUN:   | opt -S -mem2reg | FileCheck --check-prefixes=CHECK,CHECK-LE %s
+// RUN: %clang_cc1 \
+// RUN:   -triple aarch64_be-arm-none-eabi -target-cpu cortex-a75 \
+// RUN:   -target-feature +neon -target-feature +bf16 -ffreestanding \
+// RUN:   -disable-O0-optnone -emit-llvm -o - %s \
+// RUN:   | opt -S -mem2reg | FileCheck %s
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: test_vcopy_lane_bf16_v1
+// CHECK: %[[LANE:.*]] = extractelement <4 x bfloat> %{{.*}}, i32 3
+// CHECK: %[[RES:.*]] = insertelement <4 x bfloat> %{{.*}}, bfloat %[[LANE]], i32 1
+// CHECK-LE: ret <4 x bfloat> %[[RES]]
+bfloat16x4_t test_vcopy_lane_bf16_v1(bfloat16x4_t a, bfloat16x4_t b) {
+  return vcopy_lane_bf16(a, 1, b, 3);
+}
+
+// CHECK-LABEL: test_vcopy_lane_bf16_v2
+// CHECK: %[[LANE:.*]] = extractelement <4 x bfloat> %{{.*}}, i32 0
+// CHECK: %[[RES:.*]] = insertelement <4 x bfloat> %{{.*}}, bfloat %[[LANE]], i32 2
+// CHECK-LE: ret <4 x bfloat> %[[RES]]
+bfloat16x4_t test_vcopy_lane_bf16_v2(bfloat16x4_t a, bfloat16x4_t b) {
+  return vcopy_lane_bf16(a, 2, b, 0);
+}
+
+// CHECK-LABEL: test_vcopyq_lane_bf16_v1
+// CHECK: %[[LANE:.*]] = extractelement <4 x bfloat> %{{.*}}, i32 2
+// CHECK: %[[RES:.*]] = insertelement <8 x bfloat> %{{.*}}, bfloat %[[LANE]], i32 0
+// CHECK-LE: ret <8 x bfloat> %[[RES]]
+bfloat16x8_t test_vcopyq_lane_bf16_v1(bfloat16x8_t a, bfloat16x4_t b) {
+  return vcopyq_lane_bf16(a, 0, b, 2);
+}
+
+// CHECK-LABEL: test_vcopyq_lane_bf16_v2
+// CHECK: %[[LANE:.*]] = extractelement <4 x bfloat> %{{.*}}, i32 0
+// CHECK: %[[RES:.*]] = insertelement <8 x bfloat> %{{.*}}, bfloat %[[LANE]], i32 6
+// CHECK-LE: ret <8 x bfloat> %[[RES]]
+bfloat16x8_t test_vcopyq_lane_bf16_v2(bfloat16x8_t a, bfloat16x4_t b) {
+  return vcopyq_lane_bf16(a, 6, b, 0);
+}
+
+// CHECK-LABEL: test_vcopy_laneq_bf16_v1
+// CHECK: %[[LANE:.*]] = extractelement <8 x bfloat> %{{.*}}, i32 7
+// CHECK: %[[RES:.*]] = insertelement <4 x bfloat> %{{.*}}, bfloat %[[LANE]], i32 0
+// CHECK-LE: ret <4 x bfloat> %[[RES]]
+bfloat16x4_t test_vcopy_laneq_bf16_v1(bfloat16x4_t a, bfloat16x8_t b) {
+  return vcopy_laneq_bf16(a, 0, b, 7);
+}
+
+// CHECK-LABEL: test_vcopy_laneq_bf16_v2
+// CHECK: %[[LANE:.*]] = extractelement <8 x bfloat> %{{.*}}, i32 4
+// CHECK: %[[RES:.*]] = insertelement <4 x bfloat> %{{.*}}, bfloat %[[LANE]], i32 3
+// CHECK-LE: ret <4 x bfloat> %[[RES]]
+bfloat16x4_t test_vcopy_laneq_bf16_v2(bfloat16x4_t a, bfloat16x8_t b) {
+  return vcopy_laneq_bf16(a, 3, b, 4);
+}
+
+// CHECK-LABEL: test_vcopyq_laneq_bf16_v1
+// CHECK: %[[LANE:.*]] = extractelement <8 x bfloat> %{{.*}}, i32 7
+// CHECK: %[[RES:.*]] = insertelement <8 x bfloat> %{{.*}}, bfloat %[[LANE]], i32 3
+// CHECK-LE: ret <8 x bfloat> %[[RES]]
+bfloat16x8_t test_vcopyq_laneq_bf16_v1(bfloat16x8_t a, bfloat16x8_t b) {
+  return vcopyq_laneq_bf16(a, 3, b, 7);
+
+}
+
+// CHECK-LABEL: test_vcopyq_laneq_bf16_v2
+// CHECK: %[[LANE:.*]] = extractelement <8 x bfloat> %{{.*}}, i32 2
+// CHECK: %[[RES:.*]] = insertelement <8 x bfloat> %{{.*}}, bfloat %[[LANE]], i32 6
+// CHECK-LE: ret <8 x bfloat> %[[RES]]
+bfloat16x8_t test_vcopyq_laneq_bf16_v2(bfloat16x8_t a, bfloat16x8_t b) {
+  return vcopyq_laneq_bf16(a, 6, b, 2);
+}
+
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -4656,6 +4656,7 @@
       TypeModifier }
 
 static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
+  NEONMAP1(__a32_vcvt_bf16_v, arm_neon_vcvtfp2bf, 0),
   NEONMAP0(splat_lane_v),
   NEONMAP0(splat_laneq_v),
   NEONMAP0(splatq_lane_v),
@@ -4729,6 +4730,7 @@
   NEONMAP1(vcvtaq_u16_v, arm_neon_vcvtau, 0),
   NEONMAP1(vcvtaq_u32_v, arm_neon_vcvtau, 0),
   NEONMAP1(vcvtaq_u64_v, arm_neon_vcvtau, 0),
+  NEONMAP1(vcvth_bf16_f32, arm_neon_vcvtbfp2bf, 0),
   NEONMAP1(vcvtm_s16_v, arm_neon_vcvtms, 0),
   NEONMAP1(vcvtm_s32_v, arm_neon_vcvtms, 0),
   NEONMAP1(vcvtm_s64_v, arm_neon_vcvtms, 0),
@@ -4945,6 +4947,7 @@
 };
 
 static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
+  NEONMAP1(__a64_vcvtq_low_bf16_v, aarch64_neon_bfcvtn, 0),
   NEONMAP0(splat_lane_v),
   NEONMAP0(splat_laneq_v),
   NEONMAP0(splatq_lane_v),
@@ -5004,6 +5007,7 @@
   NEONMAP1(vcvt_n_u64_v, aarch64_neon_vcvtfp2fxu, 0),
   NEONMAP0(vcvtq_f16_v),
   NEONMAP0(vcvtq_f32_v),
+  NEONMAP1(vcvtq_high_bf16_v, aarch64_neon_bfcvtn2, 0),
   NEONMAP2(vcvtq_n_f16_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
   NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
   NEONMAP2(vcvtq_n_f64_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
@@ -5159,6 +5163,7 @@
   NEONMAP1(vcvtd_n_f64_u64, aarch64_neon_vcvtfxu2fp, AddRetType | Add1ArgType),
   NEONMAP1(vcvtd_n_s64_f64, aarch64_neon_vcvtfp2fxs, AddRetType | Add1ArgType),
   NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
+  NEONMAP1(vcvth_bf16_f32, aarch64_neon_bfcvt, AddRetType | Add1ArgType),
   NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
   NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
   NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
@@ -6157,6 +6162,27 @@
     llvm::Type *Tys[2] = { Ty, InputTy };
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vbfmlalt");
   }
+  case NEON::BI__builtin_neon___a64_vcvtq_low_bf16_v: {
+    llvm::Type *InputTy =
+          GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, true));
+    llvm::Type *Tys[2] = { Ty, InputTy };
+    Function *F = CGM.getIntrinsic(Int, Tys);
+    return EmitNeonCall(F, Ops, "bfcvtn");
+  }
+  case NEON::BI__builtin_neon_vcvtq_high_bf16_v: {
+    llvm::Type *InputTy =
+          GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, true));
+    llvm::Type *Tys[3] = { Ty, Ty, InputTy };
+    Function *F = CGM.getIntrinsic(Int, Tys);
+    return EmitNeonCall(F, Ops, "bfcvtn2");
+  }
+  case NEON::BI__builtin_neon___a32_vcvt_bf16_v: {
+    llvm::Type *InputTy =
+          GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, true));
+    llvm::Type *Tys[2] = { Ty, InputTy };
+    Function *F = CGM.getIntrinsic(Int, Tys);
+    return EmitNeonCall(F, Ops, "vcvtfp2bf");
+  }
   }
 
   assert(Int && "Expected valid intrinsic number");
@@ -6364,6 +6390,7 @@
   case NEON::BI__builtin_neon_vsha1cq_u32:
   case NEON::BI__builtin_neon_vsha1pq_u32:
   case NEON::BI__builtin_neon_vsha1mq_u32:
+  case NEON::BI__builtin_neon_vcvth_bf16_f32:
   case clang::ARM::BI_MoveToCoprocessor:
   case clang::ARM::BI_MoveToCoprocessor2:
     return false;
@@ -6847,6 +6874,22 @@
     return EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_sha1m), Ops,
                         "vsha1h");
 
+  case NEON::BI__builtin_neon_vcvth_bf16_f32: {
+    LLVMContext &Ctx = CGM.getLLVMContext();
+    const bool AllowBFloatArgsAndRet =
+        getTargetHooks().getABIInfo().allowBFloatArgsAndRet();
+    llvm::Type *DestTy = AllowBFloatArgsAndRet ? llvm::Type::getBFloatTy(Ctx)
+                                               : llvm::Type::getInt32Ty(Ctx);
+    llvm::Type *Tys[] = { DestTy, Ops[0]->getType() };
+    Value *Ret =
+        EmitNeonCall(CGM.getIntrinsic(Intrinsic::arm_neon_vcvtbfp2bf, Tys), Ops,
+                     "vcvtbfp2bf");
+    if (AllowBFloatArgsAndRet)
+      return Ret;
+    Value *RetLO = Builder.CreateTrunc(Ret, llvm::Type::getInt16Ty(Ctx));
+    return Builder.CreateBitCast(RetLO, llvm::Type::getBFloatTy(Ctx));
+  }
+
   // The ARM _MoveToCoprocessor builtins put the input register value as
   // the first argument, but the LLVM intrinsic expects it as the third one.
   case ARM::BI_MoveToCoprocessor:
Index: clang/include/clang/Basic/arm_neon.td
===================================================================
--- clang/include/clang/Basic/arm_neon.td
+++ clang/include/clang/Basic/arm_neon.td
@@ -252,6 +252,34 @@
     : Op<(call "vbfmlalt", $p0, $p1,
           (dup_typed $p1, (call "vget_lane", $p2, $p3)))>;
 
+def OP_VCVT_F32_BF16
+    : Op<(bitcast "R",
+          (call "vshll_n", (bitcast "int16x4_t", $p0),
+                           (literal "int32_t", "16")))>;
+def OP_VCVT_F32_BF16_LO
+    : Op<(call "vcvt_f32_bf16", (call "vget_low", $p0))>;
+def OP_VCVT_F32_BF16_HI
+    : Op<(call "vcvt_f32_bf16", (call "vget_high", $p0))>;
+
+def OP_VCVT_BF16_F32_LO_A64
+    : Op<(call "__a64_vcvtq_low_bf16", $p0)>;
+def OP_VCVT_BF16_F32_A64
+    : Op<(call "vget_low", (call "__a64_vcvtq_low_bf16", $p0))>;
+
+def OP_VCVT_BF16_F32_A32
+    : Op<(call "__a32_vcvt_bf16", $p0)>;
+
+def OP_VCVT_BF16_F32_LO_A32
+    : Op<(call "vcombine", (cast "bfloat16x4_t", (literal "uint64_t", "0ULL")),
+                           (call "__a32_vcvt_bf16", $p0))>;
+def OP_VCVT_BF16_F32_HI_A32
+    : Op<(call "vcombine", (call "__a32_vcvt_bf16", $p1),
+                           (call "vget_low", $p0))>;
+
+def OP_CVT_F32_BF16
+    : Op<(bitcast "R", (op "<<", (bitcast "int32_t", $p0),
+                                 (literal "int32_t", "16")))>;
+
 //===----------------------------------------------------------------------===//
 // Auxiliary Instructions
 //===----------------------------------------------------------------------===//
@@ -1942,7 +1970,31 @@
   def VLD3_DUP_BF : WInst<"vld3_dup", "3(c*!)", "bQb">;
   def VLD4_DUP_BF : WInst<"vld4_dup", "4(c*!)", "bQb">;
 
+  def VCVT_F32_BF16 : SOpInst<"vcvt_f32_bf16", "(F>)(Bq!)",  "Qb", OP_VCVT_F32_BF16>;
+  def VCVT_LOW_F32_BF16 : SOpInst<"vcvt_low_f32", "(F>)(BQ!)",  "Qb", OP_VCVT_F32_BF16_LO>;
+  def VCVT_HIGH_F32_BF16 : SOpInst<"vcvt_high_f32", "(F>)(BQ!)", "Qb", OP_VCVT_F32_BF16_HI>;
+
+  def SCALAR_CVT_BF16_F32 : SInst<"vcvth_bf16", "(1B)1", "f">;
+  def SCALAR_CVT_F32_BF16 : SOpInst<"vcvtah_f32", "(1F>)(1!)", "b", OP_CVT_F32_BF16>;
+}
+
+let ArchGuard = "defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && !defined(__aarch64__)" in {
+  def VCVT_BF16_F32_A32_INTERNAL : WInst<"__a32_vcvt_bf16", "BQ", "f">;
+  def VCVT_BF16_F32_A32 : SOpInst<"vcvt_bf16", "BQ", "f", OP_VCVT_BF16_F32_A32>;
+  def VCVT_LOW_BF16_F32_A32 : SOpInst<"vcvt_low_bf16",  "BQ", "Qf", OP_VCVT_BF16_F32_LO_A32>;
+  def VCVT_HIGH_BF16_F32_A32 : SOpInst<"vcvt_high_bf16", "BBQ", "Qf", OP_VCVT_BF16_F32_HI_A32>;
+}
+
+let ArchGuard = "defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && defined(__aarch64__)" in {
+  def VCVT_LOW_BF16_F32_A64_INTERNAL : WInst<"__a64_vcvtq_low_bf16", "BQ", "Hf">;
+  def VCVT_LOW_BF16_F32_A64 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A64>;
+  def VCVT_HIGH_BF16_F32_A64 : SInst<"vcvt_high_bf16", "BBQ", "Qf">;
+  def VCVT_BF16_F32 : SOpInst<"vcvt_bf16",    "BQ", "f", OP_VCVT_BF16_F32_A64>;
 
+  def COPY_LANE_BF16 : IOpInst<"vcopy_lane", "..I.I", "b", OP_COPY_LN>;
+  def COPYQ_LANE_BF16 : IOpInst<"vcopy_lane", "..IqI", "Qb", OP_COPY_LN>;
+  def COPY_LANEQ_BF16 : IOpInst<"vcopy_laneq", "..IQI", "b", OP_COPY_LN>;
+  def COPYQ_LANEQ_BF16 : IOpInst<"vcopy_laneq", "..I.I", "Qb", OP_COPY_LN>;
 }
 
 let ArchGuard = "defined(__ARM_FEATURE_BF16) && !defined(__aarch64__)" in {

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D80928: [BFloat] Add convert/copy intrinsic support

Reply via email to