[PATCH] D76610: [ARM,CDE] Implement predicated Q-register CDE intrinsics

2020-03-25 Thread Mikhail Maltsev via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rGbb4da94e5b5f: [ARM,CDE] Implement predicated Q-register CDE 
intrinsics (authored by miyuki).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D76610/new/

https://reviews.llvm.org/D76610

Files:
  clang/include/clang/Basic/arm_cde.td
  clang/test/CodeGen/arm-cde-vec.c
  llvm/include/llvm/IR/IntrinsicsARM.td
  llvm/lib/Target/ARM/ARMInstrCDE.td
  llvm/test/CodeGen/Thumb2/cde-vec.ll

Index: llvm/test/CodeGen/Thumb2/cde-vec.ll
===
--- llvm/test/CodeGen/Thumb2/cde-vec.ll
+++ llvm/test/CodeGen/Thumb2/cde-vec.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
 
 declare <16 x i8> @llvm.arm.cde.vcx1q(i32 immarg, i32 immarg)
 declare <16 x i8> @llvm.arm.cde.vcx1qa(i32 immarg, <16 x i8>, i32 immarg)
@@ -112,3 +112,103 @@
   %2 = call <16 x i8> @llvm.arm.cde.vcx3qa(i32 1, <16 x i8> %acc, <16 x i8> %0, <16 x i8> %1, i32 13)
   ret <16 x i8> %2
 }
+
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+declare <8 x i16> @llvm.arm.cde.vcx1q.predicated.v8i16.v8i1(i32 immarg, <8 x i16>, i32 immarg, <8 x i1>)
+declare <16 x i8> @llvm.arm.cde.vcx1qa.predicated.v16i8.v16i1(i32 immarg, <16 x i8>, i32 immarg, <16 x i1>)
+declare <4 x i32> @llvm.arm.cde.vcx2q.predicated.v4i32.v4i1(i32 immarg, <4 x i32>, <16 x i8>, i32 immarg, <4 x i1>)
+declare <4 x float> @llvm.arm.cde.vcx2qa.predicated.v4f32.v4i1(i32 immarg, <4 x float>, <16 x i8>, i32 immarg, <4 x i1>)
+declare <2 x i64> @llvm.arm.cde.vcx3q.predicated.v2i64.v4i1(i32 immarg, <2 x i64>, <16 x i8>, <16 x i8>, i32 immarg, <4 x i1>)
+declare <4 x float> @llvm.arm.cde.vcx3qa.predicated.v4f32.v4i1(i32 immarg, <4 x float>, <16 x i8>, <16 x i8>, i32 immarg, <4 x i1>)
+
+define arm_aapcs_vfpcc <8 x i16> @test_vcx1q_m(<8 x i16> %inactive, i16 zeroext %p) {
+; CHECK-LABEL: test_vcx1q_m:
+; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vmsr p0, r0
+; CHECK-NEXT:vpst
+; CHECK-NEXT:vcx1t p0, q0, #
+; CHECK-NEXT:bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call <8 x i16> @llvm.arm.cde.vcx1q.predicated.v8i16.v8i1(i32 0, <8 x i16> %inactive, i32 , <8 x i1> %1)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vcx1qa_m(<16 x i8> %acc, i16 zeroext %p) {
+; CHECK-LABEL: test_vcx1qa_m:
+; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vmsr p0, r0
+; CHECK-NEXT:vpst
+; CHECK-NEXT:vcx1at p1, q0, #1112
+; CHECK-NEXT:bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = call <16 x i8> @llvm.arm.cde.vcx1qa.predicated.v16i8.v16i1(i32 1, <16 x i8> %acc, i32 1112, <16 x i1> %1)
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vcx2q_m(<4 x i32> %inactive, <4 x float> %n, i16 zeroext %p) {
+; CHECK-LABEL: test_vcx2q_m:
+; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vmsr p0, r0
+; CHECK-NEXT:vpst
+; CHECK-NEXT:vcx2t p0, q0, q1, #111
+; CHECK-NEXT:bx lr
+entry:
+  %0 = bitcast <4 x float> %n to <16 x i8>
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x i32> @llvm.arm.cde.vcx2q.predicated.v4i32.v4i1(i32 0, <4 x i32> %inactive, <16 x i8> %0, i32 111, <4 x i1> %2)
+  ret <4 x i32> %3
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vcx2qa_m(<4 x float> %acc, <8 x half> %n, i16 zeroext %p) {
+; CHECK-LABEL: test_vcx2qa_m:
+; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vmsr p0, r0
+; CHECK-NEXT:vpst
+; CHECK-NEXT:vcx2at p0, q0, q1, #112
+; CHECK-NEXT:bx lr
+entry:
+  %0 = bitcast <8 x half> %n to <16 x i8>
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x float> @llvm.arm.cde.vcx2qa.predicated.v4f32.v4i1(i32 0, <4 x float> %acc, <16 x i8> %0, i32 112, <4 x i1> %2)
+  ret <4 x float> %3
+}
+
+define arm_aapcs_vfpcc <2 x i64> @test_vcx3q_m(<2 x i64> %inactive, <4 x float> %n, <16 x i8> %m, i16 zeroext %p) {
+; CHECK-LABEL: test_vcx3q_m:
+; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vmsr p0, r0
+; CHECK-NEXT:vpst
+; CHECK-NEXT:vcx3t p0, q0, q1, q2, #11
+; CHECK-NEXT:bx lr
+entry:
+  %0 = bitcast <4 x float> %n to <16 x i8>
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <2 x i64> @llvm.arm.cde.vcx3q.predicated.v2i64.v4i1(i32 0, <2 x i64> %inactive, <16 x i8> %0, <16 x i8> %m, i32 11, <4 x i1> %2)
+  ret <2 x i64> %3
+}
+
+define arm_aapcs_v

[PATCH] D76610: [ARM,CDE] Implement predicated Q-register CDE intrinsics

2020-03-23 Thread Mikhail Maltsev via Phabricator via cfe-commits
miyuki created this revision.
miyuki added reviewers: simon_tatham, MarkMurrayARM, ostannard, dmgreen.
Herald added subscribers: cfe-commits, danielkiss, hiraditya, kristof.beyls.
Herald added a project: clang.

This patch implements the following CDE intrinsics:

  T __arm_vcx1q_m(int coproc, T inactive, uint32_t imm, mve_pred_t p);
  T __arm_vcx2q_m(int coproc, T inactive, U n, uint32_t imm, mve_pred_t p);
  T __arm_vcx3q_m(int coproc, T inactive, U n, V m, uint32_t imm, mve_pred_t p);
  
  T __arm_vcx1qa_m(int coproc, T acc, uint32_t imm, mve_pred_t p);
  T __arm_vcx2qa_m(int coproc, T acc, U n, uint32_t imm, mve_pred_t p);
  T __arm_vcx3qa_m(int coproc, T acc, U n, V m, uint32_t imm, mve_pred_t p);

The intrinsics are not part of the released ACLE spec, but internally at
Arm we have reached consensus to add them to the next ACLE release.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D76610

Files:
  clang/include/clang/Basic/arm_cde.td
  clang/test/CodeGen/arm-cde-vec.c
  llvm/include/llvm/IR/IntrinsicsARM.td
  llvm/lib/Target/ARM/ARMInstrCDE.td
  llvm/test/CodeGen/Thumb2/cde-vec.ll

Index: llvm/test/CodeGen/Thumb2/cde-vec.ll
===
--- llvm/test/CodeGen/Thumb2/cde-vec.ll
+++ llvm/test/CodeGen/Thumb2/cde-vec.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=thumbv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
 
 declare <16 x i8> @llvm.arm.cde.vcx1q(i32 immarg, i32 immarg)
 declare <16 x i8> @llvm.arm.cde.vcx1qa(i32 immarg, <16 x i8>, i32 immarg)
@@ -112,3 +112,103 @@
   %2 = call <16 x i8> @llvm.arm.cde.vcx3qa(i32 1, <16 x i8> %acc, <16 x i8> %0, <16 x i8> %1, i32 13)
   ret <16 x i8> %2
 }
+
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+declare <8 x i16> @llvm.arm.cde.vcx1q.predicated.v8i16.v8i1(i32 immarg, <8 x i16>, i32 immarg, <8 x i1>)
+declare <16 x i8> @llvm.arm.cde.vcx1qa.predicated.v16i8.v16i1(i32 immarg, <16 x i8>, i32 immarg, <16 x i1>)
+declare <4 x i32> @llvm.arm.cde.vcx2q.predicated.v4i32.v4i1(i32 immarg, <4 x i32>, <16 x i8>, i32 immarg, <4 x i1>)
+declare <4 x float> @llvm.arm.cde.vcx2qa.predicated.v4f32.v4i1(i32 immarg, <4 x float>, <16 x i8>, i32 immarg, <4 x i1>)
+declare <2 x i64> @llvm.arm.cde.vcx3q.predicated.v2i64.v4i1(i32 immarg, <2 x i64>, <16 x i8>, <16 x i8>, i32 immarg, <4 x i1>)
+declare <4 x float> @llvm.arm.cde.vcx3qa.predicated.v4f32.v4i1(i32 immarg, <4 x float>, <16 x i8>, <16 x i8>, i32 immarg, <4 x i1>)
+
+define arm_aapcs_vfpcc <8 x i16> @test_vcx1q_m(<8 x i16> %inactive, i16 zeroext %p) {
+; CHECK-LABEL: test_vcx1q_m:
+; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vmsr p0, r0
+; CHECK-NEXT:vpst
+; CHECK-NEXT:vcx1t p0, q0, #
+; CHECK-NEXT:bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = call <8 x i16> @llvm.arm.cde.vcx1q.predicated.v8i16.v8i1(i32 0, <8 x i16> %inactive, i32 , <8 x i1> %1)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vcx1qa_m(<16 x i8> %acc, i16 zeroext %p) {
+; CHECK-LABEL: test_vcx1qa_m:
+; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vmsr p0, r0
+; CHECK-NEXT:vpst
+; CHECK-NEXT:vcx1at p1, q0, #1112
+; CHECK-NEXT:bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = call <16 x i8> @llvm.arm.cde.vcx1qa.predicated.v16i8.v16i1(i32 1, <16 x i8> %acc, i32 1112, <16 x i1> %1)
+  ret <16 x i8> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vcx2q_m(<4 x i32> %inactive, <4 x float> %n, i16 zeroext %p) {
+; CHECK-LABEL: test_vcx2q_m:
+; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vmsr p0, r0
+; CHECK-NEXT:vpst
+; CHECK-NEXT:vcx2t p0, q0, q1, #111
+; CHECK-NEXT:bx lr
+entry:
+  %0 = bitcast <4 x float> %n to <16 x i8>
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x i32> @llvm.arm.cde.vcx2q.predicated.v4i32.v4i1(i32 0, <4 x i32> %inactive, <16 x i8> %0, i32 111, <4 x i1> %2)
+  ret <4 x i32> %3
+}
+
+define arm_aapcs_vfpcc <4 x float> @test_vcx2qa_m(<4 x float> %acc, <8 x half> %n, i16 zeroext %p) {
+; CHECK-LABEL: test_vcx2qa_m:
+; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vmsr p0, r0
+; CHECK-NEXT:vpst
+; CHECK-NEXT:vcx2at p0, q0, q1, #112
+; CHECK-NEXT:bx lr
+entry:
+  %0 = bitcast <8 x half> %n to <16 x i8>
+  %1 = zext i16 %p to i32
+  %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = call <4 x float> @llvm.arm.cde.vcx2qa.predicated.v4f32.v4i1(i32 0, <4 x float> %acc, <16 x i8> %0, i32 112, <4 x i1> %2)
+  ret <4 x float> %3
+}
+
+define arm_aapcs_vfpcc