simon_tatham created this revision.
simon_tatham added reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard.
Herald added subscribers: llvm-commits, cfe-commits, hiraditya, kristof.beyls.
Herald added projects: clang, LLVM.

The immediate form of vmvnq is code-generated as a plain vector
constant in IR, leaving the backend to recognize that the constant
can be materialized with an MVE VMVN instruction. The predicated
version is represented as a select between the input and that same
constant, and I've added a Tablegen isel rule to turn the select into
a predicated VMVN. (That should be better than the previous VMVN +
VPSEL: it's the same number of instructions, but now it can fold into
an adjacent VPT block.)
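
For example (taken from the new codegen test below), the predicated
vmvnq_m_n_u16 with immediate 0xaa00 is represented as a select on the
inverted splat constant:

  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = select <8 x i1> %1,
    <8 x i16> <i16 -43521, i16 -43521, i16 -43521, i16 -43521,
               i16 -43521, i16 -43521, i16 -43521, i16 -43521>,
    <8 x i16> %inactive

and the new pattern selects that to a single `vmvnt.i16 q0, #0xaa00`
inside a VPT block.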

The unpredicated forms of VBIC and VORR are handled by enabling the
same isel lowering as for NEON: suitable immediates are recognized
and rewritten as ARMISD::VBICIMM / ARMISD::VORRIMM SDNodes, which I
then instruction-select into the right MVE instructions (in custom
C++, because the MC representation is awkward). To do that, I had to
promote the Tablegen SDNode instance `NEONvorrImm` to a general
`ARMvorrImm` available in MVE as well, and similarly promote
`NEONvbicImm` to `ARMvbicImm`.
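
So a plain IR `and` with an inverted-splat mask now selects directly
to an immediate VBIC. From the new codegen test:

  %0 = and <8 x i16> %a, <i16 -25601, i16 -25601, i16 -25601,
    i16 -25601, i16 -25601, i16 -25601, i16 -25601, i16 -25601>

(-25601 being ~0x6400 in 16 bits) becomes `vbic.i16 q0, #0x6400`, and
likewise an `or` with a splat constant becomes an immediate VORR.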

For the predicated forms of VBIC and VORR, I've invented new IR
intrinsics. I considered instead matching a call to the existing
predicated VBIC intrinsic in which one argument is a vector splat,
but that looked like needing much more code than the benefit would
justify.
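
The new intrinsics take the input vector, the immediate as a plain
i32, and the predicate. For example, vbicq_m_n_u16(a, 0x64, p)
becomes

  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call <8 x i16> @llvm.arm.mve.bic.imm.predicated.v8i16.v8i1(
         <8 x i16> %a, i32 100, <8 x i1> %1)

which is selected to `vbict.i16 q0, #0x64` under a VPST.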

This intrinsic family is the first to use the `imm_simd` system I put
into the MveEmitter tablegen backend. Naturally, it turned up a bug
or two (emitting bogus range checks and the like). I've fixed those,
and added a full set of tests for the permissible immediates to the
existing Sema test.

Finally, the new isel support for immediate VBIC has caused changes in
a few existing MVE codegen tests, because `vbic.i16 q0,#0xff00` has
now taken over from `vmovlb.u8 q0,q0` as LLVM's preferred way to clear
the top byte of each 16-bit lane. I think the changes are benign.
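
For instance, a simple zero-extension

  %0 = zext <8 x i8> %src to <8 x i16>

now compiles to `vbic.i16 q0, #0xff00` where it previously used
`vmovlb.u8 q0, q0`.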


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D72934

Files:
  clang/include/clang/Basic/arm_mve.td
  clang/include/clang/Basic/arm_mve_defs.td
  clang/include/clang/Sema/Sema.h
  clang/lib/Sema/SemaChecking.cpp
  clang/test/CodeGen/arm-mve-intrinsics/bitwise-imm.c
  clang/test/Sema/arm-mve-immediates.c
  clang/utils/TableGen/MveEmitter.cpp
  llvm/include/llvm/IR/IntrinsicsARM.td
  llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
  llvm/lib/Target/ARM/ARMISelLowering.cpp
  llvm/lib/Target/ARM/ARMInstrInfo.td
  llvm/lib/Target/ARM/ARMInstrMVE.td
  llvm/lib/Target/ARM/ARMInstrNEON.td
  llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
  llvm/test/CodeGen/Thumb2/mve-intrinsics/bitwise-imm.ll
  llvm/test/CodeGen/Thumb2/mve-masked-load.ll
  llvm/test/CodeGen/Thumb2/mve-sext.ll
  llvm/test/CodeGen/Thumb2/mve-shuffleext.ll

Index: llvm/test/CodeGen/Thumb2/mve-shuffleext.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-shuffleext.ll
+++ llvm/test/CodeGen/Thumb2/mve-shuffleext.ll
@@ -73,7 +73,7 @@
 define arm_aapcs_vfpcc <8 x i16> @zext_02468101214(<16 x i8> %src) {
 ; CHECK-LABEL: zext_02468101214:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovlb.u8 q0, q0
+; CHECK-NEXT:    vbic.i16 q0, #0xff00
 ; CHECK-NEXT:    bx lr
 entry:
   %strided.vec = shufflevector <16 x i8> %src, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -85,7 +85,7 @@
 ; CHECK-LABEL: zext_13579111315:
 ; CHECK:       @ %bb.0: @ %entry
 ; CHECK-NEXT:    vrev16.8 q0, q0
-; CHECK-NEXT:    vmovlb.u8 q0, q0
+; CHECK-NEXT:    vbic.i16 q0, #0xff00
 ; CHECK-NEXT:    bx lr
 entry:
   %strided.vec = shufflevector <16 x i8> %src, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
Index: llvm/test/CodeGen/Thumb2/mve-sext.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-sext.ll
+++ llvm/test/CodeGen/Thumb2/mve-sext.ll
@@ -277,7 +277,7 @@
 define arm_aapcs_vfpcc <8 x i16> @zext_v8i8_v8i16(<8 x i8> %src) {
 ; CHECK-LABEL: zext_v8i8_v8i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmovlb.u8 q0, q0
+; CHECK-NEXT:    vbic.i16 q0, #0xff00
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext <8 x i8> %src to <8 x i16>
@@ -308,41 +308,41 @@
 define arm_aapcs_vfpcc <16 x i16> @zext_v16i8_v16i16(<16 x i8> %src) {
 ; CHECK-LABEL: zext_v16i8_v16i16:
 ; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov q2, q0
 ; CHECK-NEXT:    vmov.u8 r0, q0[0]
+; CHECK-NEXT:    vmov.16 q0[0], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[1]
+; CHECK-NEXT:    vmov.16 q0[1], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[2]
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[3]
+; CHECK-NEXT:    vmov.16 q0[3], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[4]
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[5]
+; CHECK-NEXT:    vmov.16 q0[5], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[6]
+; CHECK-NEXT:    vmov.16 q0[6], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[7]
+; CHECK-NEXT:    vmov.16 q0[7], r0
+; CHECK-NEXT:    vmov.u8 r0, q2[8]
 ; CHECK-NEXT:    vmov.16 q1[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[1]
-; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[2]
-; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[3]
-; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[4]
-; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[5]
-; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[6]
-; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[7]
-; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[8]
-; CHECK-NEXT:    vmovlb.u8 q2, q1
-; CHECK-NEXT:    vmov.16 q1[0], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[9]
+; CHECK-NEXT:    vmov.u8 r0, q2[9]
 ; CHECK-NEXT:    vmov.16 q1[1], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[10]
+; CHECK-NEXT:    vmov.u8 r0, q2[10]
 ; CHECK-NEXT:    vmov.16 q1[2], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[11]
+; CHECK-NEXT:    vmov.u8 r0, q2[11]
 ; CHECK-NEXT:    vmov.16 q1[3], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[12]
+; CHECK-NEXT:    vmov.u8 r0, q2[12]
 ; CHECK-NEXT:    vmov.16 q1[4], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[13]
+; CHECK-NEXT:    vmov.u8 r0, q2[13]
 ; CHECK-NEXT:    vmov.16 q1[5], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[14]
+; CHECK-NEXT:    vmov.u8 r0, q2[14]
 ; CHECK-NEXT:    vmov.16 q1[6], r0
-; CHECK-NEXT:    vmov.u8 r0, q0[15]
+; CHECK-NEXT:    vmov.u8 r0, q2[15]
 ; CHECK-NEXT:    vmov.16 q1[7], r0
-; CHECK-NEXT:    vmov q0, q2
-; CHECK-NEXT:    vmovlb.u8 q1, q1
+; CHECK-NEXT:    vbic.i16 q0, #0xff00
+; CHECK-NEXT:    vbic.i16 q1, #0xff00
 ; CHECK-NEXT:    bx lr
 entry:
   %0 = zext <16 x i8> %src to <16 x i16>
Index: llvm/test/CodeGen/Thumb2/mve-masked-load.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-masked-load.ll
+++ llvm/test/CodeGen/Thumb2/mve-masked-load.ll
@@ -998,21 +998,21 @@
 define arm_aapcs_vfpcc <8 x i16> @zext8_masked_v8i16_align1_other(<8 x i8> *%dest, <8 x i8> %a) {
 ; CHECK-LE-LABEL: zext8_masked_v8i16_align1_other:
 ; CHECK-LE:       @ %bb.0: @ %entry
-; CHECK-LE-NEXT:    vmovlb.u8 q1, q0
-; CHECK-LE-NEXT:    vmovlb.s8 q0, q0
-; CHECK-LE-NEXT:    vpt.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vldrbt.u16 q0, [r0]
-; CHECK-LE-NEXT:    vpsel q0, q0, q1
+; CHECK-LE-NEXT:    vmovlb.s8 q1, q0
+; CHECK-LE-NEXT:    vpt.s16 gt, q1, zr
+; CHECK-LE-NEXT:    vldrbt.u16 q1, [r0]
+; CHECK-LE-NEXT:    vbic.i16 q0, #0xff00
+; CHECK-LE-NEXT:    vpsel q0, q1, q0
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: zext8_masked_v8i16_align1_other:
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    vrev64.16 q1, q0
-; CHECK-BE-NEXT:    vmovlb.u8 q0, q1
-; CHECK-BE-NEXT:    vmovlb.s8 q1, q1
-; CHECK-BE-NEXT:    vpt.s16 gt, q1, zr
-; CHECK-BE-NEXT:    vldrbt.u16 q1, [r0]
-; CHECK-BE-NEXT:    vpsel q1, q1, q0
+; CHECK-BE-NEXT:    vmovlb.s8 q0, q1
+; CHECK-BE-NEXT:    vpt.s16 gt, q0, zr
+; CHECK-BE-NEXT:    vldrbt.u16 q0, [r0]
+; CHECK-BE-NEXT:    vbic.i16 q1, #0xff00
+; CHECK-BE-NEXT:    vpsel q1, q0, q1
 ; CHECK-BE-NEXT:    vrev64.16 q0, q1
 ; CHECK-BE-NEXT:    bx lr
 entry:
Index: llvm/test/CodeGen/Thumb2/mve-intrinsics/bitwise-imm.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/Thumb2/mve-intrinsics/bitwise-imm.ll
@@ -0,0 +1,343 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc <8 x i16> @test_vbicq_n_u16_sh0(<8 x i16> %a) {
+; CHECK-LABEL: test_vbicq_n_u16_sh0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vbic.i16 q0, #0x64
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = and <8 x i16> %a, <i16 -101, i16 -101, i16 -101, i16 -101, i16 -101, i16 -101, i16 -101, i16 -101>
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vbicq_n_u16_sh8(<8 x i16> %a) {
+; CHECK-LABEL: test_vbicq_n_u16_sh8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vbic.i16 q0, #0x6400
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = and <8 x i16> %a, <i16 -25601, i16 -25601, i16 -25601, i16 -25601, i16 -25601, i16 -25601, i16 -25601, i16 -25601>
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vbicq_n_u32_sh0(<4 x i32> %a) {
+; CHECK-LABEL: test_vbicq_n_u32_sh0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vbic.i32 q0, #0x64
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = and <4 x i32> %a, <i32 -101, i32 -101, i32 -101, i32 -101>
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vbicq_n_u32_sh8(<4 x i32> %a) {
+; CHECK-LABEL: test_vbicq_n_u32_sh8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vbic.i32 q0, #0x6400
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = and <4 x i32> %a, <i32 -25601, i32 -25601, i32 -25601, i32 -25601>
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vbicq_n_u32_sh16(<4 x i32> %a) {
+; CHECK-LABEL: test_vbicq_n_u32_sh16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vbic.i32 q0, #0x640000
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = and <4 x i32> %a, <i32 -6553601, i32 -6553601, i32 -6553601, i32 -6553601>
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vbicq_n_u32_sh24(<4 x i32> %a) {
+; CHECK-LABEL: test_vbicq_n_u32_sh24:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vbic.i32 q0, #0x64000000
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = and <4 x i32> %a, <i32 -1677721601, i32 -1677721601, i32 -1677721601, i32 -1677721601>
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vorrq_n_u16_sh0(<8 x i16> %a) {
+; CHECK-LABEL: test_vorrq_n_u16_sh0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vorr.i16 q0, #0x64
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = or <8 x i16> %a, <i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100, i16 100>
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vorrq_n_u16_sh8(<8 x i16> %a) {
+; CHECK-LABEL: test_vorrq_n_u16_sh8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vorr.i16 q0, #0x6400
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = or <8 x i16> %a, <i16 25600, i16 25600, i16 25600, i16 25600, i16 25600, i16 25600, i16 25600, i16 25600>
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vorrq_n_u32_sh0(<4 x i32> %a) {
+; CHECK-LABEL: test_vorrq_n_u32_sh0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vorr.i32 q0, #0x64
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = or <4 x i32> %a, <i32 100, i32 100, i32 100, i32 100>
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vorrq_n_u32_sh8(<4 x i32> %a) {
+; CHECK-LABEL: test_vorrq_n_u32_sh8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vorr.i32 q0, #0x6400
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = or <4 x i32> %a, <i32 25600, i32 25600, i32 25600, i32 25600>
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vorrq_n_u32_sh16(<4 x i32> %a) {
+; CHECK-LABEL: test_vorrq_n_u32_sh16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vorr.i32 q0, #0x640000
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = or <4 x i32> %a, <i32 6553600, i32 6553600, i32 6553600, i32 6553600>
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vorrq_n_u32_sh24(<4 x i32> %a) {
+; CHECK-LABEL: test_vorrq_n_u32_sh24:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vorr.i32 q0, #0x64000000
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = or <4 x i32> %a, <i32 1677721600, i32 1677721600, i32 1677721600, i32 1677721600>
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vbicq_m_n_u16_sh0(<8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vbicq_m_n_u16_sh0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vbict.i16 q0, #0x64
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.bic.imm.predicated.v8i16.v8i1(<8 x i16> %a, i32 100, <8 x i1> %1)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vbicq_m_n_u16_sh8(<8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vbicq_m_n_u16_sh8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vbict.i16 q0, #0x6400
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.bic.imm.predicated.v8i16.v8i1(<8 x i16> %a, i32 25600, <8 x i1> %1)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vbicq_m_n_u32_sh0(<4 x i32> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vbicq_m_n_u32_sh0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vbict.i32 q0, #0x64
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.bic.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 100, <4 x i1> %1)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vbicq_m_n_u32_sh8(<4 x i32> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vbicq_m_n_u32_sh8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vbict.i32 q0, #0x6400
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.bic.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 25600, <4 x i1> %1)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vbicq_m_n_u32_sh16(<4 x i32> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vbicq_m_n_u32_sh16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vbict.i32 q0, #0x640000
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.bic.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 6553600, <4 x i1> %1)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vbicq_m_n_u32_sh24(<4 x i32> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vbicq_m_n_u32_sh24:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vbict.i32 q0, #0x64000000
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.bic.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 1677721600, <4 x i1> %1)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vorrq_m_n_u16_sh0(<8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vorrq_m_n_u16_sh0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vorrt.i16 q0, #0x64
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.orr.imm.predicated.v8i16.v8i1(<8 x i16> %a, i32 100, <8 x i1> %1)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vorrq_m_n_u16_sh8(<8 x i16> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vorrq_m_n_u16_sh8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vorrt.i16 q0, #0x6400
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call <8 x i16> @llvm.arm.mve.orr.imm.predicated.v8i16.v8i1(<8 x i16> %a, i32 25600, <8 x i1> %1)
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vorrq_m_n_u32_sh0(<4 x i32> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vorrq_m_n_u32_sh0:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vorrt.i32 q0, #0x64
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.orr.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 100, <4 x i1> %1)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vorrq_m_n_u32_sh8(<4 x i32> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vorrq_m_n_u32_sh8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vorrt.i32 q0, #0x6400
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.orr.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 25600, <4 x i1> %1)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vorrq_m_n_u32_sh16(<4 x i32> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vorrq_m_n_u32_sh16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vorrt.i32 q0, #0x640000
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.orr.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 6553600, <4 x i1> %1)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vorrq_m_n_u32_sh24(<4 x i32> %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vorrq_m_n_u32_sh24:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vorrt.i32 q0, #0x64000000
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call <4 x i32> @llvm.arm.mve.orr.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 1677721600, <4 x i1> %1)
+  ret <4 x i32> %2
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmvnq_n_u16() {
+; CHECK-LABEL: test_vmvnq_n_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmvn.i16 q0, #0xaa00
+; CHECK-NEXT:    bx lr
+entry:
+  ret <8 x i16> <i16 -43521, i16 -43521, i16 -43521, i16 -43521, i16 -43521, i16 -43521, i16 -43521, i16 -43521>
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vmvnq_n_u32() {
+; CHECK-LABEL: test_vmvnq_n_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmvn.i32 q0, #0xaa00
+; CHECK-NEXT:    bx lr
+entry:
+  ret <4 x i32> <i32 -43521, i32 -43521, i32 -43521, i32 -43521>
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vmvnq_m_n_u16(<8 x i16> %inactive, i16 zeroext %p) {
+; CHECK-LABEL: test_vmvnq_m_n_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmvnt.i16 q0, #0xaa00
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = select <8 x i1> %1, <8 x i16> <i16 -43521, i16 -43521, i16 -43521, i16 -43521, i16 -43521, i16 -43521, i16 -43521, i16 -43521>, <8 x i16> %inactive
+  ret <8 x i16> %2
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vmvnq_m_n_u32(<4 x i32> %inactive, i16 zeroext %p) {
+; CHECK-LABEL: test_vmvnq_m_n_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmvnt.i32 q0, #0xaa00
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = select <4 x i1> %1, <4 x i32> <i32 -43521, i32 -43521, i32 -43521, i32 -43521>, <4 x i32> %inactive
+  ret <4 x i32> %2
+}
+
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+declare <8 x i16> @llvm.arm.mve.bic.imm.predicated.v8i16.v8i1(<8 x i16>, i32, <8 x i1>)
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+declare <4 x i32> @llvm.arm.mve.bic.imm.predicated.v4i32.v4i1(<4 x i32>, i32, <4 x i1>)
+declare <8 x i16> @llvm.arm.mve.orr.imm.predicated.v8i16.v8i1(<8 x i16>, i32, <8 x i1>)
+declare <4 x i32> @llvm.arm.mve.orr.imm.predicated.v4i32.v4i1(<4 x i32>, i32, <4 x i1>)
Index: llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
+++ llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
@@ -588,7 +588,7 @@
 ; CHECK-NEXT:    vmov.16 q0[5], r2
 ; CHECK-NEXT:    vmov.16 q0[6], r1
 ; CHECK-NEXT:    vmov.16 q0[7], r4
-; CHECK-NEXT:    vmovlb.u8 q0, q0
+; CHECK-NEXT:    vbic.i16 q0, #0xff00
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 entry:
   %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
Index: llvm/lib/Target/ARM/ARMInstrNEON.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrNEON.td
+++ llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -509,11 +509,6 @@
 def NEONvsliImm      : SDNode<"ARMISD::VSLIIMM", SDTARMVSHINSIMM>;
 def NEONvsriImm      : SDNode<"ARMISD::VSRIIMM", SDTARMVSHINSIMM>;
 
-def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
-                                           SDTCisVT<2, i32>]>;
-def NEONvorrImm   : SDNode<"ARMISD::VORRIMM", SDTARMVORRIMM>;
-def NEONvbicImm   : SDNode<"ARMISD::VBICIMM", SDTARMVORRIMM>;
-
 def NEONvbsl      : SDNode<"ARMISD::VBSL",
                            SDTypeProfile<1, 3, [SDTCisVec<0>,
                                                 SDTCisSameAs<0, 1>,
@@ -5296,7 +5291,7 @@
                           IIC_VMOVImm,
                           "vorr", "i16", "$Vd, $SIMM", "$src = $Vd",
                           [(set DPR:$Vd,
-                            (v4i16 (NEONvorrImm DPR:$src, timm:$SIMM)))]> {
+                            (v4i16 (ARMvorrImm DPR:$src, timm:$SIMM)))]> {
   let Inst{9} = SIMM{9};
 }
 
@@ -5305,7 +5300,7 @@
                           IIC_VMOVImm,
                           "vorr", "i32", "$Vd, $SIMM", "$src = $Vd",
                           [(set DPR:$Vd,
-                            (v2i32 (NEONvorrImm DPR:$src, timm:$SIMM)))]> {
+                            (v2i32 (ARMvorrImm DPR:$src, timm:$SIMM)))]> {
   let Inst{10-9} = SIMM{10-9};
 }
 
@@ -5314,7 +5309,7 @@
                           IIC_VMOVImm,
                           "vorr", "i16", "$Vd, $SIMM", "$src = $Vd",
                           [(set QPR:$Vd,
-                            (v8i16 (NEONvorrImm QPR:$src, timm:$SIMM)))]> {
+                            (v8i16 (ARMvorrImm QPR:$src, timm:$SIMM)))]> {
   let Inst{9} = SIMM{9};
 }
 
@@ -5323,7 +5318,7 @@
                           IIC_VMOVImm,
                           "vorr", "i32", "$Vd, $SIMM", "$src = $Vd",
                           [(set QPR:$Vd,
-                            (v4i32 (NEONvorrImm QPR:$src, timm:$SIMM)))]> {
+                            (v4i32 (ARMvorrImm QPR:$src, timm:$SIMM)))]> {
   let Inst{10-9} = SIMM{10-9};
 }
 
@@ -5347,7 +5342,7 @@
                           IIC_VMOVImm,
                           "vbic", "i16", "$Vd, $SIMM", "$src = $Vd",
                           [(set DPR:$Vd,
-                            (v4i16 (NEONvbicImm DPR:$src, timm:$SIMM)))]> {
+                            (v4i16 (ARMvbicImm DPR:$src, timm:$SIMM)))]> {
   let Inst{9} = SIMM{9};
 }
 
@@ -5356,7 +5351,7 @@
                           IIC_VMOVImm,
                           "vbic", "i32", "$Vd, $SIMM", "$src = $Vd",
                           [(set DPR:$Vd,
-                            (v2i32 (NEONvbicImm DPR:$src, timm:$SIMM)))]> {
+                            (v2i32 (ARMvbicImm DPR:$src, timm:$SIMM)))]> {
   let Inst{10-9} = SIMM{10-9};
 }
 
@@ -5365,7 +5360,7 @@
                           IIC_VMOVImm,
                           "vbic", "i16", "$Vd, $SIMM", "$src = $Vd",
                           [(set QPR:$Vd,
-                            (v8i16 (NEONvbicImm QPR:$src, timm:$SIMM)))]> {
+                            (v8i16 (ARMvbicImm QPR:$src, timm:$SIMM)))]> {
   let Inst{9} = SIMM{9};
 }
 
@@ -5374,7 +5369,7 @@
                           IIC_VMOVImm,
                           "vbic", "i32", "$Vd, $SIMM", "$src = $Vd",
                           [(set QPR:$Vd,
-                            (v4i32 (NEONvbicImm QPR:$src, timm:$SIMM)))]> {
+                            (v4i32 (ARMvbicImm QPR:$src, timm:$SIMM)))]> {
   let Inst{10-9} = SIMM{10-9};
 }
 
Index: llvm/lib/Target/ARM/ARMInstrMVE.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrMVE.td
+++ llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -2265,6 +2265,15 @@
 
   def : Pat<(v4f32 (ARMvmovFPImm timm:$simm)),
             (v4f32 (MVE_VMOVimmf32 nImmVMOVF32:$simm))>;
+
+  def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred), (ARMvmvnImm timm:$simm),
+                            MQPR:$inactive)),
+            (v8i16 (MVE_VMVNimmi16 nImmSplatI16:$simm,
+                            ARMVCCThen, VCCR:$pred, MQPR:$inactive))>;
+  def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (ARMvmvnImm timm:$simm),
+                            MQPR:$inactive)),
+            (v4i32 (MVE_VMVNimmi32 nImmSplatI32:$simm,
+                            ARMVCCThen, VCCR:$pred, MQPR:$inactive))>;
 }
 
 class MVE_VMINMAXA<string iname, string suffix, bits<2> size,
Index: llvm/lib/Target/ARM/ARMInstrInfo.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrInfo.td
+++ llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -274,6 +274,10 @@
 def ARMvmvnImm   : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>;
 def ARMvmovFPImm : SDNode<"ARMISD::VMOVFPIMM", SDTARMVMOVIMM>;
 
+def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                                           SDTCisVT<2, i32>]>;
+def ARMvorrImm   : SDNode<"ARMISD::VORRIMM", SDTARMVORRIMM>;
+def ARMvbicImm   : SDNode<"ARMISD::VBICIMM", SDTARMVORRIMM>;
 
 def SDTARMVSHIMM : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
                                         SDTCisVT<2, i32>]>;
Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -12176,7 +12176,7 @@
   APInt SplatBits, SplatUndef;
   unsigned SplatBitSize;
   bool HasAnyUndefs;
-  if (BVN && Subtarget->hasNEON() &&
+  if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
     if (SplatBitSize <= 64) {
       EVT VbicVT;
@@ -12483,7 +12483,7 @@
   APInt SplatBits, SplatUndef;
   unsigned SplatBitSize;
   bool HasAnyUndefs;
-  if (BVN && Subtarget->hasNEON() &&
+  if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
     if (SplatBitSize <= 64) {
       EVT VorrVT;
Index: llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -268,6 +268,20 @@
   void SelectMVE_VLD(SDNode *N, unsigned NumVecs,
                      const uint16_t *const *Opcodes);
 
+  /// SelectMVE_VBIC_VORR_imm - Select immediate forms of MVE VBIC and
+  /// VORR instructions, which have different MC ids depending on the
+  /// shape of the constant. N should be an ARMISD::VBICIMM / VORRIMM
+  /// node; the array Opcodes has 6 entries, used for 8-bit constants
+  /// shifted left by 0, 8, 16 or 24 bits in a 32-bit word, and 0 or 8
+  /// bits in a 16-bit word, respectively.
+  void SelectMVE_VBIC_VORR_imm(SDNode *N, const uint16_t *Opcodes);
+
+  /// SelectMVE_VBIC_VORR_imm_predicated - Select immediate MVE VBIC
+  /// and VORR, but this time the predicated forms, starting from a
+  /// node that is a call to the arm.mve.{bic,orr}.predicated
+  /// intrinsic.
+  void SelectMVE_VBIC_VORR_imm_predicated(SDNode *N, const uint16_t *Opcodes);
+
   /// SelectVLDDup - Select NEON load-duplicate intrinsics.  NumVecs
   /// should be 1, 2, 3 or 4.  The opcode array specifies the instructions used
   /// for loading D registers.
@@ -2581,6 +2595,91 @@
   CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
 }
 
+static const uint16_t MVE_VBIC_Opcodes[] = {
+  ARM::MVE_VBICIZ0v4i32, ARM::MVE_VBICIZ8v4i32,
+  ARM::MVE_VBICIZ16v4i32, ARM::MVE_VBICIZ24v4i32,
+  ARM::MVE_VBICIZ0v8i16, ARM::MVE_VBICIZ8v8i16,
+};
+static const uint16_t MVE_VORR_Opcodes[] = {
+  ARM::MVE_VORRIZ0v4i32, ARM::MVE_VORRIZ8v4i32,
+  ARM::MVE_VORRIZ16v4i32, ARM::MVE_VORRIZ24v4i32,
+  ARM::MVE_VORRIZ0v8i16, ARM::MVE_VORRIZ8v8i16,
+};
+
+void ARMDAGToDAGISel::SelectMVE_VBIC_VORR_imm(SDNode *N,
+                                              const uint16_t *Opcodes) {
+  SDLoc Loc(N);
+
+  // The VBICIMM or VORRIMM SDNode will have encoded the constant in
+  // NEON form, which we unpack to select between MVE opcodes.
+  //
+  // We expect its low 8 bits to be the literal 8-bit value that will
+  // be shifted to form the true constant, and the higher bits to
+  // specify the NEON control bits. In our case those bits will be
+  // 0,2,4,6,8,10 for the six entries in our Opcodes array.
+  unsigned NeonConst = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  unsigned ShiftedValue = NeonConst & 0xFF;
+  unsigned ControlBits = NeonConst >> 8;
+  assert(ControlBits % 2 == 0 &&
+         "unexpected NEON constant encoding for MVE VBIC/VORR");
+  unsigned OpcodeIndex = ControlBits >> 1;
+  assert(OpcodeIndex < 6 &&
+         "unexpected NEON constant encoding for MVE VBIC/VORR");
+  uint16_t Opcode = Opcodes[OpcodeIndex];
+
+  // The actual constant operand will have the same value that would
+  // be shown in assembly. So we shift it left by the right number of
+  // bytes, which is 0,1,2,3 for the first four opcode types (with a
+  // 32-bit word size) and then 0,1 again for the other two (16-bit).
+  unsigned ShiftBytes = OpcodeIndex & 3;
+  uint32_t MveConst = ShiftedValue << (8 * ShiftBytes);
+
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(N->getOperand(0));
+  Ops.push_back(CurDAG->getTargetConstant(MveConst, Loc, MVT::i32));
+  // FIXME take that predicate away again
+  AddEmptyMVEPredicateToOps(Ops, Loc);
+  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
+}
+
+void ARMDAGToDAGISel::SelectMVE_VBIC_VORR_imm_predicated(
+    SDNode *N, const uint16_t *Opcodes) {
+  SDLoc Loc(N);
+
+  uint32_t MveConst = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+
+  unsigned ElementBits;
+  switch (N->getValueType(0).getSimpleVT().SimpleTy) {
+  case MVT::v8i16:
+    ElementBits = 16;
+    break;
+  case MVT::v4i32:
+    ElementBits = 32;
+    break;
+  default:
+    llvm_unreachable("bad vector type in SelectMVE_VBIC_VORR_imm_predicated");
+  }
+
+  unsigned Shift, ShiftedValue;
+  for (Shift = 0; Shift < ElementBits; Shift += 8) {
+    ShiftedValue = MveConst >> Shift;
+    if (MveConst == ShiftedValue << Shift)
+      break;
+  }
+  assert(Shift < ElementBits &&
+         "bad constant in SelectMVE_VBIC_VORR_imm_predicated");
+
+  unsigned ShiftBytes = Shift / 8;
+  unsigned OpcodeIndex = (ElementBits == 16 ? 4 : 0) + ShiftBytes;
+  uint16_t Opcode = Opcodes[OpcodeIndex];
+
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(N->getOperand(1));
+  Ops.push_back(CurDAG->getTargetConstant(MveConst, Loc, MVT::i32));
+  AddMVEPredicateToOps(Ops, Loc, N->getOperand(3));
+  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
+}
+
 static bool SDValueToConstBool(SDValue SDVal) {
   assert(isa<ConstantSDNode>(SDVal) && "expected a compile-time constant");
   ConstantSDNode *SDValConstant = dyn_cast<ConstantSDNode>(SDVal);
@@ -4573,6 +4672,14 @@
                           OpcodesS, OpcodesU);
       return;
     }
+
+    case Intrinsic::arm_mve_bic_imm_predicated:
+      SelectMVE_VBIC_VORR_imm_predicated(N, MVE_VBIC_Opcodes);
+      return;
+    case Intrinsic::arm_mve_orr_imm_predicated:
+      SelectMVE_VBIC_VORR_imm_predicated(N, MVE_VORR_Opcodes);
+      return;
+
     }
     break;
   }
@@ -4580,6 +4687,26 @@
   case ISD::ATOMIC_CMP_SWAP:
     SelectCMP_SWAP(N);
     return;
+
+  case ARMISD::VBICIMM:
+    if (Subtarget->hasMVEIntegerOps()) {
+      // The MVE version of this instruction is divided into
+      // sub-opcodes in a way that makes it tricky to select using
+      // Tablegen patterns, so we use custom C++.
+      SelectMVE_VBIC_VORR_imm(N, MVE_VBIC_Opcodes);
+      return;
+    }
+    // On NEON, Tablegen can handle the job.
+    break;
+
+  case ARMISD::VORRIMM:
+    // As with VBICIMM, we have to do this by hand on MVE, and leave
+    // it to the automation otherwise.
+    if (Subtarget->hasMVEIntegerOps()) {
+      SelectMVE_VBIC_VORR_imm(N, MVE_VORR_Opcodes);
+      return;
+    }
+    break;
   }
 
   SelectCode(N);
Index: llvm/include/llvm/IR/IntrinsicsARM.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsARM.td
+++ llvm/include/llvm/IR/IntrinsicsARM.td
@@ -1133,4 +1133,10 @@
   [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
    llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>],
   llvm_anyvector_ty>;
+
+def int_arm_mve_bic_imm_predicated: Intrinsic<[llvm_anyvector_ty],
+   [LLVMMatchType<0>, llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
+def int_arm_mve_orr_imm_predicated: Intrinsic<[llvm_anyvector_ty],
+   [LLVMMatchType<0>, llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
+
 } // end TargetPrefix
Index: clang/utils/TableGen/MveEmitter.cpp
===================================================================
--- clang/utils/TableGen/MveEmitter.cpp
+++ clang/utils/TableGen/MveEmitter.cpp
@@ -882,38 +882,38 @@
         break;
       case ImmediateArg::BoundsType::UInt:
         lo = 0;
-        hi = IA.i1;
+        hi = llvm::APInt::getMaxValue(IA.i1).zext(128);
         break;
       }
 
-      llvm::APInt typelo, typehi;
-      unsigned Bits = IA.ArgType->sizeInBits();
-      if (cast<ScalarType>(IA.ArgType)->kind() == ScalarTypeKind::SignedInt) {
-        typelo = llvm::APInt::getSignedMinValue(Bits).sext(128);
-        typehi = llvm::APInt::getSignedMaxValue(Bits).sext(128);
-      } else {
-        typelo = llvm::APInt::getMinValue(Bits).zext(128);
-        typehi = llvm::APInt::getMaxValue(Bits).zext(128);
-      }
-
       std::string Index = utostr(kv.first);
 
-      if (lo.sle(typelo) && hi.sge(typehi))
-        SemaChecks.push_back("SemaBuiltinConstantArg(TheCall, " + Index + ")");
-      else
+      unsigned ArgTypeBits = IA.ArgType->sizeInBits();
+      llvm::APInt ArgTypeRange = llvm::APInt::getMaxValue(ArgTypeBits).zext(128);
+      llvm::APInt ActualRange = (hi-lo).trunc(64).sext(128);
+      if (ActualRange.ult(ArgTypeRange))
         SemaChecks.push_back("SemaBuiltinConstantArgRange(TheCall, " + Index +
                              ", " + signedHexLiteral(lo) + ", " +
                              signedHexLiteral(hi) + ")");
 
       if (!IA.ExtraCheckType.empty()) {
         std::string Suffix;
-        if (!IA.ExtraCheckArgs.empty())
-          Suffix = (Twine(", ") + IA.ExtraCheckArgs).str();
+        if (!IA.ExtraCheckArgs.empty()) {
+          std::string tmp;
+          StringRef Arg = IA.ExtraCheckArgs;
+          if (Arg == "!lanesize") {
+            tmp = utostr(IA.ArgType->sizeInBits());
+            Arg = tmp;
+          }
+          Suffix = (Twine(", ") + Arg).str();
+        }
         SemaChecks.push_back((Twine("SemaBuiltinConstantArg") +
                               IA.ExtraCheckType + "(TheCall, " + Index +
                               Suffix + ")")
                                  .str());
       }
+
+      assert(!SemaChecks.empty());
     }
     if (SemaChecks.empty())
       return "";
Index: clang/test/Sema/arm-mve-immediates.c
===================================================================
--- clang/test/Sema/arm-mve-immediates.c
+++ clang/test/Sema/arm-mve-immediates.c
@@ -203,3 +203,73 @@
   vsriq(vw, vw, 0); // expected-error {{argument value 0 is outside the valid range [1, 32]}}
   vsriq(vw, vw, 33); // expected-error {{argument value 33 is outside the valid range [1, 32]}}
 }
+
+void test_simd_bic_orr(int16x8_t h, int32x4_t w)
+{
+    h = vbicq(h, 0x0000);
+    h = vbicq(h, 0x0001);
+    h = vbicq(h, 0x00FF);
+    h = vbicq(h, 0x0100);
+    h = vbicq(h, 0x0101); // expected-error-re {{argument should be an 8-bit value shifted by a multiple of 8 bits{{$}}}}
+    h = vbicq(h, 0x01FF); // expected-error-re {{argument should be an 8-bit value shifted by a multiple of 8 bits{{$}}}}
+    h = vbicq(h, 0xFF00);
+
+    w = vbicq(w, 0x00000000);
+    w = vbicq(w, 0x00000001);
+    w = vbicq(w, 0x000000FF);
+    w = vbicq(w, 0x00000100);
+    w = vbicq(w, 0x0000FF00);
+    w = vbicq(w, 0x00010000);
+    w = vbicq(w, 0x00FF0000);
+    w = vbicq(w, 0x01000000);
+    w = vbicq(w, 0xFF000000);
+    w = vbicq(w, 0x01000001); // expected-error-re {{argument should be an 8-bit value shifted by a multiple of 8 bits{{$}}}}
+    w = vbicq(w, 0x01FFFFFF); // expected-error-re {{argument should be an 8-bit value shifted by a multiple of 8 bits{{$}}}}
+
+    h = vorrq(h, 0x0000);
+    h = vorrq(h, 0x0001);
+    h = vorrq(h, 0x00FF);
+    h = vorrq(h, 0x0100);
+    h = vorrq(h, 0x0101); // expected-error-re {{argument should be an 8-bit value shifted by a multiple of 8 bits{{$}}}}
+    h = vorrq(h, 0x01FF); // expected-error-re {{argument should be an 8-bit value shifted by a multiple of 8 bits{{$}}}}
+    h = vorrq(h, 0xFF00);
+
+    w = vorrq(w, 0x00000000);
+    w = vorrq(w, 0x00000001);
+    w = vorrq(w, 0x000000FF);
+    w = vorrq(w, 0x00000100);
+    w = vorrq(w, 0x0000FF00);
+    w = vorrq(w, 0x00010000);
+    w = vorrq(w, 0x00FF0000);
+    w = vorrq(w, 0x01000000);
+    w = vorrq(w, 0xFF000000);
+    w = vorrq(w, 0x01000001); // expected-error-re {{argument should be an 8-bit value shifted by a multiple of 8 bits{{$}}}}
+    w = vorrq(w, 0x01FFFFFF); // expected-error-re {{argument should be an 8-bit value shifted by a multiple of 8 bits{{$}}}}
+}
+
+void test_simd_vmvn(void)
+{
+    uint16x8_t h;
+    h = vmvnq_n_u16(0x0000);
+    h = vmvnq_n_u16(0x0001);
+    h = vmvnq_n_u16(0x00FF);
+    h = vmvnq_n_u16(0x0100);
+    h = vmvnq_n_u16(0x0101); // expected-error {{argument should be an 8-bit value shifted by a multiple of 8 bits, or in the form 0x??FF}}
+    h = vmvnq_n_u16(0x01FF);
+    h = vmvnq_n_u16(0xFF00);
+
+    uint32x4_t w;
+    w = vmvnq_n_u32(0x00000000);
+    w = vmvnq_n_u32(0x00000001);
+    w = vmvnq_n_u32(0x000000FF);
+    w = vmvnq_n_u32(0x00000100);
+    w = vmvnq_n_u32(0x0000FF00);
+    w = vmvnq_n_u32(0x00010000);
+    w = vmvnq_n_u32(0x00FF0000);
+    w = vmvnq_n_u32(0x01000000);
+    w = vmvnq_n_u32(0xFF000000);
+    w = vmvnq_n_u32(0x01000001); // expected-error {{argument should be an 8-bit value shifted by a multiple of 8 bits, or in the form 0x??FF}}
+    w = vmvnq_n_u32(0x01FFFFFF); // expected-error {{argument should be an 8-bit value shifted by a multiple of 8 bits, or in the form 0x??FF}}
+    w = vmvnq_n_u32(0x0001FFFF); // expected-error {{argument should be an 8-bit value shifted by a multiple of 8 bits, or in the form 0x??FF}}
+    w = vmvnq_n_u32(0x000001FF);
+}
Index: clang/test/CodeGen/arm-mve-intrinsics/bitwise-imm.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/arm-mve-intrinsics/bitwise-imm.c
@@ -0,0 +1,394 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vbicq_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = and <8 x i16> [[A:%.*]], <i16 11007, i16 11007, i16 11007, i16 11007, i16 11007, i16 11007, i16 11007, i16 11007>
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
+int16x8_t test_vbicq_n_s16(int16x8_t a)
+{
+#ifdef POLYMORPHIC
+    return vbicq(a, 0xd500);
+#else /* POLYMORPHIC */
+    return vbicq_n_s16(a, 0xd500);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vbicq_n_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = and <4 x i32> [[A:%.*]], <i32 -252, i32 -252, i32 -252, i32 -252>
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+int32x4_t test_vbicq_n_s32(int32x4_t a)
+{
+#ifdef POLYMORPHIC
+    return vbicq(a, 0xfb);
+#else /* POLYMORPHIC */
+    return vbicq_n_s32(a, 0xfb);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vbicq_n_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = and <8 x i16> [[A:%.*]], <i16 -243, i16 -243, i16 -243, i16 -243, i16 -243, i16 -243, i16 -243, i16 -243>
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
+uint16x8_t test_vbicq_n_u16(uint16x8_t a)
+{
+#ifdef POLYMORPHIC
+    return vbicq(a, 0xf2);
+#else /* POLYMORPHIC */
+    return vbicq_n_u16(a, 0xf2);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vbicq_n_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = and <4 x i32> [[A:%.*]], <i32 -8193, i32 -8193, i32 -8193, i32 -8193>
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+uint32x4_t test_vbicq_n_u32(uint32x4_t a)
+{
+#ifdef POLYMORPHIC
+    return vbicq(a, 0x2000);
+#else /* POLYMORPHIC */
+    return vbicq_n_u32(a, 0x2000);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vorrq_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = or <8 x i16> [[A:%.*]], <i16 195, i16 195, i16 195, i16 195, i16 195, i16 195, i16 195, i16 195>
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
+int16x8_t test_vorrq_n_s16(int16x8_t a)
+{
+#ifdef POLYMORPHIC
+    return vorrq(a, 0xc3);
+#else /* POLYMORPHIC */
+    return vorrq_n_s16(a, 0xc3);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vorrq_n_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = or <4 x i32> [[A:%.*]], <i32 65536, i32 65536, i32 65536, i32 65536>
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+int32x4_t test_vorrq_n_s32(int32x4_t a)
+{
+#ifdef POLYMORPHIC
+    return vorrq(a, 0x10000);
+#else /* POLYMORPHIC */
+    return vorrq_n_s32(a, 0x10000);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vorrq_n_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = or <8 x i16> [[A:%.*]], <i16 -4096, i16 -4096, i16 -4096, i16 -4096, i16 -4096, i16 -4096, i16 -4096, i16 -4096>
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
+uint16x8_t test_vorrq_n_u16(uint16x8_t a)
+{
+#ifdef POLYMORPHIC
+    return vorrq(a, 0xf000);
+#else /* POLYMORPHIC */
+    return vorrq_n_u16(a, 0xf000);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vorrq_n_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = or <4 x i32> [[A:%.*]], <i32 8978432, i32 8978432, i32 8978432, i32 8978432>
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+uint32x4_t test_vorrq_n_u32(uint32x4_t a)
+{
+#ifdef POLYMORPHIC
+    return vorrq(a, 0x890000);
+#else /* POLYMORPHIC */
+    return vorrq_n_u32(a, 0x890000);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmvnq_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret <8 x i16> <i16 27391, i16 27391, i16 27391, i16 27391, i16 27391, i16 27391, i16 27391, i16 27391>
+//
+int16x8_t test_vmvnq_n_s16()
+{
+    return vmvnq_n_s16(0x9500);
+}
+
+// CHECK-LABEL: @test_vmvnq_n_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret <4 x i32> <i32 -5570561, i32 -5570561, i32 -5570561, i32 -5570561>
+//
+int32x4_t test_vmvnq_n_s32()
+{
+    return vmvnq_n_s32(0x550000);
+}
+
+// CHECK-LABEL: @test_vmvnq_n_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret <8 x i16> <i16 -18689, i16 -18689, i16 -18689, i16 -18689, i16 -18689, i16 -18689, i16 -18689, i16 -18689>
+//
+uint16x8_t test_vmvnq_n_u16()
+{
+    return vmvnq_n_u16(0x4900);
+}
+
+// CHECK-LABEL: @test_vmvnq_n_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret <4 x i32> <i32 1023410175, i32 1023410175, i32 1023410175, i32 1023410175>
+//
+uint32x4_t test_vmvnq_n_u32()
+{
+    return vmvnq_n_u32(0xc3000000);
+}
+
+// CHECK-LABEL: @test_vbicq_m_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.bic.imm.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 11264, <8 x i1> [[TMP1]])
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vbicq_m_n_s16(int16x8_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vbicq_m_n(a, 0x2c00, p);
+#else /* POLYMORPHIC */
+    return vbicq_m_n_s16(a, 0x2c00, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vbicq_m_n_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.bic.imm.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 13893632, <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vbicq_m_n_s32(int32x4_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vbicq_m_n(a, 0xd40000, p);
+#else /* POLYMORPHIC */
+    return vbicq_m_n_s32(a, 0xd40000, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vbicq_m_n_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.bic.imm.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 36, <8 x i1> [[TMP1]])
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vbicq_m_n_u16(uint16x8_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vbicq_m_n(a, 0x24, p);
+#else /* POLYMORPHIC */
+    return vbicq_m_n_u16(a, 0x24, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vbicq_m_n_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.bic.imm.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 1644167168, <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vbicq_m_n_u32(uint32x4_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vbicq_m_n(a, 0x62000000, p);
+#else /* POLYMORPHIC */
+    return vbicq_m_n_u32(a, 0x62000000, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vorrq_m_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.orr.imm.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 13568, <8 x i1> [[TMP1]])
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vorrq_m_n_s16(int16x8_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vorrq_m_n(a, 0x3500, p);
+#else /* POLYMORPHIC */
+    return vorrq_m_n_s16(a, 0x3500, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vorrq_m_n_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.orr.imm.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 654311424, <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vorrq_m_n_s32(int32x4_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vorrq_m_n(a, 0x27000000, p);
+#else /* POLYMORPHIC */
+    return vorrq_m_n_s32(a, 0x27000000, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vorrq_m_n_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.orr.imm.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 175, <8 x i1> [[TMP1]])
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vorrq_m_n_u16(uint16x8_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vorrq_m_n(a, 0xaf, p);
+#else /* POLYMORPHIC */
+    return vorrq_m_n_u16(a, 0xaf, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vorrq_m_n_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.orr.imm.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 89, <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vorrq_m_n_u32(uint32x4_t a, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vorrq_m_n(a, 0x59, p);
+#else /* POLYMORPHIC */
+    return vorrq_m_n_u32(a, 0x59, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmvnq_m_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> <i16 -3841, i16 -3841, i16 -3841, i16 -3841, i16 -3841, i16 -3841, i16 -3841, i16 -3841>, <8 x i16> [[INACTIVE:%.*]]
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vmvnq_m_n_s16(int16x8_t inactive, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmvnq_m(inactive, 0xf00, p);
+#else /* POLYMORPHIC */
+    return vmvnq_m_n_s16(inactive, 0xf00, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmvnq_m_n_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> <i32 -18945, i32 -18945, i32 -18945, i32 -18945>, <4 x i32> [[INACTIVE:%.*]]
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vmvnq_m_n_s32(int32x4_t inactive, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmvnq_m(inactive, 0x4a00, p);
+#else /* POLYMORPHIC */
+    return vmvnq_m_n_s32(inactive, 0x4a00, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmvnq_m_n_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> <i16 23295, i16 23295, i16 23295, i16 23295, i16 23295, i16 23295, i16 23295, i16 23295>, <8 x i16> [[INACTIVE:%.*]]
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vmvnq_m_n_u16(uint16x8_t inactive, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmvnq_m(inactive, 0xa500, p);
+#else /* POLYMORPHIC */
+    return vmvnq_m_n_u16(inactive, 0xa500, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmvnq_m_n_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> <i32 -63489, i32 -63489, i32 -63489, i32 -63489>, <4 x i32> [[INACTIVE:%.*]]
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vmvnq_m_n_u32(uint32x4_t inactive, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+    return vmvnq_m(inactive, 0xf800, p);
+#else /* POLYMORPHIC */
+    return vmvnq_m_n_u32(inactive, 0xf800, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmvnq_x_n_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> <i16 767, i16 767, i16 767, i16 767, i16 767, i16 767, i16 767, i16 767>, <8 x i16> undef
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+int16x8_t test_vmvnq_x_n_s16(mve_pred16_t p)
+{
+    return vmvnq_x_n_s16(0xfd00, p);
+}
+
+// CHECK-LABEL: @test_vmvnq_x_n_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> <i32 -12189697, i32 -12189697, i32 -12189697, i32 -12189697>, <4 x i32> undef
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vmvnq_x_n_s32(mve_pred16_t p)
+{
+    return vmvnq_x_n_s32(0xba0000, p);
+}
+
+// CHECK-LABEL: @test_vmvnq_x_n_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> <i16 -21505, i16 -21505, i16 -21505, i16 -21505, i16 -21505, i16 -21505, i16 -21505, i16 -21505>, <8 x i16> undef
+// CHECK-NEXT:    ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vmvnq_x_n_u16(mve_pred16_t p)
+{
+    return vmvnq_x_n_u16(0x5400, p);
+}
+
+// CHECK-LABEL: @test_vmvnq_x_n_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> <i32 -4865, i32 -4865, i32 -4865, i32 -4865>, <4 x i32> undef
+// CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+//
+uint32x4_t test_vmvnq_x_n_u32(mve_pred16_t p)
+{
+    return vmvnq_x_n_u32(0x1300, p);
+}
+
Index: clang/lib/Sema/SemaChecking.cpp
===================================================================
--- clang/lib/Sema/SemaChecking.cpp
+++ clang/lib/Sema/SemaChecking.cpp
@@ -6412,7 +6412,8 @@
 /// SemaBuiltinConstantArgShiftedByte - Check if argument ArgNum of TheCall is
 /// a constant expression representing an arbitrary byte value shifted left by
 /// a multiple of 8 bits.
-bool Sema::SemaBuiltinConstantArgShiftedByte(CallExpr *TheCall, int ArgNum) {
+bool Sema::SemaBuiltinConstantArgShiftedByte(CallExpr *TheCall, int ArgNum,
+                                             unsigned ArgBits) {
   llvm::APSInt Result;
 
   // We can't check the value of a dependent argument.
@@ -6424,6 +6425,10 @@
   if (SemaBuiltinConstantArg(TheCall, ArgNum, Result))
     return true;
 
+  // Truncate to the given size.
+  Result = Result.getLoBits(ArgBits);
+  Result.setIsUnsigned(true);
+
   if (IsShiftedByte(Result))
     return false;
 
@@ -6437,7 +6442,8 @@
 /// 0x00FF, 0x01FF, ..., 0xFFFF). This strange range check is needed for some
 /// Arm MVE intrinsics.
 bool Sema::SemaBuiltinConstantArgShiftedByteOrXXFF(CallExpr *TheCall,
-                                                   int ArgNum) {
+                                                   int ArgNum,
+                                                   unsigned ArgBits) {
   llvm::APSInt Result;
 
   // We can't check the value of a dependent argument.
@@ -6449,6 +6455,10 @@
   if (SemaBuiltinConstantArg(TheCall, ArgNum, Result))
     return true;
 
+  // Truncate to the given size.
+  Result = Result.getLoBits(ArgBits);
+  Result.setIsUnsigned(true);
+
   // Check to see if it's in either of the required forms.
   if (IsShiftedByte(Result) ||
       (Result > 0 && Result < 0x10000 && (Result & 0xFF) == 0xFF))
Index: clang/include/clang/Sema/Sema.h
===================================================================
--- clang/include/clang/Sema/Sema.h
+++ clang/include/clang/Sema/Sema.h
@@ -11601,8 +11601,10 @@
   bool SemaBuiltinConstantArgMultiple(CallExpr *TheCall, int ArgNum,
                                       unsigned Multiple);
   bool SemaBuiltinConstantArgPower2(CallExpr *TheCall, int ArgNum);
-  bool SemaBuiltinConstantArgShiftedByte(CallExpr *TheCall, int ArgNum);
-  bool SemaBuiltinConstantArgShiftedByteOrXXFF(CallExpr *TheCall, int ArgNum);
+  bool SemaBuiltinConstantArgShiftedByte(CallExpr *TheCall, int ArgNum,
+                                         unsigned ArgBits);
+  bool SemaBuiltinConstantArgShiftedByteOrXXFF(CallExpr *TheCall, int ArgNum,
+                                               unsigned ArgBits);
   bool SemaBuiltinARMSpecialReg(unsigned BuiltinID, CallExpr *TheCall,
                                 int ArgNum, unsigned ExpectedFieldNum,
                                 bool AllowName);
Index: clang/include/clang/Basic/arm_mve_defs.td
===================================================================
--- clang/include/clang/Basic/arm_mve_defs.td
+++ clang/include/clang/Basic/arm_mve_defs.td
@@ -319,6 +319,7 @@
   int base = base_;
   Type type = type_;
 }
+def IB_ExtraArg_LaneSize;
 
 // -----------------------------------------------------------------------------
 // End-user definitions for immediate arguments.
@@ -327,11 +328,13 @@
 // intrinsics like vmvnq or vorrq. imm_simd_restrictive has to be an 8-bit
 // value shifted left by a whole number of bytes; imm_simd_vmvn can also be of
 // the form 0xXXFF for some byte value XX.
-def imm_simd_restrictive : Immediate<u32, IB_UEltValue> {
+def imm_simd_restrictive : Immediate<Scalar, IB_UEltValue> {
   let extra = "ShiftedByte";
+  let extraarg = "!lanesize";
 }
-def imm_simd_vmvn : Immediate<u32, IB_UEltValue> {
+def imm_simd_vmvn : Immediate<Scalar, IB_UEltValue> {
   let extra = "ShiftedByteOrXXFF";
+  let extraarg = "!lanesize";
 }
 
 // imm_1toN can take any value from 1 to N inclusive, where N is the number of
@@ -457,26 +460,31 @@
 
 // A wrapper to define both _m and _x versions of a predicated
 // intrinsic.
+//
+// We provide optional parameters to override the polymorphic name
+// types separately for the _m and _x variants, because sometimes they
+// polymorph differently (typically because the type of the inactive
+// parameter can be used as a disambiguator if it's present).
 multiclass IntrinsicMX<Type rettype, dag arguments, dag cg,
                        int wantXVariant = 1,
                        string nameSuffix = "",
+                       PolymorphicNameType pnt_m = PNT_Type,
                        PolymorphicNameType pnt_x = PNT_Type> {
   // The _m variant takes an initial parameter called $inactive, which
   // provides the input value of the output register, i.e. all the
   // inactive lanes in the predicated operation take their values from
   // this.
   def "_m" # nameSuffix:
-     Intrinsic<rettype, !con((args rettype:$inactive), arguments), cg>;
+     Intrinsic<rettype, !con((args rettype:$inactive), arguments), cg> {
+    let pnt = pnt_m;
+  }
 
   foreach unusedVar = !if(!eq(wantXVariant, 1), [1], []<int>) in {
     // The _x variant leaves off that parameter, and simply uses an
     // undef value of the same type.
+
     def "_x" # nameSuffix:
-       Intrinsic<rettype, arguments, (seq (undef rettype):$inactive, cg)> {
-      // Allow overriding of the polymorphic name type, because
-      // sometimes the _m and _x variants polymorph differently
-      // (typically because the type of the inactive parameter can be
-      // used as a disambiguator if it's present).
+      Intrinsic<rettype, arguments, (seq (undef rettype):$inactive, cg)> {
       let pnt = pnt_x;
     }
   }
Index: clang/include/clang/Basic/arm_mve.td
===================================================================
--- clang/include/clang/Basic/arm_mve.td
+++ clang/include/clang/Basic/arm_mve.td
@@ -116,6 +116,28 @@
             NameOverride<"vmulq">;
 }
 
+let params = !listconcat(T.Int16, T.Int32) in {
+  let pnt = PNT_None in {
+    def vmvnq_n: Intrinsic<Vector, (args imm_simd_vmvn:$imm),
+                           (not (splat (Scalar $imm)))>;
+  }
+  defm vmvnq: IntrinsicMX<Vector, (args imm_simd_vmvn:$imm, Predicate:$pred),
+                     (select $pred, (not (splat (Scalar $imm))), $inactive),
+                     1, "_n", PNT_NType, PNT_None>;
+  let pnt = PNT_NType in {
+    def vbicq_n: Intrinsic<Vector, (args Vector:$v, imm_simd_restrictive:$imm),
+                           (and $v, (not (splat (Scalar $imm))))>;
+    def vorrq_n: Intrinsic<Vector, (args Vector:$v, imm_simd_restrictive:$imm),
+                           (or $v, (splat (Scalar $imm)))>;
+  }
+  def vbicq_m_n: Intrinsic<
+    Vector, (args Vector:$v, imm_simd_restrictive:$imm, Predicate:$pred),
+    (IRInt<"bic_imm_predicated", [Vector, Predicate]> $v, (u32 $imm), $pred)>;
+  def vorrq_m_n: Intrinsic<
+    Vector, (args Vector:$v, imm_simd_restrictive:$imm, Predicate:$pred),
+    (IRInt<"orr_imm_predicated", [Vector, Predicate]> $v, (u32 $imm), $pred)>;
+}
+
 // The bitcasting below is not overcomplicating the IR because while
 // Vector and UVector may be different vector types at the C level i.e.
 // vectors of same size signed/unsigned ints. Once they're lowered