https://github.com/Amichaxx updated https://github.com/llvm/llvm-project/pull/149329
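For context, the failure mode this series fixes can be reproduced with a small translation unit like the sketch below (illustrative only, not a file from the patch; it assumes an AArch64 clang with arm_neon.h on the include path):

  // repro.c -- build with:
  //   clang --target=aarch64-linux-gnu -fno-lax-vector-conversions -c repro.c
  // Before this series, lane intrinsics on polynomial vectors were rejected
  // under -fno-lax-vector-conversions: the emitted intrinsic body passed a
  // poly-typed vector to a builtin expecting a signed integer vector and
  // relied on the lax implicit vector conversion.
  #include <arm_neon.h>

  poly16x4_t set_first_lane(poly16_t a, poly16x4_t v) {
    return vset_lane_p16(a, v, 0); // now OK: an explicit __builtin_bit_cast is emitted
  }
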
>From 2895e5e7b56c1c611b39a5c85de92d18f3aae71a Mon Sep 17 00:00:00 2001
From: Amina Chabane <amina.chab...@arm.com>
Date: Tue, 15 Jul 2025 15:56:49 +0000
Subject: [PATCH 1/6] [AArch64][NEON] Fix poly lane intrinsics under -fno-lax-vector-conversions.

Issue originally raised in
https://github.com/llvm/llvm-project/issues/71362#issuecomment-3028515618.
Certain NEON intrinsics that operate on poly types (e.g. poly8x8_t) failed
to compile with the -fno-lax-vector-conversions flag. This patch updates
NeonEmitter.cpp to insert an explicit __builtin_bit_cast from poly types to
the required signed integer vector types when generating lane-based
intrinsics. A test neon-bitcast-poly is included.
---
 clang/utils/TableGen/NeonEmitter.cpp          | 10 +++-
 .../test/CodeGen/AArch64/neon-bitcast-poly.ll | 51 +++++++++++++++++++
 2 files changed, 60 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/AArch64/neon-bitcast-poly.ll

diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index 409f1c4f71834..574a29d0e4dd9 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -1401,12 +1401,20 @@ void Intrinsic::emitBodyAsBuiltinCall() {
     if (LocalCK == ClassB || (T.isHalf() && !T.isScalarForMangling())) {
       CastToType.makeInteger(8, true);
       Arg = "__builtin_bit_cast(" + CastToType.str() + ", " + Arg + ")";
+    }
+    else if ((T.isPoly() ||
+             (T.isInteger() && !T.isSigned() &&
+              StringRef(Name).contains("_p8")) ||
+             StringRef(Name).contains("_p16") ||
+             StringRef(Name).contains("_p64"))) {
+      CastToType.makeSigned();
+      Arg = "__builtin_bit_cast(" + CastToType.str() + ", " + Arg + ")";
+    }
     } else if (LocalCK == ClassI) {
       if (CastToType.isInteger()) {
         CastToType.makeSigned();
         Arg = "__builtin_bit_cast(" + CastToType.str() + ", " + Arg + ")";
       }
-    }
     }
 
     S += Arg + ", ";
diff --git a/llvm/test/CodeGen/AArch64/neon-bitcast-poly.ll b/llvm/test/CodeGen/AArch64/neon-bitcast-poly.ll
new file mode 100644
index 0000000000000..b577eb1e34b09
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-bitcast-poly.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
+
+; This test verifies that NEON intrinsics using polynomial types (poly8/16/64) emit correct AArch64 instructions
+; after bitcasting to signed integer vectors. These intrinsics would previously fail under -fno-lax-vector-conversions.
+
+define <8 x i8> @_Z18test_vcopy_lane_p811__Poly8x8_tS_(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: _Z18test_vcopy_lane_p811__Poly8x8_tS_:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    mov v0.b[0], v1.b[0]
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+entry:
+  %vset_lane = shufflevector <8 x i8> %b, <8 x i8> %a, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ret <8 x i8> %vset_lane
+}
+
+define <4 x i16> @_Z18test_vset_lane_p16t12__Poly16x4_t(i16 %val, <4 x i16> %vec) {
+; CHECK-LABEL: _Z18test_vset_lane_p16t12__Poly16x4_t:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov v0.h[0], w0
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT:    ret
+entry:
+  %vset_lane = insertelement <4 x i16> %vec, i16 %val, i64 0
+  ret <4 x i16> %vset_lane
+}
+
+define i64 @_Z18test_vget_lane_p6412__Poly64x1_t(<1 x i64> %vec){
+; CHECK-LABEL: _Z18test_vget_lane_p6412__Poly64x1_t:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    fmov x0, d0
+; CHECK-NEXT:    ret
+entry:
+  %vget_lane = extractelement <1 x i64> %vec, i64 0
+  ret i64 %vget_lane
+}
+
+define <16 x i8> @_Z18test_vsetq_lane_p8h12__Poly8x16_t(i8 %val, <16 x i8> %vec){
+; CHECK-LABEL: _Z18test_vsetq_lane_p8h12__Poly8x16_t:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov v0.b[0], w0
+; CHECK-NEXT:    ret
+entry:
+  %vset_lane = insertelement <16 x i8> %vec, i8 %val, i64 0
+  ret <16 x i8> %vset_lane
+}

>From c300ab6ced97df16728fac0a07c94e38792a2047 Mon Sep 17 00:00:00 2001
From: Amina Chabane <amina.chab...@arm.com>
Date: Wed, 16 Jul 2025 13:53:30 +0000
Subject: [PATCH 2/6] Added isVector() condition to avoid scalar constants.

---
 clang/utils/TableGen/NeonEmitter.cpp | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index 574a29d0e4dd9..d3dd1c5589920 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -1401,22 +1401,19 @@ void Intrinsic::emitBodyAsBuiltinCall() {
     if (LocalCK == ClassB || (T.isHalf() && !T.isScalarForMangling())) {
       CastToType.makeInteger(8, true);
       Arg = "__builtin_bit_cast(" + CastToType.str() + ", " + Arg + ")";
+    } else if ((T.isPoly() || (T.isVector() && T.isInteger() && !T.isSigned() &&
+               (StringRef(Name).contains("_p8") ||
+                StringRef(Name).contains("_p16") ||
+                StringRef(Name).contains("_p64"))))) {
+      CastToType.makeSigned();
+      Arg = "__builtin_bit_cast(" + CastToType.str() + ", " + Arg + ")";
+    } else if (LocalCK == ClassI && CastToType.isInteger()) {
+      CastToType.makeSigned();
+      Arg = "__builtin_bit_cast(" + CastToType.str() + ", " + Arg + ")";
     }
-    else if ((T.isPoly() ||
-             (T.isInteger() && !T.isSigned() &&
-              StringRef(Name).contains("_p8")) ||
-             StringRef(Name).contains("_p16") ||
-             StringRef(Name).contains("_p64"))) {
-      CastToType.makeSigned();
-      Arg = "__builtin_bit_cast(" + CastToType.str() + ", " + Arg + ")";
-    }
-    } else if (LocalCK == ClassI) {
-      if (CastToType.isInteger()) {
-        CastToType.makeSigned();
-        Arg = "__builtin_bit_cast(" + CastToType.str() + ", " + Arg + ")";
-      }
     }
+
 
     S += Arg + ", ";
 }

>From 102ca6f20dac9e2c5a458ee5e637e517f242c949 Mon Sep 17 00:00:00 2001
From: Amina Chabane <amina.chab...@arm.com>
Date: Thu, 17 Jul 2025 14:42:43 +0000
Subject: [PATCH 3/6] Newline deletion

---
 clang/utils/TableGen/NeonEmitter.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index d3dd1c5589920..1bd8c8b58c396 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -1402,9 +1402,9 @@ void Intrinsic::emitBodyAsBuiltinCall() {
       CastToType.makeInteger(8, true);
       Arg = "__builtin_bit_cast(" + CastToType.str() + ", " + Arg + ")";
     } else if ((T.isPoly() || (T.isVector() && T.isInteger() && !T.isSigned() &&
-               (StringRef(Name).contains("_p8") ||
-                StringRef(Name).contains("_p16") ||
-                StringRef(Name).contains("_p64"))))) {
+                (StringRef(Name).contains("_p8") ||
+                 StringRef(Name).contains("_p16") ||
+                 StringRef(Name).contains("_p64"))))) {
       CastToType.makeSigned();
       Arg = "__builtin_bit_cast(" + CastToType.str() + ", " + Arg + ")";
     } else if (LocalCK == ClassI && CastToType.isInteger()) {
@@ -1412,8 +1412,6 @@ void Intrinsic::emitBodyAsBuiltinCall() {
       Arg = "__builtin_bit_cast(" + CastToType.str() + ", " + Arg + ")";
     }
   }
-
-
   S += Arg + ", ";
 }

>From 7106ac95552f7bb32321cbc7b6d5e9df3eec578b Mon Sep 17 00:00:00 2001
From: Amina Chabane <amina.chab...@arm.com>
Date: Tue, 22 Jul 2025 08:43:47 +0000
Subject: [PATCH 4/6] Code formatting change

---
 clang/utils/TableGen/NeonEmitter.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index 1bd8c8b58c396..da3bbd4303074 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -1401,10 +1401,11 @@ void Intrinsic::emitBodyAsBuiltinCall() {
     if (LocalCK == ClassB || (T.isHalf() && !T.isScalarForMangling())) {
       CastToType.makeInteger(8, true);
       Arg = "__builtin_bit_cast(" + CastToType.str() + ", " + Arg + ")";
-    } else if ((T.isPoly() || (T.isVector() && T.isInteger() && !T.isSigned() &&
-               (StringRef(Name).contains("_p8") ||
-                StringRef(Name).contains("_p16") ||
-                StringRef(Name).contains("_p64"))))) {
+    } else if ((T.isPoly() ||
+               (T.isVector() && T.isInteger() && !T.isSigned() &&
+                (StringRef(Name).contains("_p8") ||
+                 StringRef(Name).contains("_p16") ||
+                 StringRef(Name).contains("_p64"))))) {
       CastToType.makeSigned();
       Arg = "__builtin_bit_cast(" + CastToType.str() + ", " + Arg + ")";
     } else if (LocalCK == ClassI && CastToType.isInteger()) {

>From cf3cd5bcab17b952fc3c8b6d775d29d78e7a2360 Mon Sep 17 00:00:00 2001
From: Amina Chabane <amina.chab...@arm.com>
Date: Fri, 25 Jul 2025 14:23:29 +0000
Subject: [PATCH 5/6] - Added neon-bitcast-poly.c test - Amended check

---
 .../test/CodeGen/AArch64/neon-bitcast-poly.c  | 1065 +++++++++++++++++
 clang/utils/TableGen/NeonEmitter.cpp          |   10 +-
 .../test/CodeGen/AArch64/neon-bitcast-poly.ll |   51 -
 3 files changed, 1067 insertions(+), 59 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/neon-bitcast-poly.c
 delete mode 100644 llvm/test/CodeGen/AArch64/neon-bitcast-poly.ll

diff --git a/clang/test/CodeGen/AArch64/neon-bitcast-poly.c b/clang/test/CodeGen/AArch64/neon-bitcast-poly.c
new file mode 100644
index 0000000000000..2e44e6bb59bab
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/neon-bitcast-poly.c
@@ -0,0 +1,1065 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name CHECK --version 5
+// RUN: %clang -target aarch64-none-linux-gnu -mcpu=generic \
+// RUN:   -fno-lax-vector-conversions -S -emit-llvm -o - %s | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: define dso_local i8 @test_vdupb_lane_p8(
+// CHECK-SAME: <8 x i8> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <8 x i8>, align 8
+// CHECK-NEXT:    [[__RET:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__S0:%.*]] = alloca <8 x i8>, align 8
+// CHECK-NEXT:    [[REF_TMP:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[TMP:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    store <8 x i8> [[A]], ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[A_ADDR]], align 8
+// CHECK-NEXT:    store <8 x i8> [[TMP0]], ptr [[__S0]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[__S0]], align 8
+// CHECK-NEXT:    [[VGET_LANE:%.*]] = extractelement <8 x i8> [[TMP1]], i32 1
+// CHECK-NEXT:    store i8 [[VGET_LANE]], ptr [[REF_TMP]], align 1
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr [[REF_TMP]], align 1
+// CHECK-NEXT:    store i8 [[TMP2]], ptr [[__RET]], align 1
+// CHECK-NEXT:    [[TMP3:%.*]] = load i8, ptr [[__RET]], align 1
+// CHECK-NEXT:    store i8 [[TMP3]], ptr [[TMP]], align 1
+// CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP]], align 1
+// CHECK-NEXT:    ret i8 [[TMP4]]
+//
+poly8_t test_vdupb_lane_p8(poly8x8_t a){
+  return vdupb_lane_p8(a, 1);
+}
+
+// CHECK-LABEL: define dso_local i8 @test_vdupb_laneq_p8(
+// CHECK-SAME: <16 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[A_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-NEXT:    [[__RET:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[__S0:%.*]] = alloca <16 x i8>, align 16
+// CHECK-NEXT:    [[REF_TMP:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    [[TMP:%.*]] = alloca i8, align 1
+// CHECK-NEXT:    store <16 x i8> [[A]], ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[A_ADDR]], align 16
+// CHECK-NEXT:    store <16 x i8> [[TMP0]], ptr [[__S0]],
align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[__S0]], align 16 +// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> [[TMP1]], i32 5 +// CHECK-NEXT: store i8 [[VGETQ_LANE]], ptr [[REF_TMP]], align 1 +// CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[REF_TMP]], align 1 +// CHECK-NEXT: store i8 [[TMP2]], ptr [[__RET]], align 1 +// CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[__RET]], align 1 +// CHECK-NEXT: store i8 [[TMP3]], ptr [[TMP]], align 1 +// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP]], align 1 +// CHECK-NEXT: ret i8 [[TMP4]] +// +poly8_t test_vdupb_laneq_p8(poly8x16_t a) { + return vdupb_laneq_p8(a, 5); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vset_lane_p8( +// CHECK-SAME: i8 noundef [[A:%.*]], <8 x i8> noundef [[V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[__S0:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[__S1:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: store i8 [[A]], ptr [[A_ADDR]], align 1 +// CHECK-NEXT: store <8 x i8> [[V]], ptr [[V_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[__S0]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[V_ADDR]], align 8 +// CHECK-NEXT: store <8 x i8> [[TMP1]], ptr [[__S1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[__S0]], align 1 +// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[TMP3]], i8 [[TMP2]], i32 3 +// CHECK-NEXT: store <8 x i8> [[VSET_LANE]], ptr [[REF_TMP]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, ptr [[REF_TMP]], align 8 +// CHECK-NEXT: store <8 x i8> [[TMP4]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load <8 x i8>, ptr [[__RET]], align 8 +// CHECK-NEXT: store <8 x i8> [[TMP5]], ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i8>, ptr [[TMP]], align 8 +// CHECK-NEXT: ret <8 x i8> [[TMP6]] +// +poly8x8_t test_vset_lane_p8(poly8_t a, poly8x8_t v){ + return vset_lane_p8(a, v, 3); +} + +// CHECK-LABEL: define dso_local <4 x i16> @test_vset_lane_p16( +// CHECK-SAME: i16 noundef [[A:%.*]], <4 x i16> noundef [[V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[__S0:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__S1:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: store i16 [[A]], ptr [[A_ADDR]], align 2 +// CHECK-NEXT: store <4 x i16> [[V]], ptr [[V_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2 +// CHECK-NEXT: store i16 [[TMP0]], ptr [[__S0]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[V_ADDR]], align 8 +// CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[__S1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[__S0]], align 2 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP3]], i16 [[TMP2]], i32 3 +// CHECK-NEXT: store <4 x i16> [[VSET_LANE]], ptr 
[[REF_TMP]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load <4 x i16>, ptr [[REF_TMP]], align 8 +// CHECK-NEXT: store <4 x i16> [[TMP4]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load <4 x i16>, ptr [[__RET]], align 8 +// CHECK-NEXT: store <4 x i16> [[TMP5]], ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[TMP]], align 8 +// CHECK-NEXT: ret <4 x i16> [[TMP6]] +// +poly16x4_t test_vset_lane_p16(poly16_t a, poly16x4_t v){ + return vset_lane_p16(a, v, 3); +} + +// CHECK-LABEL: define dso_local <1 x i64> @test_vset_lane_p64( +// CHECK-SAME: i64 noundef [[A:%.*]], <1 x i64> noundef [[V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[__S0:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store <1 x i64> [[V]], ptr [[V_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store i64 [[TMP0]], ptr [[__S0]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[V_ADDR]], align 8 +// CHECK-NEXT: store <1 x i64> [[TMP1]], ptr [[__S1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[__S0]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP3]], i64 [[TMP2]], i32 0 +// CHECK-NEXT: store <1 x i64> [[VSET_LANE]], ptr [[REF_TMP]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load <1 x i64>, ptr [[REF_TMP]], align 8 +// CHECK-NEXT: store <1 x i64> [[TMP4]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load <1 x i64>, ptr [[__RET]], align 8 +// CHECK-NEXT: store <1 x i64> [[TMP5]], ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load <1 x i64>, ptr [[TMP]], align 8 +// CHECK-NEXT: ret <1 x i64> [[TMP6]] +// +poly64x1_t test_vset_lane_p64(poly64_t a, poly64x1_t v){ + return vset_lane_p64(a, v, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vsetq_lane_p8( +// CHECK-SAME: i8 noundef [[A:%.*]], <16 x i8> noundef [[V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[__S0:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[__S1:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[TMP:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: store i8 [[A]], ptr [[A_ADDR]], align 1 +// CHECK-NEXT: store <16 x i8> [[V]], ptr [[V_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[A_ADDR]], align 1 +// CHECK-NEXT: store i8 [[TMP0]], ptr [[__S0]], align 1 +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[V_ADDR]], align 16 +// CHECK-NEXT: store <16 x i8> [[TMP1]], ptr [[__S1]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[__S0]], align 1 +// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[TMP2]], i32 3 +// CHECK-NEXT: store <16 x i8> [[VSET_LANE]], ptr [[REF_TMP]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr [[REF_TMP]], align 16 +// CHECK-NEXT: 
store <16 x i8> [[TMP4]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__RET]], align 16 +// CHECK-NEXT: store <16 x i8> [[TMP5]], ptr [[TMP]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr [[TMP]], align 16 +// CHECK-NEXT: ret <16 x i8> [[TMP6]] +// +poly8x16_t test_vsetq_lane_p8(poly8_t a, poly8x16_t v){ + return vsetq_lane_p8(a, v, 3); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vsetq_lane_p16( +// CHECK-SAME: i16 noundef [[A:%.*]], <8 x i16> noundef [[V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[__S0:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__S1:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[TMP:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: store i16 [[A]], ptr [[A_ADDR]], align 2 +// CHECK-NEXT: store <8 x i16> [[V]], ptr [[V_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[A_ADDR]], align 2 +// CHECK-NEXT: store i16 [[TMP0]], ptr [[__S0]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[V_ADDR]], align 16 +// CHECK-NEXT: store <8 x i16> [[TMP1]], ptr [[__S1]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[__S0]], align 2 +// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[TMP2]], i32 3 +// CHECK-NEXT: store <8 x i16> [[VSET_LANE]], ptr [[REF_TMP]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[REF_TMP]], align 16 +// CHECK-NEXT: store <8 x i16> [[TMP4]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[__RET]], align 16 +// CHECK-NEXT: store <8 x i16> [[TMP5]], ptr [[TMP]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[TMP]], align 16 +// CHECK-NEXT: ret <8 x i16> [[TMP6]] +// +poly16x8_t test_vsetq_lane_p16(poly16_t a, poly16x8_t v){ + return vsetq_lane_p16(a, v, 3); +} + +// CHECK-LABEL: define dso_local <2 x i64> @test_vsetq_lane_p64( +// CHECK-SAME: i64 noundef [[A:%.*]], <2 x i64> noundef [[V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[__S0:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[TMP:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store <2 x i64> [[V]], ptr [[V_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store i64 [[TMP0]], ptr [[__S0]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[V_ADDR]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[__S1]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[__S0]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[TMP2]], i32 0 +// CHECK-NEXT: store <2 x i64> [[VSET_LANE]], ptr [[REF_TMP]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, ptr [[REF_TMP]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[TMP5:%.*]] = 
load <2 x i64>, ptr [[__RET]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[TMP]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP]], align 16 +// CHECK-NEXT: ret <2 x i64> [[TMP6]] +// +poly64x2_t test_vsetq_lane_p64(poly64_t a, poly64x2_t v){ + return vsetq_lane_p64(a, v, 0); +} + +// CHECK-LABEL: define dso_local i8 @test_vget_lane_p8( +// CHECK-SAME: <8 x i8> noundef [[V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[__S0:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[TMP:%.*]] = alloca i8, align 1 +// CHECK-NEXT: store <8 x i8> [[V]], ptr [[V_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[V_ADDR]], align 8 +// CHECK-NEXT: store <8 x i8> [[TMP0]], ptr [[__S0]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[__S0]], align 8 +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[TMP1]], i32 2 +// CHECK-NEXT: store i8 [[VGET_LANE]], ptr [[REF_TMP]], align 1 +// CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[REF_TMP]], align 1 +// CHECK-NEXT: store i8 [[TMP2]], ptr [[__RET]], align 1 +// CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[__RET]], align 1 +// CHECK-NEXT: store i8 [[TMP3]], ptr [[TMP]], align 1 +// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP]], align 1 +// CHECK-NEXT: ret i8 [[TMP4]] +// +poly8_t test_vget_lane_p8(poly8x8_t v){ + return vget_lane_p8(v, 2); +} + +// CHECK-LABEL: define dso_local i16 @test_vget_lane_p16( +// CHECK-SAME: <4 x i16> noundef [[V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__S0:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[TMP:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store <4 x i16> [[V]], ptr [[V_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[V_ADDR]], align 8 +// CHECK-NEXT: store <4 x i16> [[TMP0]], ptr [[__S0]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[__S0]], align 8 +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 2 +// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[REF_TMP]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[REF_TMP]], align 2 +// CHECK-NEXT: store i16 [[TMP2]], ptr [[__RET]], align 2 +// CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[__RET]], align 2 +// CHECK-NEXT: store i16 [[TMP3]], ptr [[TMP]], align 2 +// CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[TMP]], align 2 +// CHECK-NEXT: ret i16 [[TMP4]] +// +poly16_t test_vget_lane_p16(poly16x4_t v){ + return vget_lane_p16(v, 2); +} + +// CHECK-LABEL: define dso_local i64 @test_vget_lane_p64( +// CHECK-SAME: <1 x i64> noundef [[V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[__S0:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca i64, align 8 +// CHECK-NEXT: store <1 x i64> [[V]], ptr [[V_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[V_ADDR]], align 8 +// CHECK-NEXT: store <1 x i64> [[TMP0]], ptr [[__S0]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[__S0]], align 8 +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], 
i32 0 +// CHECK-NEXT: store i64 [[VGET_LANE]], ptr [[REF_TMP]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[REF_TMP]], align 8 +// CHECK-NEXT: store i64 [[TMP2]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[__RET]], align 8 +// CHECK-NEXT: store i64 [[TMP3]], ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP]], align 8 +// CHECK-NEXT: ret i64 [[TMP4]] +// +poly64_t test_vget_lane_p64(poly64x1_t v){ + return vget_lane_p64(v, 0); +} + +// CHECK-LABEL: define dso_local i8 @test_vgetq_lane_p8( +// CHECK-SAME: <16 x i8> noundef [[V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[__S0:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[TMP:%.*]] = alloca i8, align 1 +// CHECK-NEXT: store <16 x i8> [[V]], ptr [[V_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[V_ADDR]], align 16 +// CHECK-NEXT: store <16 x i8> [[TMP0]], ptr [[__S0]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[__S0]], align 16 +// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> [[TMP1]], i32 2 +// CHECK-NEXT: store i8 [[VGETQ_LANE]], ptr [[REF_TMP]], align 1 +// CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[REF_TMP]], align 1 +// CHECK-NEXT: store i8 [[TMP2]], ptr [[__RET]], align 1 +// CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[__RET]], align 1 +// CHECK-NEXT: store i8 [[TMP3]], ptr [[TMP]], align 1 +// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP]], align 1 +// CHECK-NEXT: ret i8 [[TMP4]] +// +poly8_t test_vgetq_lane_p8(poly8x16_t v){ + return vgetq_lane_p8(v, 2); +} + +// CHECK-LABEL: define dso_local i16 @test_vgetq_lane_p16( +// CHECK-SAME: <8 x i16> noundef [[V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__S0:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[TMP:%.*]] = alloca i16, align 2 +// CHECK-NEXT: store <8 x i16> [[V]], ptr [[V_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[V_ADDR]], align 16 +// CHECK-NEXT: store <8 x i16> [[TMP0]], ptr [[__S0]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[__S0]], align 16 +// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 +// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[REF_TMP]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[REF_TMP]], align 2 +// CHECK-NEXT: store i16 [[TMP2]], ptr [[__RET]], align 2 +// CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[__RET]], align 2 +// CHECK-NEXT: store i16 [[TMP3]], ptr [[TMP]], align 2 +// CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[TMP]], align 2 +// CHECK-NEXT: ret i16 [[TMP4]] +// +poly16_t test_vgetq_lane_p16(poly16x8_t v){ + return vgetq_lane_p16(v, 2); +} + +// CHECK-LABEL: define dso_local i64 @test_vgetq_lane_p64( +// CHECK-SAME: <2 x i64> noundef [[V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[__S0:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca i64, align 8 +// CHECK-NEXT: store <2 x i64> [[V]], ptr [[V_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr 
[[V_ADDR]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[__S0]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[__S0]], align 16 +// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +// CHECK-NEXT: store i64 [[VGETQ_LANE]], ptr [[REF_TMP]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[REF_TMP]], align 8 +// CHECK-NEXT: store i64 [[TMP2]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[__RET]], align 8 +// CHECK-NEXT: store i64 [[TMP3]], ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP]], align 8 +// CHECK-NEXT: ret i64 [[TMP4]] +// +poly64_t test_vgetq_lane_p64(poly64x2_t v){ + return vgetq_lane_p64(v, 0); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vcopy_lane_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[__RET_306:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[__S0_306:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[__S2_306:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[__S0:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[__RET1:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[__S02:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[TMP:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[__S1:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[REF_TMP3:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[CHECKTMP4:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[CHECKTMP5:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: store <8 x i8> [[A]], ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store <8 x i8> [[B]], ptr [[B_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store <8 x i8> [[TMP0]], ptr [[__S0_306]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[B_ADDR]], align 8 +// CHECK-NEXT: store <8 x i8> [[TMP1]], ptr [[__S2_306]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[__S2_306]], align 8 +// CHECK-NEXT: store <8 x i8> [[TMP2]], ptr [[__S02]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[__S02]], align 8 +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[TMP3]], i32 0 +// CHECK-NEXT: store i8 [[VGET_LANE]], ptr [[REF_TMP]], align 1 +// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[REF_TMP]], align 1 +// CHECK-NEXT: store i8 [[TMP4]], ptr [[__RET1]], align 1 +// CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[__RET1]], align 1 +// CHECK-NEXT: store i8 [[TMP5]], ptr [[TMP]], align 1 +// CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP]], align 1 +// CHECK-NEXT: store i8 [[TMP6]], ptr [[__S0]], align 1 +// CHECK-NEXT: [[TMP7:%.*]] = load <8 x i8>, ptr [[__S0_306]], align 8 +// CHECK-NEXT: store <8 x i8> [[TMP7]], ptr [[__S1]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[__S0]], align 1 +// CHECK-NEXT: [[TMP9:%.*]] = load <8 x i8>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[TMP8]], i32 0 +// CHECK-NEXT: store <8 x i8> [[VSET_LANE]], ptr [[REF_TMP3]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i8>, ptr [[REF_TMP3]], align 8 +// CHECK-NEXT: store <8 x i8> [[TMP10]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = load <8 x i8>, ptr [[__RET]], align 8 +// CHECK-NEXT: store <8 x i8> [[TMP11]], ptr 
[[CHECKTMP4]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i8>, ptr [[CHECKTMP4]], align 8 +// CHECK-NEXT: store <8 x i8> [[TMP12]], ptr [[__RET_306]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load <8 x i8>, ptr [[__RET_306]], align 8 +// CHECK-NEXT: store <8 x i8> [[TMP13]], ptr [[CHECKTMP5]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i8>, ptr [[CHECKTMP5]], align 8 +// CHECK-NEXT: ret <8 x i8> [[TMP14]] +// +poly8x8_t test_vcopy_lane_p8(poly8x8_t a, poly8x8_t b) { + return vcopy_lane_p8(a, 0, b, 0); +} + +// CHECK-LABEL: define dso_local <4 x i16> @test_vcopy_lane_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[__RET_308:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[__S0_308:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[__S2_308:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[__S0:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__RET1:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__S02:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[TMP:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__S1:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[REF_TMP3:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[CHECKTMP4:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[CHECKTMP5:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: store <4 x i16> [[A]], ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store <4 x i16> [[B]], ptr [[B_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store <4 x i16> [[TMP0]], ptr [[__S0_308]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[B_ADDR]], align 8 +// CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[__S2_308]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__S2_308]], align 8 +// CHECK-NEXT: store <4 x i16> [[TMP2]], ptr [[__S02]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[__S02]], align 8 +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP3]], i32 0 +// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[REF_TMP]], align 2 +// CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[REF_TMP]], align 2 +// CHECK-NEXT: store i16 [[TMP4]], ptr [[__RET1]], align 2 +// CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[__RET1]], align 2 +// CHECK-NEXT: store i16 [[TMP5]], ptr [[TMP]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[TMP]], align 2 +// CHECK-NEXT: store i16 [[TMP6]], ptr [[__S0]], align 2 +// CHECK-NEXT: [[TMP7:%.*]] = load <4 x i16>, ptr [[__S0_308]], align 8 +// CHECK-NEXT: store <4 x i16> [[TMP7]], ptr [[__S1]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr [[__S0]], align 2 +// CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP9]], i16 [[TMP8]], i32 0 +// CHECK-NEXT: store <4 x i16> [[VSET_LANE]], ptr [[REF_TMP3]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load <4 x i16>, ptr [[REF_TMP3]], align 8 +// CHECK-NEXT: store <4 x i16> [[TMP10]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = load <4 x i16>, ptr [[__RET]], align 8 +// CHECK-NEXT: store <4 x i16> [[TMP11]], ptr [[CHECKTMP4]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = load <4 x i16>, ptr [[CHECKTMP4]], align 8 +// CHECK-NEXT: store <4 x i16> [[TMP12]], ptr [[__RET_308]], align 
8 +// CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, ptr [[__RET_308]], align 8 +// CHECK-NEXT: store <4 x i16> [[TMP13]], ptr [[CHECKTMP5]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = load <4 x i16>, ptr [[CHECKTMP5]], align 8 +// CHECK-NEXT: ret <4 x i16> [[TMP14]] +// +poly16x4_t test_vcopy_lane_p16(poly16x4_t a, poly16x4_t b) { + return vcopy_lane_p16(a, 0, b, 0); +} + +// CHECK-LABEL: define dso_local <1 x i64> @test_vcopy_lane_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[__RET_913:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[__S0_913:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[__S2_913:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[__S0:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[__RET1:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[__S02:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[REF_TMP3:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[CHECKTMP4:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[CHECKTMP5:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: store <1 x i64> [[A]], ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store <1 x i64> [[B]], ptr [[B_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store <1 x i64> [[TMP0]], ptr [[__S0_913]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[B_ADDR]], align 8 +// CHECK-NEXT: store <1 x i64> [[TMP1]], ptr [[__S2_913]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[__S2_913]], align 8 +// CHECK-NEXT: store <1 x i64> [[TMP2]], ptr [[__S02]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[__S02]], align 8 +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP3]], i32 0 +// CHECK-NEXT: store i64 [[VGET_LANE]], ptr [[REF_TMP]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[REF_TMP]], align 8 +// CHECK-NEXT: store i64 [[TMP4]], ptr [[__RET1]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[__RET1]], align 8 +// CHECK-NEXT: store i64 [[TMP5]], ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP]], align 8 +// CHECK-NEXT: store i64 [[TMP6]], ptr [[__S0]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load <1 x i64>, ptr [[__S0_913]], align 8 +// CHECK-NEXT: store <1 x i64> [[TMP7]], ptr [[__S1]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[__S0]], align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP9]], i64 [[TMP8]], i32 0 +// CHECK-NEXT: store <1 x i64> [[VSET_LANE]], ptr [[REF_TMP3]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr [[REF_TMP3]], align 8 +// CHECK-NEXT: store <1 x i64> [[TMP10]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = load <1 x i64>, ptr [[__RET]], align 8 +// CHECK-NEXT: store <1 x i64> [[TMP11]], ptr [[CHECKTMP4]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = load <1 x i64>, ptr [[CHECKTMP4]], align 8 +// CHECK-NEXT: store <1 x i64> [[TMP12]], ptr [[__RET_913]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load <1 x i64>, ptr [[__RET_913]], align 8 +// CHECK-NEXT: store <1 x i64> [[TMP13]], ptr [[CHECKTMP5]], align 8 +// 
CHECK-NEXT: [[TMP14:%.*]] = load <1 x i64>, ptr [[CHECKTMP5]], align 8 +// CHECK-NEXT: ret <1 x i64> [[TMP14]] +// +poly64x1_t test_vcopy_lane_p64(poly64x1_t a, poly64x1_t b) { + return vcopy_lane_p64(a, 0, b, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vcopyq_lane_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[__RET_282:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[__S0_282:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[__S2_282:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[__S0:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[__RET1:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[__S02:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[TMP:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[__S1:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[REF_TMP3:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[CHECKTMP4:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[CHECKTMP5:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: store <16 x i8> [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: store <8 x i8> [[B]], ptr [[B_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: store <16 x i8> [[TMP0]], ptr [[__S0_282]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[B_ADDR]], align 8 +// CHECK-NEXT: store <8 x i8> [[TMP1]], ptr [[__S2_282]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, ptr [[__S2_282]], align 8 +// CHECK-NEXT: store <8 x i8> [[TMP2]], ptr [[__S02]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, ptr [[__S02]], align 8 +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[TMP3]], i32 0 +// CHECK-NEXT: store i8 [[VGET_LANE]], ptr [[REF_TMP]], align 1 +// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[REF_TMP]], align 1 +// CHECK-NEXT: store i8 [[TMP4]], ptr [[__RET1]], align 1 +// CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[__RET1]], align 1 +// CHECK-NEXT: store i8 [[TMP5]], ptr [[TMP]], align 1 +// CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP]], align 1 +// CHECK-NEXT: store i8 [[TMP6]], ptr [[__S0]], align 1 +// CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__S0_282]], align 16 +// CHECK-NEXT: store <16 x i8> [[TMP7]], ptr [[__S1]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[__S0]], align 1 +// CHECK-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[TMP9]], i8 [[TMP8]], i32 0 +// CHECK-NEXT: store <16 x i8> [[VSET_LANE]], ptr [[REF_TMP3]], align 16 +// CHECK-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[REF_TMP3]], align 16 +// CHECK-NEXT: store <16 x i8> [[TMP10]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr [[__RET]], align 16 +// CHECK-NEXT: store <16 x i8> [[TMP11]], ptr [[CHECKTMP4]], align 16 +// CHECK-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[CHECKTMP4]], align 16 +// CHECK-NEXT: store <16 x i8> [[TMP12]], ptr [[__RET_282]], align 16 +// CHECK-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr [[__RET_282]], align 16 +// CHECK-NEXT: store <16 x i8> [[TMP13]], ptr [[CHECKTMP5]], align 16 +// CHECK-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr [[CHECKTMP5]], align 16 +// CHECK-NEXT: ret <16 x i8> [[TMP14]] +// +poly8x16_t 
test_vcopyq_lane_p8(poly8x16_t a, poly8x8_t b){ + return vcopyq_lane_p8(a, 0, b, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vcopyq_lane_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[__RET_284:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[__S0_284:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[__S2_284:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[__S0:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__RET1:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__S02:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[TMP:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__S1:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[REF_TMP3:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[CHECKTMP4:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[CHECKTMP5:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: store <8 x i16> [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: store <4 x i16> [[B]], ptr [[B_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: store <8 x i16> [[TMP0]], ptr [[__S0_284]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[B_ADDR]], align 8 +// CHECK-NEXT: store <4 x i16> [[TMP1]], ptr [[__S2_284]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[__S2_284]], align 8 +// CHECK-NEXT: store <4 x i16> [[TMP2]], ptr [[__S02]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[__S02]], align 8 +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP3]], i32 0 +// CHECK-NEXT: store i16 [[VGET_LANE]], ptr [[REF_TMP]], align 2 +// CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[REF_TMP]], align 2 +// CHECK-NEXT: store i16 [[TMP4]], ptr [[__RET1]], align 2 +// CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[__RET1]], align 2 +// CHECK-NEXT: store i16 [[TMP5]], ptr [[TMP]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[TMP]], align 2 +// CHECK-NEXT: store i16 [[TMP6]], ptr [[__S0]], align 2 +// CHECK-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr [[__S0_284]], align 16 +// CHECK-NEXT: store <8 x i16> [[TMP7]], ptr [[__S1]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr [[__S0]], align 2 +// CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP9]], i16 [[TMP8]], i32 0 +// CHECK-NEXT: store <8 x i16> [[VSET_LANE]], ptr [[REF_TMP3]], align 16 +// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr [[REF_TMP3]], align 16 +// CHECK-NEXT: store <8 x i16> [[TMP10]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[TMP11:%.*]] = load <8 x i16>, ptr [[__RET]], align 16 +// CHECK-NEXT: store <8 x i16> [[TMP11]], ptr [[CHECKTMP4]], align 16 +// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i16>, ptr [[CHECKTMP4]], align 16 +// CHECK-NEXT: store <8 x i16> [[TMP12]], ptr [[__RET_284]], align 16 +// CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr [[__RET_284]], align 16 +// CHECK-NEXT: store <8 x i16> [[TMP13]], ptr [[CHECKTMP5]], align 16 +// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr [[CHECKTMP5]], align 16 +// CHECK-NEXT: ret <8 x i16> [[TMP14]] +// +poly16x8_t test_vcopyq_lane_p16(poly16x8_t a, poly16x4_t b){ + return vcopyq_lane_p16(a, 0, b, 0); +} + +// CHECK-LABEL: define dso_local <2 x 
i64> @test_vcopyq_lane_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[__RET_909:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[__S0_909:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[__S2_909:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[__RET:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[__S0:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[__RET1:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[__S02:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[REF_TMP3:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[CHECKTMP4:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[CHECKTMP5:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: store <2 x i64> [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: store <1 x i64> [[B]], ptr [[B_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[__S0_909]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr [[B_ADDR]], align 8 +// CHECK-NEXT: store <1 x i64> [[TMP1]], ptr [[__S2_909]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load <1 x i64>, ptr [[__S2_909]], align 8 +// CHECK-NEXT: store <1 x i64> [[TMP2]], ptr [[__S02]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load <1 x i64>, ptr [[__S02]], align 8 +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP3]], i32 0 +// CHECK-NEXT: store i64 [[VGET_LANE]], ptr [[REF_TMP]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[REF_TMP]], align 8 +// CHECK-NEXT: store i64 [[TMP4]], ptr [[__RET1]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[__RET1]], align 8 +// CHECK-NEXT: store i64 [[TMP5]], ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP]], align 8 +// CHECK-NEXT: store i64 [[TMP6]], ptr [[__S0]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[__S0_909]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP7]], ptr [[__S1]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[__S0]], align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 0 +// CHECK-NEXT: store <2 x i64> [[VSET_LANE]], ptr [[REF_TMP3]], align 16 +// CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr [[REF_TMP3]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP10]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[TMP11:%.*]] = load <2 x i64>, ptr [[__RET]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP11]], ptr [[CHECKTMP4]], align 16 +// CHECK-NEXT: [[TMP12:%.*]] = load <2 x i64>, ptr [[CHECKTMP4]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP12]], ptr [[__RET_909]], align 16 +// CHECK-NEXT: [[TMP13:%.*]] = load <2 x i64>, ptr [[__RET_909]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP13]], ptr [[CHECKTMP5]], align 16 +// CHECK-NEXT: [[TMP14:%.*]] = load <2 x i64>, ptr [[CHECKTMP5]], align 16 +// CHECK-NEXT: ret <2 x i64> [[TMP14]] +// +poly64x2_t test_vcopyq_lane_p64(poly64x2_t a, poly64x1_t b){ + return vcopyq_lane_p64(a, 0, b, 0); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vcopy_laneq_p8( +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[__RET_352:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[__S0_352:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[__S2_352:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[__S0:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[__RET1:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[__S02:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[TMP:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[__S1:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[REF_TMP3:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[CHECKTMP4:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: [[CHECKTMP5:%.*]] = alloca <8 x i8>, align 8 +// CHECK-NEXT: store <8 x i8> [[A]], ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store <16 x i8> [[B]], ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store <8 x i8> [[TMP0]], ptr [[__S0_352]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: store <16 x i8> [[TMP1]], ptr [[__S2_352]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[__S2_352]], align 16 +// CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[__S02]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[__S02]], align 16 +// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> [[TMP3]], i32 0 +// CHECK-NEXT: store i8 [[VGETQ_LANE]], ptr [[REF_TMP]], align 1 +// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[REF_TMP]], align 1 +// CHECK-NEXT: store i8 [[TMP4]], ptr [[__RET1]], align 1 +// CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[__RET1]], align 1 +// CHECK-NEXT: store i8 [[TMP5]], ptr [[TMP]], align 1 +// CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP]], align 1 +// CHECK-NEXT: store i8 [[TMP6]], ptr [[__S0]], align 1 +// CHECK-NEXT: [[TMP7:%.*]] = load <8 x i8>, ptr [[__S0_352]], align 8 +// CHECK-NEXT: store <8 x i8> [[TMP7]], ptr [[__S1]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[__S0]], align 1 +// CHECK-NEXT: [[TMP9:%.*]] = load <8 x i8>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[TMP8]], i32 0 +// CHECK-NEXT: store <8 x i8> [[VSET_LANE]], ptr [[REF_TMP3]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i8>, ptr [[REF_TMP3]], align 8 +// CHECK-NEXT: store <8 x i8> [[TMP10]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = load <8 x i8>, ptr [[__RET]], align 8 +// CHECK-NEXT: store <8 x i8> [[TMP11]], ptr [[CHECKTMP4]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i8>, ptr [[CHECKTMP4]], align 8 +// CHECK-NEXT: store <8 x i8> [[TMP12]], ptr [[__RET_352]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load <8 x i8>, ptr [[__RET_352]], align 8 +// CHECK-NEXT: store <8 x i8> [[TMP13]], ptr [[CHECKTMP5]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i8>, ptr [[CHECKTMP5]], align 8 +// CHECK-NEXT: ret <8 x i8> [[TMP14]] +// +poly8x8_t test_vcopy_laneq_p8(poly8x8_t a, poly8x16_t b){ + return vcopy_laneq_p8(a, 0, b, 0); +} + +// CHECK-LABEL: define dso_local <4 x i16> @test_vcopy_laneq_p16( +// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[__RET_354:%.*]] = alloca <4 x 
i16>, align 8 +// CHECK-NEXT: [[__S0_354:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[__S2_354:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[__S0:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__RET1:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__S02:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[TMP:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__S1:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[REF_TMP3:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[CHECKTMP4:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: [[CHECKTMP5:%.*]] = alloca <4 x i16>, align 8 +// CHECK-NEXT: store <4 x i16> [[A]], ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store <8 x i16> [[B]], ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store <4 x i16> [[TMP0]], ptr [[__S0_354]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: store <8 x i16> [[TMP1]], ptr [[__S2_354]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__S2_354]], align 16 +// CHECK-NEXT: store <8 x i16> [[TMP2]], ptr [[__S02]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[__S02]], align 16 +// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[REF_TMP]], align 2 +// CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[REF_TMP]], align 2 +// CHECK-NEXT: store i16 [[TMP4]], ptr [[__RET1]], align 2 +// CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[__RET1]], align 2 +// CHECK-NEXT: store i16 [[TMP5]], ptr [[TMP]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[TMP]], align 2 +// CHECK-NEXT: store i16 [[TMP6]], ptr [[__S0]], align 2 +// CHECK-NEXT: [[TMP7:%.*]] = load <4 x i16>, ptr [[__S0_354]], align 8 +// CHECK-NEXT: store <4 x i16> [[TMP7]], ptr [[__S1]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr [[__S0]], align 2 +// CHECK-NEXT: [[TMP9:%.*]] = load <4 x i16>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP9]], i16 [[TMP8]], i32 0 +// CHECK-NEXT: store <4 x i16> [[VSET_LANE]], ptr [[REF_TMP3]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load <4 x i16>, ptr [[REF_TMP3]], align 8 +// CHECK-NEXT: store <4 x i16> [[TMP10]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = load <4 x i16>, ptr [[__RET]], align 8 +// CHECK-NEXT: store <4 x i16> [[TMP11]], ptr [[CHECKTMP4]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = load <4 x i16>, ptr [[CHECKTMP4]], align 8 +// CHECK-NEXT: store <4 x i16> [[TMP12]], ptr [[__RET_354]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load <4 x i16>, ptr [[__RET_354]], align 8 +// CHECK-NEXT: store <4 x i16> [[TMP13]], ptr [[CHECKTMP5]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = load <4 x i16>, ptr [[CHECKTMP5]], align 8 +// CHECK-NEXT: ret <4 x i16> [[TMP14]] +// +poly16x4_t test_vcopy_laneq_p16(poly16x4_t a, poly16x8_t b){ + return vcopy_laneq_p16(a, 0, b, 0); +} + +// CHECK-LABEL: define dso_local <1 x i64> @test_vcopy_laneq_p64( +// CHECK-SAME: <1 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[__RET_919:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[__S0_919:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[__S2_919:%.*]] = alloca <2 x i64>, align 16 +// 
CHECK-NEXT: [[__RET:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[__S0:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[__RET1:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[__S02:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[REF_TMP3:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[CHECKTMP4:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: [[CHECKTMP5:%.*]] = alloca <1 x i64>, align 8 +// CHECK-NEXT: store <1 x i64> [[A]], ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store <2 x i64> [[B]], ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[A_ADDR]], align 8 +// CHECK-NEXT: store <1 x i64> [[TMP0]], ptr [[__S0_919]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[__S2_919]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[__S2_919]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP2]], ptr [[__S02]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[__S02]], align 16 +// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +// CHECK-NEXT: store i64 [[VGETQ_LANE]], ptr [[REF_TMP]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[REF_TMP]], align 8 +// CHECK-NEXT: store i64 [[TMP4]], ptr [[__RET1]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[__RET1]], align 8 +// CHECK-NEXT: store i64 [[TMP5]], ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP]], align 8 +// CHECK-NEXT: store i64 [[TMP6]], ptr [[__S0]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load <1 x i64>, ptr [[__S0_919]], align 8 +// CHECK-NEXT: store <1 x i64> [[TMP7]], ptr [[__S1]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[__S0]], align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load <1 x i64>, ptr [[__S1]], align 8 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP9]], i64 [[TMP8]], i32 0 +// CHECK-NEXT: store <1 x i64> [[VSET_LANE]], ptr [[REF_TMP3]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load <1 x i64>, ptr [[REF_TMP3]], align 8 +// CHECK-NEXT: store <1 x i64> [[TMP10]], ptr [[__RET]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = load <1 x i64>, ptr [[__RET]], align 8 +// CHECK-NEXT: store <1 x i64> [[TMP11]], ptr [[CHECKTMP4]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = load <1 x i64>, ptr [[CHECKTMP4]], align 8 +// CHECK-NEXT: store <1 x i64> [[TMP12]], ptr [[__RET_919]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = load <1 x i64>, ptr [[__RET_919]], align 8 +// CHECK-NEXT: store <1 x i64> [[TMP13]], ptr [[CHECKTMP5]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = load <1 x i64>, ptr [[CHECKTMP5]], align 8 +// CHECK-NEXT: ret <1 x i64> [[TMP14]] +// +poly64x1_t test_vcopy_laneq_p64(poly64x1_t a, poly64x2_t b){ + return vcopy_laneq_p64(a, 0, b, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vcopyq_laneq_p8( +// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[__RET_328:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[__S0_328:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[__S2_328:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[__S0:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[__RET1:%.*]] = 
alloca i8, align 1 +// CHECK-NEXT: [[__S02:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[TMP:%.*]] = alloca i8, align 1 +// CHECK-NEXT: [[__S1:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[REF_TMP3:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[CHECKTMP4:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: [[CHECKTMP5:%.*]] = alloca <16 x i8>, align 16 +// CHECK-NEXT: store <16 x i8> [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: store <16 x i8> [[B]], ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: store <16 x i8> [[TMP0]], ptr [[__S0_328]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: store <16 x i8> [[TMP1]], ptr [[__S2_328]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[__S2_328]], align 16 +// CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[__S02]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[__S02]], align 16 +// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <16 x i8> [[TMP3]], i32 0 +// CHECK-NEXT: store i8 [[VGETQ_LANE]], ptr [[REF_TMP]], align 1 +// CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[REF_TMP]], align 1 +// CHECK-NEXT: store i8 [[TMP4]], ptr [[__RET1]], align 1 +// CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[__RET1]], align 1 +// CHECK-NEXT: store i8 [[TMP5]], ptr [[TMP]], align 1 +// CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[TMP]], align 1 +// CHECK-NEXT: store i8 [[TMP6]], ptr [[__S0]], align 1 +// CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[__S0_328]], align 16 +// CHECK-NEXT: store <16 x i8> [[TMP7]], ptr [[__S1]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[__S0]], align 1 +// CHECK-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[TMP9]], i8 [[TMP8]], i32 0 +// CHECK-NEXT: store <16 x i8> [[VSET_LANE]], ptr [[REF_TMP3]], align 16 +// CHECK-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[REF_TMP3]], align 16 +// CHECK-NEXT: store <16 x i8> [[TMP10]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr [[__RET]], align 16 +// CHECK-NEXT: store <16 x i8> [[TMP11]], ptr [[CHECKTMP4]], align 16 +// CHECK-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr [[CHECKTMP4]], align 16 +// CHECK-NEXT: store <16 x i8> [[TMP12]], ptr [[__RET_328]], align 16 +// CHECK-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr [[__RET_328]], align 16 +// CHECK-NEXT: store <16 x i8> [[TMP13]], ptr [[CHECKTMP5]], align 16 +// CHECK-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr [[CHECKTMP5]], align 16 +// CHECK-NEXT: ret <16 x i8> [[TMP14]] +// +poly8x16_t test_vcopyq_laneq_p8(poly8x16_t a, poly8x16_t b){ + return vcopyq_laneq_p8(a, 0, b, 0); +} + +// CHECK-LABEL: define dso_local <8 x i16> @test_vcopyq_laneq_p16( +// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[__RET_330:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[__S0_330:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[__S2_330:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[__S0:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__RET1:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__S02:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca i16, 
align 2 +// CHECK-NEXT: [[TMP:%.*]] = alloca i16, align 2 +// CHECK-NEXT: [[__S1:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[REF_TMP3:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[CHECKTMP4:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: [[CHECKTMP5:%.*]] = alloca <8 x i16>, align 16 +// CHECK-NEXT: store <8 x i16> [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: store <8 x i16> [[B]], ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: store <8 x i16> [[TMP0]], ptr [[__S0_330]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: store <8 x i16> [[TMP1]], ptr [[__S2_330]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[__S2_330]], align 16 +// CHECK-NEXT: store <8 x i16> [[TMP2]], ptr [[__S02]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[__S02]], align 16 +// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +// CHECK-NEXT: store i16 [[VGETQ_LANE]], ptr [[REF_TMP]], align 2 +// CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[REF_TMP]], align 2 +// CHECK-NEXT: store i16 [[TMP4]], ptr [[__RET1]], align 2 +// CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[__RET1]], align 2 +// CHECK-NEXT: store i16 [[TMP5]], ptr [[TMP]], align 2 +// CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[TMP]], align 2 +// CHECK-NEXT: store i16 [[TMP6]], ptr [[__S0]], align 2 +// CHECK-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr [[__S0_330]], align 16 +// CHECK-NEXT: store <8 x i16> [[TMP7]], ptr [[__S1]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr [[__S0]], align 2 +// CHECK-NEXT: [[TMP9:%.*]] = load <8 x i16>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP9]], i16 [[TMP8]], i32 0 +// CHECK-NEXT: store <8 x i16> [[VSET_LANE]], ptr [[REF_TMP3]], align 16 +// CHECK-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr [[REF_TMP3]], align 16 +// CHECK-NEXT: store <8 x i16> [[TMP10]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[TMP11:%.*]] = load <8 x i16>, ptr [[__RET]], align 16 +// CHECK-NEXT: store <8 x i16> [[TMP11]], ptr [[CHECKTMP4]], align 16 +// CHECK-NEXT: [[TMP12:%.*]] = load <8 x i16>, ptr [[CHECKTMP4]], align 16 +// CHECK-NEXT: store <8 x i16> [[TMP12]], ptr [[__RET_330]], align 16 +// CHECK-NEXT: [[TMP13:%.*]] = load <8 x i16>, ptr [[__RET_330]], align 16 +// CHECK-NEXT: store <8 x i16> [[TMP13]], ptr [[CHECKTMP5]], align 16 +// CHECK-NEXT: [[TMP14:%.*]] = load <8 x i16>, ptr [[CHECKTMP5]], align 16 +// CHECK-NEXT: ret <8 x i16> [[TMP14]] +// +poly16x8_t test_vcopyq_laneq_p16(poly16x8_t a, poly16x8_t b){ + return vcopyq_laneq_p16(a, 0, b, 0); +} + +// CHECK-LABEL: define dso_local <2 x i64> @test_vcopyq_laneq_p64( +// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[__RET_915:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[__S0_915:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[__S2_915:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[__RET:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[__S0:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[__RET1:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[__S02:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca i64, align 8 +// CHECK-NEXT: [[__S1:%.*]] = alloca <2 x i64>, align 
16 +// CHECK-NEXT: [[REF_TMP3:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[CHECKTMP4:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: [[CHECKTMP5:%.*]] = alloca <2 x i64>, align 16 +// CHECK-NEXT: store <2 x i64> [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: store <2 x i64> [[B]], ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[__S0_915]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP1]], ptr [[__S2_915]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[__S2_915]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP2]], ptr [[__S02]], align 16 +// CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[__S02]], align 16 +// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 +// CHECK-NEXT: store i64 [[VGETQ_LANE]], ptr [[REF_TMP]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[REF_TMP]], align 8 +// CHECK-NEXT: store i64 [[TMP4]], ptr [[__RET1]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[__RET1]], align 8 +// CHECK-NEXT: store i64 [[TMP5]], ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP]], align 8 +// CHECK-NEXT: store i64 [[TMP6]], ptr [[__S0]], align 8 +// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[__S0_915]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP7]], ptr [[__S1]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[__S0]], align 8 +// CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr [[__S1]], align 16 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i32 0 +// CHECK-NEXT: store <2 x i64> [[VSET_LANE]], ptr [[REF_TMP3]], align 16 +// CHECK-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr [[REF_TMP3]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP10]], ptr [[__RET]], align 16 +// CHECK-NEXT: [[TMP11:%.*]] = load <2 x i64>, ptr [[__RET]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP11]], ptr [[CHECKTMP4]], align 16 +// CHECK-NEXT: [[TMP12:%.*]] = load <2 x i64>, ptr [[CHECKTMP4]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP12]], ptr [[__RET_915]], align 16 +// CHECK-NEXT: [[TMP13:%.*]] = load <2 x i64>, ptr [[__RET_915]], align 16 +// CHECK-NEXT: store <2 x i64> [[TMP13]], ptr [[CHECKTMP5]], align 16 +// CHECK-NEXT: [[TMP14:%.*]] = load <2 x i64>, ptr [[CHECKTMP5]], align 16 +// CHECK-NEXT: ret <2 x i64> [[TMP14]] +// +poly64x2_t test_vcopyq_laneq_p64(poly64x2_t a, poly64x2_t b){ + return vcopyq_laneq_p64(a, 0, b, 0); +} diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp index da3bbd4303074..946a799a4f6a5 100644 --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -1401,14 +1401,8 @@ void Intrinsic::emitBodyAsBuiltinCall() { if (LocalCK == ClassB || (T.isHalf() && !T.isScalarForMangling())) { CastToType.makeInteger(8, true); Arg = "__builtin_bit_cast(" + CastToType.str() + ", " + Arg + ")"; - } else if ((T.isPoly() || - (T.isVector() && T.isInteger() && !T.isSigned() && - (StringRef(Name).contains("_p8") || - StringRef(Name).contains("_p16") || - StringRef(Name).contains("_p64"))))) { - CastToType.makeSigned(); - Arg = "__builtin_bit_cast(" + CastToType.str() + ", " + Arg + ")"; - } else if (LocalCK == ClassI && CastToType.isInteger()) { + } else if (LocalCK == ClassI && + (CastToType.isInteger() || CastToType.isPoly())) { CastToType.makeSigned(); Arg = "__builtin_bit_cast(" + CastToType.str() + ", " + Arg + ")"; } 
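For reference, the user-visible effect of the ClassI change above can be shown with a minimal reproducer. This is illustrative only: it assumes an AArch64 target and simply mirrors the test_vcopy_laneq_p16 case from the test file above; the file name is hypothetical.

    /* repro.c -- previously rejected under -fno-lax-vector-conversions,
     * because the generated wrapper passed poly vectors directly to a
     * builtin expecting signed integer vectors. */
    #include <arm_neon.h>

    poly16x4_t copy_lane(poly16x4_t a, poly16x8_t b) {
      /* With this patch, the poly operands are first converted via
       * __builtin_bit_cast to int16x4_t / int16x8_t inside arm_neon.h,
       * so no lax vector conversion is required. */
      return vcopy_laneq_p16(a, 0, b, 0);
    }

Compiled with something like clang --target=aarch64-linux-gnu -fno-lax-vector-conversions -c repro.c, this now builds cleanly instead of diagnosing an invalid implicit vector conversion.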
diff --git a/llvm/test/CodeGen/AArch64/neon-bitcast-poly.ll b/llvm/test/CodeGen/AArch64/neon-bitcast-poly.ll deleted file mode 100644 index b577eb1e34b09..0000000000000 --- a/llvm/test/CodeGen/AArch64/neon-bitcast-poly.ll +++ /dev/null @@ -1,51 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s - -; This test verifies that NEON intrinsics using polynomial types (poly8/16/64) emit correct AArch64 instructions -; after bitcasting to signed integer vectors. These intrinsics would previously fail under -fno-lax-vector-conversions. - -define <8 x i8> @_Z18test_vcopy_lane_p811__Poly8x8_tS_(<8 x i8> %a, <8 x i8> %b) { -; CHECK-LABEL: _Z18test_vcopy_lane_p811__Poly8x8_tS_: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-NEXT: mov v0.b[0], v1.b[0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret -entry: - %vset_lane = shufflevector <8 x i8> %b, <8 x i8> %a, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> - ret <8 x i8> %vset_lane -} - -define <4 x i16> @_Z18test_vset_lane_p16t12__Poly16x4_t(i16 %val, <4 x i16> %vec) { -; CHECK-LABEL: _Z18test_vset_lane_p16t12__Poly16x4_t: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov v0.h[0], w0 -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-NEXT: ret -entry: - %vset_lane = insertelement <4 x i16> %vec, i16 %val, i64 0 - ret <4 x i16> %vset_lane -} - -define i64 @_Z18test_vget_lane_p6412__Poly64x1_t(<1 x i64> %vec){ -; CHECK-LABEL: _Z18test_vget_lane_p6412__Poly64x1_t: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: fmov x0, d0 -; CHECK-NEXT: ret -entry: - %vget_lane = extractelement <1 x i64> %vec, i64 0 - ret i64 %vget_lane -} - -define <16 x i8> @_Z18test_vsetq_lane_p8h12__Poly8x16_t(i8 %val, <16 x i8> %vec){ -; CHECK-LABEL: _Z18test_vsetq_lane_p8h12__Poly8x16_t: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov v0.b[0], w0 -; CHECK-NEXT: ret -entry: - %vset_lane = insertelement <16 x i8> %vec, i8 %val, i64 0 - ret <16 x i8> %vset_lane -} >From bed908b11ab920f99853310f6563d7a86b0da6f3 Mon Sep 17 00:00:00 2001 From: Amina Chabane <amina.chab...@arm.com> Date: Fri, 25 Jul 2025 15:52:33 +0000 Subject: [PATCH 6/6] Code formatting --- clang/utils/TableGen/NeonEmitter.cpp | 162 +++++++++++++++++---------- 1 file changed, 104 insertions(+), 58 deletions(-) diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp index 946a799a4f6a5..97b3e1a6c7566 100644 --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -157,14 +157,14 @@ class Type { public: Type() - : Kind(Void), Immediate(false), Constant(false), - Pointer(false), ScalarForMangling(false), NoManglingQ(false), - Bitwidth(0), ElementBitwidth(0), NumVectors(0) {} + : Kind(Void), Immediate(false), Constant(false), Pointer(false), + ScalarForMangling(false), NoManglingQ(false), Bitwidth(0), + ElementBitwidth(0), NumVectors(0) {} Type(TypeSpec TS, StringRef CharMods) - : TS(std::move(TS)), Kind(Void), Immediate(false), - Constant(false), Pointer(false), ScalarForMangling(false), - NoManglingQ(false), Bitwidth(0), ElementBitwidth(0), NumVectors(0) { + : TS(std::move(TS)), Kind(Void), Immediate(false), Constant(false), + Pointer(false), ScalarForMangling(false), NoManglingQ(false), + 
Bitwidth(0), ElementBitwidth(0), NumVectors(0) { applyModifiers(CharMods); } @@ -361,7 +361,7 @@ class Intrinsic { if (BigEndianSafe) return true; - for (const auto &T : Types){ + for (const auto &T : Types) { if (T.isVector() && T.getNumElements() > 1) return false; } @@ -393,9 +393,9 @@ class Intrinsic { } for (const auto &Type : Types) { - // If this builtin takes an immediate argument, we need to #define it rather - // than use a standard declaration, so that SemaChecking can range check - // the immediate passed by the user. + // If this builtin takes an immediate argument, we need to #define it + // rather than use a standard declaration, so that SemaChecking can range + // check the immediate passed by the user. // Pointer arguments need to use macros to avoid hiding aligned attributes // from the pointer type. @@ -527,8 +527,8 @@ class Intrinsic { void emitBodyAsBuiltinCall(); - void generateImpl(bool ReverseArguments, - StringRef NamePrefix, StringRef CallPrefix); + void generateImpl(bool ReverseArguments, StringRef NamePrefix, + StringRef CallPrefix); void emitReturn(); void emitBody(StringRef CallPrefix); void emitShadowedArgs(); @@ -546,9 +546,8 @@ class Intrinsic { StringRef CallPrefix; public: - DagEmitter(Intrinsic &Intr, StringRef CallPrefix) : - Intr(Intr), CallPrefix(CallPrefix) { - } + DagEmitter(Intrinsic &Intr, StringRef CallPrefix) + : Intr(Intr), CallPrefix(CallPrefix) {} std::pair<Type, std::string> emitDagArg(const Init *Arg, std::string ArgName); std::pair<Type, std::string> emitDagSaveTemp(const DagInit *DI); @@ -691,12 +690,23 @@ std::string Type::builtin_str() const { return S; } else if (isInteger()) switch (ElementBitwidth) { - case 8: S += "c"; break; - case 16: S += "s"; break; - case 32: S += "i"; break; - case 64: S += "Wi"; break; - case 128: S += "LLLi"; break; - default: llvm_unreachable("Unhandled case!"); + case 8: + S += "c"; + break; + case 16: + S += "s"; + break; + case 32: + S += "i"; + break; + case 64: + S += "Wi"; + break; + case 128: + S += "LLLi"; + break; + default: + llvm_unreachable("Unhandled case!"); } else if (isBFloat16()) { assert(ElementBitwidth == 16 && "BFloat16 can only be 16 bits"); @@ -708,13 +718,21 @@ std::string Type::builtin_str() const { S += "UWi"; } else switch (ElementBitwidth) { - case 16: S += "h"; break; - case 32: S += "f"; break; - case 64: S += "d"; break; - default: llvm_unreachable("Unhandled case!"); + case 16: + S += "h"; + break; + case 32: + S += "f"; + break; + case 64: + S += "d"; + break; + default: + llvm_unreachable("Unhandled case!"); } - // FIXME: NECESSARY??????????????????????????????????????????????????????????????????????? + // FIXME: + // NECESSARY??????????????????????????????????????????????????????????????????????? if (isChar() && !isPointer() && isSigned()) // Make chars explicitly signed. 
S = "S" + S; @@ -740,12 +758,23 @@ std::string Type::builtin_str() const { unsigned Type::getNeonEnum() const { unsigned Addend; switch (ElementBitwidth) { - case 8: Addend = 0; break; - case 16: Addend = 1; break; - case 32: Addend = 2; break; - case 64: Addend = 3; break; - case 128: Addend = 4; break; - default: llvm_unreachable("Unhandled element bitwidth!"); + case 8: + Addend = 0; + break; + case 16: + Addend = 1; + break; + case 32: + Addend = 2; + break; + case 64: + Addend = 3; + break; + case 128: + Addend = 4; + break; + default: + llvm_unreachable("Unhandled element bitwidth!"); } unsigned Base = (unsigned)NeonTypeFlags::Int8 + Addend; @@ -1156,11 +1185,20 @@ std::string Intrinsic::mangleName(std::string Name, ClassKind LocalCK) const { char Suffix = '\0'; if (BaseType.isScalarForMangling()) { switch (BaseType.getElementSizeInBits()) { - case 8: Suffix = 'b'; break; - case 16: Suffix = 'h'; break; - case 32: Suffix = 's'; break; - case 64: Suffix = 'd'; break; - default: llvm_unreachable("Bad suffix!"); + case 8: + Suffix = 'b'; + break; + case 16: + Suffix = 'h'; + break; + case 32: + Suffix = 's'; + break; + case 64: + Suffix = 'd'; + break; + default: + llvm_unreachable("Bad suffix!"); } } if (Suffix != '\0') { @@ -1259,9 +1297,9 @@ void Intrinsic::emitReverseVariable(Variable &Dest, Variable &Src) { emitNewLine(); for (unsigned K = 0; K < Dest.getType().getNumVectors(); ++K) { - OS << " " << Dest.getName() << ".val[" << K << "] = " - << "__builtin_shufflevector(" << Src.getName() << ".val[" << K << "], " - << Src.getName() << ".val[" << K << "], __lane_reverse_" + OS << " " << Dest.getName() << ".val[" << K + << "] = " << "__builtin_shufflevector(" << Src.getName() << ".val[" + << K << "], " << Src.getName() << ".val[" << K << "], __lane_reverse_" << Dest.getType().getSizeInBits() << "_" << Dest.getType().getElementSizeInBits() << ");"; emitNewLine(); @@ -1402,7 +1440,7 @@ void Intrinsic::emitBodyAsBuiltinCall() { CastToType.makeInteger(8, true); Arg = "__builtin_bit_cast(" + CastToType.str() + ", " + Arg + ")"; } else if (LocalCK == ClassI && - (CastToType.isInteger() || CastToType.isPoly())) { + (CastToType.isInteger() || CastToType.isPoly())) { CastToType.makeSigned(); Arg = "__builtin_bit_cast(" + CastToType.str() + ", " + Arg + ")"; } @@ -1824,12 +1862,15 @@ Intrinsic::DagEmitter::emitDagNameReplace(const DagInit *DI) { std::string S = Intr.Name; assert_with_loc(DI->getNumArgs() == 2, "name_replace requires 2 arguments!"); - std::string ToReplace = cast<StringInit>(DI->getArg(0))->getAsUnquotedString(); - std::string ReplaceWith = cast<StringInit>(DI->getArg(1))->getAsUnquotedString(); + std::string ToReplace = + cast<StringInit>(DI->getArg(0))->getAsUnquotedString(); + std::string ReplaceWith = + cast<StringInit>(DI->getArg(1))->getAsUnquotedString(); size_t Idx = S.find(ToReplace); - assert_with_loc(Idx != std::string::npos, "name should contain '" + ToReplace + "'!"); + assert_with_loc(Idx != std::string::npos, + "name should contain '" + ToReplace + "'!"); S.replace(Idx, ToReplace.size(), ReplaceWith); return std::make_pair(Type::getVoid(), S); @@ -1894,8 +1935,8 @@ std::string Intrinsic::generate() { return OS.str(); } -void Intrinsic::generateImpl(bool ReverseArguments, - StringRef NamePrefix, StringRef CallPrefix) { +void Intrinsic::generateImpl(bool ReverseArguments, StringRef NamePrefix, + StringRef CallPrefix) { CurrentRecord = R; // If we call a macro, our local variables may be corrupted due to @@ -2009,11 +2050,12 @@ void NeonEmitter::createIntrinsic(const 
Record *R, std::string Proto = std::string(R->getValueAsString("Prototype")); std::string Types = std::string(R->getValueAsString("Types")); const Record *OperationRec = R->getValueAsDef("Operation"); - bool BigEndianSafe = R->getValueAsBit("BigEndianSafe"); + bool BigEndianSafe = R->getValueAsBit("BigEndianSafe"); std::string ArchGuard = std::string(R->getValueAsString("ArchGuard")); std::string TargetGuard = std::string(R->getValueAsString("TargetGuard")); bool IsUnavailable = OperationRec->getValueAsBit("Unavailable"); - std::string CartesianProductWith = std::string(R->getValueAsString("CartesianProductWith")); + std::string CartesianProductWith = + std::string(R->getValueAsString("CartesianProductWith")); // Set the global current record. This allows assert_with_loc to produce // decent location information even when highly nested. @@ -2029,7 +2071,8 @@ void NeonEmitter::createIntrinsic(const Record *R, std::vector<std::pair<TypeSpec, TypeSpec>> NewTypeSpecs; if (!CartesianProductWith.empty()) { - std::vector<TypeSpec> ProductTypeSpecs = TypeSpec::fromTypeSpecs(CartesianProductWith); + std::vector<TypeSpec> ProductTypeSpecs = + TypeSpec::fromTypeSpecs(CartesianProductWith); for (auto TS : TypeSpecs) { Type DefaultT(TS, "."); for (auto SrcTS : ProductTypeSpecs) { @@ -2299,7 +2342,7 @@ void NeonEmitter::runHeader(raw_ostream &OS) { genIntrinsicRangeCheckCode(OS, Defs); } -static void emitNeonTypeDefs(const std::string& types, raw_ostream &OS) { +static void emitNeonTypeDefs(const std::string &types, raw_ostream &OS) { std::string TypedefTypes(types); std::vector<TypeSpec> TDTypeVec = TypeSpec::fromTypeSpecs(TypedefTypes); @@ -2353,7 +2396,7 @@ static void emitNeonTypeDefs(const std::string& types, raw_ostream &OS) { InIfdef = true; } - const char Mods[] = { static_cast<char>('2' + (NumMembers - 2)), 0}; + const char Mods[] = {static_cast<char>('2' + (NumMembers - 2)), 0}; Type VT(TS, Mods); OS << "typedef struct " << VT.str() << " {\n"; OS << " " << T.str() << " val"; @@ -2484,7 +2527,8 @@ void NeonEmitter::run(raw_ostream &OS) { MadeProgress = false; for (SmallVector<Intrinsic *, 128>::iterator I = Defs.begin(); - I != Defs.end(); /*No step*/) { + I != Defs.end(); + /*No step*/) { bool DependenciesSatisfied = true; for (auto *II : (*I)->getDependencies()) { if (is_contained(Defs, II)) @@ -2532,13 +2576,13 @@ void NeonEmitter::runFP16(raw_ostream &OS) { " * Permission is hereby granted, free of charge, to any person " "obtaining a copy\n" " * of this software and associated documentation files (the " - "\"Software\"), to deal\n" + "\"Software\"), to deal\n" " * in the Software without restriction, including without limitation " - "the rights\n" + "the rights\n" " * to use, copy, modify, merge, publish, distribute, sublicense, " - "and/or sell\n" + "and/or sell\n" " * copies of the Software, and to permit persons to whom the Software " - "is\n" + "is\n" " * furnished to do so, subject to the following conditions:\n" " *\n" " * The above copyright notice and this permission notice shall be " @@ -2591,7 +2635,8 @@ void NeonEmitter::runFP16(raw_ostream &OS) { MadeProgress = false; for (SmallVector<Intrinsic *, 128>::iterator I = Defs.begin(); - I != Defs.end(); /*No step*/) { + I != Defs.end(); + /*No step*/) { bool DependenciesSatisfied = true; for (auto *II : (*I)->getDependencies()) { if (is_contained(Defs, II)) @@ -2754,7 +2799,8 @@ void NeonEmitter::runBF16(raw_ostream &OS) { MadeProgress = false; for (SmallVector<Intrinsic *, 128>::iterator I = Defs.begin(); - I != Defs.end(); /*No 
step*/) { + I != Defs.end(); + /*No step*/) { bool DependenciesSatisfied = true; for (auto *II : (*I)->getDependencies()) { if (is_contained(Defs, II))
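To make the emitted pattern concrete: with the ClassI condition extended to poly types, a lane intrinsic wrapper in the generated arm_neon.h bit-casts its poly operands to the matching signed integer types before calling the builtin. The sketch below shows the rough shape only; the macro layout, the temporary names __s0 and __s1, and the exact builtin spelling are illustrative, not verbatim NeonEmitter output:

    /* Approximate shape of a generated lane intrinsic after this patch. */
    #define vset_lane_p16(__p0, __p1, __p2) __extension__ ({           \
      poly16_t __s0 = (__p0);                                          \
      poly16x4_t __s1 = (__p1);                                        \
      (poly16x4_t)__builtin_neon_vset_lane_i16(                        \
          __builtin_bit_cast(int16_t, __s0),                           \
          __builtin_bit_cast(int16x4_t, __s1), (__p2));                \
    })

Because __builtin_bit_cast is an explicit conversion, it is accepted regardless of -fno-lax-vector-conversions, which is exactly what the implicit poly-to-signed conversions it replaces were not.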