[llvm] [clang] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
@@ -0,0 +1,280 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s MDevereau wrote: I'm not sure if this is a diagnostic problem but I get errors about requiring sve when using the C tuple types. https://github.com/llvm/llvm-project/pull/73317 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
@@ -321,9 +321,18 @@ let TargetGuard = "sme2" in { let TargetGuard = "sme2" in { def SVLDR_ZT : Inst<"svldr_zt", "viQ", "", MergeNone, "aarch64_sme_ldr_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>]>; def SVSTR_ZT : Inst<"svstr_zt", "vi%", "", MergeNone, "aarch64_sme_str_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>]>; +} // // Zero ZT0 // +let TargetGuard = "sme2" in { MDevereau wrote: I think these were added in individually which might explain it. The blocks of instructions are separated by the comments, and maybe it makes it easier to see what individual instructions need once the list of defs grow in size? I've no objections to your suggestion, but the rest of this file doesn't really follow it I think? https://github.com/llvm/llvm-project/pull/73317 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
@@ -1864,6 +1866,35 @@ void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs, SelectUnaryMultiIntrinsic(N, NumVecs, true, Opcode); } +void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node, +unsigned NumOutVecs, +unsigned Opc, +uint32_t MaxImm) { + if (ConstantSDNode *Imm = dyn_cast(Node->getOperand(4))) +if (Imm->getZExtValue() > MaxImm) + return; + + SDValue ZtValue; + if (!ImmToReg(Node->getOperand(2), ZtValue)) +return; + SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)}; + SDLoc DL(Node); + EVT VT = Node->getValueType(0); + + SDNode *Instruction = + CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops); + SDValue SuperReg = SDValue(Instruction, 0); + + for (unsigned i = 0; i < NumOutVecs; ++i) sdesmalen-arm wrote: nit: variable names should start with upper-case ```suggestion for (unsigned I = 0; I < NumOutVecs; ++I) ``` https://github.com/llvm/llvm-project/pull/73317 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
https://github.com/sdesmalen-arm commented: Looks mostly fine, just have a few nits. https://github.com/llvm/llvm-project/pull/73317 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
https://github.com/sdesmalen-arm edited https://github.com/llvm/llvm-project/pull/73317 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
@@ -0,0 +1,233 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 2 + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +// CHECK-LABEL: define dso_local @test_svluti4_lane_zt_u16 +// CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT:[[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8i16(i32 0, [[ZN]], i32 0) +// CHECK-NEXT:[[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT:[[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CHECK-NEXT:[[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT:[[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CHECK-NEXT:[[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT:[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CHECK-NEXT:[[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT:[[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CHECK-NEXT:ret [[TMP8]] +// +// CPP-CHECK-LABEL: define dso_local @_Z24test_svluti4_lane_zt_u16u11__SVUint8_t +// CPP-CHECK-SAME: ( [[ZN:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT:[[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.lane.zt.x4.nxv8i16(i32 0, [[ZN]], i32 0) +// CPP-CHECK-NEXT:[[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT:[[TMP2:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT:[[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT:[[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) +// CPP-CHECK-NEXT:[[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT:[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP4]], [[TMP5]], i64 16) +// CPP-CHECK-NEXT:[[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT:[[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) +// CPP-CHECK-NEXT:ret [[TMP8]] +// +svuint16x4_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za { + return svluti4_lane_zt_u16_x4(0, zn, 0); sdesmalen-arm wrote: ```suggestion return svluti4_lane_zt_u16_x4(0, zn, 1); ``` Rather than using `0` as the immediate, can you use the maximum range value in this and other tests? https://github.com/llvm/llvm-project/pull/73317 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
@@ -0,0 +1,280 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +// CHECK-LABEL: @test_svluti2_lane_zt_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0, [[ZN:%.*]], i32 0) +// CHECK-NEXT:[[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT:[[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT:[[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT:[[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT:[[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT:[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT:[[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT:[[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CHECK-NEXT:ret [[TMP8]] +// +// CPP-CHECK-LABEL: @_Z23test_svluti2_lane_zt_u8u11__SVUint8_t( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT:[[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0, [[ZN:%.*]], i32 0) +// CPP-CHECK-NEXT:[[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CPP-CHECK-NEXT:[[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CPP-CHECK-NEXT:[[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CPP-CHECK-NEXT:[[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CPP-CHECK-NEXT:[[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CPP-CHECK-NEXT:[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CPP-CHECK-NEXT:[[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CPP-CHECK-NEXT:[[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) +// CPP-CHECK-NEXT:ret [[TMP8]] +// +svuint8x4_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_shared_za __arm_preserves_za { david-arm wrote: OK that's fair enough! https://github.com/llvm/llvm-project/pull/73317 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
https://github.com/MDevereau edited https://github.com/llvm/llvm-project/pull/73317 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
@@ -1859,6 +1867,34 @@ void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs, SelectUnaryMultiIntrinsic(N, NumVecs, true, Opcode); } +template +void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node, +unsigned NumOutVecs, +unsigned Opc) { + if (ConstantSDNode *Imm = dyn_cast(Node->getOperand(4))) +if (Imm->getZExtValue() > Max) + return; + + SDValue ZtValue; + ImmToTile(Node->getOperand(2), ZtValue); MDevereau wrote: I'm not quite sure what you mean here. Why would `Op2 != 0` crash? I think tests added such as [this one](https://github.com/llvm/llvm-project/pull/73317/files/32ce28d2fccda24ea9b223f204ca327133e3d0f9#diff-4626c48918de6a8e9ba8eecc9909d7a6febd2208294852599d4b147f6906656fR188-R189) should demonstrate Op2 != 0 ok https://github.com/llvm/llvm-project/pull/73317 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
@@ -1859,6 +1867,34 @@ void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs, SelectUnaryMultiIntrinsic(N, NumVecs, true, Opcode); } +template +void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node, +unsigned NumOutVecs, +unsigned Opc) { + if (ConstantSDNode *Imm = dyn_cast(Node->getOperand(4))) +if (Imm->getZExtValue() > Max) + return; + + SDValue ZtValue; + ImmToTile(Node->getOperand(2), ZtValue); david-arm wrote: If someone invokes the intrinsic with Op2 != 0 this will likely crash. Is it worth asserting the result of ImmToTile is true so that at least it's more obvious? https://github.com/llvm/llvm-project/pull/73317 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
https://github.com/MDevereau updated https://github.com/llvm/llvm-project/pull/73317 >From f5b909e24e3cea49d98b40797880e4329a7a1e4f Mon Sep 17 00:00:00 2001 From: Matt Devereau Date: Mon, 20 Nov 2023 15:50:28 + Subject: [PATCH 1/5] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics See https://github.com/ARM-software/acle/pull/217 Patch by: Hassnaa Hamdi --- clang/include/clang/Basic/arm_sme.td | 8 + .../acle_sme2_luti2_lane_zt_x4.c | 201 ++ .../acle_sme2_luti4_lane_zt_x4.c | 148 + .../aarch64-sme2-intrinsics/acle_sme2_imm.cpp | 31 +++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 11 + .../Target/AArch64/AArch64ISelDAGToDAG.cpp| 55 - .../Target/AArch64/AArch64RegisterInfo.cpp| 6 + llvm/lib/Target/AArch64/SMEInstrFormats.td| 11 +- .../AArch64/sme2-intrinsics-luti2-lane-x4.ll | 35 +++ .../AArch64/sme2-intrinsics-luti4-lane-x4.ll | 25 +++ 10 files changed, 525 insertions(+), 6 deletions(-) create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c create mode 100644 clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-luti2-lane-x4.ll create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index b5655afdf419ecf..53319a57d73fdd2 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -298,3 +298,11 @@ multiclass ZAAddSub { defm SVADD : ZAAddSub<"add">; defm SVSUB : ZAAddSub<"sub">; + +// +// lookup table expand four contiguous registers +// +let TargetGuard = "sme2" in { + def SVLUTI2_LANE_ZT_X4 : Inst<"svluti2_lane_zt[_{d}]_x4", "4.didi", "cUcsUsiUi", MergeNone, "aarch64_sme_luti2_lane_zt_x4", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>; + def SVLUTI4_LANE_ZT_X4 : Inst<"svluti4_lane_zt[_{d}]_x4", "4.didi", "sUsiUi", MergeNone, "aarch64_sme_luti4_lane_zt_x4", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_1>]>; +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c new file mode 100644 index 000..5479fa109e839c5 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c @@ -0,0 +1,201 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + + + +// CHECK-LABEL: @test_svluti2_lane_zt_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0, [[ZN:%.*]], i32 0) +// CHECK-NEXT:[[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT:[[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT:[[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT:[[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT:[[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT:[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT:[[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT:[[TMP8:%.*]] = tail call
[llvm] [clang] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
@@ -1666,7 +1674,8 @@ static unsigned SelectOpcodeFromVT(EVT VT, ArrayRef Opcodes) { return 0; break; case SelectTypeKind::FP: -if (EltVT != MVT::f16 && EltVT != MVT::f32 && EltVT != MVT::f64) +if (EltVT != MVT::bf16 && EltVT != MVT::f16 && EltVT != MVT::f32 && +EltVT != MVT::f64) MDevereau wrote: No its not. I'll remove it. https://github.com/llvm/llvm-project/pull/73317 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
@@ -1666,7 +1674,8 @@ static unsigned SelectOpcodeFromVT(EVT VT, ArrayRef Opcodes) { return 0; break; case SelectTypeKind::FP: -if (EltVT != MVT::f16 && EltVT != MVT::f32 && EltVT != MVT::f64) +if (EltVT != MVT::bf16 && EltVT != MVT::f16 && EltVT != MVT::f32 && +EltVT != MVT::f64) kmclaughlin-arm wrote: Is this change needed now that you're using AnyType? https://github.com/llvm/llvm-project/pull/73317 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
@@ -5098,6 +5099,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::LUTI2_4ZTZI_S})) // Second Immediate must be <= 3: SelectMultiVectorLuti<3>(Node, 4, Opc); + else if (auto Opc = SelectOpcodeFromVT( kmclaughlin-arm wrote: If AnyType is used and i1 is passed, `SelectOpcodeFromVT` will return 0. I think it would be the same for i64/f64 as the size of `Opcodes` for these intrinsics is 3 and the function will check this when trying to return the correct opcode from the list: `return (Opcodes.size() <= Offset) ? 0 : Opcodes[Offset];` https://github.com/llvm/llvm-project/pull/73317 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
@@ -5098,6 +5099,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::LUTI2_4ZTZI_S})) // Second Immediate must be <= 3: SelectMultiVectorLuti<3>(Node, 4, Opc); + else if (auto Opc = SelectOpcodeFromVT( MDevereau wrote: Yeah that does seem better. I'm not sure what the implications of letting i1, i64 and f64 are though. https://github.com/llvm/llvm-project/pull/73317 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
https://github.com/MDevereau updated https://github.com/llvm/llvm-project/pull/73317 >From f5b909e24e3cea49d98b40797880e4329a7a1e4f Mon Sep 17 00:00:00 2001 From: Matt Devereau Date: Mon, 20 Nov 2023 15:50:28 + Subject: [PATCH 1/4] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics See https://github.com/ARM-software/acle/pull/217 Patch by: Hassnaa Hamdi --- clang/include/clang/Basic/arm_sme.td | 8 + .../acle_sme2_luti2_lane_zt_x4.c | 201 ++ .../acle_sme2_luti4_lane_zt_x4.c | 148 + .../aarch64-sme2-intrinsics/acle_sme2_imm.cpp | 31 +++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 11 + .../Target/AArch64/AArch64ISelDAGToDAG.cpp| 55 - .../Target/AArch64/AArch64RegisterInfo.cpp| 6 + llvm/lib/Target/AArch64/SMEInstrFormats.td| 11 +- .../AArch64/sme2-intrinsics-luti2-lane-x4.ll | 35 +++ .../AArch64/sme2-intrinsics-luti4-lane-x4.ll | 25 +++ 10 files changed, 525 insertions(+), 6 deletions(-) create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c create mode 100644 clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-luti2-lane-x4.ll create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index b5655afdf419ecf..53319a57d73fdd2 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -298,3 +298,11 @@ multiclass ZAAddSub { defm SVADD : ZAAddSub<"add">; defm SVSUB : ZAAddSub<"sub">; + +// +// lookup table expand four contiguous registers +// +let TargetGuard = "sme2" in { + def SVLUTI2_LANE_ZT_X4 : Inst<"svluti2_lane_zt[_{d}]_x4", "4.didi", "cUcsUsiUi", MergeNone, "aarch64_sme_luti2_lane_zt_x4", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>; + def SVLUTI4_LANE_ZT_X4 : Inst<"svluti4_lane_zt[_{d}]_x4", "4.didi", "sUsiUi", MergeNone, "aarch64_sme_luti4_lane_zt_x4", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_1>]>; +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c new file mode 100644 index 000..5479fa109e839c5 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c @@ -0,0 +1,201 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + + + +// CHECK-LABEL: @test_svluti2_lane_zt_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0, [[ZN:%.*]], i32 0) +// CHECK-NEXT:[[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT:[[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT:[[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT:[[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT:[[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT:[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT:[[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT:[[TMP8:%.*]] = tail call
[llvm] [clang] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
@@ -5098,6 +5099,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::LUTI2_4ZTZI_S})) // Second Immediate must be <= 3: SelectMultiVectorLuti<3>(Node, 4, Opc); + else if (auto Opc = SelectOpcodeFromVT( kmclaughlin-arm wrote: Can you instead just pass `` in the if statement above? https://github.com/llvm/llvm-project/pull/73317 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics (PR #73317)
https://github.com/ldionne updated https://github.com/llvm/llvm-project/pull/73317 >From f5b909e24e3cea49d98b40797880e4329a7a1e4f Mon Sep 17 00:00:00 2001 From: Matt Devereau Date: Mon, 20 Nov 2023 15:50:28 + Subject: [PATCH 1/2] [SME2] Add LUTI2 and LUTI4 quad Builtins and Intrinsics See https://github.com/ARM-software/acle/pull/217 Patch by: Hassnaa Hamdi --- clang/include/clang/Basic/arm_sme.td | 8 + .../acle_sme2_luti2_lane_zt_x4.c | 201 ++ .../acle_sme2_luti4_lane_zt_x4.c | 148 + .../aarch64-sme2-intrinsics/acle_sme2_imm.cpp | 31 +++ llvm/include/llvm/IR/IntrinsicsAArch64.td | 11 + .../Target/AArch64/AArch64ISelDAGToDAG.cpp| 55 - .../Target/AArch64/AArch64RegisterInfo.cpp| 6 + llvm/lib/Target/AArch64/SMEInstrFormats.td| 11 +- .../AArch64/sme2-intrinsics-luti2-lane-x4.ll | 35 +++ .../AArch64/sme2-intrinsics-luti4-lane-x4.ll | 25 +++ 10 files changed, 525 insertions(+), 6 deletions(-) create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c create mode 100644 clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-luti2-lane-x4.ll create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index b5655afdf419ecf..53319a57d73fdd2 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -298,3 +298,11 @@ multiclass ZAAddSub { defm SVADD : ZAAddSub<"add">; defm SVSUB : ZAAddSub<"sub">; + +// +// lookup table expand four contiguous registers +// +let TargetGuard = "sme2" in { + def SVLUTI2_LANE_ZT_X4 : Inst<"svluti2_lane_zt[_{d}]_x4", "4.didi", "cUcsUsiUi", MergeNone, "aarch64_sme_luti2_lane_zt_x4", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>; + def SVLUTI4_LANE_ZT_X4 : Inst<"svluti4_lane_zt[_{d}]_x4", "4.didi", "sUsiUi", MergeNone, "aarch64_sme_luti4_lane_zt_x4", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_1>]>; +} diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c new file mode 100644 index 000..5479fa109e839c5 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c @@ -0,0 +1,201 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s + +#include + +#ifdef SVE_OVERLOADED_FORMS +// A simple used,unused... macro, long enough to represent any SVE builtin. +#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3 +#else +#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4 +#endif + + + +// CHECK-LABEL: @test_svluti2_lane_zt_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT:[[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti2.lane.zt.x4.nxv16i8(i32 0, [[ZN:%.*]], i32 0) +// CHECK-NEXT:[[TMP1:%.*]] = extractvalue { , , , } [[TMP0]], 0 +// CHECK-NEXT:[[TMP2:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( poison, [[TMP1]], i64 0) +// CHECK-NEXT:[[TMP3:%.*]] = extractvalue { , , , } [[TMP0]], 1 +// CHECK-NEXT:[[TMP4:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) +// CHECK-NEXT:[[TMP5:%.*]] = extractvalue { , , , } [[TMP0]], 2 +// CHECK-NEXT:[[TMP6:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP4]], [[TMP5]], i64 32) +// CHECK-NEXT:[[TMP7:%.*]] = extractvalue { , , , } [[TMP0]], 3 +// CHECK-NEXT:[[TMP8:%.*]] = tail call