https://github.com/jthackray updated https://github.com/llvm/llvm-project/pull/175785
>From 403d8aacd6f9e5c4a115bb77b354f11f5ab7c5b2 Mon Sep 17 00:00:00 2001 From: Jonathan Thackray <[email protected]> Date: Tue, 13 Jan 2026 14:37:20 +0000 Subject: [PATCH 1/2] [AArch64][llvm] Improve codegen for svldr_vnum_za/svstr_vnum_za When compiling `svldr_vnum_za` or `svstr_vnum_za`, the output assembly has a superfluous `SXTW` instruction (gcc doesn't add this); this should be excised (see https://godbolt.org/z/sz4s79rf8). In clang the vnum argument is an `int64_t`, but the LLVM intrinsic takes it as an `i32`. The extra `SXTW` is due to a call to `DAG.getNode(ISD::SIGN_EXTEND...)`. Make them both 64-bit to make the extra `SXTW` go away. --- clang/lib/CodeGen/TargetBuiltins/ARM.cpp | 4 +- .../AArch64/sme-intrinsics/acle_sme_ldr.c | 11 +- .../AArch64/sme-intrinsics/acle_sme_str.c | 11 +- llvm/include/llvm/IR/IntrinsicsAArch64.td | 2 +- .../Target/AArch64/AArch64ISelLowering.cpp | 16 +-- .../CodeGen/AArch64/sme-intrinsics-loads.ll | 118 +++++++++--------- .../CodeGen/AArch64/sme-intrinsics-stores.ll | 118 +++++++++--------- 7 files changed, 133 insertions(+), 147 deletions(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp index 2d7128bf95df2..887e5efac76d7 100644 --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp @@ -4478,9 +4478,9 @@ Value *CodeGenFunction::EmitSMELdrStr(const SVETypeFlags &TypeFlags, SmallVectorImpl<Value *> &Ops, unsigned IntID) { if (Ops.size() == 2) - Ops.push_back(Builder.getInt32(0)); + Ops.push_back(Builder.getInt64(0)); else - Ops[2] = Builder.CreateIntCast(Ops[2], Int32Ty, true); + Ops[2] = Builder.CreateIntCast(Ops[2], Int64Ty, true); Function *F = CGM.getIntrinsic(IntID, {}); return Builder.CreateCall(F, Ops); } diff --git a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_ldr.c b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_ldr.c index 4c102f38fd30d..5e20a76db69c8 100644 --- a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_ldr.c +++ 
b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_ldr.c @@ -9,7 +9,7 @@ // CHECK-C-LABEL: @test_svldr_vnum_za( // CHECK-CXX-LABEL: @_Z18test_svldr_vnum_zajPKv( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i64 0) // CHECK-NEXT: ret void // void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) __arm_out("za") { @@ -19,7 +19,7 @@ void test_svldr_vnum_za(uint32_t slice_base, const void *ptr) __arm_out("za") { // CHECK-C-LABEL: @test_svldr_vnum_za_1( // CHECK-CXX-LABEL: @_Z20test_svldr_vnum_za_1jPKv( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 15) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i64 15) // CHECK-NEXT: ret void // void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) __arm_out("za") { @@ -29,7 +29,7 @@ void test_svldr_vnum_za_1(uint32_t slice_base, const void *ptr) __arm_out("za") // CHECK-C-LABEL: @test_svldr_za( // CHECK-CXX-LABEL: @_Z13test_svldr_zajPKv( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i64 0) // CHECK-NEXT: ret void // void test_svldr_za(uint32_t slice_base, const void *ptr) __arm_out("za") { @@ -39,8 +39,7 @@ void test_svldr_za(uint32_t slice_base, const void *ptr) __arm_out("za") { // CHECK-C-LABEL: @test_svldr_vnum_za_var( // CHECK-CXX-LABEL: @_Z22test_svldr_vnum_za_varjPKvl( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[VNUM:%.*]] to i32 -// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 [[TMP0:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], 
i64 [[VNUM:%.*]]) // CHECK-NEXT: ret void // void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, int64_t vnum) __arm_out("za") { @@ -50,7 +49,7 @@ void test_svldr_vnum_za_var(uint32_t slice_base, const void *ptr, int64_t vnum) // CHECK-C-LABEL: @test_svldr_vnum_za_2( // CHECK-CXX-LABEL: @_Z20test_svldr_vnum_za_2jPKv( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i64 16) // CHECK-NEXT: ret void // void test_svldr_vnum_za_2(uint32_t slice_base, const void *ptr) __arm_out("za") { diff --git a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_str.c b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_str.c index b6ab6b07fb2bc..0d33010e25ad4 100644 --- a/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_str.c +++ b/clang/test/CodeGen/AArch64/sme-intrinsics/acle_sme_str.c @@ -9,7 +9,7 @@ // CHECK-C-LABEL: @test_svstr_vnum_za( // CHECK-CXX-LABEL: @_Z18test_svstr_vnum_zajPv( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i64 0) // CHECK-NEXT: ret void // void test_svstr_vnum_za(uint32_t slice_base, void *ptr) __arm_in("za") { @@ -19,7 +19,7 @@ void test_svstr_vnum_za(uint32_t slice_base, void *ptr) __arm_in("za") { // CHECK-C-LABEL: @test_svstr_vnum_za_1( // CHECK-CXX-LABEL: @_Z20test_svstr_vnum_za_1jPv( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 15) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i64 15) // CHECK-NEXT: ret void // void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) __arm_in("za") { @@ -29,7 +29,7 @@ void test_svstr_vnum_za_1(uint32_t slice_base, void *ptr) __arm_in("za") { // 
CHECK-C-LABEL: @test_svstr_za( // CHECK-CXX-LABEL: @_Z13test_svstr_zajPv( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 0) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i64 0) // CHECK-NEXT: ret void // void test_svstr_za(uint32_t slice_base, void *ptr) __arm_in("za") { @@ -39,8 +39,7 @@ void test_svstr_za(uint32_t slice_base, void *ptr) __arm_in("za") { // CHECK-C-LABEL: @test_svstr_vnum_za_var( // CHECK-CXX-LABEL: @_Z22test_svstr_vnum_za_varjPvl( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[VNUM:%.*]] to i32 -// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 [[TMP0:%.*]]) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i64 [[VNUM:%.*]]) // CHECK-NEXT: ret void // void test_svstr_vnum_za_var(uint32_t slice_base, void *ptr, int64_t vnum) __arm_in("za") { @@ -50,7 +49,7 @@ void test_svstr_vnum_za_var(uint32_t slice_base, void *ptr, int64_t vnum) __arm_ // CHECK-C-LABEL: @test_svstr_vnum_za_2( // CHECK-CXX-LABEL: @_Z20test_svstr_vnum_za_2jPv( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i32 16) +// CHECK-NEXT: tail call void @llvm.aarch64.sme.str(i32 [[SLICE_BASE:%.*]], ptr [[PTR:%.*]], i64 16) // CHECK-NEXT: ret void // void test_svstr_vnum_za_2(uint32_t slice_base, void *ptr) __arm_in("za") { diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index fd56e0e3f9e7b..e0dcb500fb14c 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -2958,7 +2958,7 @@ let TargetPrefix = "aarch64" in { // Spill + fill class SME_LDR_STR_ZA_Intrinsic - : DefaultAttrsIntrinsic<[], [llvm_i32_ty, llvm_ptr_ty, llvm_i32_ty], [IntrInaccessibleMemOrArgMemOnly]>; + : DefaultAttrsIntrinsic<[], 
[llvm_i32_ty, llvm_ptr_ty, llvm_i64_ty], [IntrInaccessibleMemOrArgMemOnly]>; def int_aarch64_sme_ldr : SME_LDR_STR_ZA_Intrinsic; def int_aarch64_sme_str : SME_LDR_STR_ZA_Intrinsic; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 74ee8ff8ab5f5..8df9e30d40bd2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -6099,7 +6099,7 @@ SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) { SDValue TileSlice = N->getOperand(2); SDValue Base = N->getOperand(3); SDValue VecNum = N->getOperand(4); - int32_t ConstAddend = 0; + int64_t ConstAddend = 0; SDValue VarAddend = VecNum; // If the vnum is an add of an immediate, we can fold it into the instruction @@ -6113,10 +6113,10 @@ SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) { } int32_t ImmAddend = ConstAddend % 16; - if (int32_t C = (ConstAddend - ImmAddend)) { - SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32); + if (int64_t C = (ConstAddend - ImmAddend)) { + SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i64); VarAddend = VarAddend - ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal}) + ? 
DAG.getNode(ISD::ADD, DL, MVT::i64, {VarAddend, CVal}) : CVal; } @@ -6126,12 +6126,12 @@ SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) { DAG.getConstant(1, DL, MVT::i32)); // Multiply SVL and vnum then add it to the base - SDValue Mul = DAG.getNode( - ISD::MUL, DL, MVT::i64, - {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)}); + SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, {SVL, VarAddend}); Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul}); + // Just add vnum to the tileslice - TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend}); + SDValue VarAddend32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, VarAddend); + TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend32}); } return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR, diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll index 57f8e5438eaf2..22b27831f4b3d 100644 --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-loads.ll @@ -252,7 +252,7 @@ define void @ldr(ptr %ptr) { ; CHECK-NEXT: mov w12, wzr ; CHECK-NEXT: ldr za[w12, 0], [x0] ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.ldr(i32 0, ptr %ptr, i32 0) + call void @llvm.aarch64.sme.ldr(i32 0, ptr %ptr, i64 0) ret void; } @@ -264,7 +264,7 @@ define void @ldr_with_off_15(ptr %ptr) { ; CHECK-NEXT: ldr za[w12, 0], [x8] ; CHECK-NEXT: ret %base = getelementptr i8, ptr %ptr, i64 15 - call void @llvm.aarch64.sme.ldr(i32 15, ptr %base, i32 0) + call void @llvm.aarch64.sme.ldr(i32 15, ptr %base, i64 0) ret void; } @@ -278,7 +278,7 @@ define void @ldr_with_off_15mulvl(ptr %ptr) { %vscale = call i64 @llvm.vscale.i64() %mulvl = mul i64 %vscale, 240 %base = getelementptr i8, ptr %ptr, i64 %mulvl - call void @llvm.aarch64.sme.ldr(i32 15, ptr %base, i32 0) + call void @llvm.aarch64.sme.ldr(i32 15, ptr %base, i64 0) ret void; } @@ -292,21 +292,19 @@ define void 
@ldr_with_off_16mulvl(ptr %ptr) { %vscale = call i64 @llvm.vscale.i64() %mulvl = mul i64 %vscale, 256 %base = getelementptr i8, ptr %ptr, i64 %mulvl - call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 0) + call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i64 0) ret void; } -define void @ldr_with_off_var(ptr %base, i32 %off) { +define void @ldr_with_off_var(ptr %base, i64 %off) { ; CHECK-LABEL: ldr_with_off_var: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: sxtw x8, w1 -; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: add w12, w1, #16 -; CHECK-NEXT: madd x8, x9, x8, x0 +; CHECK-NEXT: madd x8, x8, x1, x0 ; CHECK-NEXT: ldr za[w12, 0], [x8] ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 %off) + call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i64 %off) ret void; } @@ -316,7 +314,7 @@ define void @ldr_with_off_15imm(ptr %base) { ; CHECK-NEXT: mov w12, #16 // =0x10 ; CHECK-NEXT: ldr za[w12, 15], [x0, #15, mul vl] ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 15) + call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i64 15) ret void; } @@ -328,7 +326,7 @@ define void @ldr_with_off_16imm(ptr %base) { ; CHECK-NEXT: add x8, x0, x8, lsl #4 ; CHECK-NEXT: ldr za[w12, 0], [x8] ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 16) + call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i64 16) ret void; } @@ -342,10 +340,10 @@ define void @ldr_with_off_many_imm(i32 %tile_slice, ptr %ptr) { ; CHECK-NEXT: ldr za[w12, 4], [x1, #4, mul vl] ; CHECK-NEXT: ret entry: - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 1) - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 2) - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 3) - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 4) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 1) + tail call void 
@llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 2) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 3) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 4) ret void } @@ -362,10 +360,10 @@ define void @ldr_with_off_many_imm_15_18(i32 %tile_slice, ptr %ptr) { ; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl] ; CHECK-NEXT: ret entry: - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 15) - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 16) - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 17) - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 18) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 15) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 16) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 17) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 18) ret void } @@ -381,10 +379,10 @@ define void @ldr_with_off_many_imm_16_19(i32 %tile_slice, ptr %ptr) { ; CHECK-NEXT: ldr za[w12, 3], [x8, #3, mul vl] ; CHECK-NEXT: ret entry: - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 16) - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 17) - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 18) - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 19) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 16) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 17) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 18) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 19) ret void } @@ -402,10 +400,10 @@ define void @ldr_with_off_many_imm_31_34(i32 %tile_slice, ptr %ptr) { ; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl] ; CHECK-NEXT: ret entry: - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 31) - tail call 
void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 32) - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 33) - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 34) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 31) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 32) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 33) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 34) ret void } @@ -421,60 +419,56 @@ define void @ldr_with_off_many_imm_32_35(i32 %tile_slice, ptr %ptr, i64 %vnum) { ; CHECK-NEXT: ldr za[w12, 3], [x8, #3, mul vl] ; CHECK-NEXT: ret entry: - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 32) - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 33) - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 34) - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 35) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 32) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 33) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 34) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 35) ret void } define void @ldr_with_off_many_var(i32 %tile_slice, ptr %ptr, i64 %vnum) { ; CHECK-LABEL: ldr_with_off_many_var: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sxtw x8, w2 -; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: add w12, w0, w2 -; CHECK-NEXT: madd x8, x9, x8, x1 +; CHECK-NEXT: madd x8, x8, x2, x1 ; CHECK-NEXT: ldr za[w12, 0], [x8] ; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl] ; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl] ; CHECK-NEXT: ldr za[w12, 3], [x8, #3, mul vl] ; CHECK-NEXT: ret entry: - %0 = trunc i64 %vnum to i32 - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %0) - %1 = add i32 %0, 1 - tail call void @llvm.aarch64.sme.ldr(i32 
%tile_slice, ptr %ptr, i32 %1) - %2 = add i32 %0, 2 - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %2) - %3 = add i32 %0, 3 - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %3) + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 %vnum) + %1 = add i64 %vnum, 1 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 %1) + %2 = add i64 %vnum, 2 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 %2) + %3 = add i64 %vnum, 3 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 %3) ret void } define void @ldr_with_off_many_var_high(i32 %tile_slice, ptr %ptr, i64 %vnum) { ; CHECK-LABEL: ldr_with_off_many_var_high: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: add w8, w2, #32 -; CHECK-NEXT: rdsvl x10, #1 -; CHECK-NEXT: sxtw x9, w8 -; CHECK-NEXT: add w12, w0, w8 -; CHECK-NEXT: madd x9, x10, x9, x1 -; CHECK-NEXT: ldr za[w12, 1], [x9, #1, mul vl] -; CHECK-NEXT: ldr za[w12, 2], [x9, #2, mul vl] -; CHECK-NEXT: ldr za[w12, 3], [x9, #3, mul vl] -; CHECK-NEXT: ldr za[w12, 4], [x9, #4, mul vl] +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: add x9, x2, #32 +; CHECK-NEXT: madd x8, x8, x9, x1 +; CHECK-NEXT: add w12, w0, w9 +; CHECK-NEXT: ldr za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: ldr za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: ldr za[w12, 3], [x8, #3, mul vl] +; CHECK-NEXT: ldr za[w12, 4], [x8, #4, mul vl] ; CHECK-NEXT: ret entry: - %0 = trunc i64 %vnum to i32 - %1 = add i32 %0, 33 - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %1) - %2 = add i32 %0, 34 - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %2) - %3 = add i32 %0, 35 - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %3) - %4 = add i32 %0, 36 - tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %4) + %1 = add i64 %vnum, 33 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 %1) + %2 = add i64 %vnum, 34 + tail call void 
@llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 %2) + %3 = add i64 %vnum, 35 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 %3) + %4 = add i64 %vnum, 36 + tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i64 %4) ret void } @@ -523,5 +517,5 @@ declare void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1>, ptr, i32, i32) declare void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1>, ptr, i32, i32) declare void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1>, ptr, i32, i32) -declare void @llvm.aarch64.sme.ldr(i32, ptr, i32) +declare void @llvm.aarch64.sme.ldr(i32, ptr, i64) declare i64 @llvm.vscale.i64() diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll index 1ff32aade4a1f..7f0361254625c 100644 --- a/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll +++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-stores.ll @@ -252,7 +252,7 @@ define void @str(ptr %ptr) { ; CHECK-NEXT: mov w12, wzr ; CHECK-NEXT: str za[w12, 0], [x0] ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.str(i32 0, ptr %ptr, i32 0) + call void @llvm.aarch64.sme.str(i32 0, ptr %ptr, i64 0) ret void; } @@ -264,7 +264,7 @@ define void @str_with_off_15(ptr %ptr) { ; CHECK-NEXT: str za[w12, 0], [x8] ; CHECK-NEXT: ret %base = getelementptr i8, ptr %ptr, i64 15 - call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 0) + call void @llvm.aarch64.sme.str(i32 15, ptr %base, i64 0) ret void; } @@ -278,7 +278,7 @@ define void @str_with_off_15mulvl(ptr %ptr) { %vscale = call i64 @llvm.vscale.i64() %mulvl = mul i64 %vscale, 240 %base = getelementptr i8, ptr %ptr, i64 %mulvl - call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 0) + call void @llvm.aarch64.sme.str(i32 15, ptr %base, i64 0) ret void; } @@ -292,21 +292,19 @@ define void @str_with_off_16mulvl(ptr %ptr) { %vscale = call i64 @llvm.vscale.i64() %mulvl = mul i64 %vscale, 256 %base = getelementptr i8, ptr %ptr, i64 %mulvl - call void @llvm.aarch64.sme.str(i32 
16, ptr %base, i32 0) + call void @llvm.aarch64.sme.str(i32 16, ptr %base, i64 0) ret void; } -define void @str_with_off_var(ptr %base, i32 %off) { +define void @str_with_off_var(ptr %base, i64 %off) { ; CHECK-LABEL: str_with_off_var: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 -; CHECK-NEXT: sxtw x8, w1 -; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: add w12, w1, #16 -; CHECK-NEXT: madd x8, x9, x8, x0 +; CHECK-NEXT: madd x8, x8, x1, x0 ; CHECK-NEXT: str za[w12, 0], [x8] ; CHECK-NEXT: ret - call void @llvm.aarch64.sme.str(i32 16, ptr %base, i32 %off) + call void @llvm.aarch64.sme.str(i32 16, ptr %base, i64 %off) ret void; } @@ -318,7 +316,7 @@ define void @str_with_off_15imm(ptr %ptr) { ; CHECK-NEXT: str za[w12, 15], [x8, #15, mul vl] ; CHECK-NEXT: ret %base = getelementptr i8, ptr %ptr, i64 15 - call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 15) + call void @llvm.aarch64.sme.str(i32 15, ptr %base, i64 15) ret void; } @@ -332,7 +330,7 @@ define void @str_with_off_16imm(ptr %ptr) { ; CHECK-NEXT: str za[w12, 0], [x8] ; CHECK-NEXT: ret %base = getelementptr i8, ptr %ptr, i64 15 - call void @llvm.aarch64.sme.str(i32 15, ptr %base, i32 16) + call void @llvm.aarch64.sme.str(i32 15, ptr %base, i64 16) ret void; } @@ -346,10 +344,10 @@ define void @str_with_off_many_imm(i32 %tile_slice, ptr %ptr) { ; CHECK-NEXT: str za[w12, 4], [x1, #4, mul vl] ; CHECK-NEXT: ret entry: - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 1) - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 2) - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 3) - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 4) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 1) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 2) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 3) + tail call void @llvm.aarch64.sme.str(i32 
%tile_slice, ptr %ptr, i64 4) ret void } @@ -366,10 +364,10 @@ define void @str_with_off_many_imm_15_18(i32 %tile_slice, ptr %ptr) { ; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl] ; CHECK-NEXT: ret entry: - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 15) - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 16) - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 17) - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 18) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 15) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 16) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 17) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 18) ret void } @@ -385,10 +383,10 @@ define void @str_with_off_many_imm_16_19(i32 %tile_slice, ptr %ptr) { ; CHECK-NEXT: str za[w12, 3], [x8, #3, mul vl] ; CHECK-NEXT: ret entry: - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 16) - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 17) - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 18) - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 19) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 16) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 17) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 18) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 19) ret void } @@ -406,10 +404,10 @@ define void @str_with_off_many_imm_31_34(i32 %tile_slice, ptr %ptr) { ; CHECK-NEXT: str za[w13, 2], [x8, #2, mul vl] ; CHECK-NEXT: ret entry: - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 31) - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 32) - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 33) - tail call void 
@llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 34) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 31) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 32) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 33) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 34) ret void } @@ -425,60 +423,56 @@ define void @str_with_off_many_imm_32_35(i32 %tile_slice, ptr %ptr) { ; CHECK-NEXT: str za[w12, 3], [x8, #3, mul vl] ; CHECK-NEXT: ret entry: - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 32) - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 33) - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 34) - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 35) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 32) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 33) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 34) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 35) ret void } define void @str_with_off_many_var(i32 %tile_slice, ptr %ptr, i64 %vnum) { ; CHECK-LABEL: str_with_off_many_var: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sxtw x8, w2 -; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: add w12, w0, w2 -; CHECK-NEXT: madd x8, x9, x8, x1 +; CHECK-NEXT: madd x8, x8, x2, x1 ; CHECK-NEXT: str za[w12, 0], [x8] ; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl] ; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl] ; CHECK-NEXT: str za[w12, 3], [x8, #3, mul vl] ; CHECK-NEXT: ret entry: - %0 = trunc i64 %vnum to i32 - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %0) - %1 = add i32 %0, 1 - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %1) - %2 = add i32 %0, 2 - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %2) - %3 = add i32 %0, 3 - tail call void 
@llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %3) + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 %vnum) + %1 = add i64 %vnum, 1 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 %1) + %2 = add i64 %vnum, 2 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 %2) + %3 = add i64 %vnum, 3 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 %3) ret void } define void @str_with_off_many_var_high(i32 %tile_slice, ptr %ptr, i64 %vnum) { ; CHECK-LABEL: str_with_off_many_var_high: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: add w8, w2, #32 -; CHECK-NEXT: rdsvl x10, #1 -; CHECK-NEXT: sxtw x9, w8 -; CHECK-NEXT: add w12, w0, w8 -; CHECK-NEXT: madd x9, x10, x9, x1 -; CHECK-NEXT: str za[w12, 1], [x9, #1, mul vl] -; CHECK-NEXT: str za[w12, 2], [x9, #2, mul vl] -; CHECK-NEXT: str za[w12, 3], [x9, #3, mul vl] -; CHECK-NEXT: str za[w12, 4], [x9, #4, mul vl] +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: add x9, x2, #32 +; CHECK-NEXT: madd x8, x8, x9, x1 +; CHECK-NEXT: add w12, w0, w9 +; CHECK-NEXT: str za[w12, 1], [x8, #1, mul vl] +; CHECK-NEXT: str za[w12, 2], [x8, #2, mul vl] +; CHECK-NEXT: str za[w12, 3], [x8, #3, mul vl] +; CHECK-NEXT: str za[w12, 4], [x8, #4, mul vl] ; CHECK-NEXT: ret entry: - %0 = trunc i64 %vnum to i32 - %1 = add i32 %0, 33 - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %1) - %2 = add i32 %0, 34 - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %2) - %3 = add i32 %0, 35 - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %3) - %4 = add i32 %0, 36 - tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i32 %4) + %1 = add i64 %vnum, 33 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 %1) + %2 = add i64 %vnum, 34 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 %2) + %3 = add i64 %vnum, 35 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 %3) + %4 = 
add i64 %vnum, 36 + tail call void @llvm.aarch64.sme.str(i32 %tile_slice, ptr %ptr, i64 %4) ret void } @@ -527,5 +521,5 @@ declare void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1>, ptr, i32, i32) declare void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1>, ptr, i32, i32) declare void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1>, ptr, i32, i32) -declare void @llvm.aarch64.sme.str(i32, ptr, i32) +declare void @llvm.aarch64.sme.str(i32, ptr, i64) declare i64 @llvm.vscale.i64() >From 18c43031eb968923d59ac95d3413746f81d1de04 Mon Sep 17 00:00:00 2001 From: Jonathan Thackray <[email protected]> Date: Tue, 13 Jan 2026 17:23:20 +0000 Subject: [PATCH 2/2] fixup! [AArch64][llvm] Improve codegen for svldr_vnum_za/svstr_vnum_za Fix MLIR tests --- mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td | 2 +- mlir/test/Target/LLVMIR/arm-sme.mlir | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td index 4d19fa5415ef0..6f2d5b867f959 100644 --- a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td @@ -159,7 +159,7 @@ def LLVM_aarch64_sme_str : ArmSME_IntrOp<"str">, Arguments<(ins Arg<I32, "Index">:$index, Arg<LLVM_AnyPointer, "Store address", [MemWrite]>:$store_address, - Arg<I32, "Offset">:$offset)>; + Arg<I64, "Offset">:$offset)>; // Vector to tile slice class LLVM_aarch64_sme_write<string direction> diff --git a/mlir/test/Target/LLVMIR/arm-sme.mlir b/mlir/test/Target/LLVMIR/arm-sme.mlir index 0a13a75618a23..ef37bfdffeed9 100644 --- a/mlir/test/Target/LLVMIR/arm-sme.mlir +++ b/mlir/test/Target/LLVMIR/arm-sme.mlir @@ -190,6 +190,7 @@ llvm.func @arm_sme_store(%nxv1i1 : vector<[1]xi1>, %nxv16i1 : vector<[16]xi1>, %ptr : !llvm.ptr) { %c0 = llvm.mlir.constant(0 : index) : i32 + %c0_i64 = llvm.mlir.constant(0 : i64) : i64 // CHECK: call void @llvm.aarch64.sme.st1q.horiz 
"arm_sme.intr.st1q.horiz"(%nxv1i1, %ptr, %c0) <{tile_id = 0 : i32}> : (vector<[1]xi1>, !llvm.ptr, i32) -> () @@ -221,7 +222,7 @@ llvm.func @arm_sme_store(%nxv1i1 : vector<[1]xi1>, "arm_sme.intr.st1b.vert"(%nxv16i1, %ptr, %c0) <{tile_id = 0 : i32}> : (vector<[16]xi1>, !llvm.ptr, i32) -> () // CHECK: call void @llvm.aarch64.sme.str - "arm_sme.intr.str"(%c0, %ptr, %c0) : (i32, !llvm.ptr, i32) -> () + "arm_sme.intr.str"(%c0, %ptr, %c0_i64) : (i32, !llvm.ptr, i64) -> () llvm.return } _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
