https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/197861
>From 48ba9b2fe30fff337c3679941f90cfb8e19de029 Mon Sep 17 00:00:00 2001
From: jofrn <[email protected]>
Date: Thu, 14 May 2026 12:55:03 -0700
Subject: [PATCH] [X86] Cast atomic vectors in IR to support floats
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend the X86 `alignedstore` PatFrag to also match `atomic_store` with vector-size alignment, so existing MOVAPS/MOVAPD/MOVDQA-family aligned-store patterns cover 128-bit aligned vector atomic stores on SSE/AVX/AVX-512 without per-type duplicates. `<4 x float>`, `<2 x double>`, `<2 x i64>`, `<4 x i32>`, `<8 x half>`, `<8 x bfloat>` all codegen to a single `movaps`/`movapd` on AVX+ via this.

Add v8f16/v8bf16 bitconvert variants to the widen-path `atomic_store_32` / `atomic_store_64` patterns so `<2 x half>`, `<2 x bfloat>`, `<4 x half>`, `<4 x bfloat>` atomic stores reaching the PR4 widen path also collapse to a single instruction on AVX+ targets.

Vectors whose `getTypeAction` is split rather than widen still rely on PR6's `SplitVecOp_ATOMIC_STORE` — that path bitcasts the vector to a scalar integer and issues an integer `atomic_store_N`, picked up by the pre-existing scalar atomic-store patterns. The two legalization paths together cover the full vector-atomic-store matrix.

Store-side counterpart to #148899.
--- .../include/llvm/Target/TargetSelectionDAG.td | 2 +- llvm/lib/Target/X86/X86InstrAVX512.td | 2 +- llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 9 +- llvm/test/CodeGen/X86/atomic-load-store.ll | 104 ++++++++++++++++-- 4 files changed, 99 insertions(+), 18 deletions(-) diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 35848f76897b3..e0e00d16961a3 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -346,7 +346,7 @@ def SDTFPAtomic2 : SDTypeProfile<1, 2, [ ]>; def SDTAtomicStore : SDTypeProfile<0, 2, [ - SDTCisInt<0>, SDTCisPtrTy<1> + SDTCisPtrTy<1> ]>; def SDTAtomicLoad : SDTypeProfile<1, 1, [ SDTCisPtrTy<1> diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 7e0054ef9e5f6..758cb05c89dd4 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -3356,7 +3356,7 @@ multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, } multiclass avx512_store<bits<8> opc, string OpcodeStr, string BaseName, - X86VectorVTInfo _, PatFrag st_frag, PatFrag mstore, + X86VectorVTInfo _, PatFrags st_frag, PatFrag mstore, X86SchedWriteMoveLS Sched, bit NoMRPattern = 0> { let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in { let isMoveReg = 1 in diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index ea06b34e52cdf..0b6caeb502879 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -1259,10 +1259,11 @@ def memstore : PatFrags<(ops node:$val, node:$ptr), [(store node:$val, node:$ptr), (atomic_store node:$val, node:$ptr)]>; -// Like 'store', but always requires vector size alignment. 
-def alignedstore : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - auto *St = cast<StoreSDNode>(N); +// Like 'store' or 'atomic_store', but always requires vector size alignment. +def alignedstore : PatFrags<(ops node:$val, node:$ptr), + [(store node:$val, node:$ptr), + (atomic_store node:$val, node:$ptr)], [{ + auto *St = cast<MemSDNode>(N); return St->getAlign() >= St->getMemoryVT().getStoreSize(); }]> { let GISelPredicateCode = [{ diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index 066842739fb61..d2c32cb141d38 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -765,8 +765,7 @@ define void @store_atomic_vec2_half(ptr %x, <2 x half> %v) { ; ; CHECK-AVX-O3-LABEL: store_atomic_vec2_half: ; CHECK-AVX-O3: # %bb.0: -; CHECK-AVX-O3-NEXT: vmovd %xmm0, %eax -; CHECK-AVX-O3-NEXT: movl %eax, (%rdi) +; CHECK-AVX-O3-NEXT: vmovss %xmm0, (%rdi) ; CHECK-AVX-O3-NEXT: retq ; ; CHECK-SSE-O0-LABEL: store_atomic_vec2_half: @@ -788,8 +787,7 @@ define void @store_atomic_vec2_half(ptr %x, <2 x half> %v) { ; ; CHECK-AVX-O0-LABEL: store_atomic_vec2_half: ; CHECK-AVX-O0: # %bb.0: -; CHECK-AVX-O0-NEXT: vmovd %xmm0, %eax -; CHECK-AVX-O0-NEXT: movl %eax, (%rdi) +; CHECK-AVX-O0-NEXT: vmovd %xmm0, (%rdi) ; CHECK-AVX-O0-NEXT: retq store atomic <2 x half> %v, ptr %x release, align 4 ret void @@ -809,8 +807,7 @@ define void @store_atomic_vec2_bfloat(ptr %x, <2 x bfloat> %v) { ; ; CHECK-AVX-O3-LABEL: store_atomic_vec2_bfloat: ; CHECK-AVX-O3: # %bb.0: -; CHECK-AVX-O3-NEXT: vmovd %xmm0, %eax -; CHECK-AVX-O3-NEXT: movl %eax, (%rdi) +; CHECK-AVX-O3-NEXT: vmovss %xmm0, (%rdi) ; CHECK-AVX-O3-NEXT: retq ; ; CHECK-SSE-O0-LABEL: store_atomic_vec2_bfloat: @@ -932,8 +929,7 @@ define void @store_atomic_vec4_half(ptr %x, <4 x half> %v) nounwind { ; ; CHECK-AVX-O3-LABEL: store_atomic_vec4_half: ; CHECK-AVX-O3: # %bb.0: -; CHECK-AVX-O3-NEXT: vmovq %xmm0, %rax -; 
CHECK-AVX-O3-NEXT: movq %rax, (%rdi) +; CHECK-AVX-O3-NEXT: vmovlps %xmm0, (%rdi) ; CHECK-AVX-O3-NEXT: retq ; ; CHECK-SSE2-O0-LABEL: store_atomic_vec4_half: @@ -1007,8 +1003,7 @@ define void @store_atomic_vec4_half(ptr %x, <4 x half> %v) nounwind { ; ; CHECK-AVX-O0-LABEL: store_atomic_vec4_half: ; CHECK-AVX-O0: # %bb.0: -; CHECK-AVX-O0-NEXT: vmovq %xmm0, %rax -; CHECK-AVX-O0-NEXT: movq %rax, (%rdi) +; CHECK-AVX-O0-NEXT: vmovq %xmm0, (%rdi) ; CHECK-AVX-O0-NEXT: retq store atomic <4 x half> %v, ptr %x release, align 8 ret void @@ -1060,8 +1055,7 @@ define void @store_atomic_vec4_bfloat(ptr %x, <4 x bfloat> %v) nounwind { ; ; CHECK-AVX-O3-LABEL: store_atomic_vec4_bfloat: ; CHECK-AVX-O3: # %bb.0: -; CHECK-AVX-O3-NEXT: vmovq %xmm0, %rax -; CHECK-AVX-O3-NEXT: movq %rax, (%rdi) +; CHECK-AVX-O3-NEXT: vmovlps %xmm0, (%rdi) ; CHECK-AVX-O3-NEXT: retq ; ; CHECK-SSE-O0-LABEL: store_atomic_vec4_bfloat: @@ -1201,6 +1195,92 @@ define void @store_atomic_vec4_bfloat(ptr %x, <4 x bfloat> %v) nounwind { ret void } +define void @store_atomic_vec4_float_align(ptr %x, <4 x float> %v) nounwind { +; CHECK-SSE2-O3-LABEL: store_atomic_vec4_float_align: +; CHECK-SSE2-O3: # %bb.0: +; CHECK-SSE2-O3-NEXT: pushq %rax +; CHECK-SSE2-O3-NEXT: movq %xmm0, %rsi +; CHECK-SSE2-O3-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-SSE2-O3-NEXT: movq %xmm0, %rdx +; CHECK-SSE2-O3-NEXT: movl $3, %ecx +; CHECK-SSE2-O3-NEXT: callq __atomic_store_16@PLT +; CHECK-SSE2-O3-NEXT: popq %rax +; CHECK-SSE2-O3-NEXT: retq +; +; CHECK-SSE4-O3-LABEL: store_atomic_vec4_float_align: +; CHECK-SSE4-O3: # %bb.0: +; CHECK-SSE4-O3-NEXT: pushq %rbx +; CHECK-SSE4-O3-NEXT: movdqa (%rdi), %xmm1 +; CHECK-SSE4-O3-NEXT: pextrq $1, %xmm0, %rcx +; CHECK-SSE4-O3-NEXT: movq %xmm0, %rbx +; CHECK-SSE4-O3-NEXT: .p2align 4 +; CHECK-SSE4-O3-NEXT: .LBB39_1: # %atomicrmw.start +; CHECK-SSE4-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-SSE4-O3-NEXT: movq %xmm1, %rax +; CHECK-SSE4-O3-NEXT: pextrq $1, %xmm1, %rdx +; CHECK-SSE4-O3-NEXT: 
lock cmpxchg16b (%rdi) +; CHECK-SSE4-O3-NEXT: movq %rdx, %xmm0 +; CHECK-SSE4-O3-NEXT: movq %rax, %xmm1 +; CHECK-SSE4-O3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; CHECK-SSE4-O3-NEXT: jne .LBB39_1 +; CHECK-SSE4-O3-NEXT: # %bb.2: # %atomicrmw.end +; CHECK-SSE4-O3-NEXT: popq %rbx +; CHECK-SSE4-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: store_atomic_vec4_float_align: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: vmovaps %xmm0, (%rdi) +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE2-O0-LABEL: store_atomic_vec4_float_align: +; CHECK-SSE2-O0: # %bb.0: +; CHECK-SSE2-O0-NEXT: pushq %rax +; CHECK-SSE2-O0-NEXT: movq %xmm0, %rsi +; CHECK-SSE2-O0-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-SSE2-O0-NEXT: movq %xmm0, %rdx +; CHECK-SSE2-O0-NEXT: movl $3, %ecx +; CHECK-SSE2-O0-NEXT: callq __atomic_store_16@PLT +; CHECK-SSE2-O0-NEXT: popq %rax +; CHECK-SSE2-O0-NEXT: retq +; +; CHECK-SSE4-O0-LABEL: store_atomic_vec4_float_align: +; CHECK-SSE4-O0: # %bb.0: +; CHECK-SSE4-O0-NEXT: pushq %rbx +; CHECK-SSE4-O0-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-SSE4-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-SSE4-O0-NEXT: movaps (%rdi), %xmm0 +; CHECK-SSE4-O0-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-SSE4-O0-NEXT: .LBB39_1: # %atomicrmw.start +; CHECK-SSE4-O0-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-SSE4-O0-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-SSE4-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; CHECK-SSE4-O0-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-SSE4-O0-NEXT: pextrq $1, %xmm1, %rcx +; CHECK-SSE4-O0-NEXT: movq %xmm1, %rbx +; CHECK-SSE4-O0-NEXT: movq %xmm0, %rax +; CHECK-SSE4-O0-NEXT: pextrq $1, %xmm0, %rdx +; CHECK-SSE4-O0-NEXT: lock cmpxchg16b (%rsi) +; CHECK-SSE4-O0-NEXT: movq %rax, %rcx +; CHECK-SSE4-O0-NEXT: sete %al +; CHECK-SSE4-O0-NEXT: movq %rdx, %xmm1 +; CHECK-SSE4-O0-NEXT: movq %rcx, %xmm0 +; 
CHECK-SSE4-O0-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-SSE4-O0-NEXT: testb $1, %al +; CHECK-SSE4-O0-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-SSE4-O0-NEXT: jne .LBB39_2 +; CHECK-SSE4-O0-NEXT: jmp .LBB39_1 +; CHECK-SSE4-O0-NEXT: .LBB39_2: # %atomicrmw.end +; CHECK-SSE4-O0-NEXT: popq %rbx +; CHECK-SSE4-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: store_atomic_vec4_float_align: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: vmovaps %xmm0, (%rdi) +; CHECK-AVX-O0-NEXT: retq + store atomic <4 x float> %v, ptr %x release, align 16 + ret void +} + define <2 x half> @atomic_vec2_half(ptr %x) { ; CHECK-SSE-O3-LABEL: atomic_vec2_half: ; CHECK-SSE-O3: # %bb.0: _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
