https://github.com/jofrn updated https://github.com/llvm/llvm-project/pull/148899
>From 7ac25a893e5f92c1fce114b60454843f3d350b31 Mon Sep 17 00:00:00 2001 From: jofrn <[email protected]> Date: Thu, 30 Oct 2025 12:19:59 -0400 Subject: [PATCH 1/2] [SelectionDAG] Split vector types for atomic load Vector types that aren't widened are split so that a single ATOMIC_LOAD is issued for the entire vector at once. This change utilizes the load vectorization infrastructure in SelectionDAG in order to group the vectors. This enables SelectionDAG to translate vectors with type bfloat and half. --- .../include/llvm/Target/TargetSelectionDAG.td | 14 + llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 + .../SelectionDAG/LegalizeVectorTypes.cpp | 35 ++ llvm/test/CodeGen/X86/atomic-load-store.ll | 352 +++++++++++++++++- 4 files changed, 398 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 573342846b4cf..70ae16f7492c3 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -2111,6 +2111,20 @@ def atomic_load_64 : let MemoryVT = i64; } +def atomic_load_128_v2i64 : + PatFrag<(ops node:$ptr), + (atomic_load node:$ptr)> { + let IsAtomic = true; + let MemoryVT = v2i64; +} + +def atomic_load_128_v4i32 : + PatFrag<(ops node:$ptr), + (atomic_load node:$ptr)> { + let IsAtomic = true; + let MemoryVT = v4i32; +} + def atomic_load_nonext_8 : PatFrag<(ops node:$ptr), (atomic_load_nonext node:$ptr)> { let IsAtomic = true; // FIXME: Should be IsLoad and/or IsAtomic? 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 89698a2c77123..b0420a9a9db73 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -941,6 +941,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { void SplitVecRes_FPOp_MultiType(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_IS_FPCLASS(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo, SDValue &Hi); void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi); void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi); void SplitVecRes_VP_LOAD_FF(VPLoadFFSDNode *LD, SDValue &Lo, SDValue &Hi); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index cbd67675aab96..bdf8ccbcf7fea 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1350,6 +1350,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { SplitVecRes_STEP_VECTOR(N, Lo, Hi); break; case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break; + case ISD::ATOMIC_LOAD: + SplitVecRes_ATOMIC_LOAD(cast<AtomicSDNode>(N), Lo, Hi); + break; case ISD::LOAD: SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi); break; @@ -2347,6 +2350,38 @@ void DAGTypeLegalizer::SplitVecRes_ScalarOp(SDNode *N, SDValue &Lo, } } +void DAGTypeLegalizer::SplitVecRes_ATOMIC_LOAD(AtomicSDNode *LD, SDValue &Lo, + SDValue &Hi) { + assert(LD->getExtensionType() == ISD::NON_EXTLOAD && + "Extended load during type legalization!"); + SDLoc dl(LD); + EVT VT = LD->getValueType(0); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); + + SDValue Ch = LD->getChain(); + SDValue Ptr = LD->getBasePtr(); + + EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), 
VT.getSizeInBits()); + EVT MemIntVT = + EVT::getIntegerVT(*DAG.getContext(), LD->getMemoryVT().getSizeInBits()); + SDValue ALD = DAG.getAtomicLoad(LD->getExtensionType(), dl, MemIntVT, IntVT, + Ch, Ptr, LD->getMemOperand()); + + EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits()); + EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits()); + SDValue ExtractLo, ExtractHi; + SplitInteger(ALD, LoIntVT, HiIntVT, ExtractLo, ExtractHi); + + Lo = DAG.getBitcast(LoVT, ExtractLo); + Hi = DAG.getBitcast(HiVT, ExtractHi); + + // Legalize the chain result - switch anything that used the old chain to + // use the new one. + ReplaceValueWith(SDValue(LD, 1), ALD.getValue(1)); +} + + void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi) { assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!"); diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index 7e15b9303887f..928dfef3143da 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-SSE-O3 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-SSE-O3 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-SSE-O3,CHECK-SSE2-O3 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-SSE-O3,CHECK-SSE4-O3 ; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-O3,CHECK-AVX-O3,CHECK-AVX2-O3 ; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -mcpu=x86-64-v4 | FileCheck %s 
--check-prefixes=CHECK,CHECK-O3,CHECK-AVX-O3,CHECK-AVX512-O3 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-SSE-O0 -; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-SSE-O0 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-SSE-O0,CHECK-SSE2-O0 +; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-SSE-O0,CHECK-SSE4-O0 ; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-AVX-O0,CHECK-AVX2-O0 ; RUN: llc < %s -mtriple=x86_64-- -verify-machineinstrs -O0 -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-O0,CHECK-AVX-O0,CHECK-AVX512-O0 @@ -295,6 +295,96 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) { ret <2 x float> %ret } +define <2 x half> @atomic_vec2_half(ptr %x) { +; CHECK-SSE-O3-LABEL: atomic_vec2_half: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: movl (%rdi), %eax +; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE-O3-NEXT: shrl $16, %eax +; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm1 +; CHECK-SSE-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-SSE-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: atomic_vec2_half: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE-O0-LABEL: atomic_vec2_half: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: movl (%rdi), %eax +; CHECK-SSE-O0-NEXT: movl %eax, %ecx +; CHECK-SSE-O0-NEXT: shrl $16, %ecx +; CHECK-SSE-O0-NEXT: movw %cx, %dx +; CHECK-SSE-O0-NEXT: # implicit-def: $ecx +; CHECK-SSE-O0-NEXT: movw %dx, %cx +; CHECK-SSE-O0-NEXT: # implicit-def: $xmm1 +; CHECK-SSE-O0-NEXT: pinsrw $0, %ecx, 
%xmm1 +; CHECK-SSE-O0-NEXT: movw %ax, %cx +; CHECK-SSE-O0-NEXT: # implicit-def: $eax +; CHECK-SSE-O0-NEXT: movw %cx, %ax +; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0 +; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-SSE-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: atomic_vec2_half: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-O0-NEXT: retq + %ret = load atomic <2 x half>, ptr %x acquire, align 4 + ret <2 x half> %ret +} +define <2 x bfloat> @atomic_vec2_bfloat(ptr %x) { +; CHECK-SSE-O3-LABEL: atomic_vec2_bfloat: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: movl (%rdi), %eax +; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE-O3-NEXT: shrl $16, %eax +; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm1 +; CHECK-SSE-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-SSE-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: atomic_vec2_bfloat: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: movl (%rdi), %eax +; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O3-NEXT: shrl $16, %eax +; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O3-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE-O0-LABEL: atomic_vec2_bfloat: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: movl (%rdi), %eax +; CHECK-SSE-O0-NEXT: movl %eax, %ecx +; CHECK-SSE-O0-NEXT: shrl $16, %ecx +; CHECK-SSE-O0-NEXT: # kill: def $cx killed $cx killed $ecx +; CHECK-SSE-O0-NEXT: movw %ax, %dx +; CHECK-SSE-O0-NEXT: # implicit-def: $eax +; CHECK-SSE-O0-NEXT: movw %dx, %ax +; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0 +; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE-O0-NEXT: # implicit-def: $eax +; CHECK-SSE-O0-NEXT: movw %cx, %ax +; CHECK-SSE-O0-NEXT: # implicit-def: $xmm1 +; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm1 +; CHECK-SSE-O0-NEXT: 
punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-SSE-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: atomic_vec2_bfloat: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: movl (%rdi), %eax +; CHECK-AVX-O0-NEXT: movw %ax, %cx +; CHECK-AVX-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O0-NEXT: shrl $16, %eax +; CHECK-AVX-O0-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-AVX-O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O0-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; CHECK-AVX-O0-NEXT: retq + %ret = load atomic <2 x bfloat>, ptr %x acquire, align 4 + ret <2 x bfloat> %ret +} define <1 x ptr> @atomic_vec1_ptr(ptr %x) nounwind { ; CHECK-O3-LABEL: atomic_vec1_ptr: ; CHECK-O3: # %bb.0: @@ -585,6 +675,260 @@ define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind { ret <4 x i16> %ret } +define <4 x half> @atomic_vec4_half(ptr %x) nounwind { +; CHECK-SSE2-O3-LABEL: atomic_vec4_half: +; CHECK-SSE2-O3: # %bb.0: +; CHECK-SSE2-O3-NEXT: movq (%rdi), %rax +; CHECK-SSE2-O3-NEXT: movl %eax, %ecx +; CHECK-SSE2-O3-NEXT: shrl $16, %ecx +; CHECK-SSE2-O3-NEXT: pinsrw $0, %ecx, %xmm1 +; CHECK-SSE2-O3-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE2-O3-NEXT: movq %rax, %rcx +; CHECK-SSE2-O3-NEXT: shrq $32, %rcx +; CHECK-SSE2-O3-NEXT: pinsrw $0, %ecx, %xmm2 +; CHECK-SSE2-O3-NEXT: shrq $48, %rax +; CHECK-SSE2-O3-NEXT: pinsrw $0, %eax, %xmm3 +; CHECK-SSE2-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; CHECK-SSE2-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-SSE2-O3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-SSE2-O3-NEXT: retq +; +; CHECK-SSE4-O3-LABEL: atomic_vec4_half: +; CHECK-SSE4-O3: # %bb.0: +; CHECK-SSE4-O3-NEXT: movq (%rdi), %rax +; CHECK-SSE4-O3-NEXT: movl %eax, %ecx +; CHECK-SSE4-O3-NEXT: shrl $16, %ecx +; CHECK-SSE4-O3-NEXT: pinsrw $0, %ecx, %xmm1 +; CHECK-SSE4-O3-NEXT: pinsrw $0, %eax, %xmm0 +; 
CHECK-SSE4-O3-NEXT: movq %rax, %rcx +; CHECK-SSE4-O3-NEXT: shrq $32, %rcx +; CHECK-SSE4-O3-NEXT: pinsrw $0, %ecx, %xmm2 +; CHECK-SSE4-O3-NEXT: shrq $48, %rax +; CHECK-SSE4-O3-NEXT: pinsrw $0, %eax, %xmm3 +; CHECK-SSE4-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; CHECK-SSE4-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-SSE4-O3-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero +; CHECK-SSE4-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: atomic_vec4_half: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: vmovq (%rdi), %xmm0 +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE2-O0-LABEL: atomic_vec4_half: +; CHECK-SSE2-O0: # %bb.0: +; CHECK-SSE2-O0-NEXT: movq (%rdi), %rax +; CHECK-SSE2-O0-NEXT: movl %eax, %ecx +; CHECK-SSE2-O0-NEXT: shrl $16, %ecx +; CHECK-SSE2-O0-NEXT: movw %cx, %dx +; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx +; CHECK-SSE2-O0-NEXT: movw %dx, %cx +; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm2 +; CHECK-SSE2-O0-NEXT: pinsrw $0, %ecx, %xmm2 +; CHECK-SSE2-O0-NEXT: movw %ax, %dx +; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx +; CHECK-SSE2-O0-NEXT: movw %dx, %cx +; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm0 +; CHECK-SSE2-O0-NEXT: pinsrw $0, %ecx, %xmm0 +; CHECK-SSE2-O0-NEXT: movq %rax, %rcx +; CHECK-SSE2-O0-NEXT: shrq $32, %rcx +; CHECK-SSE2-O0-NEXT: movw %cx, %dx +; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx +; CHECK-SSE2-O0-NEXT: movw %dx, %cx +; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm1 +; CHECK-SSE2-O0-NEXT: pinsrw $0, %ecx, %xmm1 +; CHECK-SSE2-O0-NEXT: shrq $48, %rax +; CHECK-SSE2-O0-NEXT: movw %ax, %cx +; CHECK-SSE2-O0-NEXT: # implicit-def: $eax +; CHECK-SSE2-O0-NEXT: movw %cx, %ax +; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm3 +; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm3 +; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; CHECK-SSE2-O0-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-O0-NEXT: retq +; +; CHECK-SSE4-O0-LABEL: atomic_vec4_half: +; CHECK-SSE4-O0: # %bb.0: +; CHECK-SSE4-O0-NEXT: movq (%rdi), %rax +; CHECK-SSE4-O0-NEXT: movl %eax, %ecx +; CHECK-SSE4-O0-NEXT: shrl $16, %ecx +; CHECK-SSE4-O0-NEXT: movw %cx, %dx +; CHECK-SSE4-O0-NEXT: # implicit-def: $ecx +; CHECK-SSE4-O0-NEXT: movw %dx, %cx +; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm2 +; CHECK-SSE4-O0-NEXT: pinsrw $0, %ecx, %xmm2 +; CHECK-SSE4-O0-NEXT: movw %ax, %dx +; CHECK-SSE4-O0-NEXT: # implicit-def: $ecx +; CHECK-SSE4-O0-NEXT: movw %dx, %cx +; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm0 +; CHECK-SSE4-O0-NEXT: pinsrw $0, %ecx, %xmm0 +; CHECK-SSE4-O0-NEXT: movq %rax, %rcx +; CHECK-SSE4-O0-NEXT: shrq $32, %rcx +; CHECK-SSE4-O0-NEXT: movw %cx, %dx +; CHECK-SSE4-O0-NEXT: # implicit-def: $ecx +; CHECK-SSE4-O0-NEXT: movw %dx, %cx +; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm1 +; CHECK-SSE4-O0-NEXT: pinsrw $0, %ecx, %xmm1 +; CHECK-SSE4-O0-NEXT: shrq $48, %rax +; CHECK-SSE4-O0-NEXT: movw %ax, %cx +; CHECK-SSE4-O0-NEXT: # implicit-def: $eax +; CHECK-SSE4-O0-NEXT: movw %cx, %ax +; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm3 +; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm3 +; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; CHECK-SSE4-O0-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; CHECK-SSE4-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: atomic_vec4_half: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: vmovq (%rdi), %xmm0 +; CHECK-AVX-O0-NEXT: retq + %ret = load atomic <4 x half>, ptr %x acquire, align 8 + ret <4 x half> %ret +} +define <4 x bfloat> @atomic_vec4_bfloat(ptr %x) nounwind { +; CHECK-SSE2-O3-LABEL: atomic_vec4_bfloat: +; CHECK-SSE2-O3: # %bb.0: +; 
CHECK-SSE2-O3-NEXT: movq (%rdi), %rax +; CHECK-SSE2-O3-NEXT: movq %rax, %rcx +; CHECK-SSE2-O3-NEXT: movq %rax, %rdx +; CHECK-SSE2-O3-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE2-O3-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-SSE2-O3-NEXT: shrl $16, %eax +; CHECK-SSE2-O3-NEXT: shrq $32, %rcx +; CHECK-SSE2-O3-NEXT: shrq $48, %rdx +; CHECK-SSE2-O3-NEXT: pinsrw $0, %edx, %xmm1 +; CHECK-SSE2-O3-NEXT: pinsrw $0, %ecx, %xmm2 +; CHECK-SSE2-O3-NEXT: pinsrw $0, %eax, %xmm3 +; CHECK-SSE2-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; CHECK-SSE2-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; CHECK-SSE2-O3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-SSE2-O3-NEXT: retq +; +; CHECK-SSE4-O3-LABEL: atomic_vec4_bfloat: +; CHECK-SSE4-O3: # %bb.0: +; CHECK-SSE4-O3-NEXT: movq (%rdi), %rax +; CHECK-SSE4-O3-NEXT: movq %rax, %rcx +; CHECK-SSE4-O3-NEXT: movq %rax, %rdx +; CHECK-SSE4-O3-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE4-O3-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-SSE4-O3-NEXT: shrl $16, %eax +; CHECK-SSE4-O3-NEXT: shrq $32, %rcx +; CHECK-SSE4-O3-NEXT: shrq $48, %rdx +; CHECK-SSE4-O3-NEXT: pinsrw $0, %edx, %xmm1 +; CHECK-SSE4-O3-NEXT: pinsrw $0, %ecx, %xmm2 +; CHECK-SSE4-O3-NEXT: pinsrw $0, %eax, %xmm3 +; CHECK-SSE4-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; CHECK-SSE4-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; CHECK-SSE4-O3-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero +; CHECK-SSE4-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: atomic_vec4_bfloat: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: movq (%rdi), %rax +; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O3-NEXT: movq %rax, %rcx +; CHECK-AVX-O3-NEXT: shrq $48, %rcx +; CHECK-AVX-O3-NEXT: movw %cx, -{{[0-9]+}}(%rsp) +; 
CHECK-AVX-O3-NEXT: movq %rax, %rcx +; CHECK-AVX-O3-NEXT: shrq $32, %rcx +; CHECK-AVX-O3-NEXT: movw %cx, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O3-NEXT: shrl $16, %eax +; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O3-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE2-O0-LABEL: atomic_vec4_bfloat: +; CHECK-SSE2-O0: # %bb.0: +; CHECK-SSE2-O0-NEXT: movq (%rdi), %rax +; CHECK-SSE2-O0-NEXT: movl %eax, %ecx +; CHECK-SSE2-O0-NEXT: shrl $16, %ecx +; CHECK-SSE2-O0-NEXT: # kill: def $cx killed $cx killed $ecx +; CHECK-SSE2-O0-NEXT: movw %ax, %dx +; CHECK-SSE2-O0-NEXT: movq %rax, %rsi +; CHECK-SSE2-O0-NEXT: shrq $32, %rsi +; CHECK-SSE2-O0-NEXT: # kill: def $si killed $si killed $rsi +; CHECK-SSE2-O0-NEXT: shrq $48, %rax +; CHECK-SSE2-O0-NEXT: movw %ax, %di +; CHECK-SSE2-O0-NEXT: # implicit-def: $eax +; CHECK-SSE2-O0-NEXT: movw %di, %ax +; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm0 +; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE2-O0-NEXT: # implicit-def: $eax +; CHECK-SSE2-O0-NEXT: movw %si, %ax +; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm1 +; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm1 +; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-SSE2-O0-NEXT: # implicit-def: $eax +; CHECK-SSE2-O0-NEXT: movw %dx, %ax +; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm0 +; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE2-O0-NEXT: # implicit-def: $eax +; CHECK-SSE2-O0-NEXT: movw %cx, %ax +; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm2 +; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm2 +; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; CHECK-SSE2-O0-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-SSE2-O0-NEXT: retq +; +; CHECK-SSE4-O0-LABEL: atomic_vec4_bfloat: +; CHECK-SSE4-O0: # %bb.0: +; CHECK-SSE4-O0-NEXT: movq (%rdi), %rax +; CHECK-SSE4-O0-NEXT: movl %eax, %ecx +; CHECK-SSE4-O0-NEXT: shrl 
$16, %ecx +; CHECK-SSE4-O0-NEXT: # kill: def $cx killed $cx killed $ecx +; CHECK-SSE4-O0-NEXT: movw %ax, %dx +; CHECK-SSE4-O0-NEXT: movq %rax, %rsi +; CHECK-SSE4-O0-NEXT: shrq $32, %rsi +; CHECK-SSE4-O0-NEXT: # kill: def $si killed $si killed $rsi +; CHECK-SSE4-O0-NEXT: shrq $48, %rax +; CHECK-SSE4-O0-NEXT: movw %ax, %di +; CHECK-SSE4-O0-NEXT: # implicit-def: $eax +; CHECK-SSE4-O0-NEXT: movw %di, %ax +; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm0 +; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE4-O0-NEXT: # implicit-def: $eax +; CHECK-SSE4-O0-NEXT: movw %si, %ax +; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm1 +; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm1 +; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; CHECK-SSE4-O0-NEXT: # implicit-def: $eax +; CHECK-SSE4-O0-NEXT: movw %dx, %ax +; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm0 +; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE4-O0-NEXT: # implicit-def: $eax +; CHECK-SSE4-O0-NEXT: movw %cx, %ax +; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm2 +; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm2 +; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; CHECK-SSE4-O0-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero +; CHECK-SSE4-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: atomic_vec4_bfloat: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: movq (%rdi), %rax +; CHECK-AVX-O0-NEXT: movq %rax, %rcx +; CHECK-AVX-O0-NEXT: shrq $48, %rcx +; CHECK-AVX-O0-NEXT: # kill: def $cx killed $cx killed $rcx +; CHECK-AVX-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O0-NEXT: movq %rax, %rcx +; CHECK-AVX-O0-NEXT: shrq $32, %rcx +; CHECK-AVX-O0-NEXT: # kill: def $cx killed $cx killed $rcx +; CHECK-AVX-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O0-NEXT: movw %ax, %cx +; CHECK-AVX-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O0-NEXT: # kill: def $eax killed $eax killed $rax +; CHECK-AVX-O0-NEXT: shrl 
$16, %eax +; CHECK-AVX-O0-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-AVX-O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; CHECK-AVX-O0-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; CHECK-AVX-O0-NEXT: retq + %ret = load atomic <4 x bfloat>, ptr %x acquire, align 8 + ret <4 x bfloat> %ret +} + define <4 x float> @atomic_vec4_float(ptr %x) nounwind { ; CHECK-SSE-O3-LABEL: atomic_vec4_float: ; CHECK-SSE-O3: # %bb.0: >From 449e57c7083d558ded63d6b920a6d637c2a2ba56 Mon Sep 17 00:00:00 2001 From: jofrn <[email protected]> Date: Tue, 15 Jul 2025 13:02:04 -0400 Subject: [PATCH 2/2] [X86] Cast atomic vectors in IR to support floats This commit casts floats to ints in an atomic load during AtomicExpand to support floating point types. It also is required to support 128 bit vectors in SSE/AVX. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 7 + llvm/lib/Target/X86/X86ISelLowering.h | 2 + llvm/lib/Target/X86/X86InstrCompiler.td | 15 + llvm/test/CodeGen/X86/atomic-load-store.ll | 385 ++++++--------------- 4 files changed, 122 insertions(+), 287 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5a1171b2b4ee6..dd275dd16a92a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -32956,6 +32956,13 @@ X86TargetLowering::shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const { } } +TargetLowering::AtomicExpansionKind +X86TargetLowering::shouldCastAtomicLoadInIR(LoadInst *LI) const { + if (LI->getType()->getScalarType()->isFloatingPointTy()) + return AtomicExpansionKind::CastToInteger; + return AtomicExpansionKind::None; +} + LoadInst * X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { unsigned NativeWidth = Subtarget.is64Bit() ? 
64 : 32; diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 515f28122a00a..98e8dbb32f977 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -889,6 +889,8 @@ namespace llvm { shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandLogicAtomicRMWInIR(const AtomicRMWInst *AI) const; + TargetLoweringBase::AtomicExpansionKind + shouldCastAtomicLoadInIR(LoadInst *LI) const override; void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override; void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override; diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index ebbfa48d2660c..2ee2d99efc7f4 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1227,6 +1227,21 @@ def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src)))), def : Pat<(v2i64 (scalar_to_vector (i64 (atomic_load_64 addr:$src)))), (VMOV64toPQIZrm addr:$src)>, Requires<[HasAVX512]>; +// load atomic <2 x i64> +def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)), + (MOVAPDrm addr:$src)>, Requires<[UseSSE2]>; +def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)), + (VMOVAPDrm addr:$src)>, Requires<[UseAVX]>; +def : Pat<(v2i64 (atomic_load_128_v2i64 addr:$src)), + (VMOVAPDZ128rm addr:$src)>, Requires<[HasAVX512]>; +// load atomic <4 x i32> +def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)), + (MOVAPDrm addr:$src)>, Requires<[UseSSE2]>; +def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)), + (VMOVAPDrm addr:$src)>, Requires<[UseAVX]>; +def : Pat<(v4i32 (atomic_load_128_v4i32 addr:$src)), + (VMOVAPDZ128rm addr:$src)>, Requires<[HasAVX512]>; + // Floating point loads/stores. 
def : Pat<(atomic_store_32 (i32 (bitconvert (f32 FR32:$src))), addr:$dst), (MOVSSmr addr:$dst, FR32:$src)>, Requires<[UseSSE1]>; diff --git a/llvm/test/CodeGen/X86/atomic-load-store.ll b/llvm/test/CodeGen/X86/atomic-load-store.ll index 928dfef3143da..00310f6d1f219 100644 --- a/llvm/test/CodeGen/X86/atomic-load-store.ll +++ b/llvm/test/CodeGen/X86/atomic-load-store.ll @@ -119,13 +119,13 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) { ; CHECK-SSE-O3-LABEL: atomic_vec1_bfloat: ; CHECK-SSE-O3: # %bb.0: ; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax -; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE-O3-NEXT: movd %eax, %xmm0 ; CHECK-SSE-O3-NEXT: retq ; ; CHECK-AVX-O3-LABEL: atomic_vec1_bfloat: ; CHECK-AVX-O3: # %bb.0: ; CHECK-AVX-O3-NEXT: movzwl (%rdi), %eax -; CHECK-AVX-O3-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; CHECK-AVX-O3-NEXT: vmovd %eax, %xmm0 ; CHECK-AVX-O3-NEXT: retq ; ; CHECK-SSE-O0-LABEL: atomic_vec1_bfloat: @@ -133,8 +133,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) { ; CHECK-SSE-O0-NEXT: movw (%rdi), %cx ; CHECK-SSE-O0-NEXT: # implicit-def: $eax ; CHECK-SSE-O0-NEXT: movw %cx, %ax -; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0 -; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE-O0-NEXT: movd %eax, %xmm0 ; CHECK-SSE-O0-NEXT: retq ; ; CHECK-AVX-O0-LABEL: atomic_vec1_bfloat: @@ -142,8 +141,7 @@ define <1 x bfloat> @atomic_vec1_bfloat(ptr %x) { ; CHECK-AVX-O0-NEXT: movw (%rdi), %cx ; CHECK-AVX-O0-NEXT: # implicit-def: $eax ; CHECK-AVX-O0-NEXT: movw %cx, %ax -; CHECK-AVX-O0-NEXT: # implicit-def: $xmm0 -; CHECK-AVX-O0-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0 ; CHECK-AVX-O0-NEXT: retq %ret = load atomic <1 x bfloat>, ptr %x acquire, align 2 ret <1 x bfloat> %ret @@ -298,11 +296,7 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) { define <2 x half> @atomic_vec2_half(ptr %x) { ; CHECK-SSE-O3-LABEL: atomic_vec2_half: ; CHECK-SSE-O3: # %bb.0: -; CHECK-SSE-O3-NEXT: movl (%rdi), %eax -; CHECK-SSE-O3-NEXT: pinsrw 
$0, %eax, %xmm0 -; CHECK-SSE-O3-NEXT: shrl $16, %eax -; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm1 -; CHECK-SSE-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-SSE-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-O3-NEXT: retq ; ; CHECK-AVX-O3-LABEL: atomic_vec2_half: @@ -312,20 +306,7 @@ define <2 x half> @atomic_vec2_half(ptr %x) { ; ; CHECK-SSE-O0-LABEL: atomic_vec2_half: ; CHECK-SSE-O0: # %bb.0: -; CHECK-SSE-O0-NEXT: movl (%rdi), %eax -; CHECK-SSE-O0-NEXT: movl %eax, %ecx -; CHECK-SSE-O0-NEXT: shrl $16, %ecx -; CHECK-SSE-O0-NEXT: movw %cx, %dx -; CHECK-SSE-O0-NEXT: # implicit-def: $ecx -; CHECK-SSE-O0-NEXT: movw %dx, %cx -; CHECK-SSE-O0-NEXT: # implicit-def: $xmm1 -; CHECK-SSE-O0-NEXT: pinsrw $0, %ecx, %xmm1 -; CHECK-SSE-O0-NEXT: movw %ax, %cx -; CHECK-SSE-O0-NEXT: # implicit-def: $eax -; CHECK-SSE-O0-NEXT: movw %cx, %ax -; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0 -; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-SSE-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-SSE-O0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-O0-NEXT: retq ; ; CHECK-AVX-O0-LABEL: atomic_vec2_half: @@ -338,49 +319,22 @@ define <2 x half> @atomic_vec2_half(ptr %x) { define <2 x bfloat> @atomic_vec2_bfloat(ptr %x) { ; CHECK-SSE-O3-LABEL: atomic_vec2_bfloat: ; CHECK-SSE-O3: # %bb.0: -; CHECK-SSE-O3-NEXT: movl (%rdi), %eax -; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-SSE-O3-NEXT: shrl $16, %eax -; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm1 -; CHECK-SSE-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-SSE-O3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-O3-NEXT: retq ; ; CHECK-AVX-O3-LABEL: atomic_vec2_bfloat: ; CHECK-AVX-O3: # %bb.0: -; CHECK-AVX-O3-NEXT: movl (%rdi), %eax -; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O3-NEXT: shrl $16, %eax 
-; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O3-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; CHECK-AVX-O3-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-AVX-O3-NEXT: retq ; ; CHECK-SSE-O0-LABEL: atomic_vec2_bfloat: ; CHECK-SSE-O0: # %bb.0: -; CHECK-SSE-O0-NEXT: movl (%rdi), %eax -; CHECK-SSE-O0-NEXT: movl %eax, %ecx -; CHECK-SSE-O0-NEXT: shrl $16, %ecx -; CHECK-SSE-O0-NEXT: # kill: def $cx killed $cx killed $ecx -; CHECK-SSE-O0-NEXT: movw %ax, %dx -; CHECK-SSE-O0-NEXT: # implicit-def: $eax -; CHECK-SSE-O0-NEXT: movw %dx, %ax -; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0 -; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-SSE-O0-NEXT: # implicit-def: $eax -; CHECK-SSE-O0-NEXT: movw %cx, %ax -; CHECK-SSE-O0-NEXT: # implicit-def: $xmm1 -; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm1 -; CHECK-SSE-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-SSE-O0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-SSE-O0-NEXT: retq ; ; CHECK-AVX-O0-LABEL: atomic_vec2_bfloat: ; CHECK-AVX-O0: # %bb.0: -; CHECK-AVX-O0-NEXT: movl (%rdi), %eax -; CHECK-AVX-O0-NEXT: movw %ax, %cx -; CHECK-AVX-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O0-NEXT: shrl $16, %eax -; CHECK-AVX-O0-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-AVX-O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O0-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; CHECK-AVX-O0-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-AVX-O0-NEXT: retq %ret = load atomic <2 x bfloat>, ptr %x acquire, align 4 ret <2 x bfloat> %ret @@ -417,13 +371,13 @@ define <1 x half> @atomic_vec1_half(ptr %x) { ; CHECK-SSE-O3-LABEL: atomic_vec1_half: ; CHECK-SSE-O3: # %bb.0: ; CHECK-SSE-O3-NEXT: movzwl (%rdi), %eax -; CHECK-SSE-O3-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE-O3-NEXT: movd %eax, %xmm0 ; CHECK-SSE-O3-NEXT: retq ; ; CHECK-AVX-O3-LABEL: atomic_vec1_half: ; CHECK-AVX-O3: # %bb.0: ; CHECK-AVX-O3-NEXT: movzwl (%rdi), %eax -; 
CHECK-AVX-O3-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; CHECK-AVX-O3-NEXT: vmovd %eax, %xmm0 ; CHECK-AVX-O3-NEXT: retq ; ; CHECK-SSE-O0-LABEL: atomic_vec1_half: @@ -431,8 +385,7 @@ define <1 x half> @atomic_vec1_half(ptr %x) { ; CHECK-SSE-O0-NEXT: movw (%rdi), %cx ; CHECK-SSE-O0-NEXT: # implicit-def: $eax ; CHECK-SSE-O0-NEXT: movw %cx, %ax -; CHECK-SSE-O0-NEXT: # implicit-def: $xmm0 -; CHECK-SSE-O0-NEXT: pinsrw $0, %eax, %xmm0 +; CHECK-SSE-O0-NEXT: movd %eax, %xmm0 ; CHECK-SSE-O0-NEXT: retq ; ; CHECK-AVX-O0-LABEL: atomic_vec1_half: @@ -440,8 +393,7 @@ define <1 x half> @atomic_vec1_half(ptr %x) { ; CHECK-AVX-O0-NEXT: movw (%rdi), %cx ; CHECK-AVX-O0-NEXT: # implicit-def: $eax ; CHECK-AVX-O0-NEXT: movw %cx, %ax -; CHECK-AVX-O0-NEXT: # implicit-def: $xmm0 -; CHECK-AVX-O0-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; CHECK-AVX-O0-NEXT: vmovd %eax, %xmm0 ; CHECK-AVX-O0-NEXT: retq %ret = load atomic <1 x half>, ptr %x acquire, align 2 ret <1 x half> %ret @@ -676,110 +628,20 @@ define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind { } define <4 x half> @atomic_vec4_half(ptr %x) nounwind { -; CHECK-SSE2-O3-LABEL: atomic_vec4_half: -; CHECK-SSE2-O3: # %bb.0: -; CHECK-SSE2-O3-NEXT: movq (%rdi), %rax -; CHECK-SSE2-O3-NEXT: movl %eax, %ecx -; CHECK-SSE2-O3-NEXT: shrl $16, %ecx -; CHECK-SSE2-O3-NEXT: pinsrw $0, %ecx, %xmm1 -; CHECK-SSE2-O3-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-SSE2-O3-NEXT: movq %rax, %rcx -; CHECK-SSE2-O3-NEXT: shrq $32, %rcx -; CHECK-SSE2-O3-NEXT: pinsrw $0, %ecx, %xmm2 -; CHECK-SSE2-O3-NEXT: shrq $48, %rax -; CHECK-SSE2-O3-NEXT: pinsrw $0, %eax, %xmm3 -; CHECK-SSE2-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; CHECK-SSE2-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; CHECK-SSE2-O3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-O3-NEXT: retq -; -; CHECK-SSE4-O3-LABEL: atomic_vec4_half: -; CHECK-SSE4-O3: # %bb.0: -; 
CHECK-SSE4-O3-NEXT: movq (%rdi), %rax -; CHECK-SSE4-O3-NEXT: movl %eax, %ecx -; CHECK-SSE4-O3-NEXT: shrl $16, %ecx -; CHECK-SSE4-O3-NEXT: pinsrw $0, %ecx, %xmm1 -; CHECK-SSE4-O3-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-SSE4-O3-NEXT: movq %rax, %rcx -; CHECK-SSE4-O3-NEXT: shrq $32, %rcx -; CHECK-SSE4-O3-NEXT: pinsrw $0, %ecx, %xmm2 -; CHECK-SSE4-O3-NEXT: shrq $48, %rax -; CHECK-SSE4-O3-NEXT: pinsrw $0, %eax, %xmm3 -; CHECK-SSE4-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; CHECK-SSE4-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; CHECK-SSE4-O3-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero -; CHECK-SSE4-O3-NEXT: retq +; CHECK-SSE-O3-LABEL: atomic_vec4_half: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: movq (%rdi), %xmm0 +; CHECK-SSE-O3-NEXT: retq ; ; CHECK-AVX-O3-LABEL: atomic_vec4_half: ; CHECK-AVX-O3: # %bb.0: ; CHECK-AVX-O3-NEXT: vmovq (%rdi), %xmm0 ; CHECK-AVX-O3-NEXT: retq ; -; CHECK-SSE2-O0-LABEL: atomic_vec4_half: -; CHECK-SSE2-O0: # %bb.0: -; CHECK-SSE2-O0-NEXT: movq (%rdi), %rax -; CHECK-SSE2-O0-NEXT: movl %eax, %ecx -; CHECK-SSE2-O0-NEXT: shrl $16, %ecx -; CHECK-SSE2-O0-NEXT: movw %cx, %dx -; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx -; CHECK-SSE2-O0-NEXT: movw %dx, %cx -; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm2 -; CHECK-SSE2-O0-NEXT: pinsrw $0, %ecx, %xmm2 -; CHECK-SSE2-O0-NEXT: movw %ax, %dx -; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx -; CHECK-SSE2-O0-NEXT: movw %dx, %cx -; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm0 -; CHECK-SSE2-O0-NEXT: pinsrw $0, %ecx, %xmm0 -; CHECK-SSE2-O0-NEXT: movq %rax, %rcx -; CHECK-SSE2-O0-NEXT: shrq $32, %rcx -; CHECK-SSE2-O0-NEXT: movw %cx, %dx -; CHECK-SSE2-O0-NEXT: # implicit-def: $ecx -; CHECK-SSE2-O0-NEXT: movw %dx, %cx -; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm1 -; CHECK-SSE2-O0-NEXT: pinsrw $0, %ecx, %xmm1 -; CHECK-SSE2-O0-NEXT: shrq $48, %rax -; CHECK-SSE2-O0-NEXT: movw %ax, %cx -; 
CHECK-SSE2-O0-NEXT: # implicit-def: $eax -; CHECK-SSE2-O0-NEXT: movw %cx, %ax -; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm3 -; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm3 -; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; CHECK-SSE2-O0-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-O0-NEXT: retq -; -; CHECK-SSE4-O0-LABEL: atomic_vec4_half: -; CHECK-SSE4-O0: # %bb.0: -; CHECK-SSE4-O0-NEXT: movq (%rdi), %rax -; CHECK-SSE4-O0-NEXT: movl %eax, %ecx -; CHECK-SSE4-O0-NEXT: shrl $16, %ecx -; CHECK-SSE4-O0-NEXT: movw %cx, %dx -; CHECK-SSE4-O0-NEXT: # implicit-def: $ecx -; CHECK-SSE4-O0-NEXT: movw %dx, %cx -; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm2 -; CHECK-SSE4-O0-NEXT: pinsrw $0, %ecx, %xmm2 -; CHECK-SSE4-O0-NEXT: movw %ax, %dx -; CHECK-SSE4-O0-NEXT: # implicit-def: $ecx -; CHECK-SSE4-O0-NEXT: movw %dx, %cx -; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm0 -; CHECK-SSE4-O0-NEXT: pinsrw $0, %ecx, %xmm0 -; CHECK-SSE4-O0-NEXT: movq %rax, %rcx -; CHECK-SSE4-O0-NEXT: shrq $32, %rcx -; CHECK-SSE4-O0-NEXT: movw %cx, %dx -; CHECK-SSE4-O0-NEXT: # implicit-def: $ecx -; CHECK-SSE4-O0-NEXT: movw %dx, %cx -; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm1 -; CHECK-SSE4-O0-NEXT: pinsrw $0, %ecx, %xmm1 -; CHECK-SSE4-O0-NEXT: shrq $48, %rax -; CHECK-SSE4-O0-NEXT: movw %ax, %cx -; CHECK-SSE4-O0-NEXT: # implicit-def: $eax -; CHECK-SSE4-O0-NEXT: movw %cx, %ax -; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm3 -; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm3 -; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; CHECK-SSE4-O0-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; CHECK-SSE4-O0-NEXT: retq +; CHECK-SSE-O0-LABEL: 
atomic_vec4_half: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: movq (%rdi), %xmm0 +; CHECK-SSE-O0-NEXT: retq ; ; CHECK-AVX-O0-LABEL: atomic_vec4_half: ; CHECK-AVX-O0: # %bb.0: @@ -789,141 +651,24 @@ define <4 x half> @atomic_vec4_half(ptr %x) nounwind { ret <4 x half> %ret } define <4 x bfloat> @atomic_vec4_bfloat(ptr %x) nounwind { -; CHECK-SSE2-O3-LABEL: atomic_vec4_bfloat: -; CHECK-SSE2-O3: # %bb.0: -; CHECK-SSE2-O3-NEXT: movq (%rdi), %rax -; CHECK-SSE2-O3-NEXT: movq %rax, %rcx -; CHECK-SSE2-O3-NEXT: movq %rax, %rdx -; CHECK-SSE2-O3-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-SSE2-O3-NEXT: # kill: def $eax killed $eax killed $rax -; CHECK-SSE2-O3-NEXT: shrl $16, %eax -; CHECK-SSE2-O3-NEXT: shrq $32, %rcx -; CHECK-SSE2-O3-NEXT: shrq $48, %rdx -; CHECK-SSE2-O3-NEXT: pinsrw $0, %edx, %xmm1 -; CHECK-SSE2-O3-NEXT: pinsrw $0, %ecx, %xmm2 -; CHECK-SSE2-O3-NEXT: pinsrw $0, %eax, %xmm3 -; CHECK-SSE2-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; CHECK-SSE2-O3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; CHECK-SSE2-O3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-SSE2-O3-NEXT: retq -; -; CHECK-SSE4-O3-LABEL: atomic_vec4_bfloat: -; CHECK-SSE4-O3: # %bb.0: -; CHECK-SSE4-O3-NEXT: movq (%rdi), %rax -; CHECK-SSE4-O3-NEXT: movq %rax, %rcx -; CHECK-SSE4-O3-NEXT: movq %rax, %rdx -; CHECK-SSE4-O3-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-SSE4-O3-NEXT: # kill: def $eax killed $eax killed $rax -; CHECK-SSE4-O3-NEXT: shrl $16, %eax -; CHECK-SSE4-O3-NEXT: shrq $32, %rcx -; CHECK-SSE4-O3-NEXT: shrq $48, %rdx -; CHECK-SSE4-O3-NEXT: pinsrw $0, %edx, %xmm1 -; CHECK-SSE4-O3-NEXT: pinsrw $0, %ecx, %xmm2 -; CHECK-SSE4-O3-NEXT: pinsrw $0, %eax, %xmm3 -; CHECK-SSE4-O3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; CHECK-SSE4-O3-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; CHECK-SSE4-O3-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],zero,zero -; CHECK-SSE4-O3-NEXT: retq +; CHECK-SSE-O3-LABEL: atomic_vec4_bfloat: +; CHECK-SSE-O3: # %bb.0: +; CHECK-SSE-O3-NEXT: movq (%rdi), %xmm0 +; CHECK-SSE-O3-NEXT: retq ; ; CHECK-AVX-O3-LABEL: atomic_vec4_bfloat: ; CHECK-AVX-O3: # %bb.0: -; CHECK-AVX-O3-NEXT: movq (%rdi), %rax -; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O3-NEXT: movq %rax, %rcx -; CHECK-AVX-O3-NEXT: shrq $48, %rcx -; CHECK-AVX-O3-NEXT: movw %cx, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O3-NEXT: movq %rax, %rcx -; CHECK-AVX-O3-NEXT: shrq $32, %rcx -; CHECK-AVX-O3-NEXT: movw %cx, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O3-NEXT: shrl $16, %eax -; CHECK-AVX-O3-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O3-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; CHECK-AVX-O3-NEXT: vmovq (%rdi), %xmm0 ; CHECK-AVX-O3-NEXT: retq ; -; CHECK-SSE2-O0-LABEL: atomic_vec4_bfloat: -; CHECK-SSE2-O0: # %bb.0: -; CHECK-SSE2-O0-NEXT: movq (%rdi), %rax -; CHECK-SSE2-O0-NEXT: movl %eax, %ecx -; CHECK-SSE2-O0-NEXT: shrl $16, %ecx -; CHECK-SSE2-O0-NEXT: # kill: def $cx killed $cx killed $ecx -; CHECK-SSE2-O0-NEXT: movw %ax, %dx -; CHECK-SSE2-O0-NEXT: movq %rax, %rsi -; CHECK-SSE2-O0-NEXT: shrq $32, %rsi -; CHECK-SSE2-O0-NEXT: # kill: def $si killed $si killed $rsi -; CHECK-SSE2-O0-NEXT: shrq $48, %rax -; CHECK-SSE2-O0-NEXT: movw %ax, %di -; CHECK-SSE2-O0-NEXT: # implicit-def: $eax -; CHECK-SSE2-O0-NEXT: movw %di, %ax -; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm0 -; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-SSE2-O0-NEXT: # implicit-def: $eax -; CHECK-SSE2-O0-NEXT: movw %si, %ax -; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm1 -; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm1 -; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-SSE2-O0-NEXT: # implicit-def: $eax -; CHECK-SSE2-O0-NEXT: movw %dx, %ax -; CHECK-SSE2-O0-NEXT: # 
implicit-def: $xmm0 -; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-SSE2-O0-NEXT: # implicit-def: $eax -; CHECK-SSE2-O0-NEXT: movw %cx, %ax -; CHECK-SSE2-O0-NEXT: # implicit-def: $xmm2 -; CHECK-SSE2-O0-NEXT: pinsrw $0, %eax, %xmm2 -; CHECK-SSE2-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; CHECK-SSE2-O0-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-SSE2-O0-NEXT: retq -; -; CHECK-SSE4-O0-LABEL: atomic_vec4_bfloat: -; CHECK-SSE4-O0: # %bb.0: -; CHECK-SSE4-O0-NEXT: movq (%rdi), %rax -; CHECK-SSE4-O0-NEXT: movl %eax, %ecx -; CHECK-SSE4-O0-NEXT: shrl $16, %ecx -; CHECK-SSE4-O0-NEXT: # kill: def $cx killed $cx killed $ecx -; CHECK-SSE4-O0-NEXT: movw %ax, %dx -; CHECK-SSE4-O0-NEXT: movq %rax, %rsi -; CHECK-SSE4-O0-NEXT: shrq $32, %rsi -; CHECK-SSE4-O0-NEXT: # kill: def $si killed $si killed $rsi -; CHECK-SSE4-O0-NEXT: shrq $48, %rax -; CHECK-SSE4-O0-NEXT: movw %ax, %di -; CHECK-SSE4-O0-NEXT: # implicit-def: $eax -; CHECK-SSE4-O0-NEXT: movw %di, %ax -; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm0 -; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-SSE4-O0-NEXT: # implicit-def: $eax -; CHECK-SSE4-O0-NEXT: movw %si, %ax -; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm1 -; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm1 -; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-SSE4-O0-NEXT: # implicit-def: $eax -; CHECK-SSE4-O0-NEXT: movw %dx, %ax -; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm0 -; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm0 -; CHECK-SSE4-O0-NEXT: # implicit-def: $eax -; CHECK-SSE4-O0-NEXT: movw %cx, %ax -; CHECK-SSE4-O0-NEXT: # implicit-def: $xmm2 -; CHECK-SSE4-O0-NEXT: pinsrw $0, %eax, %xmm2 -; CHECK-SSE4-O0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; CHECK-SSE4-O0-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; CHECK-SSE4-O0-NEXT: retq +; 
CHECK-SSE-O0-LABEL: atomic_vec4_bfloat: +; CHECK-SSE-O0: # %bb.0: +; CHECK-SSE-O0-NEXT: movq (%rdi), %xmm0 +; CHECK-SSE-O0-NEXT: retq ; ; CHECK-AVX-O0-LABEL: atomic_vec4_bfloat: ; CHECK-AVX-O0: # %bb.0: -; CHECK-AVX-O0-NEXT: movq (%rdi), %rax -; CHECK-AVX-O0-NEXT: movq %rax, %rcx -; CHECK-AVX-O0-NEXT: shrq $48, %rcx -; CHECK-AVX-O0-NEXT: # kill: def $cx killed $cx killed $rcx -; CHECK-AVX-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O0-NEXT: movq %rax, %rcx -; CHECK-AVX-O0-NEXT: shrq $32, %rcx -; CHECK-AVX-O0-NEXT: # kill: def $cx killed $cx killed $rcx -; CHECK-AVX-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O0-NEXT: movw %ax, %cx -; CHECK-AVX-O0-NEXT: movw %cx, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O0-NEXT: # kill: def $eax killed $eax killed $rax -; CHECK-AVX-O0-NEXT: shrl $16, %eax -; CHECK-AVX-O0-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-AVX-O0-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; CHECK-AVX-O0-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 +; CHECK-AVX-O0-NEXT: vmovq (%rdi), %xmm0 ; CHECK-AVX-O0-NEXT: retq %ret = load atomic <4 x bfloat>, ptr %x acquire, align 8 ret <4 x bfloat> %ret @@ -981,6 +726,72 @@ define <4 x float> @atomic_vec4_float(ptr %x) nounwind { ret <4 x float> %ret } +define <4 x float> @atomic_vec4_float_align(ptr %x) nounwind { +; +; CHECK-SSE2-O3-LABEL: atomic_vec4_float_align: +; CHECK-SSE2-O3: # %bb.0: +; CHECK-SSE2-O3-NEXT: pushq %rax +; CHECK-SSE2-O3-NEXT: movl $2, %esi +; CHECK-SSE2-O3-NEXT: callq __atomic_load_16@PLT +; CHECK-SSE2-O3-NEXT: movq %rdx, %xmm1 +; CHECK-SSE2-O3-NEXT: movq %rax, %xmm0 +; CHECK-SSE2-O3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-SSE2-O3-NEXT: popq %rax +; CHECK-SSE2-O3-NEXT: retq +; +; CHECK-SSE4-O3-LABEL: atomic_vec4_float_align: +; CHECK-SSE4-O3: # %bb.0: +; CHECK-SSE4-O3-NEXT: pushq %rbx +; CHECK-SSE4-O3-NEXT: xorl %eax, %eax +; CHECK-SSE4-O3-NEXT: xorl %edx, %edx +; CHECK-SSE4-O3-NEXT: xorl %ecx, %ecx +; CHECK-SSE4-O3-NEXT: xorl %ebx, %ebx +; CHECK-SSE4-O3-NEXT: lock cmpxchg16b 
(%rdi) +; CHECK-SSE4-O3-NEXT: movq %rdx, %xmm1 +; CHECK-SSE4-O3-NEXT: movq %rax, %xmm0 +; CHECK-SSE4-O3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-SSE4-O3-NEXT: popq %rbx +; CHECK-SSE4-O3-NEXT: retq +; +; CHECK-AVX-O3-LABEL: atomic_vec4_float_align: +; CHECK-AVX-O3: # %bb.0: +; CHECK-AVX-O3-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-AVX-O3-NEXT: retq +; +; CHECK-SSE2-O0-LABEL: atomic_vec4_float_align: +; CHECK-SSE2-O0: # %bb.0: +; CHECK-SSE2-O0-NEXT: pushq %rax +; CHECK-SSE2-O0-NEXT: movl $2, %esi +; CHECK-SSE2-O0-NEXT: callq __atomic_load_16@PLT +; CHECK-SSE2-O0-NEXT: movq %rdx, %xmm1 +; CHECK-SSE2-O0-NEXT: movq %rax, %xmm0 +; CHECK-SSE2-O0-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-SSE2-O0-NEXT: popq %rax +; CHECK-SSE2-O0-NEXT: retq +; +; CHECK-SSE4-O0-LABEL: atomic_vec4_float_align: +; CHECK-SSE4-O0: # %bb.0: +; CHECK-SSE4-O0-NEXT: pushq %rbx +; CHECK-SSE4-O0-NEXT: xorl %eax, %eax +; CHECK-SSE4-O0-NEXT: movl %eax, %ebx +; CHECK-SSE4-O0-NEXT: movq %rbx, %rax +; CHECK-SSE4-O0-NEXT: movq %rbx, %rdx +; CHECK-SSE4-O0-NEXT: movq %rbx, %rcx +; CHECK-SSE4-O0-NEXT: lock cmpxchg16b (%rdi) +; CHECK-SSE4-O0-NEXT: movq %rdx, %xmm1 +; CHECK-SSE4-O0-NEXT: movq %rax, %xmm0 +; CHECK-SSE4-O0-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-SSE4-O0-NEXT: popq %rbx +; CHECK-SSE4-O0-NEXT: retq +; +; CHECK-AVX-O0-LABEL: atomic_vec4_float_align: +; CHECK-AVX-O0: # %bb.0: +; CHECK-AVX-O0-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-AVX-O0-NEXT: retq + %ret = load atomic <4 x float>, ptr %x acquire, align 16 + ret <4 x float> %ret +} + define <8 x double> @atomic_vec8_double(ptr %x) nounwind { ; CHECK-SSE-O3-LABEL: atomic_vec8_double: ; CHECK-SSE-O3: # %bb.0: _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
