https://github.com/phoebewang updated https://github.com/llvm/llvm-project/pull/76901
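A quick way to see the intended difference on the scalar case (a sketch only: the
RUN lines at the top of bfloat.ll are not part of this hunk, so the -mattr set
below is an assumption inferred from the BF16/FP16/AVXNC check prefixes):

  ; add2.ll -- same function as the add2 test in the patch
  define bfloat @add2(bfloat %a, bfloat %b) nounwind {
    %add = fadd bfloat %a, %b
    ret bfloat %add
  }

  $ llc add2.ll -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bf16,+avx512vl -o -

Before this change the incoming bfloat values were copied out of %xmm0/%xmm1 with
vmovd (i.e. handled as f32/i32 parts); with the bf16 -> f16 register-type mapping
below they should be read with vpextrw $0 (or vmovw under AVX512FP16), which is
what the regenerated CHECK lines in bfloat.ll reflect.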
>From ff9b72bdb5442a037d4325619de66e25ad211586 Mon Sep 17 00:00:00 2001
From: Phoebe Wang <phoebe.w...@intel.com>
Date: Mon, 1 Jan 2024 15:13:38 +0800
Subject: [PATCH] [X86][BF16][WIP] Try to use `f16` for lowering

---
 llvm/lib/Target/X86/X86ISelLowering.cpp     |   11 +-
 llvm/lib/Target/X86/X86ISelLowering.h       |   10 -
 llvm/lib/Target/X86/X86ISelLoweringCall.cpp |   37 +-
 llvm/test/CodeGen/X86/bfloat.ll             | 1199 +++++++++++--------
 4 files changed, 739 insertions(+), 518 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a90ddf132c3897..6d25992315b2db 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7475,10 +7475,12 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
 static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
   MVT VT = Op.getSimpleValueType();
-  MVT IVT = VT.changeVectorElementTypeToInteger();
+  MVT IVT =
+      VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
   SmallVector<SDValue, 16> NewOps;
   for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
-    NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I)));
+    NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
+                                    Op.getOperand(I)));
   SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
   return DAG.getBitcast(VT, Res);
 }
@@ -21515,9 +21517,8 @@ SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
   RTLIB::Libcall LC =
       RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16);
   SDValue Res =
-      makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
-  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16,
-                     DAG.getBitcast(MVT::i32, Res));
+      makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
+  return DAG.getBitcast(MVT::i16, Res);
 }
 
 /// Depending on uarch and/or optimizing for size, we might prefer to use a
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 9bd1622cb0d3a6..32745400a38b7e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1714,16 +1714,6 @@ namespace llvm {
                               MachineBasicBlock *Entry,
                               const SmallVectorImpl<MachineBasicBlock *> &Exits)
         const override;
-    bool splitValueIntoRegisterParts(
-        SelectionDAG & DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
-        unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC)
-        const override;
-
-    SDValue joinRegisterPartsIntoValue(
-        SelectionDAG & DAG, const SDLoc &DL, const SDValue *Parts,
-        unsigned NumParts, MVT PartVT, EVT ValueVT,
-        std::optional<CallingConv::ID> CC) const override;
-
     bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
 
     bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index b8b5421b900501..d75bd4171fde9d 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -127,6 +127,9 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
     return getRegisterTypeForCallingConv(Context, CC,
                                          VT.changeVectorElementType(MVT::f16));
 
+  if (VT == MVT::bf16)
+    return MVT::f16;
+
   return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
 }
 
@@ -421,40 +424,6 @@ unsigned X86TargetLowering::getJumpTableEncoding() const {
   return TargetLowering::getJumpTableEncoding();
 }
 
-bool 
X86TargetLowering::splitValueIntoRegisterParts( - SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, - unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const { - bool IsABIRegCopy = CC.has_value(); - EVT ValueVT = Val.getValueType(); - if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) { - unsigned ValueBits = ValueVT.getSizeInBits(); - unsigned PartBits = PartVT.getSizeInBits(); - Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val); - Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val); - Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); - Parts[0] = Val; - return true; - } - return false; -} - -SDValue X86TargetLowering::joinRegisterPartsIntoValue( - SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, - MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const { - bool IsABIRegCopy = CC.has_value(); - if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) { - unsigned ValueBits = ValueVT.getSizeInBits(); - unsigned PartBits = PartVT.getSizeInBits(); - SDValue Val = Parts[0]; - - Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val); - Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val); - Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); - return Val; - } - return SDValue(); -} - bool X86TargetLowering::useSoftFloat() const { return Subtarget.useSoftFloat(); } diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll index 9c65310f79d7ec..30b99f32e1b64d 100644 --- a/llvm/test/CodeGen/X86/bfloat.ll +++ b/llvm/test/CodeGen/X86/bfloat.ll @@ -17,27 +17,58 @@ define void @add(ptr %pa, ptr %pb, ptr %pc) nounwind { ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rbx) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: retq ; -; AVX-LABEL: add: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbx -; AVX-NEXT: movq %rdx, %rbx -; AVX-NEXT: movzwl (%rsi), %eax -; AVX-NEXT: shll $16, %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: movzwl (%rdi), %eax -; AVX-NEXT: shll $16, %eax -; AVX-NEXT: vmovd %eax, %xmm1 -; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: callq __truncsfbf2@PLT -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: movw %ax, (%rbx) -; AVX-NEXT: popq %rbx -; AVX-NEXT: retq +; BF16-LABEL: add: +; BF16: # %bb.0: +; BF16-NEXT: pushq %rbx +; BF16-NEXT: movq %rdx, %rbx +; BF16-NEXT: movzwl (%rsi), %eax +; BF16-NEXT: shll $16, %eax +; BF16-NEXT: vmovd %eax, %xmm0 +; BF16-NEXT: movzwl (%rdi), %eax +; BF16-NEXT: shll $16, %eax +; BF16-NEXT: vmovd %eax, %xmm1 +; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; BF16-NEXT: callq __truncsfbf2@PLT +; BF16-NEXT: vpextrw $0, %xmm0, (%rbx) +; BF16-NEXT: popq %rbx +; BF16-NEXT: retq +; +; FP16-LABEL: add: +; FP16: # %bb.0: +; FP16-NEXT: pushq %rbx +; FP16-NEXT: movq %rdx, %rbx +; FP16-NEXT: movzwl (%rsi), %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm0 +; FP16-NEXT: movzwl (%rdi), %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm1 +; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; FP16-NEXT: callq __truncsfbf2@PLT +; FP16-NEXT: vmovsh %xmm0, (%rbx) +; FP16-NEXT: popq %rbx +; FP16-NEXT: retq +; +; AVXNC-LABEL: add: +; AVXNC: # %bb.0: +; AVXNC-NEXT: pushq %rbx +; AVXNC-NEXT: movq %rdx, %rbx +; AVXNC-NEXT: movzwl (%rsi), %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm0 +; AVXNC-NEXT: movzwl (%rdi), %eax +; AVXNC-NEXT: shll $16, 
%eax +; AVXNC-NEXT: vmovd %eax, %xmm1 +; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVXNC-NEXT: callq __truncsfbf2@PLT +; AVXNC-NEXT: vpextrw $0, %xmm0, (%rbx) +; AVXNC-NEXT: popq %rbx +; AVXNC-NEXT: retq %a = load bfloat, ptr %pa %b = load bfloat, ptr %pb %add = fadd bfloat %a, %b @@ -49,8 +80,8 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind { ; SSE2-LABEL: add2: ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rax -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: pextrw $0, %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm1, %ecx ; SSE2-NEXT: shll $16, %ecx ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: shll $16, %eax @@ -60,19 +91,47 @@ define bfloat @add2(bfloat %a, bfloat %b) nounwind { ; SSE2-NEXT: popq %rax ; SSE2-NEXT: retq ; -; AVX-LABEL: add2: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rax -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vmovd %xmm1, %ecx -; AVX-NEXT: shll $16, %ecx -; AVX-NEXT: vmovd %ecx, %xmm0 -; AVX-NEXT: shll $16, %eax -; AVX-NEXT: vmovd %eax, %xmm1 -; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: callq __truncsfbf2@PLT -; AVX-NEXT: popq %rax -; AVX-NEXT: retq +; BF16-LABEL: add2: +; BF16: # %bb.0: +; BF16-NEXT: pushq %rax +; BF16-NEXT: vpextrw $0, %xmm0, %eax +; BF16-NEXT: vpextrw $0, %xmm1, %ecx +; BF16-NEXT: shll $16, %ecx +; BF16-NEXT: vmovd %ecx, %xmm0 +; BF16-NEXT: shll $16, %eax +; BF16-NEXT: vmovd %eax, %xmm1 +; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; BF16-NEXT: callq __truncsfbf2@PLT +; BF16-NEXT: popq %rax +; BF16-NEXT: retq +; +; FP16-LABEL: add2: +; FP16: # %bb.0: +; FP16-NEXT: pushq %rax +; FP16-NEXT: vmovw %xmm0, %eax +; FP16-NEXT: vmovw %xmm1, %ecx +; FP16-NEXT: shll $16, %ecx +; FP16-NEXT: vmovd %ecx, %xmm0 +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm1 +; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; FP16-NEXT: callq __truncsfbf2@PLT +; FP16-NEXT: popq %rax +; FP16-NEXT: retq +; +; AVXNC-LABEL: add2: +; AVXNC: # %bb.0: +; AVXNC-NEXT: pushq %rax +; AVXNC-NEXT: vpextrw $0, %xmm0, %eax +; AVXNC-NEXT: vpextrw $0, %xmm1, %ecx +; AVXNC-NEXT: shll $16, %ecx +; AVXNC-NEXT: vmovd %ecx, %xmm0 +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm1 +; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVXNC-NEXT: callq __truncsfbf2@PLT +; AVXNC-NEXT: popq %rax +; AVXNC-NEXT: retq %add = fadd bfloat %a, %b ret bfloat %add } @@ -87,17 +146,17 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind { ; SSE2-NEXT: movq %rsi, %r14 ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: callq __truncdfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: pextrw $0, %xmm0, %ebp ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: callq __truncdfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: shll $16, %ebp ; SSE2-NEXT: movd %ebp, %xmm0 ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: cvtss2sd %xmm0, %xmm0 @@ -107,34 +166,92 @@ define void @add_double(ptr %pa, ptr %pb, ptr %pc) nounwind { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; AVX-LABEL: add_double: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: pushq %r14 -; AVX-NEXT: pushq %rbx -; AVX-NEXT: movq %rdx, %rbx -; AVX-NEXT: movq %rsi, %r14 -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq __truncdfbf2@PLT -; AVX-NEXT: vmovd %xmm0, %ebp -; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: callq 
__truncdfbf2@PLT -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: shll $16, %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: shll $16, %ebp -; AVX-NEXT: vmovd %ebp, %xmm1 -; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: callq __truncsfbf2@PLT -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: shll $16, %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovsd %xmm0, (%rbx) -; AVX-NEXT: popq %rbx -; AVX-NEXT: popq %r14 -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq +; BF16-LABEL: add_double: +; BF16: # %bb.0: +; BF16-NEXT: pushq %rbp +; BF16-NEXT: pushq %r14 +; BF16-NEXT: pushq %rbx +; BF16-NEXT: movq %rdx, %rbx +; BF16-NEXT: movq %rsi, %r14 +; BF16-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; BF16-NEXT: callq __truncdfbf2@PLT +; BF16-NEXT: vpextrw $0, %xmm0, %ebp +; BF16-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; BF16-NEXT: callq __truncdfbf2@PLT +; BF16-NEXT: vpextrw $0, %xmm0, %eax +; BF16-NEXT: shll $16, %eax +; BF16-NEXT: vmovd %eax, %xmm0 +; BF16-NEXT: shll $16, %ebp +; BF16-NEXT: vmovd %ebp, %xmm1 +; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; BF16-NEXT: callq __truncsfbf2@PLT +; BF16-NEXT: vpextrw $0, %xmm0, %eax +; BF16-NEXT: shll $16, %eax +; BF16-NEXT: vmovd %eax, %xmm0 +; BF16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; BF16-NEXT: vmovsd %xmm0, (%rbx) +; BF16-NEXT: popq %rbx +; BF16-NEXT: popq %r14 +; BF16-NEXT: popq %rbp +; BF16-NEXT: retq +; +; FP16-LABEL: add_double: +; FP16: # %bb.0: +; FP16-NEXT: pushq %rbp +; FP16-NEXT: pushq %r14 +; FP16-NEXT: pushq %rbx +; FP16-NEXT: movq %rdx, %rbx +; FP16-NEXT: movq %rsi, %r14 +; FP16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; FP16-NEXT: callq __truncdfbf2@PLT +; FP16-NEXT: vmovw %xmm0, %ebp +; FP16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; FP16-NEXT: callq __truncdfbf2@PLT +; FP16-NEXT: vmovw %xmm0, %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm0 +; FP16-NEXT: shll $16, %ebp +; FP16-NEXT: vmovd %ebp, %xmm1 +; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; FP16-NEXT: callq __truncsfbf2@PLT +; FP16-NEXT: vmovw %xmm0, %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm0 +; FP16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; FP16-NEXT: vmovsd %xmm0, (%rbx) +; FP16-NEXT: popq %rbx +; FP16-NEXT: popq %r14 +; FP16-NEXT: popq %rbp +; FP16-NEXT: retq +; +; AVXNC-LABEL: add_double: +; AVXNC: # %bb.0: +; AVXNC-NEXT: pushq %rbp +; AVXNC-NEXT: pushq %r14 +; AVXNC-NEXT: pushq %rbx +; AVXNC-NEXT: movq %rdx, %rbx +; AVXNC-NEXT: movq %rsi, %r14 +; AVXNC-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVXNC-NEXT: callq __truncdfbf2@PLT +; AVXNC-NEXT: vpextrw $0, %xmm0, %ebp +; AVXNC-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; AVXNC-NEXT: callq __truncdfbf2@PLT +; AVXNC-NEXT: vpextrw $0, %xmm0, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm0 +; AVXNC-NEXT: shll $16, %ebp +; AVXNC-NEXT: vmovd %ebp, %xmm1 +; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVXNC-NEXT: callq __truncsfbf2@PLT +; AVXNC-NEXT: vpextrw $0, %xmm0, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm0 +; AVXNC-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; AVXNC-NEXT: vmovsd %xmm0, (%rbx) +; AVXNC-NEXT: popq %rbx +; AVXNC-NEXT: popq %r14 +; AVXNC-NEXT: popq %rbp +; AVXNC-NEXT: retq %la = load double, ptr %pa %a = fptrunc double %la to bfloat %lb = load double, ptr %pb @@ -152,18 +269,18 @@ define double @add_double2(double %da, double %db) nounwind { ; SSE2-NEXT: subq $16, %rsp ; SSE2-NEXT: movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: callq __truncdfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: pextrw 
$0, %xmm0, %ebx ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero ; SSE2-NEXT: callq __truncdfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: shll $16, %ebx ; SSE2-NEXT: movd %ebx, %xmm0 ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: cvtss2sd %xmm0, %xmm0 @@ -171,30 +288,80 @@ define double @add_double2(double %da, double %db) nounwind { ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: retq ; -; AVX-LABEL: add_double2: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbx -; AVX-NEXT: subq $16, %rsp -; AVX-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX-NEXT: callq __truncdfbf2@PLT -; AVX-NEXT: vmovd %xmm0, %ebx -; AVX-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[0],zero -; AVX-NEXT: callq __truncdfbf2@PLT -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: shll $16, %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: shll $16, %ebx -; AVX-NEXT: vmovd %ebx, %xmm1 -; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: callq __truncsfbf2@PLT -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: shll $16, %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: addq $16, %rsp -; AVX-NEXT: popq %rbx -; AVX-NEXT: retq +; BF16-LABEL: add_double2: +; BF16: # %bb.0: +; BF16-NEXT: pushq %rbx +; BF16-NEXT: subq $16, %rsp +; BF16-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; BF16-NEXT: callq __truncdfbf2@PLT +; BF16-NEXT: vpextrw $0, %xmm0, %ebx +; BF16-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload +; BF16-NEXT: # xmm0 = mem[0],zero +; BF16-NEXT: callq __truncdfbf2@PLT +; BF16-NEXT: vpextrw $0, %xmm0, %eax +; BF16-NEXT: shll $16, %eax +; BF16-NEXT: vmovd %eax, %xmm0 +; BF16-NEXT: shll $16, %ebx +; BF16-NEXT: vmovd %ebx, %xmm1 +; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; BF16-NEXT: callq __truncsfbf2@PLT +; BF16-NEXT: vpextrw $0, %xmm0, %eax +; BF16-NEXT: shll $16, %eax +; BF16-NEXT: vmovd %eax, %xmm0 +; BF16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; BF16-NEXT: addq $16, %rsp +; BF16-NEXT: popq %rbx +; BF16-NEXT: retq +; +; FP16-LABEL: add_double2: +; FP16: # %bb.0: +; FP16-NEXT: pushq %rbx +; FP16-NEXT: subq $16, %rsp +; FP16-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; FP16-NEXT: callq __truncdfbf2@PLT +; FP16-NEXT: vmovw %xmm0, %ebx +; FP16-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; FP16-NEXT: # xmm0 = mem[0],zero +; FP16-NEXT: callq __truncdfbf2@PLT +; FP16-NEXT: vmovw %xmm0, %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm0 +; FP16-NEXT: shll $16, %ebx +; FP16-NEXT: vmovd %ebx, %xmm1 +; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; FP16-NEXT: callq __truncsfbf2@PLT +; FP16-NEXT: vmovw %xmm0, %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm0 +; FP16-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; FP16-NEXT: addq $16, %rsp +; FP16-NEXT: popq %rbx +; FP16-NEXT: retq +; +; AVXNC-LABEL: add_double2: +; AVXNC: # %bb.0: +; AVXNC-NEXT: pushq %rbx +; AVXNC-NEXT: subq $16, %rsp +; AVXNC-NEXT: vmovsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVXNC-NEXT: callq __truncdfbf2@PLT +; AVXNC-NEXT: vpextrw $0, %xmm0, %ebx +; AVXNC-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload +; AVXNC-NEXT: # xmm0 = mem[0],zero +; AVXNC-NEXT: callq 
__truncdfbf2@PLT +; AVXNC-NEXT: vpextrw $0, %xmm0, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm0 +; AVXNC-NEXT: shll $16, %ebx +; AVXNC-NEXT: vmovd %ebx, %xmm1 +; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0 +; AVXNC-NEXT: callq __truncsfbf2@PLT +; AVXNC-NEXT: vpextrw $0, %xmm0, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm0 +; AVXNC-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; AVXNC-NEXT: addq $16, %rsp +; AVXNC-NEXT: popq %rbx +; AVXNC-NEXT: retq %a = fptrunc double %da to bfloat %b = fptrunc double %db to bfloat %add = fadd bfloat %a, %b @@ -212,24 +379,49 @@ define void @add_constant(ptr %pa, ptr %pc) nounwind { ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movw %ax, (%rbx) ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: retq ; -; AVX-LABEL: add_constant: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbx -; AVX-NEXT: movq %rsi, %rbx -; AVX-NEXT: movzwl (%rdi), %eax -; AVX-NEXT: shll $16, %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: callq __truncsfbf2@PLT -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: movw %ax, (%rbx) -; AVX-NEXT: popq %rbx -; AVX-NEXT: retq +; BF16-LABEL: add_constant: +; BF16: # %bb.0: +; BF16-NEXT: pushq %rbx +; BF16-NEXT: movq %rsi, %rbx +; BF16-NEXT: movzwl (%rdi), %eax +; BF16-NEXT: shll $16, %eax +; BF16-NEXT: vmovd %eax, %xmm0 +; BF16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BF16-NEXT: callq __truncsfbf2@PLT +; BF16-NEXT: vpextrw $0, %xmm0, (%rbx) +; BF16-NEXT: popq %rbx +; BF16-NEXT: retq +; +; FP16-LABEL: add_constant: +; FP16: # %bb.0: +; FP16-NEXT: pushq %rbx +; FP16-NEXT: movq %rsi, %rbx +; FP16-NEXT: movzwl (%rdi), %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm0 +; FP16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FP16-NEXT: callq __truncsfbf2@PLT +; FP16-NEXT: vmovsh %xmm0, (%rbx) +; FP16-NEXT: popq %rbx +; FP16-NEXT: retq +; +; AVXNC-LABEL: add_constant: +; AVXNC: # %bb.0: +; AVXNC-NEXT: pushq %rbx +; AVXNC-NEXT: movq %rsi, %rbx +; AVXNC-NEXT: movzwl (%rdi), %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm0 +; AVXNC-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVXNC-NEXT: callq __truncsfbf2@PLT +; AVXNC-NEXT: vpextrw $0, %xmm0, (%rbx) +; AVXNC-NEXT: popq %rbx +; AVXNC-NEXT: retq %a = load bfloat, ptr %pa %add = fadd bfloat %a, 1.0 store bfloat %add, ptr %pc @@ -240,7 +432,7 @@ define bfloat @add_constant2(bfloat %a) nounwind { ; SSE2-LABEL: add_constant2: ; SSE2: # %bb.0: ; SSE2-NEXT: pushq %rax -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -248,16 +440,38 @@ define bfloat @add_constant2(bfloat %a) nounwind { ; SSE2-NEXT: popq %rax ; SSE2-NEXT: retq ; -; AVX-LABEL: add_constant2: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rax -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: shll $16, %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: callq __truncsfbf2@PLT -; AVX-NEXT: popq %rax -; AVX-NEXT: retq +; BF16-LABEL: add_constant2: +; BF16: # %bb.0: +; BF16-NEXT: pushq %rax +; BF16-NEXT: vpextrw $0, %xmm0, %eax +; BF16-NEXT: shll $16, %eax +; BF16-NEXT: vmovd %eax, %xmm0 +; BF16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; BF16-NEXT: callq 
__truncsfbf2@PLT +; BF16-NEXT: popq %rax +; BF16-NEXT: retq +; +; FP16-LABEL: add_constant2: +; FP16: # %bb.0: +; FP16-NEXT: pushq %rax +; FP16-NEXT: vmovw %xmm0, %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm0 +; FP16-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; FP16-NEXT: callq __truncsfbf2@PLT +; FP16-NEXT: popq %rax +; FP16-NEXT: retq +; +; AVXNC-LABEL: add_constant2: +; AVXNC: # %bb.0: +; AVXNC-NEXT: pushq %rax +; AVXNC-NEXT: vpextrw $0, %xmm0, %eax +; AVXNC-NEXT: shll $16, %eax +; AVXNC-NEXT: vmovd %eax, %xmm0 +; AVXNC-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVXNC-NEXT: callq __truncsfbf2@PLT +; AVXNC-NEXT: popq %rax +; AVXNC-NEXT: retq %add = fadd bfloat %a, 1.0 ret bfloat %add } @@ -306,93 +520,92 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; SSE2-NEXT: movq %xmm0, %rcx ; SSE2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: shrq $32, %rax +; SSE2-NEXT: shrq $48, %rax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movq %xmm1, %rdx ; SSE2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movq %rdx, %rax -; SSE2-NEXT: shrq $32, %rax +; SSE2-NEXT: shrq $48, %rax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movq %rcx, %rax -; SSE2-NEXT: shrq $48, %rax +; SSE2-NEXT: shrq $32, %rax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movq %rdx, %rax -; SSE2-NEXT: shrq $48, %rax +; SSE2-NEXT: shrq $32, %rax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] -; SSE2-NEXT: movq %xmm0, %r12 -; SSE2-NEXT: movq %r12, %rax -; SSE2-NEXT: shrq $32, %rax -; SSE2-NEXT: movq %rax, (%rsp) # 8-byte Spill +; SSE2-NEXT: movq %xmm0, %r15 +; SSE2-NEXT: movq %r15, %rbx +; SSE2-NEXT: shrq $48, %rbx ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1,1] ; SSE2-NEXT: movq %xmm1, %r14 ; SSE2-NEXT: movq %r14, %rbp -; SSE2-NEXT: shrq $32, %rbp -; SSE2-NEXT: movq %r12, %r15 -; SSE2-NEXT: shrq $48, %r15 +; SSE2-NEXT: shrq $48, %rbp +; SSE2-NEXT: movq %r15, %r12 +; SSE2-NEXT: shrq $32, %r12 ; SSE2-NEXT: movq %r14, %r13 -; SSE2-NEXT: shrq $48, %r13 +; SSE2-NEXT: shrq $32, %r13 ; SSE2-NEXT: movl %r14d, %eax -; SSE2-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 +; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: movl %r12d, %eax -; SSE2-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 +; SSE2-NEXT: movl %r15d, %eax +; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx -; SSE2-NEXT: shll $16, %ebx -; SSE2-NEXT: shll $16, %r14d +; SSE2-NEXT: pextrw $0, %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; SSE2-NEXT: andl $-65536, %r14d # imm = 0xFFFF0000 ; SSE2-NEXT: movd %r14d, %xmm1 -; SSE2-NEXT: shll $16, %r12d -; SSE2-NEXT: movd %r12d, %xmm0 +; SSE2-NEXT: andl $-65536, %r15d # imm = 0xFFFF0000 +; SSE2-NEXT: movd %r15d, %xmm0 ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movzwl %ax, %r12d -; SSE2-NEXT: orl %ebx, %r12d +; SSE2-NEXT: pextrw $0, %xmm0, %r15d +; SSE2-NEXT: shll $16, %r15d +; SSE2-NEXT: addl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Folded Reload ; SSE2-NEXT: shll $16, %r13d ; SSE2-NEXT: movd %r13d, %xmm1 -; SSE2-NEXT: shll $16, %r15d -; SSE2-NEXT: movd %r15d, %xmm0 +; 
SSE2-NEXT: shll $16, %r12d +; SSE2-NEXT: movd %r12d, %xmm0 ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %r14d -; SSE2-NEXT: shll $16, %r14d +; SSE2-NEXT: pextrw $0, %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %r14d ; SSE2-NEXT: shll $16, %ebp ; SSE2-NEXT: movd %ebp, %xmm1 -; SSE2-NEXT: movq (%rsp), %rax # 8-byte Reload -; SSE2-NEXT: shll $16, %eax -; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: shll $16, %ebx +; SSE2-NEXT: movd %ebx, %xmm0 ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movzwl %ax, %ebx +; SSE2-NEXT: pextrw $0, %xmm0, %ebx +; SSE2-NEXT: shll $16, %ebx ; SSE2-NEXT: orl %r14d, %ebx ; SSE2-NEXT: shlq $32, %rbx -; SSE2-NEXT: orq %r12, %rbx +; SSE2-NEXT: orq %r15, %rbx ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload ; SSE2-NEXT: movl %r15d, %eax -; SSE2-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 +; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; SSE2-NEXT: movl %r14d, %eax -; SSE2-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 +; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebp -; SSE2-NEXT: shll $16, %ebp +; SSE2-NEXT: pextrw $0, %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %ebp ; SSE2-NEXT: movq %r15, %rax -; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: movq %r14, %rax -; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movzwl %ax, %r14d +; SSE2-NEXT: pextrw $0, %xmm0, %r14d +; SSE2-NEXT: shll $16, %r14d ; SSE2-NEXT: orl %ebp, %r14d ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE2-NEXT: shll $16, %eax @@ -402,8 +615,8 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebp -; SSE2-NEXT: shll $16, %ebp +; SSE2-NEXT: pextrw $0, %xmm0, %eax +; SSE2-NEXT: movzwl %ax, %ebp ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: movd %eax, %xmm1 @@ -412,8 +625,8 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: addss %xmm1, %xmm0 ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movzwl %ax, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax +; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: orl %ebp, %eax ; SSE2-NEXT: shlq $32, %rax ; SSE2-NEXT: orq %r14, %rax @@ -448,7 +661,8 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; BF16-NEXT: vmovd %eax, %xmm1 ; BF16-NEXT: vaddss %xmm2, %xmm1, %xmm0 ; BF16-NEXT: callq __truncsfbf2@PLT -; BF16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; BF16-NEXT: vpextrw $0, %xmm0, %eax +; BF16-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; BF16-NEXT: vpextrw $6, %xmm0, %eax ; BF16-NEXT: shll $16, %eax @@ -459,7 +673,7 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; BF16-NEXT: vmovd %eax, %xmm1 ; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; BF16-NEXT: callq __truncsfbf2@PLT -; BF16-NEXT: vmovd 
%xmm0, %ebp +; BF16-NEXT: vpextrw $0, %xmm0, %ebp ; BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; BF16-NEXT: vpextrw $5, %xmm0, %eax ; BF16-NEXT: shll $16, %eax @@ -470,7 +684,7 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; BF16-NEXT: vmovd %eax, %xmm1 ; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; BF16-NEXT: callq __truncsfbf2@PLT -; BF16-NEXT: vmovd %xmm0, %r14d +; BF16-NEXT: vpextrw $0, %xmm0, %r14d ; BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; BF16-NEXT: vpextrw $4, %xmm0, %eax ; BF16-NEXT: shll $16, %eax @@ -481,7 +695,7 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; BF16-NEXT: vmovd %eax, %xmm1 ; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; BF16-NEXT: callq __truncsfbf2@PLT -; BF16-NEXT: vmovd %xmm0, %r15d +; BF16-NEXT: vpextrw $0, %xmm0, %r15d ; BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; BF16-NEXT: vpextrw $3, %xmm0, %eax ; BF16-NEXT: shll $16, %eax @@ -492,7 +706,7 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; BF16-NEXT: vmovd %eax, %xmm1 ; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; BF16-NEXT: callq __truncsfbf2@PLT -; BF16-NEXT: vmovd %xmm0, %r12d +; BF16-NEXT: vpextrw $0, %xmm0, %r12d ; BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; BF16-NEXT: vpextrw $2, %xmm0, %eax ; BF16-NEXT: shll $16, %eax @@ -503,31 +717,31 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; BF16-NEXT: vmovd %eax, %xmm1 ; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; BF16-NEXT: callq __truncsfbf2@PLT -; BF16-NEXT: vmovd %xmm0, %r13d +; BF16-NEXT: vpextrw $0, %xmm0, %r13d ; BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; BF16-NEXT: vpextrw $1, %xmm0, %eax +; BF16-NEXT: vmovd %xmm0, %eax ; BF16-NEXT: shll $16, %eax ; BF16-NEXT: vmovd %eax, %xmm0 ; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; BF16-NEXT: vpextrw $1, %xmm1, %eax +; BF16-NEXT: vmovd %xmm1, %eax ; BF16-NEXT: shll $16, %eax ; BF16-NEXT: vmovd %eax, %xmm1 ; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; BF16-NEXT: callq __truncsfbf2@PLT -; BF16-NEXT: vmovd %xmm0, %ebx +; BF16-NEXT: vpextrw $0, %xmm0, %ebx ; BF16-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; BF16-NEXT: vmovd %xmm0, %eax +; BF16-NEXT: vpextrw $1, %xmm0, %eax ; BF16-NEXT: shll $16, %eax ; BF16-NEXT: vmovd %eax, %xmm0 ; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; BF16-NEXT: vmovd %xmm1, %eax +; BF16-NEXT: vpextrw $1, %xmm1, %eax ; BF16-NEXT: shll $16, %eax ; BF16-NEXT: vmovd %eax, %xmm1 ; BF16-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; BF16-NEXT: callq __truncsfbf2@PLT -; BF16-NEXT: vmovd %xmm0, %eax -; BF16-NEXT: vmovd %eax, %xmm0 -; BF16-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0 +; BF16-NEXT: vpextrw $0, %xmm0, %eax +; BF16-NEXT: vmovd %ebx, %xmm0 +; BF16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ; BF16-NEXT: vpinsrw $2, %r13d, %xmm0, %xmm0 ; BF16-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0 ; BF16-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0 @@ -545,13 +759,7 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; ; FP16-LABEL: addv: ; FP16: # %bb.0: -; FP16-NEXT: pushq %rbp -; FP16-NEXT: pushq %r15 -; FP16-NEXT: pushq %r14 -; FP16-NEXT: pushq %r13 -; FP16-NEXT: pushq %r12 -; FP16-NEXT: pushq %rbx -; FP16-NEXT: subq $40, %rsp +; FP16-NEXT: subq $152, %rsp ; FP16-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; FP16-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; FP16-NEXT: vmovw %xmm1, %eax @@ -562,40 +770,40 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; FP16-NEXT: vmovd %eax, 
%xmm1 ; FP16-NEXT: vaddss %xmm2, %xmm1, %xmm0 ; FP16-NEXT: callq __truncsfbf2@PLT -; FP16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; FP16-NEXT: vpextrw $7, %xmm0, %eax +; FP16-NEXT: vpextrw $1, %xmm0, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm0 ; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; FP16-NEXT: vpextrw $7, %xmm1, %eax +; FP16-NEXT: vpextrw $1, %xmm1, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm1 ; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; FP16-NEXT: callq __truncsfbf2@PLT -; FP16-NEXT: vmovd %xmm0, %ebp +; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; FP16-NEXT: vpextrw $6, %xmm0, %eax +; FP16-NEXT: vpextrw $2, %xmm0, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm0 ; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; FP16-NEXT: vpextrw $6, %xmm1, %eax +; FP16-NEXT: vpextrw $2, %xmm1, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm1 ; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; FP16-NEXT: callq __truncsfbf2@PLT -; FP16-NEXT: vmovd %xmm0, %r14d +; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; FP16-NEXT: vpextrw $5, %xmm0, %eax +; FP16-NEXT: vpextrw $3, %xmm0, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm0 ; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; FP16-NEXT: vpextrw $5, %xmm1, %eax +; FP16-NEXT: vpextrw $3, %xmm1, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm1 ; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; FP16-NEXT: callq __truncsfbf2@PLT -; FP16-NEXT: vmovd %xmm0, %r15d +; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; FP16-NEXT: vpextrw $4, %xmm0, %eax ; FP16-NEXT: shll $16, %eax @@ -606,56 +814,54 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; FP16-NEXT: vmovd %eax, %xmm1 ; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; FP16-NEXT: callq __truncsfbf2@PLT -; FP16-NEXT: vmovd %xmm0, %r12d +; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; FP16-NEXT: vpextrw $3, %xmm0, %eax +; FP16-NEXT: vpextrw $5, %xmm0, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm0 ; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; FP16-NEXT: vpextrw $3, %xmm1, %eax +; FP16-NEXT: vpextrw $5, %xmm1, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm1 ; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; FP16-NEXT: callq __truncsfbf2@PLT -; FP16-NEXT: vmovd %xmm0, %r13d +; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; FP16-NEXT: vpextrw $2, %xmm0, %eax +; FP16-NEXT: vpextrw $6, %xmm0, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm0 ; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; FP16-NEXT: vpextrw $2, %xmm1, %eax +; FP16-NEXT: vpextrw $6, %xmm1, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm1 ; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; FP16-NEXT: callq __truncsfbf2@PLT -; FP16-NEXT: vmovd %xmm0, %ebx +; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; FP16-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; FP16-NEXT: vpextrw $1, %xmm0, %eax +; FP16-NEXT: vpextrw $7, %xmm0, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm0 ; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; FP16-NEXT: vpextrw $1, %xmm1, %eax +; FP16-NEXT: vpextrw $7, %xmm1, %eax ; FP16-NEXT: shll $16, %eax ; FP16-NEXT: vmovd %eax, %xmm1 ; FP16-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; FP16-NEXT: callq __truncsfbf2@PLT -; FP16-NEXT: vmovd %xmm0, %eax -; FP16-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; FP16-NEXT: # xmm0 = mem[0],zero,zero,zero -; FP16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; FP16-NEXT: vpinsrw $2, %ebx, %xmm0, %xmm0 -; FP16-NEXT: vpinsrw $3, %r13d, %xmm0, %xmm0 -; FP16-NEXT: vpinsrw $4, %r12d, %xmm0, %xmm0 -; FP16-NEXT: vpinsrw $5, %r15d, %xmm0, %xmm0 -; FP16-NEXT: vpinsrw $6, %r14d, %xmm0, %xmm0 -; FP16-NEXT: vpinsrw $7, %ebp, %xmm0, %xmm0 -; FP16-NEXT: addq $40, %rsp -; FP16-NEXT: popq %rbx -; FP16-NEXT: popq %r12 -; FP16-NEXT: popq %r13 -; FP16-NEXT: popq %r14 -; FP16-NEXT: popq %r15 -; FP16-NEXT: popq %rbp +; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FP16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; FP16-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; FP16-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; FP16-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; FP16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; FP16-NEXT: addq $152, %rsp ; FP16-NEXT: retq ; ; AVXNC-LABEL: addv: @@ -677,7 +883,8 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; AVXNC-NEXT: vmovd %eax, %xmm1 ; AVXNC-NEXT: vaddss %xmm2, %xmm1, %xmm0 ; AVXNC-NEXT: callq __truncsfbf2@PLT -; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVXNC-NEXT: vpextrw $0, %xmm0, %eax +; AVXNC-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVXNC-NEXT: vpextrw $6, %xmm0, %eax ; AVXNC-NEXT: shll $16, %eax @@ -688,7 +895,7 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; AVXNC-NEXT: vmovd %eax, %xmm1 ; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVXNC-NEXT: callq __truncsfbf2@PLT -; AVXNC-NEXT: vmovd %xmm0, %ebp +; AVXNC-NEXT: vpextrw $0, %xmm0, %ebp ; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVXNC-NEXT: vpextrw $5, %xmm0, %eax ; AVXNC-NEXT: shll $16, %eax @@ -699,7 +906,7 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; AVXNC-NEXT: vmovd %eax, %xmm1 ; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVXNC-NEXT: callq __truncsfbf2@PLT -; AVXNC-NEXT: vmovd %xmm0, %r14d +; AVXNC-NEXT: vpextrw $0, %xmm0, %r14d ; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVXNC-NEXT: vpextrw $4, %xmm0, %eax ; AVXNC-NEXT: shll $16, %eax @@ 
-710,7 +917,7 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; AVXNC-NEXT: vmovd %eax, %xmm1 ; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVXNC-NEXT: callq __truncsfbf2@PLT -; AVXNC-NEXT: vmovd %xmm0, %r15d +; AVXNC-NEXT: vpextrw $0, %xmm0, %r15d ; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVXNC-NEXT: vpextrw $3, %xmm0, %eax ; AVXNC-NEXT: shll $16, %eax @@ -721,7 +928,7 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; AVXNC-NEXT: vmovd %eax, %xmm1 ; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVXNC-NEXT: callq __truncsfbf2@PLT -; AVXNC-NEXT: vmovd %xmm0, %r12d +; AVXNC-NEXT: vpextrw $0, %xmm0, %r12d ; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVXNC-NEXT: vpextrw $2, %xmm0, %eax ; AVXNC-NEXT: shll $16, %eax @@ -732,31 +939,31 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; AVXNC-NEXT: vmovd %eax, %xmm1 ; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVXNC-NEXT: callq __truncsfbf2@PLT -; AVXNC-NEXT: vmovd %xmm0, %r13d +; AVXNC-NEXT: vpextrw $0, %xmm0, %r13d ; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVXNC-NEXT: vpextrw $1, %xmm0, %eax +; AVXNC-NEXT: vmovd %xmm0, %eax ; AVXNC-NEXT: shll $16, %eax ; AVXNC-NEXT: vmovd %eax, %xmm0 ; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVXNC-NEXT: vpextrw $1, %xmm1, %eax +; AVXNC-NEXT: vmovd %xmm1, %eax ; AVXNC-NEXT: shll $16, %eax ; AVXNC-NEXT: vmovd %eax, %xmm1 ; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVXNC-NEXT: callq __truncsfbf2@PLT -; AVXNC-NEXT: vmovd %xmm0, %ebx +; AVXNC-NEXT: vpextrw $0, %xmm0, %ebx ; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload -; AVXNC-NEXT: vmovd %xmm0, %eax +; AVXNC-NEXT: vpextrw $1, %xmm0, %eax ; AVXNC-NEXT: shll $16, %eax ; AVXNC-NEXT: vmovd %eax, %xmm0 ; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVXNC-NEXT: vmovd %xmm1, %eax +; AVXNC-NEXT: vpextrw $1, %xmm1, %eax ; AVXNC-NEXT: shll $16, %eax ; AVXNC-NEXT: vmovd %eax, %xmm1 ; AVXNC-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVXNC-NEXT: callq __truncsfbf2@PLT -; AVXNC-NEXT: vmovd %xmm0, %eax -; AVXNC-NEXT: vmovd %eax, %xmm0 -; AVXNC-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0 +; AVXNC-NEXT: vpextrw $0, %xmm0, %eax +; AVXNC-NEXT: vmovd %ebx, %xmm0 +; AVXNC-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ; AVXNC-NEXT: vpinsrw $2, %r13d, %xmm0, %xmm0 ; AVXNC-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0 ; AVXNC-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0 @@ -778,20 +985,29 @@ define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { define <2 x bfloat> @pr62997(bfloat %a, bfloat %b) { ; SSE2-LABEL: pr62997: ; SSE2: # %bb.0: -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movd %xmm1, %ecx -; SSE2-NEXT: pinsrw $0, %ecx, %xmm1 -; SSE2-NEXT: pinsrw $0, %eax, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: retq ; -; AVX-LABEL: pr62997: -; AVX: # %bb.0: -; AVX-NEXT: vmovd %xmm1, %eax -; AVX-NEXT: vmovd %xmm0, %ecx -; AVX-NEXT: vmovd %ecx, %xmm0 -; AVX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 -; AVX-NEXT: retq +; BF16-LABEL: pr62997: +; BF16: # %bb.0: +; BF16-NEXT: vpextrw $0, %xmm0, %eax +; BF16-NEXT: vpextrw $0, %xmm1, %ecx +; BF16-NEXT: vmovd %eax, %xmm0 +; BF16-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; BF16-NEXT: retq +; +; FP16-LABEL: pr62997: +; FP16: # %bb.0: +; FP16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; FP16-NEXT: retq +; +; AVXNC-LABEL: pr62997: +; AVXNC: # %bb.0: +; AVXNC-NEXT: vpextrw $0, %xmm0, %eax +; 
AVXNC-NEXT: vpextrw $0, %xmm1, %ecx +; AVXNC-NEXT: vmovd %eax, %xmm0 +; AVXNC-NEXT: vpinsrw $1, %ecx, %xmm0, %xmm0 +; AVXNC-NEXT: retq %1 = insertelement <2 x bfloat> undef, bfloat %a, i64 0 %2 = insertelement <2 x bfloat> %1, bfloat %b, i64 1 ret <2 x bfloat> %2 @@ -1149,8 +1365,8 @@ define <32 x bfloat> @pr63017_2() nounwind { ; SSE2-NEXT: movd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; SSE2-NEXT: movd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; SSE2-NEXT: movd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill -; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; SSE2-NEXT: movd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill +; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; SSE2-NEXT: movd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; SSE2-NEXT: movd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; SSE2-NEXT: movd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill @@ -1166,23 +1382,23 @@ define <32 x bfloat> @pr63017_2() nounwind { ; SSE2-NEXT: movd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; SSE2-NEXT: .LBB12_67: # %else92 ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: pextrw $0, %xmm0, %ebx ; SSE2-NEXT: shll $16, %ebx ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %r14d ; SSE2-NEXT: orl %ebx, %r14d ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: pextrw $0, %xmm0, %ebx ; SSE2-NEXT: shll $16, %ebx ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %eax ; SSE2-NEXT: orl %ebx, %eax ; SSE2-NEXT: shlq $32, %rax @@ -1192,23 +1408,23 @@ define <32 x bfloat> @pr63017_2() nounwind { ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: pextrw $0, %xmm0, %ebx ; SSE2-NEXT: shll $16, %ebx ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %r14d ; SSE2-NEXT: orl %ebx, %r14d ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: pextrw $0, %xmm0, %ebx ; SSE2-NEXT: shll $16, %ebx ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %eax ; SSE2-NEXT: orl %ebx, %eax ; SSE2-NEXT: shlq $32, %rax @@ -1220,23 +1436,23 @@ define <32 x bfloat> @pr63017_2() nounwind { ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: 
pextrw $0, %xmm0, %ebx ; SSE2-NEXT: shll $16, %ebx ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %r14d ; SSE2-NEXT: orl %ebx, %r14d ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: pextrw $0, %xmm0, %ebx ; SSE2-NEXT: shll $16, %ebx ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %eax ; SSE2-NEXT: orl %ebx, %eax ; SSE2-NEXT: shlq $32, %rax @@ -1246,23 +1462,23 @@ define <32 x bfloat> @pr63017_2() nounwind { ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: pextrw $0, %xmm0, %ebx ; SSE2-NEXT: shll $16, %ebx ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %r14d ; SSE2-NEXT: orl %ebx, %r14d ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: pextrw $0, %xmm0, %ebx ; SSE2-NEXT: shll $16, %ebx ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %eax ; SSE2-NEXT: orl %ebx, %eax ; SSE2-NEXT: shlq $32, %rax @@ -1274,23 +1490,23 @@ define <32 x bfloat> @pr63017_2() nounwind { ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: pextrw $0, %xmm0, %ebx ; SSE2-NEXT: shll $16, %ebx ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %r14d ; SSE2-NEXT: orl %ebx, %r14d ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: pextrw $0, %xmm0, %ebx ; SSE2-NEXT: shll $16, %ebx ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %eax ; SSE2-NEXT: orl %ebx, %eax ; SSE2-NEXT: shlq $32, %rax @@ -1300,23 +1516,23 @@ define <32 x bfloat> @pr63017_2() nounwind { ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: pextrw $0, %xmm0, %ebx ; SSE2-NEXT: shll $16, %ebx ; SSE2-NEXT: movd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %r14d ; SSE2-NEXT: orl %ebx, %r14d ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: pextrw $0, %xmm0, %ebx ; SSE2-NEXT: shll $16, %ebx ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %eax ; SSE2-NEXT: orl %ebx, %eax ; SSE2-NEXT: shlq $32, %rax @@ -1328,23 +1544,23 @@ define <32 x bfloat> @pr63017_2() nounwind { ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: pextrw $0, %xmm0, %ebx ; SSE2-NEXT: shll $16, %ebx ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %r14d ; SSE2-NEXT: orl %ebx, %r14d ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: pextrw $0, %xmm0, %ebx ; SSE2-NEXT: shll $16, %ebx ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %eax ; SSE2-NEXT: orl %ebx, %eax ; SSE2-NEXT: shlq $32, %rax @@ -1354,23 +1570,23 @@ define <32 x bfloat> @pr63017_2() nounwind { ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: pextrw $0, %xmm0, %ebx ; SSE2-NEXT: shll $16, %ebx ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %r14d ; SSE2-NEXT: orl %ebx, %r14d ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: pextrw $0, %xmm0, %ebx ; SSE2-NEXT: shll $16, %ebx ; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload ; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %eax ; SSE2-NEXT: orl %ebx, %eax ; SSE2-NEXT: shlq $32, %rax @@ -1386,11 +1602,17 @@ define <32 x bfloat> @pr63017_2() nounwind { ; SSE2-NEXT: popq %r14 ; SSE2-NEXT: retq ; -; F16-LABEL: pr63017_2: -; F16: # %bb.0: -; F16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024] -; F16-NEXT: vmovdqu16 (%rax), 
%zmm0 {%k1} -; F16-NEXT: retq +; BF16-LABEL: pr63017_2: +; BF16: # %bb.0: +; BF16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024] +; BF16-NEXT: vmovdqu16 (%rax), %zmm0 {%k1} +; BF16-NEXT: retq +; +; FP16-LABEL: pr63017_2: +; FP16: # %bb.0: +; FP16-NEXT: vpbroadcastw {{.*#+}} zmm0 = [-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0,-1.875E+0] +; FP16-NEXT: vmovdqu16 (%rax), %zmm0 {%k1} +; FP16-NEXT: retq ; ; AVXNC-LABEL: pr63017_2: ; AVXNC: # %bb.0: @@ -1650,7 +1872,7 @@ define <32 x bfloat> @pr62997_3(<32 x bfloat> %0, bfloat %1) { ; SSE2-NEXT: movabsq $-4294967296, %rcx # imm = 0xFFFFFFFF00000000 ; SSE2-NEXT: andq %rax, %rcx ; SSE2-NEXT: movzwl %ax, %eax -; SSE2-NEXT: movd %xmm4, %edx +; SSE2-NEXT: pextrw $0, %xmm4, %edx ; SSE2-NEXT: shll $16, %edx ; SSE2-NEXT: orl %eax, %edx ; SSE2-NEXT: orq %rcx, %rdx @@ -1658,16 +1880,23 @@ define <32 x bfloat> @pr62997_3(<32 x bfloat> %0, bfloat %1) { ; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] ; SSE2-NEXT: retq ; -; F16-LABEL: pr62997_3: -; F16: # %bb.0: -; F16-NEXT: vmovd %xmm1, %eax -; F16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1 -; F16-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 -; F16-NEXT: retq +; BF16-LABEL: pr62997_3: +; BF16: # %bb.0: +; BF16-NEXT: vpextrw $0, %xmm1, %eax +; BF16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1 +; BF16-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; BF16-NEXT: retq +; +; FP16-LABEL: pr62997_3: +; FP16: # %bb.0: +; FP16-NEXT: vmovw %xmm1, %eax +; FP16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm1 +; FP16-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0 +; FP16-NEXT: retq ; ; AVXNC-LABEL: pr62997_3: ; AVXNC: # %bb.0: -; AVXNC-NEXT: vmovd %xmm2, %eax +; AVXNC-NEXT: vpextrw $0, %xmm2, %eax ; AVXNC-NEXT: vpinsrw $1, %eax, %xmm0, %xmm2 ; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVXNC-NEXT: retq @@ -1953,42 +2182,28 @@ define <8 x double> @pr64460_4(<8 x bfloat> %a) { define <4 x bfloat> @fptrunc_v4f32(<4 x float> %a) nounwind { ; SSE2-LABEL: fptrunc_v4f32: ; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbp -; SSE2-NEXT: pushq %r14 -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: subq $32, %rsp -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; SSE2-NEXT: subq $72, %rsp +; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx -; SSE2-NEXT: movd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd %xmm0, %ebp -; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movd %xmm0, %r14d -; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: pinsrw $0, %eax, %xmm0 -; SSE2-NEXT: pinsrw $0, %r14d, %xmm1 +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: pinsrw $0, %ebp, %xmm0 -; SSE2-NEXT: pinsrw $0, %ebx, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: addq $32, %rsp -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: popq %r14 -; SSE2-NEXT: popq %rbp +; SSE2-NEXT: addq $72, %rsp ; SSE2-NEXT: retq ; ; F16-LABEL: fptrunc_v4f32: @@ -2019,22 +2234,22 @@ define <8 x bfloat> @fptrunc_v8f32(<8 x float> %a) nounwind { ; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: pextrw $0, %xmm0, %ebx ; SSE2-NEXT: shll $16, %ebx ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %r14d ; SSE2-NEXT: orl %ebx, %r14d ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: pextrw $0, %xmm0, %ebp ; SSE2-NEXT: shll $16, %ebp ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %ebx ; SSE2-NEXT: orl %ebp, %ebx ; SSE2-NEXT: shlq $32, %rbx @@ -2042,22 +2257,22 @@ define <8 x bfloat> @fptrunc_v8f32(<8 x float> %a) nounwind { ; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: pextrw $0, %xmm0, %ebp ; SSE2-NEXT: shll $16, %ebp ; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %r14d ; SSE2-NEXT: orl %ebp, %r14d ; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: pextrw $0, %xmm0, %ebp ; SSE2-NEXT: shll $16, %ebp ; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd 
%xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %eax ; SSE2-NEXT: orl %ebp, %eax ; SSE2-NEXT: shlq $32, %rax @@ -2102,22 +2317,22 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind { ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[1,1] ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: pextrw $0, %xmm0, %ebx ; SSE2-NEXT: shll $16, %ebx ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %r14d ; SSE2-NEXT: orl %ebx, %r14d ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: pextrw $0, %xmm0, %ebp ; SSE2-NEXT: shll $16, %ebp ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %ebx ; SSE2-NEXT: orl %ebp, %ebx ; SSE2-NEXT: shlq $32, %rbx @@ -2125,22 +2340,22 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind { ; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: pextrw $0, %xmm0, %ebp ; SSE2-NEXT: shll $16, %ebp ; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %r15d ; SSE2-NEXT: orl %ebp, %r15d ; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: pextrw $0, %xmm0, %ebp ; SSE2-NEXT: shll $16, %ebp ; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %r14d ; SSE2-NEXT: orl %ebp, %r14d ; SSE2-NEXT: shlq $32, %r14 @@ -2148,22 +2363,22 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind { ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: pextrw $0, %xmm0, %ebp ; SSE2-NEXT: shll $16, %ebp ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %r12d ; SSE2-NEXT: orl %ebp, %r12d ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: pextrw $0, %xmm0, %ebp ; SSE2-NEXT: shll $16, %ebp ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %r15d ; SSE2-NEXT: orl %ebp, %r15d ; SSE2-NEXT: shlq $32, %r15 @@ -2171,22 +2386,22 @@ define <16 x bfloat> @fptrunc_v16f32(<16 x float> %a) nounwind { ; SSE2-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: pextrw $0, %xmm0, %ebp ; SSE2-NEXT: shll $16, %ebp ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %r12d ; SSE2-NEXT: orl %ebp, %r12d ; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: pextrw $0, %xmm0, %ebp ; SSE2-NEXT: shll $16, %ebp ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq __truncsfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %eax ; SSE2-NEXT: orl %ebp, %eax ; SSE2-NEXT: shlq $32, %rax @@ -2234,21 +2449,21 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind { ; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq __truncdfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebx +; SSE2-NEXT: pextrw $0, %xmm0, %ebx ; SSE2-NEXT: shll $16, %ebx ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: callq __truncdfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %r14d ; SSE2-NEXT: orl %ebx, %r14d ; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq __truncdfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: pextrw $0, %xmm0, %ebp ; SSE2-NEXT: shll $16, %ebp ; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: callq __truncdfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %ebx ; SSE2-NEXT: orl %ebp, %ebx ; SSE2-NEXT: shlq $32, %rbx @@ -2256,21 +2471,21 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind { ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq __truncdfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: pextrw $0, %xmm0, %ebp ; SSE2-NEXT: shll $16, %ebp ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: callq __truncdfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %r14d ; SSE2-NEXT: orl %ebp, %r14d ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1,1] ; SSE2-NEXT: callq __truncdfbf2@PLT -; SSE2-NEXT: movd %xmm0, %ebp +; SSE2-NEXT: pextrw $0, %xmm0, %ebp ; SSE2-NEXT: shll $16, %ebp ; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE2-NEXT: callq __truncdfbf2@PLT -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $0, %xmm0, %eax ; SSE2-NEXT: movzwl %ax, %eax ; SSE2-NEXT: orl %ebp, %eax ; SSE2-NEXT: shlq $32, %rax @@ -2284,89 +2499,141 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind { ; SSE2-NEXT: popq %rbp ; SSE2-NEXT: retq ; -; F16-LABEL: fptrunc_v8f64: -; F16: # %bb.0: -; F16-NEXT: pushq %rbp -; F16-NEXT: pushq %r15 -; F16-NEXT: pushq %r14 -; F16-NEXT: pushq %r13 -; F16-NEXT: pushq %r12 -; F16-NEXT: pushq %rbx -; F16-NEXT: subq $136, %rsp -; F16-NEXT: vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill -; F16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] -; F16-NEXT: vzeroupper -; F16-NEXT: callq __truncdfbf2@PLT -; F16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; F16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; F16-NEXT: vextractf128 $1, %ymm0, %xmm0 -; F16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; F16-NEXT: vzeroupper -; F16-NEXT: callq __truncdfbf2@PLT -; F16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; F16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; F16-NEXT: # xmm0 = mem[1,0] -; F16-NEXT: callq __truncdfbf2@PLT -; F16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; F16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; F16-NEXT: vextractf32x4 $2, %zmm0, %xmm0 -; F16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; F16-NEXT: vzeroupper -; F16-NEXT: callq __truncdfbf2@PLT -; F16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; F16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; F16-NEXT: # xmm0 = mem[1,0] -; F16-NEXT: callq __truncdfbf2@PLT -; F16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; F16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; F16-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; F16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; F16-NEXT: vzeroupper -; F16-NEXT: callq __truncdfbf2@PLT -; F16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; F16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; F16-NEXT: # xmm0 = mem[1,0] -; F16-NEXT: callq __truncdfbf2@PLT -; F16-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; F16-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; F16-NEXT: # xmm0 = mem[0],zero,zero,zero -; F16-NEXT: vmovd %xmm0, %ebp -; F16-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; F16-NEXT: # xmm0 = mem[0],zero,zero,zero -; F16-NEXT: vmovd %xmm0, %r14d -; F16-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; F16-NEXT: # xmm0 = mem[0],zero,zero,zero -; F16-NEXT: vmovd %xmm0, %r15d -; F16-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; F16-NEXT: # xmm0 = mem[0],zero,zero,zero -; F16-NEXT: vmovd %xmm0, %r12d -; F16-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; F16-NEXT: # xmm0 = mem[0],zero,zero,zero -; F16-NEXT: vmovd %xmm0, %r13d -; F16-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; F16-NEXT: # xmm0 = mem[0],zero,zero,zero -; F16-NEXT: vmovd %xmm0, %ebx -; F16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; F16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; F16-NEXT: vzeroupper -; F16-NEXT: callq __truncdfbf2@PLT -; F16-NEXT: vmovd %xmm0, %eax -; F16-NEXT: vmovd %eax, %xmm0 -; F16-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0 -; F16-NEXT: vpinsrw $2, %r13d, %xmm0, %xmm0 -; F16-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0 -; F16-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0 -; F16-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0 -; F16-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0 -; F16-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload -; F16-NEXT: addq $136, %rsp -; F16-NEXT: popq %rbx -; F16-NEXT: popq %r12 -; F16-NEXT: popq %r13 -; F16-NEXT: popq %r14 -; F16-NEXT: popq %r15 -; F16-NEXT: popq %rbp -; F16-NEXT: retq +; BF16-LABEL: fptrunc_v8f64: +; BF16: # %bb.0: +; BF16-NEXT: pushq %rbp +; BF16-NEXT: pushq %r15 
+; BF16-NEXT: pushq %r14 +; BF16-NEXT: pushq %r13 +; BF16-NEXT: pushq %r12 +; BF16-NEXT: pushq %rbx +; BF16-NEXT: subq $184, %rsp +; BF16-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; BF16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; BF16-NEXT: vzeroupper +; BF16-NEXT: callq __truncdfbf2@PLT +; BF16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; BF16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; BF16-NEXT: vextractf128 $1, %ymm0, %xmm0 +; BF16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; BF16-NEXT: vzeroupper +; BF16-NEXT: callq __truncdfbf2@PLT +; BF16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; BF16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; BF16-NEXT: # xmm0 = mem[1,0] +; BF16-NEXT: callq __truncdfbf2@PLT +; BF16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; BF16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; BF16-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; BF16-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; BF16-NEXT: vzeroupper +; BF16-NEXT: callq __truncdfbf2@PLT +; BF16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; BF16-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; BF16-NEXT: # xmm0 = mem[1,0] +; BF16-NEXT: callq __truncdfbf2@PLT +; BF16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; BF16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; BF16-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; BF16-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; BF16-NEXT: vzeroupper +; BF16-NEXT: callq __truncdfbf2@PLT +; BF16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; BF16-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload +; BF16-NEXT: # xmm0 = mem[1,0] +; BF16-NEXT: callq __truncdfbf2@PLT +; BF16-NEXT: vpextrw $0, %xmm0, %eax +; BF16-NEXT: movl %eax, (%rsp) # 4-byte Spill +; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; BF16-NEXT: vpextrw $0, %xmm0, %ebp +; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; BF16-NEXT: vpextrw $0, %xmm0, %r14d +; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; BF16-NEXT: vpextrw $0, %xmm0, %r15d +; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; BF16-NEXT: vpextrw $0, %xmm0, %r12d +; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; BF16-NEXT: vpextrw $0, %xmm0, %r13d +; BF16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; BF16-NEXT: vpextrw $0, %xmm0, %ebx +; BF16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; BF16-NEXT: # xmm0 = mem[1,0] +; BF16-NEXT: callq __truncdfbf2@PLT +; BF16-NEXT: vpextrw $0, %xmm0, %eax +; BF16-NEXT: vmovd %ebx, %xmm0 +; BF16-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; BF16-NEXT: vpinsrw $2, %r13d, %xmm0, %xmm0 +; BF16-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0 +; BF16-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0 +; BF16-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0 +; BF16-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0 +; BF16-NEXT: vpinsrw $7, (%rsp), %xmm0, %xmm0 # 4-byte Folded Reload +; BF16-NEXT: addq $184, %rsp +; BF16-NEXT: popq %rbx +; BF16-NEXT: popq %r12 +; BF16-NEXT: popq %r13 +; BF16-NEXT: popq %r14 +; BF16-NEXT: popq %r15 +; BF16-NEXT: popq %rbp +; BF16-NEXT: retq +; +; FP16-LABEL: fptrunc_v8f64: +; FP16: # %bb.0: +; FP16-NEXT: subq $184, %rsp +; FP16-NEXT: vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; FP16-NEXT: vextractf128 $1, 
%ymm0, %xmm0 +; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; FP16-NEXT: vzeroupper +; FP16-NEXT: callq __truncdfbf2@PLT +; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FP16-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FP16-NEXT: callq __truncdfbf2@PLT +; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FP16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; FP16-NEXT: # xmm0 = mem[1,0] +; FP16-NEXT: callq __truncdfbf2@PLT +; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FP16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; FP16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; FP16-NEXT: vzeroupper +; FP16-NEXT: callq __truncdfbf2@PLT +; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FP16-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; FP16-NEXT: vextractf32x4 $2, %zmm0, %xmm0 +; FP16-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill +; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; FP16-NEXT: vzeroupper +; FP16-NEXT: callq __truncdfbf2@PLT +; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FP16-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; FP16-NEXT: callq __truncdfbf2@PLT +; FP16-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; FP16-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; FP16-NEXT: vextractf32x4 $3, %zmm0, %xmm0 +; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; FP16-NEXT: vzeroupper +; FP16-NEXT: callq __truncdfbf2@PLT +; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FP16-NEXT: callq __truncdfbf2@PLT +; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; FP16-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; FP16-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload +; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; FP16-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; FP16-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] +; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; FP16-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3] +; FP16-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; FP16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; FP16-NEXT: addq $184, %rsp +; FP16-NEXT: retq ; ; AVXNC-LABEL: fptrunc_v8f64: ; AVXNC: # %bb.0: @@ -2376,74 +2643,68 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind { ; AVXNC-NEXT: pushq %r13 ; AVXNC-NEXT: pushq %r12 ; AVXNC-NEXT: pushq %rbx -; AVXNC-NEXT: subq $120, %rsp +; AVXNC-NEXT: subq $168, %rsp ; AVXNC-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVXNC-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVXNC-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0] +; AVXNC-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill +; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVXNC-NEXT: vzeroupper ; AVXNC-NEXT: callq __truncdfbf2@PLT -; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVXNC-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVXNC-NEXT: vzeroupper ; AVXNC-NEXT: callq __truncdfbf2@PLT -; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVXNC-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload ; AVXNC-NEXT: # xmm0 = mem[1,0] ; AVXNC-NEXT: callq __truncdfbf2@PLT -; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVXNC-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill ; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVXNC-NEXT: vzeroupper ; AVXNC-NEXT: callq __truncdfbf2@PLT -; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVXNC-NEXT: # xmm0 = mem[1,0] ; AVXNC-NEXT: callq __truncdfbf2@PLT -; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVXNC-NEXT: vzeroupper ; AVXNC-NEXT: callq __truncdfbf2@PLT -; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVXNC-NEXT: # xmm0 = mem[1,0] ; AVXNC-NEXT: callq __truncdfbf2@PLT -; AVXNC-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVXNC-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; AVXNC-NEXT: # xmm0 = mem[0],zero,zero,zero -; AVXNC-NEXT: vmovd %xmm0, %ebp -; AVXNC-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; AVXNC-NEXT: # xmm0 = mem[0],zero,zero,zero -; AVXNC-NEXT: vmovd %xmm0, %r14d -; AVXNC-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; AVXNC-NEXT: # xmm0 = mem[0],zero,zero,zero -; AVXNC-NEXT: vmovd %xmm0, %r15d -; AVXNC-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; AVXNC-NEXT: # xmm0 = mem[0],zero,zero,zero -; AVXNC-NEXT: vmovd %xmm0, %r12d -; AVXNC-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; AVXNC-NEXT: # xmm0 = mem[0],zero,zero,zero -; AVXNC-NEXT: vmovd %xmm0, %r13d -; AVXNC-NEXT: vmovd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload -; AVXNC-NEXT: # xmm0 = mem[0],zero,zero,zero -; AVXNC-NEXT: vmovd %xmm0, %ebx -; AVXNC-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVXNC-NEXT: vzeroupper +; AVXNC-NEXT: vpextrw $0, %xmm0, %eax +; AVXNC-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; 
AVXNC-NEXT: vpextrw $0, %xmm0, %ebp +; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVXNC-NEXT: vpextrw $0, %xmm0, %r14d +; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVXNC-NEXT: vpextrw $0, %xmm0, %r15d +; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload +; AVXNC-NEXT: vpextrw $0, %xmm0, %r12d +; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVXNC-NEXT: vpextrw $0, %xmm0, %r13d +; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVXNC-NEXT: vpextrw $0, %xmm0, %ebx +; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; AVXNC-NEXT: # xmm0 = mem[1,0] ; AVXNC-NEXT: callq __truncdfbf2@PLT -; AVXNC-NEXT: vmovd %xmm0, %eax -; AVXNC-NEXT: vmovd %eax, %xmm0 -; AVXNC-NEXT: vpinsrw $1, %ebx, %xmm0, %xmm0 +; AVXNC-NEXT: vpextrw $0, %xmm0, %eax +; AVXNC-NEXT: vmovd %ebx, %xmm0 +; AVXNC-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 ; AVXNC-NEXT: vpinsrw $2, %r13d, %xmm0, %xmm0 ; AVXNC-NEXT: vpinsrw $3, %r12d, %xmm0, %xmm0 ; AVXNC-NEXT: vpinsrw $4, %r15d, %xmm0, %xmm0 ; AVXNC-NEXT: vpinsrw $5, %r14d, %xmm0, %xmm0 ; AVXNC-NEXT: vpinsrw $6, %ebp, %xmm0, %xmm0 ; AVXNC-NEXT: vpinsrw $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload -; AVXNC-NEXT: addq $120, %rsp +; AVXNC-NEXT: addq $168, %rsp ; AVXNC-NEXT: popq %rbx ; AVXNC-NEXT: popq %r12 ; AVXNC-NEXT: popq %r13 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
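
For anyone skimming the FileCheck churn in the fptrunc tests above: the check-line changes (movd becoming pextrw $0, and 4-byte vmovss spills becoming 16-byte xmm spills) mostly reflect that the __truncsfbf2 result is now read out of xmm0 as a 16-bit element instead of via a 32-bit movd. The functions under test are plain fptrunc wrappers; a minimal sketch of what a test such as fptrunc_v4f32 presumably looks like in bfloat.ll (the actual test IR may differ):

define <4 x bfloat> @fptrunc_v4f32(<4 x float> %a) nounwind {
  ; each f32 lane is truncated to bf16; without native bf16 support this
  ; lowers to one __truncsfbf2 libcall per lane, as seen in the SSE2 checks
  %b = fptrunc <4 x float> %a to <4 x bfloat>
  ret <4 x bfloat> %b
}

On the SSE2 and AVXNC configurations each lane still goes through a separate __truncsfbf2 call, which is why the scalar call sequence repeats four times in that function.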