Author: Simon Pilgrim Date: 2021-01-12T14:07:53Z New Revision: 2ed914cb7e9c0737bdf60a0b1fd48b6499973325
URL: https://github.com/llvm/llvm-project/commit/2ed914cb7e9c0737bdf60a0b1fd48b6499973325 DIFF: https://github.com/llvm/llvm-project/commit/2ed914cb7e9c0737bdf60a0b1fd48b6499973325.diff LOG: [X86][SSE] getFauxShuffleMask - handle PACKSS(SRAI(),SRAI()) shuffle patterns. We can't easily treat ASHR as a faux shuffle, but if it was just feeding a PACKSS then it was likely being used as sign-extension for a truncation, so just peek through and adjust the mask accordingly. Added: Modified: llvm/lib/Target/X86/X86ISelLowering.cpp llvm/test/CodeGen/X86/psubus.ll Removed: ################################################################################ diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 750c809eafca..f28e28689806 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -7685,12 +7685,26 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, // If we know input saturation won't happen (or we don't care for particular // lanes), we can treat this as a truncation shuffle. + bool Offset0 = false, Offset1 = false; if (Opcode == X86ISD::PACKSS) { if ((!(N0.isUndef() || EltsLHS.isNullValue()) && DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) || (!(N1.isUndef() || EltsRHS.isNullValue()) && DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt)) return false; + // We can't easily fold ASHR into a shuffle, but if it was feeding a + // PACKSS then it was likely being used for sign-extension for a + // truncation, so just peek through and adjust the mask accordingly. 
+ if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) && + N0.getConstantOperandAPInt(1) == NumBitsPerElt) { + Offset0 = true; + N0 = N0.getOperand(0); + } + if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) && + N1.getConstantOperandAPInt(1) == NumBitsPerElt) { + Offset1 = true; + N1 = N1.getOperand(0); + } } else { APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt); if ((!(N0.isUndef() || EltsLHS.isNullValue()) && @@ -7707,6 +7721,13 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, Ops.push_back(N1); createPackShuffleMask(VT, Mask, IsUnary); + + if (Offset0 || Offset1) { + for (int &M : Mask) + if ((Offset0 && isInRange(M, 0, NumElts)) || + (Offset1 && isInRange(M, NumElts, 2 * NumElts))) + ++M; + } return true; } case X86ISD::VTRUNC: { diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index 06240cd8bad3..351629a732c1 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -1403,11 +1403,6 @@ define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind { ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: packssdw %xmm6, %xmm5 ; SSE2-NEXT: psubusw %xmm5, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: psubus_8i32_max: @@ -1738,111 +1733,91 @@ define <16 x i16> @psubus_16i32_max(<16 x i16> %x, <16 x i32> %y) nounwind { ; SSE2-LABEL: psubus_16i32_max: ; SSE2: # %bb.0: # %vector.ph ; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: movdqa %xmm5, %xmm8 ; SSE2-NEXT: pxor %xmm9, %xmm8 ; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183,2147549183,2147549183] ; SSE2-NEXT: movdqa %xmm7, %xmm6 ; 
SSE2-NEXT: pcmpgtd %xmm8, %xmm6 ; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm6, %xmm5 ; SSE2-NEXT: pxor %xmm8, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 +; SSE2-NEXT: por %xmm5, %xmm6 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: movdqa %xmm2, %xmm10 +; SSE2-NEXT: movdqa %xmm4, %xmm10 ; SSE2-NEXT: pxor %xmm9, %xmm10 -; SSE2-NEXT: movdqa %xmm7, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: packssdw %xmm6, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm2 +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm10, %xmm5 +; SSE2-NEXT: pand %xmm5, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm5 +; SSE2-NEXT: por %xmm4, %xmm5 +; SSE2-NEXT: pslld $16, %xmm5 +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: packssdw %xmm6, %xmm5 +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: pxor %xmm9, %xmm4 ; SSE2-NEXT: movdqa %xmm7, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm6, %xmm3 ; SSE2-NEXT: pxor %xmm8, %xmm6 -; SSE2-NEXT: por %xmm5, %xmm6 +; SSE2-NEXT: por %xmm3, %xmm6 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm9 +; SSE2-NEXT: pxor %xmm2, %xmm9 ; SSE2-NEXT: pcmpgtd %xmm9, %xmm7 ; SSE2-NEXT: pxor %xmm7, %xmm8 -; SSE2-NEXT: pand %xmm4, %xmm7 +; SSE2-NEXT: pand %xmm2, %xmm7 ; SSE2-NEXT: por %xmm8, %xmm7 ; SSE2-NEXT: pslld $16, %xmm7 ; SSE2-NEXT: psrad $16, %xmm7 ; SSE2-NEXT: packssdw %xmm6, %xmm7 -; SSE2-NEXT: psubusw %xmm7, %xmm1 -; SSE2-NEXT: psubusw %xmm3, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: 
packssdw %xmm2, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm2, %xmm1 +; SSE2-NEXT: psubusw %xmm7, %xmm0 +; SSE2-NEXT: psubusw %xmm5, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: psubus_16i32_max: ; SSSE3: # %bb.0: # %vector.ph ; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm3, %xmm8 +; SSSE3-NEXT: movdqa %xmm5, %xmm8 ; SSSE3-NEXT: pxor %xmm9, %xmm8 ; SSSE3-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183,2147549183,2147549183] ; SSSE3-NEXT: movdqa %xmm7, %xmm6 ; SSSE3-NEXT: pcmpgtd %xmm8, %xmm6 ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm8 -; SSSE3-NEXT: pand %xmm6, %xmm3 +; SSSE3-NEXT: pand %xmm6, %xmm5 ; SSSE3-NEXT: pxor %xmm8, %xmm6 -; SSSE3-NEXT: por %xmm3, %xmm6 +; SSSE3-NEXT: por %xmm5, %xmm6 ; SSSE3-NEXT: pslld $16, %xmm6 ; SSSE3-NEXT: psrad $16, %xmm6 -; SSSE3-NEXT: movdqa %xmm2, %xmm10 +; SSSE3-NEXT: movdqa %xmm4, %xmm10 ; SSSE3-NEXT: pxor %xmm9, %xmm10 -; SSSE3-NEXT: movdqa %xmm7, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: pslld $16, %xmm3 -; SSSE3-NEXT: psrad $16, %xmm3 -; SSSE3-NEXT: packssdw %xmm6, %xmm3 -; SSSE3-NEXT: movdqa %xmm5, %xmm2 -; SSSE3-NEXT: pxor %xmm9, %xmm2 +; SSSE3-NEXT: movdqa %xmm7, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm10, %xmm5 +; SSSE3-NEXT: pand %xmm5, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm5 +; SSSE3-NEXT: por %xmm4, %xmm5 +; SSSE3-NEXT: pslld $16, %xmm5 +; SSSE3-NEXT: psrad $16, %xmm5 +; SSSE3-NEXT: packssdw %xmm6, %xmm5 +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: pxor %xmm9, %xmm4 ; SSSE3-NEXT: movdqa %xmm7, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm5 +; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 +; SSSE3-NEXT: pand %xmm6, %xmm3 ; 
SSSE3-NEXT: pxor %xmm8, %xmm6 -; SSSE3-NEXT: por %xmm5, %xmm6 +; SSSE3-NEXT: por %xmm3, %xmm6 ; SSSE3-NEXT: pslld $16, %xmm6 ; SSSE3-NEXT: psrad $16, %xmm6 -; SSSE3-NEXT: pxor %xmm4, %xmm9 +; SSSE3-NEXT: pxor %xmm2, %xmm9 ; SSSE3-NEXT: pcmpgtd %xmm9, %xmm7 ; SSSE3-NEXT: pxor %xmm7, %xmm8 -; SSSE3-NEXT: pand %xmm4, %xmm7 +; SSSE3-NEXT: pand %xmm2, %xmm7 ; SSSE3-NEXT: por %xmm8, %xmm7 ; SSSE3-NEXT: pslld $16, %xmm7 ; SSSE3-NEXT: psrad $16, %xmm7 ; SSSE3-NEXT: packssdw %xmm6, %xmm7 -; SSSE3-NEXT: psubusw %xmm7, %xmm1 -; SSSE3-NEXT: psubusw %xmm3, %xmm0 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: psrad $16, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: psrad $16, %xmm0 -; SSSE3-NEXT: packssdw %xmm2, %xmm0 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSSE3-NEXT: psrad $16, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: psrad $16, %xmm1 -; SSSE3-NEXT: packssdw %xmm2, %xmm1 +; SSSE3-NEXT: psubusw %xmm7, %xmm0 +; SSSE3-NEXT: psubusw %xmm5, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: psubus_16i32_max: @@ -1923,11 +1898,6 @@ define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwin ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: packssdw %xmm6, %xmm5 ; SSE2-NEXT: psubusw %xmm5, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: psubus_i16_i32_max_swapped: _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits