Author: Simon Pilgrim Date: 2021-01-15T13:55:30Z New Revision: 1dfd5c9ad8cf677fb4c9c3ccf39d7b1f20c397d3
URL: https://github.com/llvm/llvm-project/commit/1dfd5c9ad8cf677fb4c9c3ccf39d7b1f20c397d3 DIFF: https://github.com/llvm/llvm-project/commit/1dfd5c9ad8cf677fb4c9c3ccf39d7b1f20c397d3.diff LOG: [X86][AVX] combineHorizOpWithShuffle - support target shuffles in HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)) Be more aggressive on (AVX2+) folds of lane shuffles of 256-bit horizontal ops by working on target/faux shuffles as well. Added: Modified: llvm/lib/Target/X86/X86ISelLowering.cpp llvm/test/CodeGen/X86/haddsub-2.ll llvm/test/CodeGen/X86/haddsub-undef.ll Removed: ################################################################################ diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d45eb5366bfe..a84250782c19 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -43114,30 +43114,32 @@ static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)). // TODO: Relax shuffle scaling to support sub-128-bit subvector shuffles. if (VT.is256BitVector() && Subtarget.hasInt256()) { - if (auto *SVN0 = dyn_cast<ShuffleVectorSDNode>(N0)) { - if (auto *SVN1 = dyn_cast<ShuffleVectorSDNode>(N1)) { - SmallVector<int, 2> ShuffleMask0, ShuffleMask1; - if (scaleShuffleElements(SVN0->getMask(), 2, ShuffleMask0) && - scaleShuffleElements(SVN1->getMask(), 2, ShuffleMask1)) { - SDValue Op00 = SVN0->getOperand(0); - SDValue Op01 = SVN0->getOperand(1); - SDValue Op10 = SVN1->getOperand(0); - SDValue Op11 = SVN1->getOperand(1); - if ((Op00 == Op11) && (Op01 == Op10)) { - std::swap(Op10, Op11); - ShuffleVectorSDNode::commuteMask(ShuffleMask1); - } - if ((Op00 == Op10) && (Op01 == Op11)) { - SmallVector<int, 4> ShuffleMask; - ShuffleMask.append(ShuffleMask0.begin(), ShuffleMask0.end()); - ShuffleMask.append(ShuffleMask1.begin(), ShuffleMask1.end()); - SDLoc DL(N); - MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64; - SDValue Res = DAG.getNode(Opcode, DL, VT, Op00, Op01); - Res = DAG.getBitcast(ShufVT, Res); - Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask); - return DAG.getBitcast(VT, Res); - } + SmallVector<int> Mask0, Mask1; + SmallVector<SDValue> Ops0, Ops1; + if (getTargetShuffleInputs(N0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) && + getTargetShuffleInputs(N1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) && + !Ops0.empty() && !Ops1.empty()) { + SDValue Op00 = Ops0.front(), Op01 = Ops0.back(); + SDValue Op10 = Ops1.front(), Op11 = Ops1.back(); + SmallVector<int, 2> ShuffleMask0, ShuffleMask1; + if (Op00.getValueType() == SrcVT && Op01.getValueType() == SrcVT && + Op11.getValueType() == SrcVT && Op11.getValueType() == SrcVT && + scaleShuffleElements(Mask0, 2, ShuffleMask0) && + scaleShuffleElements(Mask1, 2, ShuffleMask1)) { + if ((Op00 == Op11) && (Op01 == Op10)) { + std::swap(Op10, Op11); + ShuffleVectorSDNode::commuteMask(ShuffleMask1); + } + if ((Op00 == Op10) && (Op01 == Op11)) { + SmallVector<int, 4> ShuffleMask; + ShuffleMask.append(ShuffleMask0.begin(), ShuffleMask0.end()); + ShuffleMask.append(ShuffleMask1.begin(), ShuffleMask1.end()); + SDLoc DL(N); + MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64; + SDValue Res = DAG.getNode(Opcode, DL, VT, Op00, Op01); + Res = DAG.getBitcast(ShufVT, Res); + Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask); + return DAG.getBitcast(VT, Res); } } } diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/CodeGen/X86/haddsub-2.ll index a025604f44a5..82fd7a2699a5 100644 --- a/llvm/test/CodeGen/X86/haddsub-2.ll +++ b/llvm/test/CodeGen/X86/haddsub-2.ll @@ -444,12 +444,18 @@ define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) { ; SSE-NEXT: movapd %xmm2, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: avx_vhadd_pd_test: -; AVX: # %bb.0: -; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: avx_vhadd_pd_test: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: avx_vhadd_pd_test: +; AVX2: # %bb.0: +; AVX2-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq %vecext = extractelement <4 x double> %A, i32 0 %vecext1 = extractelement <4 x double> %A, i32 1 %add = fadd double %vecext, %vecext1 @@ -477,12 +483,18 @@ define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) { ; SSE-NEXT: movapd %xmm2, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: avx_vhsub_pd_test: -; AVX: # %bb.0: -; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-NEXT: vhsubpd %ymm2, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX1-LABEL: avx_vhsub_pd_test: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vhsubpd %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: avx_vhsub_pd_test: +; AVX2: # %bb.0: +; AVX2-NEXT: vhsubpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq %vecext = extractelement <4 x double> %A, i32 0 %vecext1 = extractelement <4 x double> %A, i32 1 %sub = fsub double %vecext, %vecext1 diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll index d6ee47f75b50..d268438121ef 100644 --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -4,7 +4,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX1-FAST ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512,AVX512-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST,AVX512,AVX512-FAST ; Verify that we correctly fold horizontal binop even in the presence of UNDEFs. @@ -1190,13 +1190,20 @@ define <4 x double> @PR34724_add_v4f64_u123(<4 x double> %0, <4 x double> %1) { ; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: PR34724_add_v4f64_u123: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX-FAST-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3] -; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: PR34724_add_v4f64_u123: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-FAST-NEXT: vblendpd {{.*#+}} ymm2 = ymm0[0,1],ymm1[2,3] +; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX1-FAST-NEXT: retq +; +; AVX512-FAST-LABEL: PR34724_add_v4f64_u123: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,0,3] +; AVX512-FAST-NEXT: retq %3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 2, i32 4> %4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 3, i32 5> %5 = fadd <2 x double> %3, %4 @@ -1286,12 +1293,18 @@ define <4 x double> @PR34724_add_v4f64_01u3(<4 x double> %0, <4 x double> %1) { ; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-SLOW-NEXT: retq ; -; AVX-FAST-LABEL: PR34724_add_v4f64_01u3: -; AVX-FAST: # %bb.0: -; AVX-FAST-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] -; AVX-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] -; AVX-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 -; AVX-FAST-NEXT: retq +; AVX1-FAST-LABEL: PR34724_add_v4f64_01u3: +; AVX1-FAST: # %bb.0: +; AVX1-FAST-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX1-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3] +; AVX1-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0 +; AVX1-FAST-NEXT: retq +; +; AVX512-FAST-LABEL: PR34724_add_v4f64_01u3: +; AVX512-FAST: # %bb.0: +; AVX512-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0 +; AVX512-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,1,3] +; AVX512-FAST-NEXT: retq %3 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 0, i32 2> %4 = shufflevector <4 x double> %0, <4 x double> undef, <2 x i32> <i32 1, i32 3> %5 = fadd <2 x double> %3, %4 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits