Author: Simon Pilgrim
Date: 2021-01-25T11:35:45Z
New Revision: f461e35cbafed593e637305e2a76822dfb7ca6c7
URL: https://github.com/llvm/llvm-project/commit/f461e35cbafed593e637305e2a76822dfb7ca6c7
DIFF: https://github.com/llvm/llvm-project/commit/f461e35cbafed593e637305e2a76822dfb7ca6c7.diff

LOG: [X86][AVX] combineX86ShuffleChain - avoid bitcasts around insert_subvector() shuffle patterns.

We allow insert_subvector lowering of all legal types, so don't always cast to the vXi64/vXf64 shuffle types - this is only necessary for X86ISD::SHUF128/X86ISD::VPERM2X128 patterns later.
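To make the reasoning concrete: the extract+widen fast paths touched below are pure data movement, so their result is independent of how the bits are typed, which is why they can operate on RootVT directly. A minimal, self-contained byte-level sketch of that behaviour (not LLVM code - Vec512 and extractAndZeroWiden are hypothetical names for illustration only):

// Byte-level model of extractSubVector + widenSubVector(Zero) on a
// 512-bit vector. Hypothetical sketch, not LLVM code: the same bytes
// move whether the vector is viewed as v8i64, v8f64, v16i32, etc.,
// so no bitcast to vXi64/vXf64 is needed around the operation.
#include <array>
#include <cstdint>
#include <cstring>
#include <iostream>

using Vec512 = std::array<uint8_t, 64>; // 512 bits as raw bytes

// Copy 128-bit lane SrcLane down to lane 0 and zero the upper lanes.
Vec512 extractAndZeroWiden(const Vec512 &V, unsigned SrcLane) {
  Vec512 R{}; // upper 384 bits zeroed by value-initialization
  std::memcpy(R.data(), V.data() + SrcLane * 16, 16);
  return R;
}

int main() {
  Vec512 V;
  for (unsigned I = 0; I != 64; ++I)
    V[I] = static_cast<uint8_t>(I);
  Vec512 R = extractAndZeroWiden(V, 2); // lane 2 -> lane 0
  std::cout << int(R[0]) << ' ' << int(R[15]) << ' ' << int(R[16]) << '\n';
  // Prints "32 47 0": lane 2's bytes moved down, upper lanes zeroed.
}

Only the later X86ISD::SHUF128 / X86ISD::VPERM2X128 lowerings care about the element type, which is why the ShuffleVT definitions sink down to those uses in the diff.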
Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0edc40683ea8..2a86e12dd53c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35357,8 +35357,6 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   // Handle 128/256-bit lane shuffles of 512-bit vectors.
   if (RootVT.is512BitVector() &&
       (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
-    MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
-
     // If the upper subvectors are zeroable, then an extract+insert is more
     // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
     // to zero the upper subvectors.
@@ -35367,12 +35365,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
         return SDValue(); // Nothing to do!
       assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&
              "Unexpected lane shuffle");
-      Res = CanonicalizeShuffleInput(ShuffleVT, V1);
-      unsigned SubIdx = BaseMask[0] * (8 / NumBaseMaskElts);
+      Res = CanonicalizeShuffleInput(RootVT, V1);
+      unsigned SubIdx = BaseMask[0] * (NumRootElts / NumBaseMaskElts);
       bool UseZero = isAnyZero(BaseMask);
       Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
-      Res = widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
-      return DAG.getBitcast(RootVT, Res);
+      return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
     }

     // Narrow shuffle mask to v4x128.
@@ -35423,6 +35420,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     if (!isAnyZero(Mask) && !PreferPERMQ) {
       if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
         return SDValue(); // Nothing to do!
+      MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
       if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG))
         return DAG.getBitcast(RootVT, V);
     }
@@ -35430,8 +35428,6 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,

   // Handle 128-bit lane shuffles of 256-bit vectors.
   if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
-    MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
-
    // If the upper half is zeroable, then an extract+insert is more optimal
    // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
    // zero the upper half.
@@ -35439,13 +35435,13 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
      if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
        return SDValue(); // Nothing to do!
      assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
-      Res = CanonicalizeShuffleInput(ShuffleVT, V1);
-      Res = extract128BitVector(Res, BaseMask[0] * 2, DAG, DL);
-      Res = widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
-                           DL, 256);
-      return DAG.getBitcast(RootVT, Res);
+      Res = CanonicalizeShuffleInput(RootVT, V1);
+      Res = extract128BitVector(Res, BaseMask[0] * (NumRootElts / 2), DAG, DL);
+      return widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
+                            DL, 256);
    }

+    MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
    if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
      return SDValue(); // Nothing to do!


_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits