Possibly. Lane splits make it a lot uglier than SSE2. I'm also somewhat concerned about the complete loss of size checking on the immediate. For instance, 256 now aliases to 0 in the SSE2 case. I hadn't thought about that when I reviewed your patch.
On Sun, Feb 15, 2015 at 4:56 PM, Filipe Cabecinhas <[email protected]> wrote: > Hi Craig, > > Could this be done in the headers, like the 128-bit ones? > That way we could get rid of the builtins, no? > > Thanks, > > Filipe > > On Sun, Feb 15, 2015 at 4:42 PM, Craig Topper <[email protected]> > wrote: > >> Author: ctopper >> Date: Sun Feb 15 18:42:49 2015 >> New Revision: 229348 >> >> URL: http://llvm.org/viewvc/llvm-project?rev=229348&view=rev >> Log: >> [X86] Teach clang to lower __builtin_ia32_psrldqi256 and >> __builtin_ia32_pslldqi256 to vector shuffles the backend recognizes. This >> is a step towards removing the corresponding intrinsics from the backend. >> >> Modified: >> cfe/trunk/lib/CodeGen/CGBuiltin.cpp >> cfe/trunk/test/CodeGen/avx2-builtins.c >> >> Modified: cfe/trunk/lib/CodeGen/CGBuiltin.cpp >> URL: >> http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGBuiltin.cpp?rev=229348&r1=229347&r2=229348&view=diff >> >> ============================================================================== >> --- cfe/trunk/lib/CodeGen/CGBuiltin.cpp (original) >> +++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp Sun Feb 15 18:42:49 2015 >> @@ -6025,6 +6025,60 @@ Value *CodeGenFunction::EmitX86BuiltinEx >> // If palignr is shifting the pair of vectors more than 32 bytes, >> emit zero. >> return llvm::Constant::getNullValue(ConvertType(E->getType())); >> } >> + case X86::BI__builtin_ia32_pslldqi256: { >> + // Shift value is in bits so divide by 8. >> + unsigned shiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() >> >> 3; >> + >> + // If pslldq is shifting the vector more than 15 bytes, emit zero. 
>> + if (shiftVal >= 16) >> + return llvm::Constant::getNullValue(ConvertType(E->getType())); >> + >> + SmallVector<llvm::Constant*, 32> Indices; >> + // 256-bit pslldq operates on 128-bit lanes so we need to handle that >> + for (unsigned l = 0; l != 32; l += 16) { >> + for (unsigned i = 0; i != 16; ++i) { >> + unsigned Idx = 32 + i - shiftVal; >> + if (Idx < 32) Idx -= 16; // end of lane, switch operand. >> + Indices.push_back(llvm::ConstantInt::get(Int32Ty, Idx + l)); >> + } >> + } >> + >> + llvm::Type *VecTy = llvm::VectorType::get(Int8Ty, 32); >> + Ops[0] = Builder.CreateBitCast(Ops[0], VecTy, "cast"); >> + Value *Zero = llvm::Constant::getNullValue(VecTy); >> + >> + Value *SV = llvm::ConstantVector::get(Indices); >> + SV = Builder.CreateShuffleVector(Zero, Ops[0], SV, "pslldq"); >> + llvm::Type *ResultType = ConvertType(E->getType()); >> + return Builder.CreateBitCast(SV, ResultType, "cast"); >> + } >> + case X86::BI__builtin_ia32_psrldqi256: { >> + // Shift value is in bits so divide by 8. >> + unsigned shiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() >> >> 3; >> + >> + // If psrldq is shifting the vector more than 15 bytes, emit zero. >> + if (shiftVal >= 16) >> + return llvm::Constant::getNullValue(ConvertType(E->getType())); >> + >> + SmallVector<llvm::Constant*, 32> Indices; >> + // 256-bit psrldq operates on 128-bit lanes so we need to handle that >> + for (unsigned l = 0; l != 32; l += 16) { >> + for (unsigned i = 0; i != 16; ++i) { >> + unsigned Idx = i + shiftVal; >> + if (Idx >= 16) Idx += 16; // end of lane, switch operand. 
>> + Indices.push_back(llvm::ConstantInt::get(Int32Ty, Idx + l)); >> + } >> + } >> + >> + llvm::Type *VecTy = llvm::VectorType::get(Int8Ty, 32); >> + Ops[0] = Builder.CreateBitCast(Ops[0], VecTy, "cast"); >> + Value *Zero = llvm::Constant::getNullValue(VecTy); >> + >> + Value *SV = llvm::ConstantVector::get(Indices); >> + SV = Builder.CreateShuffleVector(Ops[0], Zero, SV, "psrldq"); >> + llvm::Type *ResultType = ConvertType(E->getType()); >> + return Builder.CreateBitCast(SV, ResultType, "cast"); >> + } >> case X86::BI__builtin_ia32_movntps: >> case X86::BI__builtin_ia32_movntps256: >> case X86::BI__builtin_ia32_movntpd: >> >> Modified: cfe/trunk/test/CodeGen/avx2-builtins.c >> URL: >> http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/avx2-builtins.c?rev=229348&r1=229347&r2=229348&view=diff >> >> ============================================================================== >> --- cfe/trunk/test/CodeGen/avx2-builtins.c (original) >> +++ cfe/trunk/test/CodeGen/avx2-builtins.c Sun Feb 15 18:42:49 2015 >> @@ -462,7 +462,7 @@ __m256i test_mm256_sign_epi32(__m256i a, >> } >> >> __m256i test_mm256_slli_si256(__m256i a) { >> - // CHECK: @llvm.x86.avx2.psll.dq >> + // CHECK: shufflevector <32 x i8> zeroinitializer, <32 x i8> %{{.*}}, >> <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, >> i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 >> 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, >> i32 56, i32 57, i32 58, i32 59, i32 60> >> return _mm256_slli_si256(a, 3); >> } >> >> @@ -517,7 +517,7 @@ __m256i test_mm256_sra_epi32(__m256i a, >> } >> >> __m256i test_mm256_srli_si256(__m256i a) { >> - // CHECK: @llvm.x86.avx2.psrl.dq >> + // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> zeroinitializer, >> <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 >> 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, >> i32 21, i32 22, i32 23, i32 24, 
i32 25, i32 26, i32 27, i32 28, i32 29, i32 >> 30, i32 31, i32 48, i32 49, i32 50> >> return _mm256_srli_si256(a, 3); >> } >> >> >> >> _______________________________________________ >> cfe-commits mailing list >> [email protected] >> http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits >> > > -- ~Craig
_______________________________________________ cfe-commits mailing list [email protected] http://lists.cs.uiuc.edu/mailman/listinfo/cfe-commits
