https://github.com/AkashDeoNU created https://github.com/llvm/llvm-project/pull/190549
This fixes #161340 @RKSimon. Sorry for the delay. Please let me know about any fixes I can make. >From e6add1db145c37c22b70acad557a7d03507456e1 Mon Sep 17 00:00:00 2001 From: Akash Deo <[email protected]> Date: Sun, 5 Apr 2026 15:17:30 -0500 Subject: [PATCH] [clang][x86] Add constexpr support for VNNI intrinsics --- clang/include/clang/Basic/BuiltinsX86.td | 24 +- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 91 ++++++ clang/lib/AST/ExprConstant.cpp | 80 +++++ clang/lib/Headers/avx512vlvnniintrin.h | 90 +++--- clang/lib/Headers/avx512vnniintrin.h | 73 ++--- clang/lib/Headers/avxvnniintrin.h | 48 +-- .../test/CodeGen/X86/avx512vlvnni-builtins.c | 298 ++++++++++++++++++ clang/test/CodeGen/X86/avx512vnni-builtins.c | 156 +++++++++ clang/test/CodeGen/X86/avxvnni-builtins.c | 246 +++++++++++++++ 9 files changed, 986 insertions(+), 120 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 0cab8c77d465d..342a23e1f2aab 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -1075,51 +1075,51 @@ let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVecto def extractf32x4_mask : X86Builtin<"_Vector<4, float>(_Vector<16, float>, _Constant int, _Vector<4, float>, unsigned char)">; } -let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def vpdpbusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, unsigned char>, _Vector<16, char>)">; } -let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def vpdpbusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, unsigned char>, _Vector<32, char>)">; 
} -let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512vnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def vpdpbusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, unsigned char>, _Vector<64, char>)">; } -let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def vpdpbusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<16, unsigned char>, _Vector<16, char>)">; } -let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def vpdpbusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<32, unsigned char>, _Vector<32, char>)">; } -let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512vnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def vpdpbusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<64, unsigned char>, _Vector<64, char>)">; } -let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def vpdpwssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, short>)">; } -let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def vpdpwssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, short>)">; } -let Features = 
"avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512vnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def vpdpwssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, short>)">; } -let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def vpdpwssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, short>)">; } -let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def vpdpwssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, short>)">; } -let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512vnni", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def vpdpwssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, short>)">; } diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index e7b3ef6ce1510..03c9add584658 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -4189,6 +4189,53 @@ static bool interp__builtin_ia32_gfni_mul(InterpState &S, CodePtr OpPC, return true; } +static bool interp__builtin_ia32_vpdp(InterpState &S, CodePtr OpPC, + const CallExpr *Call, bool IsDottingWord, + bool IsSaturating) { + const auto *SrcVecT = Call->getArg(0)->getType()->castAs<VectorType>(); + const auto *OpAVecT = Call->getArg(1)->getType()->castAs<VectorType>(); + const auto *OpBVecT = Call->getArg(2)->getType()->castAs<VectorType>(); + + PrimType SrcElemT = 
*S.getContext().classify(SrcVecT->getElementType()); + PrimType OpAElemT = *S.getContext().classify(OpAVecT->getElementType()); + PrimType OpBElemT = *S.getContext().classify(OpBVecT->getElementType()); + + unsigned NumElements = SrcVecT->getNumElements(); + unsigned Iters = IsDottingWord ? 2 : 4; + + const Pointer &OpBPtr = S.Stk.pop<Pointer>(); + const Pointer &OpAPtr = S.Stk.pop<Pointer>(); + const Pointer &SrcPtr = S.Stk.pop<Pointer>(); + const Pointer &Dst = S.Stk.peek<Pointer>(); + + for (unsigned I = 0; I < NumElements; ++I) { + APSInt Acc; + INT_TYPE_SWITCH_NO_BOOL(SrcElemT, { Acc = SrcPtr.elem<T>(I).toAPSInt(); }); + Acc = Acc.sext(64); + for (unsigned J = 0; J < Iters; ++J) { + APSInt OpA, OpB; + INT_TYPE_SWITCH_NO_BOOL( + OpAElemT, { OpA = OpAPtr.elem<T>(Iters * I + J).toAPSInt(); }); + INT_TYPE_SWITCH_NO_BOOL( + OpBElemT, { OpB = OpBPtr.elem<T>(Iters * I + J).toAPSInt(); }); + if (IsDottingWord) { + OpA = APSInt(OpA.sext(64), false); + } else { + OpA = APSInt(OpA.zext(64), false); + } + OpB = APSInt(OpB.sext(64), false); + Acc += OpA * OpB; + } + if (IsSaturating) { + Acc = APSInt(Acc.truncSSat(32), false); + } + INT_TYPE_SWITCH_NO_BOOL(SrcElemT, + { Dst.elem<T>(I) = static_cast<T>(Acc); }); + } + Dst.initializeAllElements(); + return true; +} + bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, uint32_t BuiltinID) { if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID)) @@ -6049,6 +6096,50 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, return EvalScalarMinMaxFp(A, B, RoundingMode, /*IsMin=*/false); }, /*IsScalar=*/true); + case X86::BI__builtin_ia32_vpdpwssd128: + case X86::BI__builtin_ia32_vpdpwssd256: + case X86::BI__builtin_ia32_vpdpwssd512: + case X86::BI__builtin_ia32_vpdpwssds128: + case X86::BI__builtin_ia32_vpdpwssds256: + case X86::BI__builtin_ia32_vpdpwssds512: + case X86::BI__builtin_ia32_vpdpbusds128: + case X86::BI__builtin_ia32_vpdpbusds256: + case 
X86::BI__builtin_ia32_vpdpbusds512: + case X86::BI__builtin_ia32_vpdpbusd128: + case X86::BI__builtin_ia32_vpdpbusd256: + case X86::BI__builtin_ia32_vpdpbusd512: { + unsigned BuiltinID = Call->getBuiltinCallee(); + bool IsDottingWord; + bool IsSaturating; + switch (BuiltinID) { + case X86::BI__builtin_ia32_vpdpwssd128: + case X86::BI__builtin_ia32_vpdpwssd256: + case X86::BI__builtin_ia32_vpdpwssd512: + IsDottingWord = true; + IsSaturating = false; + break; + case X86::BI__builtin_ia32_vpdpwssds128: + case X86::BI__builtin_ia32_vpdpwssds256: + case X86::BI__builtin_ia32_vpdpwssds512: + IsDottingWord = true; + IsSaturating = true; + break; + case X86::BI__builtin_ia32_vpdpbusds128: + case X86::BI__builtin_ia32_vpdpbusds256: + case X86::BI__builtin_ia32_vpdpbusds512: + IsDottingWord = false; + IsSaturating = true; + break; + case X86::BI__builtin_ia32_vpdpbusd128: + case X86::BI__builtin_ia32_vpdpbusd256: + case X86::BI__builtin_ia32_vpdpbusd512: + IsDottingWord = false; + IsSaturating = false; + break; + } + return interp__builtin_ia32_vpdp(S, OpPC, Call, IsDottingWord, + IsSaturating); + } default: S.FFDiag(S.Current->getLocation(OpPC), diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 4f45fa728c605..ecbdb8cac301d 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -14634,6 +14634,86 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return false; return Success(R, E); } + case X86::BI__builtin_ia32_vpdpwssd128: + case X86::BI__builtin_ia32_vpdpwssd256: + case X86::BI__builtin_ia32_vpdpwssd512: + case X86::BI__builtin_ia32_vpdpwssds128: + case X86::BI__builtin_ia32_vpdpwssds256: + case X86::BI__builtin_ia32_vpdpwssds512: + case X86::BI__builtin_ia32_vpdpbusds128: + case X86::BI__builtin_ia32_vpdpbusds256: + case X86::BI__builtin_ia32_vpdpbusds512: + case X86::BI__builtin_ia32_vpdpbusd128: + case X86::BI__builtin_ia32_vpdpbusd256: + case X86::BI__builtin_ia32_vpdpbusd512: { + unsigned 
BuiltinID = E->getBuiltinCallee(); + bool IsDottingWord = false; + bool IsSaturating = false; + switch (BuiltinID) { + case X86::BI__builtin_ia32_vpdpwssd128: + case X86::BI__builtin_ia32_vpdpwssd256: + case X86::BI__builtin_ia32_vpdpwssd512: + IsDottingWord = true; + IsSaturating = false; + break; + case X86::BI__builtin_ia32_vpdpwssds128: + case X86::BI__builtin_ia32_vpdpwssds256: + case X86::BI__builtin_ia32_vpdpwssds512: + IsDottingWord = true; + IsSaturating = true; + break; + case X86::BI__builtin_ia32_vpdpbusds128: + case X86::BI__builtin_ia32_vpdpbusds256: + case X86::BI__builtin_ia32_vpdpbusds512: + IsDottingWord = false; + IsSaturating = true; + break; + case X86::BI__builtin_ia32_vpdpbusd128: + case X86::BI__builtin_ia32_vpdpbusd256: + case X86::BI__builtin_ia32_vpdpbusd512: + IsDottingWord = false; + IsSaturating = false; + break; + } + + APValue Source, OperandA, OperandB; + if (!EvaluateAsRValue(Info, E->getArg(0), Source) || + !EvaluateAsRValue(Info, E->getArg(1), OperandA) || + !EvaluateAsRValue(Info, E->getArg(2), OperandB)) { + return false; + } + + unsigned NumElements = Source.getVectorLength(); + + SmallVector<APValue, 16> Result; + Result.reserve(NumElements); + unsigned Iters = IsDottingWord ? 
2 : 4; + for (unsigned I = 0; I < NumElements; ++I) { + APSInt DotProduct = Source.getVectorElt(I).getInt(); + DotProduct = DotProduct.sext(64); + for (unsigned J = 0; J < Iters; ++J) { + APSInt OpA; + if (IsDottingWord) { + OpA = APSInt(OperandA.getVectorElt(Iters * I + J).getInt().sext(64), + false); + } else { + OpA = APSInt(OperandA.getVectorElt(Iters * I + J).getInt().zext(64), + false); + } + APSInt OpB = APSInt( + OperandB.getVectorElt(Iters * I + J).getInt().sext(64), false); + DotProduct += OpA * OpB; + } + if (IsSaturating) { + DotProduct = APSInt(DotProduct.truncSSat(32), false); + } else { + DotProduct = APSInt(DotProduct.trunc(32), false); + } + Result.push_back(APValue(DotProduct)); + } + + return Success(APValue(Result.data(), Result.size()), E); + } } } diff --git a/clang/lib/Headers/avx512vlvnniintrin.h b/clang/lib/Headers/avx512vlvnniintrin.h index 4b8a199af32e5..053807032fcb3 100644 --- a/clang/lib/Headers/avx512vlvnniintrin.h +++ b/clang/lib/Headers/avx512vlvnniintrin.h @@ -24,6 +24,14 @@ __target__("avx512vl,avx512vnni"), \ __min_vector_width__(256))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#else +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#endif + /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed /// 16-bit results. 
Sum these 4 results with the corresponding 32-bit integer @@ -179,129 +187,115 @@ #define _mm_dpwssds_epi32(S, A, B) \ ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v8hi)(A), (__v8hi)(B))) -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256(__U, (__v8si)_mm256_dpbusd_epi32(__S, __A, __B), (__v8si)__S); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256(__U, (__v8si)_mm256_dpbusd_epi32(__S, __A, __B), (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256(__U, (__v8si)_mm256_dpbusds_epi32(__S, __A, __B), (__v8si)__S); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_selectd_256(__U, (__v8si)_mm256_dpbusds_epi32(__S, __A, __B), (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_dpwssd_epi32(__m256i __S, 
__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256(__U, (__v8si)_mm256_dpwssd_epi32(__S, __A, __B), (__v8si)__S); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256(__U, (__v8si)_mm256_dpwssd_epi32(__S, __A, __B), (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256(__U, (__v8si)_mm256_dpwssds_epi32(__S, __A, __B), (__v8si)__S); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_selectd_256(__U, (__v8si)_mm256_dpwssds_epi32(__S, __A, __B), (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128(__U, (__v4si)_mm_dpbusd_epi32(__S, __A, __B), (__v4si)__S); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { return 
(__m128i)__builtin_ia32_selectd_128(__U, (__v4si)_mm_dpbusd_epi32(__S, __A, __B), (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128(__U, (__v4si)_mm_dpbusds_epi32(__S, __A, __B), (__v4si)__S); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128(__U, (__v4si)_mm_dpbusds_epi32(__S, __A, __B), (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128(__U, (__v4si)_mm_dpwssd_epi32(__S, __A, __B), (__v4si)__S); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128(__U, (__v4si)_mm_dpwssd_epi32(__S, __A, __B), (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128(__U, (__v4si)_mm_dpwssds_epi32(__S, __A, __B), 
(__v4si)__S); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128(__U, (__v4si)_mm_dpwssds_epi32(__S, __A, __B), (__v4si)_mm_setzero_si128()); diff --git a/clang/lib/Headers/avx512vnniintrin.h b/clang/lib/Headers/avx512vnniintrin.h index 2ce88efe4a04f..1aa431ed446b2 100644 --- a/clang/lib/Headers/avx512vnniintrin.h +++ b/clang/lib/Headers/avx512vnniintrin.h @@ -19,98 +19,99 @@ __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni"), \ __min_vector_width__(512))) -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B) -{ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_vpdpbusd512((__v16si)__S, (__v64qu)__A, (__v64qi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectd_512(__U, (__v16si)_mm512_dpbusd_epi32(__S, __A, __B), (__v16si)__S); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_selectd_512(__U, (__v16si)_mm512_dpbusd_epi32(__S, __A, __B), 
(__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_vpdpbusds512((__v16si)__S, (__v64qu)__A, (__v64qi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_selectd_512(__U, (__v16si)_mm512_dpbusds_epi32(__S, __A, __B), (__v16si)__S); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_selectd_512(__U, (__v16si)_mm512_dpbusds_epi32(__S, __A, __B), (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v32hi)__A, (__v32hi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectd_512(__U, (__v16si)_mm512_dpwssd_epi32(__S, __A, __B), (__v16si)__S); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) -{ +static __inline__ __m512i 
__DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_selectd_512(__U, (__v16si)_mm512_dpwssd_epi32(__S, __A, __B), (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v32hi)__A, (__v32hi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_selectd_512(__U, (__v16si)_mm512_dpwssds_epi32(__S, __A, __B), (__v16si)__S); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_selectd_512(__U, (__v16si)_mm512_dpwssds_epi32(__S, __A, __B), (__v16si)_mm512_setzero_si512()); } #undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_CONSTEXPR #endif diff --git a/clang/lib/Headers/avxvnniintrin.h b/clang/lib/Headers/avxvnniintrin.h index 1d2e8c906effc..ee82676fcb392 100644 --- a/clang/lib/Headers/avxvnniintrin.h +++ b/clang/lib/Headers/avxvnniintrin.h @@ -43,6 +43,14 @@ #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256))) #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS256_CONSTEXPR 
__DEFAULT_FN_ATTRS256 constexpr +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#else +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#endif + /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with /// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed /// 16-bit results. Sum these 4 results with the corresponding 32-bit integer @@ -60,9 +68,8 @@ /// ENDFOR /// DST[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v32qu)__A, (__v32qi)__B); } @@ -84,9 +91,8 @@ _mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) /// ENDFOR /// DST[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v32qu)__A, (__v32qi)__B); } @@ -106,9 +112,8 @@ _mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) /// ENDFOR /// DST[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v16hi)__A, (__v16hi)__B); } @@ -128,9 +133,8 @@ _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) /// ENDFOR /// DST[MAX:256] := 0 /// \endcode -static __inline__ __m256i __DEFAULT_FN_ATTRS256 
-_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v16hi)__A, (__v16hi)__B); } @@ -152,9 +156,8 @@ _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) /// ENDFOR /// DST[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v16qu)__A, (__v16qi)__B); } @@ -176,9 +179,8 @@ _mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) /// ENDFOR /// DST[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v16qu)__A, (__v16qi)__B); } @@ -198,9 +200,8 @@ _mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) /// ENDFOR /// DST[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v8hi)__A, (__v8hi)__B); } @@ -220,9 +221,8 @@ _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) /// ENDFOR /// DST[MAX:128] := 0 /// \endcode -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { 
return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v8hi)__A, (__v8hi)__B); } diff --git a/clang/test/CodeGen/X86/avx512vlvnni-builtins.c b/clang/test/CodeGen/X86/avx512vlvnni-builtins.c index 11dbd717a9f77..8bdbdf1ca94a0 100644 --- a/clang/test/CodeGen/X86/avx512vlvnni-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlvnni-builtins.c @@ -3,7 +3,13 @@ // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vnni -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vnni -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vnni -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vnni -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vnni -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vnni -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s + #include <immintrin.h> +#include "builtin_test_helpers.h" __m256i test_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_dpbusd_epi32 @@ -11,6 +17,13 @@ __m256i test_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, 
__m256i __A, __m // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpbusd_epi32(__S, __U, __A, __B); } +TEST_CONSTEXPR(match_v8si( + _mm256_mask_dpbusd_epi32( + (__m256i)(__v8si){100, 200, 300, 400, 500, 600, 700, 800}, + (__mmask8)0x55, + (__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 104, 200, 304, 400, 504, 600, 704, 800)); __m256i test_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_dpbusd_epi32 @@ -18,12 +31,43 @@ __m256i test_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_dpbusd_epi32(__U, __S, __A, __B); } +TEST_CONSTEXPR(match_v8si( + _mm256_maskz_dpbusd_epi32( + (__mmask8)0x0F, + (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}, + (__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 4, 4, 4, 4, 0, 0, 0, 0)); __m256i test_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpbusd_epi32 // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) return _mm256_dpbusd_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v8si( + _mm256_dpbusd_epi32( + ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}), + ((__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + ((__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})), + 4, 4, 4, 4, 4, 4, 4, 4)); +TEST_CONSTEXPR(match_v8si( + _mm256_dpbusd_epi32( + ((__m256i)(__v8si){10, 10, 10, 10, 10, 10, 10, 10}), + ((__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 
((__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})), + 14, 14, 14, 14, 14, 14, 14, 14)); +TEST_CONSTEXPR(match_v8si( + _mm256_dpbusd_epi32( + ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}), + ((__m256i)(__v32qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255}), + ((__m256i)(__v32qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1})), + -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020)); +TEST_CONSTEXPR(match_v8si( + _mm256_dpbusd_epi32( + ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}), + ((__m256i)(__v32qu){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0}), + ((__m256i)(__v32qi){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0})), + -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1)); __m256i test_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_dpbusds_epi32 @@ -31,6 +75,13 @@ __m256i test_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpbusds_epi32(__S, __U, __A, __B); } +TEST_CONSTEXPR(match_v8si( + _mm256_mask_dpbusds_epi32( + (__m256i)(__v8si){100, 200, 300, 400, 500, 600, 700, 800}, + (__mmask8)0xAA, + (__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 100, 204, 300, 404, 500, 604, 700, 804)); __m256i test_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_dpbusds_epi32 @@ -38,12 +89,37 @@ __m256i test_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, _ // 
CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_dpbusds_epi32(__U, __S, __A, __B); } +TEST_CONSTEXPR(match_v8si( + _mm256_maskz_dpbusds_epi32( + (__mmask8)0xFF, + (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}, + (__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 4, 4, 4, 4, 4, 4, 4, 4)); __m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpbusds_epi32 // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) return _mm256_dpbusds_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v8si( + _mm256_dpbusds_epi32( + ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}), + ((__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + ((__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})), + 4, 4, 4, 4, 4, 4, 4, 4)); +TEST_CONSTEXPR(match_v8si( + _mm256_dpbusds_epi32( + ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}), + ((__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + ((__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})), + 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647)); +TEST_CONSTEXPR(match_v8si( + _mm256_dpbusds_epi32( + ((__m256i)(__v8si){-2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1}), + ((__m256i)(__v32qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255}), + ((__m256i)(__v32qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1})), + -2147483647-1, -2147483647-1, -2147483647-1, 
-2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1)); __m256i test_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_dpwssd_epi32 @@ -51,6 +127,13 @@ __m256i test_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpwssd_epi32(__S, __U, __A, __B); } +TEST_CONSTEXPR(match_v8si( + _mm256_mask_dpwssd_epi32( + (__m256i)(__v8si){100, 200, 300, 400, 500, 600, 700, 800}, + (__mmask8)0xF0, + (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 100, 200, 300, 400, 502, 602, 702, 802)); __m256i test_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_dpwssd_epi32 @@ -58,12 +141,49 @@ __m256i test_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_dpwssd_epi32(__U, __S, __A, __B); } +TEST_CONSTEXPR(match_v8si( + _mm256_maskz_dpwssd_epi32( + (__mmask8)0x0F, + (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}, + (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 2, 2, 2, 2, 0, 0, 0, 0)); __m256i test_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpwssd_epi32 // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwssd_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v8si( + _mm256_dpwssd_epi32( + ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}), + ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})), + 2, 2, 2, 2, 2, 2, 2, 2)); +TEST_CONSTEXPR(match_v8si( + _mm256_dpwssd_epi32( + ((__m256i)(__v8si){10, 10, 10, 10, 10, 10, 10, 10}), + 
((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})), + 12, 12, 12, 12, 12, 12, 12, 12)); +TEST_CONSTEXPR(match_v8si( + _mm256_dpwssd_epi32( + ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}), + ((__m256i)(__v16hi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}), + ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})), + -2, -2, -2, -2, -2, -2, -2, -2)); +TEST_CONSTEXPR(match_v8si( + _mm256_dpwssd_epi32( + ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}), + ((__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}), + ((__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767})), + 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578)); +TEST_CONSTEXPR(match_v8si( + _mm256_dpwssd_epi32( + ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}), + ((__m256i)(__v16hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0}), + ((__m256i)(__v16hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0})), + -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1)); __m256i test_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_dpwssds_epi32 @@ -71,6 +191,13 @@ __m256i test_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpwssds_epi32(__S, __U, __A, __B); } +TEST_CONSTEXPR(match_v8si( + _mm256_mask_dpwssds_epi32( + (__m256i)(__v8si){100, 200, 300, 400, 500, 600, 700, 800}, + (__mmask8)0xAA, + (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 100, 202, 300, 402, 500, 602, 700, 802)); __m256i test_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i 
__A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_dpwssds_epi32 @@ -78,12 +205,37 @@ __m256i test_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, _ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_dpwssds_epi32(__U, __S, __A, __B); } +TEST_CONSTEXPR(match_v8si( + _mm256_maskz_dpwssds_epi32( + (__mmask8)0xFF, + (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}, + (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 2, 2, 2, 2, 2, 2, 2, 2)); __m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpwssds_epi32 // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwssds_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v8si( + _mm256_dpwssds_epi32( + ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}), + ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})), + 2, 2, 2, 2, 2, 2, 2, 2)); +TEST_CONSTEXPR(match_v8si( + _mm256_dpwssds_epi32( + ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}), + ((__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}), + ((__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767})), + 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647)); +TEST_CONSTEXPR(match_v8si( + _mm256_dpwssds_epi32( + ((__m256i)(__v8si){-2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1}), + ((__m256i)(__v16hi){-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768}), + 
((__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767})), + -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1)); __m128i test_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_dpbusd_epi32 @@ -91,6 +243,13 @@ __m128i test_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_dpbusd_epi32(__S, __U, __A, __B); } +TEST_CONSTEXPR(match_v4si( + _mm_mask_dpbusd_epi32( + (__m128i)(__v4si){100, 200, 300, 400}, + (__mmask8)0x05, + (__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 104, 200, 304, 400)); __m128i test_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_dpbusd_epi32 @@ -98,12 +257,43 @@ __m128i test_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m12 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpbusd_epi32(__U, __S, __A, __B); } +TEST_CONSTEXPR(match_v4si( + _mm_maskz_dpbusd_epi32( + (__mmask8)0x03, + (__m128i)(__v4si){0, 0, 0, 0}, + (__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 4, 4, 0, 0)); __m128i test_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpbusd_epi32 // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) return _mm_dpbusd_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v4si( + _mm_dpbusd_epi32( + ((__m128i)(__v4si){0, 0, 0, 0}), + ((__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + ((__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})), + 4, 4, 4, 4)); +TEST_CONSTEXPR(match_v4si( + _mm_dpbusd_epi32( + ((__m128i)(__v4si){10, 10, 10, 
10}), + ((__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + ((__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})), + 14, 14, 14, 14)); +TEST_CONSTEXPR(match_v4si( + _mm_dpbusd_epi32( + ((__m128i)(__v4si){0, 0, 0, 0}), + ((__m128i)(__v16qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255}), + ((__m128i)(__v16qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1})), + -1020, -1020, -1020, -1020)); +TEST_CONSTEXPR(match_v4si( + _mm_dpbusd_epi32( + ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}), + ((__m128i)(__v16qu){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0}), + ((__m128i)(__v16qi){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0})), + -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1)); __m128i test_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_dpbusds_epi32 @@ -111,6 +301,13 @@ __m128i test_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m12 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_dpbusds_epi32(__S, __U, __A, __B); } +TEST_CONSTEXPR(match_v4si( + _mm_mask_dpbusds_epi32( + (__m128i)(__v4si){100, 200, 300, 400}, + (__mmask8)0x0A, + (__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 100, 204, 300, 404)); __m128i test_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_dpbusds_epi32 @@ -118,12 +315,37 @@ __m128i test_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m1 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpbusds_epi32(__U, __S, __A, __B); } +TEST_CONSTEXPR(match_v4si( + _mm_maskz_dpbusds_epi32( + (__mmask8)0x0F, + (__m128i)(__v4si){0, 0, 0, 0}, + (__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 4, 4, 4, 4)); __m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) { // 
CHECK-LABEL: test_mm_dpbusds_epi32 // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) return _mm_dpbusds_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v4si( + _mm_dpbusds_epi32( + ((__m128i)(__v4si){0, 0, 0, 0}), + ((__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + ((__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})), + 4, 4, 4, 4)); +TEST_CONSTEXPR(match_v4si( + _mm_dpbusds_epi32( + ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}), + ((__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + ((__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})), + 2147483647, 2147483647, 2147483647, 2147483647)); +TEST_CONSTEXPR(match_v4si( + _mm_dpbusds_epi32( + ((__m128i)(__v4si){-2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1}), + ((__m128i)(__v16qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255}), + ((__m128i)(__v16qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1})), + -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1)); __m128i test_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_dpwssd_epi32 @@ -131,6 +353,13 @@ __m128i test_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_dpwssd_epi32(__S, __U, __A, __B); } +TEST_CONSTEXPR(match_v4si( + _mm_mask_dpwssd_epi32( + (__m128i)(__v4si){100, 200, 300, 400}, + (__mmask8)0x05, + (__m128i)(__v8hi){1,1,1,1,1,1,1,1}, + (__m128i)(__v8hi){1,1,1,1,1,1,1,1}), + 102, 200, 302, 400)); __m128i test_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_dpwssd_epi32 @@ -138,12 +367,49 @@ __m128i test_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m12 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpwssd_epi32(__U, __S, __A, __B); } 
+TEST_CONSTEXPR(match_v4si( + _mm_maskz_dpwssd_epi32( + (__mmask8)0x03, + (__m128i)(__v4si){0, 0, 0, 0}, + (__m128i)(__v8hi){1,1,1,1,1,1,1,1}, + (__m128i)(__v8hi){1,1,1,1,1,1,1,1}), + 2, 2, 0, 0)); __m128i test_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpwssd_epi32 // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwssd_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v4si( + _mm_dpwssd_epi32( + ((__m128i)(__v4si){0, 0, 0, 0}), + ((__m128i)(__v8hi){1,1,1,1,1,1,1,1}), + ((__m128i)(__v8hi){1,1,1,1,1,1,1,1})), + 2, 2, 2, 2)); +TEST_CONSTEXPR(match_v4si( + _mm_dpwssd_epi32( + ((__m128i)(__v4si){10, 10, 10, 10}), + ((__m128i)(__v8hi){1,1,1,1,1,1,1,1}), + ((__m128i)(__v8hi){1,1,1,1,1,1,1,1})), + 12, 12, 12, 12)); +TEST_CONSTEXPR(match_v4si( + _mm_dpwssd_epi32( + ((__m128i)(__v4si){0, 0, 0, 0}), + ((__m128i)(__v8hi){-1,-1,-1,-1,-1,-1,-1,-1}), + ((__m128i)(__v8hi){1,1,1,1,1,1,1,1})), + -2, -2, -2, -2)); +TEST_CONSTEXPR(match_v4si( + _mm_dpwssd_epi32( + ((__m128i)(__v4si){0, 0, 0, 0}), + ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767}), + ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767})), + 2147352578, 2147352578, 2147352578, 2147352578)); +TEST_CONSTEXPR(match_v4si( + _mm_dpwssd_epi32( + ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}), + ((__m128i)(__v8hi){1,0,1,0,1,0,1,0}), + ((__m128i)(__v8hi){1,0,1,0,1,0,1,0})), + -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1)); __m128i test_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_dpwssds_epi32 @@ -151,6 +417,13 @@ __m128i test_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m12 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_dpwssds_epi32(__S, __U, __A, __B); } +TEST_CONSTEXPR(match_v4si( + _mm_mask_dpwssds_epi32( + 
(__m128i)(__v4si){100, 200, 300, 400}, + (__mmask8)0x0A, + (__m128i)(__v8hi){1,1,1,1,1,1,1,1}, + (__m128i)(__v8hi){1,1,1,1,1,1,1,1}), + 100, 202, 300, 402)); __m128i test_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_dpwssds_epi32 @@ -158,10 +431,35 @@ __m128i test_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m1 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpwssds_epi32(__U, __S, __A, __B); } +TEST_CONSTEXPR(match_v4si( + _mm_maskz_dpwssds_epi32( + (__mmask8)0x0F, + (__m128i)(__v4si){0, 0, 0, 0}, + (__m128i)(__v8hi){1,1,1,1,1,1,1,1}, + (__m128i)(__v8hi){1,1,1,1,1,1,1,1}), + 2, 2, 2, 2)); __m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpwssds_epi32 // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwssds_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v4si( + _mm_dpwssds_epi32( + ((__m128i)(__v4si){0, 0, 0, 0}), + ((__m128i)(__v8hi){1,1,1,1,1,1,1,1}), + ((__m128i)(__v8hi){1,1,1,1,1,1,1,1})), + 2, 2, 2, 2)); +TEST_CONSTEXPR(match_v4si( + _mm_dpwssds_epi32( + ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}), + ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767}), + ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767})), + 2147483647, 2147483647, 2147483647, 2147483647)); +TEST_CONSTEXPR(match_v4si( + _mm_dpwssds_epi32( + ((__m128i)(__v4si){-2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1}), + ((__m128i)(__v8hi){-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768}), + ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767})), + -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1)); diff --git a/clang/test/CodeGen/X86/avx512vnni-builtins.c b/clang/test/CodeGen/X86/avx512vnni-builtins.c index 6b8465206eedb..f8f663b48aa36 100644 --- 
a/clang/test/CodeGen/X86/avx512vnni-builtins.c +++ b/clang/test/CodeGen/X86/avx512vnni-builtins.c @@ -3,7 +3,13 @@ // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror | FileCheck %s // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s + #include <immintrin.h> +#include "builtin_test_helpers.h" __m512i test_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_dpbusd_epi32 @@ -11,6 +17,13 @@ __m512i test_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpbusd_epi32(__S, __U, __A, __B); } +TEST_CONSTEXPR(match_v16si( + _mm512_mask_dpbusd_epi32( + (__m512i)(__v16si){100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600}, + (__mmask16)0x5555, + 
(__m512i)(__v64qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m512i)(__v64qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 104, 200, 304, 400, 504, 600, 704, 800, 904, 1000, 1104, 1200, 1304, 1400, 1504, 1600)); __m512i test_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_dpbusd_epi32 @@ -18,6 +31,13 @@ __m512i test_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, _ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpbusd_epi32(__U, __S, __A, __B); } +TEST_CONSTEXPR(match_v16si( + _mm512_maskz_dpbusd_epi32( + (__mmask16)0x00FF, + (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + (__m512i)(__v64qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m512i)(__v64qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0)); __m512i test_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_dpbusd_epi32 @@ -25,12 +45,47 @@ __m512i test_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B) { return _mm512_dpbusd_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v16si(_mm512_dpbusd_epi32((__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + (__m512i)(__v64qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m512i)(__v64qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4)); + 
+TEST_CONSTEXPR(match_v16si(_mm512_dpbusd_epi32((__m512i)(__v16si){1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + (__m512i)(__v64qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m512i)(__v64qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5)); + +TEST_CONSTEXPR(match_v16si(_mm512_dpbusd_epi32((__m512i)(__v16si){2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + (__m512i)(__v64qu){1,2,3,4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m512i)(__v64qi){5,6,7,8,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 72, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5)); + +TEST_CONSTEXPR(match_v16si(_mm512_dpbusd_epi32((__m512i)(__v16si){2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + (__m512i)(__v64qu){1,2,3,4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m512i)(__v64qi){5,6,7,8,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 72, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5)); + +TEST_CONSTEXPR(match_v16si( + _mm512_dpbusd_epi32( + (__m512i)(__v16si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}, + (__m512i)(__v64qu){1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0}, + (__m512i)(__v64qi){1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 1,0,0,0, 
1,0,0,0, 1,0,0,0}), + -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, + -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1)); + __m512i test_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_dpbusds_epi32 // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpbusds_epi32(__S, __U, __A, __B); } +TEST_CONSTEXPR(match_v16si( + _mm512_mask_dpbusds_epi32( + (__m512i)(__v16si){100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600}, + (__mmask16)0x5555, + (__m512i)(__v64qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m512i)(__v64qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 104, 200, 304, 400, 504, 600, 704, 800, 904, 1000, 1104, 1200, 1304, 1400, 1504, 1600)); __m512i test_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_dpbusds_epi32 @@ -38,12 +93,37 @@ __m512i test_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpbusds_epi32(__U, __S, __A, __B); } +TEST_CONSTEXPR(match_v16si( + _mm512_maskz_dpbusds_epi32( + (__mmask16)0x00FF, + (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + (__m512i)(__v64qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + 
(__m512i)(__v64qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0)); __m512i test_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_dpbusds_epi32 // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpbusds.512(<16 x i32> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}) return _mm512_dpbusds_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v16si( + _mm512_dpbusds_epi32( + (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + (__m512i)(__v64qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m512i)(__v64qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4)); +TEST_CONSTEXPR(match_v16si( + _mm512_dpbusds_epi32( + (__m512i)(__v16si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}, + (__m512i)(__v64qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m512i)(__v64qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647)); +TEST_CONSTEXPR(match_v16si( + _mm512_dpbusds_epi32( + (__m512i)(__v16si){-2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, 
-2147483647-1, -2147483647-1, -2147483647-1}, + (__m512i)(__v64qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255}, + (__m512i)(__v64qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}), + -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1)); __m512i test_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_dpwssd_epi32 @@ -51,6 +131,13 @@ __m512i test_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwssd_epi32(__S, __U, __A, __B); } +TEST_CONSTEXPR(match_v16si( + _mm512_mask_dpwssd_epi32( + (__m512i)(__v16si){100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600}, + (__mmask16)0xFF00, + (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 100, 200, 300, 400, 500, 600, 700, 800, 902, 1002, 1102, 1202, 1302, 1402, 1502, 1602)); __m512i test_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_dpwssd_epi32 @@ -58,12 +145,49 @@ __m512i test_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, _ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwssd_epi32(__U, __S, __A, __B); } 
+TEST_CONSTEXPR(match_v16si( + _mm512_maskz_dpwssd_epi32( + (__mmask16)0x000F, + (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m512i test_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_dpwssd_epi32 // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwssd_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v16si( + _mm512_dpwssd_epi32( + (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)); +TEST_CONSTEXPR(match_v16si( + _mm512_dpwssd_epi32( + (__m512i)(__v16si){10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10}, + (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12)); +TEST_CONSTEXPR(match_v16si( + _mm512_dpwssd_epi32( + (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + (__m512i)(__v32hi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}, + (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2)); +TEST_CONSTEXPR(match_v16si( + _mm512_dpwssd_epi32( + (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + 
(__m512i)(__v32hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}, + (__m512i)(__v32hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}), + 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578, 2147352578)); +TEST_CONSTEXPR(match_v16si( + _mm512_dpwssd_epi32( + (__m512i)(__v16si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}, + (__m512i)(__v32hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0}, + (__m512i)(__v32hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0}), + -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1)); __m512i test_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_dpwssds_epi32 @@ -71,6 +195,13 @@ __m512i test_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, _ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwssds_epi32(__S, __U, __A, __B); } +TEST_CONSTEXPR(match_v16si( + _mm512_mask_dpwssds_epi32( + (__m512i)(__v16si){100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600}, + (__mmask16)0xAAAA, + (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + 
(__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 100, 202, 300, 402, 500, 602, 700, 802, 900, 1002, 1100, 1202, 1300, 1402, 1500, 1602)); __m512i test_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_dpwssds_epi32 @@ -78,10 +209,35 @@ __m512i test_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwssds_epi32(__U, __S, __A, __B); } +TEST_CONSTEXPR(match_v16si( + _mm512_maskz_dpwssds_epi32( + (__mmask16)0xFFFF, + (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)); __m512i test_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_dpwssds_epi32 // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwssds_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v16si( + _mm512_dpwssds_epi32( + (__m512i)(__v16si){0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m512i)(__v32hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)); +TEST_CONSTEXPR(match_v16si( + _mm512_dpwssds_epi32( + (__m512i)(__v16si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}, + 
(__m512i)(__v32hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}, + (__m512i)(__v32hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}), + 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647)); +TEST_CONSTEXPR(match_v16si( + _mm512_dpwssds_epi32( + (__m512i)(__v16si){-2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1}, + (__m512i)(__v32hi){-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768,-32768}, + (__m512i)(__v32hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}), + -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1)); diff --git a/clang/test/CodeGen/X86/avxvnni-builtins.c b/clang/test/CodeGen/X86/avxvnni-builtins.c index 6557a26807eb2..7bf4d563f7ba2 100644 --- a/clang/test/CodeGen/X86/avxvnni-builtins.c +++ b/clang/test/CodeGen/X86/avxvnni-builtins.c @@ -3,100 +3,346 @@ // RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature 
+avxvnni -emit-llvm -o - -Wall -Werror | FileCheck %s // RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-apple-darwin -target-feature +avxvnni -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avxvnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avxvnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c -ffreestanding %s -triple=i386-apple-darwin -target-feature +avxvnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -ffreestanding %s -triple=i386-apple-darwin -target-feature +avxvnni -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s + #include <immintrin.h> +#include "builtin_test_helpers.h" __m256i test_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpbusd_epi32 // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) return _mm256_dpbusd_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v8si( + _mm256_dpbusd_epi32( + ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}), + ((__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + ((__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})), + 4, 4, 4, 4, 4, 4, 4, 4)); +TEST_CONSTEXPR(match_v8si( + _mm256_dpbusd_epi32( + ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}), + ((__m256i)(__v32qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255}), + ((__m256i)(__v32qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1})), + -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020)); 
+TEST_CONSTEXPR(match_v8si( + _mm256_dpbusd_epi32( + ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}), + ((__m256i)(__v32qu){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0}), + ((__m256i)(__v32qi){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0})), + -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1)); __m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpbusds_epi32 // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) return _mm256_dpbusds_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v8si( + _mm256_dpbusds_epi32( + ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}), + ((__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + ((__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})), + 4, 4, 4, 4, 4, 4, 4, 4)); +TEST_CONSTEXPR(match_v8si( + _mm256_dpbusds_epi32( + ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}), + ((__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + ((__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})), + 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647)); __m256i test_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpwssd_epi32 // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwssd_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v8si( + _mm256_dpwssd_epi32( + ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}), + ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})), + 2, 2, 2, 
2, 2, 2, 2, 2)); +TEST_CONSTEXPR(match_v8si( + _mm256_dpwssd_epi32( + ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}), + ((__m256i)(__v16hi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}), + ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})), + -2, -2, -2, -2, -2, -2, -2, -2)); +TEST_CONSTEXPR(match_v8si( + _mm256_dpwssd_epi32( + ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}), + ((__m256i)(__v16hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0}), + ((__m256i)(__v16hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0})), + -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1)); __m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpwssds_epi32 // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwssds_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v8si( + _mm256_dpwssds_epi32( + ((__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}), + ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + ((__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})), + 2, 2, 2, 2, 2, 2, 2, 2)); +TEST_CONSTEXPR(match_v8si( + _mm256_dpwssds_epi32( + ((__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}), + ((__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}), + ((__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767})), + 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647)); __m128i test_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpbusd_epi32 // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) return _mm_dpbusd_epi32(__S, __A, __B); } 
+TEST_CONSTEXPR(match_v4si( + _mm_dpbusd_epi32( + ((__m128i)(__v4si){0, 0, 0, 0}), + ((__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + ((__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})), + 4, 4, 4, 4)); +TEST_CONSTEXPR(match_v4si( + _mm_dpbusd_epi32( + ((__m128i)(__v4si){0, 0, 0, 0}), + ((__m128i)(__v16qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255}), + ((__m128i)(__v16qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1})), + -1020, -1020, -1020, -1020)); +TEST_CONSTEXPR(match_v4si( + _mm_dpbusd_epi32( + ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}), + ((__m128i)(__v16qu){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0}), + ((__m128i)(__v16qi){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0})), + -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1)); __m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpbusds_epi32 // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) return _mm_dpbusds_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v4si( + _mm_dpbusds_epi32( + ((__m128i)(__v4si){0, 0, 0, 0}), + ((__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + ((__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})), + 4, 4, 4, 4)); +TEST_CONSTEXPR(match_v4si( + _mm_dpbusds_epi32( + ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}), + ((__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + ((__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1})), + 2147483647, 2147483647, 2147483647, 2147483647)); __m128i test_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpwssd_epi32 // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwssd_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v4si( + _mm_dpwssd_epi32( + ((__m128i)(__v4si){0, 0, 0, 0}), + ((__m128i)(__v8hi){1,1,1,1,1,1,1,1}), + ((__m128i)(__v8hi){1,1,1,1,1,1,1,1})), + 2, 2, 2, 2)); 
+TEST_CONSTEXPR(match_v4si( + _mm_dpwssd_epi32( + ((__m128i)(__v4si){0, 0, 0, 0}), + ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767}), + ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767})), + 2147352578, 2147352578, 2147352578, 2147352578)); +TEST_CONSTEXPR(match_v4si( + _mm_dpwssd_epi32( + ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}), + ((__m128i)(__v8hi){1,0,1,0,1,0,1,0}), + ((__m128i)(__v8hi){1,0,1,0,1,0,1,0})), + -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1)); __m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpwssds_epi32 // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwssds_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v4si( + _mm_dpwssds_epi32( + ((__m128i)(__v4si){0, 0, 0, 0}), + ((__m128i)(__v8hi){1,1,1,1,1,1,1,1}), + ((__m128i)(__v8hi){1,1,1,1,1,1,1,1})), + 2, 2, 2, 2)); +TEST_CONSTEXPR(match_v4si( + _mm_dpwssds_epi32( + ((__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}), + ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767}), + ((__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767})), + 2147483647, 2147483647, 2147483647, 2147483647)); __m256i test_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpbusd_avx_epi32 // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) return _mm256_dpbusd_avx_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v8si( + _mm256_dpbusd_avx_epi32( + (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}, + (__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 4, 4, 4, 4, 4, 4, 4, 4)); +TEST_CONSTEXPR(match_v8si( + _mm256_dpbusd_avx_epi32( + (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}, + 
(__m256i)(__v32qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255}, + (__m256i)(__v32qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}), + -1020, -1020, -1020, -1020, -1020, -1020, -1020, -1020)); +TEST_CONSTEXPR(match_v8si( + _mm256_dpbusd_avx_epi32( + (__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}, + (__m256i)(__v32qu){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0}, + (__m256i)(__v32qi){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0}), + -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1)); __m256i test_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpbusds_avx_epi32 // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) return _mm256_dpbusds_avx_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v8si( + _mm256_dpbusds_avx_epi32( + (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}, + (__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 4, 4, 4, 4, 4, 4, 4, 4)); +TEST_CONSTEXPR(match_v8si( + _mm256_dpbusds_avx_epi32( + (__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}, + (__m256i)(__v32qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m256i)(__v32qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647)); __m256i test_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpwssd_avx_epi32 // CHECK: call <8 x i32> 
@llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwssd_avx_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v8si( + _mm256_dpwssd_avx_epi32( + (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}, + (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 2, 2, 2, 2, 2, 2, 2, 2)); +TEST_CONSTEXPR(match_v8si( + _mm256_dpwssd_avx_epi32( + (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}, + (__m256i)(__v16hi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}, + (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + -2, -2, -2, -2, -2, -2, -2, -2)); +TEST_CONSTEXPR(match_v8si( + _mm256_dpwssd_avx_epi32( + (__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}, + (__m256i)(__v16hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0}, + (__m256i)(__v16hi){1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0}), + -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1)); __m256i test_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpwssds_avx_epi32 // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwssds_avx_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v8si( + _mm256_dpwssds_avx_epi32( + (__m256i)(__v8si){0, 0, 0, 0, 0, 0, 0, 0}, + (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m256i)(__v16hi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 2, 2, 2, 2, 2, 2, 2, 2)); +TEST_CONSTEXPR(match_v8si( + _mm256_dpwssds_avx_epi32( + (__m256i)(__v8si){2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647}, + (__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}, + (__m256i)(__v16hi){32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767}), + 
2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647, 2147483647)); __m128i test_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpbusd_avx_epi32 // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) return _mm_dpbusd_avx_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v4si( + _mm_dpbusd_avx_epi32( + (__m128i)(__v4si){0, 0, 0, 0}, + (__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 4, 4, 4, 4)); +TEST_CONSTEXPR(match_v4si( + _mm_dpbusd_avx_epi32( + (__m128i)(__v4si){0, 0, 0, 0}, + (__m128i)(__v16qu){255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255}, + (__m128i)(__v16qi){-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}), + -1020, -1020, -1020, -1020)); +TEST_CONSTEXPR(match_v4si( + _mm_dpbusd_avx_epi32( + (__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}, + (__m128i)(__v16qu){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0}, + (__m128i)(__v16qi){1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0}), + -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1)); __m128i test_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpbusds_avx_epi32 // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) return _mm_dpbusds_avx_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v4si( + _mm_dpbusds_avx_epi32( + (__m128i)(__v4si){0, 0, 0, 0}, + (__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 4, 4, 4, 4)); +TEST_CONSTEXPR(match_v4si( + _mm_dpbusds_avx_epi32( + (__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}, + (__m128i)(__v16qu){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}, + (__m128i)(__v16qi){1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}), + 2147483647, 2147483647, 2147483647, 2147483647)); __m128i test_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i 
__B) { // CHECK-LABEL: test_mm_dpwssd_avx_epi32 // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwssd_avx_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v4si( + _mm_dpwssd_avx_epi32( + (__m128i)(__v4si){0, 0, 0, 0}, + (__m128i)(__v8hi){1,1,1,1,1,1,1,1}, + (__m128i)(__v8hi){1,1,1,1,1,1,1,1}), + 2, 2, 2, 2)); +TEST_CONSTEXPR(match_v4si( + _mm_dpwssd_avx_epi32( + (__m128i)(__v4si){0, 0, 0, 0}, + (__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767}, + (__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767}), + 2147352578, 2147352578, 2147352578, 2147352578)); +TEST_CONSTEXPR(match_v4si( + _mm_dpwssd_avx_epi32( + (__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}, + (__m128i)(__v8hi){1,0,1,0,1,0,1,0}, + (__m128i)(__v8hi){1,0,1,0,1,0,1,0}), + -2147483647-1, -2147483647-1, -2147483647-1, -2147483647-1)); __m128i test_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpwssds_avx_epi32 // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwssds_avx_epi32(__S, __A, __B); } +TEST_CONSTEXPR(match_v4si( + _mm_dpwssds_avx_epi32( + (__m128i)(__v4si){0, 0, 0, 0}, + (__m128i)(__v8hi){1,1,1,1,1,1,1,1}, + (__m128i)(__v8hi){1,1,1,1,1,1,1,1}), + 2, 2, 2, 2)); +TEST_CONSTEXPR(match_v4si( + _mm_dpwssds_avx_epi32( + (__m128i)(__v4si){2147483647, 2147483647, 2147483647, 2147483647}, + (__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767}, + (__m128i)(__v8hi){32767,32767,32767,32767,32767,32767,32767,32767}), + 2147483647, 2147483647, 2147483647, 2147483647)); _______________________________________________ cfe-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
