[PATCH v2] libstdc++: add ARM SVE support to std::experimental::simd
Hi, Thanks for the review, @Richard! I have tried to address most of your comments in this patch. The major updates include optimizing operator[] for masks, find_first_set and find_last_set. My further comments on some of the issues pointed out are: a. Regarding the coverage of types supported for SVE: yes, all the types are covered by mapping any type using two simple rules: the size of the type and its signedness. b. All the operator overloads now use infix operators. For division and remainder, the inactive elements are padded with 1 to avoid undefined behavior. c. isnan is optimized to have only two cases, i.e. the finite_math_only case or the case where svcmpuo is used. d. _S_load for masks (bool) now uses svld1 by reinterpret_cast-ing the pointer to a uint8_t pointer and then performing an svunpklo. The same optimization is not done for masked_load and stores, as conversion of a mask from a larger-sized type to a smaller-sized type is not optimal (sequential). e. _S_unary_minus could not use svneg_x because it does not support unsigned types. f. Added specializations for reductions. g. find_first_set and find_last_set are optimized using svclastb. libstdc++-v3/ChangeLog: * include/Makefile.am: Add simd_sve.h. * include/Makefile.in: Add simd_sve.h. * include/experimental/bits/simd.h: Add new SveAbi. * include/experimental/bits/simd_builtin.h: Use __no_sve_deduce_t to support existing Neon Abi. * include/experimental/bits/simd_converter.h: Convert sequentially when sve is available. * include/experimental/bits/simd_detail.h: Define sve specific macro. * include/experimental/bits/simd_math.h: Fallback frexp to execute sequentially when sve is available, to handle fixed_size_simd return type that always uses sve. * include/experimental/simd: Include bits/simd_sve.h. * testsuite/experimental/simd/tests/bits/main.h: Enable testing for sve128, sve256, sve512. * include/experimental/bits/simd_sve.h: New file. 
Signed-off-by: Srinivas Yadav Singanaboina vasu.srinivasvasu...@gmail.com --- libstdc++-v3/include/Makefile.am |1 + libstdc++-v3/include/Makefile.in |1 + libstdc++-v3/include/experimental/bits/simd.h | 131 +- .../include/experimental/bits/simd_builtin.h | 35 +- .../experimental/bits/simd_converter.h| 57 +- .../include/experimental/bits/simd_detail.h |7 +- .../include/experimental/bits/simd_math.h | 14 +- .../include/experimental/bits/simd_sve.h | 1863 + libstdc++-v3/include/experimental/simd|3 + .../experimental/simd/tests/bits/main.h |3 + 10 files changed, 2084 insertions(+), 31 deletions(-) create mode 100644 libstdc++-v3/include/experimental/bits/simd_sve.h diff --git a/libstdc++-v3/include/Makefile.am b/libstdc++-v3/include/Makefile.am index 6209f390e08..1170cb047a6 100644 --- a/libstdc++-v3/include/Makefile.am +++ b/libstdc++-v3/include/Makefile.am @@ -826,6 +826,7 @@ experimental_bits_headers = \ ${experimental_bits_srcdir}/simd_neon.h \ ${experimental_bits_srcdir}/simd_ppc.h \ ${experimental_bits_srcdir}/simd_scalar.h \ + ${experimental_bits_srcdir}/simd_sve.h \ ${experimental_bits_srcdir}/simd_x86.h \ ${experimental_bits_srcdir}/simd_x86_conversions.h \ ${experimental_bits_srcdir}/string_view.tcc \ diff --git a/libstdc++-v3/include/Makefile.in b/libstdc++-v3/include/Makefile.in index 596fa0d2390..bc44582a2da 100644 --- a/libstdc++-v3/include/Makefile.in +++ b/libstdc++-v3/include/Makefile.in @@ -1172,6 +1172,7 @@ experimental_bits_headers = \ ${experimental_bits_srcdir}/simd_neon.h \ ${experimental_bits_srcdir}/simd_ppc.h \ ${experimental_bits_srcdir}/simd_scalar.h \ + ${experimental_bits_srcdir}/simd_sve.h \ ${experimental_bits_srcdir}/simd_x86.h \ ${experimental_bits_srcdir}/simd_x86_conversions.h \ ${experimental_bits_srcdir}/string_view.tcc \ diff --git a/libstdc++-v3/include/experimental/bits/simd.h b/libstdc++-v3/include/experimental/bits/simd.h index 90523ea57dc..d274cd740fe 100644 --- a/libstdc++-v3/include/experimental/bits/simd.h +++ 
b/libstdc++-v3/include/experimental/bits/simd.h @@ -39,12 +39,16 @@ #include #include #include +#include #if _GLIBCXX_SIMD_X86INTRIN #include #elif _GLIBCXX_SIMD_HAVE_NEON #include #endif +#if _GLIBCXX_SIMD_HAVE_SVE +#include +#endif /** @ingroup ts_simd * @{ @@ -83,6 +87,12 @@ using __m512d [[__gnu__::__vector_size__(64)]] = double; using __m512i [[__gnu__::__vector_size__(64)]] = long long; #endif +#if _GLIBCXX_SIMD_HAVE_SVE +constexpr inline int __sve_vectorized_size_bytes = __ARM_FEATURE_SVE_BITS / 8; +#else +constexpr inline int __sve_vectorized_size_bytes = 0; +#endif + namespace simd_abi { // simd_abi forward declarations {{{ // implementation details: @@ -108,6
Re: [PATCH] libstdc++: add ARM SVE support to std::experimental::simd
nstexpr _MaskMember<_Tp> > _S_broadcast(bool __x) > { > constexpr size_t _Np = simd_size_v<_Tp, _Abi>; > __sve_bool_type __tr = __sve_vector_type<_Tp, > _Np>::__sve_active_mask(); > __sve_bool_type __fl = svnot_z(__tr, __tr); > > This can just be svpfalse_b(); > > Got it! Thanks! > template > struct _MaskImplSve > { > ... > template > _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> > _S_load(const bool* __mem) > { > _SveMaskWrapper> __r; > > __execute_n_times>( >[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { > __r._M_set(__i, __mem[__i]); }); > > return __r; > } > > template > static inline _SveMaskWrapper<_Bits, _Np> > _S_masked_load(_SveMaskWrapper<_Bits, _Np> __merge, > _SveMaskWrapper<_Bits, _Np> __mask, > const bool* __mem) noexcept > { > _SveMaskWrapper<_Bits, _Np> __r; > > __execute_n_times<_Np>([&](auto __i) > _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { >if (__mask[__i]) > __r._M_set(__i, __mem[__i]); >else > __r._M_set(__i, __merge[__i]); > }); > > return __r; > } > > If these are loading unpacked booleans, couldn't we just use svld1 > followed by a comparison? Similarly the stores could use svdup_u8_z > to load a vector of 1s and 0s and then use svst1 to store it. Do you mean reinterpret-casting the input pointer (bool*) to (uint8*) and perform a comparison ? > template > _GLIBCXX_SIMD_INTRINSIC static bool > _S_all_of(simd_mask<_Tp, _Abi> __k) > { return _S_popcount(__k) == simd_size_v<_Tp, _Abi>; } > > In principle, this should be better as !svptest_any(..., svnot_z (..., > __k)), > since we should then be able to use a single flag-setting predicate > logic instruction. 
> > Incidentally, __k seems like a bit of an AVX-centric name :) > > template > _GLIBCXX_SIMD_INTRINSIC static bool > _S_any_of(simd_mask<_Tp, _Abi> __k) > { return _S_popcount(__k) > 0; } > > template > _GLIBCXX_SIMD_INTRINSIC static bool > _S_none_of(simd_mask<_Tp, _Abi> __k) > { return _S_popcount(__k) == 0; } > > These should map directly to svptest_any and !svptest_any respectively. > > Got it! I will update with these changes. > template > _GLIBCXX_SIMD_INTRINSIC static int > _S_find_first_set(simd_mask<_Tp, _Abi> __k) > { > constexpr size_t _Np = simd_size_v<_Tp, _Abi>; > > auto __first_index = > __sve_mask_type::__sve_mask_first_true(); > for (int __idx = 0; __idx < _Np; __idx++) >{ > if (__sve_mask_type::__sve_mask_active_count( >__sve_vector_type<_Tp, _Np>::__sve_active_mask(), >svand_z(__sve_vector_type<_Tp, > _Np>::__sve_active_mask(), __k._M_data, >__first_index))) >return __idx; > __first_index = > __sve_mask_type::__sve_mask_next_true( >__sve_vector_type<_Tp, > _Np>::__sve_active_mask(), __first_index); >} > return -1; > } > > template > _GLIBCXX_SIMD_INTRINSIC static int > _S_find_last_set(simd_mask<_Tp, _Abi> __k) > { > constexpr size_t _Np = simd_size_v<_Tp, _Abi>; > > int __ret = -1; > auto __first_index = > __sve_mask_type::__sve_mask_first_true(); > for (int __idx = 0; __idx < _Np; __idx++) >{ > if (__sve_mask_type::__sve_mask_active_count( >__sve_vector_type<_Tp, _Np>::__sve_active_mask(), >svand_z(__sve_vector_type<_Tp, > _Np>::__sve_active_mask(), __k._M_data, >__first_index))) >__ret = __idx; > __first_index = > __sve_mask_type::__sve_mask_next_true( >__sve_vector_type<_Tp, > _Np>::__sve_active_mask(), __first_index); >} > return __ret; > } > > _S_find_last_set should be able to use svclasta and an iota vector. > _S_find_first_set could do the same with a leading svpfirst. > Thanks. This solution for find_last_set should significantly improves the performance. Can you please elaborate solution for find_first_set ? 
Another efficient solution for find_first_set that I have in mind is to use svrev_b* and then perform a find_last_set. Thank you, Srinivas Yadav Singanaboina
[PATCH] libstdc++: add ARM SVE support to std::experimental::simd
libstdc++-v3/ChangeLog: * include/Makefile.am: Add simd_sve.h. * include/Makefile.in: Add simd_sve.h. * include/experimental/bits/simd.h: Add new SveAbi. * include/experimental/bits/simd_builtin.h: Use __no_sve_deduce_t to support existing Neon Abi. * include/experimental/bits/simd_converter.h: Convert sequentially when sve is available. * include/experimental/bits/simd_detail.h: Define sve specific macro. * include/experimental/bits/simd_math.h: Fallback frexp to execute sequntially when sve is available, to handle fixed_size_simd return type that always uses sve. * include/experimental/simd: Include bits/simd_sve.h. * testsuite/experimental/simd/tests/bits/main.h: Enable testing for sve128, sve256, sve512. * include/experimental/bits/simd_sve.h: New file. Signed-off-by: Srinivas Yadav Singanaboina vasu.srinivasvasu...@gmail.com --- libstdc++-v3/include/Makefile.am |1 + libstdc++-v3/include/Makefile.in |1 + libstdc++-v3/include/experimental/bits/simd.h | 131 +- .../include/experimental/bits/simd_builtin.h | 35 +- .../experimental/bits/simd_converter.h| 57 +- .../include/experimental/bits/simd_detail.h |7 +- .../include/experimental/bits/simd_math.h | 14 +- .../include/experimental/bits/simd_sve.h | 1818 + libstdc++-v3/include/experimental/simd|3 + .../experimental/simd/tests/bits/main.h |3 + 10 files changed, 2039 insertions(+), 31 deletions(-) create mode 100644 libstdc++-v3/include/experimental/bits/simd_sve.h diff --git a/libstdc++-v3/include/Makefile.am b/libstdc++-v3/include/Makefile.am index 6209f390e08..1170cb047a6 100644 --- a/libstdc++-v3/include/Makefile.am +++ b/libstdc++-v3/include/Makefile.am @@ -826,6 +826,7 @@ experimental_bits_headers = \ ${experimental_bits_srcdir}/simd_neon.h \ ${experimental_bits_srcdir}/simd_ppc.h \ ${experimental_bits_srcdir}/simd_scalar.h \ + ${experimental_bits_srcdir}/simd_sve.h \ ${experimental_bits_srcdir}/simd_x86.h \ ${experimental_bits_srcdir}/simd_x86_conversions.h \ ${experimental_bits_srcdir}/string_view.tcc \ diff 
--git a/libstdc++-v3/include/Makefile.in b/libstdc++-v3/include/Makefile.in index 596fa0d2390..bc44582a2da 100644 --- a/libstdc++-v3/include/Makefile.in +++ b/libstdc++-v3/include/Makefile.in @@ -1172,6 +1172,7 @@ experimental_bits_headers = \ ${experimental_bits_srcdir}/simd_neon.h \ ${experimental_bits_srcdir}/simd_ppc.h \ ${experimental_bits_srcdir}/simd_scalar.h \ + ${experimental_bits_srcdir}/simd_sve.h \ ${experimental_bits_srcdir}/simd_x86.h \ ${experimental_bits_srcdir}/simd_x86_conversions.h \ ${experimental_bits_srcdir}/string_view.tcc \ diff --git a/libstdc++-v3/include/experimental/bits/simd.h b/libstdc++-v3/include/experimental/bits/simd.h index 90523ea57dc..95fd92784b2 100644 --- a/libstdc++-v3/include/experimental/bits/simd.h +++ b/libstdc++-v3/include/experimental/bits/simd.h @@ -39,12 +39,16 @@ #include #include #include +#include #if _GLIBCXX_SIMD_X86INTRIN #include #elif _GLIBCXX_SIMD_HAVE_NEON #include #endif +#if _GLIBCXX_SIMD_HAVE_SVE +#include +#endif /** @ingroup ts_simd * @{ @@ -83,6 +87,12 @@ using __m512d [[__gnu__::__vector_size__(64)]] = double; using __m512i [[__gnu__::__vector_size__(64)]] = long long; #endif +#if _GLIBCXX_SIMD_HAVE_SVE +constexpr inline int __sve_vectorized_size_bytes = __ARM_FEATURE_SVE_BITS / 8; +#else +constexpr inline int __sve_vectorized_size_bytes = 0; +#endif + namespace simd_abi { // simd_abi forward declarations {{{ // implementation details: @@ -108,6 +118,9 @@ template template struct _VecBltnBtmsk; +template + struct _SveAbi; + template using _VecN = _VecBuiltin; @@ -123,6 +136,9 @@ template template using _Neon = _VecBuiltin<_UsedBytes>; +template + using _Sve = _SveAbi<_UsedBytes>; + // implementation-defined: using __sse = _Sse<>; using __avx = _Avx<>; @@ -130,6 +146,7 @@ using __avx512 = _Avx512<>; using __neon = _Neon<>; using __neon128 = _Neon<16>; using __neon64 = _Neon<8>; +using __sve = _Sve<>; // standard: template @@ -250,6 +267,8 @@ constexpr inline bool __support_neon_float = false; #endif 
+constexpr inline bool __have_sve = _GLIBCXX_SIMD_HAVE_SVE; + #ifdef _ARCH_PWR10 constexpr inline bool __have_power10vec = true; #else @@ -356,12 +375,13 @@ namespace __detail | (__have_avx512vnni << 27) | (__have_avx512vpopcntdq<< 28) | (__have_avx512vp2intersect << 29); -else if constexpr (__have_neon) +else if constexpr (__have_neon || __have_sve) return __have_neon