[PATCH v2] libstdc++: add ARM SVE support to std::experimental::simd

2024-02-09 Thread Srinivas Yadav Singanaboina
Hi,

Thanks for the review, @Richard! I have tried to address most of your comments in
this patch.
The major updates include optimizing operator[] for masks, find_first_set and 
find_last_set.

My further comments on some of the issues that were pointed out are:
a. Regarding the coverage of types supported for SVE: yes, all the types are
covered by
mapping any type using two simple rules: the size of the type and its
signedness.
b. all the operator overloads now use infix operators. For division and 
remainder, 
the inactive elements are padded with 1 to avoid undefined behavior.
c. isnan is optimized to have only two cases, i.e. the finite_math_only case or the
case where svcmpuo is used.
d. _S_load for masks (bool) now uses svld1 by reinterpret_casting the pointer
to a uint8_t pointer and then performing an svunpklo.
The same optimization is not done for masked_load and stores, as the conversion of
a mask from a larger type to a smaller type is not optimal (sequential).
e. _S_unary_minus could not use svneg_x because it does not support unsigned 
types.
f. added specializations for reductions.
g. find_first_set and find_last_set are optimized using svclastb.


libstdc++-v3/ChangeLog:

* include/Makefile.am: Add simd_sve.h.
* include/Makefile.in: Add simd_sve.h.
* include/experimental/bits/simd.h: Add new SveAbi.
* include/experimental/bits/simd_builtin.h: Use
  __no_sve_deduce_t to support existing Neon Abi.
* include/experimental/bits/simd_converter.h: Convert
  sequentially when sve is available.
* include/experimental/bits/simd_detail.h: Define sve
  specific macro.
* include/experimental/bits/simd_math.h: Fallback frexp
  to execute sequentially when sve is available, to handle
  fixed_size_simd return type that always uses sve.
* include/experimental/simd: Include bits/simd_sve.h.
* testsuite/experimental/simd/tests/bits/main.h: Enable
  testing for sve128, sve256, sve512.
* include/experimental/bits/simd_sve.h: New file.

 Signed-off-by: Srinivas Yadav Singanaboina
 vasu.srinivasvasu...@gmail.com
---
 libstdc++-v3/include/Makefile.am  |1 +
 libstdc++-v3/include/Makefile.in  |1 +
 libstdc++-v3/include/experimental/bits/simd.h |  131 +-
 .../include/experimental/bits/simd_builtin.h  |   35 +-
 .../experimental/bits/simd_converter.h|   57 +-
 .../include/experimental/bits/simd_detail.h   |7 +-
 .../include/experimental/bits/simd_math.h |   14 +-
 .../include/experimental/bits/simd_sve.h  | 1863 +
 libstdc++-v3/include/experimental/simd|3 +
 .../experimental/simd/tests/bits/main.h   |3 +
 10 files changed, 2084 insertions(+), 31 deletions(-)
 create mode 100644 libstdc++-v3/include/experimental/bits/simd_sve.h

diff --git a/libstdc++-v3/include/Makefile.am b/libstdc++-v3/include/Makefile.am
index 6209f390e08..1170cb047a6 100644
--- a/libstdc++-v3/include/Makefile.am
+++ b/libstdc++-v3/include/Makefile.am
@@ -826,6 +826,7 @@ experimental_bits_headers = \
${experimental_bits_srcdir}/simd_neon.h \
${experimental_bits_srcdir}/simd_ppc.h \
${experimental_bits_srcdir}/simd_scalar.h \
+   ${experimental_bits_srcdir}/simd_sve.h \
${experimental_bits_srcdir}/simd_x86.h \
${experimental_bits_srcdir}/simd_x86_conversions.h \
${experimental_bits_srcdir}/string_view.tcc \
diff --git a/libstdc++-v3/include/Makefile.in b/libstdc++-v3/include/Makefile.in
index 596fa0d2390..bc44582a2da 100644
--- a/libstdc++-v3/include/Makefile.in
+++ b/libstdc++-v3/include/Makefile.in
@@ -1172,6 +1172,7 @@ experimental_bits_headers = \
${experimental_bits_srcdir}/simd_neon.h \
${experimental_bits_srcdir}/simd_ppc.h \
${experimental_bits_srcdir}/simd_scalar.h \
+   ${experimental_bits_srcdir}/simd_sve.h \
${experimental_bits_srcdir}/simd_x86.h \
${experimental_bits_srcdir}/simd_x86_conversions.h \
${experimental_bits_srcdir}/string_view.tcc \
diff --git a/libstdc++-v3/include/experimental/bits/simd.h 
b/libstdc++-v3/include/experimental/bits/simd.h
index 90523ea57dc..d274cd740fe 100644
--- a/libstdc++-v3/include/experimental/bits/simd.h
+++ b/libstdc++-v3/include/experimental/bits/simd.h
@@ -39,12 +39,16 @@
 #include 
 #include 
 #include 
+#include 
 
 #if _GLIBCXX_SIMD_X86INTRIN
 #include 
 #elif _GLIBCXX_SIMD_HAVE_NEON
 #include 
 #endif
+#if _GLIBCXX_SIMD_HAVE_SVE
+#include 
+#endif
 
 /** @ingroup ts_simd
  * @{
@@ -83,6 +87,12 @@ using __m512d [[__gnu__::__vector_size__(64)]] = double;
 using __m512i [[__gnu__::__vector_size__(64)]] = long long;
 #endif
 
+#if _GLIBCXX_SIMD_HAVE_SVE
+constexpr inline int __sve_vectorized_size_bytes = __ARM_FEATURE_SVE_BITS / 8;
+#else
+constexpr inline int __sve_vectorized_size_bytes = 0;
+#endif 
+
 namespace simd_abi {
 // simd_abi forward declarations {{{
 // implementation details:
@@ -108,6

Re: [PATCH] libstdc++: add ARM SVE support to std::experimental::simd

2024-01-03 Thread Srinivas Yadav
nstexpr _MaskMember<_Tp>
> _S_broadcast(bool __x)
> {
>  constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
>  __sve_bool_type __tr = __sve_vector_type<_Tp,
> _Np>::__sve_active_mask();
>  __sve_bool_type __fl = svnot_z(__tr, __tr);
>
> This can just be svpfalse_b();
>
> Got it! Thanks!


>   template 
> struct _MaskImplSve
> {
> ...
>   template 
> _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
> _S_load(const bool* __mem)
> {
>  _SveMaskWrapper> __r;
>
>  __execute_n_times>(
>[&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
> __r._M_set(__i, __mem[__i]); });
>
>  return __r;
> }
>
>   template 
> static inline _SveMaskWrapper<_Bits, _Np>
> _S_masked_load(_SveMaskWrapper<_Bits, _Np> __merge,
> _SveMaskWrapper<_Bits, _Np> __mask,
>   const bool* __mem) noexcept
> {
>  _SveMaskWrapper<_Bits, _Np> __r;
>
>  __execute_n_times<_Np>([&](auto __i)
> _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
>if (__mask[__i])
>  __r._M_set(__i, __mem[__i]);
>else
>  __r._M_set(__i, __merge[__i]);
>  });
>
>  return __r;
> }
>
> If these are loading unpacked booleans, couldn't we just use svld1
> followed by a comparison?  Similarly the stores could use svdup_u8_z
> to load a vector of 1s and 0s and then use svst1 to store it.

Do you mean reinterpret-casting the input pointer (bool*) to (uint8*) and
performing a comparison?


>   template 
> _GLIBCXX_SIMD_INTRINSIC static bool
> _S_all_of(simd_mask<_Tp, _Abi> __k)
> { return _S_popcount(__k) == simd_size_v<_Tp, _Abi>; }
>
> In principle, this should be better as !svptest_any(..., svnot_z (...,
> __k)),
> since we should then be able to use a single flag-setting predicate
> logic instruction.
>
> Incidentally, __k seems like a bit of an AVX-centric name :)
>
>   template 
> _GLIBCXX_SIMD_INTRINSIC static bool
> _S_any_of(simd_mask<_Tp, _Abi> __k)
> { return _S_popcount(__k) > 0; }
>
>   template 
> _GLIBCXX_SIMD_INTRINSIC static bool
> _S_none_of(simd_mask<_Tp, _Abi> __k)
> { return _S_popcount(__k) == 0; }
>
> These should map directly to svptest_any and !svptest_any respectively.
>
> Got it! I will update with these changes.


>   template 
> _GLIBCXX_SIMD_INTRINSIC static int
> _S_find_first_set(simd_mask<_Tp, _Abi> __k)
> {
>  constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
>
>  auto __first_index =
> __sve_mask_type::__sve_mask_first_true();
>  for (int __idx = 0; __idx < _Np; __idx++)
>{
>  if (__sve_mask_type::__sve_mask_active_count(
>__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
>svand_z(__sve_vector_type<_Tp,
> _Np>::__sve_active_mask(), __k._M_data,
>__first_index)))
>return __idx;
>  __first_index =
> __sve_mask_type::__sve_mask_next_true(
>__sve_vector_type<_Tp,
> _Np>::__sve_active_mask(), __first_index);
>}
>  return -1;
> }
>
>   template 
> _GLIBCXX_SIMD_INTRINSIC static int
> _S_find_last_set(simd_mask<_Tp, _Abi> __k)
> {
>  constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
>
>  int __ret = -1;
>  auto __first_index =
> __sve_mask_type::__sve_mask_first_true();
>  for (int __idx = 0; __idx < _Np; __idx++)
>{
>  if (__sve_mask_type::__sve_mask_active_count(
>__sve_vector_type<_Tp, _Np>::__sve_active_mask(),
>svand_z(__sve_vector_type<_Tp,
> _Np>::__sve_active_mask(), __k._M_data,
>__first_index)))
>__ret = __idx;
>  __first_index =
> __sve_mask_type::__sve_mask_next_true(
>__sve_vector_type<_Tp,
> _Np>::__sve_active_mask(), __first_index);
>}
>  return __ret;
> }
>
> _S_find_last_set should be able to use svclasta and an iota vector.
> _S_find_first_set could do the same with a leading svpfirst.
>
Thanks. This solution for find_last_set should significantly improve
performance.
Could you please elaborate on the solution for find_first_set?
Another efficient solution for find_first_set that I have in mind is to use
svrev_b* and then perform a find_last_set.

Thank you,
Srinivas Yadav Singanaboina


[PATCH] libstdc++: add ARM SVE support to std::experimental::simd

2023-11-23 Thread Srinivas Yadav Singanaboina
libstdc++-v3/ChangeLog:

* include/Makefile.am: Add simd_sve.h.
* include/Makefile.in: Add simd_sve.h.
* include/experimental/bits/simd.h: Add new SveAbi.
* include/experimental/bits/simd_builtin.h: Use
  __no_sve_deduce_t to support existing Neon Abi.
* include/experimental/bits/simd_converter.h: Convert
  sequentially when sve is available.
* include/experimental/bits/simd_detail.h: Define sve
  specific macro.
* include/experimental/bits/simd_math.h: Fallback frexp
  to execute sequentially when sve is available, to handle
  fixed_size_simd return type that always uses sve.
* include/experimental/simd: Include bits/simd_sve.h.
* testsuite/experimental/simd/tests/bits/main.h: Enable
  testing for sve128, sve256, sve512.
* include/experimental/bits/simd_sve.h: New file.

Signed-off-by: Srinivas Yadav Singanaboina
vasu.srinivasvasu...@gmail.com
---
 libstdc++-v3/include/Makefile.am  |1 +
 libstdc++-v3/include/Makefile.in  |1 +
 libstdc++-v3/include/experimental/bits/simd.h |  131 +-
 .../include/experimental/bits/simd_builtin.h  |   35 +-
 .../experimental/bits/simd_converter.h|   57 +-
 .../include/experimental/bits/simd_detail.h   |7 +-
 .../include/experimental/bits/simd_math.h |   14 +-
 .../include/experimental/bits/simd_sve.h  | 1818 +
 libstdc++-v3/include/experimental/simd|3 +
 .../experimental/simd/tests/bits/main.h   |3 +
 10 files changed, 2039 insertions(+), 31 deletions(-)
 create mode 100644 libstdc++-v3/include/experimental/bits/simd_sve.h

diff --git a/libstdc++-v3/include/Makefile.am b/libstdc++-v3/include/Makefile.am
index 6209f390e08..1170cb047a6 100644
--- a/libstdc++-v3/include/Makefile.am
+++ b/libstdc++-v3/include/Makefile.am
@@ -826,6 +826,7 @@ experimental_bits_headers = \
${experimental_bits_srcdir}/simd_neon.h \
${experimental_bits_srcdir}/simd_ppc.h \
${experimental_bits_srcdir}/simd_scalar.h \
+   ${experimental_bits_srcdir}/simd_sve.h \
${experimental_bits_srcdir}/simd_x86.h \
${experimental_bits_srcdir}/simd_x86_conversions.h \
${experimental_bits_srcdir}/string_view.tcc \
diff --git a/libstdc++-v3/include/Makefile.in b/libstdc++-v3/include/Makefile.in
index 596fa0d2390..bc44582a2da 100644
--- a/libstdc++-v3/include/Makefile.in
+++ b/libstdc++-v3/include/Makefile.in
@@ -1172,6 +1172,7 @@ experimental_bits_headers = \
${experimental_bits_srcdir}/simd_neon.h \
${experimental_bits_srcdir}/simd_ppc.h \
${experimental_bits_srcdir}/simd_scalar.h \
+   ${experimental_bits_srcdir}/simd_sve.h \
${experimental_bits_srcdir}/simd_x86.h \
${experimental_bits_srcdir}/simd_x86_conversions.h \
${experimental_bits_srcdir}/string_view.tcc \
diff --git a/libstdc++-v3/include/experimental/bits/simd.h 
b/libstdc++-v3/include/experimental/bits/simd.h
index 90523ea57dc..95fd92784b2 100644
--- a/libstdc++-v3/include/experimental/bits/simd.h
+++ b/libstdc++-v3/include/experimental/bits/simd.h
@@ -39,12 +39,16 @@
 #include 
 #include 
 #include 
+#include 
 
 #if _GLIBCXX_SIMD_X86INTRIN
 #include 
 #elif _GLIBCXX_SIMD_HAVE_NEON
 #include 
 #endif
+#if _GLIBCXX_SIMD_HAVE_SVE
+#include 
+#endif
 
 /** @ingroup ts_simd
  * @{
@@ -83,6 +87,12 @@ using __m512d [[__gnu__::__vector_size__(64)]] = double;
 using __m512i [[__gnu__::__vector_size__(64)]] = long long;
 #endif
 
+#if _GLIBCXX_SIMD_HAVE_SVE
+constexpr inline int __sve_vectorized_size_bytes = __ARM_FEATURE_SVE_BITS / 8;
+#else
+constexpr inline int __sve_vectorized_size_bytes = 0;
+#endif 
+
 namespace simd_abi {
 // simd_abi forward declarations {{{
 // implementation details:
@@ -108,6 +118,9 @@ template 
 template 
   struct _VecBltnBtmsk;
 
+template 
+  struct _SveAbi;
+
 template 
   using _VecN = _VecBuiltin;
 
@@ -123,6 +136,9 @@ template 
 template 
   using _Neon = _VecBuiltin<_UsedBytes>;
 
+template 
+  using _Sve = _SveAbi<_UsedBytes>;
+
 // implementation-defined:
 using __sse = _Sse<>;
 using __avx = _Avx<>;
@@ -130,6 +146,7 @@ using __avx512 = _Avx512<>;
 using __neon = _Neon<>;
 using __neon128 = _Neon<16>;
 using __neon64 = _Neon<8>;
+using __sve = _Sve<>;
 
 // standard:
 template 
@@ -250,6 +267,8 @@ constexpr inline bool __support_neon_float =
   false;
 #endif
 
+constexpr inline bool __have_sve = _GLIBCXX_SIMD_HAVE_SVE;
+
 #ifdef _ARCH_PWR10
 constexpr inline bool __have_power10vec = true;
 #else
@@ -356,12 +375,13 @@ namespace __detail
 | (__have_avx512vnni << 27)
 | (__have_avx512vpopcntdq<< 28)
 | (__have_avx512vp2intersect << 29);
-else if constexpr (__have_neon)
+else if constexpr (__have_neon || __have_sve)
   return __have_neon