On Sun, Jul 27, 2025 at 2:53 PM Luc Grosheintz <luc.groshei...@gmail.com>
wrote:

> Let E denote an multi-dimensional extent; n the rank of E; r = 0, ...,
> n; E[i] the i-th extent; and D[k] be the (possibly empty) array of
> dynamic extents.
>
> The two partial products for r = 0, ..., n:
>
>   \prod_{i = 0}^r E[i]     (fwd)
>   \prod_{i = r+1}^n E[i]   (rev)
>
> can be computed as the product of static and dynamic extents. The static
> fwd and rev product can be computed at compile time for all values of r.
>
> Three methods are directly affected by this optimization:
>
>    layout_left::mapping::stride
>    layout_right::mapping::stride
>    mdspan::size
>
> We'll check the generated code (-O2) for all three methods for a generic
> (artificially) high-dimensional multi-dimensional extents.
>
> Consider a generic case:
>
>   using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>;
>
>   int stride_left(const std::layout_left::mapping<Extents>& m, size_t r)
>   { return m.stride(r); }
>
> The code generated by master:
>
>   80:  49 89 f0               mov    r8,rsi
>   83:  49 89 f1               mov    r9,rsi
>   86:  49 c1 e0 03            shl    r8,0x3
>   8a:  74 6c                  je     f8 <stride_left+0x78>
>   8c:  49 81 c0 00 00 00 00   add    r8,0x0
>   93:  ba 00 00 00 00         mov    edx,0x0
>   98:  b8 01 00 00 00         mov    eax,0x1
>   9d:  0f 1f 00               nop    DWORD PTR [rax]
>   a0:  48 8b 0a               mov    rcx,QWORD PTR [rdx]
>   a3:  48 89 c6               mov    rsi,rax
>   a6:  48 0f af f1            imul   rsi,rcx
>   aa:  48 83 f9 ff            cmp    rcx,0xffffffffffffffff
>   ae:  48 0f 45 c6            cmovne rax,rsi
>   b2:  48 83 c2 08            add    rdx,0x8
>   b6:  49 39 d0               cmp    r8,rdx
>   b9:  75 e5                  jne    a0 <stride_left+0x20>
>   bb:  31 d2                  xor    edx,edx
>   bd:  48 85 c0               test   rax,rax
>   c0:  74 30                  je     f2 <stride_left+0x72>
>   c2:  4a 8b 0c cd 00 00 00   mov    rcx,QWORD PTR [r9*8+0x0]
>   c9:  00
>   ca:  48 c1 e1 02            shl    rcx,0x2
>   ce:  74 20                  je     f0 <stride_left+0x70>
>   d0:  48 01 f9               add    rcx,rdi
>   d3:  66 66 2e 0f 1f 84 00   data16 cs nop WORD PTR [rax+rax*1+0x0]
>   da:  00 00 00 00
>   de:  66 90                  xchg   ax,ax
>   e0:  48 63 17               movsxd rdx,DWORD PTR [rdi]
>   e3:  48 83 c7 04            add    rdi,0x4
>   e7:  48 0f af c2            imul   rax,rdx
>   eb:  48 39 f9               cmp    rcx,rdi
>   ee:  75 f0                  jne    e0 <stride_left+0x60>
>   f0:  89 c2                  mov    edx,eax
>   f2:  89 d0                  mov    eax,edx
>   f4:  c3                     ret
>   f5:  0f 1f 00               nop    DWORD PTR [rax]
>   f8:  b8 01 00 00 00         mov    eax,0x1
>   fd:  eb c3                  jmp    c2 <stride_left+0x42>
>
> is reduced to:
>
>   50:  48 8b 0c f5 00 00 00   mov    rcx,QWORD PTR [rsi*8+0x0]
>   57:  00
>   58:  48 8b 04 f5 00 00 00   mov    rax,QWORD PTR [rsi*8+0x0]
>   5f:  00
>   60:  48 c1 e1 02            shl    rcx,0x2
>   64:  74 1a                  je     80 <stride_left+0x30>
>   66:  48 01 f9               add    rcx,rdi
>   69:  0f 1f 80 00 00 00 00   nop    DWORD PTR [rax+0x0]
>   70:  48 63 17               movsxd rdx,DWORD PTR [rdi]
>   73:  48 83 c7 04            add    rdi,0x4
>   77:  48 0f af c2            imul   rax,rdx
>   7b:  48 39 f9               cmp    rcx,rdi
>   7e:  75 f0                  jne    70 <stride_left+0x20>
>   80:  c3                     ret
>
> Loosely speaking this does the following:
>
>   1. Load the starting position k in the array of dynamic extents.
>   2. Load the partial product of static extents.
>   3. Computes the \prod_{i = k}^d D[i] where d is the number of
>   dynamic extents in a loop.
>
> It shows that the span used for passing in the dynamic extents is
> completely eliminated; and the fact that the product always runs to the
> end of the array of dynamic extents is used by the compiler to eliminate
> one indirection to determine the end position in the array of dynamic
> extents.
>
> The analogous code is generated for layout_right.
>
> Next, consider
>
>   using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>;
>   int size2(const std::mdspan<double, E2>& md)
>   { return md.size(); }
>
> on master the generated code is
>
>   10:  b8 00 00 00 00         mov    eax,0x0
>   15:  ba 01 00 00 00         mov    edx,0x1
>   1a:  66 0f 1f 44 00 00      nop    WORD PTR [rax+rax*1+0x0]
>   20:  48 8b 08               mov    rcx,QWORD PTR [rax]
>   23:  48 89 d6               mov    rsi,rdx
>   26:  48 0f af f1            imul   rsi,rcx
>   2a:  48 83 f9 ff            cmp    rcx,0xffffffffffffffff
>   2e:  48 0f 45 d6            cmovne rdx,rsi
>   32:  48 83 c0 08            add    rax,0x8
>   36:  48 3d 00 00 00 00      cmp    rax,0x0
>   3c:  75 e2                  jne    20 <size2+0x10>
>   3e:  31 c0                  xor    eax,eax
>   40:  48 85 d2               test   rdx,rdx
>   43:  74 12                  je     57 <size2+0x47>
>   45:  48 63 07               movsxd rax,DWORD PTR [rdi]
>   48:  48 63 4f 04            movsxd rcx,DWORD PTR [rdi+0x4]
>   4c:  48 0f af c1            imul   rax,rcx
>   50:  0f af 47 08            imul   eax,DWORD PTR [rdi+0x8]
>   54:  0f af c2               imul   eax,edx
>   57:  c3                     ret
>
> the optimized version is:
>
>   10:  48 63 07               movsxd rax,DWORD PTR [rdi]
>   13:  48 63 57 04            movsxd rdx,DWORD PTR [rdi+0x4]
>   17:  48 0f af c2            imul   rax,rdx
>   1b:  0f af 47 08            imul   eax,DWORD PTR [rdi+0x8]
>   1f:  69 c0 83 04 00 00      imul   eax,eax,0x483
>   25:  c3                     ret
>
> Which simply computes the product:
>
>   D[0] * D[1] * D[2] * const
>
> where const is the product of all static extents. Meaning the loop to
> compute the product of dynamic extents has been fully unrolled and
> all constants are perfectly precomputed.
>
> libstdc++-v3/ChangeLog:
>
>         * include/std/mdspan (__mdspan::__fwd_prod): Compute as the
>         product of pre-computed static static and the product of dynamic
>         extents.
>         (__mdspan::__rev_prod): Ditto.
>
> Signed-off-by: Luc Grosheintz <luc.groshei...@gmail.com>
> ---
>
I am bit concerned with additional code size with ths tables, so would work
out
on removing their dependency on index_type, by passing auto& with extents
arrray
instead of Extents type.

>  libstdc++-v3/include/std/mdspan | 81 +++++++++++++++++++++++----------
>  1 file changed, 56 insertions(+), 25 deletions(-)
>
> diff --git a/libstdc++-v3/include/std/mdspan
> b/libstdc++-v3/include/std/mdspan
> index 5e79d4bfb59..06ccf3e3827 100644
> --- a/libstdc++-v3/include/std/mdspan
> +++ b/libstdc++-v3/include/std/mdspan
> @@ -184,6 +184,49 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>        __valid_static_extent = _Extent == dynamic_extent
>         || _Extent <= numeric_limits<_IndexType>::max();
>
> +    template<typename _Extents>
>
Same here, we do not depend on the index type.

> +      consteval size_t
> +      __static_prod(size_t __begin, size_t __end)
> +      {
> +       size_t __prod = 1;
> +       for(size_t __i = __begin; __i < __end; ++__i)
> +         {
> +             auto __ext = _Extents::static_extent(__i);
> +             __prod *= __ext == dynamic_extent ? size_t(1) : __ext;
> +         }
> +       return __prod;
> +      }
> +
> +    // Pre-compute: \prod_{i = 0}^r _Extents[i]
> +    template<typename _Extents>
>
We do not really depend on the index parameter here, so again to reduce the
number of instantiations (and the code bloat), I would use the
auto& parameter,
and change _Extents:_S_static_extents to return a reference to it.

Then have two __mdspan::__static_extents overloads:
__mdspan::__static_extents() -> array&
__mdspan::__static_extents(size_t, size_t) -> span

> +      struct _FwdProd
>
+      {
> +       constexpr static std::array<size_t, _Extents::rank() + 1> _S_value
> =
>
I would declare them as inline constexpr variables, instead of declaring
static variables
inside the class.

> +       [] consteval
> +       {
> +         constexpr size_t __rank = _Extents::rank();
> +         std::array<size_t, __rank + 1> __ret;
> +         for(size_t __r = 0; __r < __rank + 1; ++__r)
> +           __ret[__r] = __static_prod<_Extents>(0, __r);
> +         return __ret;
> +       }();
> +      };
> +
> +    // Pre-compute: \prod_{i = r+1}^n _Extents[i]
> +    template<typename _Extents>
> +      struct _RevProd
> +      {
> +       constexpr static std::array<size_t, _Extents::rank()> _S_value =
> +       [] consteval
> +       {
> +         constexpr size_t __rank = _Extents::rank();
> +         std::array<size_t, __rank> __ret;
> +         for(size_t __r = 0; __r < __rank; ++__r)
> +           __ret[__r] = __static_prod<_Extents>(__r + 1, __rank);
> +         return __ret;
> +       }();
> +      };
> +
>      template<typename _Extents>
>        constexpr span<const size_t>
>        __static_extents(size_t __begin = 0, size_t __end =
> _Extents::rank())
> @@ -352,46 +395,34 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
>           return false;
>        }
>
> -    constexpr size_t
> -    __static_extents_prod(const auto& __sta_exts) noexcept
> -    {
> -      size_t __ret = 1;
> -      for (auto __factor : __sta_exts)
> -       if (__factor != dynamic_extent)
> -         __ret *= __factor;
> -      return __ret;
> -    }
> -
>      template<typename _Extents>
>        constexpr typename _Extents::index_type
> -      __exts_prod(const _Extents& __exts, size_t __begin, size_t __end)
> noexcept
> +      __extents_prod(const _Extents& __exts, size_t __sta_prod, size_t
> __begin,
> +                    size_t __end)
>        {
> -       using _IndexType = typename _Extents::index_type;
> -
> -       size_t __ret = 1;
> -       if constexpr (_Extents::rank_dynamic() != _Extents::rank())
> -         {
> -           auto __sta_exts = __static_extents<_Extents>(__begin, __end);
> -           __ret = __static_extents_prod(__sta_exts);
> -           if (__ret == 0)
> -             return 0;
> -         }
> -
> +       size_t __ret = __sta_prod;
>         if constexpr (_Extents::rank_dynamic() > 0)
>           for (auto __factor : __dynamic_extents(__exts, __begin, __end))
>             __ret *= size_t(__factor);
> -       return _IndexType(__ret);
> +       return static_cast<typename _Extents::index_type>(__ret);
>        }
>
>      template<typename _Extents>
>        constexpr typename _Extents::index_type
>        __fwd_prod(const _Extents& __exts, size_t __r) noexcept
> -      { return __exts_prod(__exts, 0, __r); }
> +      {
> +       size_t __sta_prod = _FwdProd<_Extents>::_S_value[__r];
> +       return __extents_prod(__exts, __sta_prod, 0, __r);
> +      }
>
>      template<typename _Extents>
>        constexpr typename _Extents::index_type
>        __rev_prod(const _Extents& __exts, size_t __r) noexcept
> -      { return __exts_prod(__exts, __r + 1, __exts.rank()); }
> +      {
> +       constexpr size_t __rank = _Extents::rank();
> +       size_t __sta_prod = _RevProd<_Extents>::_S_value[__r];
> +       return __extents_prod(__exts, __sta_prod, __r + 1, __rank);
> +      }
>
>      template<typename _Extents>
>        constexpr typename _Extents::index_type
> --
> 2.50.0
>
>

Reply via email to