On Sun, Aug 3, 2025 at 10:59 PM Luc Grosheintz <[email protected]>
wrote:
> Let E denote an multi-dimensional extent; n the rank of E; r = 0, ...,
> n; E[i] the i-th extent; and D[k] be the (possibly empty) array of
> dynamic extents.
>
> The two partial products for r = 0, ..., n:
>
> \prod_{i = 0}^r E[i] (fwd)
> \prod_{i = r+1}^n E[i] (rev)
>
> can be computed as the product of static and dynamic extents. The static
> fwd and rev product can be computed at compile time for all values of r.
>
> Three methods are directly affected by this optimization:
>
> layout_left::mapping::stride
> layout_right::mapping::stride
> mdspan::size
>
> We'll check the generated code (-O2) for all three methods for a generic
> (artificially) high-dimensional multi-dimensional extents.
>
> Consider a generic case:
>
> using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>;
>
> int stride_left(const std::layout_left::mapping<Extents>& m, size_t r)
> { return m.stride(r); }
>
> The code generated prior to this commit:
>
> 4f0: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] #
> 4f8
> 4f7: 00
> 4f8: 48 83 c6 01 add rsi,0x1
> 4fc: 48 c7 44 24 e8 ff ff mov QWORD PTR
> [rsp-0x18],0xffffffffffffffff
> 503: ff ff
> 505: 48 8d 04 f5 00 00 00 lea rax,[rsi*8+0x0]
> 50c: 00
> 50d: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
> 512: 66 0f 76 c0 pcmpeqd xmm0,xmm0
> 516: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
> 51b: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] #
> 523
> 522: 00
> 523: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
> 528: 48 83 f8 38 cmp rax,0x38
> 52c: 74 72 je 5a0 <stride_right_E1+0xb0>
> 52e: 48 8d 54 04 b8 lea rdx,[rsp+rax*1-0x48]
> 533: 4c 8d 4c 24 f0 lea r9,[rsp-0x10]
> 538: b8 01 00 00 00 mov eax,0x1
> 53d: 0f 1f 00 nop DWORD PTR [rax]
> 540: 48 8b 0a mov rcx,QWORD PTR [rdx]
> 543: 49 89 c0 mov r8,rax
> 546: 4c 0f af c1 imul r8,rcx
> 54a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
> 54e: 49 0f 45 c0 cmovne rax,r8
> 552: 48 83 c2 08 add rdx,0x8
> 556: 49 39 d1 cmp r9,rdx
> 559: 75 e5 jne 540 <stride_right_E1+0x50>
> 55b: 48 85 c0 test rax,rax
> 55e: 74 38 je 598 <stride_right_E1+0xa8>
> 560: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
> 567: 00
> 568: 48 c1 e2 02 shl rdx,0x2
> 56c: 48 83 fa 10 cmp rdx,0x10
> 570: 74 1e je 590 <stride_right_E1+0xa0>
> 572: 48 8d 4f 10 lea rcx,[rdi+0x10]
> 576: 48 01 d7 add rdi,rdx
> 579: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
> 580: 48 63 17 movsxd rdx,DWORD PTR [rdi]
> 583: 48 83 c7 04 add rdi,0x4
> 587: 48 0f af c2 imul rax,rdx
> 58b: 48 39 f9 cmp rcx,rdi
> 58e: 75 f0 jne 580 <stride_right_E1+0x90>
> 590: c3 ret
> 591: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
> 598: c3 ret
> 599: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
> 5a0: b8 01 00 00 00 mov eax,0x1
> 5a5: eb b9 jmp 560 <stride_right_E1+0x70>
> 5a7: 66 0f 1f 84 00 00 00 nop WORD PTR [rax+rax*1+0x0]
> 5ae: 00 00
>
> which seems to be performing:
>
> preparatory_work();
> ret = 1
> for(i = 0; i < rank; ++i)
> tmp = ret * E[i]
> if E[i] != -1
> ret = tmp
> for(i = 0; i < rank_dynamic; ++i)
> ret *= D[i]
>
> This commit reduces it down to:
>
> 270: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
> 277: 00
> 278: 31 d2 xor edx,edx
> 27a: 48 85 c0 test rax,rax
> 27d: 74 33 je 2b2 <stride_right_E1+0x42>
> 27f: 48 8b 14 f5 00 00 00 mov rdx,QWORD PTR [rsi*8+0x0]
> 286: 00
> 287: 48 c1 e2 02 shl rdx,0x2
> 28b: 48 83 fa 10 cmp rdx,0x10
> 28f: 74 1f je 2b0 <stride_right_E1+0x40>
> 291: 48 8d 4f 10 lea rcx,[rdi+0x10]
> 295: 48 01 d7 add rdi,rdx
> 298: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
> 29f: 00
> 2a0: 48 63 17 movsxd rdx,DWORD PTR [rdi]
> 2a3: 48 83 c7 04 add rdi,0x4
> 2a7: 48 0f af c2 imul rax,rdx
> 2ab: 48 39 f9 cmp rcx,rdi
> 2ae: 75 f0 jne 2a0 <stride_right_E1+0x30>
> 2b0: 89 c2 mov edx,eax
> 2b2: 89 d0 mov eax,edx
> 2b4: c3 ret
>
> Loosely speaking this does the following:
>
> 1. Load the starting position k in the array of dynamic extents; and
> return if possible.
> 2. Load the partial product of static extents.
> 3. Computes the \prod_{i = k}^d D[i] where d is the number of
> dynamic extents in a loop.
>
> It shows that the span used for passing in the dynamic extents is
> completely eliminated; and the fact that the product always runs to the
> end of the array of dynamic extents is used by the compiler to eliminate
> one indirection to determine the end position in the array of dynamic
> extents.
>
> The analogous code is generated for layout_left.
>
> Next, consider
>
> using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>;
> int size2(const std::mdspan<double, E2>& md)
> { return md.size(); }
>
> on immediately preceding commit the generated code is
>
> 10: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 18
> 17: 00
> 18: 49 89 f8 mov r8,rdi
> 1b: 48 8d 44 24 b8 lea rax,[rsp-0x48]
> 20: 48 c7 44 24 e8 0b 00 mov QWORD PTR [rsp-0x18],0xb
> 27: 00 00
> 29: 48 8d 7c 24 f0 lea rdi,[rsp-0x10]
> 2e: ba 01 00 00 00 mov edx,0x1
> 33: 0f 29 44 24 b8 movaps XMMWORD PTR [rsp-0x48],xmm0
> 38: 66 0f 76 c0 pcmpeqd xmm0,xmm0
> 3c: 0f 29 44 24 c8 movaps XMMWORD PTR [rsp-0x38],xmm0
> 41: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 49
> 48: 00
> 49: 0f 29 44 24 d8 movaps XMMWORD PTR [rsp-0x28],xmm0
> 4e: 66 66 2e 0f 1f 84 00 data16 cs nop WORD PTR [rax+rax*1+0x0]
> 55: 00 00 00 00
> 59: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
> 60: 48 8b 08 mov rcx,QWORD PTR [rax]
> 63: 48 89 d6 mov rsi,rdx
> 66: 48 0f af f1 imul rsi,rcx
> 6a: 48 83 f9 ff cmp rcx,0xffffffffffffffff
> 6e: 48 0f 45 d6 cmovne rdx,rsi
> 72: 48 83 c0 08 add rax,0x8
> 76: 48 39 c7 cmp rdi,rax
> 79: 75 e5 jne 60 <size2+0x50>
> 7b: 48 85 d2 test rdx,rdx
> 7e: 74 18 je 98 <size2+0x88>
> 80: 49 63 00 movsxd rax,DWORD PTR [r8]
> 83: 49 63 48 04 movsxd rcx,DWORD PTR [r8+0x4]
> 87: 48 0f af c1 imul rax,rcx
> 8b: 41 0f af 40 08 imul eax,DWORD PTR [r8+0x8]
> 90: 0f af c2 imul eax,edx
> 93: c3 ret
> 94: 0f 1f 40 00 nop DWORD PTR [rax+0x0]
> 98: 31 c0 xor eax,eax
> 9a: c3 ret
>
> which is needlessly long. The current commit reduces it down to:
>
> 10: 48 63 07 movsxd rax,DWORD PTR [rdi]
> 13: 48 63 57 04 movsxd rdx,DWORD PTR [rdi+0x4]
> 17: 48 0f af c2 imul rax,rdx
> 1b: 0f af 47 08 imul eax,DWORD PTR [rdi+0x8]
> 1f: 69 c0 83 04 00 00 imul eax,eax,0x483
> 25: c3 ret
>
> Which simply computes the product:
>
> D[0] * D[1] * D[2] * const
>
> where const is the product of all static extents. Meaning the loop to
> compute the product of dynamic extents has been fully unrolled and
> all constants are perfectly precomputed.
>
> The size of the object file described in the previous commit reduces
> by 17% from 55.8kB to 46.0kB.
>
> libstdc++-v3/ChangeLog:
>
> * include/std/mdspan (__mdspan::__static_prod): New function.
> (__mdspan::__fwd_partial_prods): Constexpr array of partial
> forward products.
> (__mdspan::__fwd_partial_prods): Same for reverse partial
> products.
> (__mdspan::__static_extents_prod): Delete function.
> (__mdspan::__extents_prod): Renamed from __exts_prod and
> refactored.
> include/std/mdspan (__mdspan::__fwd_prod): Compute as the
> product of pre-computed static static and the product of dynamic
> extents.
> (__mdspan::__rev_prod): Ditto.
>
> Signed-off-by: Luc Grosheintz <[email protected]>
> ---
>
LGTM.
> libstdc++-v3/include/std/mdspan | 77 ++++++++++++++++++++++-----------
> 1 file changed, 52 insertions(+), 25 deletions(-)
>
> diff --git a/libstdc++-v3/include/std/mdspan
> b/libstdc++-v3/include/std/mdspan
> index 7b73df8550e..dc1b44ee35c 100644
> --- a/libstdc++-v3/include/std/mdspan
> +++ b/libstdc++-v3/include/std/mdspan
> @@ -202,6 +202,41 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> __static_extents() noexcept
> { return _Extents::_S_storage::_S_static_extents(); }
>
> + template<array _Extents>
> + consteval size_t
> + __static_prod(size_t __begin, size_t __end)
> + {
> + size_t __prod = 1;
> + for(size_t __i = __begin; __i < __end; ++__i)
> + {
> + auto __ext = _Extents[__i];
> + __prod *= __ext == dynamic_extent ? size_t(1) : __ext;
> + }
> + return __prod;
> + }
> +
> + // Pre-compute: \prod_{i = 0}^r _Extents[i], for r = 0,..., n
> (exclusive)
> + template<array _Extents>
> + constexpr auto __fwd_partial_prods = [] consteval
> + {
> + constexpr size_t __rank = _Extents.size();
> + std::array<size_t, __rank + 1> __ret;
> + for(size_t __r = 0; __r < __rank + 1; ++__r)
> + __ret[__r] = __static_prod<_Extents>(0, __r);
> + return __ret;
> + }();
> +
> + // Pre-compute: \prod_{i = r+1}^{n-1} _Extents[i]
> + template<array _Extents>
> + constexpr auto __rev_partial_prods = [] consteval
> + {
> + constexpr size_t __rank = _Extents.size();
> + std::array<size_t, __rank> __ret;
> + for(size_t __r = 0; __r < __rank; ++__r)
> + __ret[__r] = __static_prod<_Extents>(__r + 1, __rank);
> + return __ret;
> + }();
> +
> template<typename _Extents>
> constexpr span<const typename _Extents::index_type>
> __dynamic_extents(const _Extents& __exts, size_t __begin = 0,
> @@ -369,47 +404,39 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> return false;
> }
>
> - constexpr size_t
> - __static_extents_prod(const auto& __sta_exts) noexcept
> - {
> - size_t __ret = 1;
> - for (auto __factor : __sta_exts)
> - if (__factor != dynamic_extent)
> - __ret *= __factor;
> - return __ret;
> - }
> -
> template<typename _Extents>
> constexpr typename _Extents::index_type
> - __exts_prod(const _Extents& __exts, size_t __begin, size_t __end)
> noexcept
> + __extents_prod(const _Extents& __exts, size_t __sta_prod, size_t
> __begin,
> + size_t __end) noexcept
> {
> - using _IndexType = typename _Extents::index_type;
> -
> - size_t __ret = 1;
> - if constexpr (_Extents::rank_dynamic() != _Extents::rank())
> - {
> - auto __sta_exts = __static_extents<_Extents>();
> - __ret = __static_extents_prod(span{__sta_exts.data() + __begin,
> - __sta_exts.data() + __end});
> - if (__ret == 0)
> - return 0;
> - }
> + if (__sta_prod == 0)
> + return 0;
>
> + size_t __ret = __sta_prod;
> if constexpr (_Extents::rank_dynamic() > 0)
> for (auto __factor : __dynamic_extents(__exts, __begin, __end))
> __ret *= size_t(__factor);
> - return _IndexType(__ret);
> + return static_cast<typename _Extents::index_type>(__ret);
> }
>
> template<typename _Extents>
> constexpr typename _Extents::index_type
> __fwd_prod(const _Extents& __exts, size_t __r) noexcept
> - { return __exts_prod(__exts, 0, __r); }
> + {
> + constexpr auto __sta_exts = __static_extents<_Extents>();
> + size_t __sta_prod = __fwd_partial_prods<__sta_exts>[__r];
> + return __extents_prod(__exts, __sta_prod, 0, __r);
> + }
>
> template<typename _Extents>
> constexpr typename _Extents::index_type
> __rev_prod(const _Extents& __exts, size_t __r) noexcept
> - { return __exts_prod(__exts, __r + 1, __exts.rank()); }
> + {
> + constexpr auto __sta_exts = __static_extents<_Extents>();
> + constexpr size_t __rank = _Extents::rank();
> + size_t __sta_prod = __rev_partial_prods<__sta_exts>[__r];
> + return __extents_prod(__exts, __sta_prod, __r + 1, __rank);
> + }
>
> template<typename _Extents>
> constexpr typename _Extents::index_type
> --
> 2.50.0
>
>