On Sun, Aug 3, 2025 at 11:07 PM Luc Grosheintz <[email protected]>
wrote:
> The methods layout_{left,right}::mapping::stride are defined
> as
>
> \prod_{i = 0}^r E[i]
> \prod_{i = r+1}^n E[i]
>
> This is computed as the product of a precomputed static product and the
> product of the required dynamic extents.
>
> Disassembly shows that even for low-rank extents, i.e. rank == 1 and
> rank == 2, with at least one dynamic extent, the generated code loads
> two values; and then runs the loop over at most one element, e.g. for
> stride_left_d5 defined below the generated code is:
>
> 220: 48 8b 04 f5 00 00 00 mov rax,QWORD PTR [rsi*8+0x0]
> 227: 00
> 228: 31 d2 xor edx,edx
> 22a: 48 85 c0 test rax,rax
> 22d: 74 23 je 252 <stride_left_d5+0x32>
> 22f: 48 8b 0c f5 00 00 00 mov rcx,QWORD PTR [rsi*8+0x0]
> 236: 00
> 237: 48 c1 e1 02 shl rcx,0x2
> 23b: 74 13 je 250 <stride_left_d5+0x30>
> 23d: 48 01 f9 add rcx,rdi
> 240: 48 63 17 movsxd rdx,DWORD PTR [rdi]
> 243: 48 83 c7 04 add rdi,0x4
> 247: 48 0f af c2 imul rax,rdx
> 24b: 48 39 f9 cmp rcx,rdi
> 24e: 75 f0 jne 240 <stride_left_d5+0x20>
> 250: 89 c2 mov edx,eax
> 252: 89 d0 mov eax,edx
> 254: c3 ret
>
> If there's no dynamic extents, it simply loads the precomputed product
> of static extents.
>
> For rank == 1 the answer is the constant `1`; for rank == 2 it's either 1
> or
> extents.extent(k), with k == 0 for layout_left and k == 1 for
> layout_right.
>
> Consider,
>
> using Ed = std::extents<int, dyn>;
> int stride_left_d(const std::layout_left::mapping<Ed>& m, size_t r)
> { return m.stride(r); }
>
> using E3d = std::extents<int, 3, dyn>;
> int stride_left_3d(const std::layout_left::mapping<E3d>& m, size_t r)
> { return m.stride(r); }
>
> using Ed5 = std::extents<int, dyn, 5>;
> int stride_left_d5(const std::layout_left::mapping<Ed5>& m, size_t r)
> { return m.stride(r); }
>
> The optimized code for these three cases is:
>
> 0000000000000060 <stride_left_d>:
> 60: b8 01 00 00 00 mov eax,0x1
> 65: c3 ret
>
> 0000000000000090 <stride_left_3d>:
> 90: 48 83 fe 01 cmp rsi,0x1
> 94: 19 c0 sbb eax,eax
> 96: 83 e0 fe and eax,0xfffffffe
> 99: 83 c0 03 add eax,0x3
> 9c: c3 ret
>
> 00000000000000a0 <stride_left_d5>:
> a0: b8 01 00 00 00 mov eax,0x1
> a5: 48 85 f6 test rsi,rsi
> a8: 74 02 je ac <stride_left_d5+0xc>
> aa: 8b 07 mov eax,DWORD PTR [rdi]
> ac: c3 ret
>
> For rank == 1 it simply returns 1 (as expected). For rank == 2, it
> either implements a branchless formula, or conditionally loads one
> value. In all cases involving a dynamic extent this seems like it's
> always doing clearly less work, both in terms of computation and loads.
> In cases not involving a dynamic extent, it replaces loading one value
> with a branchless sequence of four instructions.
>
> This commit also refactors __size to no use any of the precomputed
> arrays. This prevents instantiating __{fwd,rev}_partial_prods for
> low-rank extents. This results in a further size reduction of a
> reference object file (described two commits prior) by 9% from 46.0kB to
> 41.9kB.
>
> In a prior commit we optimized __size to produce better object code by
> precomputing the static products. This refactor enables the optimizer to
> generate the same optimized code.
>
> libstdc++-v3/ChangeLog:
>
> * include/std/mdspan (__mdspan::__fwd_prod): Optimize
> for rank <= 2.
> (__mdspan::__rev_prod): Ditto.
> (__mdspan::__size): Refactor to use a pre-computed product, not
> a partial product.
>
> Signed-off-by: Luc Grosheintz <[email protected]>
> ---
> libstdc++-v3/include/std/mdspan | 32 ++++++++++++++++++++++++++------
> 1 file changed, 26 insertions(+), 6 deletions(-)
>
> diff --git a/libstdc++-v3/include/std/mdspan
> b/libstdc++-v3/include/std/mdspan
> index dc1b44ee35c..11a5063f60d 100644
> --- a/libstdc++-v3/include/std/mdspan
> +++ b/libstdc++-v3/include/std/mdspan
> @@ -423,25 +423,45 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
> constexpr typename _Extents::index_type
> __fwd_prod(const _Extents& __exts, size_t __r) noexcept
> {
> + constexpr size_t __rank = _Extents::rank();
> constexpr auto __sta_exts = __static_extents<_Extents>();
> - size_t __sta_prod = __fwd_partial_prods<__sta_exts>[__r];
> - return __extents_prod(__exts, __sta_prod, 0, __r);
> + if constexpr (__rank == 1)
> + return 1;
> + else if constexpr (__rank == 2)
> + return __r == 0 ? 1 : __exts.extent(0);
> + else
> + {
> + size_t __sta_prod = __fwd_partial_prods<__sta_exts>[__r];
> + return __extents_prod(__exts, __sta_prod, 0, __r);
> + }
> }
>
> template<typename _Extents>
> constexpr typename _Extents::index_type
> __rev_prod(const _Extents& __exts, size_t __r) noexcept
> {
> - constexpr auto __sta_exts = __static_extents<_Extents>();
> constexpr size_t __rank = _Extents::rank();
> - size_t __sta_prod = __rev_partial_prods<__sta_exts>[__r];
> - return __extents_prod(__exts, __sta_prod, __r + 1, __rank);
> + constexpr auto __sta_exts = __static_extents<_Extents>();
> + if constexpr (__rank == 1)
> + return 1;
> + else if constexpr (__rank == 2)
> + return __r == 0 ? __exts.extent(1) : 1;
> + else
> + {
> + size_t __sta_prod = __rev_partial_prods<__sta_exts>[__r];
> + return __extents_prod(__exts, __sta_prod, __r + 1, __rank);
> + }
> }
>
> template<typename _Extents>
> constexpr typename _Extents::index_type
> __size(const _Extents& __exts) noexcept
> - { return __fwd_prod(__exts, __exts.rank()); }
> + {
> + constexpr auto __sta_exts = __static_extents<_Extents>();
>
I am also changing this to constexpr auto&, there is no need to make a copy
of aray.
> + constexpr size_t __rank = _Extents::rank();
> + constexpr size_t __sta_prod = __static_prod<__sta_exts>(0, __rank);
> + return __extents_prod(__exts, __sta_prod, 0, __rank);
> + }
>
> template<typename _IndexType, size_t... _Counts>
> auto __build_dextents_type(integer_sequence<size_t, _Counts...>)
> --
> 2.50.0
>
>