I'll ignore all of the template instantiation related comments. They'll
get fixed in a later commit in this series; and if that's not acceptable,
I'd rather change the other of the commits (though this is order is my
favourite).

On 7/28/25 07:58, Tomasz Kaminski wrote:
On Sun, Jul 27, 2025 at 2:53 PM Luc Grosheintz <luc.groshei...@gmail.com>
wrote:

Let E denote an multi-dimensional extent; n the rank of E; r = 0, ...,
n; E[i] the i-th extent; and D[k] be the (possibly empty) array of
dynamic extents.

The two partial products for r = 0, ..., n:

   \prod_{i = 0}^r E[i]     (fwd)
   \prod_{i = r+1}^n E[i]   (rev)

can be computed as the product of static and dynamic extents. The static
fwd and rev product can be computed at compile time for all values of r.

Three methods are directly affected by this optimization:

    layout_left::mapping::stride
    layout_right::mapping::stride
    mdspan::size

We'll check the generated code (-O2) for all three methods for a generic
(artificially) high-dimensional multi-dimensional extents.

Consider a generic case:

   using Extents = std::extents<int, 3, 5, dyn, dyn, dyn, 7, dyn>;

   int stride_left(const std::layout_left::mapping<Extents>& m, size_t r)
   { return m.stride(r); }

The code generated by master:

   80:  49 89 f0               mov    r8,rsi
   83:  49 89 f1               mov    r9,rsi
   86:  49 c1 e0 03            shl    r8,0x3
   8a:  74 6c                  je     f8 <stride_left+0x78>
   8c:  49 81 c0 00 00 00 00   add    r8,0x0
   93:  ba 00 00 00 00         mov    edx,0x0
   98:  b8 01 00 00 00         mov    eax,0x1
   9d:  0f 1f 00               nop    DWORD PTR [rax]
   a0:  48 8b 0a               mov    rcx,QWORD PTR [rdx]
   a3:  48 89 c6               mov    rsi,rax
   a6:  48 0f af f1            imul   rsi,rcx
   aa:  48 83 f9 ff            cmp    rcx,0xffffffffffffffff
   ae:  48 0f 45 c6            cmovne rax,rsi
   b2:  48 83 c2 08            add    rdx,0x8
   b6:  49 39 d0               cmp    r8,rdx
   b9:  75 e5                  jne    a0 <stride_left+0x20>
   bb:  31 d2                  xor    edx,edx
   bd:  48 85 c0               test   rax,rax
   c0:  74 30                  je     f2 <stride_left+0x72>
   c2:  4a 8b 0c cd 00 00 00   mov    rcx,QWORD PTR [r9*8+0x0]
   c9:  00
   ca:  48 c1 e1 02            shl    rcx,0x2
   ce:  74 20                  je     f0 <stride_left+0x70>
   d0:  48 01 f9               add    rcx,rdi
   d3:  66 66 2e 0f 1f 84 00   data16 cs nop WORD PTR [rax+rax*1+0x0]
   da:  00 00 00 00
   de:  66 90                  xchg   ax,ax
   e0:  48 63 17               movsxd rdx,DWORD PTR [rdi]
   e3:  48 83 c7 04            add    rdi,0x4
   e7:  48 0f af c2            imul   rax,rdx
   eb:  48 39 f9               cmp    rcx,rdi
   ee:  75 f0                  jne    e0 <stride_left+0x60>
   f0:  89 c2                  mov    edx,eax
   f2:  89 d0                  mov    eax,edx
   f4:  c3                     ret
   f5:  0f 1f 00               nop    DWORD PTR [rax]
   f8:  b8 01 00 00 00         mov    eax,0x1
   fd:  eb c3                  jmp    c2 <stride_left+0x42>

is reduced to:

   50:  48 8b 0c f5 00 00 00   mov    rcx,QWORD PTR [rsi*8+0x0]
   57:  00
   58:  48 8b 04 f5 00 00 00   mov    rax,QWORD PTR [rsi*8+0x0]
   5f:  00
   60:  48 c1 e1 02            shl    rcx,0x2
   64:  74 1a                  je     80 <stride_left+0x30>
   66:  48 01 f9               add    rcx,rdi
   69:  0f 1f 80 00 00 00 00   nop    DWORD PTR [rax+0x0]
   70:  48 63 17               movsxd rdx,DWORD PTR [rdi]
   73:  48 83 c7 04            add    rdi,0x4
   77:  48 0f af c2            imul   rax,rdx
   7b:  48 39 f9               cmp    rcx,rdi
   7e:  75 f0                  jne    70 <stride_left+0x20>
   80:  c3                     ret

Loosely speaking this does the following:

   1. Load the starting position k in the array of dynamic extents.
   2. Load the partial product of static extents.
   3. Computes the \prod_{i = k}^d D[i] where d is the number of
   dynamic extents in a loop.

It shows that the span used for passing in the dynamic extents is
completely eliminated; and the fact that the product always runs to the
end of the array of dynamic extents is used by the compiler to eliminate
one indirection to determine the end position in the array of dynamic
extents.

The analogous code is generated for layout_right.

Next, consider

   using E2 = std::extents<int, 3, 5, dyn, dyn, 7, dyn, 11>;
   int size2(const std::mdspan<double, E2>& md)
   { return md.size(); }

on master the generated code is

   10:  b8 00 00 00 00         mov    eax,0x0
   15:  ba 01 00 00 00         mov    edx,0x1
   1a:  66 0f 1f 44 00 00      nop    WORD PTR [rax+rax*1+0x0]
   20:  48 8b 08               mov    rcx,QWORD PTR [rax]
   23:  48 89 d6               mov    rsi,rdx
   26:  48 0f af f1            imul   rsi,rcx
   2a:  48 83 f9 ff            cmp    rcx,0xffffffffffffffff
   2e:  48 0f 45 d6            cmovne rdx,rsi
   32:  48 83 c0 08            add    rax,0x8
   36:  48 3d 00 00 00 00      cmp    rax,0x0
   3c:  75 e2                  jne    20 <size2+0x10>
   3e:  31 c0                  xor    eax,eax
   40:  48 85 d2               test   rdx,rdx
   43:  74 12                  je     57 <size2+0x47>
   45:  48 63 07               movsxd rax,DWORD PTR [rdi]
   48:  48 63 4f 04            movsxd rcx,DWORD PTR [rdi+0x4]
   4c:  48 0f af c1            imul   rax,rcx
   50:  0f af 47 08            imul   eax,DWORD PTR [rdi+0x8]
   54:  0f af c2               imul   eax,edx
   57:  c3                     ret

the optimized version is:

   10:  48 63 07               movsxd rax,DWORD PTR [rdi]
   13:  48 63 57 04            movsxd rdx,DWORD PTR [rdi+0x4]
   17:  48 0f af c2            imul   rax,rdx
   1b:  0f af 47 08            imul   eax,DWORD PTR [rdi+0x8]
   1f:  69 c0 83 04 00 00      imul   eax,eax,0x483
   25:  c3                     ret

Which simply computes the product:

   D[0] * D[1] * D[2] * const

where const is the product of all static extents. Meaning the loop to
compute the product of dynamic extents has been fully unrolled and
all constants are perfectly precomputed.

libstdc++-v3/ChangeLog:

         * include/std/mdspan (__mdspan::__fwd_prod): Compute as the
         product of pre-computed static static and the product of dynamic
         extents.
         (__mdspan::__rev_prod): Ditto.

Signed-off-by: Luc Grosheintz <luc.groshei...@gmail.com>
---

I am bit concerned with additional code size with ths tables, so would work
out
on removing their dependency on index_type, by passing auto& with extents
arrray
instead of Extents type.

  libstdc++-v3/include/std/mdspan | 81 +++++++++++++++++++++++----------
  1 file changed, 56 insertions(+), 25 deletions(-)

diff --git a/libstdc++-v3/include/std/mdspan
b/libstdc++-v3/include/std/mdspan
index 5e79d4bfb59..06ccf3e3827 100644
--- a/libstdc++-v3/include/std/mdspan
+++ b/libstdc++-v3/include/std/mdspan
@@ -184,6 +184,49 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        __valid_static_extent = _Extent == dynamic_extent
         || _Extent <= numeric_limits<_IndexType>::max();

+    template<typename _Extents>

Same here, we do not depend on the index type.

+      consteval size_t
+      __static_prod(size_t __begin, size_t __end)
+      {
+       size_t __prod = 1;
+       for(size_t __i = __begin; __i < __end; ++__i)
+         {
+             auto __ext = _Extents::static_extent(__i);
+             __prod *= __ext == dynamic_extent ? size_t(1) : __ext;
+         }
+       return __prod;
+      }
+
+    // Pre-compute: \prod_{i = 0}^r _Extents[i]
+    template<typename _Extents>

We do not really depend on the index parameter here, so again to reduce the
number of instantiations (and the code bloat), I would use the
auto& parameter,
and change _Extents:_S_static_extents to return a reference to it.

Then have two __mdspan::__static_extents overloads:
__mdspan::__static_extents() -> array&
__mdspan::__static_extents(size_t, size_t) -> span

+      struct _FwdProd

+      {
+       constexpr static std::array<size_t, _Extents::rank() + 1> _S_value
=

I would declare them as inline constexpr variables, instead of declaring
static variables
inside the class.


Nice idea, though in a different context I proposed that

  template<size_t... _Extents>
    struct _RevProd
    {
       size_t _S_value(size_t i)
       { return _S_data[i]; }

    private:
       array _S_data = consteval ...;
    }

will give us more freedom to optimize various cases. Same pattern
can be used for _S_dynamic_index and _S_dynamic_index_inv.

+       [] consteval
+       {
+         constexpr size_t __rank = _Extents::rank();
+         std::array<size_t, __rank + 1> __ret;
+         for(size_t __r = 0; __r < __rank + 1; ++__r)
+           __ret[__r] = __static_prod<_Extents>(0, __r);
+         return __ret;
+       }();
+      };
+
+    // Pre-compute: \prod_{i = r+1}^n _Extents[i]
+    template<typename _Extents>
+      struct _RevProd
+      {
+       constexpr static std::array<size_t, _Extents::rank()> _S_value =
+       [] consteval
+       {
+         constexpr size_t __rank = _Extents::rank();
+         std::array<size_t, __rank> __ret;
+         for(size_t __r = 0; __r < __rank; ++__r)
+           __ret[__r] = __static_prod<_Extents>(__r + 1, __rank);
+         return __ret;
+       }();
+      };
+
      template<typename _Extents>
        constexpr span<const size_t>
        __static_extents(size_t __begin = 0, size_t __end =
_Extents::rank())
@@ -352,46 +395,34 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
           return false;
        }

-    constexpr size_t
-    __static_extents_prod(const auto& __sta_exts) noexcept
-    {
-      size_t __ret = 1;
-      for (auto __factor : __sta_exts)
-       if (__factor != dynamic_extent)
-         __ret *= __factor;
-      return __ret;
-    }
-
      template<typename _Extents>
        constexpr typename _Extents::index_type
-      __exts_prod(const _Extents& __exts, size_t __begin, size_t __end)
noexcept
+      __extents_prod(const _Extents& __exts, size_t __sta_prod, size_t
__begin,
+                    size_t __end)
        {
-       using _IndexType = typename _Extents::index_type;
-
-       size_t __ret = 1;
-       if constexpr (_Extents::rank_dynamic() != _Extents::rank())
-         {
-           auto __sta_exts = __static_extents<_Extents>(__begin, __end);
-           __ret = __static_extents_prod(__sta_exts);
-           if (__ret == 0)
-             return 0;
-         }
-
+       size_t __ret = __sta_prod;
         if constexpr (_Extents::rank_dynamic() > 0)
           for (auto __factor : __dynamic_extents(__exts, __begin, __end))
             __ret *= size_t(__factor);
-       return _IndexType(__ret);
+       return static_cast<typename _Extents::index_type>(__ret);
        }

      template<typename _Extents>
        constexpr typename _Extents::index_type
        __fwd_prod(const _Extents& __exts, size_t __r) noexcept
-      { return __exts_prod(__exts, 0, __r); }
+      {
+       size_t __sta_prod = _FwdProd<_Extents>::_S_value[__r];
+       return __extents_prod(__exts, __sta_prod, 0, __r);
+      }

      template<typename _Extents>
        constexpr typename _Extents::index_type
        __rev_prod(const _Extents& __exts, size_t __r) noexcept
-      { return __exts_prod(__exts, __r + 1, __exts.rank()); }
+      {
+       constexpr size_t __rank = _Extents::rank();
+       size_t __sta_prod = _RevProd<_Extents>::_S_value[__r];
+       return __extents_prod(__exts, __sta_prod, __r + 1, __rank);
+      }

      template<typename _Extents>
        constexpr typename _Extents::index_type
--
2.50.0




Reply via email to