This is an automated email from the git hooks/post-receive script.

logari81 pushed a commit to branch master
in repository getfem.
The following commit(s) were added to refs/heads/master by this push:
     new 0ce4bf39  Use hard coded loop unrollings
0ce4bf39 is described below

commit 0ce4bf3963d748964714a90851e6cd1dfc17a4b8
Author:     Konstantinos Poulios <logar...@gmail.com>
AuthorDate: Tue Dec 5 18:24:02 2023 +0100

    Use hard coded loop unrollings

    - recursive template unrolling seems to be fragile at least with gcc
    - changes do not introduce any significant slow down in test_assembly
      benchmarks, when compiled with O2 optimizations
---
 src/getfem_generic_assembly_compile_and_exec.cc | 401 +++++++++++++++++-------
 1 file changed, 289 insertions(+), 112 deletions(-)

diff --git a/src/getfem_generic_assembly_compile_and_exec.cc b/src/getfem_generic_assembly_compile_and_exec.cc
index 71793f6a..f51bcf80 100644
--- a/src/getfem_generic_assembly_compile_and_exec.cc
+++ b/src/getfem_generic_assembly_compile_and_exec.cc
@@ -25,7 +25,7 @@
 #include "getfem/getfem_generic_assembly_compile_and_exec.h"
 #include "getfem/getfem_generic_assembly_functions_and_operators.h"
 
-// #define GA_USES_BLAS // not so interesting, at least for debian blas
+//#define GA_USES_BLAS
 
 // #define GA_DEBUG_INFO(a) { cout << a << endl; }
 #define GA_DEBUG_INFO(a)
@@ -2515,43 +2515,46 @@ namespace getfem {
 
   // Performs Ani Bmi -> Cmn
   struct ga_instruction_contraction : public ga_instruction {
-    base_tensor &t, &tc1, &tc2;
-    size_type nn;
-    virtual int exec() {
-      GA_DEBUG_INFO("Instruction: contraction operation of size " << nn);
-#if GA_USES_BLAS
-      long m = int(tc1.size()/nn), k = int(nn), n = int(tc2.size()/nn);
-      long lda = m, ldb = n, ldc = m;
-      char T = 'T', N = 'N';
-      scalar_type alpha(1), beta(0);
-      gmm::dgemm_(&N, &T, &m, &n, &k, &alpha, &(tc1[0]), &lda, &(tc2[0]), &ldb,
-                  &beta, &(t[0]), &ldc);
+    base_tensor &t;
+    const base_tensor &tc1, &tc2;
+    const size_type I;
+    virtual int exec() {
+      GA_DEBUG_INFO("Instruction: contraction operation of size " << I);
+#if defined(GA_USES_BLAS)
+      BLAS_INT N = BLAS_INT(tc1.size()/I), I_ = BLAS_INT(I),
+               M = BLAS_INT(tc2.size()/I);
+      char notrans = 'N', trans = 'T';
+      static const scalar_type one(1), zero(0);
+      gmm::dgemm_(&notrans, &trans, &M, &N, &I_, &one,
+                  &(tc2[0]), &M, &(tc1[0]), &N, &zero, &(t[0]), &M);
 #else
-      size_type s1 = tc1.size()/nn, s2 = tc2.size()/nn;
-      GA_DEBUG_ASSERT(t.size() == s1*s2, "Internal error");
+      size_type N = tc1.size()/I,
+                M = tc2.size()/I;
+      GA_DEBUG_ASSERT(t.size() == N*M, "Internal error");
 
-      auto it1=tc1.begin(), it2=tc2.begin(), it2end=it2 + s2;
+      auto it1=tc1.begin(), it2=tc2.begin(), it2end=it2 + M;
       for (auto it = t.begin(); it != t.end(); ++it) {
         auto it11 = it1, it22 = it2;
         scalar_type a = (*it11) * (*it22);
-        for (size_type i = 1; i < nn; ++i)
-          { it11 += s1; it22 += s2; a += (*it11) * (*it22); }
+        for (size_type i = 1; i < I; ++i)
+          { it11 += N; it22 += M; a += (*it11) * (*it22); }
         *it = a;
         ++it2; if (it2 == it2end) { it2 = tc2.begin(), ++it1; }
       }
       // auto it = t.begin(); // Unoptimized version.
-      // for (size_type i = 0; i < s1; ++i)
-      //   for (size_type j = 0; j < s2; ++j, ++it) {
+      // for (size_type n = 0; n < N; ++n)
+      //   for (size_type m = 0; m < M; ++m, ++it) {
       //     *it = scalar_type(0);
-      //     for (size_type k = 0; k < nn; ++k)
-      //       *it += tc1[i+k*s1] * tc2[j+k*s2];
+      //     for (size_type i = 0; i < I; ++i)
+      //       *it += tc1[n+N*i] * tc2[m+M*i];
       //   }
 #endif
       return 0;
     }
-    ga_instruction_contraction(base_tensor &t_, base_tensor &tc1_,
-                               base_tensor &tc2_, size_type n_)
-      : t(t_), tc1(tc1_), tc2(tc2_), nn(n_) {}
+    ga_instruction_contraction(base_tensor &t_,
+                               const base_tensor &tc1_,
+                               const base_tensor &tc2_, size_type I_)
+      : t(t_), tc1(tc1_), tc2(tc2_), I(I_) {}
   };
 
   // Performs Ani Bmi -> Cmn
@@ -2850,29 +2853,116 @@ namespace getfem {
-  template<int N> inline scalar_type reduc_elem_unrolled__
-  (base_tensor::iterator &it1, base_tensor::iterator &it2,
-   size_type s1, size_type s2) {
-    return (it1[(N-1)*s1])*(it2[(N-1)*s2])
-      + reduc_elem_unrolled__<N-1>(it1, it2, s1, s2);
+  template<int I> inline
+  void reduc_elem_unrolled__(base_tensor::iterator &it,
+    base_tensor::const_iterator &it1, base_tensor::const_iterator &it2,
+    size_type s1, size_type s2) {
+    *it = (*it1)*(*it2);
+    for (int i=1; i < I; ++i)
+      *it += (*(it1+i*s1)) * (*(it2+i*s2));
+  }
+  template<> inline
+  void reduc_elem_unrolled__<9>(base_tensor::iterator &it,
+    base_tensor::const_iterator &it1, base_tensor::const_iterator &it2,
+    size_type s1, size_type s2) {
+    *it = (*it1) * (*it2);
+    *it += (*(it1+s1)) * (*(it2+s2));
+    *it += (*(it1+2*s1)) * (*(it2+2*s2));
+    *it += (*(it1+3*s1)) * (*(it2+3*s2));
+    *it += (*(it1+4*s1)) * (*(it2+4*s2));
+    *it += (*(it1+5*s1)) * (*(it2+5*s2));
+    *it += (*(it1+6*s1)) * (*(it2+6*s2));
+    *it += (*(it1+7*s1)) * (*(it2+7*s2));
+    *it += (*(it1+8*s1)) * (*(it2+8*s2));
+  }
+  template<> inline
+  void reduc_elem_unrolled__<8>(base_tensor::iterator &it,
+    base_tensor::const_iterator &it1, base_tensor::const_iterator &it2,
+    size_type s1, size_type s2) {
+    *it = (*it1) * (*it2);
+    *it += (*(it1+s1)) * (*(it2+s2));
+    *it += (*(it1+2*s1)) * (*(it2+2*s2));
+    *it += (*(it1+3*s1)) * (*(it2+3*s2));
+    *it += (*(it1+4*s1)) * (*(it2+4*s2));
+    *it += (*(it1+5*s1)) * (*(it2+5*s2));
+    *it += (*(it1+6*s1)) * (*(it2+6*s2));
+    *it += (*(it1+7*s1)) * (*(it2+7*s2));
+  }
+  template<> inline
+  void reduc_elem_unrolled__<7>(base_tensor::iterator &it,
+    base_tensor::const_iterator &it1, base_tensor::const_iterator &it2,
+    size_type s1, size_type s2) {
+    *it = (*it1) * (*it2);
+    *it += (*(it1+s1)) * (*(it2+s2));
+    *it += (*(it1+2*s1)) * (*(it2+2*s2));
+    *it += (*(it1+3*s1)) * (*(it2+3*s2));
+    *it += (*(it1+4*s1)) * (*(it2+4*s2));
+    *it += (*(it1+5*s1)) * (*(it2+5*s2));
+    *it += (*(it1+6*s1)) * (*(it2+6*s2));
+  }
+  template<> inline
+  void reduc_elem_unrolled__<6>(base_tensor::iterator &it,
+    base_tensor::const_iterator &it1, base_tensor::const_iterator &it2,
+    size_type s1, size_type s2) {
+    *it = (*it1) * (*it2);
+    *it += (*(it1+s1)) * (*(it2+s2));
+    *it += (*(it1+2*s1)) * (*(it2+2*s2));
+    *it += (*(it1+3*s1)) * (*(it2+3*s2));
+    *it += (*(it1+4*s1)) * (*(it2+4*s2));
+    *it += (*(it1+5*s1)) * (*(it2+5*s2));
   }
-  template<> inline scalar_type reduc_elem_unrolled__<1>
-  (base_tensor::iterator &it1, base_tensor::iterator &it2,
-   size_type /*s1*/, size_type /*s2*/)
-  { return (*it1)*(*it2); }
+  template<> inline
+  void reduc_elem_unrolled__<5>(base_tensor::iterator &it,
+    base_tensor::const_iterator &it1, base_tensor::const_iterator &it2,
+    size_type s1, size_type s2) {
+    *it = (*it1) * (*it2);
+    *it += (*(it1+s1)) * (*(it2+s2));
+    *it += (*(it1+2*s1)) * (*(it2+2*s2));
+    *it += (*(it1+3*s1)) * (*(it2+3*s2));
+    *it += (*(it1+4*s1)) * (*(it2+4*s2));
+  }
+  template<> inline
+  void reduc_elem_unrolled__<4>(base_tensor::iterator &it,
+    base_tensor::const_iterator &it1, base_tensor::const_iterator &it2,
+    size_type s1, size_type s2) {
+    *it = (*it1) * (*it2);
+    *it += (*(it1+s1)) * (*(it2+s2));
+    *it += (*(it1+2*s1)) * (*(it2+2*s2));
+    *it += (*(it1+3*s1)) * (*(it2+3*s2));
+  }
+  template<> inline
+  void reduc_elem_unrolled__<3>(base_tensor::iterator &it,
+    base_tensor::const_iterator &it1, base_tensor::const_iterator &it2,
+    size_type s1, size_type s2) {
+    *it = (*it1) * (*it2);
+    *it += (*(it1+s1)) * (*(it2+s2));
+    *it += (*(it1+2*s1)) * (*(it2+2*s2));
+  }
+  template<> inline
+  void reduc_elem_unrolled__<2>(base_tensor::iterator &it,
+    base_tensor::const_iterator &it1, base_tensor::const_iterator &it2,
+    size_type s1, size_type s2) {
+    *it = (*it1) * (*it2);
+    *it += (*(it1+s1)) * (*(it2+s2));
+  }
+  template<> inline
+  void reduc_elem_unrolled__<1>(base_tensor::iterator &it,
+    base_tensor::const_iterator &it1, base_tensor::const_iterator &it2,
+    size_type /*s1*/, size_type /*s2*/)
+  { *it = (*it1)*(*it2); }
 
   // Performs Ani Bmi -> Cmn. Unrolled operation.
-  template<int N> struct ga_instruction_contraction_unrolled
+  template<int I> struct ga_instruction_contraction_unrolled
     : public ga_instruction {
     base_tensor &t, &tc1, &tc2;
     virtual int exec() {
-      GA_DEBUG_INFO("Instruction: unrolled contraction operation of size " << N);
-      size_type s1 = tc1.size()/N, s2 = tc2.size()/N;
-      GA_DEBUG_ASSERT(t.size() == s1*s2, "Internal error, " << t.size()
-                      << " != " << s1 << "*" << s2);
-      base_tensor::iterator it1=tc1.begin(), it2=tc2.begin(), it2end=it2 + s2;
+      GA_DEBUG_INFO("Instruction: unrolled contraction operation of size " << I);
+      size_type N = tc1.size()/I, M = tc2.size()/I;
+      GA_DEBUG_ASSERT(t.size() == N*M, "Internal error, " << t.size()
+                      << " != " << N << "*" << M);
+      base_tensor::const_iterator it1=tc1.cbegin(), it2=tc2.cbegin(), it2end=it2+M;
       for (base_tensor::iterator it = t.begin(); it != t.end(); ++it) {
-        *it = reduc_elem_unrolled__<N>(it1, it2, s1, s2);
+        reduc_elem_unrolled__<I>(it, it1, it2, N, M);
         ++it2; if (it2 == it2end) { it2 = tc2.begin(), ++it1; }
       }
       return 0;
@@ -2883,62 +2973,63 @@ namespace getfem {
   };
 
   template<int N, int S2> inline void reduc_elem_d_unrolled__
-  (base_tensor::iterator &it, base_tensor::iterator &it1,
-   base_tensor::iterator &it2, size_type s1, size_type s2) {
-    *it++ = reduc_elem_unrolled__<N>(it1, it2, s1, s2);
-    reduc_elem_d_unrolled__<N, S2-1>(it, it1, ++it2, s1, s2);
+  (base_tensor::iterator &it,
+   base_tensor::const_iterator &it1, base_tensor::const_iterator &it2,
+   size_type s1, size_type s2) {
+    reduc_elem_unrolled__<N>(it, it1, it2, s1, s2);
+    reduc_elem_d_unrolled__<N, S2-1>(++it, it1, ++it2, s1, s2);
   }
   // A Repeated definition is following because partial specialization
   // of functions is not allowed in C++ for the moment.
   // The gain in assembly time is small compared to the simply unrolled version
   template<> inline void reduc_elem_d_unrolled__<1, 0>
-  (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
-   base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+  (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+   base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
   template<> inline void reduc_elem_d_unrolled__<2, 0>
-  (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
-   base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+  (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+   base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
   template<> inline void reduc_elem_d_unrolled__<3, 0>
-  (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
-   base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+  (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+   base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
   template<> inline void reduc_elem_d_unrolled__<4, 0>
-  (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
-   base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+  (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+   base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
   template<> inline void reduc_elem_d_unrolled__<5, 0>
-  (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
-   base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+  (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+   base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
   template<> inline void reduc_elem_d_unrolled__<6, 0>
-  (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
-   base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+  (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+   base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
   template<> inline void reduc_elem_d_unrolled__<7, 0>
-  (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
-   base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+  (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+   base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
   template<> inline void reduc_elem_d_unrolled__<8, 0>
-  (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
-   base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+  (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+   base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
   template<> inline void reduc_elem_d_unrolled__<9, 0>
-  (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
-   base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+  (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+   base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
   template<> inline void reduc_elem_d_unrolled__<10, 0>
-  (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
-   base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+  (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+   base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
   template<> inline void reduc_elem_d_unrolled__<11, 0>
-  (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
-   base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+  (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+   base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
   template<> inline void reduc_elem_d_unrolled__<12, 0>
-  (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
-   base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+  (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+   base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
   template<> inline void reduc_elem_d_unrolled__<13, 0>
-  (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
-   base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+  (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+   base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
   template<> inline void reduc_elem_d_unrolled__<14, 0>
-  (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
-   base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+  (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+   base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
   template<> inline void reduc_elem_d_unrolled__<15, 0>
-  (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
-   base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+  (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+   base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
   template<> inline void reduc_elem_d_unrolled__<16, 0>
-  (base_tensor::iterator &/* it */, base_tensor::iterator &/* it1 */,
-   base_tensor::iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
+  (base_tensor::iterator &/* it */, base_tensor::const_iterator &/* it1 */,
+   base_tensor::const_iterator &/* it2 */, size_type /* s1 */, size_type /* s2 */) { }
 
   // Performs Ani Bmi -> Cmn. Automatically doubly unrolled operation
   // (for uniform meshes).
@@ -2952,9 +3043,10 @@ namespace getfem {
       GA_DEBUG_ASSERT(s2 == S2, "Internal error");
       GA_DEBUG_ASSERT(t.size() == s1*s2, "Internal error, " << t.size()
                       << " != " << s1 << "*" << s2);
-      base_tensor::iterator it = t.begin(), it1 = tc1.begin();
+      base_tensor::iterator it = t.begin();
+      base_tensor::const_iterator it1 = tc1.cbegin();
       for (size_type ii = 0; ii < s1; ++ii, ++it1) {
-        base_tensor::iterator it2 = tc2.begin();
+        base_tensor::const_iterator it2 = tc2.cbegin();
         reduc_elem_d_unrolled__<N, S2>(it, it1, it2, s1, s2);
       }
       GA_DEBUG_ASSERT(it == t.end(), "Internal error");
@@ -3141,6 +3233,7 @@ namespace getfem {
     }
     switch(n) {
+    case 1 : GMM_ASSERT1(false, "n=1 should not happen");
     case 2 : return std::make_shared<ga_instruction_contraction_unrolled< 2>>
                     (t, tc1, tc2);
     case 3 : return std::make_shared<ga_instruction_contraction_unrolled< 3>>
@@ -3498,34 +3591,114 @@ namespace getfem {
       : t(t_), tc1(tc1_), tc2(tc2_) {}
   };
 
-  template<int S1> inline void tmult_elem_unrolled__
-  (base_tensor::iterator &it, base_tensor::iterator &it1,
-   base_tensor::iterator &it2) {
-    *it++ = (*it1++)*(*it2);
-    tmult_elem_unrolled__<S1-1>(it, it1, it2);
+  template<int I> inline void dax__(base_tensor::iterator &it,
+                                    base_tensor::const_iterator &itx,
+                                    const scalar_type &a) {
+    constexpr int I1 = I/8;
+    constexpr int I2 = I - I1*8;
+    for (int i=0; i < I1; ++i)
+      dax__<8>(it, itx , a);
+    dax__<I2>(it, itx , a);
+  }
+  template<> inline void dax__<8>(base_tensor::iterator &it,
+                                  base_tensor::const_iterator &itx,
+                                  const scalar_type &a) {
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
   }
-  template<> inline void tmult_elem_unrolled__<0>
-  (base_tensor::iterator &/*it*/, base_tensor::iterator &/*it1*/,
-   base_tensor::iterator &/*it2*/) { }
+  template<> inline void dax__<7>(base_tensor::iterator &it,
+                                  base_tensor::const_iterator &itx,
+                                  const scalar_type &a) {
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+  }
+  template<> inline void dax__<6>(base_tensor::iterator &it,
+                                  base_tensor::const_iterator &itx,
+                                  const scalar_type &a) {
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+  }
+  template<> inline void dax__<5>(base_tensor::iterator &it,
+                                  base_tensor::const_iterator &itx,
+                                  const scalar_type &a) {
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+  }
+  template<> inline void dax__<4>(base_tensor::iterator &it,
+                                  base_tensor::const_iterator &itx,
+                                  const scalar_type &a) {
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+  }
+  template<> inline void dax__<3>(base_tensor::iterator &it,
+                                  base_tensor::const_iterator &itx,
+                                  const scalar_type &a) {
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+  }
+  template<> inline void dax__<2>(base_tensor::iterator &it,
+                                  base_tensor::const_iterator &itx,
+                                  const scalar_type &a) {
+    *it++ = *itx++ * a;
+    *it++ = *itx++ * a;
+  }
+  template<> inline void dax__<1>(base_tensor::iterator &it,
+                                  base_tensor::const_iterator &itx,
+                                  const scalar_type &a) {
+    *it++ = *itx++ * a;
+  }
+  template<> inline void dax__<0>(base_tensor::iterator &,
+                                  base_tensor::const_iterator &,
+                                  const scalar_type &) {}
 
   // Performs Aij Bkl -> Cijkl, partially unrolled version
-  template<int S1> struct ga_instruction_simple_tmult_unrolled
+  template<int IJ> struct ga_instruction_simple_tmult_unrolled
     : public ga_instruction {
-    base_tensor &t, &tc1, &tc2;
+    base_tensor &t;
+    const base_tensor &tc1, &tc2;
     virtual int exec() {
-      size_type s2 = tc2.size();
-      GA_DEBUG_ASSERT(tc1.size() == S1,
-                      "Wrong sizes " << tc1.size() << " != " << S1);
+      size_type KL = tc2.size();
+      GA_DEBUG_ASSERT(tc1.size() == IJ,
+                      "Wrong sizes " << tc1.size() << " != " << IJ);
       GA_DEBUG_INFO("Instruction: simple tensor product, unrolled with "
-                    << S1 << " operations");
-      GA_DEBUG_ASSERT(t.size() == S1 * s2,
-                      "Wrong sizes " << t.size() << " != " << S1 << "*" << s2);
-      base_tensor::iterator it = t.begin(), it2 = tc2.begin();
-      for (size_type ii = 0; ii < s2; ++ii, ++it2) {
-        base_tensor::iterator it1 = tc1.begin();
-        tmult_elem_unrolled__<S1>(it, it1, it2);
+                    << IJ << " operations");
+      GA_DEBUG_ASSERT(t.size() == IJ * KL,
+                      "Wrong sizes " << t.size() << " != " << IJ << "*" << KL);
+#if 0 // too slow, how can this be? that's what dger should be good at. (it is slower even without the std::fill overhead)
+      const BLAS_INT IJ_=BLAS_INT(IJ), KL_=BLAS_INT(KL), INC(1);
+      const scalar_type one(1);
+      std::fill(t.begin(), t.end(), scalar_type(0));
+      gmm::dger_(&IJ_, &KL_, &one, &tc1[0], &INC, &tc2[0], &INC, &(t[0]), &IJ_);
+#else
+      base_tensor::iterator it = t.begin();
+      base_tensor::const_iterator it2 = tc2.cbegin();
+      for (size_type kl = 0; kl < KL; ++kl, ++it2) {
+        base_tensor::const_iterator it1 = tc1.cbegin();
+        dax__<IJ>(it, it1, *it2);
       }
       GA_DEBUG_ASSERT(it == t.end(), "Internal error");
+#endif
       return 0;
     }
     ga_instruction_simple_tmult_unrolled(base_tensor &t_, base_tensor &tc1_,
@@ -3536,6 +3709,7 @@ namespace getfem {
   pga_instruction ga_uniform_instruction_simple_tmult
   (base_tensor &t, base_tensor &tc1, base_tensor &tc2) {
     switch(tc1.size()) {
+    case 1 : GMM_ASSERT1(false, "size 1 should not happen");
     case 2 : return std::make_shared<ga_instruction_simple_tmult_unrolled< 2>>
                     (t, tc1, tc2);
     case 3 : return std::make_shared<ga_instruction_simple_tmult_unrolled< 3>>
@@ -3574,27 +3748,30 @@ namespace getfem {
 
   // Performs Ami Bnj -> Cmnij. To be optimized.
   struct ga_instruction_spec_tmult : public ga_instruction {
-    base_tensor &t, &tc1, &tc2;
-    size_type s1_2, s2_2;
+    base_tensor &t;
+    const base_tensor &tc1, &tc2;
+    const size_type I, J;
     virtual int exec() {
       GA_DEBUG_INFO("Instruction: specific tensor product");
       GA_DEBUG_ASSERT(t.size() == tc1.size() * tc2.size(), "Wrong sizes");
-      size_type s1_1 = tc1.size() / s1_2;
-      size_type s2_1 = tc2.size() / s2_2;
-
+      const size_type M = tc1.size() / I,
+                      N = tc2.size() / J;
       base_tensor::iterator it = t.begin();
-      for (size_type j = 0; j < s2_2; ++j)
-        for (size_type i = 0; i < s1_2; ++i)
-          for (size_type n = 0; n < s2_1; ++n)
-            for (size_type m = 0; m < s1_1; ++m, ++it)
-              *it = tc1[m+i*s1_1] * tc2[n+j*s2_1];
+      for (size_type j = 0; j < J; ++j)
+        for (size_type i = 0; i < I; ++i)
+          for (size_type n = 0; n < N; ++n) {
+            scalar_type val = tc2[n+N*j];
+            for (size_type m = 0; m < M; ++m, ++it)
+              *it = tc1[m+M*i] * val;
+          }
       GA_DEBUG_ASSERT(it == t.end(), "Wrong sizes");
       return 0;
     }
-    ga_instruction_spec_tmult(base_tensor &t_, base_tensor &tc1_,
-                              base_tensor &tc2_, size_type s1_2_,
-                              size_type s2_2_)
-      : t(t_), tc1(tc1_), tc2(tc2_), s1_2(s1_2_), s2_2(s2_2_) {}
+    ga_instruction_spec_tmult(base_tensor &t_,
+                              const base_tensor &tc1_,
+                              const base_tensor &tc2_,
+                              size_type I_, size_type J_)
+      : t(t_), tc1(tc1_), tc2(tc2_), I(I_), J(J_) {}
   };
 
   // Performs Ai Bmj -> Cmij. To be optimized.
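
For readers skimming the patch, here is a minimal, self-contained sketch of the two
unrolling styles the commit message contrasts: the old recursive template unrolling,
which relies on the compiler inlining the whole recursion chain and is reported to be
fragile at least with gcc, and the new hard-coded style with one explicit
specialization per size, as used by reduc_elem_unrolled__ and dax__ in the patch.
The sketch is not part of the commit; the names dot_recursive and dot_unrolled are
made up for illustration only.

    #include <cstddef>

    using scalar_type = double;

    // Old style: recursive template unrolling.  Each level peels off one
    // multiply-add and the compiler must inline the whole chain.
    template<int N>
    inline scalar_type dot_recursive(const scalar_type *x, const scalar_type *y,
                                     std::size_t sx, std::size_t sy) {
      return x[(N-1)*sx] * y[(N-1)*sy] + dot_recursive<N-1>(x, y, sx, sy);
    }
    template<>
    inline scalar_type dot_recursive<1>(const scalar_type *x, const scalar_type *y,
                                        std::size_t, std::size_t) {
      return x[0] * y[0];
    }

    // New style: one hard-coded specialization per size, no recursion.
    // Only the N == 3 case is written out in this sketch.
    template<int N>
    scalar_type dot_unrolled(const scalar_type *x, const scalar_type *y,
                             std::size_t sx, std::size_t sy);
    template<>
    inline scalar_type dot_unrolled<3>(const scalar_type *x, const scalar_type *y,
                                       std::size_t sx, std::size_t sy) {
      scalar_type a = x[0] * y[0];    // explicit multiply-add sequence,
      a += x[sx] * y[sy];             // nothing left for the compiler to unroll
      a += x[2*sx] * y[2*sy];
      return a;
    }

    int main() {
      const scalar_type x[3] = {1., 2., 3.}, y[3] = {4., 5., 6.};
      // Both variants compute the same strided dot product (here 32).
      return dot_recursive<3>(x, y, 1, 1) == dot_unrolled<3>(x, y, 1, 1) ? 0 : 1;
    }

Both variants produce the same result; the hard-coded form simply does not depend on
the optimizer collapsing a recursive instantiation chain, which is the robustness
argument given in the commit message.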