[Getfem-commits] [getfem-commits] branch improve-expm-performance created (now ee139e69)

2023-10-20 Thread Konstantinos Poulios via Getfem-commits
logari81 pushed a change to branch improve-expm-performance.

  at ee139e69 Use a Pade approximation for expm, ported from 
Eigen/Unsupported

This branch includes the following new commits:

 new ee139e69 Use a Pade approximation for expm, ported from 
Eigen/Unsupported




[Getfem-commits] (no subject)

2023-10-20 Thread Konstantinos Poulios via Getfem-commits
branch: improve-expm-performance
commit ee139e697b50db9198ef8257e2c492202b037a35
Author: Konstantinos Poulios 
AuthorDate: Fri Oct 20 10:59:42 2023 +0200

Use a Pade approximation for expm, ported from Eigen/Unsupported
---
 src/getfem_plasticity.cc | 472 ++-
 1 file changed, 386 insertions(+), 86 deletions(-)

diff --git a/src/getfem_plasticity.cc b/src/getfem_plasticity.cc
index 5a0ec9c0..69ca66b2 100644
--- a/src/getfem_plasticity.cc
+++ b/src/getfem_plasticity.cc
@@ -46,109 +46,409 @@ namespace getfem {
 { mi.resize(2); mi[0] = mi[1] = N; }
 
 
-  bool expm(const base_matrix _, base_matrix , scalar_type tol=1e-15) {
+  inline void matmul(base_matrix ,base_matrix ,base_matrix )
+{gmm::mult(aa,bb,cc);}
 
-const size_type itmax = 40;
-base_matrix a(a_);
-// scale input matrix a
-int e;
-frexp(gmm::mat_norminf(a), );
-e = std::max(0, std::min(1023, e));
-gmm::scale(a, pow(scalar_type(2),-scalar_type(e)));
-
-base_matrix atmp(a), an(a);
-gmm::copy(a, aexp);
-gmm::add(gmm::identity_matrix(), aexp);
-scalar_type factn(1);
+  bool expm(const base_matrix _, base_matrix ) {
+
+const size_type N = gmm::mat_nrows(a_);
 bool success(false);
-for (size_type n=2; n < itmax; ++n) {
-  factn /= scalar_type(n);
-  gmm::mult(an, a, atmp);
-  gmm::copy(atmp, an);
-  gmm::scale(atmp, factn);
-  gmm::add(atmp, aexp);
-  if (gmm::mat_euclidean_norm(atmp) < tol) {
-success = true;
-break;
-  }
+
+// Pade approximation ported from Eigen/Unsupported
+base_matrix a(a_);
+gmm::clear(aexp.as_vector());
+base_matrix tmp(aexp), v(aexp), u(aexp); // Pade approximant is (v+u)/(v-u)
+const scalar_type l1norm = gmm::mat_norminf(a_);
+int e = 0; // squarings
+if (l1norm < 1.495585217958292e-002) { // matrix_exp_pade3(a, u, v)
+  const static std::array b{120,60,12,1};
+  base_matrix a2(a);
+  matmul(a, a, a2);
+  gmm::copy(gmm::scaled(a2,b[2]), v);   // v = b2*A2 + b0*I
+  gmm::copy(gmm::scaled(a2,b[3]), u);   // u = b3*A2 + b1*I
+  for (size_type ij=0; ij < N; ++ij)
+{ v(ij,ij) += b[0]; u(ij,ij) += b[1]; }
+} else if (l1norm < 2.539398330063230e-001) { // matrix_exp_pade5(a, u, v)
+  const static std::array b{30240,15120,3360,420,30,1};
+  base_matrix a2(a), a4(a);
+  matmul(a, a, a2);
+  matmul(a2, a2, a4);
+  gmm::add(gmm::scaled(a4,b[4]),// v = b4*A4 + b2*A2 + b0*I
+   gmm::scaled(a2,b[2]), v);
+  gmm::add(gmm::scaled(a4,b[5]),// u = b5*A4 + b3*A2 + b1*I
+   gmm::scaled(a2,b[3]), u);
+  for (size_type ij=0; ij < N; ++ij)
+{ v(ij,ij) += b[0]; u(ij,ij) += b[1]; }
+} else if (l1norm < 9.504178996162932e-001) { // matrix_exp_pade7(a, u, v)
+  const static std::array
+b{17297280,8648640,1995840,277200,25200,1512,56,1};
+  base_matrix a2(a), a4(a), a6(a);
+  matmul(a, a, a2);
+  matmul(a2, a2, a4);
+  matmul(a2, a4, a6);
+  gmm::add(gmm::scaled(a6,b[6]),// v = b6*A6 + b4*A4 + b2*A2 + b0*I
+   gmm::scaled(a4,b[4]), v);
+  gmm::add(gmm::scaled(a2,b[2]), v);
+  gmm::add(gmm::scaled(a6,b[7]),// u = b7*A6 + b5*A4 + b3*A2 + b1*I
+   gmm::scaled(a4,b[5]), u);
+  gmm::add(gmm::scaled(a2,b[3]), u);
+  for (size_type ij=0; ij < N; ++ij)
+{ v(ij,ij) += b[0]; u(ij,ij) += b[1]; }
+} else if (l1norm < 2.097847961257068e+000) { // matrix_exp_pade9(a, u, v)
+  const static std::array
+b{17643225600,8821612800,2075673600,302702400,30270240,2162160,
+  110880,3960,90,1};
+  base_matrix a2(a), a4(a), a6(a), a8(a);
+  matmul(a, a, a2);
+  matmul(a2, a2, a4);
+  matmul(a2, a4, a6);
+  matmul(a4, a4, a8);
+  gmm::add(gmm::scaled(a8,b[8]),// v = b8*A8+b6*A6+b4*A4+b2*A2+b0*I
+   gmm::scaled(a6,b[6]), v);
+  gmm::add(gmm::scaled(a4,b[4]), v);
+  gmm::add(gmm::scaled(a2,b[2]), v);
+  gmm::add(gmm::scaled(a8,b[9]),// u = b9*A8+b7*A6+b5*A4+b3*A2+b1*I
+   gmm::scaled(a6,b[7]), u);
+  gmm::add(gmm::scaled(a4,b[5]), u);
+  gmm::add(gmm::scaled(a2,b[3]), u);
+  for (size_type ij=0; ij < N; ++ij)
+{ v(ij,ij) += b[0]; u(ij,ij) += b[1]; }
+} else { // matrix_exp_pade13(a, U, V);
+  const scalar_type maxnorm = 5.371920351148152;
+  frexp(l1norm / maxnorm, );
+  if (e <= 0) e = 0;
+  else for (auto & : a.as_vector()) { val = ldexp(val,-e); }
+   // <==> gmm::scale(a, pow(scalar_type(2),-scalar_type(e)));
+  const static std::array
+b{6476475253248,3238237626624,7771770303897600,
+  1187353796428800,129060195264000,10559470521600,670442572800,
+  33522128640,1323241920,40840800,960960,16380,182,1};
+  base_matrix a2(a), a4(a), a6(a);
+  matmul(a, a, a2);
+  matmul(a2, a2, a4);
+  matmul(a2, a4, a6);
+