ni...@lysator.liu.se (Niels Möller) writes:

> Your idea of conditionally adding the invariant d * B2 at the right
> place is also interesting,

I've tried it out. It works nicely, but gives no speedup on my machine.
I'm attaching another patch. There are now four methods:

method 1: Old loop around udiv_qrnnd_preinv.

method 2: The clever code from 10 years ago, with the microoptimization
  I committed the other day.

method 3: More or less the same as I posted a few days ago.

method 4: Postpones the update u1 -= u2 & d, moving it off the critical
  recurrence chain. Instead, it conditionally adds the constant B2 * (B - d)
  to the lower u limbs. (The sketch after this list spells out the B2
  identity both new methods rely on.)
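
Both new methods lean on the same reciprocal identity (with B = 2^64 on a
64-bit machine and B2 = -d*dinv mod B, as computed in the patch):

  B^2 = d*(B + dinv) + B2,   0 < B2 <= d,

so folding the high limb u1 adds u1*(B + dinv) to the quotient and u1*B2 to
the two low remainder limbs. Here's a throwaway sanity check of that identity
(not GMP code); it assumes 64-bit limbs, a compiler with unsigned __int128,
and the usual normalized reciprocal dinv = floor((B^2-1)/d) - B:

/* Throwaway check of B^2 = d*(B + dinv) + B2 -- not GMP code. */
#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint64_t d = UINT64_C (0x8765432108765432);	/* normalized: high bit set */

  /* B^2 - 1 is exactly the all-ones __int128, so this is floor((B^2-1)/d) - B. */
  uint64_t dinv = (uint64_t) (~(unsigned __int128) 0 / d
			      - ((unsigned __int128) 1 << 64));

  uint64_t B2 = - (d * dinv);	/* -d*dinv mod B, as in the patch */

  /* The left side B^2 = 2^128 wraps to 0 in unsigned __int128 arithmetic. */
  assert ((unsigned __int128) d * dinv + ((unsigned __int128) d << 64) + B2 == 0);
  assert (B2 > 0 && B2 <= d);

  return 0;
}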

Benchmark on my laptop:

$ ./speed -p 1000000 -s 2-20 -C mpn_div_qr_1n_pi1.0x8765432108765432 \
    mpn_div_qr_1n_pi1_1.0x8765432108765432 mpn_div_qr_1n_pi1_2.0x8765432108765432 \
    mpn_div_qr_1n_pi1_3.0x8765432108765432 mpn_div_qr_1n_pi1_4.0x8765432108765432
overhead 2.63 cycles, precision 1000000 units of 7.16e-10 secs, CPU freq 1396.05 MHz
              asm          method 1      method 2      method 3      method 4
2              4.4566       #3.9635        5.8490        5.4085        6.0087
3              4.4323       #4.2441        5.8115        5.1158        5.7832
4              4.5348       #4.3992        5.7306        5.0807        5.8611
5             #4.5534        4.6698        5.7493        4.9803        5.5605
6             #4.5653        4.8412        5.7497        4.9129        5.6516
7             #4.6069        5.1388        5.7388        4.8811        5.6110
8             #4.6202        5.5073        5.7423        4.9359        5.5695
9             #4.6433        5.7341        5.7537        4.9357        5.5407
10            #4.6436        5.9595        5.7400        4.9231        5.5428
11            #4.6698        6.1348        5.7449        4.9430        5.5237
12            #4.6395        6.2541        5.7378        4.9452        5.5239
13            #4.6905        6.3761        5.7373        4.9482        5.5085
14            #4.6700        6.4692        5.8173        4.9447        5.5006
15            #4.6643        6.5548        5.7426        4.9644        5.4958
16            #4.6809        6.6305        5.7439        4.9625        5.4924
17            #4.6800        6.6901        5.7418        4.9576        5.4760
18            #4.6903        6.7440        5.7436        4.9866        5.4840
19            #4.6886        6.7891        5.7366        4.9753        5.4872
20            #4.6818        6.8370        5.7405        4.9783        5.4820
              asm          method 1      method 2      method 3      method 4

So on my laptop, method 3 wins among the C variants, ending up just
0.3 cycles/limb behind the assembly implementation.

It's also curious that method 1 wins for the smaller operands; maybe
that's the cost of computing B2 = -d * dinv? I think an assembly
implementation of method 3 may run short of registers (but I haven't
looked at what the compiler generates). To reduce the number of live
registers, one might need to complete the u accumulation before the q
multiply.
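
If the B2 = -d*dinv setup really is what costs methods 3 and 4 for small n,
the FIXMEs in the patch point at the obvious cure: compute it once per
divisor. A rough sketch of what a caller-side cache could look like (the
struct and function names here are made up, not part of the patch):

#include "gmp-impl.h"
#include "longlong.h"

/* Hypothetical per-divisor cache, so that B2 (and, for method 4, the
   two-limb constant B2*(B-d)) is computed once instead of per call. */
struct div_qr_1n_precomp
{
  mp_limb_t dinv;		/* reciprocal, as from invert_limb */
  mp_limb_t B2;			/* -d*dinv mod B */
  mp_limb_t B2d1, B2d0;		/* B2 * (B - d), used by method 4 */
};

static void
div_qr_1n_precompute (struct div_qr_1n_precomp *pre, mp_limb_t d, mp_limb_t dinv)
{
  ASSERT (d & GMP_LIMB_HIGHBIT);
  pre->dinv = dinv;
  pre->B2 = -d * dinv;				/* same expression as in the patch */
  umul_ppmm (pre->B2d1, pre->B2d0, pre->B2, -d);	/* B2 * (B - d) */
}

Method 4 would then take pre->B2, pre->B2d1 and pre->B2d0 as arguments
instead of recomputing them from d and dinv on every call.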

Regards,
/Niels

diff -r a9f0db9f7199 mpn/generic/div_qr_1n_pi1.c
--- a/mpn/generic/div_qr_1n_pi1.c	Thu Jun 03 23:50:08 2021 +0200
+++ b/mpn/generic/div_qr_1n_pi1.c	Sun Jun 06 19:47:36 2021 +0200
@@ -298,6 +298,208 @@
   return u0;
 }
 
+#elif DIV_QR_1N_METHOD == 3
+
+/* This variant handles carry from the u update earlier. This gives a
+   longer critical path, but reduces the work needed for the
+   quotients. */
+mp_limb_t
+mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
+		   mp_limb_t d, mp_limb_t dinv)
+{
+  mp_limb_t B2;
+  mp_limb_t cy, u0;
+  mp_limb_t q0, q1;
+  mp_limb_t p0, p1;
+  mp_limb_t t;
+  mp_size_t j;
+
+  ASSERT (d & GMP_LIMB_HIGHBIT);
+  ASSERT (n > 0);
+  ASSERT (u1 < d);
+
+  if (n == 1)
+    {
+      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
+      return u1;
+    }
+
+  /* FIXME: Could be precomputed */
+  B2 = -d*dinv;
+
+  umul_ppmm (q1, q0, dinv, u1);
+  umul_ppmm (p1, p0, B2, u1);
+  q1 += u1;
+  ASSERT (q1 >= u1);
+  u0 = up[n-1];	/* Early read, to allow qp == up. */
+
+  add_mssaaaa (cy, u1, u0, u0, up[n-2], p1, p0);
+  u1 -= cy & d;
+  q1 -= cy;
+  qp[n-1] = q1;
+
+  /* FIXME: Keep q1 in a variable between iterations, to reduce number
+     of memory accesses. */
+  for (j = n-2; j-- > 0; )
+    {
+      mp_limb_t q2, cy;
+      mp_limb_t t1, t0;
+
+      /* Additions for the q update:
+       *	+-------+
+       *        |u1 * v |
+       *        +---+---+
+       *        | u1|
+       *        +---+
+       *        | 1 |  (conditional on {u1, u0} carry)
+       *        +---+
+       * +      | q0|
+       *   -+---+---+---+
+       *    | q2| q1| q0|
+       *    +---+---+---+
+       *
+       * Additions for the u update:
+       *        +-------+
+       *        |u1 * B2|
+       *        +---+---+
+       *      + |u0 |u-1|
+       *        +---+---+
+       *      - | d |     (conditional on carry)
+       *     ---+---+---+
+       *        |u1 | u0|
+       *        +---+---+
+       *
+      */
+      umul_ppmm (p1, p0, u1, B2);
+      ADDC_LIMB (q2, q1, u1, q0);
+      umul_ppmm (t1, t0, u1, dinv);
+      add_mssaaaa (cy, u1, u0, u0, up[j], p1, p0);
+      u1 -= cy & d;
+
+      /* t1 <= B-2, so cy can be added in without overflow. */
+      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - cy);
+      q0 = t0;
+
+      /* Final q update */
+      qp[j+1] = q1;
+      MPN_INCR_U (qp+j+2, n-j-2, q2);
+    }
+
+  q1 = (u1 >= d);
+  u1 -= (-q1) & d;
+
+  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
+  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);
+
+  MPN_INCR_U (qp+1, n-1, q1);
+
+  qp[0] = q0;
+  return u0;
+}
+
+#elif DIV_QR_1N_METHOD == 4
+
+mp_limb_t
+mpn_div_qr_1n_pi1 (mp_ptr qp, mp_srcptr up, mp_size_t n, mp_limb_t u1,
+		   mp_limb_t d, mp_limb_t dinv)
+{
+  mp_limb_t B2;
+  mp_limb_t u2, u0;
+  mp_limb_t q0, q1;
+  mp_limb_t p0, p1;
+  mp_limb_t B2d0, B2d1;
+  mp_limb_t t;
+  mp_size_t j;
+
+  ASSERT (d & GMP_LIMB_HIGHBIT);
+  ASSERT (n > 0);
+  ASSERT (u1 < d);
+
+  if (n == 1)
+    {
+      udiv_qrnnd_preinv (qp[0], u1, u1, up[0], d, dinv);
+      return u1;
+    }
+
+  /* FIXME: Could be precomputed */
+  B2 = -d*dinv;
+  /* B2 * (B-d) */
+  umul_ppmm (B2d1, B2d0, B2, -d);
+
+  umul_ppmm (q1, q0, dinv, u1);
+  umul_ppmm (p1, p0, B2, u1);
+  q1 += u1;
+  ASSERT (q1 >= u1);
+
+  add_mssaaaa (u2, u1, u0, up[n-1], up[n-2], p1, p0);
+
+  /* After read of up[n-1], to allow qp == up. */
+  qp[n-1] = q1 - u2;
+
+  /* FIXME: Keep q1 in a variable between iterations, to reduce number
+     of memory accesses. */
+  for (j = n-2; j-- > 0; )
+    {
+      mp_limb_t q2, cy;
+      mp_limb_t t1, t0;
+
+      /* Additions for the q update. *After* u1 -= u2 & d adjustment.
+       *	+-------+
+       *        |u1 * v |
+       *        +---+---+
+       *        | u1|
+       *        +---+
+       *        | 1 |  (conditional on {u1, u0} carry)
+       *        +---+
+       * +      | q0|
+       *   -+---+---+---+
+       *    | q2| q1| q0|
+       *    +---+---+---+
+       *
+       * Additions for the u update. *Before* u1 -= u2 & d adjustment.
+       *        +-------+
+       *        |u1 * B2|
+       *        +---+---+
+       *        |u0 |u-1|
+       *        +---+---+
+       * +      |B2(B-d)| (conditional on u2)
+       *   -+---+---+---+
+       *    |u2 |u1 | u0|
+       *    +---+---+---+
+       *
+      */
+      /* Multiply with unadjusted u1, to shorten the critical path. */
+      umul_ppmm (p1, p0, u1, B2);
+      u1 -= (d & u2);
+      ADDC_LIMB (q2, q1, u1, q0);
+      umul_ppmm (t1, t0, u1, dinv);
+
+      add_mssaaaa (cy, u1, u0, u0, up[j], u2 & B2d1, u2 & B2d0);
+      add_mssaaaa (u2, u1, u0, u1, u0, p1, p0);
+      u2 += cy;
+      ASSERT(-u2 <= 1);
+
+      /* t1 <= B-2, so u2 can be added in without overflow. */
+      add_ssaaaa (q2, q1, q2, q1, CNST_LIMB(0), t1 - u2);
+      q0 = t0;
+
+      /* Final q update */
+      qp[j+1] = q1;
+      MPN_INCR_U (qp+j+2, n-j-2, q2);
+    }
+  u1 -= u2 & d;
+
+  q1 = (u1 >= d);
+  u1 -= (-q1) & d;
+
+  udiv_qrnnd_preinv (t, u0, u1, u0, d, dinv);
+  add_ssaaaa (q1, q0, q1, q0, CNST_LIMB(0), t);
+
+  MPN_INCR_U (qp+1, n-1, q1);
+
+  qp[0] = q0;
+  return u0;
+}
 #else
 #error Unknown DIV_QR_1N_METHOD
 #endif
diff -r a9f0db9f7199 tune/Makefile.am
--- a/tune/Makefile.am	Thu Jun 03 23:50:08 2021 +0200
+++ b/tune/Makefile.am	Sun Jun 06 19:47:36 2021 +0200
@@ -53,7 +53,8 @@
 
 libspeed_la_SOURCES =							\
   common.c divrem1div.c divrem1inv.c divrem2div.c divrem2inv.c		\
-  div_qr_1n_pi1_1.c div_qr_1n_pi1_2.c div_qr_1_tune.c			\
+  div_qr_1n_pi1_1.c div_qr_1n_pi1_2.c div_qr_1n_pi1_3.c 		\
+  div_qr_1n_pi1_4.c div_qr_1_tune.c					\
   freq.c								\
   gcdext_single.c gcdext_double.c gcdextod.c gcdextos.c			\
   hgcd_lehmer.c hgcd_appr_lehmer.c hgcd_reduce_1.c hgcd_reduce_2.c	\
diff -r a9f0db9f7199 tune/common.c
--- a/tune/common.c	Thu Jun 03 23:50:08 2021 +0200
+++ b/tune/common.c	Sun Jun 06 19:47:36 2021 +0200
@@ -718,6 +718,16 @@
 {
   SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1_2);
 }
+double
+speed_mpn_div_qr_1n_pi1_3 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1_3);
+}
+double
+speed_mpn_div_qr_1n_pi1_4 (struct speed_params *s)
+{
+  SPEED_ROUTINE_MPN_DIV_QR_1N_PI1 (mpn_div_qr_1n_pi1_4);
+}
 
 double
 speed_mpn_div_qr_1 (struct speed_params *s)
diff -r a9f0db9f7199 tune/div_qr_1_tune.c
--- a/tune/div_qr_1_tune.c	Thu Jun 03 23:50:08 2021 +0200
+++ b/tune/div_qr_1_tune.c	Sun Jun 06 19:47:36 2021 +0200
@@ -34,10 +34,13 @@
 
 mp_limb_t mpn_div_qr_1n_pi1_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
 mp_limb_t mpn_div_qr_1n_pi1_2 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
+mp_limb_t mpn_div_qr_1n_pi1_3 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
 
 #if !HAVE_NATIVE_mpn_div_qr_1n_pi1
-#define __gmpn_div_qr_1n_pi1 \
-  (div_qr_1n_pi1_method == 1 ? mpn_div_qr_1n_pi1_1 : mpn_div_qr_1n_pi1_2)
+#define __gmpn_div_qr_1n_pi1						\
+  (div_qr_1n_pi1_method <= 2						\
+   ? (div_qr_1n_pi1_method == 1 ? mpn_div_qr_1n_pi1_1 : mpn_div_qr_1n_pi1_2) \
+   : (div_qr_1n_pi1_method == 3 ? mpn_div_qr_1n_pi1_3 : mpn_div_qr_1n_pi1_4))
 #endif
 
 #undef mpn_div_qr_1
diff -r a9f0db9f7199 tune/div_qr_1n_pi1_3.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tune/div_qr_1n_pi1_3.c	Sun Jun 06 19:47:36 2021 +0200
@@ -0,0 +1,38 @@
+/* mpn/generic/div_qr_1n_pi1.c method 3.
+
+Copyright 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef DIV_QR_1N_METHOD
+#define DIV_QR_1N_METHOD 3
+#undef mpn_div_qr_1n_pi1
+#define mpn_div_qr_1n_pi1 mpn_div_qr_1n_pi1_3
+
+#include "mpn/generic/div_qr_1n_pi1.c"
diff -r a9f0db9f7199 tune/div_qr_1n_pi1_4.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tune/div_qr_1n_pi1_4.c	Sun Jun 06 19:47:36 2021 +0200
@@ -0,0 +1,38 @@
+/* mpn/generic/div_qr_1n_pi1.c method 4.
+
+Copyright 2013 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include "gmp-impl.h"
+
+#undef DIV_QR_1N_METHOD
+#define DIV_QR_1N_METHOD 4
+#undef mpn_div_qr_1n_pi1
+#define mpn_div_qr_1n_pi1 mpn_div_qr_1n_pi1_4
+
+#include "mpn/generic/div_qr_1n_pi1.c"
diff -r a9f0db9f7199 tune/speed.c
--- a/tune/speed.c	Thu Jun 03 23:50:08 2021 +0200
+++ b/tune/speed.c	Sun Jun 06 19:47:36 2021 +0200
@@ -244,6 +244,8 @@
   { "mpn_div_qr_1n_pi1", speed_mpn_div_qr_1n_pi1, FLAG_R  },
   { "mpn_div_qr_1n_pi1_1",speed_mpn_div_qr_1n_pi1_1, FLAG_R  },
   { "mpn_div_qr_1n_pi1_2",speed_mpn_div_qr_1n_pi1_2, FLAG_R  },
+  { "mpn_div_qr_1n_pi1_3",speed_mpn_div_qr_1n_pi1_3, FLAG_R  },
+  { "mpn_div_qr_1n_pi1_4",speed_mpn_div_qr_1n_pi1_4, FLAG_R  },
   { "mpn_div_qr_1",      speed_mpn_div_qr_1,      FLAG_R },
 
   { "mpn_div_qr_2n",     speed_mpn_div_qr_2n,       },
diff -r a9f0db9f7199 tune/speed.h
--- a/tune/speed.h	Thu Jun 03 23:50:08 2021 +0200
+++ b/tune/speed.h	Sun Jun 06 19:47:36 2021 +0200
@@ -210,6 +210,8 @@
 double speed_mpn_div_qr_1n_pi1 (struct speed_params *);
 double speed_mpn_div_qr_1n_pi1_1 (struct speed_params *);
 double speed_mpn_div_qr_1n_pi1_2 (struct speed_params *);
+double speed_mpn_div_qr_1n_pi1_3 (struct speed_params *);
+double speed_mpn_div_qr_1n_pi1_4 (struct speed_params *);
 double speed_mpn_div_qr_1 (struct speed_params *);
 double speed_mpn_div_qr_2n (struct speed_params *);
 double speed_mpn_div_qr_2u (struct speed_params *);
@@ -482,6 +484,8 @@
 
 mp_limb_t mpn_div_qr_1n_pi1_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
 mp_limb_t mpn_div_qr_1n_pi1_2 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
+mp_limb_t mpn_div_qr_1n_pi1_3 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
+mp_limb_t mpn_div_qr_1n_pi1_4 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
 
 mp_limb_t mpn_divrem_1_div (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
 mp_limb_t mpn_divrem_1_inv (mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t);
diff -r a9f0db9f7199 tune/tuneup.c
--- a/tune/tuneup.c	Thu Jun 03 23:50:08 2021 +0200
+++ b/tune/tuneup.c	Sun Jun 06 19:47:36 2021 +0200
@@ -2253,16 +2253,18 @@
   if (!HAVE_NATIVE_mpn_div_qr_1n_pi1)
     {
       static struct param_t  param;
-      speed_function_t f[2] =
+      speed_function_t f[] =
 	{
 	 speed_mpn_div_qr_1n_pi1_1,
 	 speed_mpn_div_qr_1n_pi1_2,
+	 speed_mpn_div_qr_1n_pi1_3,
+	 speed_mpn_div_qr_1n_pi1_4,
 	};
 
       s.size = 10;
       s.r = randlimb_norm ();
 
-      one_method (2, f, "mpn_div_qr_1n_pi1", "DIV_QR_1N_PI1_METHOD", &param);
+      one_method (numberof(f), f, "mpn_div_qr_1n_pi1", "DIV_QR_1N_PI1_METHOD", &param);
     }
 
   {

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.
_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
https://gmplib.org/mailman/listinfo/gmp-devel
