Re: binvert_limb speedup on 64 bit machines with UHWtype

Marco Bodrato Sun, 27 Feb 2022 09:45:36 -0800

Ciao,

Il 2022-02-27 18:13 Marco Bodrato ha scritto:

I did never look into that file :-)


I inserted there a few more versions of binvert_limb.

The attached patch is only for testing, not to be pushed (I useduint_fast##_t types).


On shell I get the following:

@shell ~/gmp-repo]$ /var/tmp/bodrato/gmp/hg/build/tune/speed -p100000000-s 1 -rc binvert_limb binvert_limb_sec binvert_limb_pipebinvert_limb_uintfastoverhead 5.84 cycles, precision 100000000 units of 2.86e-10 secs, CPUfreq 3500.09 MHz

   binvert_limb binvert_limb_sec binvert_limb_pipe binvert_limb_uintfast
1         28.39          1.0485          #0.8930           0.9999

The _sec version is 5% slower, and the _pipe one is 10% faster than thecurrent.


Ĝis,
m

diff -r 1cafba189d5a tune/modlinv.c
--- a/tune/modlinv.c	Sun Feb 27 15:10:38 2022 +0100
+++ b/tune/modlinv.c	Sun Feb 27 18:35:48 2022 +0100
@@ -1,7 +1,7 @@
 /* Alternate implementations of binvert_limb to compare speeds. */
 
 /*
-Copyright 2000, 2002 Free Software Foundation, Inc.
+Copyright 2000, 2002, 2022 Free Software Foundation, Inc.
 
 This file is part of the GNU MP Library.
 
@@ -29,11 +29,133 @@
 GNU Lesser General Public License along with the GNU MP Library.  If not,
 see https://www.gnu.org/licenses/.  */
 
-#include <stdio.h>
 #include "gmp-impl.h"
 #include "longlong.h"
 #include "speed.h"
 
+#define binvert_limb_uintfast(inv,n)					\
+  do {									\
+    mp_limb_t  __n = (n);						\
+    mp_limb_t  __inv;							\
+    uint_fast8_t  __inv8;						\
+    ASSERT ((__n & 1) == 1);						\
+									\
+    __inv8 = binvert_limb_table[(__n & 0xff)/2]; /*  8 */		\
+    if (GMP_NUMB_BITS > 8) {						\
+      uint_fast16_t __inv16 = 2 * (uint_fast16_t)__inv8 - (uint_fast16_t)__inv8 * __inv8 * (uint_fast16_t)__n; \
+    if (GMP_NUMB_BITS > 16) {						\
+      uint_fast32_t __inv32 = 2 * (uint_fast32_t)__inv16 - (uint_fast32_t)__inv16 * __inv16 * (uint_fast32_t)__n; \
+    if (GMP_NUMB_BITS > 32) {						\
+    __inv = 2 * (mp_limb_t)__inv32 - (mp_limb_t)__inv32 * __inv32 * __n; \
+    } else { __inv = __inv32; };					\
+    } else { __inv = __inv16; };					\
+    } else { __inv = __inv8; };						\
+									\
+    if (GMP_NUMB_BITS > 64)						\
+      {									\
+	int  __invbits = 64;						\
+	do {								\
+	  __inv = 2 * __inv - __inv * __inv * __n;			\
+	  __invbits *= 2;						\
+	} while (__invbits < GMP_NUMB_BITS);				\
+      }									\
+									\
+    ASSERT ((__inv * __n & GMP_NUMB_MASK) == 1);			\
+    (inv) = __inv & GMP_NUMB_MASK;					\
+  } while (0)
+
+/* An (imperfect) copy of the binvert function contained in
+   mpn/generic/sec_powm.c .
+ */
+
+#if GMP_LIMB_BITS <= 32
+#define binvert_limb_sec(inv,n)					\
+  do {								\
+    mp_limb_t  __n = (n);					\
+    mp_limb_t  __inv, __t;					\
+    ASSERT ((__n & 1) == 1);					\
+    /* 3 + 2 -> 5 */						\
+    __inv = __n + (((__n + 1) << 1) & 0x18);			\
+								\
+    __t = __n * __inv;						\
+    /* 5 x 2 + 2 -> 12 */					\
+    __inv = 2*__inv - __inv*__t + ((__inv<<10)&-(__t&(1<<5)));	\
+								\
+    if (GMP_NUMB_BITS > 12)					\
+      {								\
+        __t = __n * __inv - 1;					\
+	/* 12 x 3 -> 36 */					\
+	__inv += __inv * __t * (__t - 1);			\
+      }								\
+								\
+    ASSERT ((__inv * __n & GMP_NUMB_MASK) == 1);		\
+    (inv) = __inv & GMP_NUMB_MASK;				\
+  } while (0)
+#else /* GMP_LIMB_BITS > 32, but actually > 48 is required */
+#define binvert_limb_sec(inv,n)					\
+  do {								\
+    mp_limb_t  __n = (n);					\
+    mp_limb_t  __inv, __t;					\
+    ASSERT ((__n & 1) == 1);					\
+    /* 3 + 2 -> 5 */						\
+    __inv = __n + (((__n + 1) << 1) & 0x18);			\
+								\
+    __t = __n * __inv;						\
+    /* 5 x 2 + 2 -> 12 */					\
+    __inv = 2*__inv - __inv*__t + ((__inv<<10)&-(__t&(1<<5)));	\
+								\
+    __t = __n * __inv - 1;					\
+    mp_limb_t __t2 = __t * __t;					\
+    /* 12 x 5 + 4 -> 64 */				       	\
+    __inv *= (__t2+1)*(__t2-__t)+1 -((__t<<48)&-(__t&(1<<12))); \
+								\
+    /* 64 -> 128 -> 256 -> ... */				\
+    for (int __b = (GMP_NUMB_BITS - 1) >> 6; __b != 0; __b >>= 1)	\
+      __inv = 2 * __inv - __inv * __inv * __n;			\
+								\
+    ASSERT ((__inv * __n & GMP_NUMB_MASK) == 1);		\
+    (inv) = __inv & GMP_NUMB_MASK;				\
+  } while (0)
+#endif
+
+/* Like the standard version in gmp-impl.h, but with a different path
+   for bit sizes larger than 32, with concurrent multiplications.  */
+
+#define binvert_limb_pipe(inv,n)					\
+  do {									\
+    mp_limb_t  __n = (n);						\
+    mp_limb_t  __inv;							\
+    ASSERT ((__n & 1) == 1);						\
+									\
+    __inv = binvert_limb_table[(__n & 0xFF) / 2]; /*  8 */		\
+    if (GMP_NUMB_BITS <= 32)						\
+      {									\
+	if (GMP_NUMB_BITS > 8)						\
+	  __inv = 2 * __inv - __inv * __inv * __n;			\
+	if (GMP_NUMB_BITS > 16)						\
+	  __inv = 2 * __inv - __inv * __inv * __n;			\
+      }									\
+    else								\
+      {									\
+	mp_limb_t  __t, __t2, __p;					\
+	int  __invbits = 32;						\
+	__t = __n * __inv - 1;	/* x */					\
+	__t2= __t * __t;	/* x^2 */				\
+	__p = __inv * (__t - __t2);	/* (1-x^2)x/(x+1) */		\
+									\
+	while (__invbits < GMP_NUMB_BITS - 8)				\
+	  {								\
+	    __p  *= __t2 + 1;	/* (1-x^{2^n})x/(x+1) */		\
+	    __t2 *= __t2;	/* x^{2^n} */				\
+	    __invbits <<= 1;						\
+	  }								\
+	/* __inv * (x^{2^{n+2}+1}-1)/(x+1) */				\
+	__inv -= __p * (__t2 + 1);					\
+      }									\
+									\
+    ASSERT ((__inv * __n & GMP_NUMB_MASK) == 1);			\
+    (inv) = __inv & GMP_NUMB_MASK;					\
+  } while (0)
 
 /* Like the standard version in gmp-impl.h, but with the expressions using a
    "1-" form.  This has the same number of steps, but "1-" is on the
@@ -175,3 +297,18 @@
 {
   SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb_arith);
 }
+double
+speed_binvert_limb_sec (struct speed_params *s)
+{
+  SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb_sec);
+}
+double
+speed_binvert_limb_pipe (struct speed_params *s)
+{
+  SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb_pipe);
+}
+double
+speed_binvert_limb_uintfast (struct speed_params *s)
+{
+  SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb_uintfast);
+}
diff -r 1cafba189d5a tune/speed.c
--- a/tune/speed.c	Sun Feb 27 15:10:38 2022 +0100
+++ b/tune/speed.c	Sun Feb 27 18:35:48 2022 +0100
@@ -532,6 +532,9 @@
   { "binvert_limb_loop",  speed_binvert_limb_loop,  FLAG_NODATA },
   { "binvert_limb_cond",  speed_binvert_limb_cond,  FLAG_NODATA },
   { "binvert_limb_arith", speed_binvert_limb_arith, FLAG_NODATA },
+  { "binvert_limb_sec",   speed_binvert_limb_sec,   FLAG_NODATA },
+  { "binvert_limb_pipe",  speed_binvert_limb_pipe,  FLAG_NODATA },
+  { "binvert_limb_uintfast", speed_binvert_limb_uintfast, FLAG_NODATA },
 
   { "malloc_free",                  speed_malloc_free                  },
   { "malloc_realloc_free",          speed_malloc_realloc_free          },
diff -r 1cafba189d5a tune/speed.h
--- a/tune/speed.h	Sun Feb 27 15:10:38 2022 +0100
+++ b/tune/speed.h	Sun Feb 27 18:35:48 2022 +0100
@@ -152,6 +152,9 @@
 double speed_binvert_limb_loop (struct speed_params *);
 double speed_binvert_limb_cond (struct speed_params *);
 double speed_binvert_limb_arith (struct speed_params *);
+double speed_binvert_limb_sec (struct speed_params *);
+double speed_binvert_limb_pipe (struct speed_params *);
+double speed_binvert_limb_uintfast (struct speed_params *);
 
 double speed_mpf_init_clear (struct speed_params *);

_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
https://gmplib.org/mailman/listinfo/gmp-devel

Re: binvert_limb speedup on 64 bit machines with UHWtype

Reply via email to