Ciao,
Il 2022-02-27 18:13 Marco Bodrato ha scritto:
I did never look into that file :-)
I inserted there a few more versions of binvert_limb.
The attached patch is only for testing, not to be pushed (I used
uint_fast##_t types).
On shell I get the following:
@shell ~/gmp-repo]$ /var/tmp/bodrato/gmp/hg/build/tune/speed -p100000000
-s 1 -rc binvert_limb binvert_limb_sec binvert_limb_pipe
binvert_limb_uintfast
overhead 5.84 cycles, precision 100000000 units of 2.86e-10 secs, CPU
freq 3500.09 MHz
binvert_limb binvert_limb_sec binvert_limb_pipe binvert_limb_uintfast
1 28.39 1.0485 #0.8930 0.9999
The _sec version is 5% slower, and the _pipe one is 10% faster than the
current.
Ĝis,
m
diff -r 1cafba189d5a tune/modlinv.c
--- a/tune/modlinv.c Sun Feb 27 15:10:38 2022 +0100
+++ b/tune/modlinv.c Sun Feb 27 18:35:48 2022 +0100
@@ -1,7 +1,7 @@
/* Alternate implementations of binvert_limb to compare speeds. */
/*
-Copyright 2000, 2002 Free Software Foundation, Inc.
+Copyright 2000, 2002, 2022 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -29,11 +29,133 @@
GNU Lesser General Public License along with the GNU MP Library. If not,
see https://www.gnu.org/licenses/. */
-#include <stdio.h>
#include "gmp-impl.h"
#include "longlong.h"
#include "speed.h"
+#define binvert_limb_uintfast(inv,n) \
+ do { \
+ mp_limb_t __n = (n); \
+ mp_limb_t __inv; \
+ uint_fast8_t __inv8; \
+ ASSERT ((__n & 1) == 1); \
+ \
+ __inv8 = binvert_limb_table[(__n & 0xff)/2]; /* 8 */ \
+ if (GMP_NUMB_BITS > 8) { \
+ uint_fast16_t __inv16 = 2 * (uint_fast16_t)__inv8 - (uint_fast16_t)__inv8 * __inv8 * (uint_fast16_t)__n; \
+ if (GMP_NUMB_BITS > 16) { \
+ uint_fast32_t __inv32 = 2 * (uint_fast32_t)__inv16 - (uint_fast32_t)__inv16 * __inv16 * (uint_fast32_t)__n; \
+ if (GMP_NUMB_BITS > 32) { \
+ __inv = 2 * (mp_limb_t)__inv32 - (mp_limb_t)__inv32 * __inv32 * __n; \
+ } else { __inv = __inv32; }; \
+ } else { __inv = __inv16; }; \
+ } else { __inv = __inv8; }; \
+ \
+ if (GMP_NUMB_BITS > 64) \
+ { \
+ int __invbits = 64; \
+ do { \
+ __inv = 2 * __inv - __inv * __inv * __n; \
+ __invbits *= 2; \
+ } while (__invbits < GMP_NUMB_BITS); \
+ } \
+ \
+ ASSERT ((__inv * __n & GMP_NUMB_MASK) == 1); \
+ (inv) = __inv & GMP_NUMB_MASK; \
+ } while (0)
+
+/* An (imperfect) copy of the binvert function contained in
+ mpn/generic/sec_powm.c .
+ */
+
+#if GMP_LIMB_BITS <= 32
+#define binvert_limb_sec(inv,n) \
+ do { \
+ mp_limb_t __n = (n); \
+ mp_limb_t __inv, __t; \
+ ASSERT ((__n & 1) == 1); \
+ /* 3 + 2 -> 5 */ \
+ __inv = __n + (((__n + 1) << 1) & 0x18); \
+ \
+ __t = __n * __inv; \
+ /* 5 x 2 + 2 -> 12 */ \
+ __inv = 2*__inv - __inv*__t + ((__inv<<10)&-(__t&(1<<5))); \
+ \
+ if (GMP_NUMB_BITS > 12) \
+ { \
+ __t = __n * __inv - 1; \
+ /* 12 x 3 -> 36 */ \
+ __inv += __inv * __t * (__t - 1); \
+ } \
+ \
+ ASSERT ((__inv * __n & GMP_NUMB_MASK) == 1); \
+ (inv) = __inv & GMP_NUMB_MASK; \
+ } while (0)
+#else /* GMP_LIMB_BITS > 32, but actually > 48 is required */
+#define binvert_limb_sec(inv,n) \
+ do { \
+ mp_limb_t __n = (n); \
+ mp_limb_t __inv, __t; \
+ ASSERT ((__n & 1) == 1); \
+ /* 3 + 2 -> 5 */ \
+ __inv = __n + (((__n + 1) << 1) & 0x18); \
+ \
+ __t = __n * __inv; \
+ /* 5 x 2 + 2 -> 12 */ \
+ __inv = 2*__inv - __inv*__t + ((__inv<<10)&-(__t&(1<<5))); \
+ \
+ __t = __n * __inv - 1; \
+ mp_limb_t __t2 = __t * __t; \
+ /* 12 x 5 + 4 -> 64 */ \
+ __inv *= (__t2+1)*(__t2-__t)+1 -((__t<<48)&-(__t&(1<<12))); \
+ \
+ /* 64 -> 128 -> 256 -> ... */ \
+ for (int __b = (GMP_NUMB_BITS - 1) >> 6; __b != 0; __b >>= 1) \
+ __inv = 2 * __inv - __inv * __inv * __n; \
+ \
+ ASSERT ((__inv * __n & GMP_NUMB_MASK) == 1); \
+ (inv) = __inv & GMP_NUMB_MASK; \
+ } while (0)
+#endif
+
+/* Like the standard version in gmp-impl.h, but with a different path
+ for bit sizes larger than 32, with concurrent multiplications. */
+
+#define binvert_limb_pipe(inv,n) \
+ do { \
+ mp_limb_t __n = (n); \
+ mp_limb_t __inv; \
+ ASSERT ((__n & 1) == 1); \
+ \
+ __inv = binvert_limb_table[(__n & 0xFF) / 2]; /* 8 */ \
+ if (GMP_NUMB_BITS <= 32) \
+ { \
+ if (GMP_NUMB_BITS > 8) \
+ __inv = 2 * __inv - __inv * __inv * __n; \
+ if (GMP_NUMB_BITS > 16) \
+ __inv = 2 * __inv - __inv * __inv * __n; \
+ } \
+ else \
+ { \
+ mp_limb_t __t, __t2, __p; \
+ int __invbits = 32; \
+ __t = __n * __inv - 1; /* x */ \
+ __t2= __t * __t; /* x^2 */ \
+ __p = __inv * (__t - __t2); /* (1-x^2)x/(x+1) */ \
+ \
+ while (__invbits < GMP_NUMB_BITS - 8) \
+ { \
+ __p *= __t2 + 1; /* (1-x^{2^n})x/(x+1) */ \
+ __t2 *= __t2; /* x^{2^n} */ \
+ __invbits <<= 1; \
+ } \
+ /* __inv * (x^{2^{n+2}+1}-1)/(x+1) */ \
+ __inv -= __p * (__t2 + 1); \
+ } \
+ \
+ ASSERT ((__inv * __n & GMP_NUMB_MASK) == 1); \
+ (inv) = __inv & GMP_NUMB_MASK; \
+ } while (0)
/* Like the standard version in gmp-impl.h, but with the expressions using a
"1-" form. This has the same number of steps, but "1-" is on the
@@ -175,3 +297,18 @@
{
SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb_arith);
}
+double
+speed_binvert_limb_sec (struct speed_params *s)
+{
+ SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb_sec);
+}
+double
+speed_binvert_limb_pipe (struct speed_params *s)
+{
+ SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb_pipe);
+}
+double
+speed_binvert_limb_uintfast (struct speed_params *s)
+{
+ SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb_uintfast);
+}
diff -r 1cafba189d5a tune/speed.c
--- a/tune/speed.c Sun Feb 27 15:10:38 2022 +0100
+++ b/tune/speed.c Sun Feb 27 18:35:48 2022 +0100
@@ -532,6 +532,9 @@
{ "binvert_limb_loop", speed_binvert_limb_loop, FLAG_NODATA },
{ "binvert_limb_cond", speed_binvert_limb_cond, FLAG_NODATA },
{ "binvert_limb_arith", speed_binvert_limb_arith, FLAG_NODATA },
+ { "binvert_limb_sec", speed_binvert_limb_sec, FLAG_NODATA },
+ { "binvert_limb_pipe", speed_binvert_limb_pipe, FLAG_NODATA },
+ { "binvert_limb_uintfast", speed_binvert_limb_uintfast, FLAG_NODATA },
{ "malloc_free", speed_malloc_free },
{ "malloc_realloc_free", speed_malloc_realloc_free },
diff -r 1cafba189d5a tune/speed.h
--- a/tune/speed.h Sun Feb 27 15:10:38 2022 +0100
+++ b/tune/speed.h Sun Feb 27 18:35:48 2022 +0100
@@ -152,6 +152,9 @@
double speed_binvert_limb_loop (struct speed_params *);
double speed_binvert_limb_cond (struct speed_params *);
double speed_binvert_limb_arith (struct speed_params *);
+double speed_binvert_limb_sec (struct speed_params *);
+double speed_binvert_limb_pipe (struct speed_params *);
+double speed_binvert_limb_uintfast (struct speed_params *);
double speed_mpf_init_clear (struct speed_params *);
_______________________________________________
gmp-devel mailing list
gmp-devel@gmplib.org
https://gmplib.org/mailman/listinfo/gmp-devel