From: Zhaoxiu Zeng <zhaoxiu.z...@gmail.com> Because some architectures (alpha, armv6, etc.) don't provide hardware division, the mod operation is slow! Binary GCD algorithm uses simple arithmetic operations, it replaces division with arithmetic shifts, comparisons, and subtraction.
I have compiled successfully with x86_64_defconfig and i386_defconfig. I use the following code to test: #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <time.h> #define swap(a, b) \ do { \ a ^= b; \ b ^= a; \ a ^= b; \ } while (0) unsigned long gcd0(unsigned long a, unsigned long b) { unsigned long r; if (a < b) { swap(a, b); } if (b == 0) return a; while ((r = a % b) != 0) { a = b; b = r; } return b; } unsigned long gcd1(unsigned long a, unsigned long b) { unsigned long r = a | b; if (!a || !b) return r; b >>= __builtin_ctzl(b); for (;;) { a >>= __builtin_ctzl(a); if (a == b) break; if (a < b) swap(a, b); a -= b; } b <<= __builtin_ctzl(r); return b; } unsigned long gcd2(unsigned long a, unsigned long b) { unsigned long r = a | b; if (!a || !b) return r; r ^= (r - 1); while (!(b & r)) b >>= 1; for (;;) { while (!(a & r)) a >>= 1; if (a == b) break; if (a < b) swap(a, b); a -= b; a >>= 1; if (a & r) a += b; } return b; } static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = { gcd0, gcd1, gcd2, }; #define TEST_ENTRIES (sizeof(gcd_func) / sizeof(gcd_func[0])) #define TEST_TIMES 100 static unsigned long res[TEST_ENTRIES][TEST_TIMES]; #define TIME_T struct timespec static inline struct timespec read_time(void) { struct timespec time; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time); return time; } static uint64_t diff_time(struct timespec start, struct timespec end) { struct timespec temp; if ((end.tv_nsec - start.tv_nsec) < 0) { temp.tv_sec = end.tv_sec - start.tv_sec - 1; temp.tv_nsec = 1000000000UL + end.tv_nsec - start.tv_nsec; } else { temp.tv_sec = end.tv_sec - start.tv_sec; temp.tv_nsec = end.tv_nsec - start.tv_nsec; } return temp.tv_sec * 1000000000UL + temp.tv_nsec; } static inline unsigned long get_rand() { if (sizeof(long) == 8) return (unsigned long)rand() << 32 | rand(); else return rand(); } int main(int argc, char **argv) { unsigned int seed = time(0); unsigned int i, j; TIME_T start, end; for (i = 0; i < TEST_ENTRIES; i++) { srand(seed); start = read_time(); for (j = 0; j < TEST_TIMES; j++) res[i][j] = gcd_func[i](get_rand(), get_rand()); end = read_time(); printf("gcd%d: elapsed %lld\n", i, diff_time(start, end)); sleep(1); } for (j = 0; j < TEST_TIMES; j++) { for (i = 1; i < TEST_ENTRIES; i++) { if (res[i][j] != res[0][j]) break; } if (i < TEST_ENTRIES) { fprintf(stderr, "Error: "); for (i = 0; i < TEST_ENTRIES; i++) fprintf(stderr, "res%d %ld%s", i, res[i][j], i < TEST_ENTRIES - 1 ? ", " : "\n"); } } return 0; } Compiled with "-O2", on "VirtualBox 4.2.0-35-generic #40-Ubuntu x86_64" got: zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd gcd0: elapsed 92281 gcd1: elapsed 55005 gcd2: elapsed 91088 zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd gcd0: elapsed 115546 gcd1: elapsed 55928 gcd2: elapsed 91938 zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd gcd0: elapsed 91189 gcd1: elapsed 55493 gcd2: elapsed 90078 zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd gcd0: elapsed 157364 gcd1: elapsed 55204 gcd2: elapsed 90058 zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd gcd0: elapsed 91667 gcd1: elapsed 54641 gcd2: elapsed 91364 Changes to V1: - Don't touch Kconfig, remove the Euclidean algorithm implementation - Don't use the "even-odd" variant - Use __ffs if the CPU has efficient __ffs Signed-off-by: Zhaoxiu Zeng <zhaoxiu.z...@gmail.com> --- lib/gcd.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 72 insertions(+), 9 deletions(-) diff --git a/lib/gcd.c b/lib/gcd.c index 3657f12..460b414 100644 --- a/lib/gcd.c +++ b/lib/gcd.c @@ -2,19 +2,82 @@ #include <linux/gcd.h> #include <linux/export.h> -/* Greatest common divisor */ +/* + * use __ffs if the CPU has efficient __ffs + */ +#if (defined(CONFIG_ALPHA) && defined(CONFIG_ALPHA_EV6) && defined(CONFIG_ALPHA_EV67)) || \ + defined(CONFIG_ARC) || \ + (defined(CONFIG_ARM) && __LINUX_ARM_ARCH__ >= 5) || defined(CONFIG_ARM64) || \ + defined(CONFIG_AVR32) || \ + defined(CONFIG_BLACKFIN) || \ + defined(CONFIG_C6X) || \ + defined(CONFIG_CRIS) || \ + defined(CONFIG_FRV) || \ + defined(CONFIG_HEXAGON) || \ + defined(CONFIG_IA64) || \ + (defined(CONFIG_M68K) && \ + (!defined(CONFIG_CPU_HAS_NO_BITFIELDS) || \ + ((defined(__mcfisaaplus__) || defined(__mcfisac__)) && \ + !defined(CONFIG_M68000) && !defined(CONFIG_MCPU32)))) || \ + defined(CONFIG_MN10300) || \ + defined(CONFIG_OPENRISC) || \ + defined(CONFIG_POWERPC) || \ + defined(CONFIG_S390) || \ + defined(CONFIG_TILE) || \ + defined(CONFIG_UNICORE32) || \ + defined(CONFIG_X86) || \ + defined(CONFIG_XTENSA) +# define USE_FFS 1 +#elif defined(CONFIG_MIPS) +# define USE_FFS (__builtin_constant_p(cpu_has_clo_clz) && cpu_has_clo_clz) +#else +# define USE_FFS 0 +#endif + +/* + * This implements the binary GCD algorithm. (Often attributed to Stein, + * but as Knith has noted, appears a first-century Chinese math text.) + */ unsigned long gcd(unsigned long a, unsigned long b) { - unsigned long r; + unsigned long r = a | b; + + if (!a || !b) + return r; + + if (USE_FFS) { + b >>= __ffs(b); + } else { + /* least-significant mask, equ to "(1UL << ffs(r)) - 1" */ + r ^= (r - 1); + + while (!(b & r)) + b >>= 1; + } + + for (;;) { + if (USE_FFS) { + a >>= __ffs(a); + } else { + while (!(a & r)) + a >>= 1; + } + if (a == b) + break; - if (a < b) - swap(a, b); + if (a < b) + swap(a, b); + + a -= b; + if (!USE_FFS) { + a >>= 1; + if (a & r) + a += b; + } + } - if (!b) - return a; - while ((r = a % b) != 0) { - a = b; - b = r; + if (USE_FFS) { + b <<= __ffs(r); } return b; } -- 2.5.0