[patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean

zengzhaoxiu Fri, 06 May 2016 02:56:32 -0700

From: Zhaoxiu Zeng <zhaoxiu.z...@gmail.com>

The binary GCD algorithm is based on the following facts:
        1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
        2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
        3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = 
gcd((a+b)/2, b)


Even on x86 machines with reasonable division hardware, the binary
algorithm runs about 25% faster (80% the execution time) than the
division-based Euclidian algorithm.

On platforms like Alpha and ARMv6 where division is a function call to
emulation code, it's even more significant.

There are two variants of the code here, depending on whether a
fast __ffs (find least significant set bit) instruction is available.
This allows the unpredictable branches in the bit-at-a-time shifting
loop to be eliminated.

If fast __ffs is not available, the "even/odd" GCD variant is used.

I use the following code to benchmark:

        #include <stdio.h>
        #include <stdlib.h>
        #include <stdint.h>
        #include <string.h>
        #include <time.h>
        #include <unistd.h>

        #define swap(a, b) \
                do { \
                        a ^= b; \
                        b ^= a; \
                        a ^= b; \
                } while (0)

        unsigned long gcd0(unsigned long a, unsigned long b)
        {
                unsigned long r;

                if (a < b) {
                        swap(a, b);
                }

                if (b == 0)
                        return a;

                while ((r = a % b) != 0) {
                        a = b;
                        b = r;
                }

                return b;
        }

        unsigned long gcd1(unsigned long a, unsigned long b)
        {
                unsigned long r = a | b;

                if (!a || !b)
                        return r;

                b >>= __builtin_ctzl(b);

                for (;;) {
                        a >>= __builtin_ctzl(a);
                        if (a == b)
                                return a << __builtin_ctzl(r);

                        if (a < b)
                                swap(a, b);
                        a -= b;
                }
        }

        unsigned long gcd2(unsigned long a, unsigned long b)
        {
                unsigned long r = a | b;

                if (!a || !b)
                        return r;

                r &= -r;

                while (!(b & r))
                        b >>= 1;

                for (;;) {
                        while (!(a & r))
                                a >>= 1;
                        if (a == b)
                                return a;

                        if (a < b)
                                swap(a, b);
                        a -= b;
                        a >>= 1;
                        if (a & r)
                                a += b;
                        a >>= 1;
                }
        }

        unsigned long gcd3(unsigned long a, unsigned long b)
        {
                unsigned long r = a | b;

                if (!a || !b)
                        return r;

                b >>= __builtin_ctzl(b);
                if (b == 1)
                        return r & -r;

                for (;;) {
                        a >>= __builtin_ctzl(a);
                        if (a == 1)
                                return r & -r;
                        if (a == b)
                                return a << __builtin_ctzl(r);

                        if (a < b)
                                swap(a, b);
                        a -= b;
                }
        }

        unsigned long gcd4(unsigned long a, unsigned long b)
        {
                unsigned long r = a | b;

                if (!a || !b)
                        return r;

                r &= -r;

                while (!(b & r))
                        b >>= 1;
                if (b == r)
                        return r;

                for (;;) {
                        while (!(a & r))
                                a >>= 1;
                        if (a == r)
                                return r;
                        if (a == b)
                                return a;

                        if (a < b)
                                swap(a, b);
                        a -= b;
                        a >>= 1;
                        if (a & r)
                                a += b;
                        a >>= 1;
                }
        }

        static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
                gcd0, gcd1, gcd2, gcd3, gcd4,
        };

        #define TEST_ENTRIES (sizeof(gcd_func) / sizeof(gcd_func[0]))

        #if defined(__x86_64__)

        #define rdtscll(val) do { \
                unsigned long __a,__d; \
                __asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
                (val) = ((unsigned long long)__a) | (((unsigned long 
long)__d)<<32); \
        } while(0)

        static unsigned long long benchmark_gcd_func(unsigned long 
(*gcd)(unsigned long, unsigned long),
                                                                unsigned long 
a, unsigned long b, unsigned long *res)
        {
                unsigned long long start, end;
                unsigned long long ret;
                unsigned long gcd_res;

                rdtscll(start);
                gcd_res = gcd(a, b);
                rdtscll(end);

                if (end >= start)
                        ret = end - start;
                else
                        ret = ~0ULL - start + 1 + end;

                *res = gcd_res;
                return ret;
        }

        #else

        static inline struct timespec read_time(void)
        {
                struct timespec time;
                clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time);
                return time;
        }

        static inline unsigned long long diff_time(struct timespec start, 
struct timespec end)
        {
                struct timespec temp;

                if ((end.tv_nsec - start.tv_nsec) < 0) {
                        temp.tv_sec = end.tv_sec - start.tv_sec - 1;
                        temp.tv_nsec = 1000000000ULL + end.tv_nsec - 
start.tv_nsec;
                } else {
                        temp.tv_sec = end.tv_sec - start.tv_sec;
                        temp.tv_nsec = end.tv_nsec - start.tv_nsec;
                }

                return temp.tv_sec * 1000000000ULL + temp.tv_nsec;
        }

        static unsigned long long benchmark_gcd_func(unsigned long 
(*gcd)(unsigned long, unsigned long),
                                                                unsigned long 
a, unsigned long b, unsigned long *res)
        {
                struct timespec start, end;
                unsigned long gcd_res;

                start = read_time();
                gcd_res = gcd(a, b);
                end = read_time();

                *res = gcd_res;
                return diff_time(start, end);
        }

        #endif

        static inline unsigned long get_rand()
        {
                if (sizeof(long) == 8)
                        return (unsigned long)rand() << 32 | rand();
                else
                        return rand();
        }

        int main(int argc, char **argv)
        {
                unsigned int seed = time(0);
                int loops = 100;
                int repeats = 1000;
                unsigned long (*res)[TEST_ENTRIES];
                unsigned long long elapsed[TEST_ENTRIES];
                int i, j, k;

                for (;;) {
                        int opt = getopt(argc, argv, "n:r:s:");
                        /* End condition always first */
                        if (opt == -1)
                                break;

                        switch (opt) {
                        case 'n':
                                loops = atoi(optarg);
                                break;
                        case 'r':
                                repeats = atoi(optarg);
                                break;
                        case 's':
                                seed = strtoul(optarg, NULL, 10);
                                break;
                        default:
                                /* You won't actually get here. */
                                break;
                        }
                }

                res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops);
                memset(elapsed, 0, sizeof(elapsed));

                srand(seed);
                for (j = 0; j < loops; j++) {
                        unsigned long a = get_rand();
                        /* Do we have args? */
                        unsigned long b = argc > optind ? strtoul(argv[optind], 
NULL, 10) : get_rand();
                        unsigned long long min_elapsed[TEST_ENTRIES];
                        for (k = 0; k < repeats; k++) {
                                for (i = 0; i < TEST_ENTRIES; i++) {
                                        unsigned long long tmp = 
benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]);
                                        if (k == 0 || min_elapsed[i] > tmp)
                                                min_elapsed[i] = tmp;
                                }
                        }
                        for (i = 0; i < TEST_ENTRIES; i++)
                                elapsed[i] += min_elapsed[i];
                }

                for (i = 0; i < TEST_ENTRIES; i++)
                        printf("gcd%d: elapsed %llu\n", i, elapsed[i]);

                k = 0;
                srand(seed);
                for (j = 0; j < loops; j++) {
                        unsigned long a = get_rand();
                        unsigned long b = argc > optind ? strtoul(argv[optind], 
NULL, 10) : get_rand();
                        for (i = 1; i < TEST_ENTRIES; i++) {
                                if (res[j][i] != res[j][0])
                                        break;
                        }
                        if (i < TEST_ENTRIES) {
                                if (k == 0) {
                                        k = 1;
                                        fprintf(stderr, "Error:\n");
                                }
                                fprintf(stderr, "gcd(%lu, %lu): ", a, b);
                                for (i = 0; i < TEST_ENTRIES; i++)
                                        fprintf(stderr, "%ld%s", res[j][i], i < 
TEST_ENTRIES - 1 ? ", " : "\n");
                        }
                }

                if (k == 0)
                        fprintf(stderr, "PASS\n");

                free(res);

                return 0;
        }

Compiled with "-O2", on "VirtualBox 4.4.0-22-generic #38-Ubuntu x86_64" got:

zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 10174
gcd1: elapsed 2120
gcd2: elapsed 2902
gcd3: elapsed 2039
gcd4: elapsed 2812
PASS
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 9309
gcd1: elapsed 2280
gcd2: elapsed 2822
gcd3: elapsed 2217
gcd4: elapsed 2710
PASS
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 9589
gcd1: elapsed 2098
gcd2: elapsed 2815
gcd3: elapsed 2030
gcd4: elapsed 2718
PASS
zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
gcd0: elapsed 9914
gcd1: elapsed 2309
gcd2: elapsed 2779
gcd3: elapsed 2228
gcd4: elapsed 2709
PASS

Changes to V3:
- Fix build error
- Select CPU_NO_EFFICIENT_FFS if CONFIG_ARC && CONFIG_ISA_ARCOMPACT
- Select CPU_NO_EFFICIENT_FFS if CONFIG_S390 && 
!CONFIG_HAVE_MARCH_Z9_109_FEATURES
- Do new brnchmark
- Return immediately if one number becomes a power of 2
- Add comments written by George Spelvin

Changes to V2:
- Add a new Kconfig variable CPU_NO_EFFICIENT_FFS
- Separate into two versions by CPU_NO_EFFICIENT_FFS
- Return directly from the loop, rather than using break().
- Use "r &= -r" mostly because it's clearer.

Changes to V1:
- Don't touch Kconfig, remove the Euclidean algorithm implementation
- Don't use the "even-odd" variant
- Use __ffs if the CPU has efficient __ffs

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.z...@gmail.com>
Signed-off-by: George Spelvin <li...@horizon.com>
---
 arch/Kconfig                         |  3 ++
 arch/alpha/Kconfig                   |  1 +
 arch/arc/Kconfig                     |  1 +
 arch/arm/mm/Kconfig                  |  3 ++
 arch/h8300/Kconfig                   |  1 +
 arch/m32r/Kconfig                    |  1 +
 arch/m68k/Kconfig.cpu                | 11 ++++++
 arch/metag/Kconfig                   |  1 +
 arch/microblaze/Kconfig              |  1 +
 arch/mips/include/asm/cpu-features.h | 10 +++++
 arch/nios2/Kconfig                   |  1 +
 arch/openrisc/Kconfig                |  1 +
 arch/parisc/Kconfig                  |  1 +
 arch/s390/Kconfig                    |  2 +-
 arch/score/Kconfig                   |  1 +
 arch/sh/Kconfig                      |  1 +
 arch/sparc/Kconfig                   |  1 +
 lib/gcd.c                            | 77 +++++++++++++++++++++++++++++++-----
 18 files changed, 107 insertions(+), 11 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 81869a5..275f17d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -638,4 +638,7 @@ config COMPAT_OLD_SIGACTION
 config ARCH_NO_COHERENT_DMA_MMAP
        bool
 
+config CPU_NO_EFFICIENT_FFS
+       def_bool n
+
 source "kernel/gcov/Kconfig"
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 9d8a858..44e6f05 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -27,6 +27,7 @@ config ALPHA
        select MODULES_USE_ELF_RELA
        select ODD_RT_SIGACTION
        select OLD_SIGSUSPEND
+       select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67
        help
          The Alpha is a 64-bit general-purpose processor designed and
          marketed by the Digital Equipment Corporation of blessed memory,
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index ec4791e..f41eb4c 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -101,6 +101,7 @@ choice
 
 config ISA_ARCOMPACT
        bool "ARCompact ISA"
+       select CPU_NO_EFFICIENT_FFS
        help
          The original ARC ISA of ARC600/700 cores
 
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index 5534766..cb569b6 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -421,18 +421,21 @@ config CPU_32v3
        select CPU_USE_DOMAINS if MMU
        select NEED_KUSER_HELPERS
        select TLS_REG_EMUL if SMP || !MMU
+       select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v4
        bool
        select CPU_USE_DOMAINS if MMU
        select NEED_KUSER_HELPERS
        select TLS_REG_EMUL if SMP || !MMU
+       select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v4T
        bool
        select CPU_USE_DOMAINS if MMU
        select NEED_KUSER_HELPERS
        select TLS_REG_EMUL if SMP || !MMU
+       select CPU_NO_EFFICIENT_FFS
 
 config CPU_32v5
        bool
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index 986ea84..aa232de 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -20,6 +20,7 @@ config H8300
        select HAVE_KERNEL_GZIP
        select HAVE_KERNEL_LZO
        select HAVE_ARCH_KGDB
+       select CPU_NO_EFFICIENT_FFS
 
 config RWSEM_GENERIC_SPINLOCK
        def_bool y
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig
index c82b292..3cc8498 100644
--- a/arch/m32r/Kconfig
+++ b/arch/m32r/Kconfig
@@ -17,6 +17,7 @@ config M32R
        select ARCH_USES_GETTIMEOFFSET
        select MODULES_USE_ELF_RELA
        select HAVE_DEBUG_STACKOVERFLOW
+       select CPU_NO_EFFICIENT_FFS
 
 config SBUS
        bool
diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu
index 0dfcf12..0b6efe8 100644
--- a/arch/m68k/Kconfig.cpu
+++ b/arch/m68k/Kconfig.cpu
@@ -40,6 +40,7 @@ config M68000
        select CPU_HAS_NO_MULDIV64
        select CPU_HAS_NO_UNALIGNED
        select GENERIC_CSUM
+       select CPU_NO_EFFICIENT_FFS
        help
          The Freescale (was Motorola) 68000 CPU is the first generation of
          the well known M68K family of processors. The CPU core as well as
@@ -51,6 +52,7 @@ config MCPU32
        bool
        select CPU_HAS_NO_BITFIELDS
        select CPU_HAS_NO_UNALIGNED
+       select CPU_NO_EFFICIENT_FFS
        help
          The Freescale (was then Motorola) CPU32 is a CPU core that is
          based on the 68020 processor. For the most part it is used in
@@ -130,6 +132,7 @@ config M5206
        depends on !MMU
        select COLDFIRE_SW_A7
        select HAVE_MBAR
+       select CPU_NO_EFFICIENT_FFS
        help
          Motorola ColdFire 5206 processor support.
 
@@ -138,6 +141,7 @@ config M5206e
        depends on !MMU
        select COLDFIRE_SW_A7
        select HAVE_MBAR
+       select CPU_NO_EFFICIENT_FFS
        help
          Motorola ColdFire 5206e processor support.
 
@@ -163,6 +167,7 @@ config M5249
        depends on !MMU
        select COLDFIRE_SW_A7
        select HAVE_MBAR
+       select CPU_NO_EFFICIENT_FFS
        help
          Motorola ColdFire 5249 processor support.
 
@@ -171,6 +176,7 @@ config M525x
        depends on !MMU
        select COLDFIRE_SW_A7
        select HAVE_MBAR
+       select CPU_NO_EFFICIENT_FFS
        help
          Freescale (Motorola) Coldfire 5251/5253 processor support.
 
@@ -189,6 +195,7 @@ config M5272
        depends on !MMU
        select COLDFIRE_SW_A7
        select HAVE_MBAR
+       select CPU_NO_EFFICIENT_FFS
        help
          Motorola ColdFire 5272 processor support.
 
@@ -217,6 +224,7 @@ config M5307
        select COLDFIRE_SW_A7
        select HAVE_CACHE_CB
        select HAVE_MBAR
+       select CPU_NO_EFFICIENT_FFS
        help
          Motorola ColdFire 5307 processor support.
 
@@ -242,6 +250,7 @@ config M5407
        select COLDFIRE_SW_A7
        select HAVE_CACHE_CB
        select HAVE_MBAR
+       select CPU_NO_EFFICIENT_FFS
        help
          Motorola ColdFire 5407 processor support.
 
@@ -251,6 +260,7 @@ config M547x
        select MMU_COLDFIRE if MMU
        select HAVE_CACHE_CB
        select HAVE_MBAR
+       select CPU_NO_EFFICIENT_FFS
        help
          Freescale ColdFire 5470/5471/5472/5473/5474/5475 processor support.
 
@@ -260,6 +270,7 @@ config M548x
        select M54xx
        select HAVE_CACHE_CB
        select HAVE_MBAR
+       select CPU_NO_EFFICIENT_FFS
        help
          Freescale ColdFire 5480/5481/5482/5483/5484/5485 processor support.
 
diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig
index a0fa88d..2ac2de6 100644
--- a/arch/metag/Kconfig
+++ b/arch/metag/Kconfig
@@ -29,6 +29,7 @@ config METAG
        select OF
        select OF_EARLY_FLATTREE
        select SPARSE_IRQ
+       select CPU_NO_EFFICIENT_FFS
 
 config STACKTRACE_SUPPORT
        def_bool y
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 3d793b5..f17c3a4 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -32,6 +32,7 @@ config MICROBLAZE
        select OF_EARLY_FLATTREE
        select TRACING_SUPPORT
        select VIRT_TO_BUS
+       select CPU_NO_EFFICIENT_FFS
 
 config SWAP
        def_bool n
diff --git a/arch/mips/include/asm/cpu-features.h 
b/arch/mips/include/asm/cpu-features.h
index eeec8c8..e20e100 100644
--- a/arch/mips/include/asm/cpu-features.h
+++ b/arch/mips/include/asm/cpu-features.h
@@ -180,6 +180,16 @@
 #endif
 #endif
 
+/* __builtin_constant_p(cpu_has_mips_r) && cpu_has_mips_r */
+#if !((defined(cpu_has_mips32r1) && cpu_has_mips32r1) || \
+         (defined(cpu_has_mips32r2) && cpu_has_mips32r2) || \
+         (defined(cpu_has_mips32r6) && cpu_has_mips32r6) || \
+         (defined(cpu_has_mips64r1) && cpu_has_mips64r1) || \
+         (defined(cpu_has_mips64r2) && cpu_has_mips64r2) || \
+         (defined(cpu_has_mips64r6) && cpu_has_mips64r6))
+#define CONFIG_CPU_NO_EFFICIENT_FFS 1
+#endif
+
 #ifndef cpu_has_mips_1
 # define cpu_has_mips_1                (!cpu_has_mips_r6)
 #endif
diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig
index 4375554..f10bd2c 100644
--- a/arch/nios2/Kconfig
+++ b/arch/nios2/Kconfig
@@ -16,6 +16,7 @@ config NIOS2
        select SOC_BUS
        select SPARSE_IRQ
        select USB_ARCH_HAS_HCD if USB_SUPPORT
+       select CPU_NO_EFFICIENT_FFS
 
 config GENERIC_CSUM
        def_bool y
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index e118c02..142cb05 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -25,6 +25,7 @@ config OPENRISC
        select MODULES_USE_ELF_RELA
        select HAVE_DEBUG_STACKOVERFLOW
        select OR1K_PIC
+       select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1
 
 config MMU
        def_bool y
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 88cfaa8..3d498a6 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -32,6 +32,7 @@ config PARISC
        select HAVE_ARCH_AUDITSYSCALL
        select HAVE_ARCH_SECCOMP_FILTER
        select ARCH_NO_COHERENT_DMA_MMAP
+       select CPU_NO_EFFICIENT_FFS
 
        help
          The PA-RISC microprocessor is designed by Hewlett-Packard and used
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index bf24ab1..9eb3932 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -164,7 +164,7 @@ config S390
        select TTY
        select VIRT_CPU_ACCOUNTING
        select VIRT_TO_BUS
-
+       select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES
 
 config SCHED_OMIT_FRAME_POINTER
        def_bool y
diff --git a/arch/score/Kconfig b/arch/score/Kconfig
index 366e1b5..507d631 100644
--- a/arch/score/Kconfig
+++ b/arch/score/Kconfig
@@ -14,6 +14,7 @@ config SCORE
        select VIRT_TO_BUS
        select MODULES_USE_ELF_REL
        select CLONE_BACKWARDS
+       select CPU_NO_EFFICIENT_FFS
 
 choice
        prompt "System type"
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 7ed20fc..56cf5e5 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -44,6 +44,7 @@ config SUPERH
        select OLD_SIGSUSPEND
        select OLD_SIGACTION
        select HAVE_ARCH_AUDITSYSCALL
+       select CPU_NO_EFFICIENT_FFS
        help
          The SuperH is a RISC processor targeted for use in embedded systems
          and consumer electronics; it was also used in the Sega Dreamcast
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 57ffaf2..ca675ed 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -42,6 +42,7 @@ config SPARC
        select ODD_RT_SIGACTION
        select OLD_SIGSUSPEND
        select ARCH_HAS_SG_CHAIN
+       select CPU_NO_EFFICIENT_FFS
 
 config SPARC32
        def_bool !64BIT
diff --git a/lib/gcd.c b/lib/gcd.c
index 3657f12..b9fbe73 100644
--- a/lib/gcd.c
+++ b/lib/gcd.c
@@ -2,20 +2,77 @@
 #include <linux/gcd.h>
 #include <linux/export.h>
 
-/* Greatest common divisor */
+/*
+ * This implements the binary GCD algorithm. (Often attributed to Stein,
+ * but as Knuth has noted, appears in a first-century Chinese math text.)
+ *
+ * This is faster than the division-based algorithm even on x86, which
+ * has decent hardware division.
+ */
+
+#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS)
+
+/* If __ffs is available, the even/odd algorithm benchmarks slower. */
 unsigned long gcd(unsigned long a, unsigned long b)
 {
-       unsigned long r;
+       unsigned long r = a | b;
+
+       if (!a || !b)
+               return r;
 
-       if (a < b)
-               swap(a, b);
+       b >>= __ffs(b);
+       if (b == 1)
+               return r & -r;
 
-       if (!b)
-               return a;
-       while ((r = a % b) != 0) {
-               a = b;
-               b = r;
+       for (;;) {
+               a >>= __ffs(a);
+               if (a == 1)
+                       return r & -r;
+               if (a == b)
+                       return a << __ffs(r);
+
+               if (a < b)
+                       swap(a, b);
+               a -= b;
        }
-       return b;
 }
+
+#else
+
+/* If normalization is done by loops, the even/odd algorithm is a win. */
+unsigned long gcd(unsigned long a, unsigned long b)
+{
+       unsigned long r = a | b;
+
+       if (!a || !b)
+               return r;
+
+       /* Isolate lsbit of r */
+       r &= -r;
+
+       while (!(b & r))
+               b >>= 1;
+       if (b == r)
+               return r;
+
+       for (;;) {
+               while (!(a & r))
+                       a >>= 1;
+               if (a == r)
+                       return r;
+               if (a == b)
+                       return a;
+
+               if (a < b)
+                       swap(a, b);
+               a -= b;
+               a >>= 1;
+               if (a & r)
+                       a += b;
+               a >>= 1;
+       }
+}
+
+#endif
+
 EXPORT_SYMBOL_GPL(gcd);
-- 
2.7.4

[patch V4] lib: GCD: Use binary GCD algorithm instead of Euclidean

Reply via email to