t...@gmplib.org (Torbjörn Granlund) writes: > I believe an amalgamation of Niels code which uses an implicit lsb and > his present _22 code would be best. The present requirement for > sub_mddmmss is a portability inconvenience, and for subtract-with-carry > challenged processors a major hassle.
Below another version which does pass some tests. It has an implicit one bit, which causes some impedance mismatch. Maybe it's better to have some special iterations at the top to eliminate the top bits, before falling into the main loop. Not sure I've identified the main loop correctly in the generated code, but I think it's this (x86_64, gcc-8.3 -O3): 4a: mov %rsi,%r9 4d: mov %rbx,%rax 50: sub %r11,%rax 53: sbb %r8,%r9 56: mov %r9,%rdi 59: sar $0x3f,%rdi 5d: test %rax,%rax 60: je 130 <gcd_22+0x130> 66: bsf %rax,%r10 6a: mov %r9,%rdx 6d: mov %rax,%rsi 70: add $0x1,%r10d 74: xor %rdi,%rax 77: mov %r11,%rcx 7a: and %rdi,%rdx 7d: and %rdi,%rsi 80: sub %rdi,%rax 83: add %rsi,%rcx 86: adc %rdx,%r8 89: xor %rdi,%r9 8c: mov %rcx,%r11 8f: cmp $0x40,%r10d 93: je 168 <gcd_22+0x168> 99: mov %r10d,%ecx 9c: mov %r9,%rbx 9f: mov %r9,%rsi a2: shr %cl,%rax a5: mov %ebp,%ecx a7: sub %r10d,%ecx aa: shl %cl,%rbx ad: mov %r10d,%ecx b0: shr %cl,%rsi b3: or %rax,%rbx b6: mov %rsi,%rax b9: or %r8,%rax bc: jne 4a <gcd_22+0x4a> with the branches out going to unlikely code paths. Regards, /Niels ----8<------ #include <stdio.h> #include <stdlib.h> #include "gmp.h" #include "gmp-impl.h" #include "longlong.h" typedef struct { mp_limb_t lo; mp_limb_t hi; } double_limb_t; static double_limb_t ref_gcd_22 (mp_limb_t uh, mp_limb_t ul, mp_limb_t vh, mp_limb_t vl) { double_limb_t res; mpz_t u; mpz_t v; mpz_t g; mp_limb_t ua[2] = { ul, uh }; mp_limb_t va[2] = { vl, vh }; mpz_init (g); mpz_gcd (g, mpz_roinit_n (u, ua, 2), mpz_roinit_n (v, va, 2)); res.lo = mpz_getlimbn (g, 0); res.hi = mpz_getlimbn (g, 1); return res; } static double_limb_t gcd_22 (mp_limb_t u1, mp_limb_t u0, mp_limb_t v1, mp_limb_t v0) { double_limb_t r; ASSERT (u0 & v0 & 1); u0 = (u0 >> 1) | (u1 << (GMP_LIMB_BITS - 1)); u1 >>= 1; v0 = (v0 >> 1) | (v1 << (GMP_LIMB_BITS - 1)); v1 >>= 1; while (u1 || v1) /* u1 == 0 can happen at most twice per call */ { mp_limb_t vgtu, t1, t0; sub_ddmmss (t1, t0, u1, u0, v1, v0); vgtu = LIMB_HIGHBIT_TO_MASK(t1); if (UNLIKELY (t0 == 0)) { if (t1 == 0) { r.hi = (u1 << 1) | (u0 >> (GMP_LIMB_BITS - 1)); r.lo = (u0 << 1) | 1; return r; } int c; count_trailing_zeros (c, t1); /* v1 = min (u1, v1) */ v1 += (vgtu & t1); /* u0 = |u1 - v1| */ u0 = (t1 ^ vgtu) - vgtu; ASSERT (c < GMP_LIMB_BITS - 1); u0 >>= c + 1; u1 = 0; } else { int c; count_trailing_zeros (c, t0); c++; /* V <-- min (U, V). Assembly version should use cmov. Another alternative, avoiding carry propagation, would be v0 += vgtu & t0; v1 += vtgu & (u1 - v1); */ add_ssaaaa (v1, v0, v1, v0, vgtu & t1, vgtu & t0); /* U <-- |U - V| No carry handling needed in this conditional negation, since t0 != 0. */ u0 = (t0 ^ vgtu) - vgtu; u1 = t1 ^ vgtu; if (UNLIKELY (c == GMP_LIMB_BITS)) { u0 = u1; u1 = 0; } else { u0 = (u0 >> c) | (u1 << (GMP_LIMB_BITS - c)); u1 >>= c; } } } while ((v0 | u0) & GMP_LIMB_HIGHBIT) { /* At most two iterations */ mp_limb_t vgtu, t0; int c; sub_ddmmss (vgtu, t0, 0, u0, 0, v0); if (UNLIKELY (t0 == 0)) { r.hi = u0 >> (GMP_LIMB_BITS - 1); r.lo = (u0 << 1) | 1; return r; } /* v <-- min (u, v) */ v0 += (vgtu & t0); /* u <-- |u - v| */ u0 = (t0 ^ vgtu) - vgtu; count_trailing_zeros (c, t0); u0 = (u0 >> 1) >> c; } r.lo = mpn_gcd_11 ((u0 << 1) + 1, (v0 << 1) + 1); r.hi = 0; return r; } static void one_test (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl, double_limb_t ref) { double_limb_t r = gcd_22 (ah, al, bh, bl); if (r.lo != ref.lo || r.hi != ref.hi) { gmp_fprintf (stderr, "gcd_22 (0x%Mx %08Mx, 0x%Mx %08Mx) failed, got: 0x%Mx %08Mx, ref: 0x%Mx %08Mx\n", ah, al, bh, bl, r.hi, r.lo, ref.hi, ref.lo); abort(); } } int main (int argc, char **argv) { mpz_t a, b; int count = 100000; int test; gmp_randstate_t rands; gmp_randinit_default (rands); mpz_init (a); mpz_init (b); for (test = 0; test < count; test++) { mp_limb_t al, ah, bl, bh; mp_bitcnt_t size = 1 + gmp_urandomm_ui(rands, 2*GMP_NUMB_BITS); if (test & 1) { mpz_urandomb (a, rands, size); mpz_urandomb (b, rands, size); } else { mpz_rrandomb (a, rands, size); mpz_rrandomb (b, rands, size); } mpz_setbit (a, 0); mpz_setbit (b, 0); al = mpz_getlimbn (a, 0); ah = mpz_getlimbn (a, 1); bl = mpz_getlimbn (b, 0); bh = mpz_getlimbn (b, 1); one_test (ah, al, bh, bl, ref_gcd_22 (ah, al, bh, bl)); } mpz_clear (a); mpz_clear (b); gmp_randclear (rands); return EXIT_SUCCESS; } -- Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677. Internet email is subject to wholesale government surveillance. _______________________________________________ gmp-devel mailing list gmp-devel@gmplib.org https://gmplib.org/mailman/listinfo/gmp-devel