t...@gmplib.org (Torbjörn Granlund) writes:

> Why would it be faster? Because while cnd1 needs to wait for the
> previous iteration's double-limb shifting, cnd2 depends on just the
> single-limb shifting of the high U word.
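As a standalone illustration of that dependency argument, here is a
sketch of the two conditions, assuming cnd1 denotes the exact "V > U"
mask and cnd2 the high-limb-only approximation, for U = u1:u0 and
V = v1:v0 with the high limbs already below 2^(GMP_LIMB_BITS - 1), as
in the code below:

  mp_limb_t t1, t0, cnd1, cnd2;

  sub_ddmmss (t1, t0, u1, u0, v1, v0);    /* t1:t0 = U - V */

  /* Exact: needs t1, which depends on u0, so the previous iteration's
     double-limb shift of U must have completed. */
  cnd1 = LIMB_HIGHBIT_TO_MASK (t1);

  /* Approximate: needs only u1, i.e., just the single-limb shift of
     the high word.  It differs from cnd1 only when u1 == v1 (and then
     only if v0 > u0), a case the main loop fixes up afterwards. */
  cnd2 = LIMB_HIGHBIT_TO_MASK (u1 - v1);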
The implementation below appears to pass tests, and gives a modest
speedup of 0.2 cycles per input bit, or 2.5%. Benchmark, comparing the
C implementations of gcd_11 and gcd_22:

$ ./tune/speed -p 100000 -s 1-64 -t 3 -C mpn_gcd_11 mpn_gcd_22
overhead 4.01 cycles, precision 100000 units of 1.03e-09 secs, CPU freq 974.93 MHz
          mpn_gcd_11   mpn_gcd_22
1           #7.0542      12.8931
4           #2.8895       6.7366
7           #3.2135       7.0463
10          #3.8417       7.2487
13          #4.4648       7.4802
16          #5.0947       7.4609
19          #5.2140       7.6955
22          #5.3684       7.5914
25          #5.2278       7.7002
28          #5.4690       7.6998
31          #5.3778       7.7374
34          #5.4362       7.7808
37          #5.5201       7.7397
40          #5.3979       7.7096
43          #5.4339       7.7887
46          #5.4229       7.7318
49          #5.4557       7.7160
52          #5.4047       7.6908
55          #5.4043       7.6791
58          #5.3859       7.6680
61          #5.4084       7.7529
64          #5.4277       7.6582

Regards,
/Niels

mp_double_limb_t
mpn_gcd_22 (mp_limb_t u1, mp_limb_t u0, mp_limb_t v1, mp_limb_t v0)
{
  mp_double_limb_t g;
  ASSERT_ALWAYS (u0 & v0 & 1);

  /* Implicit least significant bit */
  u0 = (u0 >> 1) | (u1 << (GMP_LIMB_BITS - 1));
  u1 >>= 1;

  v0 = (v0 >> 1) | (v1 << (GMP_LIMB_BITS - 1));
  v1 >>= 1;

  while (u1 || v1) /* u1 == 0 can happen at most twice per call */
    {
      mp_limb_t vgtu, t1, t0;
      sub_ddmmss (t1, t0, u1, u0, v1, v0);
      /* "V > U" mask, from the high limbs only.  It is zero when
         u1 == v1 even if v0 > u0; that case is fixed up by the sign
         check further down. */
      vgtu = LIMB_HIGHBIT_TO_MASK (u1 - v1);

      if (UNLIKELY (t0 == 0))
        {
          if (t1 == 0)
            {
              g.d1 = (u1 << 1) | (u0 >> (GMP_LIMB_BITS - 1));
              g.d0 = (u0 << 1) | 1;
              return g;
            }
          int c;
          count_trailing_zeros (c, t1);

          /* v1 = min (u1, v1) */
          v1 += (vgtu & t1);
          /* u0 = |u1 - v1| */
          u0 = (t1 ^ vgtu) - vgtu;

          ASSERT (c < GMP_LIMB_BITS - 1);
          u0 >>= c + 1;
          u1 = 0;
        }
      else
        {
          int c;
          count_trailing_zeros (c, t0);
          c++;
          /* V <-- min (U, V).

             Assembly version should use cmov.  Another alternative,
             avoiding carry propagation, would be

               v0 += vgtu & t0;
               v1 += vgtu & (u1 - v1);
          */
          add_ssaaaa (v1, v0, v1, v0, vgtu & t1, vgtu & t0);

          /* U <-- |U - V|
             No carry handling needed in this conditional negation,
             since t0 != 0. */
          u0 = (t0 ^ vgtu) - vgtu;
          u1 = t1 ^ vgtu;
          if (UNLIKELY ((mp_limb_signed_t) u1 < 0))
            {
              /* vgtu got the comparison wrong (u1 == v1 and v0 > u0);
                 complete the negation of U. */
              u0 = -u0;
              u1 = ~u1;
            }
          if (UNLIKELY (c == GMP_LIMB_BITS))
            {
              u0 = u1;
              u1 = 0;
            }
          else
            {
              u0 = (u0 >> c) | (u1 << (GMP_LIMB_BITS - c));
              u1 >>= c;
            }
        }
    }

  /* Now u1 == v1 == 0.  Reduce u0 and v0 until the implicit bit can be
     restored without overflow, i.e., until both high bits are clear. */
  while ((v0 | u0) & GMP_LIMB_HIGHBIT)
    {
      /* At most two iterations */
      mp_limb_t vgtu, t0;
      int c;
      sub_ddmmss (vgtu, t0, 0, u0, 0, v0);
      if (UNLIKELY (t0 == 0))
        {
          g.d1 = u0 >> (GMP_LIMB_BITS - 1);
          g.d0 = (u0 << 1) | 1;
          return g;
        }

      /* v <-- min (u, v) */
      v0 += (vgtu & t0);
      /* u <-- |u - v| */
      u0 = (t0 ^ vgtu) - vgtu;
      count_trailing_zeros (c, t0);
      u0 = (u0 >> 1) >> c;
    }

  /* Restore the implicit least significant bits and finish with the
     single-limb gcd. */
  g.d0 = mpn_gcd_11 ((u0 << 1) + 1, (v0 << 1) + 1);
  g.d1 = 0;
  return g;
}

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.
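P.S. For a quick sanity check of the function above, a sketch along
these lines should work.  It assumes GMP_LIMB_BITS == 64, a compiler
with unsigned __int128, and the function compiled into the same
program; the reference helper gcd128 is mine, not part of GMP:

#include <assert.h>
#include <stdio.h>
#include "gmp-impl.h"   /* mp_limb_t, mp_double_limb_t */

/* Reference double-limb gcd via Euclid's algorithm (two 64-bit limbs
   fit exactly in an unsigned __int128). */
static unsigned __int128
gcd128 (unsigned __int128 a, unsigned __int128 b)
{
  while (b != 0)
    {
      unsigned __int128 t = a % b;
      a = b;
      b = t;
    }
  return a;
}

int
main (void)
{
  /* Both operands must be odd; mpn_gcd_22 asserts this. */
  mp_limb_t u1 = 0x123456789abcdef0, u0 = 0xfedcba9876543211;
  mp_limb_t v1 = 0x0f0e0d0c0b0a0908, v0 = 0x0706050403020101;

  mp_double_limb_t g = mpn_gcd_22 (u1, u0, v1, v0);

  unsigned __int128 u = ((unsigned __int128) u1 << 64) | u0;
  unsigned __int128 v = ((unsigned __int128) v1 << 64) | v0;
  unsigned __int128 r = ((unsigned __int128) g.d1 << 64) | g.d0;

  assert (r == gcd128 (u, v));
  printf ("mpn_gcd_22 agrees with the reference\n");
  return 0;
}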