https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97459
--- Comment #9 from Thomas Koenig <tkoenig at gcc dot gnu.org> --- (In reply to Jakub Jelinek from comment #7) > So, can we use this for anything but modulo 3, or 5, or 17, or 257 (all of > those have 2^32 mod N == 2^64 mod N == 2^128 mod N == 1) I think so, too. > probably also > keyed on the target having corresponding uaddv4_optab handler, normal > expansion not being able to handle it and emitting a libcall? Again, yes. This can also be used as a building block for handling division and remainder base 10. Here's a benchmark for this (it uses the sum of digits base 10 instead). qsum1 uses the standard method, which you can find (for example) in libgfortran. div_rem5_v2 first calculates the remainder of the division by 5 using this method, then does an exact division by multiplying with its modular inverse for 2^128. div_rem10_v2 then uses div_rem5_v2 to calculate the value and remainder of the division by 10, and qsum_v2 uses that to calculate the sum of digits. The timings are about a factor of 2 faster than the straightforward libcall version: s = 360398898 qsum_v1: 1.091621 s s = 360398898 qsum_v2: 0.485509 s #include <stdio.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <unistd.h> #include <sys/time.h> #include <stdlib.h> #define ONE ((__uint128_t) 1) #define TWO_64 (ONE << 64) typedef __uint128_t mytype; double this_time () { struct timeval tv; gettimeofday (&tv, NULL); return tv.tv_sec + tv.tv_usec * 1e-6; } unsigned qsum_v1 (mytype n) { unsigned ret; ret = 0; while (n > 0) { ret += n % 10; n = n / 10; } return ret; } static void inline __attribute__((always_inline)) div_rem_5_v2 (mytype n, mytype *div, unsigned *rem) { unsigned long a, b, c; /* The modular inverse to 5 modulo 2^128 */ const mytype magic = (0xCCCCCCCCCCCCCCCC * TWO_64 + 0xCCCCCCCCCCCCCCCD * ONE); b = n >> 64; c = n; if (__builtin_add_overflow (b, c, &a)) a++; *rem = a % 5; *div = (n-*rem) * magic; } static void inline __attribute__((always_inline)) div_rem_10_v2 (mytype n, mytype *div, unsigned *rem) { mytype n5; unsigned rem5; div_rem_5_v2 (n, &n5, &rem5); *rem = rem5 + (n5 % 2) * 5; *div = n5/2; } unsigned qsum_v2 (mytype n) { unsigned ret; unsigned rem; mytype n_new; ret = 0; while (n > 0) { div_rem_10_v2 (n, &n_new, &rem); ret += rem; n = n_new; } return ret; } #define N 10000000 int main() { mytype *a; unsigned long int s; double t1, t2; int fd; long int i; a = malloc (sizeof (*a) * N); fd = open ("/dev/urandom", O_RDONLY); read (fd, a, sizeof (*a) * N); s = 0; t1 = this_time(); for (i=0; i<N; i++) s += qsum_v1(a[i]); t2 = this_time(); printf ("s = %lu qsum_v1: %f s\n", s, (t2-t1)); s = 0; t1 = this_time(); for (i=0; i<N; i++) s += qsum_v2(a[i]); t2 = this_time(); printf ("s = %lu qsum_v2: %f s\n", s, (t2-t1)); return 0; }