https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97459
/*
 * --- Comment #5 from Thomas Koenig <tkoenig at gcc dot gnu.org> ---
 *
 * OK, so here is a benchmark with its function names corrected.  It also
 * includes one version (_v5) which is a bit faster.
 *
 * (Note I increased the number of iterations to get more accuracy out of
 * the cycle count, which leads to numbers not being comparable to the
 * previous benchmark.)
 */

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <x86intrin.h>

/* Baseline: n mod 3 computed directly with the 128-bit '%' operator.  */
unsigned
r3_128u_v2 (__uint128_t n)
{
  return (unsigned) (n % 3);
}

/*
 * n mod 3 via 44-bit digits.  2^44 = 4^22 is congruent to 1 (mod 3), so
 * the sum of the 40-bit top piece and the two 44-bit pieces has the same
 * remainder as n.  The sum is below 2^46, so it cannot overflow as long as
 * unsigned long is at least 64 bits wide (assumed throughout this file).
 */
unsigned
r3_128u_v3 (__uint128_t n)
{
  unsigned long a;

  a = (n >> 88);
  a += (n >> 44) & 0xfffffffffffULL;
  a += (n & 0xfffffffffffULL);
  return a % 3;
}

/*
 * n mod 3 via 32-bit digits.  2^32 = 4^16 is congruent to 1 (mod 3), so
 * the sum of the four 32-bit pieces has the same remainder as n.
 */
unsigned
r3_128u_v4 (__uint128_t n)
{
  unsigned long a;

  a = (n >> 96);
  a += (n >> 64) & 0xffffffffULL;
  a += (n >> 32) & 0xffffffffULL;
  a += (n & 0xffffffffULL);
  return a % 3;
}

/*
 * n mod 3 via the two 64-bit halves.  2^64 is congruent to 1 (mod 3), so
 * high + low has the same remainder as n.  When the 64-bit addition wraps,
 * the true sum is a + 2^64, which is congruent to a + 1; the increment
 * cannot wrap again because a wrapped sum is at most 2^64 - 2.
 */
unsigned
r3_128u_v5 (__uint128_t n)
{
  unsigned long a, b, c;

  b = n >> 64;
  c = n;
  if (__builtin_add_overflow (b, c, &a))
    a++;
  return a % 3;
}

/* Number of 128-bit inputs that are generated and timed.  */
#define N 100000000

/*
 * Fill an array of N random 128-bit values, then time each r3_128u_*
 * variant over the whole array with __rdtsc and print the checksum and
 * the average cycle count per call.
 *
 * Returns 0 on success, EXIT_FAILURE if the input buffer cannot be
 * allocated or filled.
 */
int
main (void)
{
  __uint128_t *a;
  unsigned int s;
  unsigned long t1, t2;
  size_t total, filled;
  int fd;
  int i;

  total = sizeof (*a) * N;
  a = malloc (total);
  if (a == NULL)
    {
      perror ("malloc");
      return EXIT_FAILURE;
    }

  /* /dev/urandom rather than /dev/random: the benchmark only needs
     arbitrary bit patterns, and /dev/random may block on some kernels
     long before this much data has been delivered.  */
  fd = open ("/dev/urandom", O_RDONLY);
  if (fd < 0)
    {
      perror ("open");
      free (a);
      return EXIT_FAILURE;
    }

  /* read() may return fewer bytes than requested, so keep reading until
     the whole buffer is filled; otherwise the tail of the array is never
     written and the benchmark does not run on random inputs.  */
  filled = 0;
  while (filled < total)
    {
      ssize_t got = read (fd, (unsigned char *) a + filled, total - filled);

      if (got < 0)
        {
          if (errno == EINTR)
            continue;
          perror ("read");
          close (fd);
          free (a);
          return EXIT_FAILURE;
        }
      if (got == 0)
        {
          fprintf (stderr, "read: unexpected end of input\n");
          close (fd);
          free (a);
          return EXIT_FAILURE;
        }
      filled += (size_t) got;
    }
  close (fd);

  /* The four timing loops are deliberately written out one by one and
     call each variant directly, as in the original benchmark, so that the
     call sites stay the same from the compiler's point of view.  */
  s = 0;
  t1 = __rdtsc ();
  for (i = 0; i < N; i++)
    s += r3_128u_v2 (a[i]);
  t2 = __rdtsc ();
  printf ("s = %u r3_128u_v2: %f cycles per iteration\n", s,
          (t2 - t1) / (double) N);

  s = 0;
  t1 = __rdtsc ();
  for (i = 0; i < N; i++)
    s += r3_128u_v3 (a[i]);
  t2 = __rdtsc ();
  printf ("s = %u r3_128u_v3: %f cycles per iteration\n", s,
          (t2 - t1) / (double) N);

  s = 0;
  t1 = __rdtsc ();
  for (i = 0; i < N; i++)
    s += r3_128u_v4 (a[i]);
  t2 = __rdtsc ();
  printf ("s = %u r3_128u_v4: %f cycles per iteration\n", s,
          (t2 - t1) / (double) N);

  s = 0;
  t1 = __rdtsc ();
  for (i = 0; i < N; i++)
    s += r3_128u_v5 (a[i]);
  t2 = __rdtsc ();
  printf ("s = %u r3_128u_v5: %f cycles per iteration\n", s,
          (t2 - t1) / (double) N);

  free (a);
  return 0;
}

/*
 * This gets me
 *
 * s = 6 r3_128u_v2: 12.638648 cycles per iteration
 * s = 6 r3_128u_v3: 5.588043 cycles per iteration
 * s = 6 r3_128u_v4: 5.524949 cycles per iteration
 * s = 6 r3_128u_v5: 3.539010 cycles per iteration
 *
 * so the _v5 version seems to be the fastest (so far).
 */

/*
 * NOTE(review): the figures quoted above were produced by the program as
 * originally posted, which issued one unchecked read() from /dev/random.
 * A checksum of only 6 over 10^8 inputs suggests that almost all of the
 * buffer was still zero in that run (presumably a short read) -- confirm
 * by re-running.  With the buffer completely filled, both the checksum
 * and the cycle counts are expected to differ from the quoted ones.
 */