https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97459

--- Comment #5 from Thomas Koenig <tkoenig at gcc dot gnu.org> ---
OK, so here is a benchmark with its function names corrected. It also
includes one version (_v5) which is a bit faster.

(Note: I increased the number of iterations to get a more accurate
cycle count, so these numbers are not comparable to those from the
previous benchmark.)

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <x86intrin.h>

/* Baseline: remainder of a 128-bit unsigned value modulo 3, computed
   with the plain % operator on the full-width operand.  */
unsigned r3_128u_v2 (__uint128_t n)
{
  const __uint128_t remainder = n % 3;

  return (unsigned) remainder;
}

/* Remainder modulo 3 via 44-bit limbs.  Because 2^44 is congruent to 1
   modulo 3, the sum of the three limbs has the same remainder as N.
   The limbs are 40, 44 and 44 bits wide, so their sum cannot overflow
   a 64-bit unsigned long.  */
unsigned r3_128u_v3 (__uint128_t n)
{
  const unsigned long limb_mask = 0xfffffffffffULL;
  unsigned long top = (unsigned long) (n >> 88);
  unsigned long middle = (unsigned long) ((n >> 44) & limb_mask);
  unsigned long bottom = (unsigned long) (n & limb_mask);
  unsigned long limb_sum = top + middle + bottom;

  return (unsigned) (limb_sum % 3);
}

/* Remainder modulo 3 via four 32-bit limbs.  Because 2^32 is congruent
   to 1 modulo 3, the limb sum has the same remainder as N; four 32-bit
   values cannot overflow a 64-bit unsigned long.  */
unsigned r3_128u_v4 (__uint128_t n)
{
  const unsigned long limb_mask = 0xffffffffULL;
  unsigned long limb3 = (unsigned long) (n >> 96);
  unsigned long limb2 = (unsigned long) ((n >> 64) & limb_mask);
  unsigned long limb1 = (unsigned long) ((n >> 32) & limb_mask);
  unsigned long limb0 = (unsigned long) (n & limb_mask);
  unsigned long limb_sum = limb3 + limb2 + limb1 + limb0;

  return (unsigned) (limb_sum % 3);
}

/* Remainder modulo 3 via the two 64-bit halves.  Because 2^64 is
   congruent to 1 modulo 3, high + low has the same remainder as N.
   If that addition wraps, the lost carry is worth 2^64, i.e. 1 modulo
   3, so it is added back into the wrapped sum.  */
unsigned r3_128u_v5 (__uint128_t n)
{
  unsigned long high = (unsigned long) (n >> 64);
  unsigned long low = (unsigned long) n;
  unsigned long folded;

  if (__builtin_add_overflow (high, low, &folded))
    {
      /* The wrapped sum is at most 2^64 - 2, so this cannot wrap again.  */
      folded += 1;
    }

  return (unsigned) (folded % 3);
}

#define N 100000000

/* Read exactly LEN bytes from FD into BUF.  A single read() may return
   fewer bytes than requested, so keep reading until the buffer is full.
   Returns 0 on success, -1 on a read error or premature end of file.  */
static int
fill_from_fd (int fd, void *buf, size_t len)
{
  unsigned char *dst = buf;

  while (len > 0)
    {
      ssize_t got = read (fd, dst, len);
      if (got <= 0)
        return -1;
      dst += got;
      len -= (size_t) got;
    }
  return 0;
}

/* Benchmark driver: fills an array of N 128-bit values with random
   bytes, then times each r3_128u_* variant over the whole array with
   the time-stamp counter and prints the average cycles per call.  The
   running sum S is printed so the calls cannot be optimized away and
   so the variants can be checked against each other.

   Returns 0 on success, EXIT_FAILURE if the input buffer could not be
   allocated or filled.  */
int main (void)
{
  __uint128_t *a;
  unsigned int s;
  unsigned long t1, t2;
  int fd;
  int i;

  a = malloc (sizeof (*a) * N);
  if (a == NULL)
    {
      perror ("malloc");
      return EXIT_FAILURE;
    }

  fd = open ("/dev/random", O_RDONLY);
  if (fd < 0)
    {
      perror ("open");
      free (a);
      return EXIT_FAILURE;
    }

  /* The whole buffer must really be random: if only a short prefix is
     filled, the remaining elements are not random and the timings
     (especially for the branchy _v5 variant) are not representative.
     NOTE(review): on systems where /dev/random blocks when entropy is
     low, filling the buffer may take a long time.  */
  if (fill_from_fd (fd, a, sizeof (*a) * N) != 0)
    {
      fprintf (stderr, "failed to read random input data\n");
      close (fd);
      free (a);
      return EXIT_FAILURE;
    }
  close (fd);

  s = 0;
  t1 = __rdtsc();
  for (i=0; i<N; i++)
    s += r3_128u_v2(a[i]);
  t2 = __rdtsc();
  printf ("s = %u r3_128u_v2: %f cycles per iteration\n", s,
          (t2-t1)/(double) N);

  s = 0;
  t1 = __rdtsc();
  for (i=0; i<N; i++)
    s += r3_128u_v3(a[i]);
  t2 = __rdtsc();
  printf ("s = %u r3_128u_v3: %f cycles per iteration\n", s,
          (t2-t1)/(double) N);

  s = 0;
  t1 = __rdtsc();
  for (i=0; i<N; i++)
    s += r3_128u_v4(a[i]);
  t2 = __rdtsc();
  printf ("s = %u r3_128u_v4: %f cycles per iteration\n", s,
          (t2-t1)/(double) N);

  s = 0;
  t1 = __rdtsc();
  for (i=0; i<N; i++)
    s += r3_128u_v5(a[i]);
  t2 = __rdtsc();
  printf ("s = %u r3_128u_v5: %f cycles per iteration\n", s,
          (t2-t1)/(double) N);

  free (a);
  return 0;
}

This gets me

s = 6 r3_128u_v2: 12.638648 cycles per iteration
s = 6 r3_128u_v3: 5.588043 cycles per iteration
s = 6 r3_128u_v4: 5.524949 cycles per iteration
s = 6 r3_128u_v5: 3.539010 cycles per iteration

so the _v5 version seems to be the fastest (so far).

Reply via email to