https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97459

--- Comment #9 from Thomas Koenig <tkoenig at gcc dot gnu.org> ---
(In reply to Jakub Jelinek from comment #7)
> So, can we use this for anything but modulo 3, or 5, or 17, or 257 (all of
> those have 2^32 mod N == 2^64 mod N == 2^128 mod N == 1)

I think so, too.

> probably also
> keyed on the target having corresponding uaddv4_optab handler, normal
> expansion not being able to handle it and emitting a libcall?

Again, yes.

This can also be used as a building block for handling division
and remainder base 10.

Here's a benchmark for this (it uses the sum of digits base 10
instead). qsum1 uses the standard method, which you can find
(for example) in libgfortran.

div_rem5_v2 first calculates the remainder of the division by 5 using this
method, then does an exact division by multiplying with its modular inverse
for 2^128.

div_rem10_v2 then uses div_rem5_v2 to calculate the value and
remainder of the division by 10, and qsum_v2 uses that to
calculate the sum of digits.

The timings are about a factor of 2 faster than the straightforward
libcall version:

s = 360398898 qsum_v1: 1.091621 s
s = 360398898 qsum_v2: 0.485509 s


#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/time.h>
#include <stdlib.h>

#define ONE ((__uint128_t) 1)
#define TWO_64 (ONE << 64)

typedef __uint128_t mytype;

double this_time ()
{
  struct timeval tv;
  gettimeofday (&tv, NULL);
  return tv.tv_sec + tv.tv_usec * 1e-6;
}

unsigned
qsum_v1 (mytype n)
{
  unsigned ret;
  ret = 0;
  while (n > 0)
    {
      ret += n % 10;
      n = n / 10;
    }
  return ret;
}

static void inline __attribute__((always_inline))
div_rem_5_v2 (mytype n, mytype *div, unsigned *rem)
{
  unsigned long a, b, c;
  /* The modular inverse to 5 modulo 2^128  */
  const mytype magic = (0xCCCCCCCCCCCCCCCC * TWO_64 + 0xCCCCCCCCCCCCCCCD *
ONE);
  b = n >> 64;
  c = n;
  if (__builtin_add_overflow (b, c, &a))
    a++;

  *rem = a % 5;
  *div = (n-*rem) * magic;
}

static void inline __attribute__((always_inline))
div_rem_10_v2 (mytype n, mytype *div, unsigned *rem)
{
  mytype n5;
  unsigned rem5;
  div_rem_5_v2 (n, &n5, &rem5);
  *rem = rem5 + (n5 % 2) * 5;
  *div = n5/2;
}

unsigned
qsum_v2 (mytype n)
{
  unsigned ret;
  unsigned rem;
  mytype n_new;
  ret = 0;
  while (n > 0)
    {
      div_rem_10_v2 (n, &n_new, &rem);
      ret += rem;
      n = n_new;
    }
  return ret;
}

#define N 10000000

int main()
{
  mytype *a;
  unsigned long int s;
  double t1, t2;
  int fd;
  long int i;
  a = malloc (sizeof (*a) * N);
  fd = open ("/dev/urandom", O_RDONLY);
  read (fd, a, sizeof (*a) * N);

  s = 0;
  t1 = this_time();
  for (i=0; i<N; i++)
    s += qsum_v1(a[i]);

  t2 = this_time();
  printf ("s = %lu qsum_v1: %f s\n", s, (t2-t1));

  s = 0;
  t1 = this_time();
  for (i=0; i<N; i++)
    s += qsum_v2(a[i]);

  t2 = this_time();
  printf ("s = %lu qsum_v2: %f s\n", s, (t2-t1));

  return 0;
}

Reply via email to