https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102989

--- Comment #70 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
For right shifts, I wonder if we shouldn't emit inline (perhaps with the
exception of -Os) something like:
/* Arithmetic (sign-extending) right shift of a 575-bit value by n bits.
   q points to the source and p to the destination, both stored as 64-bit
   little-endian limbs (9 limbs; the top limb holds the final 63 bits).
   Example lowering for _BitInt(575) on a 64-bit-limb little-endian target.  */
__attribute__((noipa)) void
ashiftrt575 (unsigned long *p, unsigned long *q, int n)
{
  int prec = 575;       /* Bit precision of the _BitInt.  */
  int n1 = n & 63;      /* Shift amount within a limb.  */
  int n2 = n / 64;      /* Number of whole limbs shifted out.  */
  int n3 = n1 != 0;     /* 1 if bits cross a limb boundary.  */
  int n4 = (-n1) & 63;  /* Complementary shift; 0 when n1 == 0, so the
                           << n4 below never shifts by 64 (which is UB).  */
  unsigned long ext;
  int i;
  /* Limbs composed of two adjacent source limbs.  When n1 == 0,
     n3 == n4 == 0 and each iteration degenerates to p[i - n2] = q[i].  */
  for (i = n2; i < prec / 64 - n3; ++i)
    p[i - n2] = (q[i] >> n1) | (q[i + n3] << n4);
  /* Sign-extend the partial top limb (prec & 63 = 63 valid bits)
     to a full 64-bit limb.  */
  ext = ((signed long) (q[prec / 64] << (64 - (prec & 63)))) >> (64 - (prec & 63));
  /* Last combined limb mixes the highest full source limb with the
     sign-extended top limb.  */
  if (n1 && i == prec / 64 - n3)
    {
      p[i - n2] = (q[i] >> n1) | (ext << n4);
      ++i;
    }
  i -= n2;
  /* Next limb comes solely from the sign-extended top limb.  */
  p[i] = ((signed long) ext) >> n1;
  /* Fill the remaining high limbs with the sign (all-zeros or all-ones).  */
  ext = ((signed long) ext) >> 63;
  for (++i; i < prec / 64 + 1; ++i)
    p[i] = ext;
}

/* Logical (zero-extending) right shift of a 575-bit value by n bits.
   q points to the source and p to the destination, both stored as 64-bit
   little-endian limbs (9 limbs; the top limb holds the final 63 bits).
   Example lowering for _BitInt(575) on a 64-bit-limb little-endian target.  */
__attribute__((noipa)) void
lshiftrt575 (unsigned long *p, unsigned long *q, int n)
{
  int prec = 575;       /* Bit precision of the _BitInt.  */
  int n1 = n & 63;      /* Shift amount within a limb.  */
  int n2 = n / 64;      /* Number of whole limbs shifted out.  */
  int n3 = n1 != 0;     /* 1 if bits cross a limb boundary.  */
  int n4 = (-n1) & 63;  /* Complementary shift; 0 when n1 == 0, so the
                           << n4 below never shifts by 64 (which is UB).  */
  unsigned long ext;
  int i;
  /* Limbs composed of two adjacent source limbs.  When n1 == 0,
     n3 == n4 == 0 and each iteration degenerates to p[i - n2] = q[i].  */
  for (i = n2; i < prec / 64 - n3; ++i)
    p[i - n2] = (q[i] >> n1) | (q[i + n3] << n4);
  /* Mask the partial top limb to its prec % 64 = 63 valid bits.  */
  ext = q[prec / 64] & ((1UL << (prec % 64)) - 1);
  /* Last combined limb mixes the highest full source limb with the
     masked top limb.  */
  if (n1 && i == prec / 64 - n3)
    {
      p[i - n2] = (q[i] >> n1) | (ext << n4);
      ++i;
    }
  i -= n2;
  /* Next limb comes solely from the masked top limb.  */
  p[i] = ext >> n1;
  /* Zero-fill the remaining high limbs.  (The original's trailing
     "ext = 0;" was a dead store -- ext is never read again -- and has
     been dropped.)  */
  for (++i; i < prec / 64 + 1; ++i)
    p[i] = 0;
}
(for _BitInt(575) and 64-bit limb little endian).  If the shift count is
constant, it will allow further optimizations,
and if e.g. get_nonzero_bits tells us that n is variable but multiple of limb
precision, we can optimize some more as well.
Looking at what LLVM does, they seem to sign extend in memory to twice as many
bits and then just use an unrolled loop without any conditionals, but that
doesn't look good for memory usage etc.

Reply via email to