https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102989
--- Comment #70 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
For right shifts, I wonder if we shouldn't emit inline (perhaps with the exception of -Os) something like:

__attribute__((noipa)) void
ashiftrt575 (unsigned long *p, unsigned long *q, int n)
{
  int prec = 575;
  int n1 = n & 63;
  int n2 = n / 64;
  int n3 = n1 != 0;
  int n4 = (-n1) & 63;
  unsigned long ext;
  int i;
  for (i = n2; i < prec / 64 - n3; ++i)
    p[i - n2] = (q[i] >> n1) | (q[i + n3] << n4);
  ext = ((signed long) (q[prec / 64] << (64 - (prec & 63)))) >> (64 - (prec & 63));
  if (n1 && i == prec / 64 - n3)
    {
      p[i - n2] = (q[i] >> n1) | (ext << n4);
      ++i;
    }
  i -= n2;
  p[i] = ((signed long) ext) >> n1;
  ext = ((signed long) ext) >> 63;
  for (++i; i < prec / 64 + 1; ++i)
    p[i] = ext;
}

__attribute__((noipa)) void
lshiftrt575 (unsigned long *p, unsigned long *q, int n)
{
  int prec = 575;
  int n1 = n & 63;
  int n2 = n / 64;
  int n3 = n1 != 0;
  int n4 = (-n1) & 63;
  unsigned long ext;
  int i;
  for (i = n2; i < prec / 64 - n3; ++i)
    p[i - n2] = (q[i] >> n1) | (q[i + n3] << n4);
  ext = q[prec / 64] & ((1UL << (prec % 64)) - 1);
  if (n1 && i == prec / 64 - n3)
    {
      p[i - n2] = (q[i] >> n1) | (ext << n4);
      ++i;
    }
  i -= n2;
  p[i] = ext >> n1;
  ext = 0;
  for (++i; i < prec / 64 + 1; ++i)
    p[i] = 0;
}

(for _BitInt(575) and 64-bit limb little endian). If the shift count is constant, it will allow further optimizations, and if e.g. get_nonzero_bits tells us that n is variable but a multiple of limb precision, we can optimize some more as well. Looking at what LLVM does, they seem to sign extend in memory to twice as many bits and then just use an unrolled loop without any conditionals, but that doesn't look good for memory usage etc.