https://gcc.gnu.org/bugzilla/show_bug.cgi?id=118499
--- Comment #24 from Thomas Koenig <tkoenig at gcc dot gnu.org> ---
Considering that ctz is rather expensive, comparable to an
integer multiplication, I think I will do away with this
optimization altogether - we are spending log2(n) imuls anyway.
I think the library version should look something like (where I know
that the first n == 0 comparison is not needed, but I had it in
for testing purposes).
#define XTYPE uint64_t
#define NTYPE uint32_t
XTYPE static inline
power_simple (XTYPE x, NTYPE n)
{
XTYPE pow = 1;
if (n == 0)
return 1;
for (;;)
{
if (n & 1)
pow *= x;
n >>= 1;
if (n)
x *= x;
else
break;
}
return pow;
}
XTYPE
power_complicated (XTYPE x, NTYPE n)
{
const XTYPE mask = (XTYPE) (-1) / 2;
if (n == 0)
return 1;
if (x == 0)
return 0;
if (x & 1)
return power_simple (x, n & mask);
if (n > sizeof (x) * 8)
return 0;
return power_simple (x, n);
}