Don't count the existing Newton-Raphson out. It turns out that to get enough
precision for 32 bits, only 4 iterations are needed. By unrolling those, it
gets much better timing.

Slightly gross test program (with original cubic wraparound bug fixed).

---
/* Test and measure perf of cube root algorithms.  */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>

#ifdef __x86_64

#define rdtscll(val) do { \
     unsigned int __a,__d; \
     asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
     (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
} while(0)

# define do_div(n,base) ({                                      \
        uint32_t __base = (base);                               \
        uint32_t __rem;                                         \
        __rem = ((uint64_t)(n)) % __base;                       \
        (n) = ((uint64_t)(n)) / __base;                         \
        __rem;                                                  \
 })


/**
 * __ffs - find first bit in word.
 * @word: The word to search
 *
 * Undefined if no bit exists, so code should check against 0 first.
 */
static __inline__ unsigned long __ffs(unsigned long word)
{
        __asm__("bsfq %1,%0"
                :"=r" (word)
                :"rm" (word));
        return word;
}

/*
 * __fls: find last bit set.
 * @word: The word to search
 *
 * Undefined if no zero exists, so code should check against ~0UL first.
 */
static inline unsigned long __fls(unsigned long word)
{
        __asm__("bsrq %1,%0"
                :"=r" (word)
                :"rm" (word));
        return word;
}

/**
 * ffs - find first bit set
 * @x: the word to search
 *
 * This is defined the same way as
 * the libc and compiler builtin ffs routines, therefore
 * differs in spirit from the above ffz (man ffs).
 */
static __inline__ int ffs(int x)
{
        int r;

        __asm__("bsfl %1,%0\n\t"
                "cmovzl %2,%0" 
                : "=r" (r) : "rm" (x), "r" (-1));
        return r+1;
}

/**
 * fls - find last bit set
 * @x: the word to search
 *
 * This is defined the same way as ffs.
 */
static inline int fls(int x)
{
        int r;

        __asm__("bsrl %1,%0\n\t"
                "cmovzl %2,%0"
                : "=&r" (r) : "rm" (x), "rm" (-1));
        return r+1;
}

/**
 * fls64 - find last bit set in 64 bit word
 * @x: the word to search
 *
 * This is defined the same way as fls.
 */
static inline int fls64(uint64_t x)
{
        if (x == 0)
                return 0;
        return __fls(x) + 1;
}

static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
        return dividend / divisor;
}

#elif __i386

#define rdtscll(val) \
     __asm__ __volatile__("rdtsc" : "=A" (val))

/**
 * ffs - find first bit set
 * @x: the word to search
 *
 * This is defined the same way as
 * the libc and compiler builtin ffs routines, therefore
 * differs in spirit from the above ffz() (man ffs).
 */
static inline int ffs(int x)
{
        int r;

        __asm__("bsfl %1,%0\n\t"
                "jnz 1f\n\t"
                "movl $-1,%0\n"
                "1:" : "=r" (r) : "rm" (x));
        return r+1;
}

/**
 * fls - find last bit set
 * @x: the word to search
 *
 * This is defined the same way as ffs().
 */
static inline int fls(int x)
{
        int r;

        __asm__("bsrl %1,%0\n\t"
                "jnz 1f\n\t"
                "movl $-1,%0\n"
                "1:" : "=r" (r) : "rm" (x));
        return r+1;
}

static inline int fls64(uint64_t x)
{
        uint32_t h = x >> 32;
        if (h)
                return fls(h) + 32;
        return fls(x);
}


#define do_div(n,base) ({ \
        unsigned long __upper, __low, __high, __mod, __base; \
        __base = (base); \
        asm("":"=a" (__low), "=d" (__high):"A" (n)); \
        __upper = __high; \
        if (__high) { \
                __upper = __high % (__base); \
                __high = __high / (__base); \
        } \
        asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), 
"1" (__upper)); \
        asm("":"=A" (n):"a" (__low),"d" (__high)); \
        __mod; \
})


/* 64bit divisor, dividend and result. dynamic precision */
static uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
        uint32_t d = divisor;

        if (divisor > 0xffffffffULL) {
                unsigned int shift = fls(divisor >> 32);

                d = divisor >> shift;
                dividend >>= shift;
        }

        /* avoid 64 bit division if possible */
        if (dividend >> 32)
                do_div(dividend, d);
        else
                dividend = (uint32_t) dividend / d;

        return dividend;
}
#endif

/* Andi Kleen's version */
uint32_t acbrt(uint64_t x)
{
        uint32_t y = 0;
        int s;

        for (s = 63; s >= 0; s -= 3) {
                uint64_t b, bs;

                y = 2 * y;
                b = 3 * y * (y+1) + 1;
                bs = b << s;
                if (x >= bs && (b == (bs>>s))) {  /* avoid overflow */
                        x -= bs;
                        y++;
                }
        }
        return y;
}

/* My version of hacker's delight */
uint32_t hcbrt(uint64_t x)
{
        int s = 60;
        uint32_t y = 0;

        do {
                uint64_t b;
                y = 2*y;
                b = (uint64_t)(3*y*(y + 1) + 1) << s;
                s = s - 3;
                if (x >= b) {
                        x = x - b;
                        y = y + 1;
                }
        } while(s >= 0);

        return y;
}

/* calculate the cubic root of x using Newton-Raphson */
static uint32_t ocubic(uint64_t a)
{
        uint32_t x, x1;

        /* Initial estimate is based on:
         * cbrt(x) = exp(log(x) / 3)
         */
        x = 1u << (fls64(a)/3);

        /*
         * Iteration based on:
         *                         2
         * x    = ( 2 * x  +  a / x  ) / 3
         *  k+1          k         k
         */
        do {
                uint64_t x2;

                x2 = x;
                x2 *= x;
                x1 = x;

                x = (2 * x + div64_64(a, x2)) / 3;
        } while (abs(x1 - x) > 1);

        return x;
}

/* calculate the cubic root of x using Newton-Raphson */
static uint32_t ncubic(uint64_t a)
{
        uint64_t x;

        /* Initial estimate is based on:
         * cbrt(x) = exp(log(x) / 3)
         */
        x = 1u << (fls64(a)/3);

        /* Converges in 3 iterations to > 32 bits */

        x = (2 * x + div64_64(a, x*x)) / 3;
        x = (2 * x + div64_64(a, x*x)) / 3;
        x = (2 * x + div64_64(a, x*x)) / 3;

        return x;
}

static const struct cbrt {
        uint64_t        in;
        uint32_t        result;
} cases[] = {
        {1, 1}, {2, 1}, {3, 1}, {4, 1}, {5, 1}, {6, 1}, {7, 1}, 
        {8, 2}, {9, 2}, {10, 2}, {11, 2}, {12, 2}, {13, 2}, {14, 2},
        {15, 2}, {16, 2}, {17, 2}, {18, 2}, {19, 2}, {20, 2}, {21, 2}, {22, 2},
        {23, 2}, {24, 2}, {25, 2}, {26, 2}, {27, 3}, {28, 3}, {29, 3}, {30, 3},
        {31, 3}, {32, 3}, {33, 3}, {34, 3}, {35, 3}, {36, 3}, {37, 3}, {38, 3},
        {39, 3}, {40, 3}, {99, 4}, {100, 4}, {101, 4}, 
        { 125ull, 5 }, { 216ull, 6 },  { 343ull, 7 }, { 512ull, 8 }, 
        { 1000ull, 10 },  { 1331ull, 11 },
        { 8000ull, 20 },  { 9261ull, 21 },
        {32767, 31},     {32768, 32}, {32769, 32}, 
        { 64000ull, 40 },  { 68921ull, 41 },
        { 512000ull, 80 },  { 531441ull, 81 },
        { 1000000ull, 100 },  { 1030301ull, 101 },
        { 4096000ull, 160 },  { 4173281ull, 161 },
        { 16387064ull, 254 },  { 16581375ull, 255 },
        { 16777216ull, 256 },  { 16974593ull, 257 },
        { 131096512ull, 508 },  { 131872229ull, 509 },
        { 132651000ull, 510 },  { 133432831ull, 511 },
        { 134217728ull, 512 },  { 135005697ull, 513 },
        { 1000000000ull, 1000 },  { 1003003001ull, 1001 },
        { 1006012008ull, 1002 },  { 1009027027ull, 1003 },
        { 1061208000ull, 1020 },  { 1064332261ull, 1021 },
        { 1067462648ull, 1022 },  { 1070599167ull, 1023 },
        
        {1073741823, 1023}, {1073741824, 1024}, {1073741825, 1024},
        {~0, 2097151},
/* 100 random values */
 { 7749363893351949254ull, 1978891},  { 7222815480849057907ull, 1933016},
 { 8408462745175416063ull, 2033475},  { 3091884191388096748ull, 1456826},
 { 2562019500164152525ull, 1368340},  { 4403210617922443179ull, 1639041},
 { 3364542905362882299ull, 1498449},  { 8782769017716072774ull, 2063211},
 { 5863405773976003266ull, 1803225},  { 1306053050111174648ull, 1093084},
 { 150346236956174824ull, 531737},  { 1265737889039205261ull, 1081719},
 { 1445109530774087002ull, 1130577},  { 1197105577171186275ull, 1061803},
 { 9213452462461015967ull, 2096399},  { 4730966302945445786ull, 1678739},
 { 5650605098630667570ull, 1781141},  { 5880381756353009591ull, 1804963},
 { 4552499520046621784ull, 1657359},  { 2697991130065918298ull, 1392131},
 { 4858364911220984157ull, 1693674},  { 3691457481531040535ull, 1545489},
 { 2613117305472506601ull, 1377377},  { 7449943749836318932ull, 1953069},
 { 643378865959570610ull, 863287},  { 4851450802679832774ull, 1692871},
 { 1772859812839988916ull, 1210295},  { 8210946489571640849ull, 2017426},
 { 591875965497384322ull, 839608},  { 4221553402965100097ull, 1616183},
 { 2197744667347238205ull, 1300146},  { 8321400714356781191ull, 2026432},
 { 2459557415995497961ull, 1349850},  { 3460673533926954145ull, 1512586},
 { 4727304344741345505ull, 1678306},  { 4903203917250634599ull, 1698869},
 { 4036494370831490817ull, 1592214},  { 8585205035691420311ull, 2047624},
 { 2622143824319236828ull, 1378961},  { 5902762718897731478ull, 1807250},
 { 6344401509618197560ull, 1851243},  { 4059247793194552874ull, 1595200},
 { 7648030174294342832ull, 1970228},  { 2111858627070002939ull, 1282985},
 { 3231502273651985583ull, 1478432},  { 8821862535190318932ull, 2066268},
 { 6062559696943389464ull, 1823414},  { 4054224670122353756ull, 1594541},
 { 3674929609692563482ull, 1543179},  { 6310802012126231363ull, 1847969},
 { 4450190829039920890ull, 1644849},  { 8764531173541462842ull, 2061782},
 { 1361923252301505833ull, 1108453},  { 5912924843615600614ull, 1808287},
 { 5714768882048811324ull, 1787857},  { 7249589769047033748ull, 1935401},
 { 4123157012528828376ull, 1603528},  { 1729687638268160097ull, 1200390},
 { 5132287771298228729ull, 1724925},  { 1564349257200314043ull, 1160854},
 { 951586254223522969ull, 983594},  { 4569664949094662293ull, 1659439},
 { 9082730968228181483ull, 2086437},  { 6312891027251024051ull, 1848173},
 { 6915415788559031791ull, 1905194},  { 2713150456497618688ull, 1394733},
 { 5390954890749602465ull, 1753430},  { 1405547745908296421ull, 1120164},
 { 1157301728707637259ull, 1049902},  { 1513573187112042448ull, 1148156},
 { 687416080475161159ull, 882551},  { 484496930861389501ull, 785411},
 { 1625256440396143907ull, 1175729},  { 7358388240824901288ull, 1945035},
 { 6055730836615196283ull, 1822729},  { 5897962221937294789ull, 1806760},
 { 862205218853780339ull, 951780},  { 4798091009445823173ull, 1686641},
 { 644772714391937867ull, 863910},  { 4255852691293155171ull, 1620549},
 { 5287931004512034672ull, 1742188},  { 479051048987854372ull, 782457},
 { 9223312736680112286ull, 2097147},  { 8208392001457969628ull, 2017217},
 { 9203071384420047828ull, 2095612},  { 8029313043584389618ull, 2002439},
 { 38384068872053008ull, 337326},  { 5477688516749455419ull, 1762784},
 { 1504622508868036557ull, 1145888},  { 8421184723110053200ull, 2034500},
 { 3312070181890020423ull, 1490618},  { 5344298403762143580ull, 1748357},
 { 6340030040222269807ull, 1850818},  { 4895839553118470425ull, 1698018},
 { 2806627376195262363ull, 1410570},  { 5321619225005368821ull, 1745880},
 { 6897323351052656353ull, 1903532},  { 326700202259382556ull, 688731},
 { 7685269066741890339ull, 1973420},  { 8054506481558450217ull, 2004531},
};
#define NCASES (sizeof(cases)/sizeof(cases[0]))


static double ticks_per_usec;

static void show(const char *func, uint64_t sum, uint64_t sq, 
                 unsigned long long mx, unsigned long err)
{
        double mean, std;

        mean = (double) sum / ticks_per_usec / NCASES ;
        std = sqrtl( (double) sq / ticks_per_usec / NCASES - mean * mean);

        printf("%-10s %8llu %8.2f %8.2f %8.2f %lu\n", func, 
               (unsigned long long) sum / NCASES, mean, std, 
               (double) mx / ticks_per_usec, err);
}


int main(int argc, char **argv)
{
        int i;
        uint32_t c;
        unsigned long long start, end, t, mx;
        unsigned long long err, sum, sum_sq;

        rdtscll(start);
        sleep(2);
        rdtscll(end);
        ticks_per_usec = (double) (end - start) / 2000000.;
        printf("Function     clocks  mean(us) max(us)  std(us)  total error\n");

#define DOTEST(func) \
        if (func(27) != 3) printf("ouch\n"); \
        sum = sum_sq = mx = 0; \
        for (err = 0, i = 0; i < NCASES; i++) { \
                rdtscll(start);                                         \
                c = func(cases[i].in); \
                rdtscll(end); \
                t = end - start; sum += t; sum_sq += t*t; \
                if (t > mx) mx = t; \
                err += abs((long) cases[i].result - c); \
        } \
        show(#func, sum, sum_sq, mx, err);

        DOTEST(ocubic);
        DOTEST(ncubic);
        DOTEST(acbrt);
        DOTEST(hcbrt);
        return 0;
}
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to