Re: [RFC] div64_64 support

Sami Farin Wed, 07 Mar 2007 11:19:07 -0800

On Wed, Mar 07, 2007 at 11:11:49 -0500, Chuck Ebbert wrote:
> Sami Farin wrote:
> > On Tue, Mar 06, 2007 at 23:53:49 +0200, Sami Farin wrote:
> > ...
> >> And I found bug in gcc-4.1.2, it gave 0 for ncubic results
> >> when doing 1000 loops test... gcc-4.0.3 works.
> > 
> > Found it.
> > 
> > --- cbrt-test.c~    2007-03-07 00:20:54.735248105 +0200
> > +++ cbrt-test.c     2007-03-07 00:21:03.964864343 +0200
> > @@ -209,7 +209,7 @@
> >  
> >     __asm__("bsrl %1,%0\n\t"
> >             "cmovzl %2,%0"
> > -           : "=&r" (r) : "rm" (x), "rm" (-1));
> > +           : "=&r" (r) : "rm" (x), "rm" (-1) : "memory");
> >     return r+1;
> >  }
> >  
> > Now Linux 2.6 does not have "memory" in fls, maybe it causes
> > some gcc funnies some people are seeing.
> 
> Can you post the difference in the generated code with that change?


Fun.. looks when not using "memory" gcc does not even bother
calling ncubic() 666 times.  So it gets better timings ( 42/666=0 ) =)

--- cbrt-test-no_memory.s       2007-03-07 20:22:27.838466385 +0200
+++ cbrt-test-using_memory.s    2007-03-07 20:22:38.237013197 +0200
...
 main:
        leal    4(%esp), %ecx
        andl    $-16, %esp
        pushl   -4(%ecx)
        pushl   %ebp
        pushl   %edi
        pushl   %esi
        pushl   %ebx
        pushl   %ecx
-       subl    $136, %esp
+       subl    $152, %esp
        movl    $.LC0, (%esp)
        call    puts
        xorl    %edx, %edx
        movl    $27, %eax
        call    ncubic
        cmpl    $3, %eax
-       je      .L83
+       je      .L87
        movl    $.LC1, (%esp)
        call    puts
-.L83:
-       xorl    %eax, %eax
-       xorl    %edi, %edi
-       movl    %eax, 88(%esp)
+.L87:
        xorl    %eax, %eax
-       xorl    %esi, %esi
+       xorl    %ebp, %ebp
        movl    %eax, 92(%esp)
        xorl    %eax, %eax
-       xorl    %ebp, %ebp
+       xorl    %edi, %edi
        movl    %eax, 96(%esp)
        xorl    %eax, %eax
+       xorl    %esi, %esi
        movl    %eax, 100(%esp)
        xorl    %eax, %eax
        movl    %eax, 104(%esp)
        xorl    %eax, %eax
        movl    %eax, 108(%esp)
-       movl    %edi, 112(%esp)
-       movl    %esi, 116(%esp)
-       .p2align 4,,15
-.L84:
+       xorl    %eax, %eax
+       movl    %eax, 112(%esp)
+       movl    %ebp, 116(%esp)
+       movl    %edi, 120(%esp)
+       movl    %esi, 124(%esp)
+.L88:
 #APP
        movl $0, %eax
        cpuid
        rdtsc
 
 #NO_APP
        movl    %eax, 56(%esp)
        movl    %edx, 60(%esp)
 #APP
        movl $0, %eax
        cpuid
        rdtsc
 
 #NO_APP
        movl    %eax, %esi
        movl    %edx, %edi
        subl    56(%esp), %esi
        sbbl    60(%esp), %edi
        cmpl    $0, %edi
        ja      .L66
        cmpl    $999, %esi
-       jbe     .L84
+       jbe     .L88
 .L66:
+       movl    92(%esp), %edx
+       leal    (%edx,%edx,2), %eax
+       movl    cases+4(,%eax,4), %edi
+       movl    cases(,%eax,4), %esi
+       movl    %edi, %edx
+       movl    %esi, %eax
+       call    ncubic
 #APP
        movl $0, %eax
        cpuid
        rdtsc
 
 #NO_APP
-       movl    %eax, %esi
-       movl    %edx, %edi
+       movl    $666, %ebx
+       movl    %eax, 128(%esp)
+       movl    %edx, 132(%esp)
+       .p2align 4,,15
+.L67:
+       movl    %esi, %eax
+       movl    %edi, %edx
+       call    ncubic
+       decl    %ebx
+       movl    %eax, %ebp
+       jne     .L67
 #APP
        movl $0, %eax
        cpuid
        rdtsc
 
 #NO_APP
-       subl    %esi, %eax
+       subl    128(%esp), %eax
        movl    $666, %ebx
-       sbbl    %edi, %edx
-       xorl    %ecx, %ecx
        movl    %ebx, 8(%esp)
+       sbbl    132(%esp), %edx
+       xorl    %ecx, %ecx
        movl    %ecx, 12(%esp)
        movl    %eax, (%esp)
        movl    %edx, 4(%esp)
        call    __udivdi3
-       addl    %eax, 104(%esp)
+       addl    %eax, 112(%esp)
        movl    %edx, %ecx
        movl    %eax, %ebx
        movl    %edx, %esi
-       adcl    %edx, 108(%esp)
+       adcl    %edx, 116(%esp)
        imull   %eax, %ecx
        mull    %ebx
        addl    %ecx, %ecx
        movl    %eax, 56(%esp)
        addl    %ecx, %edx
        movl    56(%esp), %eax
-       addl    %eax, 112(%esp)
+       addl    %eax, 120(%esp)
        movl    %edx, 60(%esp)
        movl    60(%esp), %edx
-       adcl    %edx, 116(%esp)
-       cmpl    %esi, 92(%esp)
-       ja      .L67
-       jb      .L68
-       cmpl    %ebx, 88(%esp)
-       jae     .L67
-.L68:
-       movl    %ebx, 88(%esp)
-       movl    %esi, 92(%esp)
-.L67:
-       leal    (%ebp,%ebp,2), %ebx
-       sall    $2, %ebx
-       movl    cases+4(%ebx), %edx
-       movl    cases(%ebx), %eax
-       call    ncubic
-       movl    cases+8(%ebx), %edx
-       subl    %eax, %edx
-       movl    %edx, %eax
-       sarl    $31, %eax
-       xorl    %eax, %edx
-       subl    %eax, %edx
-       movl    %edx, %ecx
-       sarl    $31, %ecx
-       addl    %edx, 96(%esp)
-       adcl    %ecx, 100(%esp)
-       incl    %ebp
-       cmpl    $183, %ebp
-       jbe     .L84
-       movl    108(%esp), %eax
-       fildll  104(%esp)
-       testl   %eax, %eax
-       js      .L85
+       adcl    %edx, 124(%esp)
+       cmpl    %esi, 100(%esp)
+       ja      .L69
+       jb      .L70
+       cmpl    %ebx, 96(%esp)
+       jae     .L69
 .L70:
-       fstpl   120(%esp)
+       movl    %ebx, 96(%esp)
+       movl    %esi, 100(%esp)
+.L69:
+       movl    92(%esp), %edx
+       leal    (%edx,%edx,2), %eax
+       movl    cases+8(,%eax,4), %eax
+       subl    %ebp, %eax
+       movl    %eax, %ecx
+       sarl    $31, %ecx
+       xorl    %ecx, %eax
+       subl    %ecx, %eax
+       cltd
+       addl    %eax, 104(%esp)
+       adcl    %edx, 108(%esp)
+       incl    92(%esp)
+       cmpl    $183, 92(%esp)
+       jbe     .L88
        movl    116(%esp), %eax
-       fldl    120(%esp)
+       fildll  112(%esp)
+       testl   %eax, %eax
+       js      .L89
+.L72:
+       fstpl   136(%esp)
+       movl    124(%esp), %eax
+       fldl    136(%esp)
        fdivl   .LC7
        testl   %eax, %eax
        flds    .LC4
        fdivr   %st, %st(1)
-       fildll  112(%esp)
-       js      .L86
-.L71:
-       fstpl   120(%esp)
-       fldl    120(%esp)
+       fildll  120(%esp)
+       js      .L90
+.L73:
+       fstpl   136(%esp)
+       fldl    136(%esp)
        fdivl   .LC7
        fdivp   %st, %st(1)
        fld     %st(1)
        fmul    %st(2), %st
        fsubrp  %st, %st(1)
        fld     %st(0)
        fsqrt
        fucomi  %st(0), %st
-       jp      .L88
-       je      .L89
-.L88:
+       jp      .L92
+       je      .L93
+.L92:
        fstp    %st(0)
        fstpl   (%esp)
        fstpl   64(%esp)
        call    sqrt
        fldl    64(%esp)
        fxch    %st(1)
-.L72:
-       movl    96(%esp), %eax
-       movl    100(%esp), %edx
-       fildll  88(%esp)
+.L74:
+       movl    104(%esp), %eax
+       movl    108(%esp), %edx
+       fildll  96(%esp)
        movl    %eax, 40(%esp)
-       movl    92(%esp), %eax
+       movl    100(%esp), %eax
        movl    %edx, 44(%esp)
        testl   %eax, %eax
-       js      .L87
-.L73:
-       fstpl   120(%esp)
-       movl    104(%esp), %eax
+       js      .L91
+.L75:
+       fstpl   136(%esp)
+       movl    112(%esp), %eax
        movl    $184, %ebp
-       fldl    120(%esp)
+       fldl    136(%esp)
        xorl    %edi, %edi
        movl    $.LC5, %esi
        fdivl   .LC7
-       movl    108(%esp), %edx
+       movl    116(%esp), %edx
        movl    %ebp, 8(%esp)
        movl    %edi, 12(%esp)
        movl    %eax, (%esp)
        movl    %edx, 4(%esp)
        fstpl   32(%esp)
        fstpl   24(%esp)
        fstpl   16(%esp)
        call    __udivdi3
        movl    %esi, 4(%esp)
        movl    $.LC6, (%esp)
        movl    %eax, 8(%esp)
        movl    %edx, 12(%esp)
        call    printf
-       addl    $136, %esp
+       addl    $152, %esp
        xorl    %eax, %eax
        popl    %ecx
        popl    %ebx
        popl    %esi
        popl    %edi
        popl    %ebp
        leal    -4(%ecx), %esp
        ret
-.L89:
+.L93:
        fstp    %st(1)
+       jmp     .L74
+.L89:
+       fadds   .LC2
        jmp     .L72
-.L85:
+.L91:
        fadds   .LC2
-       jmp     .L70
-.L87:
+       jmp     .L75
+.L90:
        fadds   .LC2
        jmp     .L73
-.L86:
-       fadds   .LC2
-       jmp     .L71
        .size   main, .-main
        .section        .rodata
        .align 32
        .type   cases, @object
        .size   cases, 2208
 cases:
...


--

/*

Don't count the existing Newton-Raphson out. It turns out that to get enough
precision for 32 bits, only 4 iterations are needed. By unrolling those, it
gets much better timing.

Slightly gross test program (with original cubic wraparound bug fixed).

---

*/

/* Test and measure perf of cube root algorithms.  */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>

#define MEMCLOBBER 1

/**
 * ffs - find first bit set
 * @x: the word to search
 *
 * This is defined the same way as
 * the libc and compiler builtin ffs routines, therefore
 * differs in spirit from the above ffz (man ffs).
 */
static inline int ffs(int x)
{
        int r;

        __asm__("bsfl %1,%0\n\t"
                "cmovzl %2,%0" 
#if MEMCLOBBER
                : "=&r" (r) : "rm" (x), "r" (-1) : "memory");
#else
                : "=&r" (r) : "rm" (x), "r" (-1));
#endif
        return r+1;
}

/**
 * fls - find last bit set
 * @x: the word to search
 *
 * This is defined the same way as ffs.
 */
static inline int fls(int x)
{
        int r;

        __asm__("bsrl %1,%0\n\t"
                "cmovzl %2,%0"
#if MEMCLOBBER
                : "=&r" (r) : "rm" (x), "r" (-1) : "memory" );
#else
                : "=&r" (r) : "rm" (x), "r" (-1));
#endif
        return r+1;
}

#ifdef __x86_64

#define rdtscll(val) do { \
     unsigned int __a,__d; \
     asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
     (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
} while(0)

# define do_div(n,base) ({                                      \
        uint32_t __base = (base);                               \
        uint32_t __rem;                                         \
        __rem = ((uint64_t)(n)) % __base;                       \
        (n) = ((uint64_t)(n)) / __base;                         \
        __rem;                                                  \
 })


/**
 * __ffs - find first bit in word.
 * @word: The word to search
 *
 * Undefined if no bit exists, so code should check against 0 first.
 */
static inline unsigned long __ffs(unsigned long word)
{
        __asm__("bsfq %1,%0"
                :"=r" (word)
                :"rm" (word));
        return word;
}

/*
 * __fls: find last bit set.
 * @word: The word to search
 *
 * Undefined if no zero exists, so code should check against ~0UL first.
 */
static inline unsigned long __fls(unsigned long word)
{
        __asm__("bsrq %1,%0"
                :"=r" (word)
                :"rm" (word));
        return word;
}

/**
 * fls64 - find last bit set in 64 bit word
 * @x: the word to search
 *
 * This is defined the same way as fls.
 */
static inline int fls64(uint64_t x)
{
        if (x == 0)
                return 0;
        return __fls(x) + 1;
}

static inline uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
        return dividend / divisor;
}

#elif __i386

#define rdtscll_start(val) \
     __asm__ __volatile__("movl $0, %%eax\n\tcpuid\n\trdtsc\n" : "=A" (val) : : 
"ebx", "ecx")

#define rdtscll(val) \
     __asm__ __volatile__("rdtsc" : "=A" (val))

static inline int fls64(uint64_t x)
{
        uint32_t h = x >> 32;
        if (h)
                return fls(h) + 32;
        return fls(x);
}


#define do_div(n,base) ({ \
        unsigned long __upper, __low, __high, __mod, __base; \
        __base = (base); \
        asm("":"=a" (__low), "=d" (__high):"A" (n)); \
        __upper = __high; \
        if (__high) { \
                __upper = __high % (__base); \
                __high = __high / (__base); \
        } \
        asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), 
"1" (__upper)); \
        asm("":"=A" (n):"a" (__low),"d" (__high)); \
        __mod; \
})


/* 64bit divisor, dividend and result. dynamic precision */
static uint64_t div64_64(uint64_t dividend, uint64_t divisor)
{
        uint32_t d = divisor;

        if (divisor > 0xffffffffULL) {
                unsigned int shift = fls(divisor >> 32);

                d = divisor >> shift;
                dividend >>= shift;
        }

        /* avoid 64 bit division if possible */
        if (dividend >> 32)
                do_div(dividend, d);
        else
                dividend = (uint32_t) dividend / d;

        return dividend;
}
#endif

/* Andi Kleen's version */
uint32_t acbrt(uint64_t x)
{
        uint32_t y = 0;
        int s;

        for (s = 63; s >= 0; s -= 3) {
                uint64_t b, bs;

                y = 2 * y;
                b = 3 * y * (y+1) + 1;
                bs = b << s;
                if (x >= bs && (b == (bs>>s))) {  /* avoid overflow */
                        x -= bs;
                        y++;
                }
        }
        return y;
}

/* My version of hacker's delight */
uint32_t hcbrt(uint64_t x)
{
        int s = 60;
        uint32_t y = 0;

        do {
                uint64_t b;
                y = 2*y;
                b = (uint64_t)(3*y*(y + 1) + 1) << s;
                s = s - 3;
                if (x >= b) {
                        x = x - b;
                        y = y + 1;
                }
        } while(s >= 0);

        return y;
}

/* calculate the cubic root of x using Newton-Raphson */
static uint32_t ocubic(uint64_t a)
{
        uint32_t x, x1;

        /* Initial estimate is based on:
         * cbrt(x) = exp(log(x) / 3)
         */
        x = 1u << (fls64(a)/3);

        /*
         * Iteration based on:
         *                         2
         * x    = ( 2 * x  +  a / x  ) / 3
         *  k+1          k         k
         */
        do {
                uint64_t x2;

                x2 = x;
                x2 *= x;
                x1 = x;

                x = (2 * x + div64_64(a, x2)) / 3;
        } while (abs(x1 - x) > 1);

        return x;
}

/* calculate the cubic root of x using Newton-Raphson */
static uint32_t ncubic(uint64_t a)
{
        uint64_t x;

        /* Initial estimate is based on:
         * cbrt(x) = exp(log(x) / 3)
         */
        x = 1u << (fls64(a)/3);

        /* Converges in 3 iterations to > 32 bits */

        x = (2 * x + div64_64(a, x*x)) / 3;
        x = (2 * x + div64_64(a, x*x)) / 3;
        x = (2 * x + div64_64(a, x*x)) / 3;

        return x;
}

static const struct cbrt {
        uint64_t        in;
        uint32_t        result;
} cases[] = {
        {1, 1}, {2, 1}, {3, 1}, {4, 1}, {5, 1}, {6, 1}, {7, 1}, 
        {8, 2}, {9, 2}, {10, 2}, {11, 2}, {12, 2}, {13, 2}, {14, 2},
        {15, 2}, {16, 2}, {17, 2}, {18, 2}, {19, 2}, {20, 2}, {21, 2}, {22, 2},
        {23, 2}, {24, 2}, {25, 2}, {26, 2}, {27, 3}, {28, 3}, {29, 3}, {30, 3},
        {31, 3}, {32, 3}, {33, 3}, {34, 3}, {35, 3}, {36, 3}, {37, 3}, {38, 3},
        {39, 3}, {40, 3}, {99, 4}, {100, 4}, {101, 4}, 
        { 125ull, 5 }, { 216ull, 6 },  { 343ull, 7 }, { 512ull, 8 }, 
        { 1000ull, 10 },  { 1331ull, 11 },
        { 8000ull, 20 },  { 9261ull, 21 },
        {32767, 31},     {32768, 32}, {32769, 32}, 
        { 64000ull, 40 },  { 68921ull, 41 },
        { 512000ull, 80 },  { 531441ull, 81 },
        { 1000000ull, 100 },  { 1030301ull, 101 },
        { 4096000ull, 160 },  { 4173281ull, 161 },
        { 16387064ull, 254 },  { 16581375ull, 255 },
        { 16777216ull, 256 },  { 16974593ull, 257 },
        { 131096512ull, 508 },  { 131872229ull, 509 },
        { 132651000ull, 510 },  { 133432831ull, 511 },
        { 134217728ull, 512 },  { 135005697ull, 513 },
        { 1000000000ull, 1000 },  { 1003003001ull, 1001 },
        { 1006012008ull, 1002 },  { 1009027027ull, 1003 },
        { 1061208000ull, 1020 },  { 1064332261ull, 1021 },
        { 1067462648ull, 1022 },  { 1070599167ull, 1023 },
        
        {1073741823, 1023}, {1073741824, 1024}, {1073741825, 1024},
        {~0, 2642245},
/* 100 random values */
 { 7749363893351949254ull, 1978891},  { 7222815480849057907ull, 1933016},
 { 8408462745175416063ull, 2033475},  { 3091884191388096748ull, 1456826},
 { 2562019500164152525ull, 1368340},  { 4403210617922443179ull, 1639041},
 { 3364542905362882299ull, 1498449},  { 8782769017716072774ull, 2063211},
 { 5863405773976003266ull, 1803225},  { 1306053050111174648ull, 1093084},
 { 150346236956174824ull, 531737},  { 1265737889039205261ull, 1081719},
 { 1445109530774087002ull, 1130577},  { 1197105577171186275ull, 1061803},
 { 9213452462461015967ull, 2096399},  { 4730966302945445786ull, 1678739},
 { 5650605098630667570ull, 1781141},  { 5880381756353009591ull, 1804963},
 { 4552499520046621784ull, 1657359},  { 2697991130065918298ull, 1392131},
 { 4858364911220984157ull, 1693674},  { 3691457481531040535ull, 1545489},
 { 2613117305472506601ull, 1377377},  { 7449943749836318932ull, 1953069},
 { 643378865959570610ull, 863287},  { 4851450802679832774ull, 1692871},
 { 1772859812839988916ull, 1210295},  { 8210946489571640849ull, 2017426},
 { 591875965497384322ull, 839608},  { 4221553402965100097ull, 1616183},
 { 2197744667347238205ull, 1300146},  { 8321400714356781191ull, 2026432},
 { 2459557415995497961ull, 1349850},  { 3460673533926954145ull, 1512586},
 { 4727304344741345505ull, 1678306},  { 4903203917250634599ull, 1698869},
 { 4036494370831490817ull, 1592214},  { 8585205035691420311ull, 2047624},
 { 2622143824319236828ull, 1378961},  { 5902762718897731478ull, 1807250},
 { 6344401509618197560ull, 1851243},  { 4059247793194552874ull, 1595200},
 { 7648030174294342832ull, 1970228},  { 2111858627070002939ull, 1282985},
 { 3231502273651985583ull, 1478432},  { 8821862535190318932ull, 2066268},
 { 6062559696943389464ull, 1823414},  { 4054224670122353756ull, 1594541},
 { 3674929609692563482ull, 1543179},  { 6310802012126231363ull, 1847969},
 { 4450190829039920890ull, 1644849},  { 8764531173541462842ull, 2061782},
 { 1361923252301505833ull, 1108453},  { 5912924843615600614ull, 1808287},
 { 5714768882048811324ull, 1787857},  { 7249589769047033748ull, 1935401},
 { 4123157012528828376ull, 1603528},  { 1729687638268160097ull, 1200390},
 { 5132287771298228729ull, 1724925},  { 1564349257200314043ull, 1160854},
 { 951586254223522969ull, 983594},  { 4569664949094662293ull, 1659439},
 { 9082730968228181483ull, 2086437},  { 6312891027251024051ull, 1848173},
 { 6915415788559031791ull, 1905194},  { 2713150456497618688ull, 1394733},
 { 5390954890749602465ull, 1753430},  { 1405547745908296421ull, 1120164},
 { 1157301728707637259ull, 1049902},  { 1513573187112042448ull, 1148156},
 { 687416080475161159ull, 882551},  { 484496930861389501ull, 785411},
 { 1625256440396143907ull, 1175729},  { 7358388240824901288ull, 1945035},
 { 6055730836615196283ull, 1822729},  { 5897962221937294789ull, 1806760},
 { 862205218853780339ull, 951780},  { 4798091009445823173ull, 1686641},
 { 644772714391937867ull, 863910},  { 4255852691293155171ull, 1620549},
 { 5287931004512034672ull, 1742188},  { 479051048987854372ull, 782457},
 { 9223312736680112286ull, 2097147},  { 8208392001457969628ull, 2017217},
 { 9203071384420047828ull, 2095612},  { 8029313043584389618ull, 2002439},
 { 38384068872053008ull, 337326},  { 5477688516749455419ull, 1762784},
 { 1504622508868036557ull, 1145888},  { 8421184723110053200ull, 2034500},
 { 3312070181890020423ull, 1490618},  { 5344298403762143580ull, 1748357},
 { 6340030040222269807ull, 1850818},  { 4895839553118470425ull, 1698018},
 { 2806627376195262363ull, 1410570},  { 5321619225005368821ull, 1745880},
 { 6897323351052656353ull, 1903532},  { 326700202259382556ull, 688731},
 { 7685269066741890339ull, 1973420},  { 8054506481558450217ull, 2004531},
};
#define NCASES (sizeof(cases)/sizeof(cases[0]))


static double ticks_per_usec = 2979.0;

static void show(const char *func, uint64_t sum, uint64_t sq, 
                 unsigned long long mx, unsigned long long err)
{
        double mean, std;

        mean = (double) sum / ticks_per_usec / NCASES ;
        std = sqrtl( (double) sq / ticks_per_usec / NCASES - mean * mean);

        printf("%-10s %8llu %8.3f %8.3f %8.3f %11llu\n", func, 
               (unsigned long long) sum / NCASES, mean, std, 
               (double) mx / ticks_per_usec, err);
}


int main(int argc, char **argv)
{
        int i;
        uint32_t c;
        unsigned long long start, end, t, mx;
        unsigned long long err, sum, sum_sq, cerr;
        int loops;

#if 0
        rdtscll(start);
        sleep(2);
        rdtscll(end);
        ticks_per_usec = (double) (end - start) / 2000000.;
#endif

        printf("Function     clocks mean(us)  max(us)  std(us) total error\n");

#define DOTEST(func) \
        if (func(27) != 3) printf("ouch\n"); \
        sum = sum_sq = mx = 0; \
        for (err = 0, i = 0; i < NCASES; i++) { \
                do { \
                        rdtscll_start(start); \
                        rdtscll_start(end); \
                } while ((end - start) < 1000); \
                c = func(cases[i].in); \
                loops = 667; \
                rdtscll_start(start);           \
                while (--loops > 0) c = func(cases[i].in); \
                rdtscll_start(end); \
                t = (end - start) / 666; sum += t; sum_sq += t*t; \
                if (t > mx) mx = t; \
                cerr = abs((long) cases[i].result - c); \
                err += cerr; \
        } \
        show(#func, sum, sum_sq, mx, err);

        //DOTEST(ocubic);
        DOTEST(ncubic);
        //DOTEST(acbrt);
        //DOTEST(hcbrt);
        return 0;
}

Re: [RFC] div64_64 support

Reply via email to