On Mon, Feb 02, 2015 at 11:13:44AM -0800, Linus Torvalds wrote:
> On Mon, Feb 2, 2015 at 11:00 AM, Linus Torvalds
> <torva...@linux-foundation.org> wrote:
> >
> > (I'm also not entirely sure what uses int_sqrt() that ends up being so
> > performance-critical, so it would be good to document that too, since
> > that probably also matters for the "what's the normal argument range"
> > question..)
> 
> ... it's also not entirely clear that we need a whole new loop. We
> might just instead start off with a better guess for 'm' using some
> calculation that might be doable with a single conditional move
> instruction instead of a loop. Because I suspect that the inevitable
> branch misprediction of a new loop is likely as expensive as a few
> iterations through the core one.
> 
> IOW, instead of
> 
>   m = 1UL << (BITS_PER_LONG - 2);
> 
> perhaps something like
> 
>    m = 1UL << (BITS_PER_LONG/2- 2);
>    if (m < x)
>        m <<= BITS_PER_LONG/2;
> 
> (assuming gcc can change that code into a "cmov") might cut down the
> "lots of empty loops" case in half for small values of 'x'.
> 
> There's probably some other better cheap initial guess value estimator.

So I had a wee look at int_sqrt(), and yes Davidlohr's patch makes it
worse for my machine (Xeon E3v5 aka. fancy Skylake).

As in _MUCH_ worse..

~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1  ; perf stat 
--repeat 10 -e cycles:u -e instructions:u ./sqrt

 Performance counter stats for './sqrt' (10 runs):

       828,233,295      cycles:u                                                
      ( +-  0.04% )
     2,480,095,771      instructions:u            #    2.99  insn per cycle     
      ( +-  0.00% )

       0.220202758 seconds time elapsed                                         
 ( +-  0.18% )

~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000   ; perf stat --repeat 10 -e 
cycles:u -e instructions:u ./sqrt

 Performance counter stats for './sqrt' (10 runs):

       603,809,551      cycles:u                                                
      ( +-  1.30% )
     1,677,726,591      instructions:u            #    2.78  insn per cycle     
      ( +-  0.00% )

       0.160801044 seconds time elapsed  

That is, we went from ~60 cycles to ~82 cycles per invocation.


Now, the patches proposed in this thread do indeed improve matters but
seem to not have merged up until this day:


~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1 -DANSHUL=1  ; perf 
stat --repeat 10 -e cycles:u -e instructions:u ./sqrt

 Performance counter stats for './sqrt' (10 runs):

       598,827,196      cycles:u                                                
      ( +-  0.18% )
     1,697,726,381      instructions:u            #    2.84  insn per cycle     
      ( +-  0.00% )

       0.159880738 seconds time elapsed                                         
 ( +-  0.36% )

~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1 -DLINUS=1  ; perf 
stat --repeat 10 -e cycles:u -e instructions:u ./sqrt

 Performance counter stats for './sqrt' (10 runs):

       601,463,184      cycles:u                                                
      ( +-  0.07% )
     1,380,095,976      instructions:u            #    2.29  insn per cycle     
      ( +-  0.00% )

       0.160788095 seconds time elapsed                                         
 ( +-  0.55% )

Which basically bring us back to the old performance. So I would
strongly suggest we merge one.


Now, the thing is, Intel CPUs are actually fairly good at fls(), so I
gave Joe's suggestions a spin too..

~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1 -DFLS=1  ; perf stat 
--repeat 10 -e cycles:u -e instructions:u ./sqrt

 Performance counter stats for './sqrt' (10 runs):

       390,420,004      cycles:u                                                
      ( +-  0.07% )
     1,141,802,493      instructions:u            #    2.92  insn per cycle     
      ( +-  0.00% )

       0.105480000 seconds time elapsed                                         
 ( +-  0.64% )

~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1 -DFLS=1 -DANSHUL=1  
; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt

 Performance counter stats for './sqrt' (10 runs):

       328,415,775      cycles:u                                                
      ( +-  0.15% )
     1,138,579,704      instructions:u            #    3.47  insn per cycle     
      ( +-  0.00% )

       0.088703205 seconds time elapsed  

Which gets us a real improvement...

Now even with the software fls() fallback from
include/asm-generic/bitops/fls.h there is a significant improvement:


~/tmp$ gcc -o sqrt sqrt.c -lm -O2 -DLOOPS=10000000 -DNEW=1 -DFLS=1 -DANSHUL=1 
-DSOFTFLS=1  ; perf stat --repeat 10 -e cycles:u -e instructions:u ./sqrt

 Performance counter stats for './sqrt' (10 runs):

       375,633,931      cycles:u                                                
      ( +-  0.13% )
     1,289,700,013      instructions:u            #    3.43  insn per cycle     
      ( +-  0.00% )

       0.100596211 seconds time elapsed 


Also, I ran with -DVALIDATE=1 and while the comment says its a 'rough'
estimate of the sqrt() I did not in fact find any value for which we
deviate < INT_MAX.




----------------------[ sqrt.c ]---------------------------

#include <math.h>
#include <stdio.h>

#define BITS_PER_LONG   (sizeof(long) * 8)

#ifndef LOOPS
#define LOOPS 1000000
#endif

#ifdef SOFTFLS

static __always_inline unsigned long fls(unsigned long x)
{
        int r = 32;

        if (!x)
                return 0;

        if (!(x & 0xffff0000u)) {
                x <<= 16;
                r -= 16;
        }
        if (!(x & 0xff000000u)) {
                x <<= 8;
                r -= 8;
        }
        if (!(x & 0xf0000000u)) {
                x <<= 4;
                r -= 4;
        }
        if (!(x & 0xc0000000u)) {
                x <<= 2;
                r -= 2;
        }
        if (!(x & 0x80000000u)) {
                x <<= 1;
                r -= 1;
        }
        return r;
}

#else

static __always_inline unsigned long fls(unsigned long word)
{
        asm("rep; bsr %1,%0"
                : "=r" (word)
                : "rm" (word));
        return BITS_PER_LONG - 1 - word;
}

#endif


#ifdef NEW

unsigned long __attribute__((noinline)) int_sqrt(unsigned long x)
{
        unsigned long b, m, y = 0;

        if (x <= 1)
                return x;

#ifdef LINUS
        m = 1UL << (BITS_PER_LONG/2 - 2);
        if (m < x)
                m <<= BITS_PER_LONG/2;

#else

#ifdef FLS
        m = 1UL << ((fls(x) + 1) & ~1UL);
#else
        m = 1UL << (BITS_PER_LONG - 2);
#endif

#ifdef ANSHUL
        while (m > x)
                m >>= 2;
#endif
#endif

        while (m != 0) {
                b = y + m;
                y >>= 1;

                if (x >= b) {
                        x -= b;
                        y += m;
                }
                m >>= 2;
        }

        return y;
}

#else

unsigned long __attribute__((noinline)) int_sqrt(unsigned long x)
{
        unsigned long op, res, one;

        op = x;
        res = 0;

        one = 1UL << (BITS_PER_LONG - 2);
        while (one > op)
                one >>= 2;

        while (one != 0) {
                if (op >= res + one) {
                        op = op - (res + one);
                        res = res +  2 * one;
                }
                res /= 2;
                one /= 4;
        }
        return res;
}

#endif

void main(void)
{
        unsigned long i;

        for (i=0; i<LOOPS; i++) {
                unsigned long a = int_sqrt(i);
                asm volatile("" : : "r" (a));

#ifdef VALIDATE
                {
                        unsigned long b = floor(sqrt(i));
                        if (a != b)
                                printf("%ld %ld %ld\n", i, a, b);
                }
#endif

        }
}

--------------

The only caveat seems to be that our fls() is only defined for 'int' not
long :/


---
 lib/int_sqrt.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/lib/int_sqrt.c b/lib/int_sqrt.c
index 1ef4cc344977..87c3aa360441 100644
--- a/lib/int_sqrt.c
+++ b/lib/int_sqrt.c
@@ -7,12 +7,11 @@
 
 #include <linux/kernel.h>
 #include <linux/export.h>
+#include <linux/bitops.h>
 
 /**
- * int_sqrt - rough approximation to sqrt
+ * int_sqrt - approximation to sqrt
  * @x: integer of which to calculate the sqrt
- *
- * A very rough approximation to the sqrt() function.
  */
 unsigned long int_sqrt(unsigned long x)
 {
@@ -21,7 +20,11 @@ unsigned long int_sqrt(unsigned long x)
        if (x <= 1)
                return x;
 
-       m = 1UL << (BITS_PER_LONG - 2);
+       m = 1UL << ((fls(x) + 1) & ~1UL);
+
+       while (m > x)
+               m >>= 2;
+
        while (m != 0) {
                b = y + m;
                y >>= 1;

Reply via email to