Eric Dumazet a écrit :
On Tuesday 06 March 2007 18:28, Eric Dumazet wrote:
On Tuesday 06 March 2007 18:19, Linus Torvalds wrote:

Using reciprocal divides permits to change each divide by two
multiplies, less expensive on current CPUS.
Are you sure?
I am going to test this, but at least on Opterons, the reciprocal divide I
added into mm/slab.c gave me a nice speedup.



Linus,

I did a user space program, attached to this mail.

I rewrote the reciprocal_div() for i386 so that one multiply is used.

static inline u32 reciprocal_divide(u32 A, u32 R)
{
#if __i386
        unsigned int edx, eax;
        asm("mul %2":"=a" (eax), "=d" (edx):"rm" (R), "0" (A));
        return edx;
#else
        return (u32)(((u64)A * R) >> 32);
#endif
}


Results are really good on 32bit. On 64bit/Opteron they are impressive.

$ gcc -O2 -o divide_bench divide_bench.c

First result gives the number of cycles to perform old number() routine using plain do_div()
Second result gives the number of cycles using reciprocal_div trick


results on a Intel Pentium III 866 MHz

$ ./divide_bench
413.453 cycles per call, last res=99999901
132.746 cycles per call, last res=99999901
$ ./divide_bench
411.833 cycles per call, last res=99999901
129.652 cycles per call, last res=99999901
$ ./divide_bench
480.645 cycles per call, last res=99999901
158.642 cycles per call, last res=99999901
$ ./divide_bench
412.769 cycles per call, last res=99999901
129.643 cycles per call, last res=99999901
$ ./divide_bench
410.809 cycles per call, last res=99999901
129.609 cycles per call, last res=99999901


results on AMD 246 (2GHz)
Sorry this machine is quite loaded... I dont have a dev x86_64 machine.

$ gcc -O2 -m32 -o divide_bench32 divide_bench.c
$ ./divide_bench32
412.181 cycles per call, last res=99999901
112.314 cycles per call, last res=99999901
$ ./divide_bench32
444.008 cycles per call, last res=99999901
114.314 cycles per call, last res=99999901
$ ./divide_bench32
423.168 cycles per call, last res=99999901
112.318 cycles per call, last res=99999901
$ ./divide_bench32
427.73 cycles per call, last res=99999901
110.712 cycles per call, last res=99999901
$ ./divide_bench32
410.529 cycles per call, last res=99999901
114.068 cycles per call, last res=99999901
$ ./divide_bench32
489.856 cycles per call, last res=99999901
124.889 cycles per call, last res=99999901
$ ./divide_bench32
389.278 cycles per call, last res=99999901
104.697 cycles per call, last res=99999901

With a 64bit prog :
$ gcc -O2 -m64 -o divide_bench64 divide_bench.c
$ ./divide_bench64
826.136 cycles per call, last res=99999901
105.912 cycles per call, last res=99999901
$ ./divide_bench64
627.096 cycles per call, last res=99999901
76.2473 cycles per call, last res=99999901
$ ./divide_bench64
604.524 cycles per call, last res=99999901
76.1405 cycles per call, last res=99999901
$ ./divide_bench64
621.013 cycles per call, last res=99999901
76.0963 cycles per call, last res=99999901
$ ./divide_bench64
836.799 cycles per call, last res=99999901
103.967 cycles per call, last res=99999901
$ ./divide_bench64
982.718 cycles per call, last res=99999901
127.945 cycles per call, last res=99999901
$ ./divide_bench64
609.346 cycles per call, last res=99999901
76.0768 cycles per call, last res=99999901


#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>

#ifdef __x86_64

#define rdtscll(val) do { \
     unsigned int __a,__d; \
     asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
     (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
} while(0)

# define do_div(n,base) ({                                      \
        uint32_t __base = (base);                               \
        uint32_t __rem;                                         \
        __rem = ((uint64_t)(n)) % __base;                       \
        (n) = ((uint64_t)(n)) / __base;                         \
        __rem;                                                  \
 })
#elif  __i386

#define rdtscll(val) \
     __asm__ __volatile__("rdtsc" : "=A" (val))

#define do_div(n,base) ({ \
        unsigned long __upper, __low, __high, __mod, __base; \
        __base = (base); \
        asm("":"=a" (__low), "=d" (__high):"A" (n)); \
        __upper = __high; \
        if (__high) { \
                __upper = __high % (__base); \
                __high = __high / (__base); \
        } \
        asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), 
"1" (__upper)); \
        asm("":"=A" (n):"a" (__low),"d" (__high)); \
        __mod; \
})

#endif

static const char digits[] = "0123456789abcdefghijklmnopqrstuvwxyz";

char *number_divides(unsigned long long num, int base, char *result)
{
if (num == 0)
        *result++ = '0';
else while (num) 
        *result++ = digits[do_div(num,base)];
*result = 0;
return result;
}
typedef unsigned int u32;
typedef unsigned long long u64;

#define CONSTANT_RECIPROCAL_VALUE(k) \
        (u32)(((1LL << 32) + (k - 1)) / k)

const u32 reciprocal_values[36 + 1] = {
        0,
        CONSTANT_RECIPROCAL_VALUE(1),   CONSTANT_RECIPROCAL_VALUE(2),
        CONSTANT_RECIPROCAL_VALUE(3),   CONSTANT_RECIPROCAL_VALUE(4),
        CONSTANT_RECIPROCAL_VALUE(5),   CONSTANT_RECIPROCAL_VALUE(6),
        CONSTANT_RECIPROCAL_VALUE(7),   CONSTANT_RECIPROCAL_VALUE(8),
        CONSTANT_RECIPROCAL_VALUE(9),   CONSTANT_RECIPROCAL_VALUE(10),
        CONSTANT_RECIPROCAL_VALUE(11),  CONSTANT_RECIPROCAL_VALUE(12),
        CONSTANT_RECIPROCAL_VALUE(13),  CONSTANT_RECIPROCAL_VALUE(14),
        CONSTANT_RECIPROCAL_VALUE(15),  CONSTANT_RECIPROCAL_VALUE(16),
        CONSTANT_RECIPROCAL_VALUE(17),  CONSTANT_RECIPROCAL_VALUE(18),
        CONSTANT_RECIPROCAL_VALUE(19),  CONSTANT_RECIPROCAL_VALUE(20),
        CONSTANT_RECIPROCAL_VALUE(21),  CONSTANT_RECIPROCAL_VALUE(22),
        CONSTANT_RECIPROCAL_VALUE(23),  CONSTANT_RECIPROCAL_VALUE(24),
        CONSTANT_RECIPROCAL_VALUE(25),  CONSTANT_RECIPROCAL_VALUE(26),
        CONSTANT_RECIPROCAL_VALUE(27),  CONSTANT_RECIPROCAL_VALUE(28),
        CONSTANT_RECIPROCAL_VALUE(29),  CONSTANT_RECIPROCAL_VALUE(30),
        CONSTANT_RECIPROCAL_VALUE(31),  CONSTANT_RECIPROCAL_VALUE(32),
        CONSTANT_RECIPROCAL_VALUE(33),  CONSTANT_RECIPROCAL_VALUE(34),
        CONSTANT_RECIPROCAL_VALUE(35),  CONSTANT_RECIPROCAL_VALUE(36)
};

static inline u32 reciprocal_divide(u32 A, u32 R)
{
#if __i386
        unsigned int edx, eax;
        asm("mul %2":"=a" (eax), "=d" (edx):"rm" (R), "0" (A));
        return edx;
#else
        return (u32)(((u64)A * R) >> 32);
#endif
}

char *number_reciprocal(unsigned long long num, int base, char *result)
{
if (num == 0)
        *result++ = '0';
else {
        while (num >>32)
                *result++ = digits[do_div(num,base)];
        while (num) {
                u32 next = reciprocal_divide((u32)num,
                                reciprocal_values[base]);
                        *result++ = digits[num - next*base];
                        num = next;

                }
        }
*result = 0;
return result;
}

#define START 10000000
int base = 10;
main()
{
unsigned long long start, end;
char result[64];
unsigned long ul;
        rdtscll(start);
        for (ul = START; ul < START + 1000000;ul++)
                number_divides(ul, base, result);
        rdtscll(end);
printf("%g cycles per call, last res=%s\n", (double)(end - start)/1000000.0, 
result);
        rdtscll(start);
        for (ul = START; ul < START + 1000000;ul++)
                number_reciprocal(ul, base, result);
        rdtscll(end);
printf("%g cycles per call, last res=%s\n", (double)(end - start)/1000000.0, 
result);
}

Reply via email to