On 3/13/06, Andrew Pinski <[EMAIL PROTECTED]> wrote:
> Actually the best way of improving the inline heuristics is to get
> a real testcase (and not some benchmark) where the inline heuristics
> is messed up.
Ah, you mean a brand new testcase because PR-21195 wasn't good enough?
$ /usr/local/gcc-4.1.0/bin/g++ -v
Using built-in specs.
Target: i686-pc-cygwin
Configured with: ../configure --prefix=/usr/local/gcc-4.1.0
--enable-languages=c,c++ --enable-threads=posix --with-system-zlib
--disable-checking --disable-nls --disable-shared
--disable-win32-registry --verbose --enable-bootstrap --with-gcc
--with-gnu-ld --with-gnu-as --with-cpu=k8
Thread model: posix
gcc version 4.1.0
$ /usr/local/gcc-4.1.0/bin/g++ -g -O3 -march=k8 -msse2 -o pr-inline.o pr-inline.cc
#include <xmmintrin.h>
// File-local wrapper that forwards both operands to the SSE intrinsic
// _mm_max_ps and returns its result unchanged.
static __m128 mm_max_ps(const __m128 a, const __m128 b) { return
_mm_max_ps(a,b); }
// File-local wrapper that forwards both operands to the SSE intrinsic
// _mm_min_ps and returns its result unchanged.
static __m128 mm_min_ps(const __m128 a, const __m128 b) { return
_mm_min_ps(a,b); }
// File-local wrapper that forwards both operands to the SSE intrinsic
// _mm_mul_ps and returns its result unchanged.
static __m128 mm_mul_ps(const __m128 a, const __m128 b) { return
_mm_mul_ps(a,b); }
// File-local wrapper that forwards both operands to the SSE intrinsic
// _mm_div_ps (a is the dividend, b the divisor) and returns its result
// unchanged.
static __m128 mm_div_ps(const __m128 a, const __m128 b) { return
_mm_div_ps(a,b); }
// File-local wrapper that forwards both operands to the SSE intrinsic
// _mm_or_ps and returns its result unchanged.
static __m128 mm_or_ps(const __m128 a, const __m128 b) { return
_mm_or_ps(a,b); }
// File-local wrapper that forwards its operand to the SSE intrinsic
// _mm_movemask_ps and returns the resulting int mask unchanged.
static int mm_movemask_ps(const __m128 a) { return _mm_movemask_ps(a); }
// Helper marked always_inline that chains many calls to the file-local
// mm_* wrappers on a and b (presumably to inflate the size of its caller,
// hence the name).
// Returns the mm_movemask_ps() mask of the final combined vector converted
// to bool, i.e. true when that mask is nonzero.
static __attribute__ ((always_inline)) bool bloatit(const __m128 a, const
__m128
b) {
const __m128
v0 = mm_max_ps(a,b),
v1 = mm_min_ps(a,b),
v2 = mm_mul_ps(a,b),
v3 = mm_div_ps(a,b),
// The two inner ORs call the _mm_or_ps intrinsic directly; only the
// outermost OR goes through the mm_or_ps wrapper.
g0 = mm_or_ps(_mm_or_ps(_mm_or_ps(v0,v1), v2), v3),
v4 = mm_min_ps(mm_or_ps(a,b),mm_div_ps(b,a)),
v5 = mm_max_ps(mm_min_ps(a,mm_div_ps(b,a)),
mm_or_ps(b, mm_div_ps(b,g0))),
// OR together everything computed above.
g1 = mm_or_ps(g0,mm_or_ps(v4,v5));
// The int mask is implicitly converted to the bool return type.
return mm_movemask_ps(g1);
}
// Externally visible driver: evaluates bloatit() on 14 ordered pairs drawn
// from the six vector arguments and combines the results.
// The results are joined with bitwise & rather than &&, so the expression
// contains no short-circuit branches; the return value is true only when
// every bloatit() call returned true.
bool finalblow(const __m128 a, const __m128 b, const __m128 c, const __m128 d,
const __m128 e, const __m128 f) {
return
// Seven pairs in "forward" order ...
bloatit(a,b) & bloatit(c,d) & bloatit(e,f) & bloatit(a,c) &
bloatit(b,d) & bloatit(c,e) & bloatit(d,f) &
// ... followed by the same seven pairs with the operands swapped.
bloatit(b,a) & bloatit(d,c) & bloatit(f,e) & bloatit(c,a) &
bloatit(d,b) & bloatit(e,c) & bloatit(f,d);
}
// Empty entry point: returns 0 immediately and never calls finalblow().
// Presumably present only so the testcase links into an executable.
int main() { return 0; }
00401080 <mm_mul_ps(float __vector, float __vector)>:
401080: push %ebp
401081: mulps %xmm1,%xmm0
401084: mov %esp,%ebp
401086: sub $0x8,%esp
401089: leave
40108a: ret
40108b: nop
40108c: lea 0x0(%esi),%esi
00401090 <mm_or_ps(float __vector, float __vector)>:
401090: push %ebp
401091: orps %xmm1,%xmm0
401094: mov %esp,%ebp
401096: sub $0x8,%esp
401099: leave
40109a: ret
40109b: nop
40109c: lea 0x0(%esi),%esi
004010a0 <mm_div_ps(float __vector, float __vector)>:
4010a0: divps %xmm1,%xmm0
4010a3: push %ebp
4010a4: mov %esp,%ebp
4010a6: sub $0x8,%esp
4010a9: leave
4010aa: ret
4010ab: nop
...
004010e0 <finalblow(float __vector, float __vector, float __vector,
float __vector, float __vector, float __vector)>:
...
401101: call 4010c0 <mm_max_ps(float __vector, float __vector)>
401106: movaps %xmm0,0xfffff958(%ebp)
40110d: movaps 0xfffff8f8(%ebp),%xmm1
401114: movaps 0xfffff908(%ebp),%xmm0
40111b: call 4010b0 <mm_min_ps(float __vector, float __vector)>
401120: movaps 0xfffff8f8(%ebp),%xmm1
401127: movaps %xmm0,0xfffff948(%ebp)
40112e: movaps 0xfffff908(%ebp),%xmm0
401135: call 401080 <mm_mul_ps(float __vector, float __vector)>
40113a: movaps 0xfffff8f8(%ebp),%xmm1
401141: movaps %xmm0,0xfffff938(%ebp)
401148: movaps 0xfffff908(%ebp),%xmm0
40114f: call 4010a0 <mm_div_ps(float __vector, float __vector)>
401154: movaps 0xfffff958(%ebp),%xmm1
40115b: orps 0xfffff948(%ebp),%xmm1
401162: movaps %xmm1,0xfffff958(%ebp)
401169: movaps %xmm0,%xmm1
40116c: movaps 0xfffff958(%ebp),%xmm0
401173: orps 0xfffff938(%ebp),%xmm0
40117a: call 401090 <mm_or_ps(float __vector, float __vector)>
40117f: movaps 0xfffff908(%ebp),%xmm1
401186: movaps %xmm0,0xfffff928(%ebp)
40118d: movaps 0xfffff8f8(%ebp),%xmm0
401194: call 4010a0 <mm_div_ps(float __vector, float __vector)>
401199: movaps 0xfffff8f8(%ebp),%xmm1