Hi, I am using MMX built-ins with gcc-4.0-20050825 and I am seeing unneeded movq instructions being generated (at least I guess they are unneeded; I am no assembler pro). I don't know which gcc snapshot introduced this, but I know that some pre-release gcc 4.0 didn't show this bad behaviour. (It's been some time since I played with this...)
BTW, this is using gcc built-ins. The situation is much worse when using
intrinsics via mmintrin.h. (Again, the old pre-4.0 gcc didn't have the problem;
using gcc built-ins or mmintrin.h intrinsics made no difference; both generated
nice code.)
Here is the source mixed with the assembly, plus my comments (compiled with -O2 -g3
-march=athlon-xp):
Just shout if you need anything else.
typedef int v2si __attribute__ ((vector_size (8)));
typedef int di __attribute__ ((vector_size (8)));
typedef short v4hi __attribute__ ((vector_size (8)));
00000320 <MixAudio16_MMX_T>:
void MixAudio16_MMX_T(char* src1, char* src2, char* dst)
{
320: 55 push %ebp
321: 89 e5 mov %esp,%ebp
323: 83 ec 10 sub $0x10,%esp
v4hi indata;
v4hi signmask;
v2si loout;
v2si hiout;
v2si temp;
__attribute__((aligned(16))) static const short sm[4] =
{0x8000,0x8000,0x8000,0x8000};
static const v4hi *m = (v4hi*)sm;
indata = *(v4hi*)src1;
signmask = __builtin_ia32_pand(indata, *m);
326: 8b 15 04 00 00 00 mov 0x4,%edx
32c: 8b 45 08 mov 0x8(%ebp),%eax
32f: 0f 6f 10 movq (%eax),%mm2
signmask = __builtin_ia32_pcmpeqw(signmask, *m);
loout = __builtin_ia32_punpcklwd(indata, signmask);
332: 0f 6f ca movq %mm2,%mm1
hiout = __builtin_ia32_punpckhwd(indata, signmask);
indata = *(v4hi*)src2;
335: 8b 45 0c mov 0xc(%ebp),%eax
338: 0f 7f 55 f8 movq %mm2,0xfffffff8(%ebp)
33c: 0f 6f 45 f8 movq 0xfffffff8(%ebp),%mm0
Why not movq %mm2, %mm0 ?
340: 0f db 02 pand (%edx),%mm0
343: 0f 7f 45 f0 movq %mm0,0xfffffff0(%ebp)
347: 0f 6f 45 f0 movq 0xfffffff0(%ebp),%mm0
what the heck?
34b: 0f 75 02 pcmpeqw (%edx),%mm0
34e: 0f 61 c8 punpcklwd %mm0,%mm1
351: 0f 69 d0 punpckhwd %mm0,%mm2
354: 0f 7f 4d f8 movq %mm1,0xfffffff8(%ebp)
358: 0f 6f 5d f8 movq 0xfffffff8(%ebp),%mm3
As above, this happens throughout, as you can see:
35c: 0f 7f 55 f8 movq %mm2,0xfffffff8(%ebp)
360: 0f 6f 10 movq (%eax),%mm2
363: 0f 6f 65 f8 movq 0xfffffff8(%ebp),%mm4
signmask = __builtin_ia32_pand(indata, *m);
367: 0f 7f 55 f8 movq %mm2,0xfffffff8(%ebp)
36b: 0f 6f 45 f8 movq 0xfffffff8(%ebp),%mm0
signmask = __builtin_ia32_pcmpeqw(signmask, *m);
temp = __builtin_ia32_punpcklwd(indata, signmask);
36f: 0f 6f ca movq %mm2,%mm1
372: 0f db 02 pand (%edx),%mm0
loout = __builtin_ia32_paddd(loout, temp); \
temp = __builtin_ia32_punpckhwd(indata, signmask);
hiout = __builtin_ia32_paddd(hiout, temp);
*(v4hi*)dst = __builtin_ia32_packssdw(loout, hiout);
375: 8b 45 10 mov 0x10(%ebp),%eax
378: 0f 7f 45 f0 movq %mm0,0xfffffff0(%ebp)
37c: 0f 6f 45 f0 movq 0xfffffff0(%ebp),%mm0
380: 0f 75 02 pcmpeqw (%edx),%mm0
383: 0f 61 c8 punpcklwd %mm0,%mm1
386: 0f 69 d0 punpckhwd %mm0,%mm2
389: 0f 7f 4d f8 movq %mm1,0xfffffff8(%ebp)
38d: 0f fe 5d f8 paddd 0xfffffff8(%ebp),%mm3
Why not use the MMX register directly?
391: 0f 7f 55 f8 movq %mm2,0xfffffff8(%ebp)
395: 0f fe 65 f8 paddd 0xfffffff8(%ebp),%mm4
ditto
399: 0f 6b dc packssdw %mm4,%mm3
39c: 0f 7f 18 movq %mm3,(%eax)
__builtin_ia32_emms();
39f: 0f 77 emms
return;
}
3a1: c9 leave
3a2: c3 ret
--
(°= =°)
//\ Prakash Punnoor /\\
V_/ \_V
signature.asc
Description: OpenPGP digital signature
