Hi, I am using MMX built-ins with gcc-4.0-20050825 and I am seeing unneeded movq instructions being generated (at least I guess so; I am no assembler pro). I don't know which gcc snapshot introduced this, but I know that some pre-release gcc 4.0 didn't show this bad behaviour. (It's been some time since I played with this...)
BTW, this is using gcc built-ins. The situation is much worse when using intrinsics via mmintrin.h. (Again, the old pre-4.0 gcc didn't have the problem; using gcc built-ins or mmintrin.h intrinsics made no difference; both generated nice code.) Here is the source mixed with the assembly and my comments (compiled with -O2 -g3 -march=athlon-xp): Just shout if you need anything else. typedef int v2si __attribute__ ((vector_size (8))); typedef int di __attribute__ ((vector_size (8))); typedef short v4hi __attribute__ ((vector_size (8))); 00000320 <MixAudio16_MMX_T>: void MixAudio16_MMX_T(char* src1, char* src2, char* dst) { 320: 55 push %ebp 321: 89 e5 mov %esp,%ebp 323: 83 ec 10 sub $0x10,%esp v4hi indata; v4hi signmask; v2si loout; v2si hiout; v2si temp; __attribute__((aligned(16))) static const short sm[4] = {0x8000,0x8000,0x8000,0x8000}; static const v4hi *m = (v4hi*)sm; indata = *(v4hi*)src1; signmask = __builtin_ia32_pand(indata, *m); 326: 8b 15 04 00 00 00 mov 0x4,%edx 32c: 8b 45 08 mov 0x8(%ebp),%eax 32f: 0f 6f 10 movq (%eax),%mm2 signmask = __builtin_ia32_pcmpeqw(signmask, *m); loout = __builtin_ia32_punpcklwd(indata, signmask); 332: 0f 6f ca movq %mm2,%mm1 hiout = __builtin_ia32_punpckhwd(indata, signmask); indata = *(v4hi*)src2; 335: 8b 45 0c mov 0xc(%ebp),%eax 338: 0f 7f 55 f8 movq %mm2,0xfffffff8(%ebp) 33c: 0f 6f 45 f8 movq 0xfffffff8(%ebp),%mm0 Why not movq %mm2,%mm0? 340: 0f db 02 pand (%edx),%mm0 343: 0f 7f 45 f0 movq %mm0,0xfffffff0(%ebp) 347: 0f 6f 45 f0 movq 0xfffffff0(%ebp),%mm0 What the heck? 
34b: 0f 75 02 pcmpeqw (%edx),%mm0 34e: 0f 61 c8 punpcklwd %mm0,%mm1 351: 0f 69 d0 punpckhwd %mm0,%mm2 354: 0f 7f 4d f8 movq %mm1,0xfffffff8(%ebp) 358: 0f 6f 5d f8 movq 0xfffffff8(%ebp),%mm3 As above, this happens throughout, as you can see: 35c: 0f 7f 55 f8 movq %mm2,0xfffffff8(%ebp) 360: 0f 6f 10 movq (%eax),%mm2 363: 0f 6f 65 f8 movq 0xfffffff8(%ebp),%mm4 signmask = __builtin_ia32_pand(indata, *m); 367: 0f 7f 55 f8 movq %mm2,0xfffffff8(%ebp) 36b: 0f 6f 45 f8 movq 0xfffffff8(%ebp),%mm0 signmask = __builtin_ia32_pcmpeqw(signmask, *m); temp = __builtin_ia32_punpcklwd(indata, signmask); 36f: 0f 6f ca movq %mm2,%mm1 372: 0f db 02 pand (%edx),%mm0 loout = __builtin_ia32_paddd(loout, temp); \ temp = __builtin_ia32_punpckhwd(indata, signmask); hiout = __builtin_ia32_paddd(hiout, temp); *(v4hi*)dst = __builtin_ia32_packssdw(loout, hiout); 375: 8b 45 10 mov 0x10(%ebp),%eax 378: 0f 7f 45 f0 movq %mm0,0xfffffff0(%ebp) 37c: 0f 6f 45 f0 movq 0xfffffff0(%ebp),%mm0 380: 0f 75 02 pcmpeqw (%edx),%mm0 383: 0f 61 c8 punpcklwd %mm0,%mm1 386: 0f 69 d0 punpckhwd %mm0,%mm2 389: 0f 7f 4d f8 movq %mm1,0xfffffff8(%ebp) 38d: 0f fe 5d f8 paddd 0xfffffff8(%ebp),%mm3 Why not use the MMX register directly? 391: 0f 7f 55 f8 movq %mm2,0xfffffff8(%ebp) 395: 0f fe 65 f8 paddd 0xfffffff8(%ebp),%mm4 Ditto. 399: 0f 6b dc packssdw %mm4,%mm3 39c: 0f 7f 18 movq %mm3,(%eax) __builtin_ia32_emms(); 39f: 0f 77 emms return; } 3a1: c9 leave 3a2: c3 ret -- (°= =°) //\ Prakash Punnoor /\\ V_/ \_V
signature.asc
Description: OpenPGP digital signature