Hi,

I am using MMX built-ins with gcc-4.0-20050825 and I am seeing unneeded movq
instructions being generated (at least I guess so; I am no assembler pro). I
don't know which gcc snapshot introduced this, but I know that some pre-release
gcc 4.0 didn't show this bad behaviour. (It's been some time since I played
with this...)

BTW, this is using gcc built-ins. The situation is much worse when using
intrinsics via mmintrin.h. (Again, the old pre-4.0 gcc didn't have the problem;
using gcc built-ins or mmintrin.h intrinsics made no difference; both generated
nice code.)

Here is the source mixed with the generated assembly, plus my comments
(compiled with -O2 -g3 -march=athlon-xp):

Just shout if you need anything else.

typedef int v2si __attribute__ ((vector_size (8)));
typedef int di __attribute__ ((vector_size (8)));
typedef short v4hi __attribute__ ((vector_size (8)));

00000320 <MixAudio16_MMX_T>:

void MixAudio16_MMX_T(char* src1, char* src2, char* dst)
{
 320:   55                      push   %ebp
 321:   89 e5                   mov    %esp,%ebp
 323:   83 ec 10                sub    $0x10,%esp
        
        v4hi indata;
        v4hi signmask;
                
        v2si loout;
        v2si hiout;
        
        v2si temp;

        __attribute__((aligned(16))) static const short sm[4] =
{0x8000,0x8000,0x8000,0x8000};
        static const v4hi *m = (v4hi*)sm;

        indata   = *(v4hi*)src1;
        signmask = __builtin_ia32_pand(indata, *m);
 326:   8b 15 04 00 00 00       mov    0x4,%edx
 32c:   8b 45 08                mov    0x8(%ebp),%eax
 32f:   0f 6f 10                movq   (%eax),%mm2
        signmask = __builtin_ia32_pcmpeqw(signmask, *m);
        loout = __builtin_ia32_punpcklwd(indata, signmask);
 332:   0f 6f ca                movq   %mm2,%mm1
        hiout = __builtin_ia32_punpckhwd(indata, signmask);
        
        indata   = *(v4hi*)src2;
 335:   8b 45 0c                mov    0xc(%ebp),%eax
 338:   0f 7f 55 f8             movq   %mm2,0xfffffff8(%ebp)
 33c:   0f 6f 45 f8             movq   0xfffffff8(%ebp),%mm0

Why not simply movq %mm2,%mm0 instead of storing %mm2 to the stack and
reloading it into %mm0?


 340:   0f db 02                pand   (%edx),%mm0
 343:   0f 7f 45 f0             movq   %mm0,0xfffffff0(%ebp)
 347:   0f 6f 45 f0             movq   0xfffffff0(%ebp),%mm0

What the heck? %mm0 is stored to the stack and immediately reloaded into %mm0.


 34b:   0f 75 02                pcmpeqw (%edx),%mm0
 34e:   0f 61 c8                punpcklwd %mm0,%mm1
 351:   0f 69 d0                punpckhwd %mm0,%mm2
 354:   0f 7f 4d f8             movq   %mm1,0xfffffff8(%ebp)
 358:   0f 6f 5d f8             movq   0xfffffff8(%ebp),%mm3

As above, this happens throughout, as you can see:


 35c:   0f 7f 55 f8             movq   %mm2,0xfffffff8(%ebp)
 360:   0f 6f 10                movq   (%eax),%mm2
 363:   0f 6f 65 f8             movq   0xfffffff8(%ebp),%mm4
        signmask = __builtin_ia32_pand(indata, *m);
 367:   0f 7f 55 f8             movq   %mm2,0xfffffff8(%ebp)
 36b:   0f 6f 45 f8             movq   0xfffffff8(%ebp),%mm0
        signmask = __builtin_ia32_pcmpeqw(signmask, *m);

        temp  = __builtin_ia32_punpcklwd(indata, signmask);
 36f:   0f 6f ca                movq   %mm2,%mm1
 372:   0f db 02                pand   (%edx),%mm0
        loout = __builtin_ia32_paddd(loout, temp); \
        temp  = __builtin_ia32_punpckhwd(indata, signmask);
        hiout = __builtin_ia32_paddd(hiout, temp);
                
        *(v4hi*)dst = __builtin_ia32_packssdw(loout, hiout);
 375:   8b 45 10                mov    0x10(%ebp),%eax
 378:   0f 7f 45 f0             movq   %mm0,0xfffffff0(%ebp)
 37c:   0f 6f 45 f0             movq   0xfffffff0(%ebp),%mm0
 380:   0f 75 02                pcmpeqw (%edx),%mm0
 383:   0f 61 c8                punpcklwd %mm0,%mm1
 386:   0f 69 d0                punpckhwd %mm0,%mm2
 389:   0f 7f 4d f8             movq   %mm1,0xfffffff8(%ebp)
 38d:   0f fe 5d f8             paddd  0xfffffff8(%ebp),%mm3

Why not use the MMX register directly instead of going through the stack?


 391:   0f 7f 55 f8             movq   %mm2,0xfffffff8(%ebp)
 395:   0f fe 65 f8             paddd  0xfffffff8(%ebp),%mm4

Ditto: %mm2 is spilled to the stack only to be added to %mm4 from memory.


 399:   0f 6b dc                packssdw %mm4,%mm3
 39c:   0f 7f 18                movq   %mm3,(%eax)
        __builtin_ia32_emms();
 39f:   0f 77                   emms
        
        return;
}
 3a1:   c9                      leave
 3a2:   c3                      ret


-- 
(°=                 =°)
//\ Prakash Punnoor /\\
V_/                 \_V

Attachment: signature.asc
Description: OpenPGP digital signature

Reply via email to