http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53645

             Bug #: 53645
           Summary: Missed optimization for division of vector types
    Classification: Unclassified
           Product: gcc
           Version: 4.7.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
        AssignedTo: unassig...@gcc.gnu.org
        ReportedBy: andrii.riabushe...@barclays.com


For the following code:

/* Reproducer: divides a vector by the constant vector {3,3,3,3}.
   v4si is not declared in this snippet; presumably a GCC vector type of
   four ints (vector_size (16)) -- confirm against the full test case. */
v4si ttt(v4si x) {

     return x / (v4si) {3,3,3,3};
}


GCC generates the following assembly:

# Compiler output for ttt (AT&T syntax): signed division of four 32-bit lanes
# x0..x3 by 3, performed one lane at a time in general-purpose registers.
# Per lane: q = high32(x * 1431655766) - (x >> 31), where
# 1431655766 = 0x55555556 = (2^32 + 2) / 3.
# The instructions of the four lanes are interleaved, so registers are reused.
# NOTE(review): the input is read through %rcx, which looks like the vector
# being passed by reference (e.g. Microsoft x64 ABI) -- confirm the target.
ttt:
    movdqa    (%rcx), %xmm0            # xmm0 = {x0, x1, x2, x3} (aligned 16-byte load)
    movl    $1431655766, %ecx          # ecx = multiplier 0x55555556 (pointer no longer needed)
    movd    %xmm0, %r8d                # r8d  = x0
    pextrd    $1, %xmm0, %r10d         # r10d = x1
    pextrd    $2, %xmm0, %r11d         # r11d = x2
    movl    %r8d, %eax                 # eax = x0, implicit operand of imull
    sarl    $31, %r8d                  # r8d = x0 >> 31 (0 or -1)
    imull    %ecx                      # edx:eax = x0 * multiplier (signed 64-bit product)
    movl    %r10d, %eax                # eax = x1
    sarl    $31, %r10d                 # r10d = x1 >> 31
    movl    %edx, %r9d                 # r9d = high32(x0 * multiplier)
    imull    %ecx                      # edx:eax = x1 * multiplier
    movl    %r11d, %eax                # eax = x2
    subl    %r8d, %r9d                 # r9d = q0
    sarl    $31, %r11d                 # r11d = x2 >> 31
    movl    %edx, %r8d                 # r8d = high32(x1 * multiplier)
    imull    %ecx                      # edx:eax = x2 * multiplier
    subl    %r10d, %r8d                # r8d = q1
    movl    %edx, %r10d                # r10d = high32(x2 * multiplier)
    subl    %r11d, %r10d               # r10d = q2
    pextrd    $3, %xmm0, %r11d         # r11d = x3
    movl    %r11d, %eax                # eax = x3
    imull    %ecx                      # edx = high32(x3 * multiplier)
    sarl    $31, %r11d                 # r11d = x3 >> 31
    movd    %r10d, %xmm1               # xmm1 = {q2, 0, 0, 0}
    movd    %r9d, %xmm0                # xmm0 = {q0, 0, 0, 0}
    pinsrd    $0x1, %r8d, %xmm0        # xmm0 = {q0, q1, 0, 0}
    subl    %r11d, %edx                # edx = q3
    pinsrd    $0x1, %edx, %xmm1        # xmm1 = {q2, q3, 0, 0}
    punpcklqdq    %xmm1, %xmm0         # xmm0 = {q0, q1, q2, q3}
    ret


Thus GCC DOES optimize the division into a high multiplication, but it applies
it to each element separately instead of vectorizing it. The assembler
should look something like


    # Vectorized signed x / 3 for four 32-bit lanes (SSE4.1, AT&T syntax).
    # Per lane: q = high32(x * M) - (x >> 31), with M = 1431655766 = (2^32 + 2) / 3.
    # pmulld cannot be used for this: it keeps only the LOW 32 bits of each
    # product. pmuldq yields full signed 64-bit products, but only of dwords
    # 0 and 2, so the odd lanes are shifted down and multiplied separately.
    # In:  (%rcx) = x, 16-byte aligned.  Out: %xmm0.  Clobbers: %xmm1-%xmm3.
    # Assumes .LC190 holds four copies of the dword 1431655766 -- TODO confirm.
    movdqa    (%rcx), %xmm0            # xmm0 = {x0, x1, x2, x3}
    movdqa    .LC190(%rip), %xmm1      # xmm1 = {M, M, M, M}
    movdqa    %xmm0, %xmm2             # copy used for the odd lanes
    movdqa    %xmm0, %xmm3             # copy used for the sign masks
    psrlq    $32, %xmm2                # xmm2 = {x1, 0, x3, 0}
    psrad    $31, %xmm3                # xmm3 = {x0>>31, x1>>31, x2>>31, x3>>31}
    pmuldq    %xmm1, %xmm0             # xmm0 = {lo0, hi0, lo2, hi2} (x0*M, x2*M)
    pmuldq    %xmm1, %xmm2             # xmm2 = {lo1, hi1, lo3, hi3} (x1*M, x3*M)
    psrlq    $32, %xmm0                # xmm0 = {hi0, 0, hi2, 0}
    pblendw    $0xcc, %xmm2, %xmm0     # xmm0 = {hi0, hi1, hi2, hi3}
    psubd    %xmm3, %xmm0              # subtract sign mask: +1 for negative lanes
    ret

Reply via email to