http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53355

             Bug #: 53355
           Summary: Autovectorization of a simple loop could be improved.
    Classification: Unclassified
           Product: gcc
           Version: 4.7.0
            Status: UNCONFIRMED
          Severity: enhancement
          Priority: P3
         Component: c
        AssignedTo: unassig...@gcc.gnu.org
        ReportedBy: svfue...@gmail.com


The following simple function gets autovectorized by gcc-4.7 -O3

void foo(double *x)
{
    int i;

    for (i = 0; i < 100000; i++)
    {
        x[i] += 2.0;
    }
}

However, the generated code is rather poor:

foo:
.LFB0:
    .cfi_startproc
    movq    %rdi, %rax
    salq    $60, %rax
    sarq    $63, %rax
    movq    %rax, %rdx
    andl    $1, %edx
    testq    %rdx, %rdx
    movl    %edx, %ecx
    je    .L7
    movsd    .LC0(%rip), %xmm0
    movl    $99999, %r9d
    movl    $1, %r10d
    addsd    (%rdi), %xmm0
    movsd    %xmm0, (%rdi)
.L2:
    movl    $100000, %r8d
    andl    $1, %eax
    subl    %ecx, %r8d
    movl    %r8d, %esi
    shrl    %esi
    movl    %esi, %r11d
    addl    %r11d, %r11d
    je    .L3
    movapd    .LC1(%rip), %xmm1
    leaq    (%rdi,%rax,8), %rcx
    xorl    %edx, %edx
    xorl    %eax, %eax
    .p2align 4,,10
    .p2align 3
.L4:
    movapd    (%rcx,%rax), %xmm0
    addl    $1, %edx
    addpd    %xmm1, %xmm0
    movapd    %xmm0, (%rcx,%rax)
    addq    $16, %rax
    cmpl    %esi, %edx
    jb    .L4
    addl    %r11d, %r10d
    subl    %r11d, %r9d
    cmpl    %r11d, %r8d
    je    .L1
.L3:
    leal    -1(%r9), %edx
    movslq    %r10d, %r10
    leaq    (%rdi,%r10,8), %rax
    movsd    .LC0(%rip), %xmm1
    addq    %rdx, %r10
    leaq    8(%rdi,%r10,8), %rdx
    .p2align 4,,10
    .p2align 3
.L6:
    movsd    (%rax), %xmm0
    addsd    %xmm1, %xmm0
    movsd    %xmm0, (%rax)
    addq    $8, %rax
    cmpq    %rdx, %rax
    jne    .L6
.L1:
    rep
    ret
.L7:
    movl    $100000, %r9d
    xorl    %r10d, %r10d
    jmp    .L2


The first problem is that the alignment test is rather clunky.  Instead, we
can just do a "testq $8, %rdi" followed by a jump to the misaligned case.
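
In C terms, that test just checks bit 3 of the pointer value; a rough sketch
(the function name is mine, not part of the report):

#include <stdint.h>

/* Sketch: nonzero when x is 8-byte aligned but not 16-byte aligned,
   i.e. exactly the condition "testq $8, %rdi" tests. */
static int misaligned_for_sse2(const double *x)
{
    return ((uintptr_t)x & 8) != 0;
}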

The second problem is that instead of falling through to the aligned case, the
code jumps down to .L7.  Forward jumps are statically predicted as not taken,
so the common, aligned case should be the fall-through path.
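
At the source level the same layout preference can be expressed with a branch
hint; a hedged sketch of the idea (my own illustration, not from the report):

#include <stdint.h>

void foo_hinted(double *x)
{
    int i = 0;

    /* Mark the misaligned prologue as unlikely, so the aligned case is
       laid out as the fall-through path. */
    if (__builtin_expect(((uintptr_t)x & 8) != 0, 0))
    {
        x[i++] += 2.0;  /* peel one element; x+1 is then 16-byte aligned,
                           assuming x is at least 8-byte aligned */
    }

    for (; i < 100000; i++)
    {
        x[i] += 2.0;
    }
}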

The inner loop is also longer than it should be.  It increments both a counter
(%edx) and a byte offset (%rax); with a single induction variable one of those
increments can be dropped from the loop body.
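
For illustration, a rough C sketch of the single-induction-variable shape
(names are mine, not part of the report):

/* Sketch: the pointer itself is the only induction variable; the loop
   body then needs one pointer add, a compare and the branch, plus the
   actual arithmetic. */
static void add2_all(double *x, long n)
{
    double *end = x + n;

    for (; x != end; x++)
    {
        *x += 2.0;
    }
}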

Finally, after the main loop, the code again tests for misalignment.  This
pessimizes the fast path.  Instead, it is possible to move the slow misaligned
case completely out of line with very little size penalty.
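
A hedged C sketch of that overall structure, using SSE2 intrinsics purely to
illustrate the shape of the assembly below (the function name and the n
argument are mine):

#include <stdint.h>
#include <emmintrin.h>

/* Sketch: peel at most one element to reach 16-byte alignment, run the
   aligned two-wide loop, then handle the single leftover element.
   Assumes x is 8-byte aligned and n >= 2, as in the report. */
static void add2(double *x, long n)
{
    const __m128d two = _mm_set1_pd(2.0);
    double *end = x + n;

    if ((uintptr_t)x & 8)           /* misaligned: peel the first element */
    {
        *x++ += 2.0;
    }

    for (; x + 2 <= end; x += 2)    /* aligned 2-wide main loop */
    {
        _mm_store_pd(x, _mm_add_pd(_mm_load_pd(x), two));
    }

    if (x != end)                   /* at most one trailing element */
    {
        *x += 2.0;
    }
}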

Doing the above optimizations yields:

    movapd 2_s, %xmm1
    testq $8, %rdi
    jne 2f

    lea 800000(%rdi), %rax

# The "Hot" inner loop
1:    movapd    (%rdi), %xmm0
    addq    $16, %rdi
    addpd    %xmm1, %xmm0
    movapd    %xmm0, -16(%rdi)
    cmpq    %rdi, %rax
    jne 1b
    rep ret

# The slow misaligned case
2:      movsd    (%rdi), %xmm0
    addsd    %xmm1, %xmm0
    movsd    %xmm0, (%rdi)

    leaq    (800000 - 8)(%rdi), %rax
    addq    $8, %rdi

3:    movapd    (%rdi), %xmm0
    addq    $16, %rdi
    addpd    %xmm1, %xmm0
    movapd    %xmm0, -16(%rdi)
    cmpq    %rdi, %rax
    jne 3b

    movsd    (%rdi), %xmm0
    addsd    %xmm1, %xmm0
    movsd    %xmm0, (%rdi)
    ret
