[Bug tree-optimization/51499] vectorizer missing simple case

dominiq at lps dot ens.fr Mon, 12 Dec 2011 04:48:16 -0800

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=51499


--- Comment #12 from Dominique d'Humieres <dominiq at lps dot ens.fr> 
2011-12-12 12:47:54 UTC ---
> > even when the above loops are unrolled. How can the loop L5 be unrolled
> > if it is only there for a "scalar epilogue"?
>
> It can't be unrolled, since the alignment is unknown, so we don't know the
> number of iterations of the prologue loop, and, therefore, we don't know the
> number of iterations of the epilogue.

Well, it is unrolled with -funroll-loops, for instance if I compile with
'-Ofast -funroll-loops --param max-unroll-times=4', I get

L3:
        movsd   (%r8,%r11), %xmm3
        addq    $4, %r10
        movsd   16(%r8,%r11), %xmm5
        movsd   32(%r8,%r11), %xmm7
        movhpd  8(%r8,%r11), %xmm3
        movsd   48(%r8,%r11), %xmm9
        movhpd  24(%r8,%r11), %xmm5
        movapd  (%r9,%r11), %xmm4
        movhpd  40(%r8,%r11), %xmm7
        movapd  16(%r9,%r11), %xmm6
        movhpd  56(%r8,%r11), %xmm9
        movapd  32(%r9,%r11), %xmm8
        mulpd   %xmm3, %xmm4
        movapd  48(%r9,%r11), %xmm10
        mulpd   %xmm5, %xmm6
        mulpd   %xmm7, %xmm8
        mulpd   %xmm9, %xmm10
        movlpd  %xmm4, (%rcx,%r11)
        movhpd  %xmm4, 8(%rcx,%r11)
        movlpd  %xmm6, 16(%rcx,%r11)
        movhpd  %xmm6, 24(%rcx,%r11)
        movlpd  %xmm8, 32(%rcx,%r11)
        movhpd  %xmm8, 40(%rcx,%r11)
        movlpd  %xmm10, 48(%rcx,%r11)
        movhpd  %xmm10, 56(%rcx,%r11)
        addq    $64, %r11
        cmpq    $504156, %r10
        jbe     L3

and

L5:
        movsd   -8(%rdi,%r9,8), %xmm15
        leaq    1(%r9), %rbx
        leaq    2(%r9), %r8
        movsd   -8(%rdi,%rbx,8), %xmm0
        leaq    3(%r9), %rcx
        movsd   -8(%rdi,%r8,8), %xmm1
        mulsd   -8(%rdx,%r9,8), %xmm15
        movsd   -8(%rdi,%rcx,8), %xmm2
        mulsd   -8(%rdx,%rbx,8), %xmm0
        mulsd   -8(%rdx,%r8,8), %xmm1
        mulsd   -8(%rdx,%rcx,8), %xmm2
        movsd   %xmm15, -8(%rsi,%r9,8)
        addq    $4, %r9
        cmpq    %r12, %r9
        movsd   %xmm0, -8(%rsi,%rbx,8)
        movsd   %xmm1, -8(%rsi,%r8,8)
        movsd   %xmm2, -8(%rsi,%rcx,8)
        jne     L5

So both the vectorized and the unvectorized loops are unrolled four times. This
does not seem logical to me if the L5 loop is there only to handle a leftover
scalar (AFAIU each %xmm* register holds only one or two doubles, so there is at
most one element left: if the length is odd, or if the length is even and the
first element has been peeled for alignment).

I am also puzzled by the way the vectors are stored back as a pair of half-stores:

        movlpd  %xmm4, (%rcx,%r11)
        movhpd  %xmm4, 8(%rcx,%r11)

Why not a 'movapd' instead?

[Bug tree-optimization/51499] vectorizer missing simple case

Reply via email to