------- Comment #11 from jv244 at cam dot ac dot uk 2010-04-27 18:25 ------- the original loop now (4.6.0) gets vectorized, and achieves the same performance as the 'hand optimized' loop (which does not get vectorized):
> ./a.out
 default loop        0.88005500000000003
 hand optimized loop 0.86005399999999987

it is still not quite as fast as the ifort code:

ifort -fno-inline -O3 -xT -static t.f90
> ~/a.out
 default loop        0.444028000000000
 hand optimized loop 0.964060000000000

ifort's asm looks good:

# -- Begin  s31_
# mark_begin;
        .align    16,0x90
        .globl s31_
s31_:
# parameter 1: %rdi
# parameter 2: %rsi
# parameter 3: %rdx
# parameter 4: %rcx
..B2.1:                         # Preds ..B2.0
..___tag_value_s31_.10:                                         #3.12
        xorps     %xmm1, %xmm1                                  #9.2
        movaps    %xmm1, %xmm0                                  #9.2
        xorl      %eax, %eax                                    #9.2
                                # LOE rax rdx rbx rbp rsi rdi r12 r13 r14 r15 xmm0 xmm1
..B2.2:                         # Preds ..B2.2 ..B2.1
        movaps    (%rdi,%rax,8), %xmm2                          #10.8
        movaps    16(%rdi,%rax,8), %xmm3                        #10.8
        movaps    32(%rdi,%rax,8), %xmm4                        #10.8
        movaps    48(%rdi,%rax,8), %xmm5                        #10.8
        mulpd     (%rsi,%rax,8), %xmm2                          #10.12
        mulpd     16(%rsi,%rax,8), %xmm3                        #10.12
        mulpd     32(%rsi,%rax,8), %xmm4                        #10.12
        mulpd     48(%rsi,%rax,8), %xmm5                        #10.12
        addpd     %xmm2, %xmm0                                  #10.4
        addq      $8, %rax                                      #9.2
        cmpq      $1024, %rax                                   #9.2
        addpd     %xmm3, %xmm1                                  #10.4
        addpd     %xmm4, %xmm0                                  #10.4
        addpd     %xmm5, %xmm1                                  #10.4
        jb        ..B2.2        # Prob 82%                      #9.2
                                # LOE rax rdx rbx rbp rsi rdi r12 r13 r14 r15 xmm0 xmm1
..B2.3:                         # Preds ..B2.2
        addpd     %xmm1, %xmm0                                  #9.2
        haddpd    %xmm0, %xmm0                                  #9.2
        movsd     %xmm0, (%rdx)                                 #10.4
        ret                                                     #12.1
        .align    16,0x90
..___tag_value_s31_.11:

# while gcc has more complicated-looking asm

        .globl s31_
        .type   s31_, @function
s31_:
.LFB0:
        movl    (%rcx), %r9d
        movq    $0, (%rdx)
        testl   %r9d, %r9d
        jle     .L9
        movl    %r9d, %r8d
        shrl    %r8d
        cmpl    $4, %r9d
        leal    (%r8,%r8), %r10d
        jbe     .L15
        testl   %r10d, %r10d
        je      .L15
        xorl    %eax, %eax
        xorl    %ecx, %ecx
        xorpd   %xmm1, %xmm1
        .p2align 4,,10
        .p2align 3
.L12:
        movsd   (%rsi,%rax), %xmm2
        movsd   (%rdi,%rax), %xmm3
        movhpd  8(%rsi,%rax), %xmm2
        movhpd  8(%rdi,%rax), %xmm3
        movapd  %xmm2, %xmm0
        incl    %ecx
        mulpd   %xmm3, %xmm0
        addq    $16, %rax
        addpd   %xmm0, %xmm1
        cmpl    %ecx, %r8d
        ja      .L12
        haddpd  %xmm1, %xmm1
        leal    1(%r10), %eax
        cmpl    %r9d, %r10d
        je      .L13
.L11:
        movslq  %eax, %rcx
        subl    %eax, %r9d
        leaq    -8(,%rcx,8), %rcx
        xorl    %eax, %eax
        addq    %rcx, %rdi
        addq    %rcx, %rsi
        leaq    8(,%r9,8), %rcx
        .p2align 4,,10
        .p2align 3
.L14:
        movsd   (%rsi), %xmm0
        addq    $8, %rax
        mulsd   (%rdi), %xmm0
        addq    $8, %rsi
        addq    $8, %rdi
        addsd   %xmm0, %xmm1
        cmpq    %rcx, %rax
        jne     .L14
.L13:
        movsd   %xmm1, (%rdx)
.L9:
        rep ret
.L15:
        xorpd   %xmm1, %xmm1
        movl    $1, %eax
        jmp     .L11
.LFE0:
        .size   s31_, .-s31_


--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25621