------- Comment #4 from rguenth at gcc dot gnu dot org  2009-01-23 15:33 -------
To be meaningful, the testcase should be:

subroutine to_product_of(self,a,b,a1,a2)
  complex(kind=8) :: self (:)
  complex(kind=8), intent(in) :: a(:,:)
  complex(kind=8), intent(in) :: b(:)
  integer a1,a2
  do i = 1,a1
    do j = 1,a2
      self(i) = self(i) + a(j,i)*b(j)
    end do
  end do
end subroutine

Otherwise we would be accessing a non-contiguously in the inner loop, which
prevents vectorization: Fortran arrays are stored column-major, so the inner
loop has to run over the first index of a to get unit-stride accesses.
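
For contrast, here is a sketch of the problematic variant with the indices
of a swapped (hypothetical, for illustration only - not the original
testcase from this PR):

subroutine to_product_of_strided(self,a,b,a1,a2)
  complex(kind=8) :: self (:)
  complex(kind=8), intent(in) :: a(:,:)
  complex(kind=8), intent(in) :: b(:)
  integer a1,a2
  do i = 1,a1
    do j = 1,a2
      ! a(i,j) walks a row of a: consecutive j are a full column apart
      ! in memory, so the inner loop has stride size(a,1).
      self(i) = self(i) + a(i,j)*b(j)
    end do
  end do
end subroutine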

With the versioning for stride == 1 I then get

.L13:
        movupd  16(%rax), %xmm1
        movupd  (%rax), %xmm3
        incl    %ecx
        movupd  (%rdx), %xmm4
        addq    $32, %rax
        movapd  %xmm3, %xmm0
        unpckhpd        %xmm1, %xmm3
        unpcklpd        %xmm1, %xmm0
        movupd  16(%rdx), %xmm1
        movapd  %xmm4, %xmm2
        addq    $32, %rdx
        movapd  %xmm3, %xmm9
        cmpl    %ecx, %r8d
        unpcklpd        %xmm1, %xmm2
        unpckhpd        %xmm1, %xmm4
        movapd  %xmm4, %xmm1
        movapd  %xmm2, %xmm4
        mulpd   %xmm1, %xmm9
        mulpd   %xmm0, %xmm4
        mulpd   %xmm3, %xmm2
        mulpd   %xmm1, %xmm0
        subpd   %xmm9, %xmm4
        addpd   %xmm2, %xmm0
        addpd   %xmm4, %xmm6
        addpd   %xmm0, %xmm5
        ja      .L13
        haddpd  %xmm5, %xmm5
        cmpl    %r15d, %edi
        movl    -4(%rsp), %ecx
        haddpd  %xmm6, %xmm6
        addsd   %xmm5, %xmm8
        addsd   %xmm6, %xmm7
        jne     .L12
        jmp     .L14

for the innermost loop, followed by a tail loop (peel for niters).  The
unpcklpd/unpckhpd pairs deinterleave the real and imaginary parts so that
the four mulpd plus the subpd/addpd can form the complex products, with the
partial sums accumulated in %xmm5/%xmm6 and reduced by haddpd after the
loop.  This is about 15% faster on AMD K10 than the non-vectorized loop (if
you disable the cost model and make sure the inner loop runs enough
iterations to amortize the extra guarding conditions).
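
At the source level the stride versioning amounts to roughly the following
(a hand-written sketch of the transformation the vectorizer performs
internally on the array descriptor; is_contiguous is a Fortran 2008
intrinsic used here only as a stand-in for the descriptor stride check):

subroutine to_product_of_versioned(self,a,b,a1,a2)
  complex(kind=8) :: self (:)
  complex(kind=8), intent(in) :: a(:,:)
  complex(kind=8), intent(in) :: b(:)
  integer a1,a2
  if (is_contiguous(a)) then
    ! Fast version: unit stride is guaranteed, so the loads from a can
    ! be vectorized as above.
    do i = 1,a1
      do j = 1,a2
        self(i) = self(i) + a(j,i)*b(j)
      end do
    end do
  else
    ! Fallback for strided actual arguments, e.g. a passed as an array
    ! section like a(::2,:); this copy stays scalar.
    do i = 1,a1
      do j = 1,a2
        self(i) = self(i) + a(j,i)*b(j)
      end do
    end do
  end if
end subroutine

This guard is one of the extra guarding conditions mentioned above, which is
why the inner loop needs enough iterations to pay for it.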


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=37021
