This testcase produces suboptimal code:

/* Global operand (a*, b*) and result (c*) arrays of 16 complex elements each:
   the *f arrays are single precision, the *d arrays are double precision. */
_Complex float af[16], bf[16], cf[16];
_Complex double ad[16], bd[16], cd[16];

/* Element-wise single-precision complex multiply over the global arrays:
   cf[i] = af[i] * bf[i] for each of the 16 elements. */
void testf(void)
{
  int i;

  for (i = 0; i < 16; i++)
    cf[i] = af[i] * bf[i];
}

/* Element-wise double-precision complex add over the global arrays:
   cd[i] = ad[i] + bd[i] for each of the 16 elements. */
void testd(void)
{
  int i;

  for (i = 0; i < 16; i++)
    cd[i] = ad[i] + bd[i];
}

gcc -O2 -ftree-vectorize -msse2:

# Compiler output for testd(): cd[i] = ad[i] + bd[i].
# The loop is not vectorized: each iteration handles one complex element
# with two scalar double loads, two scalar adds and two scalar stores.
testd:
        xorl    %eax, %eax              # eax = byte offset into the arrays, starts at 0
        .p2align 4,,7
        .p2align 3
.L7:
        movsd   ad+8(%eax), %xmm1       # xmm1 = second double of ad[i] (imaginary part)
        movsd   ad(%eax), %xmm0         # xmm0 = first double of ad[i] (real part)
        addsd   bd+8(%eax), %xmm1       # xmm1 += imaginary part of bd[i]
        addsd   bd(%eax), %xmm0         # xmm0 += real part of bd[i]
        movsd   %xmm1, cd+8(%eax)       # store imaginary part of cd[i]
        movsd   %xmm0, cd(%eax)         # store real part of cd[i]
        addl    $16, %eax               # advance one element (two 8-byte doubles)
        cmpl    $256, %eax              # 16 elements * 16 bytes = 256
        jne     .L7
        rep                             # prefix on the following ret, as emitted by the compiler
        ret

And with -ffast-math:

# Compiler output for testf(): cf[i] = af[i] * bf[i].
# The loop is not vectorized: the complex product is computed with four
# scalar multiplies, one scalar subtract and one scalar add per element.
testf:
        xorl    %eax, %eax              # eax = element index i, starts at 0
        .p2align 4,,7
        .p2align 3
.L2:
        movss   bf(,%eax,8), %xmm2      # xmm2 = first float of bf[i] (real part, br)
        movss   bf+4(,%eax,8), %xmm3    # xmm3 = second float of bf[i] (imaginary part, bi)
        movss   af(,%eax,8), %xmm5      # xmm5 = real part of af[i] (ar)
        movss   af+4(,%eax,8), %xmm4    # xmm4 = imaginary part of af[i] (ai)
        movaps  %xmm2, %xmm0            # xmm0 = br (copy, since mulss overwrites its destination)
        movaps  %xmm3, %xmm1            # xmm1 = bi
        mulss   %xmm5, %xmm0            # xmm0 = br * ar
        mulss   %xmm4, %xmm1            # xmm1 = bi * ai
        mulss   %xmm4, %xmm2            # xmm2 = br * ai
        mulss   %xmm5, %xmm3            # xmm3 = bi * ar
        subss   %xmm1, %xmm0            # xmm0 = br*ar - bi*ai  (real part of the product)
        addss   %xmm3, %xmm2            # xmm2 = br*ai + bi*ar  (imaginary part of the product)
        movss   %xmm0, cf(,%eax,8)      # store real part of cf[i]
        movss   %xmm2, cf+4(,%eax,8)    # store imaginary part of cf[i]
        addl    $1, %eax                # next element; addressing scales the index by 8 bytes
        cmpl    $16, %eax               # 16 elements in total
        jne     .L2
        rep                             # prefix on the following ret, as emitted by the compiler
        ret

Note that we can use the SSE3 addsubps insn in the latter case.


-- 
           Summary: No vectorization for complex arrays
           Product: gcc
           Version: 4.4.0
            Status: UNCONFIRMED
          Severity: enhancement
          Priority: P3
         Component: tree-optimization
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: ubizjak at gmail dot com
OtherBugsDependingOnThis: 31485


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=35252

Reply via email to