https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61481

            Bug ID: 61481
           Summary: Poor optimization of simple small-sized matrix
                    routines with constant data
           Product: gcc
           Version: 4.10.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: cas43 at cs dot stanford.edu

Created attachment 32927
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=32927&action=edit
Output of the compilation (test.s)

The following function does some simple matrix operations.  All of the data is
constant, and N is small.  The function optimizes to a return statement for N=1
and N=2.  For N=3, optimization is incomplete after tree optimizations but
benefits significantly from later optimizations.  For N=4, the final result is
not good.

// Reproducer: multiplies two constant N x N matrices stored row-major in
// flat arrays and returns the trace (sum of the diagonal) of the product.
// Every input is a compile-time constant, so the whole function can in
// principle be folded to `return 2*N;`.
template<int N>
int foo()
{
    int x[N*N],y[N*N],z[N*N];
    // Zero all three matrices.
    for(int i=0;i<N*N;i++) x[i]=0;
    for(int i=0;i<N*N;i++) y[i]=0;
    for(int i=0;i<N*N;i++) z[i]=0;
    // Index i*(N+1) == i*N+i is diagonal element (i,i):
    // x becomes the identity matrix, y becomes 2 * identity.
    for(int i=0;i<N;i++) x[i*(N+1)]=1;
    for(int i=0;i<N;i++) y[i*(N+1)]=2;
    // z += x * y (plain triple-loop matrix product; the statement below is
    // the body of the innermost k loop).
    for(int i=0;i<N;i++) for(int j=0;j<N;j++) for(int k=0;k<N;k++)
z[i*N+j]+=x[i*N+k]*y[k*N+j];
    // Sum the diagonal of z.
    int ret=0;
    for(int i=0;i<N;i++) ret+=z[i*(N+1)];
    return ret;
}
// Explicit instantiations so that code is emitted for each matrix size
// N = 1..4 even though nothing in this translation unit calls foo.
template int foo<1>();
template int foo<2>();
template int foo<3>();
template int foo<4>();


Compiled with: g++ test.cpp -c -S -O3

The full test.s file is attached, but the most immediate bits are summarized
below.

=== Asm produced for foo<1>(); ===

movl $2, %eax
ret

=== Asm produced for foo<2>(); ===

movl $4, %eax
ret

=== Asm produced for foo<3>(); ===

subq $32, %rsp
.cfi_def_cfa_offset 40
movl $6, %eax
addq $32, %rsp
.cfi_def_cfa_offset 8
ret

=== Asm produced for foo<4>(); ===

subq $96, %rsp
.cfi_def_cfa_offset 104
xorl %eax, %eax
movl $8, %ecx
leaq -104(%rsp), %rdx
pxor %xmm7, %xmm7
pxor %xmm10, %xmm10
movq %rdx, %rdi
rep; stosq
movl $1, -104(%rsp)
movl $1, -84(%rsp)
movl $1, -64(%rsp)
movl $1, -44(%rsp)
movdqa %xmm7, %xmm11
shufps $136, %xmm7, %xmm11
movdqa -104(%rsp), %xmm0
shufps $221, %xmm7, %xmm7
movdqa -88(%rsp), %xmm3
... lots and lots of SSE instructions ...
movdqa %xmm2, %xmm0
punpckhdq %xmm5, %xmm2
punpckldq %xmm5, %xmm0
movaps %xmm6, -120(%rsp)
movl -120(%rsp), %eax
addl 44(%rsp), %eax
movaps %xmm0, 56(%rsp)
movaps %xmm2, 72(%rsp)
addl 64(%rsp), %eax
addl 84(%rsp), %eax
addq $96, %rsp
.cfi_def_cfa_offset 8
ret

Reply via email to