http://gcc.gnu.org/bugzilla/show_bug.cgi?id=59650

            Bug ID: 59650
           Summary: Inefficient vector assignment code
           Product: gcc
           Version: 4.8.2
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: freddie at witherden dot org

Consider the following snippet:

    typedef double v4d __attribute__((vector_size(32)));

    v4d set1(double *v)
    {
        v4d tmp = { v[0], v[1], v[2], v[3] };
        return tmp;
    }

    v4d set2(double *v)
    {
        v4d tmp;

        tmp[0] = v[0];
        tmp[1] = v[1];
        tmp[2] = v[2];
        tmp[3] = v[3];

        return tmp;
    }

If my understanding of the vector extensions is correct, they should both do the
same thing.  Compiling with GCC 4.8.2 with -O3 -march=native on a Sandy Bridge
system gives:

0000000000000000 <_Z4set1Pd>:
   0:   c5 fb 10 57 10          vmovsd 0x10(%rdi),%xmm2
   5:   c5 fb 10 1f             vmovsd (%rdi),%xmm3
   9:   c5 e9 16 47 18          vmovhpd 0x18(%rdi),%xmm2,%xmm0
   e:   c5 e1 16 4f 08          vmovhpd 0x8(%rdi),%xmm3,%xmm1
  13:   c4 e3 75 18 c0 01       vinsertf128 $0x1,%xmm0,%ymm1,%ymm0
  19:   c3                      retq   
  1a:   66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)

0000000000000020 <_Z4set2Pd>:
  20:   c5 fb 10 07             vmovsd (%rdi),%xmm0
  24:   c5 f9 28 c0             vmovapd %xmm0,%xmm0
  28:   c5 f9 28 c8             vmovapd %xmm0,%xmm1
  2c:   c5 f1 16 4f 08          vmovhpd 0x8(%rdi),%xmm1,%xmm1
  31:   c4 e3 7d 18 c1 00       vinsertf128 $0x0,%xmm1,%ymm0,%ymm0
  37:   c4 e3 7d 19 c1 01       vextractf128 $0x1,%ymm0,%xmm1
  3d:   c5 f1 12 4f 10          vmovlpd 0x10(%rdi),%xmm1,%xmm1
  42:   c4 e3 7d 18 c1 01       vinsertf128 $0x1,%xmm1,%ymm0,%ymm0
  48:   c4 e3 7d 19 c1 01       vextractf128 $0x1,%ymm0,%xmm1
  4e:   c5 f1 16 4f 18          vmovhpd 0x18(%rdi),%xmm1,%xmm1
  53:   c4 e3 7d 18 c1 01       vinsertf128 $0x1,%xmm1,%ymm0,%ymm0
  59:   c3                      retq  

Note that the generated code for the two functions differs.  For set1, four
scalar moves are issued, whereas I was expecting two 128-bit unaligned moves.
The code generated for set2 also appears to be inefficient.

Reply via email to