Hi!

I've noticed we generate terrible code for the testcase below.
E.g. with -mavx2 it is:
        leal    6(%rdi), %edx
        leal    12(%rdi), %ecx
        leal    18(%rdi), %esi
        leal    3(%rdi), %eax
        movl    %edx, -20(%rsp)
        movl    %ecx, -24(%rsp)
        leal    9(%rdi), %edx
        movl    %esi, -28(%rsp)
        vmovd   -20(%rsp), %xmm6
        leal    15(%rdi), %ecx
        movl    %edi, -20(%rsp)
        leal    21(%rdi), %esi
        vmovd   -28(%rsp), %xmm4
        vmovd   -24(%rsp), %xmm5
        vpinsrd $1, %edx, %xmm6, %xmm3
        vmovd   -20(%rsp), %xmm7
        vpinsrd $1, %esi, %xmm4, %xmm2
        vpinsrd $1, %ecx, %xmm5, %xmm1
        vpinsrd $1, %eax, %xmm7, %xmm0
        vpunpcklqdq     %xmm2, %xmm1, %xmm1
        vpunpcklqdq     %xmm3, %xmm0, %xmm0
        vinserti128     $0x1, %xmm1, %ymm0, %ymm0
to load the { x, x + 3, x + 6, x + 9, x + 12, x + 15, x + 18, x + 21 }
CONSTRUCTOR into a vector register.  This patch expands it as:
        movl    %edi, -20(%rsp)
        vbroadcastss    -20(%rsp), %ymm0
        vpaddd  .LC0(%rip), %ymm0, %ymm0
instead.  Similarly for SSE2:
        leal    3(%rdi), %eax
        movl    %eax, -20(%rsp)
        leal    6(%rdi), %eax
        movd    -20(%rsp), %xmm4
        movl    %eax, -16(%rsp)
        leal    9(%rdi), %eax
        movd    -16(%rsp), %xmm1
        movl    %edi, -16(%rsp)
        movl    %eax, -12(%rsp)
        movd    -16(%rsp), %xmm0
        movd    -12(%rsp), %xmm3
        punpckldq       %xmm4, %xmm0
        punpckldq       %xmm3, %xmm1
        punpcklqdq      %xmm1, %xmm0
instead of:
        movl    %edi, -12(%rsp)
        movd    -12(%rsp), %xmm3
        pshufd  $0, %xmm3, %xmm0
        paddd   .LC0(%rip), %xmm0

The patch leaves that decision to the target (in it's
*_expand_vector_init).

Ok?

2013-11-20  Jakub Jelinek  <ja...@redhat.com>

        * expr.c (store_constructor): If all CONSTRUCTOR values are
        the same SSA_NAME plus optional constant, pass plus_constant results
        down to vec_init.
        * config/i386/i386.c (ix86_expand_vector_init): Optimize it using
        broadcast followed by addition of a constant vector.

        * gcc.dg/vect/vect-124.c: New test.

--- gcc/expr.c.jj       2013-11-19 21:56:27.000000000 +0100
+++ gcc/expr.c  2013-11-20 09:24:14.435860130 +0100
@@ -6296,6 +6296,8 @@ store_constructor (tree exp, rtx target,
        rtvec vector = NULL;
        unsigned n_elts;
        alias_set_type alias;
+       tree base = NULL_TREE;
+       rtx base_rtx = NULL_RTX;
 
        gcc_assert (eltmode != BLKmode);
 
@@ -6334,6 +6336,31 @@ store_constructor (tree exp, rtx target,
                                    TYPE_SIZE (TREE_TYPE (value)),
                                    TYPE_SIZE (elttype)));
 
+               /* Try to detect fairly common pattern of
+                  { a_5, a_5 + 1, a_5 + 2, a_5 + 3 }.  */
+               if (vector
+                   && (count == 0 || base)
+                   && TREE_CODE (value) == SSA_NAME)
+                 {
+                   gimple g = get_gimple_for_ssa_name (value);
+                   tree this_base = value;
+                   if (g && is_gimple_assign (g))
+                     switch (gimple_assign_rhs_code (g))
+                       {
+                       case PLUS_EXPR:
+                       case POINTER_PLUS_EXPR:
+                         if (TREE_CODE (gimple_assign_rhs1 (g)) == SSA_NAME
+                             && tree_fits_shwi_p (gimple_assign_rhs2 (g)))
+                           this_base = gimple_assign_rhs1 (g);
+                         break;
+                       default:
+                         break;
+                       }
+                   if (count == 0)
+                     base = this_base;
+                   else if (base != this_base)
+                     base = NULL_TREE;
+                 }
                count += n_elts_here;
                if (mostly_zeros_p (value))
                  zero_count += n_elts_here;
@@ -6342,6 +6369,9 @@ store_constructor (tree exp, rtx target,
            /* Clear the entire vector first if there are any missing elements,
               or if the incidence of zero elements is >= 75%.  */
            need_to_clear = (count < n_elts || 4 * zero_count >= 3 * count);
+
+           if (count < n_elts)
+             base = NULL_TREE;
          }
 
        if (need_to_clear && size > 0 && !vector)
@@ -6362,6 +6392,9 @@ store_constructor (tree exp, rtx target,
        else
          alias = get_alias_set (elttype);
 
+       if (base)
+         base_rtx = expand_normal (base);
+
         /* Store each element of the constructor into the corresponding
           element of TARGET, determined by counting the elements.  */
        for (idx = 0, i = 0;
@@ -6385,8 +6418,21 @@ store_constructor (tree exp, rtx target,
                /* Vector CONSTRUCTORs should only be built from smaller
                   vectors in the case of BLKmode vectors.  */
                gcc_assert (TREE_CODE (TREE_TYPE (value)) != VECTOR_TYPE);
-               RTVEC_ELT (vector, eltpos)
-                 = expand_normal (value);
+               if (base)
+                 {
+                   if (value == base)
+                     RTVEC_ELT (vector, eltpos) = base_rtx;
+                   else
+                     {
+                       gimple g = get_gimple_for_ssa_name (value);
+                       rtx cst = expand_normal (gimple_assign_rhs2 (g));
+                       RTVEC_ELT (vector, eltpos)
+                         = plus_constant (TYPE_MODE (TREE_TYPE (value)),
+                                          base_rtx, INTVAL (cst));
+                     }
+                 }
+               else
+                 RTVEC_ELT (vector, eltpos) = expand_normal (value);
              }
            else
              {
--- gcc/config/i386/i386.c.jj   2013-11-19 21:56:41.000000000 +0100
+++ gcc/config/i386/i386.c      2013-11-20 09:20:16.837649403 +0100
@@ -37736,9 +37736,10 @@ ix86_expand_vector_init (bool mmx_ok, rt
   enum machine_mode inner_mode = GET_MODE_INNER (mode);
   int n_elts = GET_MODE_NUNITS (mode);
   int n_var = 0, one_var = -1;
-  bool all_same = true, all_const_zero = true;
+  bool all_same = true, all_const_zero = true, need_force_reg = false;
   int i;
   rtx x;
+  rtx base = NULL_RTX;
 
   for (i = 0; i < n_elts; ++i)
     {
@@ -37746,11 +37747,25 @@ ix86_expand_vector_init (bool mmx_ok, rt
       if (!(CONST_INT_P (x)
            || GET_CODE (x) == CONST_DOUBLE
            || GET_CODE (x) == CONST_FIXED))
-       n_var++, one_var = i;
+       {
+         n_var++;
+         one_var = i;
+         if (!REG_P (x) && !MEM_P (x))
+           need_force_reg = true;
+       }
       else if (x != CONST0_RTX (inner_mode))
        all_const_zero = false;
       if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
        all_same = false;
+      if (i == 0 || base)
+       {
+         if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
+           x = XEXP (x, 0);
+         if (i == 0)
+           base = x;
+         else if (!rtx_equal_p (x, base))
+           base = NULL_RTX;
+       }
     }
 
   /* Constants are best loaded from the constant pool.  */
@@ -37760,6 +37775,49 @@ ix86_expand_vector_init (bool mmx_ok, rt
       return;
     }
 
+  /* For { a_5, a_5 + 4, a_5 + 2, a_5 + 7 } generate a broadcast
+     followed by addition of constant.  */
+  if (!all_same
+      && base
+      && n_elts >= 4
+      && (REG_P (base) || MEM_P (base)))
+    {
+      rtx subtarget = gen_reg_rtx (mode);
+      if (ix86_expand_vector_init_duplicate (mmx_ok, mode, subtarget, base))
+       {
+         rtvec cstvals = rtvec_alloc (n_elts);
+         rtx cstsubtarget = gen_reg_rtx (mode);
+         for (i = 0; i < n_elts; i++)
+           {
+             x = XVECEXP (vals, 0, i);
+             if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
+               RTVEC_ELT (cstvals, i) = XEXP (x, 1);
+             else
+               RTVEC_ELT (cstvals, i) = const0_rtx;
+           }
+         ix86_expand_vector_init (mmx_ok, cstsubtarget,
+                                  gen_rtx_PARALLEL (mode, cstvals));
+         subtarget = expand_simple_binop (mode, PLUS, subtarget,
+                                          cstsubtarget, target, 1,
+                                          OPTAB_DIRECT);
+         if (subtarget != target)
+           emit_move_insn (target, subtarget);
+         return;
+       }
+    }
+
+  if (need_force_reg)
+    for (i = 0; i < n_elts; ++i)
+      {
+       x = XVECEXP (vals, 0, i);
+       if (!(CONST_INT_P (x)
+             || GET_CODE (x) == CONST_DOUBLE
+             || GET_CODE (x) == CONST_FIXED
+             || REG_P (x)
+             || MEM_P (x)))
+         XVECEXP (vals, 0, i) = force_reg (GET_MODE (x), x);
+      }
+
   /* If all values are identical, broadcast the value.  */
   if (all_same
       && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
--- gcc/testsuite/gcc.dg/vect/vect-124.c.jj     2013-11-20 09:27:35.760798649 
+0100
+++ gcc/testsuite/gcc.dg/vect/vect-124.c        2013-11-20 09:29:32.219199794 
+0100
@@ -0,0 +1,28 @@
+#include "tree-vect.h"
+
+#ifndef N
+#define N 64
+#endif
+
+int a[N];
+
+__attribute__((noinline, noclone)) void
+foo (int x)
+{
+  int i;
+  for (i = 0; i < N; i++, x += 3)
+    a[i] = x;
+}
+
+int
+main ()
+{
+  int i;
+  
+  check_vect ();
+  foo (6);
+  for (i = 0; i < N; i++)
+    if (a[i] != i * 3 + 6)
+      abort ();
+  return 0;
+}

        Jakub

Reply via email to