https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97194

--- Comment #13 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to Richard Biener from comment #12)
> (In reply to Alexander Monakov from comment #11)
> > Yeah, for inserts such tactic would be inappropriate due to bad store
> > forwarding stalls anyway. As you've shown in earlier comments, inserts have
> > a very nice generic way to expand them (that does not touch stack).
> 
> Unfortunately it doesn't work (the CSE).  Patch:
> 
> diff --git a/gcc/cfgexpand.c b/gcc/cfgexpand.c
> index 1eaa1da11b9..f7b1a92dd95 100644
> --- a/gcc/cfgexpand.c
> +++ b/gcc/cfgexpand.c
> @@ -6102,7 +6102,11 @@ discover_nonconstant_array_refs_r (tree * tp, int *walk_subtrees,
>              || CONVERT_EXPR_P (t))
>         t = TREE_OPERAND (t, 0);
>  
> -      if (TREE_CODE (t) == ARRAY_REF || TREE_CODE (t) == ARRAY_RANGE_REF)
> +      if ((TREE_CODE (t) == ARRAY_REF
> +          && !(TREE_CODE (TREE_OPERAND (t, 0)) == VIEW_CONVERT_EXPR
> +               && DECL_P (TREE_OPERAND (TREE_OPERAND (t, 0), 0)))
> +               && VECTOR_TYPE_P (TREE_TYPE (TREE_OPERAND (TREE_OPERAND (t, 0), 0))))
> +          || TREE_CODE (t) == ARRAY_RANGE_REF)
>         {
>           t = get_base_address (t);
>           if (t && DECL_P (t)
> 
> 
> and for
> 
> typedef int v4si __attribute__((vector_size(16)));
> 
> int foo (v4si v, int i)
> {
>   v = v + v;
>   return v[i] + v[2*i];
> }
> 
> at -O2 we get
> 
> foo:
> .LFB0:
>         .cfi_startproc
>         leal    (%rdi,%rdi), %edx
>         paddd   %xmm0, %xmm0
>         movslq  %edi, %rdi
>         movslq  %edx, %rdx
>         movaps  %xmm0, -24(%rsp)
>         movaps  %xmm0, -40(%rsp)
>         movl    -40(%rsp,%rdi,4), %eax
>         addl    -24(%rsp,%rdx,4), %eax
>         ret

and unpatched we get

foo:
.LFB0:
        .cfi_startproc
        leal    (%rdi,%rdi), %edx
        paddd   %xmm0, %xmm0
        movslq  %edi, %rdi
        movslq  %edx, %rdx
        movaps  %xmm0, -24(%rsp)
        movl    -24(%rsp,%rdi,4), %eax
        addl    -24(%rsp,%rdx,4), %eax
        ret

so unpatched we're able to elide the stack slot usage for the add and retain a
single slot, whereas with the patch the vector is spilled to two separate
slots.

Reply via email to