https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97194

            Bug ID: 97194
           Summary: optimize vector element set/extract at variable
                    position
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: rguenth at gcc dot gnu.org
  Target Milestone: ---

/* Vector size in bytes, as passed to the vector_size attribute below. */
#define N 32
/* Element type of the vector. */
typedef int T;
/* GCC generic vector type: N bytes of T elements. */
typedef T V __attribute__((vector_size(N)));
/* Return a copy of v with the element at run-time position idx replaced
   by val.  v is taken by value; the updated vector is the return value.  */
V set (V v, int idx, T val)
{
  /* Element store at a variable (non-constant) index.  */
  v[idx] = val;
  return v;
}
/* Return the element of v at run-time position idx.  */
T get (V v, int idx)
{
  /* Element extract at a variable (non-constant) index.  */
  return v[idx];
}

With -mavx2, this generates the following code:

# Compiler output for set(): the vector arrives and is returned in %ymm0;
# %edi is used as the element index and %esi as the value to store.
# The vector is spilled to the stack, one element is overwritten in
# memory, and the whole vector is reloaded.
set:
.LFB0:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        # Sign-extend the 32-bit index so it can be used in a 64-bit address.
        movslq  %edi, %rdi
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        # Round %rsp down to a 32-byte boundary so vmovdqa can be used.
        andq    $-32, %rsp
        # Spill the full 32-byte vector to the slot just below %rsp.
        vmovdqa %ymm0, -32(%rsp)
        # Overwrite the 4-byte element at slot + idx*4 with the new value.
        movl    %esi, -32(%rsp,%rdi,4)
        # Reload all 32 bytes right after the 4-byte store to the same slot.
        vmovdqa -32(%rsp), %ymm0
^^^ store forwarding fail
        # Restore %rsp from %rbp and pop the saved %rbp.
        leave
        .cfi_def_cfa 7, 8
        ret

# Compiler output for get(): the vector arrives in %ymm0, %edi is used as
# the element index, and the selected element is returned in %eax.
# The vector is spilled to the stack and one element is loaded back.
get:
.LFB1:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        # Sign-extend the 32-bit index so it can be used in a 64-bit address.
        movslq  %edi, %rdi
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        # Round %rsp down to a 32-byte boundary so vmovdqa can be used.
        andq    $-32, %rsp
        # Spill the full 32-byte vector to the slot just below %rsp.
        vmovdqa %ymm0, -32(%rsp)
        # Load the 4-byte element at slot + idx*4 as the result.
        movl    -32(%rsp,%rdi,4), %eax
        # Restore %rsp from %rbp and pop the saved %rbp.
        leave
        .cfi_def_cfa 7, 8
        ret

The code for 'get' is maybe not too bad.

Vary N and T to cover all types and vector sizes.

It should be possible to do the 'get' case via variable permutes
and the 'set' case via a splat of the value and a blend using
a mask generated from 'idx'.

Reply via email to