https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78007

--- Comment #3 from Richard Biener <rguenth at gcc dot gnu.org> ---
Created attachment 39827
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=39827&action=edit
untested patch

Mostly untested prototype.  With -mavx2, the innermost loop of the testcase
compiles to:

.L6:
        vmovdqa (%r9,%rdx), %ymm0
        addl    $1, %r8d
        vperm2i128      $0, %ymm0, %ymm0, %ymm0
        vpshufb %ymm1, %ymm0, %ymm0
        vmovdqa %ymm0, (%r9,%rdx)
        addq    $32, %rdx
        cmpl    %r11d, %r8d
        jb      .L6

with -msse4:

.L6:
        movdqa  (%rax,%rdx), %xmm0
        addl    $1, %r8d
        pshufb  %xmm1, %xmm0
        movaps  %xmm0, (%rax,%rdx)
        addq    $16, %rdx
        cmpl    %r10d, %r8d
        jb      .L6

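I'm not sure I got the bswap permutation vector constant correct either ;)
(quick hack).  Note that the selector in the dump below repeats { 3, 2, 1, 0 }
for every element, whereas a per-element bswap32 of a 16-byte vector would need
{ 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }.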

  vect_load_dst_8.13_63 = MEM[(u32 *)vectp_b.11_61];
  load_dst_8 = *_3;
  _64 = VIEW_CONVERT_EXPR<vector(16) char>(vect_load_dst_8.13_63);
  _65 = VEC_PERM_EXPR <_64, _64, { 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0 }>;
  _66 = VIEW_CONVERT_EXPR<vector(4) unsigned int>(_65);
  _13 = __builtin_bswap32 (load_dst_8);
  MEM[(u32 *)vectp_b.14_69] = _66;

Reply via email to