https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66741

--- Comment #1 from Bernhard Reutner-Fischer <aldot at gcc dot gnu.org> ---
i.e. maybe something more along the lines of

$ cat <<EOF | gcc-5 -xc -S - -o - -Ofast -fomit-frame-pointer
-minline-all-stringops -mstringop-strategy=unrolled_loop  -fdump-tree-all-all
-fdump-rtl-all-all -fdump-ipa-all-all -msse4
#include <smmintrin.h>
#include <assert.h>
#include <stdint.h>

/* Copy the NUL-terminated string at S to D in 16-byte chunks, flipping
   bit 0x20 on every byte in the range 'A'..'Z' (i.e. ASCII lower-casing),
   using the SSE4.2 implicit-length string-compare intrinsics.

   Each iteration loads a full 16 bytes from S regardless of where the
   terminator is, and every store writes a full 16 bytes to D.

   NOTE(review): D is declared `const char *` but is written through the
   cast pointer `dst` below.
   NOTE(review): the store is guarded by _mm_cmpistrc, so a chunk is only
   written to D when that intrinsic returns non-zero; other chunks leave D
   untouched.  */
void
sse_tolower_strcpy (const char *d, const char *s)
{

  /* One range pair for _SIDD_CMP_RANGES: lower bound 'A', upper bound 'Z'.
     The remaining bytes are zero.  */
  __m128i ranges =
    _mm_setr_epi8 ('A', 'Z', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

  /* View both buffers as sequences of 128-bit chunks; the casts drop the
     const qualifier from both pointers.  */
  __m128i *src = (__m128i *) s;
  __m128i *dst = (__m128i *) d;
  /* 0x20 in every byte: the bit that is XORed into matching bytes.  */
  const __m128i diff = _mm_set1_epi8 (0x20);

  /* Unsigned bytes, range comparison, result expanded to a per-byte mask.  */
  const uint8_t mode = _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK;

  for (;; src++, dst++)
    {

      /* Unaligned 16-byte load of the current chunk.  */
      const __m128i chunk = _mm_loadu_si128 (src);
      /* Per the Intel intrinsics documentation, _mm_cmpistrc returns
         non-zero when the comparison mask is non-zero, i.e. at least one
         byte of the chunk fell inside 'A'..'Z'.  */
      if (_mm_cmpistrc (ranges, chunk, mode))
{

  /* Per-byte match mask, reduced to 0x20 for matching bytes and 0
     elsewhere.  */
  const __m128i tmp1 = _mm_cmpistrm (ranges, chunk, mode);
  const __m128i mask = _mm_and_si128 (tmp1, diff);

  /* XOR flips bit 0x20 on the matching bytes only, then the whole chunk
     is stored unaligned.  */
  _mm_storeu_si128 (dst, _mm_xor_si128 (chunk, mask));
}

      /* Per the Intel intrinsics documentation, _mm_cmpistrz returns
         non-zero when the second operand (the chunk) contains a NUL byte:
         this was the last chunk of the string.  */
      if (_mm_cmpistrz (ranges, chunk, mode))
break;
    }
}

#ifdef MAIN
#include <unistd.h>
#include <string.h>
/* Test driver: read up to 127 bytes from standard input, run them through
   sse_tolower_strcpy and write the result to file descriptor 2.
   Returns 0 on success and 1 when nothing could be read or the write
   failed.  */
int main(void) {
  /* Both buffers are zero-initialised: sse_tolower_strcpy works on whole
     16-byte chunks and skips the store for chunks without an uppercase
     byte, so neither the tail of src nor any part of dest may be left
     indeterminate before strlen() walks dest.  */
  char src[128] = {0}, dest[128] = {0};

  /* Reserve one byte so the terminator below always fits inside src.  */
  ssize_t n = read(0, src, sizeof(src) - 1);
  if (n < 1)
    return 1;  /* read error or empty input: nothing to convert */
  src[n] = 0;

  sse_tolower_strcpy(dest, src);

  if (write(2, dest, strlen(dest)) < 0)
    return 1;
  return 0;
}
#endif
EOF

        .file   ""
        .section        .text.unlikely,"ax",@progbits
.LCOLDB2:
        .text
.LHOTB2:
        .p2align 4,,15
        .globl  sse_tolower_strcpy
        .type   sse_tolower_strcpy, @function
sse_tolower_strcpy:
.LFB641:
        .cfi_startproc
        movdqa  .LC0(%rip), %xmm2
        movdqa  .LC1(%rip), %xmm3
        jmp     .L4
        .p2align 4,,10
        .p2align 3
.L2:
        pcmpistrm       $68, %xmm1, %xmm2
        je      .L1
.L9:
        addq    $16, %rsi
        addq    $16, %rdi
.L4:
        movdqu  (%rsi), %xmm1
        pcmpistrm       $68, %xmm1, %xmm2
        jnc     .L2
        pand    %xmm3, %xmm0
        pxor    %xmm1, %xmm0
        movups  %xmm0, (%rdi)
        pcmpistrm       $68, %xmm1, %xmm2
        jne     .L9
.L1:
        rep ret
        .cfi_endproc
.LFE641:
        .size   sse_tolower_strcpy, .-sse_tolower_strcpy
        .section        .text.unlikely
.LCOLDE2:
        .text
.LHOTE2:
        .section        .rodata.cst16,"aM",@progbits,16
        .align 16
.LC0:
        .byte   65
        .byte   90
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .byte   0
        .align 16
.LC1:
        .quad   2314885530818453536
        .quad   2314885530818453536
        .ident  "GCC: (Debian 5.1.1-12) 5.1.1 20150622"
        .section        .note.GNU-stack,"",@progbits


This would be *much* smaller and supposedly is also faster:
   text    data     bss     dec     hex filename
    228       0       0     228      e4 comment0.o
    153       0       0     153      99 comment1.o

Reply via email to