https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83358

            Bug ID: 83358
           Summary: [8 Regression] division not converted with Intel
                    tuning since r253934
           Product: gcc
           Version: 8.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: trippels at gcc dot gnu.org
                CC: hubicka at ucw dot cz
  Target Milestone: ---

The following code snippet came up on the realworldtech forum recently:

#include <stdint.h>

/*
 * Format val as exactly 20 zero-padded decimal digits followed by a NUL
 * terminator.
 *
 * val: the unsigned 64-bit value to convert.
 * dst: output buffer; indices 0..20 are written (20 digit characters plus
 *      the terminating 0), so it must provide at least 21 bytes.
 *
 * The value is split into four groups of five decimal digits (v0 is the
 * most significant group, v3 the least significant); the four groups are
 * then converted side by side, one digit position per loop iteration.
 *
 * noinline/noclone keep the function as a separate, unspecialized symbol
 * (presumably so its generated code can be inspected on its own).
 */
__attribute__((noinline, noclone))
void bin2ascii(uint64_t val, char *dst) {
  const int64_t POW10_10 = ((int64_t)10) * 1000 * 1000 * 1000;
  /* Upper and lower ten decimal digits of val. */
  int64_t hix = val / POW10_10;
  int64_t lox = val % POW10_10;
  /* Split each ten-digit half into two five-digit groups. */
  int32_t v0 = hix / 100000;
  int32_t v1 = hix % 100000;
  int32_t v2 = lox / 100000;
  int32_t v3 = lox % 100000;
  /* Emit digit positions 4..1 of every group, least significant first. */
  for (int i = 4; i != 0; --i) {
    dst[i + 0 * 5] = v0 % 10 + '0';
    v0 /= 10;
    dst[i + 1 * 5] = v1 % 10 + '0';
    v1 /= 10;
    dst[i + 2 * 5] = v2 % 10 + '0';
    v2 /= 10;
    dst[i + 3 * 5] = v3 % 10 + '0';
    v3 /= 10;
  }
  /* After four divisions by 10 each group holds only its leading digit. */
  dst[0 * 5] = v0 + '0';
  dst[1 * 5] = v1 + '0';
  dst[2 * 5] = v2 + '0';
  dst[3 * 5] = v3 + '0';
  /* NUL terminator at index 20. */
  dst[4 * 5] = 0;
}

/*
 * Benchmark driver: converts the same constant ten million times so that
 * the cost of bin2ascii dominates the run time.
 */
int main(void) {
  /* bin2ascii writes 20 digits plus a terminating NUL (dst[0..20]),
     so the buffer must hold 21 bytes. */
  char ch[21];
  for (int i = 0; i < 10000000; i++) {
    bin2ascii(10010155021040540, ch);
  }
}

Since r253934, GCC no longer converts divisions when tuning for Intel, e.g.
with -mtune=haswell; note the idivl instructions in the loop of the generated
code below. The result is ~5x slower than before.

bin2ascii:
.LFB0:
        .cfi_startproc
        movq    %rdi, %rax
        pushq   %rbx
        .cfi_def_cfa_offset 16
        .cfi_offset 3, -16
        leaq    4(%rsi), %r9
        movl    $-858993459, %ebx
        movabsq $-2601111570856684097, %rdx
        movabsq $755578637259143235, %rcx
        mulq    %rdx
        movabsq $10000000000, %rax
        movq    %rdx, %r8
        shrq    $38, %rdx
        shrq    $33, %r8
        imulq   %r8, %rax
        subq    %rax, %rdi
        movq    %rdx, %rax
        mulq    %rcx
        shrq    $7, %rdx
        movl    %edx, %r10d
        imulq   $100000, %rdx, %rdx
        subl    %edx, %r8d
        movq    %rdi, %rdx
        shrq    $5, %rdx
        movq    %rdx, %rax
        mulq    %rcx
        movl    $10, %ecx
        shrq    $7, %rdx
        movl    %edx, %r11d
        imulq   $100000, %rdx, %rdx
        subl    %edx, %edi
.L2:
        movl    %r10d, %eax
        decq    %r9
        cltd
        idivl   %ecx
        movl    %r10d, %eax
        addl    $48, %edx
        movb    %dl, 1(%r9)
        mull    %ebx
        movl    %r8d, %eax
        movl    %edx, %r10d
        cltd
        idivl   %ecx
        movl    %r8d, %eax
        shrl    $3, %r10d
        addl    $48, %edx
        movb    %dl, 6(%r9)
        mull    %ebx
        movl    %r11d, %eax
        movl    %edx, %r8d
        cltd
        idivl   %ecx
        movl    %r11d, %eax
        shrl    $3, %r8d
        addl    $48, %edx
        movb    %dl, 11(%r9)
        mull    %ebx
        movl    %edi, %eax
        movl    %edx, %r11d
        cltd
        idivl   %ecx
        movl    %edi, %eax
        shrl    $3, %r11d
        addl    $48, %edx
        movb    %dl, 16(%r9)
        mull    %ebx
        movl    %edx, %edi
        shrl    $3, %edi
        cmpq    %r9, %rsi
        jne     .L2
        addl    $48, %r10d
        addl    $48, %r8d
        leal    48(%r11), %edx
        addl    $48, %edi
        movb    %r10b, (%rsi)
        popq    %rbx
        .cfi_def_cfa_offset 8
        movb    %r8b, 5(%rsi)
        movb    %dl, 10(%rsi)
        movb    %dil, 15(%rsi)
        movb    $0, 20(%rsi)
        ret

Reply via email to