https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83358
            Bug ID: 83358
           Summary: [8 Regression] division not converted with Intel tuning
                    since r253934
           Product: gcc
           Version: 8.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: trippels at gcc dot gnu.org
                CC: hubicka at ucw dot cz
  Target Milestone: ---

The following code snippet came up on the realworldtech forum recently:

#include <stdint.h>

__attribute__((noinline, noclone)) void bin2ascii(uint64_t val, char *dst) {
  const int64_t POW10_10 = ((int64_t)10) * 1000 * 1000 * 1000;
  int64_t hix = val / POW10_10;
  int64_t lox = val % POW10_10;
  int32_t v0 = hix / 100000;
  int32_t v1 = hix % 100000;
  int32_t v2 = lox / 100000;
  int32_t v3 = lox % 100000;
  for (int i = 4; i != 0; --i) {
    dst[i + 0 * 5] = v0 % 10 + '0';
    v0 /= 10;
    dst[i + 1 * 5] = v1 % 10 + '0';
    v1 /= 10;
    dst[i + 2 * 5] = v2 % 10 + '0';
    v2 /= 10;
    dst[i + 3 * 5] = v3 % 10 + '0';
    v3 /= 10;
  }
  dst[0 * 5] = v0 + '0';
  dst[1 * 5] = v1 + '0';
  dst[2 * 5] = v2 + '0';
  dst[3 * 5] = v3 + '0';
  dst[4 * 5] = 0;
}

int main() {
  char ch[20];
  for (int i = 0; i < 10000000; i++) {
    bin2ascii(10010155021040540, ch);
  }
}

Since r253934 gcc doesn't convert the divisions anymore when tuning for Intel
with e.g. -mtune=haswell. The result is ~5x slower than before.

bin2ascii:
.LFB0:
        .cfi_startproc
        movq    %rdi, %rax
        pushq   %rbx
        .cfi_def_cfa_offset 16
        .cfi_offset 3, -16
        leaq    4(%rsi), %r9
        movl    $-858993459, %ebx
        movabsq $-2601111570856684097, %rdx
        movabsq $755578637259143235, %rcx
        mulq    %rdx
        movabsq $10000000000, %rax
        movq    %rdx, %r8
        shrq    $38, %rdx
        shrq    $33, %r8
        imulq   %r8, %rax
        subq    %rax, %rdi
        movq    %rdx, %rax
        mulq    %rcx
        shrq    $7, %rdx
        movl    %edx, %r10d
        imulq   $100000, %rdx, %rdx
        subl    %edx, %r8d
        movq    %rdi, %rdx
        shrq    $5, %rdx
        movq    %rdx, %rax
        mulq    %rcx
        movl    $10, %ecx
        shrq    $7, %rdx
        movl    %edx, %r11d
        imulq   $100000, %rdx, %rdx
        subl    %edx, %edi
.L2:
        movl    %r10d, %eax
        decq    %r9
        cltd
        idivl   %ecx
        movl    %r10d, %eax
        addl    $48, %edx
        movb    %dl, 1(%r9)
        mull    %ebx
        movl    %r8d, %eax
        movl    %edx, %r10d
        cltd
        idivl   %ecx
        movl    %r8d, %eax
        shrl    $3, %r10d
        addl    $48, %edx
        movb    %dl, 6(%r9)
        mull    %ebx
        movl    %r11d, %eax
        movl    %edx, %r8d
        cltd
        idivl   %ecx
        movl    %r11d, %eax
        shrl    $3, %r8d
        addl    $48, %edx
        movb    %dl, 11(%r9)
        mull    %ebx
        movl    %edi, %eax
        movl    %edx, %r11d
        cltd
        idivl   %ecx
        movl    %edi, %eax
        shrl    $3, %r11d
        addl    $48, %edx
        movb    %dl, 16(%r9)
        mull    %ebx
        movl    %edx, %edi
        shrl    $3, %edi
        cmpq    %r9, %rsi
        jne     .L2
        addl    $48, %r10d
        addl    $48, %r8d
        leal    48(%r11), %edx
        addl    $48, %edi
        movb    %r10b, (%rsi)
        popq    %rbx
        .cfi_def_cfa_offset 8
        movb    %r8b, 5(%rsi)
        movb    %dl, 10(%rsi)
        movb    %dil, 15(%rsi)
        movb    $0, 20(%rsi)
        ret
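For context (not part of the original report): the "division not converted" in
the summary refers to GCC replacing a division by a constant with a
multiply-high/shift sequence. Below is a minimal hand-written sketch of that
transformation for unsigned 32-bit division by 10; it uses the same magic
constant (0xCCCCCCCD == -858993459) that still appears in the listing above for
the "v0 /= 10" updates, while the "% 10" operations in the loop now go through
idivl. The helper names div10/mod10 are illustrative only.

#include <stdint.h>
#include <assert.h>

/* Quotient x / 10 via reciprocal multiplication:
   0xCCCCCCCD = ceil(2^35 / 10), so the 64-bit product shifted right
   by 35 (i.e. high half >> 3) is the exact quotient for any 32-bit
   unsigned x.  */
static inline uint32_t div10(uint32_t x) {
  return (uint32_t)(((uint64_t)x * 0xCCCCCCCDu) >> 35);
}

/* Remainder recovered from the quotient, with no divide instruction.  */
static inline uint32_t mod10(uint32_t x) {
  return x - div10(x) * 10;
}

int main() {
  /* Spot-check against the hardware division.  */
  for (uint32_t x = 0; x < 10000000; x++)
    assert(div10(x) == x / 10 && mod10(x) == x % 10);
}

With the presumed flags (gcc -O2 -mtune=haswell; the report only states the
tuning option), the slowdown in the loop presumably comes from the idivl
instructions that a multiply/shift sequence like the one above would avoid.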