https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106236
Bug ID: 106236 Summary: Not a bug, bad performance (with GCC 11.3.0 - O3) of a small etude in C Product: gcc Version: og11 (devel/omp/gcc-11) Status: UNCONFIRMED Severity: normal Priority: P3 Component: c Assignee: unassigned at gcc dot gnu.org Reporter: sanmayce at hotmail dot com Target Milestone: --- Created attachment 53279 --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=53279&action=edit The full C source, along with how to compile. The following C etude is not compiled well by GCC 11.3.0 -O3; what is unpleasant to look at is the unnecessary jumps that GCC generates: // 'Magnetica' partitioning, mainloop rev.7p [ for (;;) { for (;Pivot < *Jndx; Jndx--) { } // Jndx could be PR i.e. PR == Jndx if (PR == Jndx) break; //if (PR == Jndx) goto KUH; //] QS_bench_r14_CLANG_14.0.1_rev5bypass.exe Glupendxr DNA Done in 240/59 seconds. //PR++; //]<----+ //PR = PR + !!(PR - Jndx); //] | //PR = PR + (PR != Jndx); //]------+ QS_bench_r14_CLANG_14.0.1_rev5bypass.exe Glupendxr DNA Done in 256/63 seconds. PR++; M18_swapUnconditional (PR, Jndx); // Inhere Pivot is either == or > *(PR) when PR<Jndx if (Pivot > *(PR)) { *PL=*(PR); PL++; *(PR)=Pivot; } } //M18_SwapConditional_ifXbY_BUGGY((uint64_t)Jndx, (uint64_t)PR, PR, Jndx); //] QS_bench_r14_CLANG_14.0.1_rev5bypass.exe Glupendxr DNA Done in 239/57 seconds. //PR = PR - (PR > Jndx); //] //PR = PR + M18_SwapConditional_ifXbY_BUGGY_DidWeSwap((uint64_t)Jndx, (uint64_t)PR, PR, Jndx); //] QS_bench_r14_CLANG_14.0.1_rev5bypass.exe Glupendxr DNA Done in 233/57 seconds. KUH:; // 'Magnetica' partitioning, mainloop rev.7p ] // QS_bench_r14_CLANG_14.0.1_rev5bypass.exe Glupendxr DNA; Done in 236/33 seconds. // QS_bench_r14_GCC11.3.0_rev5bypass.exe Glupendxr DNA; Done in 239/37 seconds. // QS_bench_r14_ICL19.0_rev5bypass.exe Glupendxr DNA; Done in 244/32 seconds. 
// clang_14.0.1 -O3 -mavx2 -S -fverbose-asm /* // 'Magnetica' partitioning, mainloop rev.7p [ .LBB181_7: ]<-----+ movq %rax, (%rsi) | movq %rsi, %rdi | .p2align 4, 0x90 | .LBB181_8: ]<--+ | addq $8, %rcx | | .p2align 4, 0x90 | | .LBB181_9: ]<-+ | | movq -8(%rcx), %rbx | | | addq $-8, %rcx | | | cmpq %rbx, %rax | | | jb .LBB181_9 ]--+ | | cmpq %rdi, %rcx | | je .LBB181_13 | | leaq 8(%rdi), %rsi | | movq 8(%rdi), %r14 | | movq %rbx, 8(%rdi) | | movq %r14, (%rcx) | | movq 8(%rdi), %rbx | | movq %rsi, %rdi | | cmpq %rbx, %rax | | jbe .LBB181_8 ]---+ | movq %rbx, (%r11) | addq $8, %r11 | jmp .LBB181_7 ]------+ // 'Magnetica' partitioning, mainloop rev.7p ] */ // gcc_11.3.0 -S -O3 -mavx2 -m64 -static -fomit-frame-pointer /* // 'Magnetica' partitioning, mainloop rev.7p [ .L5597: ]<----------------------+<-+ cmpq %rcx, %r8 | | jb .L5598 ]-------------------+ | | .L5609: ]<--+ | | | leaq 8(%rax), %r11 | | | | cmpq %rax, %rdx | | | | je .L5599 | | | | movq 8(%rax), %r10 | | | | movq %rcx, 8(%rax) | | | | movq %r10, (%rdx) | | | | movq 8(%rax), %rcx | | | | cmpq %r8, %rcx | | | | jnb .L5605 | ]---+ | | | movq %rcx, (%r9) | | | | | addq $8, %r9 | | | | | movq %r8, 8(%rax) | | | | | movq (%rdx), %rcx | | | | | movq %r11, %rax | | | | | cmpq %rcx, %r8 | | | | | jnb .L5609 ]---+ | | | | .L5598: | ]<--------+ | | movq -8(%rdx), %rcx | | | subq $8, %rdx | | | jmp .L5597 | ]------------+ | .p2align 4,,10 | | .p2align 3 | | .L5605: ]<-------+ | movq %r10, %rcx | movq %r11, %rax | jmp .L5597 ]--------------------------+ // 'Magnetica' partitioning, mainloop rev.7p ] */