https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87608

            Bug ID: 87608
           Summary: Very slow swap operations
           Product: gcc
           Version: 9.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: tkoenig at gcc dot gnu.org
  Target Milestone: ---

The following test program, which I received from somebody else (reproduced
with permission), takes about 2.6 times as many cycles when compiled with
gcc as it does when compiled with clang: 1428 cycles vs. 544 cycles,
including measurement overhead.

#include <stdio.h>

extern "C" long rdtsc(void);

/*
 * cond_swap5(a, b): compare-exchange on the two objects a and b point to.
 * Afterwards *(a) holds the smaller and *(b) the larger of the two original
 * values (equal values are left as they are).
 *
 * Requirements on the caller:
 *   - a variable named `t`, assignable from *(a), must be in scope; it is
 *     used as scratch storage and is overwritten;
 *   - `a` and `b` are evaluated several times, so they must not have side
 *     effects;
 *   - invoke the macro with a trailing semicolon.
 *
 * The body is wrapped in do { ... } while (0) so that the expansion is a
 * single statement and can be used in an un-braced if/else branch.
 */
#define cond_swap5(a,b) \
    do {\
        t = *(a);\
        *(a) = (t<*(b))?t:*(b);\
        *(b) = (t<*(b))?*(b):t;\
    } while (0)

// Generic fallback: for any size n without an explicit specialization the
// array is left exactly as it was passed in.
template<int n>
void static_sort1(int *a){
    static_cast<void>(a);  // intentionally unused: nothing to do for this size
}

// Specialization of static_sort1 for 32 ints, operating in place on
// first[0..31].
//
// It first invokes static_sort1<16> on each 16-element half, then applies a
// fixed, hard-coded sequence of cond_swap5 compare-exchange steps. Each step
// leaves the smaller of the two addressed values at the first index and the
// larger at the second. The steps are order-dependent: later steps read
// elements written by earlier ones.
//
// Unless a 16-element specialization is declared before this point, the two
// static_sort1<16> calls resolve to the primary template.
// NOTE(review): the index pattern looks like the merge stage of an odd-even
// merge sorting network, which would only yield a sorted array if both halves
// were already sorted -- not verified here.
template<>
void static_sort1<32>(int* first){
        // Scratch variable required by the cond_swap5 macro.
        int t;
    static_sort1<16>(first);
    static_sort1<16>(first+16);

    // Exchanges among indices that are multiples of 4 (0, 4, ..., 28).
    cond_swap5(first + 0u, first + 16u);
    cond_swap5(first + 8u, first + 24u);
    cond_swap5(first + 8u, first + 16u);
    cond_swap5(first + 4u, first + 20u);
    cond_swap5(first + 12u, first + 28u);
    cond_swap5(first + 12u, first + 20u);
    cond_swap5(first + 4u, first + 8u);
    cond_swap5(first + 12u, first + 16u);
    cond_swap5(first + 20u, first + 24u);
    // Exchanges among indices of the form 4k+2 (2, 6, ..., 30).
    cond_swap5(first + 2u, first + 18u);
    cond_swap5(first + 10u, first + 26u);
    cond_swap5(first + 10u, first + 18u);
    cond_swap5(first + 6u, first + 22u);
    cond_swap5(first + 14u, first + 30u);
    cond_swap5(first + 14u, first + 22u);
    cond_swap5(first + 6u, first + 10u);
    cond_swap5(first + 14u, first + 18u);
    cond_swap5(first + 22u, first + 26u);
    // Pairs (4k+2, 4k+4) for k = 0..6: neighbouring even indices.
    cond_swap5(first + 2u, first + 4u);
    cond_swap5(first + 6u, first + 8u);
    cond_swap5(first + 10u, first + 12u);
    cond_swap5(first + 14u, first + 16u);
    cond_swap5(first + 18u, first + 20u);
    cond_swap5(first + 22u, first + 24u);
    cond_swap5(first + 26u, first + 28u);
    // Exchanges among indices of the form 4k+1 (1, 5, ..., 29).
    cond_swap5(first + 1u, first + 17u);
    cond_swap5(first + 9u, first + 25u);
    cond_swap5(first + 9u, first + 17u);
    cond_swap5(first + 5u, first + 21u);
    cond_swap5(first + 13u, first + 29u);
    cond_swap5(first + 13u, first + 21u);
    cond_swap5(first + 5u, first + 9u);
    cond_swap5(first + 13u, first + 17u);
    cond_swap5(first + 21u, first + 25u);
    // Exchanges among indices of the form 4k+3 (3, 7, ..., 31).
    cond_swap5(first + 3u, first + 19u);
    cond_swap5(first + 11u, first + 27u);
    cond_swap5(first + 11u, first + 19u);
    cond_swap5(first + 7u, first + 23u);
    cond_swap5(first + 15u, first + 31u);
    cond_swap5(first + 15u, first + 23u);
    cond_swap5(first + 7u, first + 11u);
    cond_swap5(first + 15u, first + 19u);
    cond_swap5(first + 23u, first + 27u);
    // Pairs (4k+3, 4k+5) for k = 0..6: neighbouring odd indices.
    cond_swap5(first + 3u, first + 5u);
    cond_swap5(first + 7u, first + 9u);
    cond_swap5(first + 11u, first + 13u);
    cond_swap5(first + 15u, first + 17u);
    cond_swap5(first + 19u, first + 21u);
    cond_swap5(first + 23u, first + 25u);
    cond_swap5(first + 27u, first + 29u);
    // Pairs (2k+1, 2k+2) for k = 0..14: adjacent elements; indices 0 and 31
    // are not touched in this final group.
    cond_swap5(first + 1u, first + 2u);
    cond_swap5(first + 3u, first + 4u);
    cond_swap5(first + 5u, first + 6u);
    cond_swap5(first + 7u, first + 8u);
    cond_swap5(first + 9u, first + 10u);
    cond_swap5(first + 11u, first + 12u);
    cond_swap5(first + 13u, first + 14u);
    cond_swap5(first + 15u, first + 16u);
    cond_swap5(first + 17u, first + 18u);
    cond_swap5(first + 19u, first + 20u);
    cond_swap5(first + 21u, first + 22u);
    cond_swap5(first + 23u, first + 24u);
    cond_swap5(first + 25u, first + 26u);
    cond_swap5(first + 27u, first + 28u);
    cond_swap5(first + 29u, first + 30u);
};

// Benchmark driver: fills a 32-element array with a fixed pattern, times one
// call to static_sort1<32> using the external rdtsc() routine, then prints
// the resulting array followed by the measured counter difference.
int main(){
    int values[32];
    for (int k = 0; k < 32; ++k) {
        values[k] = 20*k - 32*k*k;
    }

    // Only the single static_sort1<32> call sits between the two readings.
    const long start = rdtsc();
    static_sort1<32>(values);
    const long stop = rdtsc();

    for (int k = 0; k < 32; ++k) {
        printf("%d ", values[k]);
    }
    printf("\n %ld\n", stop - start);
    return 0;
}
$ cat rdtsc.s
        # Defines the global function symbol `rdtsc`, which the C++ test
        # program declares as `extern "C" long rdtsc(void)`.
        .file   "rdtsc.s"
        .text
        .globl  rdtsc
        .type   rdtsc, @function
rdtsc:
.LFB0:
        # Execute the RDTSC instruction; per the x86 ISA reference it places
        # the time-stamp counter in EDX:EAX (high half in EDX, low in EAX).
        rdtsc
        # Shift %rdx left by 32 bits and OR it into %rax, so %rax holds the
        # combined 64-bit value when the function returns.
        shl     $32, %rdx
        or      %rdx, %rax
        ret
.LFE0:
        .size   rdtsc, .-rdtsc
        # Empty .note.GNU-stack section.
        .section        .note.GNU-stack,"",@progbits
$ g++ -march=native -mtune=native -O3 j2.c rdtsc.s
$ ./a.out
-7872 -17952 -8908 -19500 -10008 -21112 -11172 -22788 -12400 -24528 -13692
-26332 -15048 -28200 -16468 -30132 0 -1888 -12 -2412 -88 -3000 -228 -3652 -432
-4368 -700 -5148 -1032 -5992 -1428 -6900 
 1428
$ clang++ -O3 -stdlib=libc++ j2.c rdtsc.s
clang-3.8: warning: treating 'c' input as 'c++' when in C++ mode, this behavior
is deprecated
$ ./a.out
-7872 -17952 -8908 -19500 -10008 -21112 -11172 -22788 -12400 -24528 -13692
-26332 -15048 -28200 -16468 -30132 0 -1888 -12 -2412 -88 -3000 -228 -3652 -432
-4368 -700 -5148 -1032 -5992 -1428 -6900 
 544

This is on x86_64-pc-linux-gnu with an AMD Ryzen 7.

Reply via email to