https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108376

--- Comment #3 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
If I make the arrays random then GCC code is indeed faster:
#include <malloc.h>
#include <math.h>
#include <stdlib.h>

/* Element type shared by every array below. */
typedef float real_t;

/* Outer repetition count and the 1-D / 2-D array extents. */
#define iterations 1000000
#define LEN_1D 32000
#define LEN_2D 256

/* One-dimensional arrays, LEN_1D elements each (zero-initialized globals). */
real_t a[LEN_1D];
real_t b[LEN_1D];
real_t c[LEN_1D];
real_t d[LEN_1D];
real_t e[LEN_1D];

/* Square LEN_2D x LEN_2D arrays. */
real_t aa[LEN_2D][LEN_2D];
real_t bb[LEN_2D][LEN_2D];
real_t cc[LEN_2D][LEN_2D];

/* Global scalar. */
real_t qq;
/*
 * Benchmark driver: fills a[] and b[] with small pseudo-random integers,
 * then repeats a doubly-guarded multiply-add over the global arrays
 * `iterations` times.
 *
 * Returns 0. The previous version returned a local that was never assigned,
 * so the process exit status came from an indeterminate value.
 */
int
main(void)
{
    /* rand() % 5 - 3 lies in [-3, 1] and rand() % 6 - 3 in [-3, 2], so both
       conditions in the kernel below are taken for only part of the elements. */
    for (int i = 0; i < LEN_1D; i++)
    {
       a[i]=(rand() %5) - 3;
       b[i]=(rand() %6) - 3;
    }
    for (int nl = 0; nl < iterations; nl++) {
        for (int i = 0; i < LEN_1D; i++) {
            /* Update c[i] only where a[i] is negative and b[i] exceeds it. */
            if (a[i] < (real_t)0.) {
                if (b[i] > a[i]) {
                    c[i] += d[i] * e[i];
                }
            }
        }
        //dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }

    return 0;
}

jh@alberti:~/tsvc/bin> ~/aocc-compiler-4.0.0/bin/clang -Ofast s1279.c
-march=native
s1279.c:23:14: warning: implicit declaration of function 'rand' is invalid in
C99 [-Wimplicit-function-declaration]
       a[i]=(rand() %5) - 3;
             ^
1 warning generated.
jh@alberti:~/tsvc/bin> time ./a.out

real    0m5.638s
user    0m5.636s
sys     0m0.000s
jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast s1279.c -march=native
s1279.c: In function 'main':
s1279.c:23:14: warning: implicit declaration of function 'rand'
[-Wimplicit-function-declaration]
   23 |        a[i]=(rand() %5) - 3;
      |              ^~~~
jh@alberti:~/tsvc/bin> time ./a.out

real    0m2.791s
user    0m2.790s
sys     0m0.000s


Sorry for the wrong code; just for reference, the loop compiles as:
.L4:
        xorl    %eax, %eax
        .p2align 4
        .p2align 3
.L3:
        vmovaps a(%rax), %ymm2
        vmovaps b(%rax), %ymm3
        vmovaps c(%rax), %ymm6
        addq    $32, %rax
        vmovaps c-32(%rax), %ymm0
        vmovaps e-32(%rax), %ymm4
        vcmpps  $1, %ymm1, %ymm2, %k1
        vcmpps  $14, %ymm2, %ymm3, %k1{%k1}
        vfmadd231ps     d-32(%rax), %ymm4, %ymm0{%k1}
        vfmadd231ps     d-32(%rax), %ymm4, %ymm0
        vblendmps       %ymm0, %ymm6, %ymm0{%k1}
        vmovaps %ymm0, c-32(%rax)
        cmpq    $128000, %rax
        jne     .L3
        decl    %edx
        jne     .L4

Reply via email to