https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78379
--- Comment #8 from Jerry DeLisle <jvdelisle at gcc dot gnu.org> --- (In reply to Thomas Koenig from comment #6) > > You may notice I was invoking the wrong executable in what I posted in > > comment #3. I did rerun the correct one several times and tried it with > > -mavx -mprefer-avx128. I get the same poor results regardless. > > Several things could go wrong here... > > If you run the benchmark under gdb and break, then type > "disassemble $pc,$pc+200", do you actually end up in the right > program part (the one with AVX instructions)? 452 f32 += t1[l - ll + 1 + ((i - ii + 3) << 8) - 257] (gdb) disassemble $pc,$pc+200 Dump of assembler code from 0x7ffff7af3554 to 0x7ffff7af361c: => 0x00007ffff7af3554 <aux_matmul_r8+5220>: vaddpd %ymm12,%ymm4,%ymm4 0x00007ffff7af3559 <aux_matmul_r8+5225>: vmulpd %ymm10,%ymm15,%ymm12 0x00007ffff7af355e <aux_matmul_r8+5230>: vaddpd %ymm11,%ymm5,%ymm5 0x00007ffff7af3563 <aux_matmul_r8+5235>: vmulpd %ymm14,%ymm15,%ymm15 0x00007ffff7af3568 <aux_matmul_r8+5240>: vmulpd %ymm10,%ymm13,%ymm10 0x00007ffff7af356d <aux_matmul_r8+5245>: vaddpd %ymm12,%ymm6,%ymm6 0x00007ffff7af3572 <aux_matmul_r8+5250>: vmulpd %ymm14,%ymm13,%ymm14 0x00007ffff7af3577 <aux_matmul_r8+5255>: vaddpd %ymm15,%ymm8,%ymm8 0x00007ffff7af357c <aux_matmul_r8+5260>: vaddpd %ymm10,%ymm7,%ymm7 0x00007ffff7af3581 <aux_matmul_r8+5265>: vaddpd %ymm14,%ymm9,%ymm9 0x00007ffff7af3586 <aux_matmul_r8+5270>: ja 0x7ffff7af3433 <aux_matmul_r8+4931> 0x00007ffff7af358c <aux_matmul_r8+5276>: mov -0x801f8(%rbp),%rdx 0x00007ffff7af3593 <aux_matmul_r8+5283>: vhaddpd %ymm9,%ymm9,%ymm13 0x00007ffff7af3598 <aux_matmul_r8+5288>: vhaddpd %ymm8,%ymm8,%ymm15 0x00007ffff7af359d <aux_matmul_r8+5293>: vhaddpd %ymm7,%ymm7,%ymm7 0x00007ffff7af35a1 <aux_matmul_r8+5297>: vperm2f128 $0x1,%ymm13,%ymm13,%ymm11 0x00007ffff7af35a7 <aux_matmul_r8+5303>: vhaddpd %ymm5,%ymm5,%ymm5 0x00007ffff7af35ab <aux_matmul_r8+5307>: vperm2f128 $0x1,%ymm15,%ymm15,%ymm8 0x00007ffff7af35b1 <aux_matmul_r8+5313>: vaddpd 
%ymm11,%ymm13,%ymm12 0x00007ffff7af35b6 <aux_matmul_r8+5318>: vperm2f128 $0x1,%ymm7,%ymm7,%ymm13 0x00007ffff7af35bc <aux_matmul_r8+5324>: vaddpd %ymm8,%ymm15,%ymm14 0x00007ffff7af35c1 <aux_matmul_r8+5329>: vhaddpd %ymm6,%ymm6,%ymm6 ---Type <return> to continue, or q <return> to quit--- 0x00007ffff7af35c5 <aux_matmul_r8+5333>: vaddsd -0x80068(%rbp),%xmm12,%xmm10 0x00007ffff7af35cd <aux_matmul_r8+5341>: vaddsd -0x80070(%rbp),%xmm14,%xmm9 0x00007ffff7af35d5 <aux_matmul_r8+5349>: vperm2f128 $0x1,%ymm5,%ymm5,%ymm14 0x00007ffff7af35db <aux_matmul_r8+5355>: vhaddpd %ymm4,%ymm4,%ymm4 0x00007ffff7af35df <aux_matmul_r8+5359>: vaddpd %ymm13,%ymm7,%ymm11 0x00007ffff7af35e4 <aux_matmul_r8+5364>: vmovsd %xmm10,-0x80068(%rbp) 0x00007ffff7af35ec <aux_matmul_r8+5372>: vperm2f128 $0x1,%ymm6,%ymm6,%ymm10 0x00007ffff7af35f2 <aux_matmul_r8+5378>: vperm2f128 $0x1,%ymm4,%ymm4,%ymm13 0x00007ffff7af35f8 <aux_matmul_r8+5384>: vmovsd %xmm9,-0x80070(%rbp) 0x00007ffff7af3600 <aux_matmul_r8+5392>: vaddpd %ymm14,%ymm5,%ymm9 0x00007ffff7af3605 <aux_matmul_r8+5397>: vhaddpd %ymm0,%ymm0,%ymm0 0x00007ffff7af3609 <aux_matmul_r8+5401>: vaddsd -0x80058(%rbp),%xmm11,%xmm12 0x00007ffff7af3611 <aux_matmul_r8+5409>: vaddpd %ymm10,%ymm6,%ymm15 0x00007ffff7af3616 <aux_matmul_r8+5414>: vaddpd %ymm13,%ymm4,%ymm11 0x00007ffff7af361b <aux_matmul_r8+5419>: vperm2f128 $0x1,%ymm0,%ymm0,%ymm13 End of assembler dump. > > Or does your machine prefer AVX128? > > To find out, what are the timings for inline code using > > -mavx -Ofast > > -mavx -mprefer=avx128 -Ofast > > ? 
$ gfc -finline-matmul-limit=64 -Ofast compare.f90 $ ./a.out ========================================================= ================ MEASURED GIGAFLOPS = ========================================================= Matmul Matmul fixed Matmul variable Size Loops explicit refMatmul assumed explicit ========================================================= 2 2000 4.933 0.045 0.086 0.144 4 2000 1.418 0.225 0.271 0.347 8 2000 2.168 0.616 1.296 1.830 16 2000 5.330 2.824 1.784 2.907 32 2000 6.239 3.488 1.446 3.406 64 2000 2.650 2.746 1.552 2.691 $ gfc -finline-matmul-limit=64 -mavx -Ofast compare.f90 $ ./a.out ========================================================= ================ MEASURED GIGAFLOPS = ========================================================= Matmul Matmul fixed Matmul variable Size Loops explicit refMatmul assumed explicit ========================================================= 2 2000 6.934 0.042 0.091 0.134 4 2000 1.320 0.181 0.365 0.252 8 2000 1.007 0.446 1.595 0.982 16 2000 0.581 1.163 2.411 1.180 32 2000 1.346 1.276 2.061 1.277 64 2000 1.397 1.327 2.288 1.328 $ gfc -finline-matmul-limit=64 -mavx -mprefer-avx128 -Ofast compare.f90 $ ./a.out ========================================================= ================ MEASURED GIGAFLOPS = ========================================================= Matmul Matmul fixed Matmul variable Size Loops explicit refMatmul assumed explicit ========================================================= 2 2000 5.021 0.045 0.088 0.139 4 2000 1.607 0.202 0.288 0.341 8 2000 2.482 0.575 0.743 1.861 16 2000 5.674 2.804 1.809 2.792 32 2000 6.323 3.460 1.478 3.293 64 2000 2.714 2.832 1.582 2.694 If I put -mavx -mprefer-avx128 in the Makefile.am, I get results as good as or better than without your patch. I also see none of the HAVE_AVX* macros defined in config. 
$ gfc -finline-matmul-limit=0 -Ofast compare.f90 $ ./a.out ========================================================= ================ MEASURED GIGAFLOPS = ========================================================= Matmul Matmul fixed Matmul variable Size Loops explicit refMatmul assumed explicit ========================================================= 2 2000 0.043 0.041 0.034 0.043 4 2000 0.272 0.234 0.223 0.256 8 2000 0.835 1.687 1.627 1.709 16 2000 2.886 2.887 2.859 2.869 32 2000 4.733 3.494 4.755 4.652 64 2000 6.933 2.837 6.933 6.877 128 2000 7.949 3.285 8.705 7.914 256 477 10.040 3.447 9.999 9.951 512 59 8.885 2.341 8.923 8.940 1024 7 8.937 1.367 8.978 8.991 2048 1 8.799 1.672 8.831 8.854 For what it is worth, config.h.in contains the following: /* Define if AVX instructions can be compiled. */ #undef HAVE_AVX /* Define if AVX2 instructions can be compiled. */ #undef HAVE_AVX2 /* Define if AVX512f instructions can be compiled. */ #undef HAVE_AVX512F