https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111551
Jan Hubicka <hubicka at gcc dot gnu.org> changed:
What |Removed |Added
----------------------------------------------------------------------------
Ever confirmed|0 |1
Last reconfirmed| |2025-03-05
Status|UNCONFIRMED |NEW
--- Comment #2 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
Building imagemagick with -Ofast -fprofile-use (no -flto) yields:
52.50% imagick_r_peak. imagick_r_peak.trunk-pgolto-Ofast-native-m64
[.] MorphologyApply.cold
28.65% imagick_r_peak. imagick_r_peak.trunk-pgolto-Ofast-native-m64
[.] MeanShiftImage
10.81% imagick_r_peak. imagick_r_peak.trunk-pgolto-Ofast-native-m64
[.] GetVirtualPixelsFromNexus
3.67% imagick_r_peak. imagick_r_peak.trunk-pgolto-Ofast-native-m64
[.] GetOneCacheViewVirtualPixel
2.26% imagick_r_peak. imagick_r_peak.trunk-pgolto-Ofast-native-m64
[.] MagickRound
0.11% imagick_r_peak. imagick_r_peak.trunk-pgolto-Ofast-native-m64
[.] HorizontalFilter
So we declare the hot loop of MorphologyApply as cold. This does not seem to be
due to the train run missing a hot spot of the ref run, since it reproduces even if
the train run data is replaced by the ref run data.
The hot loop is the kernel of Morphology:
Percent │ │ je 3143 ▒
│ │for (v=0; v < (ssize_t) kernel->height; v++) { ▒
│2dcb:│ inc %rcx ▒
│ │ jmp 2d6a ▒
│ │return((Quantum) 0); ▒
│2dd0:│ xor %eax,%eax ▒
│ │ jmp 207d ▒
│ │return(QuantumRange); ▒
│2dd7:│ or $0xffffffff,%eax ▒
│ │ jmp 207d ▒
│ │return((Quantum) (value+0.5f)); ▒
│2ddf:│ vaddss 0xbfe65(%rip),%xmm0,%xmm0 # 53▒
│ │ vcvttss2si %xmm0,%eax ▒
│ │ jmp 1f02 ▒
│ │result.red += (*k)*k_pixels[u].red; ▒
│2df0:│ imul $0xfffffffffffffff8,%rax,%r9 ▒
5.68 │ │ vxorpd %xmm7,%xmm7,%xmm7 ▒
0.14 │ │ vmovsd (%rcx,%r9,1),%xmm5 ▒
0.05 │ │ movzwl 0x4(%rbx,%rax,8),%r9d ▒
6.26 │ │ vcvtsi2sd %r9d,%xmm7,%xmm6 ▒
│ │result.green += (*k)*k_pixels[u].green; ▒
9.60 │ │ movzwl 0x2(%rbx,%rax,8),%r9d ▒
│ │result.red += (*k)*k_pixels[u].red; ▒
0.00 │ │ vfmadd231sd %xmm6,%xmm5,%xmm4 ▒
│ │result.green += (*k)*k_pixels[u].green; ▒
12.66 │ │ vcvtsi2sd %r9d,%xmm7,%xmm6 ▒
│ │result.blue += (*k)*k_pixels[u].blue; ▒
7.89 │ │ movzwl (%rbx,%rax,8),%r9d ▒
│ │result.green += (*k)*k_pixels[u].green; ▒
│ │ vfmadd231sd %xmm6,%xmm5,%xmm3 ▒
│ │result.blue += (*k)*k_pixels[u].blue; ▒
21.48 │ │ vcvtsi2sd %r9d,%xmm7,%xmm6 ▒
│ │result.opacity += (*k)*k_pixels[u].opacity; ▒
2.67 │ │ movzwl 0x6(%rbx,%rax,8),%r9d ▒
│ │result.blue += (*k)*k_pixels[u].blue; ▒
│ │ vfmadd231sd %xmm6,%xmm5,%xmm2 ▒
│ │result.opacity += (*k)*k_pixels[u].opacity; ▒
14.91 │ │ vcvtsi2sd %r9d,%xmm7,%xmm6 ▒
2.07 │ │ vfmadd231sd %xmm6,%xmm5,%xmm1 ▒
│ │if ( image->colorspace == CMYKColorspace) ▒
11.53 │ │ cmp $0xc,%r13d ▒
4.63 │ │ je 2e4b ▒
│ │for (u=0; u < (ssize_t) kernel->width; u++, k--) { ▒
│2e43:│ inc %rax ▒
0.00 │ └──jmp 1f68 ▒
If I replace the train run with the refrate run, we consider the loop hot, so it
seems like a bad train run.
│ result.opacity += (*k)*k_pixels[u].opacity;
3.56 │700: vmovdqu (%rdx),%ymm0
│ result.red += (*k)*k_pixels[u].red;
5.34 │ vmovupd (%r8),%ymm2
3.51 │ add $0x20,%rdx
2.15 │ sub $0x20,%r8
│ result.opacity += (*k)*k_pixels[u].opacity;
4.38 │ vpmovzxwd %xmm0,%ymm1
1.26 │ vextracti128 $0x1,%ymm0,%xmm0
2.52 │ vpermpd $0xaa,%ymm2,%ymm11
3.37 │ vpermpd $0x55,%ymm2,%ymm10
3.53 │ vcvtdq2pd %xmm1,%ymm12
1.55 │ vextracti128 $0x1,%ymm1,%xmm1
3.61 │ vpmovzxwd %xmm0,%ymm0
1.80 │ vbroadcastsd %xmm2,%ymm9
5.27 │ vcvtdq2pd %xmm1,%ymm1
5.37 │ vpermpd $0xff,%ymm2,%ymm2
2.68 │ vfmadd231pd %ymm12,%ymm2,%ymm3
2.62 │ vfmadd231pd %ymm11,%ymm1,%ymm6
6.87 │ vcvtdq2pd %xmm0,%ymm1
4.28 │ vextracti128 $0x1,%ymm0,%xmm0
8.40 │ vcvtdq2pd %xmm0,%ymm0
4.32 │ vfmadd231pd %ymm1,%ymm10,%ymm8
4.22 │ vfmadd231pd %ymm0,%ymm9,%ymm7
│ for (u=0; u < (ssize_t) kernel->width; u++, k--) {
4.17 │ cmp %rdx,%r9
│ ↑ jne 700
So it seems we simply miss the vectorization because we optimize for size.
Runtime is:
217s with -Ofast -fprofile-use and train run
170s with -Ofast
165s with -Ofast -fprofile-use and train run hacked to be refrate run