https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109812
--- Comment #9 from Jan Hubicka <hubicka at gcc dot gnu.org> --- Oddly enough, a simplified version of the loop does get SLP vectorized for me: struct rgb {unsigned char r,g,b;} *rgbs; int *addr; double *weights; struct drgb {double r,g,b;}; struct drgb sum() { struct drgb r; for (int i = 0; i < 100000; i++) { int j = addr[i]; double w = weights[i]; r.r += rgbs[j].r * w; r.g += rgbs[j].g * w; r.b += rgbs[j].b * w; } return r; } I get: .L2: movslq (%r9,%rdx,4), %rax vmovsd (%r8,%rdx,8), %xmm1 incq %rdx leaq (%rax,%rax,2), %rax addq %rsi, %rax movzbl (%rax), %ecx vmovddup %xmm1, %xmm4 vmovd %ecx, %xmm0 movzbl 1(%rax), %ecx movzbl 2(%rax), %eax vpinsrd $1, %ecx, %xmm0, %xmm0 vcvtdq2pd %xmm0, %xmm0 vfmadd231pd %xmm4, %xmm0, %xmm2 vcvtsi2sdl %eax, %xmm5, %xmm0 vfmadd231sd %xmm1, %xmm0, %xmm3 cmpq $100000, %rdx jne .L2 I think the actual loop is: <bb 53> [local count: 44202554]: _106 = _262->pixel; _109 = *source_231(D).columns; <bb 54> [local count: 401841405]: # pixel$green_332 = PHI <_124(89), pixel$green_265(53)> # i_357 = PHI <i_298(89), 0(53)> # pixel$red_371 = PHI <_119(89), pixel$red_263(53)> # pixel$blue_377 = PHI <_129(89), pixel$blue_267(53)> i.51_102 = (long unsigned int) i_357; _103 = i.51_102 * 16; _104 = _262 + _103; _105 = _104->pixel; _107 = _105 - _106; _108 = (long unsigned int) _107; _110 = _108 * _109; _112 = _110 + _621; weight_297 = _104->weight; _113 = _112 * 4; _114 = _276 + _113; _115 = _114->red; _116 = (int) _115; _117 = (double) _116; _118 = _117 * weight_297; _119 = _118 + pixel$red_371; _120 = _114->green; _121 = (int) _120; _122 = (double) _121; _123 = _122 * weight_297; _124 = _123 + pixel$green_332; _125 = _114->blue; _126 = (int) _125; _127 = (double) _126; _128 = _127 * weight_297; _129 = _128 + pixel$blue_377; i_298 = i_357 + 1; if (n_195 > i_298) goto <bb 89>; [89.00%] else goto <bb 118>; [11.00%] <bb 118> [local count: 44202554]: # _607 = PHI <_124(54)> # _606 = PHI <_119(54)> # _605 = PHI <_129(54)> goto <bb 55>; [100.00%] <bb 89> [local count: 
357638851]: goto <bb 54>; [100.00%] and the SLP vectorizer seems to claim: ../magick/resize.c:1284:52: note: _125 = _114->blue; ../magick/resize.c:1284:52: note: _120 = _114->green; ../magick/resize.c:1284:52: note: _115 = _114->red; ../magick/resize.c:1284:52: missed: not consecutive access weight_297 = _104->weight; ../magick/resize.c:1284:52: missed: not consecutive access _105 = _104->pixel; ../magick/resize.c:1284:52: missed: not consecutive access _134->red = iftmp.57_207; ../magick/resize.c:1284:52: missed: not consecutive access _134->green = iftmp.60_208; ../magick/resize.c:1284:52: missed: not consecutive access _134->blue = iftmp.63_209; ../magick/resize.c:1284:52: missed: not consecutive access _134->opacity = 0; ../magick/resize.c:1284:52: missed: not consecutive access _63 = *source_231(D).columns; ../magick/resize.c:1284:52: missed: not consecutive access _60 = _262->pixel; Not sure if that is related to the real testcase, but this variant (identical except for an extra field o in struct drgb): struct rgb {unsigned char r,g,b;} *rgbs; int *addr; double *weights; struct drgb {double r,g,b,o;}; struct drgb sum() { struct drgb r; for (int i = 0; i < 100000; i++) { int j = addr[i]; double w = weights[i]; r.r += rgbs[j].r * w; r.g += rgbs[j].g * w; r.b += rgbs[j].b * w; } return r; } makes us miss the vectorization even though nothing uses drgb->o: sum: .LFB0: .cfi_startproc movq %rdi, %r8 movq weights(%rip), %rsi movq addr(%rip), %rdi vxorps %xmm2, %xmm2, %xmm2 movq rgbs(%rip), %rcx xorl %edx, %edx .p2align 4 .p2align 3 .L2: movslq (%rdi,%rdx,4), %rax vmovsd (%rsi,%rdx,8), %xmm0 incq %rdx leaq (%rax,%rax,2), %rax addq %rcx, %rax movzbl (%rax), %r9d vcvtsi2sdl %r9d, %xmm2, %xmm1 movzbl 1(%rax), %r9d movzbl 2(%rax), %eax vfmadd231sd %xmm0, %xmm1, %xmm3 vcvtsi2sdl %r9d, %xmm2, %xmm1 vfmadd231sd %xmm0, %xmm1, %xmm5 vcvtsi2sdl %eax, %xmm2, %xmm1 vfmadd231sd %xmm0, %xmm1, %xmm4 cmpq $100000, %rdx jne .L2 vmovq %xmm4, %xmm4 vunpcklpd %xmm5, %xmm3, %xmm0 movq %r8, %rax vinsertf128 $0x1, %xmm4, %ymm0, %ymm0 vmovupd %ymm0, 
(%r8) vzeroupper ret