https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96789
--- Comment #5 from Richard Biener <rguenth at gcc dot gnu.org> --- testcase from https://github.com/mirror/x264/blob/master/common/dct.c where FENC_STRIDE is 16 and FDEC_STRIDE 32 pixel is unsigned char, dctcoef is unsigned short static inline void pixel_sub_wxh( dctcoef *diff, int i_size, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 ) { for( int y = 0; y < i_size; y++ ) { for( int x = 0; x < i_size; x++ ) diff[x + y*i_size] = pix1[x] - pix2[x]; pix1 += i_pix1; pix2 += i_pix2; } } static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 ) { dctcoef d[16]; dctcoef tmp[16]; pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE ); for( int i = 0; i < 4; i++ ) { int s03 = d[i*4+0] + d[i*4+3]; int s12 = d[i*4+1] + d[i*4+2]; int d03 = d[i*4+0] - d[i*4+3]; int d12 = d[i*4+1] - d[i*4+2]; tmp[0*4+i] = s03 + s12; tmp[1*4+i] = 2*d03 + d12; tmp[2*4+i] = s03 - s12; tmp[3*4+i] = d03 - 2*d12; } for( int i = 0; i < 4; i++ ) { int s03 = tmp[i*4+0] + tmp[i*4+3]; int s12 = tmp[i*4+1] + tmp[i*4+2]; int d03 = tmp[i*4+0] - tmp[i*4+3]; int d12 = tmp[i*4+1] - tmp[i*4+2]; dct[i*4+0] = s03 + s12; dct[i*4+1] = 2*d03 + d12; dct[i*4+2] = s03 - s12; dct[i*4+3] = d03 - 2*d12; } } one can see vectorization is complicated by the non-homogenous computes.