https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113600
--- Comment #5 from Hongtao Liu <liuhongt at gcc dot gnu.org> --- It looks like x264_pixel_satd_16x16 consumes more time after my commit, an extracted case is as below, note there's no attribute((always_inline)) in the original x264_pixel_satd_8x4, it's added to force inline(Under PGO, it's hot and will be inlined) typedef unsigned char uint8_t; typedef unsigned uint32_t; typedef unsigned short uint16_t; static inline uint32_t abs2( uint32_t a ) { uint32_t s = ((a>>15)&0x10001)*0xffff; return (a+s)^s; } int __attribute__((always_inline)) x264_pixel_satd_8x4( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) { uint32_t tmp[4][4]; uint32_t a0, a1, a2, a3; int sum = 0; for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 ) { a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16); a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16); a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16); a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16); { int t0 = a0 + a1; int t1 = a0 - a1; int t2 = a2 + a3; int t3 = a2 - a3; tmp[i][0] = t0 + t2; tmp[i][2] = t0 - t2; tmp[i][1] = t1 + t3; tmp[i][3] = t1 - t3;}; } for( int i = 0; i < 4; i++ ) { { int t0 = tmp[0][i] + tmp[1][i]; int t1 = tmp[0][i] - tmp[1][i]; int t2 = tmp[2][i] + tmp[3][i]; int t3 = tmp[2][i] - tmp[3][i]; a0 = t0 + t2; a2 = t0 - t2; a1 = t1 + t3; a3 = t1 - t3;}; sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3); } return (((uint16_t)sum) + ((uint32_t)sum>>16)) >> 1; } int x264_pixel_satd_16x16( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 ) { int sum = x264_pixel_satd_8x4( pix1, i_pix1, pix2, i_pix2 ) + x264_pixel_satd_8x4( pix1+4*i_pix1, i_pix1, pix2+4*i_pix2, i_pix2 ); sum+= x264_pixel_satd_8x4( pix1+8, i_pix1, pix2+8, i_pix2 ) + x264_pixel_satd_8x4( pix1+8+4*i_pix1, i_pix1, pix2+8+4*i_pix2, i_pix2 ); sum+= x264_pixel_satd_8x4( pix1+8*i_pix1, i_pix1, pix2+8*i_pix2, i_pix2 ) + x264_pixel_satd_8x4( pix1+12*i_pix1, i_pix1, pix2+12*i_pix2, i_pix2 ); sum+= x264_pixel_satd_8x4( pix1+8+8*i_pix1, 
i_pix1, pix2+8+8*i_pix2, i_pix2 ) + x264_pixel_satd_8x4( pix1+8+12*i_pix1, i_pix1, pix2+8+12*i_pix2, i_pix2 ); return sum; } After the commits, SLP failed to split the group of size 16 (vector int(16)) into smaller groups of 4 + 12, and missed vectorization for the cases below. vect_t2_2445.784_8503 = VIEW_CONVERT_EXPR<vector(4) int>(_8502); vect__2457.786_8505 = vect_t0_2441.783_8501 - vect_t2_2445.784_8503; vect__2448.785_8504 = vect_t0_2441.783_8501 + vect_t2_2445.784_8503; _8506 = VEC_PERM_EXPR <vect__2448.785_8504, vect__2457.786_8505, { 0, 1, 6, 7 }>; vect__2449.787_8507 = VIEW_CONVERT_EXPR<vector(4) unsigned int>(_8506); t3_2447 = (int) _2446; _2448 = t0_2441 + t2_2445; _2449 = (unsigned int) _2448; _2451 = t0_2441 - t2_2445; _2452 = (unsigned int) _2451; _2454 = t1_2443 + t3_2447; _2455 = (unsigned int) _2454; _2457 = t1_2443 - t3_2447; _2458 = (unsigned int) _2457; MEM <vector(4) unsigned int> [(unsigned int *)&tmp + 16B] = vect__2449.787_8507; The vector store would be optimized away together with the later vector load, so for the bad case there is a store-to-load forwarding (STLF) issue.