On 4/24/24 12:22, Robin Dapp wrote:
>> The dynamic icounts looks sane (vs. Apr 10 snapshot) except for a
>> regression in x264 which is likely independent of the chaos going on.
>>
>>      Apr 10     |     Apr 23      |
>>   109f1b28fc94  |  6f0a646dd2fc   |
>> ----------------+-----------------+--------
>> 276,584,692,883 | 277,816,987,018 | -0.45%
>> 913,452,236,000 | 927,291,935,180 | -1.52%
>> 903,916,092,805 | 915,364,006,176 | -1.27%
> x264 uses widening arithmetic so it could be the reverts.
> Can you compare the hot functions (e.g. x264_pixel_sad_16x16)
Function old new delta
x264_pixel_sad_x4_16x8.lto_priv 5188 5288 +100
x264_pixel_sad_x4_8x16.lto_priv 5844 5924 +80
x264_pixel_sad_x3_16x8.lto_priv 3904 3980 +76
x264_pixel_sad_x4_16x16.lto_priv 834 898 +64
x264_pixel_sad_x3_8x16.lto_priv 4408 4468 +60
x264_pixel_sad_x4_8x8.lto_priv 3010 3058 +48
x264_pixel_sad_x3_8x8.lto_priv 2290 2338 +48
...
...
x264_pixel_sad_x4_4x8.lto_priv 1366 1362 -4
x264_pixel_sad_x4_4x4.lto_priv 716 712 -4
x264_pixel_sad_4x8.lto_priv 332 328 -4
x264_pixel_sad_4x4.lto_priv 172 168 -4
hpel_filter.lto_priv 984 980 -4
> if anything stands out surrounding the vwadd.wv for example?
Yeah it does: not specifically in the routine you mentioned above but
in its various brethren: see attached objdump for
x264_pixel_sad_x4_16x16 () for the 2 cases.
-Vineet

00021872 <x264_pixel_sad_x4_16x16.lto_priv>:
21872: vsetivli zero,4,e32,m1,ta,ma
21876: vmv.v.i v5,0
2187a: add sp,sp,-32
2187c: add t4,a0,4
21880: vmv1r.v v7,v5
21884: vmv1r.v v8,v5
21888: vmv1r.v v9,v5
2188c: vmv1r.v v6,v5
21890: add t5,a0,8
21894: add t6,a0,12
21898: sd s0,24(sp)
2189a: sd s1,16(sp)
2189c: mv s0,a6
2189e: sd s2,8(sp)
218a0: sd s3,0(sp)
218a2: mv a6,t4
218a4: mv t2,t5
218a6: mv t0,t6
218a8: mv t1,a0
218aa: add a7,a0,256
218ae: mv t3,a0
218b0: vsetvli zero,zero,e8,mf4,ta,ma
218b4: add s3,a1,4
218b8: add s2,a1,8
218bc: vle8.v v3,(s3)
218c0: vle8.v v14,(a6)
218c4: vle8.v v2,(s2)
218c8: vle8.v v13,(t2)
218cc: add s1,a1,12
218d0: vle8.v v12,(t0)
218d4: vle8.v v11,(t3)
218d8: vle8.v v10,(a1)
218dc: vle8.v v1,(s1)
218e0: vwsubu.vv v4,v14,v3
218e4: vwsubu.vv v3,v13,v2
218e8: add t3,t3,16
218ea: add a6,a6,16
218ec: add t2,t2,16
218ee: vwsubu.vv v2,v12,v1
218f2: vwsubu.vv v1,v11,v10
218f6: vsetvli zero,zero,e16,mf2,ta,mu
218fa: vmsle.vi v0,v4,-1
218fe: vmsle.vi v12,v3,-1
21902: vmsle.vi v11,v2,-1
21906: vneg.v v4,v4,v0.t
2190a: vmv1r.v v0,v12
2190e: vmsle.vi v10,v1,-1
21912: vwadd.wv v9,v9,v4
21916: vneg.v v3,v3,v0.t
2191a: vmv1r.v v0,v11
2191e: add t0,t0,16
21920: vwadd.wv v8,v8,v3
21924: vneg.v v2,v2,v0.t
21928: vmv1r.v v0,v10
2192c: add a1,a1,a5
2192e: vwadd.wv v7,v7,v2
21932: vneg.v v1,v1,v0.t
21936: vwadd.wv v6,v6,v1
2193a: bne t3,a7,218b0
2193e: vsetvli zero,zero,e32,m1,ta,ma
21942: vadd.vv v1,v6,v9
21946: li a6,0
21948: vmv.s.x v2,a6
2194c: vadd.vv v1,v1,v8
21950: vmv1r.v v9,v5
21954: vmv1r.v v8,v5
21958: vadd.vv v1,v1,v7
2195c: vmv1r.v v6,v5
21960: vmv1r.v v7,v5
21964: vredsum.vs v1,v1,v2
21968: mv t2,t6
2196a: mv t0,t5
2196c: mv t3,t4
2196e: mv a1,a0
21970: vmv.x.s a6,v1
21974: sw a6,0(s0)
21978: vsetvli zero,zero,e8,mf4,ta,ma
2197c: add s2,a2,4
21980: add s1,a2,8
21984: vle8.v v3,(s2)
21988: vle8.v v14,(t3)
2198c: vle8.v v2,(s1)
21990: vle8.v v13,(t0)
21994: add a6,a2,12
21998: vle8.v v12,(t2)
2199c: vle8.v v11,(a1)
219a0: vle8.v v10,(a2)
219a4: vle8.v v1,(a6)
219a8: vwsubu.vv v4,v14,v3
219ac: vwsubu.vv v3,v13,v2
219b0: add a1,a1,16
219b2: add t3,t3,16
219b4: add t0,t0,16
219b6: vwsubu.vv v2,v12,v1
219ba: vwsubu.vv v1,v11,v10
219be: vsetvli