from:"flow gg"

Re: [FFmpeg-devel] [PATCH 2/2] lavc/vvc_mc: R-V V dmvr

2024-09-28 Thread flow gg

> At similar speed, shorter code is better.

Okay, updated it.

> Sure but so what? vsetvli/vsetivli is pretty fast (unlike vsetvl), and in
this case the code would be shorter. Or are you trying to factor the code
for different VTYPEs?

I mistakenly thought these vsets would slow things down. After updating,
the code has indeed become faster.

Rémi Denis-Courmont  于2024年9月28日周六 21:49写道：

>
>
> Le 28 septembre 2024 12:42:37 GMT+03:00, flow gg  a
> écrit :
> >> Is 4x unroll really faster than 2x here? We don't typically unroll 4x
> >> manually.
> >
> >I first did 2x and then changed it to 4x. The test results are similar,
> and
> >I'm not sure how to choose between them...
>
> At similar speed, shorter code is better.
>
> >> t5 seems to be 8-bit, so vwmulu.vx should work better here? Since you
> >> leveraged it in the previous function, I'm a bit confused why not here,
> >TBH.
> >> Likewise vwmaccu.vx.
> >
> >DMVR doesn't have right shifts, but DMVR_h, _v, and _hv do.
> >So DMVR only needs one vset, while the others, if using widen, require
> vset
> >switching.
>
> Sure but so what? vsetvli/vsetivli is pretty fast (unlike vsetvl), and in
> this case the code would be shorter. Or are you trying to factor the code
> for different VTYPEs?
>
> >> Missing rounding opportunity, vssra.vi should work better here.
> >> Same comments.
> >
> >Okay, Updated it.
> >
> >Rémi Denis-Courmont  于2024年9月28日周六 14:56写道：
> >
> >> Hi,
> >>
> >> Le perjantaina 27. syyskuuta 2024, 20.09.30 EEST u...@foxmail.com a
> écrit
> >> :
> >> > From: sunyuechi 
> >> >
> >> >  k230   banana_f3
> >> > dmvr_8_12x20_c:   628.5 ( 1.00x)624.1 ( 1.00x)
> >> > dmvr_8_12x20_rvv_i32: 137.5 ( 4.57x)92.9 ( 6.72x)
> >> > dmvr_8_20x12_c:   609.7 ( 1.00x)655.4 ( 1.00x)
> >> > dmvr_8_20x12_rvv_i32: 146.7 ( 4.16x)82.4 ( 7.95x)
> >> > dmvr_8_20x20_c:   998.7 ( 1.00x)1092.9 (
> 1.00x)
> >> > dmvr_8_20x20_rvv_i32: 221.0 ( 4.52x)144.9 ( 7.54x)
> >> > dmvr_h_8_12x20_c:2008.0 ( 1.00x)1999.2 (
> 1.00x)
> >> > dmvr_h_8_12x20_rvv_i32:   285.7 ( 7.03x)207.4 ( 9.64x)
> >> > dmvr_h_8_20x12_c:1989.5 ( 1.00x)2009.7 (
> 1.00x)
> >> > dmvr_h_8_20x12_rvv_i32:   322.7 ( 6.16x)176.2 (11.41x)
> >> > dmvr_h_8_20x20_c:3304.2 ( 1.00x)3342.9 (
> 1.00x)
> >> > dmvr_h_8_20x20_rvv_i32:   526.5 ( 6.28x)290.6 (11.50x)
> >> > dmvr_hv_8_12x20_c:   3609.7 ( 1.00x)3603.4 (
> 1.00x)
> >> > dmvr_hv_8_12x20_rvv_i32:  554.2 ( 6.51x)467.9 ( 7.70x)
> >> > dmvr_hv_8_20x12_c:   3637.5 ( 1.00x)3624.4 (
> 1.00x)
> >> > dmvr_hv_8_20x12_rvv_i32:  489.5 ( 7.43x)342.6 (10.58x)
> >> > dmvr_hv_8_20x20_c:   6794.7 ( 1.00x)5936.9 (
> 1.00x)
> >> > dmvr_hv_8_20x20_rvv_i32:  785.7 ( 8.65x)561.4 (10.58x)
> >> > dmvr_v_8_12x20_c:2156.0 ( 1.00x)2155.2 (
> 1.00x)
> >> > dmvr_v_8_12x20_rvv_i32:   295.0 ( 7.31x)207.4 (10.39x)
> >> > dmvr_v_8_20x12_c:2137.5 ( 1.00x)2165.7 (
> 1.00x)
> >> > dmvr_v_8_20x12_rvv_i32:   322.7 ( 6.62x)186.7 (11.60x)
> >> > dmvr_v_8_20x20_c:3554.2 ( 1.00x)3593.2 (
> 1.00x)
> >> > dmvr_v_8_20x20_rvv_i32:   535.7 ( 6.63x)290.6 (12.36x)
> >> > ---
> >> >  libavcodec/riscv/vvc/vvc_mc_rvv.S  | 141
> +
> >> >  libavcodec/riscv/vvc/vvcdsp_init.c |  22 +
> >> >  2 files changed, 163 insertions(+)
> >> >
> >> > diff --git a/libavcodec/riscv/vvc/vvc_mc_rvv.S
> >> > b/libavcodec/riscv/vvc/vvc_mc_rvv.S index 18532616d9..a5e20cbc67
> 100644
> >> > --- a/libavcodec/riscv/vvc/vvc_mc_rvv.S
> >> > +++ b/libavcodec/riscv/vvc/vvc_mc_rvv.S
> >> > @@ -285,3 +285,144 @@ endfunc
> >> >  func_w_avg 128
> >> >  func_w_avg 256
> >> >  #endif
> >> > +
> >> > +func dmvr zve32x, zbb, zba
> >> > +lpad    0
> >> > +li    t0, 4
> >> &

Re: [FFmpeg-devel] [PATCH 2/2] lavc/vvc_mc: R-V V dmvr

2024-09-28 Thread flow gg

> Is 4x unroll really faster than 2x here? We don't typically unroll 4x
> manually.

I first did 2x and then changed it to 4x. The test results are similar, and
I'm not sure how to choose between them...

> t5 seems to be 8-bit, so vwmulu.vx should work better here? Since you
> leveraged it in the previous function, I'm a bit confused why not here,
TBH.
> Likewise vwmaccu.vx.

DMVR doesn't have right shifts, but DMVR_h, _v, and _hv do.
So DMVR only needs one vset, while the others, if they used widening
instructions, would require switching vset.

> Missing rounding opportunity, vssra.vi should work better here.
> Same comments.

Okay, Updated it.

Rémi Denis-Courmont  于2024年9月28日周六 14:56写道：

> Hi,
>
> Le perjantaina 27. syyskuuta 2024, 20.09.30 EEST u...@foxmail.com a écrit
> :
> > From: sunyuechi 
> >
> >  k230   banana_f3
> > dmvr_8_12x20_c:   628.5 ( 1.00x)624.1 ( 1.00x)
> > dmvr_8_12x20_rvv_i32: 137.5 ( 4.57x)92.9 ( 6.72x)
> > dmvr_8_20x12_c:   609.7 ( 1.00x)655.4 ( 1.00x)
> > dmvr_8_20x12_rvv_i32: 146.7 ( 4.16x)82.4 ( 7.95x)
> > dmvr_8_20x20_c:   998.7 ( 1.00x)1092.9 ( 1.00x)
> > dmvr_8_20x20_rvv_i32: 221.0 ( 4.52x)144.9 ( 7.54x)
> > dmvr_h_8_12x20_c:2008.0 ( 1.00x)1999.2 ( 1.00x)
> > dmvr_h_8_12x20_rvv_i32:   285.7 ( 7.03x)207.4 ( 9.64x)
> > dmvr_h_8_20x12_c:1989.5 ( 1.00x)2009.7 ( 1.00x)
> > dmvr_h_8_20x12_rvv_i32:   322.7 ( 6.16x)176.2 (11.41x)
> > dmvr_h_8_20x20_c:3304.2 ( 1.00x)3342.9 ( 1.00x)
> > dmvr_h_8_20x20_rvv_i32:   526.5 ( 6.28x)290.6 (11.50x)
> > dmvr_hv_8_12x20_c:   3609.7 ( 1.00x)3603.4 ( 1.00x)
> > dmvr_hv_8_12x20_rvv_i32:  554.2 ( 6.51x)467.9 ( 7.70x)
> > dmvr_hv_8_20x12_c:   3637.5 ( 1.00x)3624.4 ( 1.00x)
> > dmvr_hv_8_20x12_rvv_i32:  489.5 ( 7.43x)342.6 (10.58x)
> > dmvr_hv_8_20x20_c:   6794.7 ( 1.00x)5936.9 ( 1.00x)
> > dmvr_hv_8_20x20_rvv_i32:  785.7 ( 8.65x)561.4 (10.58x)
> > dmvr_v_8_12x20_c:2156.0 ( 1.00x)2155.2 ( 1.00x)
> > dmvr_v_8_12x20_rvv_i32:   295.0 ( 7.31x)207.4 (10.39x)
> > dmvr_v_8_20x12_c:2137.5 ( 1.00x)2165.7 ( 1.00x)
> > dmvr_v_8_20x12_rvv_i32:   322.7 ( 6.62x)186.7 (11.60x)
> > dmvr_v_8_20x20_c:3554.2 ( 1.00x)3593.2 ( 1.00x)
> > dmvr_v_8_20x20_rvv_i32:   535.7 ( 6.63x)290.6 (12.36x)
> > ---
> >  libavcodec/riscv/vvc/vvc_mc_rvv.S  | 141 +
> >  libavcodec/riscv/vvc/vvcdsp_init.c |  22 +
> >  2 files changed, 163 insertions(+)
> >
> > diff --git a/libavcodec/riscv/vvc/vvc_mc_rvv.S
> > b/libavcodec/riscv/vvc/vvc_mc_rvv.S index 18532616d9..a5e20cbc67 100644
> > --- a/libavcodec/riscv/vvc/vvc_mc_rvv.S
> > +++ b/libavcodec/riscv/vvc/vvc_mc_rvv.S
> > @@ -285,3 +285,144 @@ endfunc
> >  func_w_avg 128
> >  func_w_avg 256
> >  #endif
> > +
> > +func dmvr zve32x, zbb, zba
> > +lpad    0
> > +li    t0, 4
> > +1:
> > +add   t1, a1, a2
> > +addi  t4, a0, 128*2
> > +add   t2, t1, a2
> > +addi  t5, a0, 128*2*2
> > +add   t3, t2, a2
> > +addi  t6, a0, 128*2*3
> > +vle8.v    v0, (a1)
> > +vle8.v    v4, (t1)
> > +vle8.v    v8, (t2)
> > +vle8.v    v12, (t3)
> > +addi  a3, a3, -4
> > +vwmulu.vx v16, v0, t0
> > +vwmulu.vx v20, v4, t0
> > +vwmulu.vx v24, v8, t0
> > +vwmulu.vx v28, v12, t0
> > +vse16.v   v16, (a0)
> > +vse16.v   v20, (t4)
> > +vse16.v   v24, (t5)
> > +vse16.v   v28, (t6)
> > +sh2add    a1, a2, a1
> > +add   a0, a0, 128*2*4
> > +bnez  a3, 1b
> > +ret
> > +endfunc
>
> Is 4x unroll really faster than 2x here? We don't typically unroll 4x
> manually.
>
> > +
> > +.macro dmvr_h_v mn, type
> > +lla   t4, ff_vvc_inter_luma_dmvr_filters
> > +sh1add    t4, \mn, t4
> > +lbu   t5, (t4)
> > +lbu   t6, 1(t4)
> > +1:
> > +.ifc \type,h
> > +addi  t0, a1, 1
> > +addi  t1, a1, 2
> > +.else
> > +add   t0, a1, a2
> > +add   t1, t0, a2
> > +.endif
> > +vle8.v    v0, (a1)
> > +vle8.v    v4, (t0)
> > +vle8.v    v8, (t1)
> > +addi  a3, a3, -2
> > +vzext.vf2 v12, v0
> > +vzext.vf2 v16, v4
> > +v

Re: [FFmpeg-devel] [PATCH 1/2] lavc/vp9dsp: R-V V mc tap h v

2024-09-24 Thread flow gg

ping

flow gg  于2024年8月28日周三 14:43写道：

> It seems that the previous patch was partially missing the RVB check, but it
> now has if (flags & AV_CPU_FLAG_RVB).
>
> Rémi Denis-Courmont  于2024年8月28日周三 03:00写道：
>
>> Le sunnuntaina 25. elokuuta 2024, 14.41.22 EEST flow gg a écrit :
>> > > Does not assemble with binutils 2.43.1 and default flags.
>> >
>> > Fixed through zve32x -> zve32x, zba
>>
>> Are the Bitmanip runtime support checks missing or did I miss them?
>>
>> --
>> Rémi Denis-Courmont
>> http://www.remlab.net/
>>
>>
>>
>> ___
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>>
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg

2024-09-21 Thread flow gg

It feels like this patch has been sitting idle for quite a long time...
Maybe it's time to merge it

Rémi Denis-Courmont  于2024年9月14日周六 22:45写道：

> Hi,
>
> LGTM for the RISC-V side. No clue about the VVC side.
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg

2024-09-15 Thread flow gg

> LGTM for the RISC-V side. No clue about the VVC side.

Hi, Nuomi, could you please reply here? Thanks

flow gg  于2024年9月13日周五 00:45写道：

> ping
>
> flow gg  于2024年8月28日周三 14:38写道：
>
>> Updated: zve32x -> zve32x, zbb, zba
>>
>>  于2024年8月28日周三 14:37写道：
>>
>>> From: sunyuechi 
>>>
>>>  C908   X60
>>> avg_8_2x2_c:1.21.0
>>> avg_8_2x2_rvv_i32  :0.70.7
>>> avg_8_2x4_c:2.02.2
>>> avg_8_2x4_rvv_i32  :1.21.2
>>> avg_8_2x8_c:3.74.0
>>> avg_8_2x8_rvv_i32  :1.71.5
>>> avg_8_2x16_c   :7.27.7
>>> avg_8_2x16_rvv_i32 :3.02.7
>>> avg_8_2x32_c   :   14.2   15.2
>>> avg_8_2x32_rvv_i32 :5.55.0
>>> avg_8_2x64_c   :   51.0   43.7
>>> avg_8_2x64_rvv_i32 :   39.2   29.7
>>> avg_8_2x128_c  :  100.5   79.2
>>> avg_8_2x128_rvv_i32:   79.7   68.2
>>> avg_8_4x2_c:1.72.0
>>> avg_8_4x2_rvv_i32  :1.00.7
>>> avg_8_4x4_c:3.53.7
>>> avg_8_4x4_rvv_i32  :1.21.2
>>> avg_8_4x8_c:6.77.0
>>> avg_8_4x8_rvv_i32  :1.71.5
>>> avg_8_4x16_c   :   13.5   14.0
>>> avg_8_4x16_rvv_i32 :3.02.7
>>> avg_8_4x32_c   :   26.2   27.7
>>> avg_8_4x32_rvv_i32 :5.54.7
>>> avg_8_4x64_c   :   73.0   73.7
>>> avg_8_4x64_rvv_i32 :   39.0   32.5
>>> avg_8_4x128_c  :  143.0  137.2
>>> avg_8_4x128_rvv_i32:   72.7   68.0
>>> avg_8_8x2_c:3.53.5
>>> avg_8_8x2_rvv_i32  :1.00.7
>>> avg_8_8x4_c:6.26.5
>>> avg_8_8x4_rvv_i32  :1.51.0
>>> avg_8_8x8_c:   12.7   13.2
>>> avg_8_8x8_rvv_i32  :2.01.5
>>> avg_8_8x16_c   :   25.0   26.5
>>> avg_8_8x16_rvv_i32 :3.22.7
>>> avg_8_8x32_c   :   50.0   52.7
>>> avg_8_8x32_rvv_i32 :6.25.0
>>> avg_8_8x64_c   :  118.7  122.5
>>> avg_8_8x64_rvv_i32 :   40.2   31.5
>>> avg_8_8x128_c  :  236.7  220.2
>>> avg_8_8x128_rvv_i32:   85.2   67.7
>>> avg_8_16x2_c   :6.26.7
>>> avg_8_16x2_rvv_i32 :1.20.7
>>> avg_8_16x4_c   :   12.5   13.0
>>> avg_8_16x4_rvv_i32 :1.71.0
>>> avg_8_16x8_c   :   24.5   26.0
>>> avg_8_16x8_rvv_i32 :3.01.7
>>> avg_8_16x16_c  :   49.0   51.5
>>> avg_8_16x16_rvv_i32:5.53.0
>>> avg_8_16x32_c  :   97.5  102.5
>>> avg_8_16x32_rvv_i32:   10.55.5
>>> avg_8_16x64_c  :  213.7  222.0
>>> avg_8_16x64_rvv_i32:   48.5   34.2
>>> avg_8_16x128_c :  434.7  420.0
>>> avg_8_16x128_rvv_i32   :   97.7   74.0
>>> avg_8_32x2_c   :   12.2   12.7
>>&

Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg

2024-09-12 Thread flow gg

ping

flow gg  于2024年8月28日周三 14:38写道：

> Updated: zve32x -> zve32x, zbb, zba
>
>  于2024年8月28日周三 14:37写道：
>
>> From: sunyuechi 
>>
>>  C908   X60
>> avg_8_2x2_c:1.21.0
>> avg_8_2x2_rvv_i32  :0.70.7
>> avg_8_2x4_c:2.02.2
>> avg_8_2x4_rvv_i32  :1.21.2
>> avg_8_2x8_c:3.74.0
>> avg_8_2x8_rvv_i32  :1.71.5
>> avg_8_2x16_c   :7.27.7
>> avg_8_2x16_rvv_i32 :3.02.7
>> avg_8_2x32_c   :   14.2   15.2
>> avg_8_2x32_rvv_i32 :5.55.0
>> avg_8_2x64_c   :   51.0   43.7
>> avg_8_2x64_rvv_i32 :   39.2   29.7
>> avg_8_2x128_c  :  100.5   79.2
>> avg_8_2x128_rvv_i32:   79.7   68.2
>> avg_8_4x2_c:1.72.0
>> avg_8_4x2_rvv_i32  :1.00.7
>> avg_8_4x4_c:3.53.7
>> avg_8_4x4_rvv_i32  :1.21.2
>> avg_8_4x8_c:6.77.0
>> avg_8_4x8_rvv_i32  :1.71.5
>> avg_8_4x16_c   :   13.5   14.0
>> avg_8_4x16_rvv_i32 :3.02.7
>> avg_8_4x32_c   :   26.2   27.7
>> avg_8_4x32_rvv_i32 :5.54.7
>> avg_8_4x64_c   :   73.0   73.7
>> avg_8_4x64_rvv_i32 :   39.0   32.5
>> avg_8_4x128_c  :  143.0  137.2
>> avg_8_4x128_rvv_i32:   72.7   68.0
>> avg_8_8x2_c:3.53.5
>> avg_8_8x2_rvv_i32  :1.00.7
>> avg_8_8x4_c:6.26.5
>> avg_8_8x4_rvv_i32  :1.51.0
>> avg_8_8x8_c:   12.7   13.2
>> avg_8_8x8_rvv_i32  :2.01.5
>> avg_8_8x16_c   :   25.0   26.5
>> avg_8_8x16_rvv_i32 :3.22.7
>> avg_8_8x32_c   :   50.0   52.7
>> avg_8_8x32_rvv_i32 :6.25.0
>> avg_8_8x64_c   :  118.7  122.5
>> avg_8_8x64_rvv_i32 :   40.2   31.5
>> avg_8_8x128_c  :  236.7  220.2
>> avg_8_8x128_rvv_i32:   85.2   67.7
>> avg_8_16x2_c   :6.26.7
>> avg_8_16x2_rvv_i32 :1.20.7
>> avg_8_16x4_c   :   12.5   13.0
>> avg_8_16x4_rvv_i32 :1.71.0
>> avg_8_16x8_c   :   24.5   26.0
>> avg_8_16x8_rvv_i32 :3.01.7
>> avg_8_16x16_c  :   49.0   51.5
>> avg_8_16x16_rvv_i32:5.53.0
>> avg_8_16x32_c  :   97.5  102.5
>> avg_8_16x32_rvv_i32:   10.55.5
>> avg_8_16x64_c  :  213.7  222.0
>> avg_8_16x64_rvv_i32:   48.5   34.2
>> avg_8_16x128_c :  434.7  420.0
>> avg_8_16x128_rvv_i32   :   97.7   74.0
>> avg_8_32x2_c   :   12.2   12.7
>> avg_8_32x2_rvv_i32 :1.51.0
>> avg_8_32x4_c   :   24.5   25.5
>> avg_8_32x4_rvv_i32 :3.01.7
>> avg_8_32x8_c   :   48.5   50.7
>> avg_8_32x8_rvv_i32 :5.22.7
>> avg_8_32x16_c

Re: [FFmpeg-devel] [PATCH 1/2] lavc/vp9dsp: R-V V mc tap h v

2024-08-27 Thread flow gg

It seems that the previous patch was partially missing the RVB check, but it
now has if (flags & AV_CPU_FLAG_RVB).

Rémi Denis-Courmont  于2024年8月28日周三 03:00写道：

> Le sunnuntaina 25. elokuuta 2024, 14.41.22 EEST flow gg a écrit :
> > > Does not assemble with binutils 2.43.1 and default flags.
> >
> > Fixed through zve32x -> zve32x, zba
>
> Are the Bitmanip runtime support checks missing or did I miss them?
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg

2024-08-27 Thread flow gg

Updated: zve32x -> zve32x, zbb, zba

 于2024年8月28日周三 14:37写道：

> From: sunyuechi 
>
>  C908   X60
> avg_8_2x2_c:1.21.0
> avg_8_2x2_rvv_i32  :0.70.7
> avg_8_2x4_c:2.02.2
> avg_8_2x4_rvv_i32  :1.21.2
> avg_8_2x8_c:3.74.0
> avg_8_2x8_rvv_i32  :1.71.5
> avg_8_2x16_c   :7.27.7
> avg_8_2x16_rvv_i32 :3.02.7
> avg_8_2x32_c   :   14.2   15.2
> avg_8_2x32_rvv_i32 :5.55.0
> avg_8_2x64_c   :   51.0   43.7
> avg_8_2x64_rvv_i32 :   39.2   29.7
> avg_8_2x128_c  :  100.5   79.2
> avg_8_2x128_rvv_i32:   79.7   68.2
> avg_8_4x2_c:1.72.0
> avg_8_4x2_rvv_i32  :1.00.7
> avg_8_4x4_c:3.53.7
> avg_8_4x4_rvv_i32  :1.21.2
> avg_8_4x8_c:6.77.0
> avg_8_4x8_rvv_i32  :1.71.5
> avg_8_4x16_c   :   13.5   14.0
> avg_8_4x16_rvv_i32 :3.02.7
> avg_8_4x32_c   :   26.2   27.7
> avg_8_4x32_rvv_i32 :5.54.7
> avg_8_4x64_c   :   73.0   73.7
> avg_8_4x64_rvv_i32 :   39.0   32.5
> avg_8_4x128_c  :  143.0  137.2
> avg_8_4x128_rvv_i32:   72.7   68.0
> avg_8_8x2_c:3.53.5
> avg_8_8x2_rvv_i32  :1.00.7
> avg_8_8x4_c:6.26.5
> avg_8_8x4_rvv_i32  :1.51.0
> avg_8_8x8_c:   12.7   13.2
> avg_8_8x8_rvv_i32  :2.01.5
> avg_8_8x16_c   :   25.0   26.5
> avg_8_8x16_rvv_i32 :3.22.7
> avg_8_8x32_c   :   50.0   52.7
> avg_8_8x32_rvv_i32 :6.25.0
> avg_8_8x64_c   :  118.7  122.5
> avg_8_8x64_rvv_i32 :   40.2   31.5
> avg_8_8x128_c  :  236.7  220.2
> avg_8_8x128_rvv_i32:   85.2   67.7
> avg_8_16x2_c   :6.26.7
> avg_8_16x2_rvv_i32 :1.20.7
> avg_8_16x4_c   :   12.5   13.0
> avg_8_16x4_rvv_i32 :1.71.0
> avg_8_16x8_c   :   24.5   26.0
> avg_8_16x8_rvv_i32 :3.01.7
> avg_8_16x16_c  :   49.0   51.5
> avg_8_16x16_rvv_i32:5.53.0
> avg_8_16x32_c  :   97.5  102.5
> avg_8_16x32_rvv_i32:   10.55.5
> avg_8_16x64_c  :  213.7  222.0
> avg_8_16x64_rvv_i32:   48.5   34.2
> avg_8_16x128_c :  434.7  420.0
> avg_8_16x128_rvv_i32   :   97.7   74.0
> avg_8_32x2_c   :   12.2   12.7
> avg_8_32x2_rvv_i32 :1.51.0
> avg_8_32x4_c   :   24.5   25.5
> avg_8_32x4_rvv_i32 :3.01.7
> avg_8_32x8_c   :   48.5   50.7
> avg_8_32x8_rvv_i32 :5.22.7
> avg_8_32x16_c  :   96.7  101.2
> avg_8_32x16_rvv_i32:   10.25.0
> avg_8_32x32_c  :  192.7  202.2
> avg_8_32x32_rvv_i32:   19.79.5
> avg_8_32x64_c  :  427.5  426.5
> avg_8_32x64_rvv_i32:   64.2   18.2
> avg_8_32x128_c :  816.5  821.0
> avg_8_32x128_rvv_i32   :  135.2   75.5
> avg_8_64x2_c

Re: [FFmpeg-devel] [PATCH 1/2] lavc/vp9dsp: R-V V mc tap h v

2024-08-25 Thread flow gg

> Does not assemble with binutils 2.43.1 and default flags.

Fixed through zve32x -> zve32x, zba

 于2024年8月25日周日 19:40写道：

> From: sunyuechi 
>
>  C908   X60
> vp9_avg_8tap_smooth_4h_8bpp_c  :   12.7   11.2
> vp9_avg_8tap_smooth_4h_8bpp_rvv_i32:4.74.2
> vp9_avg_8tap_smooth_4v_8bpp_c  :   29.7   12.5
> vp9_avg_8tap_smooth_4v_8bpp_rvv_i32:4.74.2
> vp9_avg_8tap_smooth_8h_8bpp_c  :   48.7   42.2
> vp9_avg_8tap_smooth_8h_8bpp_rvv_i32:9.58.5
> vp9_avg_8tap_smooth_8v_8bpp_c  :   49.7   45.5
> vp9_avg_8tap_smooth_8v_8bpp_rvv_i32:9.58.5
> vp9_avg_8tap_smooth_16h_8bpp_c :  192.0  166.5
> vp9_avg_8tap_smooth_16h_8bpp_rvv_i32   :   21.7   19.5
> vp9_avg_8tap_smooth_16v_8bpp_c :  191.2  175.2
> vp9_avg_8tap_smooth_16v_8bpp_rvv_i32   :   21.2   19.0
> vp9_avg_8tap_smooth_32h_8bpp_c :  780.2  663.2
> vp9_avg_8tap_smooth_32h_8bpp_rvv_i32   :   68.2   60.5
> vp9_avg_8tap_smooth_32v_8bpp_c :  770.0  685.7
> vp9_avg_8tap_smooth_32v_8bpp_rvv_i32   :   67.0   59.5
> vp9_avg_8tap_smooth_64h_8bpp_c : 3116.2 2648.2
> vp9_avg_8tap_smooth_64h_8bpp_rvv_i32   :  270.7  120.7
> vp9_avg_8tap_smooth_64v_8bpp_c : 3058.5 2731.7
> vp9_avg_8tap_smooth_64v_8bpp_rvv_i32   :  266.5  119.0
> vp9_put_8tap_smooth_4h_8bpp_c  :   11.09.7
> vp9_put_8tap_smooth_4h_8bpp_rvv_i32:4.23.7
> vp9_put_8tap_smooth_4v_8bpp_c  :   11.7   10.5
> vp9_put_8tap_smooth_4v_8bpp_rvv_i32:4.03.7
> vp9_put_8tap_smooth_8h_8bpp_c  :   42.0   37.5
> vp9_put_8tap_smooth_8h_8bpp_rvv_i32:8.57.7
> vp9_put_8tap_smooth_8v_8bpp_c  :   43.5   38.5
> vp9_put_8tap_smooth_8v_8bpp_rvv_i32:8.77.7
> vp9_put_8tap_smooth_16h_8bpp_c :  181.7  147.2
> vp9_put_8tap_smooth_16h_8bpp_rvv_i32   :   20.0   18.0
> vp9_put_8tap_smooth_16v_8bpp_c :  168.5  149.7
> vp9_put_8tap_smooth_16v_8bpp_rvv_i32   :   19.7   17.5
> vp9_put_8tap_smooth_32h_8bpp_c :  675.0  586.5
> vp9_put_8tap_smooth_32h_8bpp_rvv_i32   :   65.2   58.0
> vp9_put_8tap_smooth_32v_8bpp_c :  664.7  591.2
> vp9_put_8tap_smooth_32v_8bpp_rvv_i32   :   64.0   57.0
> vp9_put_8tap_smooth_64h_8bpp_c : 2696.2 2339.0
> vp9_put_8tap_smooth_64h_8bpp_rvv_i32   :  259.7  115.7
> vp9_put_8tap_smooth_64v_8bpp_c : 2691.0 2348.5
> vp9_put_8tap_smooth_64v_8bpp_rvv_i32   :  255.5  114.0
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 183 +
>  libavcodec/riscv/vp9dsp.h  |  72 -
>  libavcodec/riscv/vp9dsp_init.c |  35 ++-
>  3 files changed, 265 insertions(+), 25 deletions(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index d1ddbe007b..32143b67e1 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -36,6 +36,18 @@
>  .endif
>  .endm
>
> +.macro vsetvlstatic16 len
> +.ifc \len,4
> +vsetvli zero, zero, e16, mf2, ta, ma
> +.elseif \len == 8
> +vsetvli zero, zero, e16, m1, ta, ma
> +.elseif \len == 16
> +vsetvli zero, zero, e16, m2, ta, ma
> +.else
> +vsetvli zero, zero, e16, m4, ta, ma
> +.endif
> +.endm
> +
>  .macro copy_avg len
>  func ff_vp9_avg\len\()_rvv, zve32x
>  lpad    0
> @@ -180,8 +192,179 @@ func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
>  endfunc
>  .endm
>
> +.equ ff_vp9_subpel_filters_smooth, ff_vp9_subpel_filters
> +.equ ff_vp9_subpel_filters_regular, ff_vp9_subpel_filters + 16*8*2
> +.equ ff_vp9_subpel_filters_sharp, ff_vp9_subpel_filters + 16*8*2*2
> +
> +.macro epel_filter name, type, regtype, arg
> +lla \regtype\()2, ff_vp9_subpel_filters_\name
> +.ifc \type,v
> +slli    \regtype\()0, a6, 4
> +.else
> +slli    \regtype\()0, a5, 4
> +.endif
> +add \regtype\()0, \regtype\()0, \regtype\()2
> +lh  \regtype\()1, 2(\regtype\()0)
> +lh  \regtype\()2, 4(\regtype\()0)
> +lh  \regtype\()3, 6(\regtype\()0)
> +lh  \regtype\()4, 8(\regtype\()0)
> +lh  \regtype\()5, 10(\regtype\()0)
> +lh  \regtype\()6, 12(\regtype\()0)
> +lh  \arg, 14(\regtype\()0)
> +lh  \regtype\()0, 0(\regtype\()0)
> +.endm
> +
> +.macro epel_load dst, len, op, name, type, from_mem, regtype
>

Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg

2024-08-18 Thread flow gg

I wrote `ff_vvc_w_avg_8_rvv` by mimicking the h264 weight function.

Based on the test results for 49 different resolutions, most of them were
significantly slower.

Only 2x32 and 2x64 had similar performance, without noticeable speed
improvement.

I'm not sure about the reason. Some differences are that
`ff_h264_weight_pixels_8_rvv` only requires one `vlsseg2e8.v`, while
`ff_vvc_w_avg_8_rvv` requires two `vlsseg2e16.v`. The loop of
`ff_h264_weight_pixels_8_rvv` performs more vector calculations than that of
`ff_vvc_w_avg_8_rvv`, but fewer scalar operations.

Rémi Denis-Courmont  于2024年8月15日周四 16:10写道：

>
>
> Le 3 août 2024 13:30:34 GMT+03:00, u...@foxmail.com a écrit :
> >From: sunyuechi 
> >
> > C908   X60
> >avg_8_2x2_c:1.21.0
> >avg_8_2x2_rvv_i32  :0.70.7
> >avg_8_2x4_c:2.02.2
> >avg_8_2x4_rvv_i32  :1.21.2
> >avg_8_2x8_c:3.74.0
> >avg_8_2x8_rvv_i32  :1.71.5
> >avg_8_2x16_c   :7.27.7
> >avg_8_2x16_rvv_i32 :3.02.7
> >avg_8_2x32_c   :   14.2   15.2
> >avg_8_2x32_rvv_i32 :5.55.0
> >avg_8_2x64_c   :   51.0   43.7
> >avg_8_2x64_rvv_i32 :   39.2   29.7
> >avg_8_2x128_c  :  100.5   79.2
> >avg_8_2x128_rvv_i32:   79.7   68.2
> >avg_8_4x2_c:1.72.0
> >avg_8_4x2_rvv_i32  :1.00.7
> >avg_8_4x4_c:3.53.7
> >avg_8_4x4_rvv_i32  :1.21.2
> >avg_8_4x8_c:6.77.0
> >avg_8_4x8_rvv_i32  :1.71.5
> >avg_8_4x16_c   :   13.5   14.0
> >avg_8_4x16_rvv_i32 :3.02.7
> >avg_8_4x32_c   :   26.2   27.7
> >avg_8_4x32_rvv_i32 :5.54.7
> >avg_8_4x64_c   :   73.0   73.7
> >avg_8_4x64_rvv_i32 :   39.0   32.5
> >avg_8_4x128_c  :  143.0  137.2
> >avg_8_4x128_rvv_i32:   72.7   68.0
> >avg_8_8x2_c:3.53.5
> >avg_8_8x2_rvv_i32  :1.00.7
> >avg_8_8x4_c:6.26.5
> >avg_8_8x4_rvv_i32  :1.51.0
> >avg_8_8x8_c:   12.7   13.2
> >avg_8_8x8_rvv_i32  :2.01.5
> >avg_8_8x16_c   :   25.0   26.5
> >avg_8_8x16_rvv_i32 :3.22.7
> >avg_8_8x32_c   :   50.0   52.7
> >avg_8_8x32_rvv_i32 :6.25.0
> >avg_8_8x64_c   :  118.7  122.5
> >avg_8_8x64_rvv_i32 :   40.2   31.5
> >avg_8_8x128_c  :  236.7  220.2
> >avg_8_8x128_rvv_i32:   85.2   67.7
> >avg_8_16x2_c   :6.26.7
> >avg_8_16x2_rvv_i32 :1.20.7
> >avg_8_16x4_c   :   12.5   13.0
> >avg_8_16x4_rvv_i32 :1.71.0
> >avg_8_16x8_c   :   24.5   26.0
> >avg_8_16x8_rvv_i32 :3.01.7
> >avg_8_16x16_c  :   49.0   51.5
> >avg_8_16x16_rvv_i32:5.53.0
> >avg_8_16x32_c  :   97.5  102.5
> >avg_8_16x32_rvv_i32:   10.55.5
> >avg_8_16x64_c  :  213.7  222.0
> >avg_8_16x64_rvv_i32:   48.5   34.2
> >avg_8_16x128_c :  434.7  420.0
> >avg_8_16x128_rvv_i32   :   97.7   74.0
> >avg_8_32x2_c   :   12.2   12.7
> >avg_8_32x2_rvv_i32 :1.51.0
> >avg_8_32x4_c   :   24.5   25.5
> >avg_8_32x4_rvv_i32 :3.01.7
> >avg_8_32x8_c

Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg

2024-08-17 Thread flow gg

How can I test the weight and biweight functions of H.264? I haven't seen any
related test code.
tests/checkasm/checkasm --bench --test=h264dsp

Rémi Denis-Courmont  于2024年8月15日周四 16:10写道：

>
>
> Le 3 août 2024 13:30:34 GMT+03:00, u...@foxmail.com a écrit :
> >From: sunyuechi 
> >
> > C908   X60
> >avg_8_2x2_c:1.21.0
> >avg_8_2x2_rvv_i32  :0.70.7
> >avg_8_2x4_c:2.02.2
> >avg_8_2x4_rvv_i32  :1.21.2
> >avg_8_2x8_c:3.74.0
> >avg_8_2x8_rvv_i32  :1.71.5
> >avg_8_2x16_c   :7.27.7
> >avg_8_2x16_rvv_i32 :3.02.7
> >avg_8_2x32_c   :   14.2   15.2
> >avg_8_2x32_rvv_i32 :5.55.0
> >avg_8_2x64_c   :   51.0   43.7
> >avg_8_2x64_rvv_i32 :   39.2   29.7
> >avg_8_2x128_c  :  100.5   79.2
> >avg_8_2x128_rvv_i32:   79.7   68.2
> >avg_8_4x2_c:1.72.0
> >avg_8_4x2_rvv_i32  :1.00.7
> >avg_8_4x4_c:3.53.7
> >avg_8_4x4_rvv_i32  :1.21.2
> >avg_8_4x8_c:6.77.0
> >avg_8_4x8_rvv_i32  :1.71.5
> >avg_8_4x16_c   :   13.5   14.0
> >avg_8_4x16_rvv_i32 :3.02.7
> >avg_8_4x32_c   :   26.2   27.7
> >avg_8_4x32_rvv_i32 :5.54.7
> >avg_8_4x64_c   :   73.0   73.7
> >avg_8_4x64_rvv_i32 :   39.0   32.5
> >avg_8_4x128_c  :  143.0  137.2
> >avg_8_4x128_rvv_i32:   72.7   68.0
> >avg_8_8x2_c:3.53.5
> >avg_8_8x2_rvv_i32  :1.00.7
> >avg_8_8x4_c:6.26.5
> >avg_8_8x4_rvv_i32  :1.51.0
> >avg_8_8x8_c:   12.7   13.2
> >avg_8_8x8_rvv_i32  :2.01.5
> >avg_8_8x16_c   :   25.0   26.5
> >avg_8_8x16_rvv_i32 :3.22.7
> >avg_8_8x32_c   :   50.0   52.7
> >avg_8_8x32_rvv_i32 :6.25.0
> >avg_8_8x64_c   :  118.7  122.5
> >avg_8_8x64_rvv_i32 :   40.2   31.5
> >avg_8_8x128_c  :  236.7  220.2
> >avg_8_8x128_rvv_i32:   85.2   67.7
> >avg_8_16x2_c   :6.26.7
> >avg_8_16x2_rvv_i32 :1.20.7
> >avg_8_16x4_c   :   12.5   13.0
> >avg_8_16x4_rvv_i32 :1.71.0
> >avg_8_16x8_c   :   24.5   26.0
> >avg_8_16x8_rvv_i32 :3.01.7
> >avg_8_16x16_c  :   49.0   51.5
> >avg_8_16x16_rvv_i32:5.53.0
> >avg_8_16x32_c  :   97.5  102.5
> >avg_8_16x32_rvv_i32:   10.55.5
> >avg_8_16x64_c  :  213.7  222.0
> >avg_8_16x64_rvv_i32:   48.5   34.2
> >avg_8_16x128_c :  434.7  420.0
> >avg_8_16x128_rvv_i32   :   97.7   74.0
> >avg_8_32x2_c   :   12.2   12.7
> >avg_8_32x2_rvv_i32 :1.51.0
> >avg_8_32x4_c   :   24.5   25.5
> >avg_8_32x4_rvv_i32 :3.01.7
> >avg_8_32x8_c   :   48.5   50.7
> >avg_8_32x8_rvv_i32 :5.22.7
> >avg_8_32x16_c  :   96.7  101.2
> >avg_8_32x16_rvv_i32:   10.25.0
> >avg_8_32x32_c  :  192.7  202.2
> >avg_8_32x32_rvv_i32:   19.79.5
> >avg_8_32x64_c

Re: [FFmpeg-devel] [PATCH 2/4] lavc/vp9dsp: R-V V mc bilin hv

2024-08-09 Thread flow gg

> That seems suboptimal and unnecessary.

Updated it, there is no longer any vmv.

 于2024年8月9日周五 22:24写道：

> From: sunyuechi 
>
>  C908   X60
> vp9_avg_bilin_4hv_8bpp_c   :   10.79.5
> vp9_avg_bilin_4hv_8bpp_rvv_i32 :4.03.5
> vp9_avg_bilin_8hv_8bpp_c   :   38.5   34.2
> vp9_avg_bilin_8hv_8bpp_rvv_i32 :7.26.5
> vp9_avg_bilin_16hv_8bpp_c  :  147.2  130.5
> vp9_avg_bilin_16hv_8bpp_rvv_i32:   14.5   12.7
> vp9_avg_bilin_32hv_8bpp_c  :  574.2  509.7
> vp9_avg_bilin_32hv_8bpp_rvv_i32:   42.5   38.0
> vp9_avg_bilin_64hv_8bpp_c  : 2321.2 2017.7
> vp9_avg_bilin_64hv_8bpp_rvv_i32:  163.5  131.0
> vp9_put_bilin_4hv_8bpp_c   :   10.08.7
> vp9_put_bilin_4hv_8bpp_rvv_i32 :3.53.0
> vp9_put_bilin_8hv_8bpp_c   :   35.2   31.2
> vp9_put_bilin_8hv_8bpp_rvv_i32 :6.55.7
> vp9_put_bilin_16hv_8bpp_c  :  134.0  119.0
> vp9_put_bilin_16hv_8bpp_rvv_i32:   12.7   11.5
> vp9_put_bilin_32hv_8bpp_c  :  538.5  464.2
> vp9_put_bilin_32hv_8bpp_rvv_i32:   39.7   35.2
> vp9_put_bilin_64hv_8bpp_c  : 2111.7 1833.2
> vp9_put_bilin_64hv_8bpp_rvv_i32:  138.5  122.5
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 46 +-
>  libavcodec/riscv/vp9dsp_init.c | 10 
>  2 files changed, 55 insertions(+), 1 deletion(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 9e8061616f..d1ddbe007b 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -138,6 +138,48 @@ func ff_\op\()_vp9_bilin_64\type\()_rvv, zve32x
>  endfunc
>  .endm
>
> +.macro bilin_hv op
> +func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
> +lpad    0
> +vsetvlstatic8   64, t0, 64
> +.Lbilin_hv\op:
> +.ifc \op,avg
> +csrwi   vxrm, 0
> +.endif
> +neg t1, a5
> +neg t2, a6
> +li  t3, 8
> +bilin_load  v24, a5, h
> +1:
> +addi    a4, a4, -2
> +bilin_load  v8, a5, h
> +vwmulu.vx   v16, v8, a6
> +vwmaccsu.vx v16, t2, v24
> +vwadd.wx    v16, v16, t3
> +vnsra.wi    v16, v16, 4
> +vadd.vv v12, v16, v24
> +add t5, a0, a1
> +bilin_load  v24, a5, h
> +vwmulu.vx   v16, v24, a6
> +vwmaccsu.vx v16, t2, v8
> +vwadd.wx    v16, v16, t3
> +vnsra.wi    v16, v16, 4
> +vadd.vv v0, v16, v8
> +.ifc \op,avg
> +vle8.v  v8, (a0)
> +vle8.v  v16, (t5)
> +vaaddu.vv   v12, v12, v8
> +vaaddu.vv   v0, v0, v16
> +.endif
> +vse8.v  v12, (a0)
> +vse8.v  v0, (t5)
> +add a0, t5, a1
> +bnez    a4, 1b
> +
> +ret
> +endfunc
> +.endm
> +
>  .irp len, 64, 32, 16, 8, 4
>  copy_avg \len
>  .endr
> @@ -146,6 +188,8 @@ bilin_h_v  put, h, a5
>  bilin_h_v  avg, h, a5
>  bilin_h_v  put, v, a6
>  bilin_h_v  avg, v, a6
> +bilin_hv   put
> +bilin_hv   avg
>
>  .macro func_bilin_h_v len, op, type
>  func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
> @@ -157,7 +201,7 @@ endfunc
>
>  .irp len, 32, 16, 8, 4
>  .irp op, put, avg
> -.irp type, h, v
> +.irp type, h, v, hv
>  func_bilin_h_v \len, \op, \type
>  .endr
>  .endr
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index 83dbe1b5d9..d53852f673 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -83,6 +83,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp)
>  dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_4h_rvv;
>  dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_4v_rvv;
>  dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv;
> +dsp->mc[0][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_64hv_rvv;
> +dsp->mc[0][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_64hv_rvv;
> +dsp->mc[1][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_32hv_rvv;
> +dsp->mc[1][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_32hv_rvv;
> +dsp->mc[2][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_16hv_rvv;
> +dsp->mc[2][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_16hv_rvv;
> +dsp->mc[3][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_8hv_rvv;
> +dsp->mc[3][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_8hv_rvv;
> +dsp->mc[4][FILT

Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg

2024-08-03 Thread flow gg

Added lpad and resolved conflicts with master.

 于2024年8月3日周六 18:31写道：

> From: sunyuechi 
>
>  C908   X60
> avg_8_2x2_c:1.21.0
> avg_8_2x2_rvv_i32  :0.70.7
> avg_8_2x4_c:2.02.2
> avg_8_2x4_rvv_i32  :1.21.2
> avg_8_2x8_c:3.74.0
> avg_8_2x8_rvv_i32  :1.71.5
> avg_8_2x16_c   :7.27.7
> avg_8_2x16_rvv_i32 :3.02.7
> avg_8_2x32_c   :   14.2   15.2
> avg_8_2x32_rvv_i32 :5.55.0
> avg_8_2x64_c   :   51.0   43.7
> avg_8_2x64_rvv_i32 :   39.2   29.7
> avg_8_2x128_c  :  100.5   79.2
> avg_8_2x128_rvv_i32:   79.7   68.2
> avg_8_4x2_c:1.72.0
> avg_8_4x2_rvv_i32  :1.00.7
> avg_8_4x4_c:3.53.7
> avg_8_4x4_rvv_i32  :1.21.2
> avg_8_4x8_c:6.77.0
> avg_8_4x8_rvv_i32  :1.71.5
> avg_8_4x16_c   :   13.5   14.0
> avg_8_4x16_rvv_i32 :3.02.7
> avg_8_4x32_c   :   26.2   27.7
> avg_8_4x32_rvv_i32 :5.54.7
> avg_8_4x64_c   :   73.0   73.7
> avg_8_4x64_rvv_i32 :   39.0   32.5
> avg_8_4x128_c  :  143.0  137.2
> avg_8_4x128_rvv_i32:   72.7   68.0
> avg_8_8x2_c:3.53.5
> avg_8_8x2_rvv_i32  :1.00.7
> avg_8_8x4_c:6.26.5
> avg_8_8x4_rvv_i32  :1.51.0
> avg_8_8x8_c:   12.7   13.2
> avg_8_8x8_rvv_i32  :2.01.5
> avg_8_8x16_c   :   25.0   26.5
> avg_8_8x16_rvv_i32 :3.22.7
> avg_8_8x32_c   :   50.0   52.7
> avg_8_8x32_rvv_i32 :6.25.0
> avg_8_8x64_c   :  118.7  122.5
> avg_8_8x64_rvv_i32 :   40.2   31.5
> avg_8_8x128_c  :  236.7  220.2
> avg_8_8x128_rvv_i32:   85.2   67.7
> avg_8_16x2_c   :6.26.7
> avg_8_16x2_rvv_i32 :1.20.7
> avg_8_16x4_c   :   12.5   13.0
> avg_8_16x4_rvv_i32 :1.71.0
> avg_8_16x8_c   :   24.5   26.0
> avg_8_16x8_rvv_i32 :3.01.7
> avg_8_16x16_c  :   49.0   51.5
> avg_8_16x16_rvv_i32:5.53.0
> avg_8_16x32_c  :   97.5  102.5
> avg_8_16x32_rvv_i32:   10.55.5
> avg_8_16x64_c  :  213.7  222.0
> avg_8_16x64_rvv_i32:   48.5   34.2
> avg_8_16x128_c :  434.7  420.0
> avg_8_16x128_rvv_i32   :   97.7   74.0
> avg_8_32x2_c   :   12.2   12.7
> avg_8_32x2_rvv_i32 :1.51.0
> avg_8_32x4_c   :   24.5   25.5
> avg_8_32x4_rvv_i32 :3.01.7
> avg_8_32x8_c   :   48.5   50.7
> avg_8_32x8_rvv_i32 :5.22.7
> avg_8_32x16_c  :   96.7  101.2
> avg_8_32x16_rvv_i32:   10.25.0
> avg_8_32x32_c  :  192.7  202.2
> avg_8_32x32_rvv_i32:   19.79.5
> avg_8_32x64_c  :  427.5  426.5
> avg_8_32x64_rvv_i32:   64.2   18.2
> avg_8_32x128_c :  816.5  821.0
> avg_8_32x128_rvv_i32   :  135.2   75.5
> avg_8_64

Re: [FFmpeg-devel] [PATCH 1/4] lavc/vp9dsp: R-V V mc bilin h v

2024-08-03 Thread flow gg

> Looks OK, but missing CFI landing pads.

Added lpad.

 于2024年8月3日周六 17:51写道：

> From: sunyuechi 
>
>  C908   X60
> vp9_avg_bilin_4h_8bpp_c:5.54.7
> vp9_avg_bilin_4h_8bpp_rvv_i32  :1.71.5
> vp9_avg_bilin_4v_8bpp_c:5.54.7
> vp9_avg_bilin_4v_8bpp_rvv_i32  :1.51.2
> vp9_avg_bilin_8h_8bpp_c:   20.0   17.7
> vp9_avg_bilin_8h_8bpp_rvv_i32  :3.02.7
> vp9_avg_bilin_8v_8bpp_c:   20.7   18.7
> vp9_avg_bilin_8v_8bpp_rvv_i32  :3.02.7
> vp9_avg_bilin_16h_8bpp_c   :   78.2   69.7
> vp9_avg_bilin_16h_8bpp_rvv_i32 :7.06.2
> vp9_avg_bilin_16v_8bpp_c   :   98.5   73.2
> vp9_avg_bilin_16v_8bpp_rvv_i32 :7.06.0
> vp9_avg_bilin_32h_8bpp_c   :  325.5  275.5
> vp9_avg_bilin_32h_8bpp_rvv_i32 :   23.0   20.5
> vp9_avg_bilin_32v_8bpp_c   :  342.2  290.0
> vp9_avg_bilin_32v_8bpp_rvv_i32 :   21.7   19.5
> vp9_avg_bilin_64h_8bpp_c   : 1263.7 1095.7
> vp9_avg_bilin_64h_8bpp_rvv_i32 :   91.2   81.2
> vp9_avg_bilin_64v_8bpp_c   : 1331.7 1155.2
> vp9_avg_bilin_64v_8bpp_rvv_i32 :   91.2   81.0
> vp9_put_bilin_4h_8bpp_c:4.54.0
> vp9_put_bilin_4h_8bpp_rvv_i32  :1.01.0
> vp9_put_bilin_4v_8bpp_c:4.74.2
> vp9_put_bilin_4v_8bpp_rvv_i32  :1.01.0
> vp9_put_bilin_8h_8bpp_c:   16.7   15.0
> vp9_put_bilin_8h_8bpp_rvv_i32  :2.22.0
> vp9_put_bilin_8v_8bpp_c:   17.5   15.7
> vp9_put_bilin_8v_8bpp_rvv_i32  :2.22.0
> vp9_put_bilin_16h_8bpp_c   :   65.2   58.0
> vp9_put_bilin_16h_8bpp_rvv_i32 :6.05.5
> vp9_put_bilin_16v_8bpp_c   :   69.2   61.7
> vp9_put_bilin_16v_8bpp_rvv_i32 :5.75.2
> vp9_put_bilin_32h_8bpp_c   :  273.2  229.0
> vp9_put_bilin_32h_8bpp_rvv_i32 :   19.7   17.7
> vp9_put_bilin_32v_8bpp_c   :  290.5  243.7
> vp9_put_bilin_32v_8bpp_rvv_i32 :   18.7   16.7
> vp9_put_bilin_64h_8bpp_c   : 1040.5  910.5
> vp9_put_bilin_64h_8bpp_rvv_i32 :   82.5   73.0
> vp9_put_bilin_64v_8bpp_c   : 1108.5  971.0
> vp9_put_bilin_64v_8bpp_rvv_i32 :   82.2   73.2
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 116 +
>  libavcodec/riscv/vp9dsp.h  |  12 ++--
>  libavcodec/riscv/vp9dsp_init.c |  21 ++
>  3 files changed, 143 insertions(+), 6 deletions(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 8d776661d9..817cc58b5e 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -54,6 +54,122 @@ func ff_vp9_avg\len\()_rvv, zve32x
>  endfunc
>  .endm
>
> +.macro bilin_load_h dst, op, mn
> +addit5, a2, 1
> +vle8.v  v8, (a2)
> +vle8.v  v0, (t5)
> +vwmulu.vx   v16, v0, \mn
> +vwmaccsu.vx v16, t1, v8
> +vwadd.wxv16, v16, t4
> +vnsra.wiv16, v16, 4
> +vadd.vv \dst, v16, v8
> +.ifc \op,avg
> +vle8.v  v16, (a0)
> +vaaddu.vv   \dst, \dst, v16
> +.endif
> +.endm
> +
> +.macro bilin_h_v op, type, mn
> +func ff_\op\()_vp9_bilin_64\type\()_rvv, zve32x
> +lpad0
> +vsetvlstatic8   64, t0, 64
> +.ifc \op,avg
> +csrwi   vxrm, 0
> +.endif
> +li  t4, 8
> +neg t1, \mn
> +1:
> +addia4, a4, -1
> +.ifc \type,v
> +add t5, a2, a3
> +.else
> +addit5, a2, 1
> +.endif
> +vle8.v  v8, (a2)
> +vle8.v  v0, (t5)
> +vwmulu.vx   v16, v0, \mn
> +vwmaccsu.vx v16, t1, v8
> +vwadd.wxv16, v16, t4
> +vnsra.wiv16, v16, 4
> +vadd.vv v0, v16, v8
> +.ifc \op,avg
> +vle8.v  v16, (a0)
> +vaaddu.vv   v0, v0, v16
> +.endif
> +vse8.v  v0, (a0)
> +add a2, a2, a3
> +add a0, a0, a1
> +bneza4, 1b
> +ret
> +
> +.Lbilin_\type\op:
> +.ifc \op,avg
> +csrwi   vxrm, 0
> +.endif
> +li  t4, 8
> +neg t1, \mn

Re: [FFmpeg-devel] [PATCH 3/4] lavc/vp9dsp: R-V V mc tap h v

2024-08-01 Thread flow gg

> Use rounding.

Updated it and resolved conflicts with master.

 于2024年8月1日周四 20:16写道：

> From: sunyuechi 
>
>  C908   X60
> vp9_avg_8tap_smooth_4h_8bpp_c  :   12.7   11.2
> vp9_avg_8tap_smooth_4h_8bpp_rvv_i32:4.74.2
> vp9_avg_8tap_smooth_4v_8bpp_c  :   29.7   12.5
> vp9_avg_8tap_smooth_4v_8bpp_rvv_i32:4.74.2
> vp9_avg_8tap_smooth_8h_8bpp_c  :   48.7   42.2
> vp9_avg_8tap_smooth_8h_8bpp_rvv_i32:9.58.5
> vp9_avg_8tap_smooth_8v_8bpp_c  :   49.7   45.5
> vp9_avg_8tap_smooth_8v_8bpp_rvv_i32:9.58.5
> vp9_avg_8tap_smooth_16h_8bpp_c :  192.0  166.5
> vp9_avg_8tap_smooth_16h_8bpp_rvv_i32   :   21.7   19.5
> vp9_avg_8tap_smooth_16v_8bpp_c :  191.2  175.2
> vp9_avg_8tap_smooth_16v_8bpp_rvv_i32   :   21.2   19.0
> vp9_avg_8tap_smooth_32h_8bpp_c :  780.2  663.2
> vp9_avg_8tap_smooth_32h_8bpp_rvv_i32   :   68.2   60.5
> vp9_avg_8tap_smooth_32v_8bpp_c :  770.0  685.7
> vp9_avg_8tap_smooth_32v_8bpp_rvv_i32   :   67.0   59.5
> vp9_avg_8tap_smooth_64h_8bpp_c : 3116.2 2648.2
> vp9_avg_8tap_smooth_64h_8bpp_rvv_i32   :  270.7  120.7
> vp9_avg_8tap_smooth_64v_8bpp_c : 3058.5 2731.7
> vp9_avg_8tap_smooth_64v_8bpp_rvv_i32   :  266.5  119.0
> vp9_put_8tap_smooth_4h_8bpp_c  :   11.09.7
> vp9_put_8tap_smooth_4h_8bpp_rvv_i32:4.23.7
> vp9_put_8tap_smooth_4v_8bpp_c  :   11.7   10.5
> vp9_put_8tap_smooth_4v_8bpp_rvv_i32:4.03.7
> vp9_put_8tap_smooth_8h_8bpp_c  :   42.0   37.5
> vp9_put_8tap_smooth_8h_8bpp_rvv_i32:8.57.7
> vp9_put_8tap_smooth_8v_8bpp_c  :   43.5   38.5
> vp9_put_8tap_smooth_8v_8bpp_rvv_i32:8.77.7
> vp9_put_8tap_smooth_16h_8bpp_c :  181.7  147.2
> vp9_put_8tap_smooth_16h_8bpp_rvv_i32   :   20.0   18.0
> vp9_put_8tap_smooth_16v_8bpp_c :  168.5  149.7
> vp9_put_8tap_smooth_16v_8bpp_rvv_i32   :   19.7   17.5
> vp9_put_8tap_smooth_32h_8bpp_c :  675.0  586.5
> vp9_put_8tap_smooth_32h_8bpp_rvv_i32   :   65.2   58.0
> vp9_put_8tap_smooth_32v_8bpp_c :  664.7  591.2
> vp9_put_8tap_smooth_32v_8bpp_rvv_i32   :   64.0   57.0
> vp9_put_8tap_smooth_64h_8bpp_c : 2696.2 2339.0
> vp9_put_8tap_smooth_64h_8bpp_rvv_i32   :  259.7  115.7
> vp9_put_8tap_smooth_64v_8bpp_c : 2691.0 2348.5
> vp9_put_8tap_smooth_64v_8bpp_rvv_i32   :  255.5  114.0
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 182 +
>  libavcodec/riscv/vp9dsp.h  |  72 -
>  libavcodec/riscv/vp9dsp_init.c |  38 ++-
>  3 files changed, 267 insertions(+), 25 deletions(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 4faf932100..ce8fe6f66f 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -36,6 +36,18 @@
>  .endif
>  .endm
>
> +.macro vsetvlstatic16 len
> +.ifc \len,4
> +vsetvli zero, zero, e16, mf2, ta, ma
> +.elseif \len == 8
> +vsetvli zero, zero, e16, m1, ta, ma
> +.elseif \len == 16
> +vsetvli zero, zero, e16, m2, ta, ma
> +.else
> +vsetvli zero, zero, e16, m4, ta, ma
> +.endif
> +.endm
> +
>  .macro copy_avg len
>  func ff_vp9_avg\len\()_rvv, zve32x
>  lpad0
> @@ -182,8 +194,178 @@ func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
>  endfunc
>  .endm
>
> +.equ ff_vp9_subpel_filters_smooth, ff_vp9_subpel_filters
> +.equ ff_vp9_subpel_filters_regular, ff_vp9_subpel_filters + 16*8*2
> +.equ ff_vp9_subpel_filters_sharp, ff_vp9_subpel_filters + 16*8*2*2
> +
> +.macro epel_filter name, type, regtype, arg
> +lla \regtype\()2, ff_vp9_subpel_filters_\name
> +.ifc \type,v
> +slli\regtype\()0, a6, 4
> +.else
> +slli\regtype\()0, a5, 4
> +.endif
> +add \regtype\()0, \regtype\()0, \regtype\()2
> +lh  \regtype\()1, 2(\regtype\()0)
> +lh  \regtype\()2, 4(\regtype\()0)
> +lh  \regtype\()3, 6(\regtype\()0)
> +lh  \regtype\()4, 8(\regtype\()0)
> +lh  \regtype\()5, 10(\regtype\()0)
> +lh  \regtype\()6, 12(\regtype\()0)
> +lh  \arg, 14(\regtype\()0)
> +lh  \regtype\()0, 0(\regtype\()0)
> +.endm
> +
> +.macro epel_load dst, len, op, name, type, from_mem, regtype
> +.ifc \from_mem, 1
> +vle8

Re: [FFmpeg-devel] [PATCH 1/4] lavc/vp8dsp: R-V V 256 bilin,epel

2024-07-31 Thread flow gg

Thank you for the detailed explanation. One more question: I understand
that assembly code needs to be further broken down, but what's the issue
with adding this code to the init section of the C code here? I think this
C code is just mimicking the init section of the C code in x86.

Rémi Denis-Courmont  于2024年7月31日周三 23:06写道：

> Le tiistaina 30. heinäkuuta 2024, 20.57.28 EEST flow gg a écrit :
> > From my understanding, moving from supporting only 128b to adding 256b
> > versions can simultaneously improve LMUL and solve some issues related to
> > insufficient vector registers (vvc, vp9).
>
> To the contrary, if vectors are too short to process a macroblock in a
> single
> round, then there should be a loop with maximum LMUL, and the code should
> be
> the same for all vector length. That is just normal textbook RVV coding
> style.
> There should *not* be vector length specialisation since the code can be
> shared.
>
> > If we continue to support 512, 1024, ..., it almost exclusively improves
> > LMUL.
>
> I don't think so. Even more so than 256-bit hardware, 512-bit and 1024-bit
> hardware really _needs_ to short-circuit vector processing based on VL and
> not
> simply follow LMUL.
>
> > Therefore, 256b is the most worthwhile addition, and we can skip
> > adding 512b, 1024b, etc.
> >
> > Additionally, even though longer hardware will continually be developed,
> > the most used will probably still be 128b and 256b.
>
> I wouldn't be so sure. Realistically, lower-end SoCs decode video with
> DSPs.
> So video decoder vector optimisations are mainly for the server side, and
> that's exactly where larger vector sizes are most likely (e.g. AVX-512).
>
> > If someone complains that FFmpeg's RVV doesn't support 1024b well, it can
> > be said that it's not just RISC-V that lacks good support.
> > However, if the 256b performance is not good, then it seems like an issue
> > with RISC-V. :)
> >
> > I think maybe we can give some preference to the two smallest lengths?
>
> As I wrote, I am not necessarily against specialising for 256-bit as such.
> I
> am against:
> 1) specialising functions that do not really need to be specialised,
> 2) adding tons of boilerplate (notably in the C code) for it.
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v

2024-07-31 Thread flow gg

I'm a bit confused, because the calculation here widens to 32 bits and then
narrows back to 8 bits. It seems that the vmax and vnclipu instructions can't
be removed by using rounding instructions?

Rémi Denis-Courmont  于2024年7月29日周一 23:21写道：

> Le tiistaina 23. heinäkuuta 2024, 11.51.48 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
> >
> >  C908   X60
> > vp9_avg_8tap_smooth_4h_8bpp_c  :   12.7   11.2
> > vp9_avg_8tap_smooth_4h_8bpp_rvv_i32:4.74.2
> > vp9_avg_8tap_smooth_4v_8bpp_c  :   29.7   12.5
> > vp9_avg_8tap_smooth_4v_8bpp_rvv_i32:4.74.2
> > vp9_avg_8tap_smooth_8h_8bpp_c  :   48.7   42.2
> > vp9_avg_8tap_smooth_8h_8bpp_rvv_i32:9.58.5
> > vp9_avg_8tap_smooth_8v_8bpp_c  :   49.7   45.5
> > vp9_avg_8tap_smooth_8v_8bpp_rvv_i32:9.58.5
> > vp9_avg_8tap_smooth_16h_8bpp_c :  192.0  166.5
> > vp9_avg_8tap_smooth_16h_8bpp_rvv_i32   :   21.7   19.5
> > vp9_avg_8tap_smooth_16v_8bpp_c :  191.2  175.2
> > vp9_avg_8tap_smooth_16v_8bpp_rvv_i32   :   21.2   19.0
> > vp9_avg_8tap_smooth_32h_8bpp_c :  780.2  663.2
> > vp9_avg_8tap_smooth_32h_8bpp_rvv_i32   :   68.2   60.5
> > vp9_avg_8tap_smooth_32v_8bpp_c :  770.0  685.7
> > vp9_avg_8tap_smooth_32v_8bpp_rvv_i32   :   67.0   59.5
> > vp9_avg_8tap_smooth_64h_8bpp_c : 3116.2 2648.2
> > vp9_avg_8tap_smooth_64h_8bpp_rvv_i32   :  270.7  120.7
> > vp9_avg_8tap_smooth_64v_8bpp_c : 3058.5 2731.7
> > vp9_avg_8tap_smooth_64v_8bpp_rvv_i32   :  266.5  119.0
> > vp9_put_8tap_smooth_4h_8bpp_c  :   11.09.7
> > vp9_put_8tap_smooth_4h_8bpp_rvv_i32:4.23.7
> > vp9_put_8tap_smooth_4v_8bpp_c  :   11.7   10.5
> > vp9_put_8tap_smooth_4v_8bpp_rvv_i32:4.03.7
> > vp9_put_8tap_smooth_8h_8bpp_c  :   42.0   37.5
> > vp9_put_8tap_smooth_8h_8bpp_rvv_i32:8.57.7
> > vp9_put_8tap_smooth_8v_8bpp_c  :   43.5   38.5
> > vp9_put_8tap_smooth_8v_8bpp_rvv_i32:8.77.7
> > vp9_put_8tap_smooth_16h_8bpp_c :  181.7  147.2
> > vp9_put_8tap_smooth_16h_8bpp_rvv_i32   :   20.0   18.0
> > vp9_put_8tap_smooth_16v_8bpp_c :  168.5  149.7
> > vp9_put_8tap_smooth_16v_8bpp_rvv_i32   :   19.7   17.5
> > vp9_put_8tap_smooth_32h_8bpp_c :  675.0  586.5
> > vp9_put_8tap_smooth_32h_8bpp_rvv_i32   :   65.2   58.0
> > vp9_put_8tap_smooth_32v_8bpp_c :  664.7  591.2
> > vp9_put_8tap_smooth_32v_8bpp_rvv_i32   :   64.0   57.0
> > vp9_put_8tap_smooth_64h_8bpp_c : 2696.2 2339.0
> > vp9_put_8tap_smooth_64h_8bpp_rvv_i32   :  259.7  115.7
> > vp9_put_8tap_smooth_64v_8bpp_c : 2691.0 2348.5
> > vp9_put_8tap_smooth_64v_8bpp_rvv_i32   :  255.5  114.0
> > ---
> >  libavcodec/riscv/vp9_mc_rvv.S  | 193 +
> >  libavcodec/riscv/vp9dsp.h  |  72 
> >  libavcodec/riscv/vp9dsp_init.c |  38 ++-
> >  3 files changed, 278 insertions(+), 25 deletions(-)
> >
> > diff --git a/libavcodec/riscv/vp9_mc_rvv.S
> b/libavcodec/riscv/vp9_mc_rvv.S
> > index 5241562531..6a4be7b9bd 100644
> > --- a/libavcodec/riscv/vp9_mc_rvv.S
> > +++ b/libavcodec/riscv/vp9_mc_rvv.S
> > @@ -36,6 +36,18 @@
> >  .endif
> >  .endm
> >
> > +.macro vsetvlstatic16 len
> > +.ifc \len,4
> > +vsetvli zero, zero, e16, mf2, ta, ma
> > +.elseif \len == 8
> > +vsetvli zero, zero, e16, m1, ta, ma
> > +.elseif \len == 16
> > +vsetvli zero, zero, e16, m2, ta, ma
> > +.else
> > +vsetvli zero, zero, e16, m4, ta, ma
> > +.endif
> > +.endm
> > +
> >  .macro copy_avg len
> >  func ff_vp9_avg\len\()_rvv, zve32x
> >  csrwi   vxrm, 0
> > @@ -181,8 +193,189 @@ func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
> >  endfunc
> >  .endm
> >
> > +.equ ff_vp9_subpel_filters_smooth, ff_vp9_subpel_filters
> > +.equ ff_vp9_subpel_filters_regular, ff_vp9_subpel_filters + 16*8*2
> > +.equ ff_vp9_subpel_filters_sharp, ff_vp9_subpel_filters + 16*8*2*2
> > +
> > +.macro epel_filter name, type, regtype, arg
> > +lla \regtype\()2, ff_vp9_subpel_filters_\name
> > +.ifc \type,v
> > +slli\regtype\()0, a6, 4
> > +.else
> > +slli\regtype\()0, a5, 4
> > +.endif
> > +add \regtype\()0, \regtype\()0, \regtype\()2
> > +lh  \regtype\()1, 2(\regtype\()0)
> > +lh  \regtype\()2, 4(\regtype\()0)
> > +lh

Re: [FFmpeg-devel] [PATCH 1/4] lavc/vp8dsp: R-V V 256 bilin,epel

2024-07-30 Thread flow gg

Hi, these four patches have v2 (although the first one seems to be the
same).

From my understanding, moving from supporting only 128b to adding 256b
versions can simultaneously improve LMUL and solve some issues related to
insufficient vector registers (vvc, vp9).
This can be very helpful in certain situations.

If we continue to support 512, 1024, ..., it almost exclusively improves
LMUL. Therefore, 256b is the most worthwhile addition, and we can skip
adding 512b, 1024b, etc.

Additionally, even though longer hardware will continually be developed,
the most used will probably still be 128b and 256b.
If someone complains that FFmpeg's RVV doesn't support 1024b well, it can
be said that it's not just RISC-V that lacks good support.
However, if the 256b performance is not good, then it seems like an issue
with RISC-V. :)

I think maybe we can give some preference to the two smallest lengths?

Rémi Denis-Courmont  于2024年7月29日周一 22:45写道：

> Hi,
>
> Le lauantaina 22. kesäkuuta 2024, 18.58.03 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
>
> In my opinion, we can't keep on like this. By the end of year, there will
> also
> be 512-bit vector hardware. In the worst case, specialisation on vector
> length
> could require 7 variants of every function, as many as legal LMUL values.
>
> Generating the LMUL at run time or initialisation time is too slow for
> fixed-
> size functions, so I can only see two viable options here:
>
> 1) We ignore this problem entirely and only optimise to 128-bit or to the
> current minimum VLEN. The intent of the specification is ostensibly that
> processing should scale according to the current value of VL, not
> VTYPE.LMUL.
> That is why the minimum legal LMUL value is SEW/ELEN rather than 1/VLMAX
> (and
> draft versions did not even have fractional multipliers).
>
> 2) The specialisation code is heavily factored, including in the C
> initialisation side.
>
> Personally, I prefer to ignore the problem until we see more mature and
> varied
> hardware. I do note that SiFive is ostensibly not specialising their code
> by
> VLEN, which tends to confirm that this is just a case of immature design
> from
> T-Head.
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v4 4/4] lavc/vp9dsp: R-V V mc tap hv

2024-07-23 Thread flow gg

Because of the 3/4 update, I updated this patch as well.

 于2024年7月23日周二 16:59写道：

> From: sunyuechi 
>
>  C908   X60
> vp9_avg_8tap_smooth_4hv_8bpp_c :   32.0   28.0
> vp9_avg_8tap_smooth_4hv_8bpp_rvv_i32   :   15.0   13.2
> vp9_avg_8tap_smooth_8hv_8bpp_c :   98.0   86.2
> vp9_avg_8tap_smooth_8hv_8bpp_rvv_i32   :   23.7   21.2
> vp9_avg_8tap_smooth_16hv_8bpp_c:  355.7  297.0
> vp9_avg_8tap_smooth_16hv_8bpp_rvv_i32  :   47.0   41.5
> vp9_avg_8tap_smooth_32hv_8bpp_c: 1272.7 1099.7
> vp9_avg_8tap_smooth_32hv_8bpp_rvv_i32  :  134.7  119.7
> vp9_avg_8tap_smooth_64hv_8bpp_c: 4937.0 4224.2
> vp9_avg_8tap_smooth_64hv_8bpp_rvv_i32  :  528.5  228.5
> vp9_put_8tap_smooth_4hv_8bpp_c :   30.2   26.7
> vp9_put_8tap_smooth_4hv_8bpp_rvv_i32   :   30.5   12.5
> vp9_put_8tap_smooth_8hv_8bpp_c :   91.5   81.2
> vp9_put_8tap_smooth_8hv_8bpp_rvv_i32   :   22.7   20.2
> vp9_put_8tap_smooth_16hv_8bpp_c:  313.2  277.5
> vp9_put_8tap_smooth_16hv_8bpp_rvv_i32  :   45.2   40.2
> vp9_put_8tap_smooth_32hv_8bpp_c: 1166.7 1022.2
> vp9_put_8tap_smooth_32hv_8bpp_rvv_i32  :  131.7  117.2
> vp9_put_8tap_smooth_64hv_8bpp_c: 4560.5 3961.7
> vp9_put_8tap_smooth_64hv_8bpp_rvv_i32  :  517.0  223.2
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 75 ++
>  libavcodec/riscv/vp9dsp_init.c |  8 
>  2 files changed, 83 insertions(+)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 6a4be7b9bd..26754ac6f8 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -366,6 +366,77 @@ func
> ff_\op\()_vp9_8tap_\name\()_\len\()\type\()_rvv\vlen\(), zve32x
>  endfunc
>  .endm
>
> +#if __riscv_xlen == 64
> +.macro epel_hv_once len, name, op
> +sub a2, a2, a3
> +sub a2, a2, a3
> +sub a2, a2, a3
> +.irp n,0,2,4,6,8,10,12,14
> +epel_load_inc   v\n, \len, put, \name, h, 1, t
> +.endr
> +addia4, a4, -1
> +1:
> +addia4, a4, -1
> +epel_load   v30, \len, \op, \name, v, 0, s
> +vse8.v  v30, (a0)
> +vmv.v.v v0, v2
> +vmv.v.v v2, v4
> +vmv.v.v v4, v6
> +vmv.v.v v6, v8
> +vmv.v.v v8, v10
> +vmv.v.v v10, v12
> +vmv.v.v v12, v14
> +epel_load   v14, \len, put, \name, h, 1, t
> +add a2, a2, a3
> +add a0, a0, a1
> +bneza4, 1b
> +epel_load   v30, \len, \op, \name, v, 0, s
> +vse8.v  v30, (a0)
> +.endm
> +
> +.macro epel_hv op, name, len, vlen
> +func ff_\op\()_vp9_8tap_\name\()_\len\()hv_rvv\vlen\(), zve32x
> +addisp, sp, -64
> +.irp n,0,1,2,3,4,5,6,7
> +sd  s\n, \n\()<<3(sp)
> +.endr
> +.if \len == 64 && \vlen < 256
> +addisp, sp, -48
> +.irp n,0,1,2,3,4,5
> +sd  a\n, \n\()<<3(sp)
> +.endr
> +.endif
> +.ifc \op,avg
> +csrwi   vxrm, 0
> +.endif
> +epel_filter \name, h, t, a7
> +epel_filter \name, v, s, s7
> +.if \vlen < 256
> +vsetvlstatic8   \len, a6, 32, m2
> +.else
> +vsetvlstatic8   \len, a6, 64, m2
> +.endif
> +epel_hv_once\len, \name, \op
> +.if \len == 64 && \vlen < 256
> +.irp n,0,1,2,3,4,5
> +ld  a\n, \n\()<<3(sp)
> +.endr
> +addisp, sp, 48
> +addia0, a0, 32
> +addia2, a2, 32
> +epel_filter \name, h, t, a7
> +epel_hv_once\len, \name, \op
> +.endif
> +.irp n,0,1,2,3,4,5,6,7
> +ld  s\n, \n\()<<3(sp)
> +.endr
> +addisp, sp, 64
> +
> +ret
> +endfunc
> +.endm
> +#endif
> +
>  .irp len, 64, 32, 16, 8, 4
>  copy_avg \len
>  .irp op, put, avg
> @@ -374,6 +445,10 @@ endfunc
>  epel \len, \op, \name, \type, 128
>  epel \len, \op, \name, \type, 256
>  .endr
> +#if __riscv_xlen == 64
> +epel_hv \op, \name, \len, 128
> +epel_hv \op, \name, \len, 256
> +#endif
>  .endr
>  .endr
>  .endr
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index 3669070fca..7b090c9889 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -119,6 +119,10 @@ sta

Re: [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v

2024-07-23 Thread flow gg

> TBH it is very hard to review this due to the large extents of code
> conditionals. This should be avoidable at least partly. You can name macros
> for each filter and then expand those macros instead of using if's.

Do you mean that before the addition of .equ ff_vp9_subpel_filters_xxx,
epel_filter had too many if statements?
Now the filter has only two if statements. Anyway, I have updated it and
removed one more if.

> Besides in my experience, it is more readable to leave the loads/stores to
> the outer function or macros and factor only the calculations, whenever you
> need to apply the same maths vertically and/or horizontally. This also
> sometimes enables actually using shared code, e.g., the H.263 loop filter or
> the VC-1 ITX.

There is an issue here because of insufficient vector registers, so vector
registers need to be reused.
If we use the H.263 method, it would require two more jumps.
Additionally, there are not enough scalar registers either, so more stack
space is needed.
I want to implement this as a macro for lengths of 4, 8, 16, 32, and 64
first.
In a subsequent patch, I will break down 4, 8, and 16 into one macro, and
32 or 64 into another macro.
This way, code can be shared better, and other adjustments (such as using
vlseg) become possible.

> Lastly this seems to both add new optimisations *and* add specialisations
> for 256-bit vectors, which really should be separate patches, but maybe I
> just don't understand the code. In any case, that would not really match
> with the patch description.

I think the purpose of this patch is to implement 128b+256b RVV, so adding
the corresponding 128+256 functions in vp9dsp.h could also be part of this
patch?

Rémi Denis-Courmont  于2024年7月13日周六 17:02写道：

> Le lauantaina 15. kesäkuuta 2024, 14.50.33 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
>
> OK, so I realise that this review is very late, but...
>
> TBH it is very hard to review this due to the large extents of code
> conditionals. This should be avoidable at least partly. You can name macros
> for each filter and then expand those macros instead of using if's.
>
> Besides in my experience, it is more readable to leave the loads/stores to
> the
> outer function or macros and factor only the calculations, whenever you
> need
> to apply the same maths vertically and/or horizontally. This also
> sometimes
> enables actually using shared code, e.g., the H.263 loop filter or the
> VC-1
> ITX.
>
> Lastly this seems to both add new optimisations *and* add specialisations
> for
> 256-bit vectors, which really should be separate patches, but maybe I just
> don't understand the code. In any case, that would not really match with
> the
> patch description.
>
>
> >  C908   X60
> > vp9_avg_8tap_smooth_4h_8bpp_c  :   12.7   11.2
> > vp9_avg_8tap_smooth_4h_8bpp_rvv_i32:4.74.2
> > vp9_avg_8tap_smooth_4v_8bpp_c  :   29.7   12.5
> > vp9_avg_8tap_smooth_4v_8bpp_rvv_i32:4.74.2
> > vp9_avg_8tap_smooth_8h_8bpp_c  :   48.7   42.2
> > vp9_avg_8tap_smooth_8h_8bpp_rvv_i32:9.58.5
> > vp9_avg_8tap_smooth_8v_8bpp_c  :   49.7   45.5
> > vp9_avg_8tap_smooth_8v_8bpp_rvv_i32:9.58.5
> > vp9_avg_8tap_smooth_16h_8bpp_c :  192.0  166.5
> > vp9_avg_8tap_smooth_16h_8bpp_rvv_i32   :   21.7   19.5
> > vp9_avg_8tap_smooth_16v_8bpp_c :  191.2  175.2
> > vp9_avg_8tap_smooth_16v_8bpp_rvv_i32   :   21.2   19.0
> > vp9_avg_8tap_smooth_32h_8bpp_c :  780.2  663.2
> > vp9_avg_8tap_smooth_32h_8bpp_rvv_i32   :   68.2   60.5
> > vp9_avg_8tap_smooth_32v_8bpp_c :  770.0  685.7
> > vp9_avg_8tap_smooth_32v_8bpp_rvv_i32   :   67.0   59.5
> > vp9_avg_8tap_smooth_64h_8bpp_c : 3116.2 2648.2
> > vp9_avg_8tap_smooth_64h_8bpp_rvv_i32   :  270.7  120.7
> > vp9_avg_8tap_smooth_64v_8bpp_c : 3058.5 2731.7
> > vp9_avg_8tap_smooth_64v_8bpp_rvv_i32   :  266.5  119.0
> > vp9_put_8tap_smooth_4h_8bpp_c  :   11.09.7
> > vp9_put_8tap_smooth_4h_8bpp_rvv_i32:4.23.7
> > vp9_put_8tap_smooth_4v_8bpp_c  :   11.7   10.5
> > vp9_put_8tap_smooth_4v_8bpp_rvv_i32:4.03.7
> > vp9_put_8tap_smooth_8h_8bpp_c  :   42.0   37.5
> > vp9_put_8tap_smooth_8h_8bpp_rvv_i32:8.57.7
> > vp9_put_8tap_smooth_8v_8bpp_c  :   43.5   38.5
> > vp9_put_8tap_smooth_8v_8bpp_rvv_i32:8.77.7
> > vp9_put_8tap_smooth_16h_8bpp_c :  181.7  147.2
> > vp9_put_8tap_smooth_16h_8bpp_rvv_i32   :   20.0   18.0
> > vp9_put_8tap_smooth_16v_8bpp_c :  168.5  149.7
> > vp9_put_8tap_smooth_16v_8bpp_rvv_i32

Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg

2024-07-21 Thread flow gg

Okay, updated it

Rémi Denis-Courmont  于2024年7月19日周五 23:56写道：

> Le torstaina 18. heinäkuuta 2024, 18.04.15 EEST flow gg a écrit :
> > > Again, I don't think that a maximum multiplier belongs here. If the
> > > calling code cannot scale the multiplier up, then it should be a normal
> > > loop providing the same code for all VLENs.
> >
> > I think it's acceptable to add such a parameter, which isn't particularly
> > common in other files, because this vset is used for vvc_mc_rvv.S rather
> > than libavutil/riscv/asm.S.
>
> Maybe but that's really not my point. If you use the same LMUL for all
> VLENBs,
> then you should use the same function, not two copies of the exact same
> function.
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg

2024-07-18 Thread flow gg

> Again, I don't think that a maximum multiplier belongs here. If the calling
> code cannot scale the multiplier up, then it should be a normal loop
> providing the same code for all VLENs.

I think it's acceptable to add such a parameter, which isn't particularly
common in other files, because this vset is used for vvc_mc_rvv.S rather
than libavutil/riscv/asm.S. This parameter isn't only used for avg and
w_avg; it can also save some .if conditionals in other functions in
vvc_mc_rvv.S later on.


>> +.4byte \id\()64\vlen\()f - jmp_table_\id\vlen
>> +.4byte \id\()128\vlen\()f - jmp_table_\id\vlen
> Maybe use .irp here?

I'm not sure; using .irp together with these macro arguments gives a syntax
error:

.irp w,2,4,8,16,32,64,128
.4byte \id\()\w\()\vlen\()f - jmp_table_\id\vlen
.endr

libavcodec/riscv/vvc/vvc_mc_rvv.S:176: Error: junk at end of line, first
unrecognized character is `\'
libavcodec/riscv/vvc/vvc_mc_rvv.S:195:   Info: macro invoked from here

> Breaks build if XLEN = 32.

Okay, updated it.

Rémi Denis-Courmont  于2024年7月16日周二 22:31写道：

> Le keskiviikkona 10. heinäkuuta 2024, 13.02.44 EEST u...@foxmail.com a
> écrit :
> > From: sunyuechi 
> >
> >   C908   X60
> > avg_8_2x2_c:1.21.2
> > avg_8_2x2_rvv_i32  :0.70.7
> > avg_8_2x4_c:2.02.0
> > avg_8_2x4_rvv_i32  :1.21.0
> > avg_8_2x8_c:3.74.0
> > avg_8_2x8_rvv_i32  :1.71.5
> > avg_8_2x16_c   :7.27.5
> > avg_8_2x16_rvv_i32 :3.02.7
> > avg_8_2x32_c   :   14.5   15.2
> > avg_8_2x32_rvv_i32 :5.55.0
> > avg_8_2x64_c   :   53.5   42.2
> > avg_8_2x64_rvv_i32 :   42.0   33.2
> > avg_8_2x128_c  :   93.5   86.0
> > avg_8_2x128_rvv_i32:   79.2   74.0
> > avg_8_4x2_c:1.72.0
> > avg_8_4x2_rvv_i32  :1.01.0
> > avg_8_4x4_c:3.53.5
> > avg_8_4x4_rvv_i32  :1.21.0
> > avg_8_4x8_c:6.57.0
> > avg_8_4x8_rvv_i32  :1.71.7
> > avg_8_4x16_c   :   13.5   14.0
> > avg_8_4x16_rvv_i32 :3.02.5
> > avg_8_4x32_c   :   26.2   27.5
> > avg_8_4x32_rvv_i32 :5.75.0
> > avg_8_4x64_c   :   79.0   66.5
> > avg_8_4x64_rvv_i32 :   41.7   34.2
> > avg_8_4x128_c  :  154.0  128.7
> > avg_8_4x128_rvv_i32:   80.5   74.5
> > avg_8_8x2_c:3.23.2
> > avg_8_8x2_rvv_i32  :1.00.7
> > avg_8_8x4_c:6.56.5
> > avg_8_8x4_rvv_i32  :1.21.0
> > avg_8_8x8_c:   12.5   13.2
> > avg_8_8x8_rvv_i32  :2.01.7
> > avg_8_8x16_c   :   25.2   26.5
> > avg_8_8x16_rvv_i32 :3.22.7
> > avg_8_8x32_c   :   50.0   52.7
> > avg_8_8x32_rvv_i32 :6.24.7
> > avg_8_8x64_c   :  130.0  112.2
> > avg_8_8x64_rvv_i32 :   44.2   33.5
> > avg_8_8x128_c  :  241.5  226.7
> > avg_8_8x128_rvv_i32:   78.7   74.0
> > avg_8_16x2_c   :6.26.5
> > avg_8_16x2_rvv_i32 :1.20.7
> > avg_8_16x4_c   :   12.2   13.0
> > avg_8_16x4_rvv_i32 :1.71.0
> > avg_8_16x8_c   :   24.7   25.7
> > avg_8_16x8_rvv_i32 :3.01.7
> > avg_8_16x16_c  :   49.0   51.5
> > avg_8_16x16_rvv_i32:5.53.2
> > avg_8_16x32_c  :   97.7  102.7
> > avg_8_16x32_rvv_i32:   10.55.5
> > avg_8_16x64_c

Re: [FFmpeg-devel] [PATCH v2 2/4] lavc/vp8dsp: R-V V loop_filter_simple

2024-07-14 Thread flow gg

> vssseg2e8
> vlsseg4e8
> vwadd.wv
> I can't find where VXRM is initialised for that.

Updated them and added csrwi.

 于2024年7月15日周一 00:30写道：

> From: sunyuechi 
>
>                                      C908   X60
> vp8_loop_filter_simple_h_c         :    6.2    5.7
> vp8_loop_filter_simple_h_rvv_i32   :    3.0    2.5
> vp8_loop_filter_simple_v_c         :    6.5    6.2
> vp8_loop_filter_simple_v_rvv_i32   :    2.0    1.5
> ---
>  libavcodec/riscv/vp8dsp_init.c | 18 +++-
>  libavcodec/riscv/vp8dsp_rvv.S  | 77 ++
>  2 files changed, 94 insertions(+), 1 deletion(-)
>
> diff --git a/libavcodec/riscv/vp8dsp_init.c
> b/libavcodec/riscv/vp8dsp_init.c
> index dcb6307d5b..8c5b2c8b04 100644
> --- a/libavcodec/riscv/vp8dsp_init.c
> +++ b/libavcodec/riscv/vp8dsp_init.c
> @@ -49,6 +49,9 @@ VP8_BILIN(16, rvv256);
>  VP8_BILIN(8,  rvv256);
>  VP8_BILIN(4,  rvv256);
>
> +VP8_LF(rvv128);
> +VP8_LF(rvv256);
> +
>  av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
>  {
>  #if HAVE_RV
> @@ -147,9 +150,15 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
>  av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
>  {
>  #if HAVE_RVV
> +int vlenb = ff_get_rv_vlenb();
> +
> +#define init_loop_filter(vlen)   \
> +c->vp8_v_loop_filter_simple =
> ff_vp8_v_loop_filter16_simple_rvv##vlen; \
> +c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_rvv##vlen;
> +
>  int flags = av_get_cpu_flags();
>
> -if (flags & AV_CPU_FLAG_RVV_I32 && ff_rv_vlen_least(128)) {
> +if (flags & AV_CPU_FLAG_RVV_I32 && vlenb >= 16) {
>  #if __riscv_xlen >= 64
>  if (flags & AV_CPU_FLAG_RVV_I64)
>  c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_rvv;
> @@ -159,6 +168,13 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
>  c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_rvv;
>  if (flags & AV_CPU_FLAG_RVV_I64)
>  c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_rvv;
> +
> +if (vlenb >= 32) {
> +init_loop_filter(256);
> +} else {
> +init_loop_filter(128);
> +}
>  }
> +#undef init_loop_filter
>  #endif
>  }
> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> index 0cbf1672f7..3cec4dd135 100644
> --- a/libavcodec/riscv/vp8dsp_rvv.S
> +++ b/libavcodec/riscv/vp8dsp_rvv.S
> @@ -275,6 +275,83 @@ func ff_vp78_idct_dc_add4uv_rvv, zve64x
>  ret
>  endfunc
>
> +.macro filter_fmin len, vlen, a, f1, p0f2, q0f1, p0, q0
> +vsetvlstatic16  \len, \vlen
> +vsext.vf2   \q0f1, \a
> +vmin.vx \p0f2, \q0f1, a6
> +vmin.vx \q0f1, \q0f1, t6
> +vadd.vi \p0f2, \p0f2, 3
> +vadd.vi \q0f1, \q0f1, 4
> +vsra.vi \p0f2, \p0f2, 3
> +vsra.vi \f1,   \q0f1, 3
> +vadd.vv \p0f2, \p0f2, \p0
> +vsub.vv \q0f1, \q0, \f1
> +vmax.vx \p0f2, \p0f2, zero
> +vmax.vx \q0f1, \q0f1, zero
> +.endm
> +
> +.macro filter len, vlen, type, normal, inner, dst, stride, fE, fI, thresh
> +.ifc \type,v
> +sub t3, \dst, \stride  // -1
> +sub t2, t3, \stride// -2
> +add t4, \dst, \stride  // 1
> +vle8.v  v3, (t2)   // p1
> +vle8.v  v4, (t3)   // p0
> +vle8.v  v5, (\dst) // q0
> +vle8.v  v6, (t4)   // q1
> +.else
> +addit2, \dst, -2
> +addit3, \dst, -1
> +vlsseg4e8.v v3, (t2), \stride
> +.endif
> +vwsubu.vv   v10, v3, v6 // p1-q1
> +vwsubu.vv   v12, v5, v4 // q0-p0
> +
> +vnclip.wi   v16, v10, 0 // clip_int8(p1 - q1)
> +vsetvlstatic16  \len, \vlen
> +// vp8_simple_limit(dst + i, stride, flim)
> +li  a6, 2
> +vneg.v  v22, v10
> +vneg.v  v24, v12
> +vmax.vv v22, v22, v10
> +vmax.vv v24, v24, v12
> +vsrl.vi v22, v22, 1
> +vmacc.vxv22, a6, v24
> +vmsleu.vx   v0, v22, \fE
> +
> +li  a7, 3
> +li  a6, 124
> +li  t6, 123
> +vmul.vx v22, v12, a7// 3 * (q0 - p0)
> +vzext.vf2   v24, v4 // p0
> +vzext.vf2   v20, v5 // q0
> +vsetvlstatic8   \len, \vlen
> +vwadd.wvv10, v22, v16
> +vnclip.wi   v28, v10, 0
> +filter_fmin \len, \vlen, v28, v12, v26, v10, v24, v20
> +vsetvlstatic8   \len, \vlen
> +vnclipu.wi  v30, v26, 0
> +vnclipu.wi  v31, v10, 0
> +.ifc \type,v
> +vse8.v  v

Re: [FFmpeg-devel] [PATCH v5] lavc/vvc_mc: R-V V avg w_avg

2024-07-10 Thread flow gg

Sorry, the previous logic was indeed a bit confusing. I rewrote it,
renaming the third parameter to max_lmul.

If e16 is involved in the function, then vsetvlstatic8  uses max_lmul == m4.
If e32 is involved in the function, then vsetvlstatic8  uses max_lmul == m2.

If e16 is involved in the function, then vsetvlstatic16 uses max_lmul == m8.
If e32 is involved in the function, then vsetvlstatic16 uses max_lmul == m4.

I think it is clearer now.

Rémi Denis-Courmont  于2024年7月8日周一 23:41写道：

> Le maanantaina 1. heinäkuuta 2024, 19.09.01 EEST flow gg a écrit :
> > I reviewed it again, the purpose of is_w is to limit lmul to a maximum of
> > 1/4 of vlen,
>
> 1/4 of vlen? Do you mean limit to EMUL=1 for EEW=32 and EMUL=1/4 for EEW=8?
>
> Limiting LMUL to less than 1 at maximum EEW is useless from a functional
> standpoint, since fractional registers cannot be addressed individually.
> (Of
> course it might still be useful for performance reasons.)
>
> > to prevent vector register shortage, which can also be
> > considered as vset limiting lmul. I renamed it to quarter_len_limit.
>
> TBH, I don't really understand.
>
> If a lower LMUL limit is reached, then specialisations for the
> corresponding
> VLEN are simply unnecessary/infeasible and the code for lower VLEN should
> be
> used.
>
> If a higher LMUL limit is reached due to register pressure (or the 8 hard
> limit), then the given VLEN cannot be supported at all, or requires some
> completely different code.
>
> Either way, I don't really follow why vsetvlfixed macros need to be
> involved.
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v5] lavc/vvc_mc: R-V V avg w_avg

2024-07-01 Thread flow gg

I reviewed it again, the purpose of is_w is to limit lmul to a maximum of
1/4 of vlen, to prevent vector register shortage, which can also be
considered as vset limiting lmul. I renamed it to quarter_len_limit.

t0 is changed to t1.

 于2024年7月2日周二 00:07写道：

> From: sunyuechi 
>
>   C908   X60
> avg_8_2x2_c:1.21.0
> avg_8_2x2_rvv_i32  :1.01.0
> avg_8_2x4_c:2.02.0
> avg_8_2x4_rvv_i32  :1.51.2
> avg_8_2x8_c:3.74.0
> avg_8_2x8_rvv_i32  :2.02.0
> avg_8_2x16_c   :7.27.7
> avg_8_2x16_rvv_i32 :3.23.0
> avg_8_2x32_c   :   14.5   15.2
> avg_8_2x32_rvv_i32 :5.75.0
> avg_8_2x64_c   :   50.0   45.2
> avg_8_2x64_rvv_i32 :   41.5   32.5
> avg_8_2x128_c  :  101.5   84.2
> avg_8_2x128_rvv_i32:   89.5   73.2
> avg_8_4x2_c:2.02.0
> avg_8_4x2_rvv_i32  :1.01.0
> avg_8_4x4_c:3.53.5
> avg_8_4x4_rvv_i32  :1.51.2
> avg_8_4x8_c:6.77.0
> avg_8_4x8_rvv_i32  :2.01.7
> avg_8_4x16_c   :   13.2   14.0
> avg_8_4x16_rvv_i32 :3.23.0
> avg_8_4x32_c   :   26.2   27.7
> avg_8_4x32_rvv_i32 :5.75.0
> avg_8_4x64_c   :   75.0   66.0
> avg_8_4x64_rvv_i32 :   40.2   33.0
> avg_8_4x128_c  :  144.5  128.0
> avg_8_4x128_rvv_i32:   89.5   78.7
> avg_8_8x2_c:3.23.5
> avg_8_8x2_rvv_i32  :1.21.0
> avg_8_8x4_c:6.56.7
> avg_8_8x4_rvv_i32  :1.51.5
> avg_8_8x8_c:   12.7   13.2
> avg_8_8x8_rvv_i32  :2.21.7
> avg_8_8x16_c   :   25.2   26.5
> avg_8_8x16_rvv_i32 :3.72.7
> avg_8_8x32_c   :   50.2   52.7
> avg_8_8x32_rvv_i32 :6.55.0
> avg_8_8x64_c   :  120.2  117.7
> avg_8_8x64_rvv_i32 :   45.2   39.2
> avg_8_8x128_c  :  223.0  233.5
> avg_8_8x128_rvv_i32:   80.0   73.2
> avg_8_16x2_c   :6.26.5
> avg_8_16x2_rvv_i32 :1.51.0
> avg_8_16x4_c   :   12.5   12.7
> avg_8_16x4_rvv_i32 :2.01.2
> avg_8_16x8_c   :   24.7   26.0
> avg_8_16x8_rvv_i32 :3.22.0
> avg_8_16x16_c  :   49.0   51.2
> avg_8_16x16_rvv_i32:5.73.2
> avg_8_16x32_c  :   97.7  102.5
> avg_8_16x32_rvv_i32:   10.75.7
> avg_8_16x64_c  :  220.5  214.2
> avg_8_16x64_rvv_i32:   48.2   39.5
> avg_8_16x128_c :  436.2  428.0
> avg_8_16x128_rvv_i32   :   97.2   77.0
> avg_8_32x2_c   :   12.2   12.7
> avg_8_32x2_rvv_i32 :2.01.2
> avg_8_32x4_c   :   24.5   25.5
> avg_8_32x4_rvv_i32 :3.21.7
> avg_8_32x8_c   :   48.5   50.7
> avg_8_32x8_rvv_i32 :5.72.7
> avg_8_32x16_c  :   96.5  101.2
> avg_8_32x16_rvv_i32:   10.25.0
> avg_8_32x32_c  :  192.5  202.2
> avg_8_32x32_rvv_i32:   20.09.5
> avg_8_32x64_c  :  405.7  404.5
> avg_8_32x64_rvv_i32

Re: [FFmpeg-devel] [PATCH v5] lavc/vvc_mc: R-V V avg w_avg

2024-07-01 Thread flow gg

> I am not sure what is_w means or serves here. If you need special cases,
> this feels a bit out of place for this macro.

It is a special case added to merge the vset of avg and w_avg. How about
giving it a default value so that it doesn't affect the use of other
functions?

> I am not sure if I get it, but it seems like this could be a normal vector
> processing loop without specialisation for the vector length, at (almost)
no
> performance cost.
>
> Can we use a regular loop here instead of repeating the same code?

Okay, updated it

> t0 is a link register, so the branch predictor will treat this as a return,
> but it seems to be a tail call instead.

Will this cause any issues? The jump goes to a label, and the code at that
label ends with a ret.

> For named labels, it is preferable to use func (perhaps with
.variant_cc), to
> get all properties right.

This macro is used in func. I assume it already has the properties of func?

> Could t4 be added in 16-bit mode so we don't need to switch vtype?
> (Also same below)

No, it is 32-bit :(

 于2024年7月1日周一 21:39写道：

> From: sunyuechi 
>
>   C908   X60
> avg_8_2x2_c:1.21.0
> avg_8_2x2_rvv_i32  :1.01.0
> avg_8_2x4_c:2.02.0
> avg_8_2x4_rvv_i32  :1.51.2
> avg_8_2x8_c:3.74.0
> avg_8_2x8_rvv_i32  :2.02.0
> avg_8_2x16_c   :7.27.7
> avg_8_2x16_rvv_i32 :3.23.0
> avg_8_2x32_c   :   14.5   15.2
> avg_8_2x32_rvv_i32 :5.75.0
> avg_8_2x64_c   :   50.0   45.2
> avg_8_2x64_rvv_i32 :   41.5   32.5
> avg_8_2x128_c  :  101.5   84.2
> avg_8_2x128_rvv_i32:   89.5   73.2
> avg_8_4x2_c:2.02.0
> avg_8_4x2_rvv_i32  :1.01.0
> avg_8_4x4_c:3.53.5
> avg_8_4x4_rvv_i32  :1.51.2
> avg_8_4x8_c:6.77.0
> avg_8_4x8_rvv_i32  :2.01.7
> avg_8_4x16_c   :   13.2   14.0
> avg_8_4x16_rvv_i32 :3.23.0
> avg_8_4x32_c   :   26.2   27.7
> avg_8_4x32_rvv_i32 :5.75.0
> avg_8_4x64_c   :   75.0   66.0
> avg_8_4x64_rvv_i32 :   40.2   33.0
> avg_8_4x128_c  :  144.5  128.0
> avg_8_4x128_rvv_i32:   89.5   78.7
> avg_8_8x2_c:3.23.5
> avg_8_8x2_rvv_i32  :1.21.0
> avg_8_8x4_c:6.56.7
> avg_8_8x4_rvv_i32  :1.51.5
> avg_8_8x8_c:   12.7   13.2
> avg_8_8x8_rvv_i32  :2.21.7
> avg_8_8x16_c   :   25.2   26.5
> avg_8_8x16_rvv_i32 :3.72.7
> avg_8_8x32_c   :   50.2   52.7
> avg_8_8x32_rvv_i32 :6.55.0
> avg_8_8x64_c   :  120.2  117.7
> avg_8_8x64_rvv_i32 :   45.2   39.2
> avg_8_8x128_c  :  223.0  233.5
> avg_8_8x128_rvv_i32:   80.0   73.2
> avg_8_16x2_c   :6.26.5
> avg_8_16x2_rvv_i32 :1.51.0
> avg_8_16x4_c   :   12.5   12.7
> avg_8_16x4_rvv_i32 :2.01.2
> avg_8_16x8_c   :   24.7   26.0
> avg_8_16x8_rvv_i32 :3.22.0
> avg_8_16x16_c  :   49.0   51.2
> avg_8_16x16_rvv_i32:5.73.2
> avg_8_16x32_c  :   97.7  102.5
> avg_8_16x32_rvv_i32:   10.75.7
> avg_8_16x64_c  :  220.5  214.2
> avg_8_16x64_rvv_i32:   48.2   39.5
> avg_8_16x128_c :  436.2  428.0
> avg_8_16x128_rvv_i32

Re: [FFmpeg-devel] [PATCH 2/2] lavc/h264dsp: R-V V 8-bit luma loop filter

2024-07-01 Thread flow gg

The horizontal loop filter in VP8 also has this issue.

Rémi Denis-Courmont  于2024年6月30日周日 17:04写道：

> T-Head C908 (cycles):
> h264_h_loop_filter_luma_8bpp_c:   297.5
> h264_h_loop_filter_luma_8bpp_rvv_i32: 374.7
> h264_v_loop_filter_luma_8bpp_c:   862.7
> h264_v_loop_filter_luma_8bpp_rvv_i32: 200.7
>
> Performance in the horizontal scenario seems worse than scalar. x86
> SSE2 and AVX optimisations are similarly affected. This is presumably
> caused by unlucky inputs from checkasm, such that the C code
> short-circuits almost all filter calculations.
> ---
>  libavcodec/riscv/Makefile   |   1 +
>  libavcodec/riscv/h264dsp_init.c |  13 ++-
>  libavcodec/riscv/h264dsp_rvv.S  | 136 
>  3 files changed, 149 insertions(+), 1 deletion(-)
>  create mode 100644 libavcodec/riscv/h264dsp_rvv.S
>
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index c180223141..a1510e8c6e 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -31,6 +31,7 @@ RVV-OBJS-$(CONFIG_H263DSP) += riscv/h263dsp_rvv.o
>  OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
>  RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
>  OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o
> +RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_rvv.o
>  OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
>  RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
>  OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o
> diff --git a/libavcodec/riscv/h264dsp_init.c
> b/libavcodec/riscv/h264dsp_init.c
> index dbbf3db400..0d4d541992 100644
> --- a/libavcodec/riscv/h264dsp_init.c
> +++ b/libavcodec/riscv/h264dsp_init.c
> @@ -24,8 +24,14 @@
>
>  #include "libavutil/attributes.h"
>  #include "libavutil/cpu.h"
> +#include "libavutil/riscv/cpu.h"
>  #include "libavcodec/h264dsp.h"
>
> +void ff_h264_v_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
> +  int alpha, int beta, int8_t *tc0);
> +void ff_h264_h_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
> +  int alpha, int beta, int8_t *tc0);
> +
>  extern int ff_startcode_find_candidate_rvb(const uint8_t *, int);
>  extern int ff_startcode_find_candidate_rvv(const uint8_t *, int);
>
> @@ -38,8 +44,13 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp,
> const int bit_depth,
>  if (flags & AV_CPU_FLAG_RVB_BASIC)
>  dsp->startcode_find_candidate = ff_startcode_find_candidate_rvb;
>  # if HAVE_RVV
> -if (flags & AV_CPU_FLAG_RVV_I32)
> +if (flags & AV_CPU_FLAG_RVV_I32) {
> +if (bit_depth == 8 && ff_rv_vlen_least(128)) {
> +dsp->h264_v_loop_filter_luma =
> ff_h264_v_loop_filter_luma_8_rvv;
> +dsp->h264_h_loop_filter_luma =
> ff_h264_h_loop_filter_luma_8_rvv;
> +}
>  dsp->startcode_find_candidate = ff_startcode_find_candidate_rvv;
> +}
>  # endif
>  #endif
>  }
> diff --git a/libavcodec/riscv/h264dsp_rvv.S
> b/libavcodec/riscv/h264dsp_rvv.S
> new file mode 100644
> index 00..ea9dfb1a7e
> --- /dev/null
> +++ b/libavcodec/riscv/h264dsp_rvv.S
> @@ -0,0 +1,136 @@
> +/*
> + * Copyright © 2024 Rémi Denis-Courmont.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions are
> met:
> + *
> + * 1. Redistributions of source code must retain the above copyright
> notice,
> + *this list of conditions and the following disclaimer.
> + *
> + * 2. Redistributions in binary form must reproduce the above copyright
> notice,
> + *this list of conditions and the following disclaimer in the
> documentation
> + *and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> "AS IS"
> + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
> THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
> PURPOSE
> + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
> BE
> + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
> + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
> + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
> BUSINESS
> + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
> + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
> + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
> THE
> + * POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +.variant_cc ff_h264_loop_filter_luma_8_rvv
> +func ff_h264_loop_filter_luma_8_rvv, zve32x
> +# p2: v8, p1: v9, p0: v10, q0: v11, q1: v12, q2: v13
> +# alpha: a2, beta: a3
> +csrwivxrm, 0
> +vid.vv0
> +vaaddu.vvv14, v10, v11 # (p0 + q0 + 1) / 2
> +vsrl.vi

Re: [FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv

2024-06-30 Thread flow gg

Initially, I tried using `vnclip.wi`, following the H.264 code:
-vwadd.wx    v16, v16, t4
-vnsra.wi    v16, v16, 4
+vnclip.wi   v16, v16, 4

but I could not get it to produce correct results. I suspect there is an
overflow issue that I do not fully understand. How do you think it should be
replaced?

Rémi Denis-Courmont  于2024年6月25日周二 04:07写道：

> Le lauantaina 15. kesäkuuta 2024, 14.50.32 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
> >
> >  C908   X60
> > vp9_avg_bilin_4hv_8bpp_c   :   10.79.5
> > vp9_avg_bilin_4hv_8bpp_rvv_i32 :4.03.5
> > vp9_avg_bilin_8hv_8bpp_c   :   38.5   34.2
> > vp9_avg_bilin_8hv_8bpp_rvv_i32 :7.26.5
> > vp9_avg_bilin_16hv_8bpp_c  :  147.2  130.5
> > vp9_avg_bilin_16hv_8bpp_rvv_i32:   14.5   12.7
> > vp9_avg_bilin_32hv_8bpp_c  :  574.2  509.7
> > vp9_avg_bilin_32hv_8bpp_rvv_i32:   42.5   38.0
> > vp9_avg_bilin_64hv_8bpp_c  : 2321.2 2017.7
> > vp9_avg_bilin_64hv_8bpp_rvv_i32:  163.5  131.0
> > vp9_put_bilin_4hv_8bpp_c   :   10.08.7
> > vp9_put_bilin_4hv_8bpp_rvv_i32 :3.53.0
> > vp9_put_bilin_8hv_8bpp_c   :   35.2   31.2
> > vp9_put_bilin_8hv_8bpp_rvv_i32 :6.55.7
> > vp9_put_bilin_16hv_8bpp_c  :  134.0  119.0
> > vp9_put_bilin_16hv_8bpp_rvv_i32:   12.7   11.5
> > vp9_put_bilin_32hv_8bpp_c  :  538.5  464.2
> > vp9_put_bilin_32hv_8bpp_rvv_i32:   39.7   35.2
> > vp9_put_bilin_64hv_8bpp_c  : 2111.7 1833.2
> > vp9_put_bilin_64hv_8bpp_rvv_i32:  138.5  122.5
> > ---
> >  libavcodec/riscv/vp9_mc_rvv.S  | 38 +-
> >  libavcodec/riscv/vp9dsp_init.c | 10 +
> >  2 files changed, 47 insertions(+), 1 deletion(-)
> >
> > diff --git a/libavcodec/riscv/vp9_mc_rvv.S
> b/libavcodec/riscv/vp9_mc_rvv.S
> > index fb7377048a..5241562531 100644
> > --- a/libavcodec/riscv/vp9_mc_rvv.S
> > +++ b/libavcodec/riscv/vp9_mc_rvv.S
> > @@ -147,6 +147,40 @@ func ff_\op\()_vp9_bilin_64\type\()_rvv, zve32x
> >  endfunc
> >  .endm
> >
> > +.macro bilin_hv op
> > +func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
> > +vsetvlstatic8   64, t0, 64
> > +.Lbilin_hv\op:
> > +.ifc \op,avg
> > +csrwi   vxrm, 0
> > +.endif
> > +neg t1, a5
> > +neg t2, a6
> > +li  t4, 8
> > +bilin_load_hv24, put, a5
> > +add a2, a2, a3
> > +1:
> > +addia4, a4, -1
> > +bilin_load_hv4, put, a5
> > +vwmulu.vx   v16, v4, a6
> > +vwmaccsu.vx v16, t2, v24
> > +vwadd.wxv16, v16, t4
> > +vnsra.wiv16, v16, 4
>
> Why round manually?
> It looks like vnclip.wi would be more straightforward here.
>
> > +vadd.vv v0, v16, v24
> > +.ifc \op,avg
> > +vle8.v  v16, (a0)
> > +vaaddu.vv   v0, v0, v16
> > +.endif
> > +vse8.v  v0, (a0)
> > +vmv.v.v v24, v4
> > +add a2, a2, a3
> > +add a0, a0, a1
> > +bneza4, 1b
> > +
> > +ret
> > +endfunc
> > +.endm
> > +
> >  .irp len, 64, 32, 16, 8, 4
> >  copy_avg \len
> >  .endr
> > @@ -155,6 +189,8 @@ bilin_h_v  put, h, a5
> >  bilin_h_v  avg, h, a5
> >  bilin_h_v  put, v, a6
> >  bilin_h_v  avg, v, a6
> > +bilin_hv   put
> > +bilin_hv   avg
> >
> >  .macro func_bilin_h_v len, op, type
> >  func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
> > @@ -165,7 +201,7 @@ endfunc
> >
> >  .irp len, 32, 16, 8, 4
> >  .irp op, put, avg
> > -.irp type, h, v
> > +.irp type, h, v, hv
> >  func_bilin_h_v \len, \op, \type
> >  .endr
> >  .endr
> > diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> > index 9606d8545f..b3700dfb08 100644
> > --- a/libavcodec/riscv/vp9dsp_init.c
> > +++ b/libavcodec/riscv/vp9dsp_init.c
> > @@ -83,6 +83,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> > *dsp, int bpp) dsp->mc[4][FILTER_BILINEAR ][0][1][0] =
> > ff_put_vp9_bilin_4h_rvv; dsp->mc[4][FILTER_BILINEAR ][1][0][1] =
> > ff_avg_vp9_bilin_4v_rvv; dsp->mc[4][FILTER_BILINEAR ][1][1][0] =
> > ff_avg_vp9_bilin_4h_rvv; +dsp->mc[0][FILTER_BILINEAR ][0][1][1] =
> > ff_put_vp9_bilin_64hv_rvv; +dsp->mc[0][FILTER_BILINEAR ][1][1][1] =
> > ff_avg_vp9_bilin_64hv_rvv; +dsp->mc[1][FILTER_BILINEAR ][0][1][1] =
> > ff_put_vp9_bilin_32hv_rvv; +dsp->mc[1][FILTER_BILINEAR ][1][1][1] =
> > ff_av

Re: [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v

2024-06-15 Thread flow gg

> You can directly LLA filters + 16 * 8 * 2 and save one add. Same below.
You can
> also use .equ to alias the filter addresses, and avoid if's.

> That's a lot of address dependencies, which is going to hurt performance.
It
> might help to just spill more S registers if needed.

> This can be done in 3 instructions, even without mul. Of course you'll
again
> need a spare register.

Okay, updated them

> Use a macro parameter for the stride register.

Doing this will reduce one if-else statement in this patch, but in the next
patch, it will lead to adding multiple if-else statements. I think we can
leave it unchanged.

 于2024年6月15日周六 19:51写道：

> From: sunyuechi 
>
>  C908   X60
> vp9_avg_8tap_smooth_4h_8bpp_c  :   12.7   11.2
> vp9_avg_8tap_smooth_4h_8bpp_rvv_i32:4.74.2
> vp9_avg_8tap_smooth_4v_8bpp_c  :   29.7   12.5
> vp9_avg_8tap_smooth_4v_8bpp_rvv_i32:4.74.2
> vp9_avg_8tap_smooth_8h_8bpp_c  :   48.7   42.2
> vp9_avg_8tap_smooth_8h_8bpp_rvv_i32:9.58.5
> vp9_avg_8tap_smooth_8v_8bpp_c  :   49.7   45.5
> vp9_avg_8tap_smooth_8v_8bpp_rvv_i32:9.58.5
> vp9_avg_8tap_smooth_16h_8bpp_c :  192.0  166.5
> vp9_avg_8tap_smooth_16h_8bpp_rvv_i32   :   21.7   19.5
> vp9_avg_8tap_smooth_16v_8bpp_c :  191.2  175.2
> vp9_avg_8tap_smooth_16v_8bpp_rvv_i32   :   21.2   19.0
> vp9_avg_8tap_smooth_32h_8bpp_c :  780.2  663.2
> vp9_avg_8tap_smooth_32h_8bpp_rvv_i32   :   68.2   60.5
> vp9_avg_8tap_smooth_32v_8bpp_c :  770.0  685.7
> vp9_avg_8tap_smooth_32v_8bpp_rvv_i32   :   67.0   59.5
> vp9_avg_8tap_smooth_64h_8bpp_c : 3116.2 2648.2
> vp9_avg_8tap_smooth_64h_8bpp_rvv_i32   :  270.7  120.7
> vp9_avg_8tap_smooth_64v_8bpp_c : 3058.5 2731.7
> vp9_avg_8tap_smooth_64v_8bpp_rvv_i32   :  266.5  119.0
> vp9_put_8tap_smooth_4h_8bpp_c  :   11.09.7
> vp9_put_8tap_smooth_4h_8bpp_rvv_i32:4.23.7
> vp9_put_8tap_smooth_4v_8bpp_c  :   11.7   10.5
> vp9_put_8tap_smooth_4v_8bpp_rvv_i32:4.03.7
> vp9_put_8tap_smooth_8h_8bpp_c  :   42.0   37.5
> vp9_put_8tap_smooth_8h_8bpp_rvv_i32:8.57.7
> vp9_put_8tap_smooth_8v_8bpp_c  :   43.5   38.5
> vp9_put_8tap_smooth_8v_8bpp_rvv_i32:8.77.7
> vp9_put_8tap_smooth_16h_8bpp_c :  181.7  147.2
> vp9_put_8tap_smooth_16h_8bpp_rvv_i32   :   20.0   18.0
> vp9_put_8tap_smooth_16v_8bpp_c :  168.5  149.7
> vp9_put_8tap_smooth_16v_8bpp_rvv_i32   :   19.7   17.5
> vp9_put_8tap_smooth_32h_8bpp_c :  675.0  586.5
> vp9_put_8tap_smooth_32h_8bpp_rvv_i32   :   65.2   58.0
> vp9_put_8tap_smooth_32v_8bpp_c :  664.7  591.2
> vp9_put_8tap_smooth_32v_8bpp_rvv_i32   :   64.0   57.0
> vp9_put_8tap_smooth_64h_8bpp_c : 2696.2 2339.0
> vp9_put_8tap_smooth_64h_8bpp_rvv_i32   :  259.7  115.7
> vp9_put_8tap_smooth_64v_8bpp_c : 2691.0 2348.5
> vp9_put_8tap_smooth_64v_8bpp_rvv_i32   :  255.5  114.0
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 200 +
>  libavcodec/riscv/vp9dsp.h  |  72 
>  libavcodec/riscv/vp9dsp_init.c |  38 ++-
>  3 files changed, 285 insertions(+), 25 deletions(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 5241562531..5e81301aa5 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -36,6 +36,18 @@
>  .endif
>  .endm
>
> +.macro vsetvlstatic16 len
> +.ifc \len,4
> +vsetvli zero, zero, e16, mf2, ta, ma
> +.elseif \len == 8
> +vsetvli zero, zero, e16, m1, ta, ma
> +.elseif \len == 16
> +vsetvli zero, zero, e16, m2, ta, ma
> +.else
> +vsetvli zero, zero, e16, m4, ta, ma
> +.endif
> +.endm
> +
>  .macro copy_avg len
>  func ff_vp9_avg\len\()_rvv, zve32x
>  csrwi   vxrm, 0
> @@ -181,8 +193,196 @@ func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
>  endfunc
>  .endm
>
> +.equ ff_vp9_subpel_filters_smooth, ff_vp9_subpel_filters
> +.equ ff_vp9_subpel_filters_regular, ff_vp9_subpel_filters + 16*8*2
> +.equ ff_vp9_subpel_filters_sharp, ff_vp9_subpel_filters + 16*8*2*2
> +
> +.macro epel_filter name, type, regtype
> +lla \regtype\()2, ff_vp9_subpel_filters_\name
> +
> +.ifc \type,v
> +slli\regtype\()0, a6, 4
> +.else
> +slli\regtype\()0, a5, 4
> +.endif
> +add \regtype\()0, \regtype\()0, \r

Re: [FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv

2024-06-15 Thread flow gg

> Copying vectors is rarely justified - mostly only before destructive
> instructions such as FMA.

It is slightly different from VP8. In VP8, many scalar values are positive,
so the related calculations can be easily replaced. However, in this
context of VP9, since t2 is a negative number, vwmaccsu is required.
Therefore, unlike the logic in VP8, we cannot use vwmulu.vx before
bilin_load to avoid vmv.


 于2024年6月15日周六 19:51写道：

> From: sunyuechi 
>
>  C908   X60
> vp9_avg_bilin_4hv_8bpp_c   :   10.79.5
> vp9_avg_bilin_4hv_8bpp_rvv_i32 :4.03.5
> vp9_avg_bilin_8hv_8bpp_c   :   38.5   34.2
> vp9_avg_bilin_8hv_8bpp_rvv_i32 :7.26.5
> vp9_avg_bilin_16hv_8bpp_c  :  147.2  130.5
> vp9_avg_bilin_16hv_8bpp_rvv_i32:   14.5   12.7
> vp9_avg_bilin_32hv_8bpp_c  :  574.2  509.7
> vp9_avg_bilin_32hv_8bpp_rvv_i32:   42.5   38.0
> vp9_avg_bilin_64hv_8bpp_c  : 2321.2 2017.7
> vp9_avg_bilin_64hv_8bpp_rvv_i32:  163.5  131.0
> vp9_put_bilin_4hv_8bpp_c   :   10.08.7
> vp9_put_bilin_4hv_8bpp_rvv_i32 :3.53.0
> vp9_put_bilin_8hv_8bpp_c   :   35.2   31.2
> vp9_put_bilin_8hv_8bpp_rvv_i32 :6.55.7
> vp9_put_bilin_16hv_8bpp_c  :  134.0  119.0
> vp9_put_bilin_16hv_8bpp_rvv_i32:   12.7   11.5
> vp9_put_bilin_32hv_8bpp_c  :  538.5  464.2
> vp9_put_bilin_32hv_8bpp_rvv_i32:   39.7   35.2
> vp9_put_bilin_64hv_8bpp_c  : 2111.7 1833.2
> vp9_put_bilin_64hv_8bpp_rvv_i32:  138.5  122.5
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 38 +-
>  libavcodec/riscv/vp9dsp_init.c | 10 +
>  2 files changed, 47 insertions(+), 1 deletion(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index fb7377048a..5241562531 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -147,6 +147,40 @@ func ff_\op\()_vp9_bilin_64\type\()_rvv, zve32x
>  endfunc
>  .endm
>
> +.macro bilin_hv op
> +func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
> +vsetvlstatic8   64, t0, 64
> +.Lbilin_hv\op:
> +.ifc \op,avg
> +csrwi   vxrm, 0
> +.endif
> +neg t1, a5
> +neg t2, a6
> +li  t4, 8
> +bilin_load_hv24, put, a5
> +add a2, a2, a3
> +1:
> +addia4, a4, -1
> +bilin_load_hv4, put, a5
> +vwmulu.vx   v16, v4, a6
> +vwmaccsu.vx v16, t2, v24
> +vwadd.wxv16, v16, t4
> +vnsra.wiv16, v16, 4
> +vadd.vv v0, v16, v24
> +.ifc \op,avg
> +vle8.v  v16, (a0)
> +vaaddu.vv   v0, v0, v16
> +.endif
> +vse8.v  v0, (a0)
> +vmv.v.v v24, v4
> +add a2, a2, a3
> +add a0, a0, a1
> +bneza4, 1b
> +
> +ret
> +endfunc
> +.endm
> +
>  .irp len, 64, 32, 16, 8, 4
>  copy_avg \len
>  .endr
> @@ -155,6 +189,8 @@ bilin_h_v  put, h, a5
>  bilin_h_v  avg, h, a5
>  bilin_h_v  put, v, a6
>  bilin_h_v  avg, v, a6
> +bilin_hv   put
> +bilin_hv   avg
>
>  .macro func_bilin_h_v len, op, type
>  func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
> @@ -165,7 +201,7 @@ endfunc
>
>  .irp len, 32, 16, 8, 4
>  .irp op, put, avg
> -.irp type, h, v
> +.irp type, h, v, hv
>  func_bilin_h_v \len, \op, \type
>  .endr
>  .endr
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index 9606d8545f..b3700dfb08 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -83,6 +83,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp)
>  dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_4h_rvv;
>  dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_4v_rvv;
>  dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv;
> +dsp->mc[0][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_64hv_rvv;
> +dsp->mc[0][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_64hv_rvv;
> +dsp->mc[1][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_32hv_rvv;
> +dsp->mc[1][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_32hv_rvv;
> +dsp->mc[2][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_16hv_rvv;
> +dsp->mc[2][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_16hv_rvv;
> +dsp->mc[3][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_8hv_rvv;
> +dsp->mc[3][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bil

Re: [FFmpeg-devel] [PATCH v4 1/4] lavc/vp9dsp: R-V V mc bilin h v

2024-06-15 Thread flow gg

Just like in VP8, the unroll has been updated.

 于2024年6月15日周六 19:51写道：

> From: sunyuechi 
>
>  C908   X60
> vp9_avg_bilin_4h_8bpp_c:5.54.7
> vp9_avg_bilin_4h_8bpp_rvv_i32  :1.71.5
> vp9_avg_bilin_4v_8bpp_c:5.54.7
> vp9_avg_bilin_4v_8bpp_rvv_i32  :1.51.2
> vp9_avg_bilin_8h_8bpp_c:   20.0   17.7
> vp9_avg_bilin_8h_8bpp_rvv_i32  :3.02.7
> vp9_avg_bilin_8v_8bpp_c:   20.7   18.7
> vp9_avg_bilin_8v_8bpp_rvv_i32  :3.02.7
> vp9_avg_bilin_16h_8bpp_c   :   78.2   69.7
> vp9_avg_bilin_16h_8bpp_rvv_i32 :7.06.2
> vp9_avg_bilin_16v_8bpp_c   :   98.5   73.2
> vp9_avg_bilin_16v_8bpp_rvv_i32 :7.06.0
> vp9_avg_bilin_32h_8bpp_c   :  325.5  275.5
> vp9_avg_bilin_32h_8bpp_rvv_i32 :   23.0   20.5
> vp9_avg_bilin_32v_8bpp_c   :  342.2  290.0
> vp9_avg_bilin_32v_8bpp_rvv_i32 :   21.7   19.5
> vp9_avg_bilin_64h_8bpp_c   : 1263.7 1095.7
> vp9_avg_bilin_64h_8bpp_rvv_i32 :   91.2   81.2
> vp9_avg_bilin_64v_8bpp_c   : 1331.7 1155.2
> vp9_avg_bilin_64v_8bpp_rvv_i32 :   91.2   81.0
> vp9_put_bilin_4h_8bpp_c:4.54.0
> vp9_put_bilin_4h_8bpp_rvv_i32  :1.01.0
> vp9_put_bilin_4v_8bpp_c:4.74.2
> vp9_put_bilin_4v_8bpp_rvv_i32  :1.01.0
> vp9_put_bilin_8h_8bpp_c:   16.7   15.0
> vp9_put_bilin_8h_8bpp_rvv_i32  :2.22.0
> vp9_put_bilin_8v_8bpp_c:   17.5   15.7
> vp9_put_bilin_8v_8bpp_rvv_i32  :2.22.0
> vp9_put_bilin_16h_8bpp_c   :   65.2   58.0
> vp9_put_bilin_16h_8bpp_rvv_i32 :6.05.5
> vp9_put_bilin_16v_8bpp_c   :   69.2   61.7
> vp9_put_bilin_16v_8bpp_rvv_i32 :5.75.2
> vp9_put_bilin_32h_8bpp_c   :  273.2  229.0
> vp9_put_bilin_32h_8bpp_rvv_i32 :   19.7   17.7
> vp9_put_bilin_32v_8bpp_c   :  290.5  243.7
> vp9_put_bilin_32v_8bpp_rvv_i32 :   18.7   16.7
> vp9_put_bilin_64h_8bpp_c   : 1040.5  910.5
> vp9_put_bilin_64h_8bpp_rvv_i32 :   82.5   73.0
> vp9_put_bilin_64v_8bpp_c   : 1108.5  971.0
> vp9_put_bilin_64v_8bpp_rvv_i32 :   82.2   73.2
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 114 +
>  libavcodec/riscv/vp9dsp.h  |  12 ++--
>  libavcodec/riscv/vp9dsp_init.c |  21 ++
>  3 files changed, 141 insertions(+), 6 deletions(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 7cb38ec94a..fb7377048a 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -53,6 +53,120 @@ func ff_vp9_avg\len\()_rvv, zve32x
>  endfunc
>  .endm
>
> +.macro bilin_load_h dst, op, mn
> +addit5, a2, 1
> +vle8.v  v8, (a2)
> +vle8.v  v0, (t5)
> +vwmulu.vx   v16, v0, \mn
> +vwmaccsu.vx v16, t1, v8
> +vwadd.wxv16, v16, t4
> +vnsra.wiv16, v16, 4
> +vadd.vv \dst, v16, v8
> +.ifc \op,avg
> +vle8.v  v16, (a0)
> +vaaddu.vv   \dst, \dst, v16
> +.endif
> +.endm
> +
> +.macro bilin_h_v op, type, mn
> +func ff_\op\()_vp9_bilin_64\type\()_rvv, zve32x
> +vsetvlstatic8   64, t0, 64
> +.ifc \op,avg
> +csrwi   vxrm, 0
> +.endif
> +li  t4, 8
> +neg t1, \mn
> +1:
> +addia4, a4, -1
> +.ifc \type,v
> +add t5, a2, a3
> +.else
> +addit5, a2, 1
> +.endif
> +vle8.v  v8, (a2)
> +vle8.v  v0, (t5)
> +vwmulu.vx   v16, v0, \mn
> +vwmaccsu.vx v16, t1, v8
> +vwadd.wxv16, v16, t4
> +vnsra.wiv16, v16, 4
> +vadd.vv v0, v16, v8
> +.ifc \op,avg
> +vle8.v  v16, (a0)
> +vaaddu.vv   v0, v0, v16
> +.endif
> +vse8.v  v0, (a0)
> +add a2, a2, a3
> +add a0, a0, a1
> +bneza4, 1b
> +ret
> +
> +.Lbilin_\type\op:
> +.ifc \op,avg
> +csrwi   vxrm, 0
> +.endif
> +li  t4, 8
> +neg t1, \mn
> +1:
> +addi

Re: [FFmpeg-devel] [PATCH] lavc/vp8dsp: R-V V put_bilin_h v unroll

2024-06-12 Thread flow gg

> Does this not render the type parameter of bilin_load useless (always h)?
> (Not a blocker for this patch.)

Yes, this was needed in the initial version, but it is no longer required.
I just sent a patch.

> Not sure if I already asked this but is this really faster than slide1?
> Normally we want to minimise the work of the memory bus.

Originally it was slide, but based on your review, it was changed to load,
which should be better.

review: "Can't we skip the slide and just load the vector at a2+1? Also
then, we can keep VL=len and halve the multipler."

Rémi Denis-Courmont  于2024年6月12日周三 22:41写道：

> Le torstaina 30. toukokuuta 2024, 18.26.53 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
> >
> > Since len < 64, the registers are sufficient, so it can be
> > directly unrolled (a4 is even).
> >
> > Another benefit of unrolling is that the vertical case needs one load
> > fewer than the horizontal case.
> >
> >  old new
> >  C908   X60  C908   X60
> > vp8_put_bilin4_h_c :6.25.5 :6.25.5
> > vp8_put_bilin4_h_rvv_i32   :2.22.0 :1.51.5
> > vp8_put_bilin4_v_c :6.55.7 :6.25.7
> > vp8_put_bilin4_v_rvv_i32   :2.22.0 :1.21.5
> > vp8_put_bilin8_h_c :   24.2   21.5 :   24.2   21.5
> > vp8_put_bilin8_h_rvv_i32   :5.24.7 :3.53.5
> > vp8_put_bilin8_v_c :   24.5   21.7 :   24.5   21.7
> > vp8_put_bilin8_v_rvv_i32   :5.24.7 :3.53.2
> > vp8_put_bilin16_h_c:   48.0   42.7 :   48.0   42.7
> > vp8_put_bilin16_h_rvv_i32  :5.75.0 :5.24.5
> > vp8_put_bilin16_v_c:   48.2   43.0 :   48.2   42.7
> > vp8_put_bilin16_v_rvv_i32  :5.75.2 :4.54.2
> > ---
> >  libavcodec/riscv/vp8dsp_rvv.S | 34 +-
> >  1 file changed, 29 insertions(+), 5 deletions(-)
> >
> > diff --git a/libavcodec/riscv/vp8dsp_rvv.S
> b/libavcodec/riscv/vp8dsp_rvv.S
> > index 3360a38cac..5bea6cba9c 100644
> > --- a/libavcodec/riscv/vp8dsp_rvv.S
> > +++ b/libavcodec/riscv/vp8dsp_rvv.S
> > @@ -172,11 +172,35 @@ func ff_put_vp8_bilin4_\type\()_rvv, zve32x
> >  li  t4, 4
> >  sub t1, t1, \mn
> >  1:
> > -addia4, a4, -1
> > -bilin_load  v0, \type, \mn
>
> Does this not render the type parameter of bilin_load useless (always h)?
> (Not a blocker for this patch.)
>
> > -vse8.v  v0, (a0)
> > -add a2, a2, a3
> > -add a0, a0, a1
> > +add t0, a2, a3
> > +add t2, a0, a1
> > +addia4, a4, -2
> > +.ifc \type,v
> > +add t3, t0, a3
> > +.else
> > +addit5, a2, 1
> > +addit3, t0, 1
> > +vle8.v  v2, (t5)
>
> Not sure if I already asked this but is this really faster than slide1?
> Normally we want to minimise the work of the memory bus.
>
> > +.endif
> > +vle8.v  v0, (a2)
> > +vle8.v  v4, (t0)
> > +vle8.v  v6, (t3)
> > +vwmulu.vx   v28, v0, t1
> > +vwmulu.vx   v26, v4, t1
> > +.ifc \type,v
> > +vwmaccu.vx  v28, \mn, v4
> > +.else
> > +vwmaccu.vx  v28, \mn, v2
> > +.endif
> > +vwmaccu.vx  v26, \mn, v6
> > +vwaddu.wx   v24, v28, t4
> > +vwaddu.wx   v22, v26, t4
> > +vnsra.wiv30, v24, 3
> > +vnsra.wiv0, v22, 3
> > +vse8.v  v30, (a0)
> > +vse8.v  v0, (t2)
> > +add a2, t0, a3
> > +add a0, t2, a1
> >  bneza4, 1b
> >
> >  ret
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] lavc/vp8dsp: R-V V put_bilin_h v unroll

2024-06-12 Thread flow gg

ping

 于2024年5月30日周四 23:27写道：

> From: sunyuechi 
>
> Since len < 64, the registers are sufficient, so it can be
> directly unrolled (a4 is even).
>
> Another benefit of unrolling is that it reduces one load operation
> vertically compared to horizontally.
>
>                                    old              new
>                                C908    X60     C908    X60
> vp8_put_bilin4_h_c         :    6.2    5.5 :    6.2    5.5
> vp8_put_bilin4_h_rvv_i32   :    2.2    2.0 :    1.5    1.5
> vp8_put_bilin4_v_c         :    6.5    5.7 :    6.2    5.7
> vp8_put_bilin4_v_rvv_i32   :    2.2    2.0 :    1.2    1.5
> vp8_put_bilin8_h_c         :   24.2   21.5 :   24.2   21.5
> vp8_put_bilin8_h_rvv_i32   :    5.2    4.7 :    3.5    3.5
> vp8_put_bilin8_v_c         :   24.5   21.7 :   24.5   21.7
> vp8_put_bilin8_v_rvv_i32   :    5.2    4.7 :    3.5    3.2
> vp8_put_bilin16_h_c        :   48.0   42.7 :   48.0   42.7
> vp8_put_bilin16_h_rvv_i32  :    5.7    5.0 :    5.2    4.5
> vp8_put_bilin16_v_c        :   48.2   43.0 :   48.2   42.7
> vp8_put_bilin16_v_rvv_i32  :    5.7    5.2 :    4.5    4.2
> ---
>  libavcodec/riscv/vp8dsp_rvv.S | 34 +-
>  1 file changed, 29 insertions(+), 5 deletions(-)
>
> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> index 3360a38cac..5bea6cba9c 100644
> --- a/libavcodec/riscv/vp8dsp_rvv.S
> +++ b/libavcodec/riscv/vp8dsp_rvv.S
> @@ -172,11 +172,35 @@ func ff_put_vp8_bilin4_\type\()_rvv, zve32x
>  li  t4, 4
>  sub t1, t1, \mn
>  1:
> -addia4, a4, -1
> -bilin_load  v0, \type, \mn
> -vse8.v  v0, (a0)
> -add a2, a2, a3
> -add a0, a0, a1
> +add t0, a2, a3
> +add t2, a0, a1
> +addia4, a4, -2
> +.ifc \type,v
> +add t3, t0, a3
> +.else
> +addit5, a2, 1
> +addit3, t0, 1
> +vle8.v  v2, (t5)
> +.endif
> +vle8.v  v0, (a2)
> +vle8.v  v4, (t0)
> +vle8.v  v6, (t3)
> +vwmulu.vx   v28, v0, t1
> +vwmulu.vx   v26, v4, t1
> +.ifc \type,v
> +vwmaccu.vx  v28, \mn, v4
> +.else
> +vwmaccu.vx  v28, \mn, v2
> +.endif
> +vwmaccu.vx  v26, \mn, v6
> +vwaddu.wx   v24, v28, t4
> +vwaddu.wx   v22, v26, t4
> +vnsra.wiv30, v24, 3
> +vnsra.wiv0, v22, 3
> +vse8.v  v30, (a0)
> +vse8.v  v0, (t2)
> +add a2, t0, a3
> +add a0, t2, a1
>  bneza4, 1b
>
>  ret
> --
> 2.45.1
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v4] lavc/vvc_mc: R-V V avg w_avg

2024-06-11 Thread flow gg

> Nit: for overall code base consistency, I'd use csrwi here. Reason being
that
> for other rounding modes, csrwi is the better option.
>
> Probably faster to swap the two above, to avoid stalling on LD.
>
> If you check more than one length, better to get ff_get_rv_vlenb() into a
local
> variable.
>
> In C, it would be invalid pointer arithmetic, but in assembler, you can
add
> whatever constant offset you want to this symbol, even if points outside
the
> table. So you should be able to eliminate the LI above. It won't make much
> difference though.

Okay, updated them in the reply

> Could SEW be a parameter so that these three macros would be a little bit
more
> factored? .ifc / .ifnc might help to match e8/e16/e32.

I feel this makes the vset overly complex, and adding more if-else
statements doesn't significantly reduce the amount of code..

> I guess t4 is 32-bit? Kinda sad to switch VTYPE just for this but if so, I
> don't have any better idea :(

Yes, t4 is 32-bit. I've considered this and haven't found a better
solution. :(

> Is that .rept meaningfully faster than a run-time loop?

I haven't done a direct comparison.. it's just to reduce a few comparisons.

 于2024年6月12日周三 02:38写道：

> From: sunyuechi 
>
>                                C908    X60
> avg_8_2x2_c                :    1.2    1.0
> avg_8_2x2_rvv_i32          :    1.0    1.0
> avg_8_2x4_c                :    2.0    2.0
> avg_8_2x4_rvv_i32          :    1.5    1.2
> avg_8_2x8_c                :    3.7    4.0
> avg_8_2x8_rvv_i32          :    2.0    2.0
> avg_8_2x16_c               :    7.2    7.7
> avg_8_2x16_rvv_i32         :    3.2    3.0
> avg_8_2x32_c               :   14.5   15.2
> avg_8_2x32_rvv_i32         :    5.7    5.0
> avg_8_2x64_c               :   50.0   45.2
> avg_8_2x64_rvv_i32         :   41.5   32.5
> avg_8_2x128_c              :  101.5   84.2
> avg_8_2x128_rvv_i32        :   89.5   73.2
> avg_8_4x2_c                :    2.0    2.0
> avg_8_4x2_rvv_i32          :    1.0    1.0
> avg_8_4x4_c                :    3.5    3.5
> avg_8_4x4_rvv_i32          :    1.5    1.2
> avg_8_4x8_c                :    6.7    7.0
> avg_8_4x8_rvv_i32          :    2.0    1.7
> avg_8_4x16_c               :   13.2   14.0
> avg_8_4x16_rvv_i32         :    3.2    3.0
> avg_8_4x32_c               :   26.2   27.7
> avg_8_4x32_rvv_i32         :    5.7    5.0
> avg_8_4x64_c               :   75.0   66.0
> avg_8_4x64_rvv_i32         :   40.2   33.0
> avg_8_4x128_c              :  144.5  128.0
> avg_8_4x128_rvv_i32        :   89.5   78.7
> avg_8_8x2_c                :    3.2    3.5
> avg_8_8x2_rvv_i32          :    1.2    1.0
> avg_8_8x4_c                :    6.5    6.7
> avg_8_8x4_rvv_i32          :    1.5    1.5
> avg_8_8x8_c                :   12.7   13.2
> avg_8_8x8_rvv_i32          :    2.2    1.7
> avg_8_8x16_c               :   25.2   26.5
> avg_8_8x16_rvv_i32         :    3.7    2.7
> avg_8_8x32_c               :   50.2   52.7
> avg_8_8x32_rvv_i32         :    6.5    5.0
> avg_8_8x64_c               :  120.2  117.7
> avg_8_8x64_rvv_i32         :   45.2   39.2
> avg_8_8x128_c              :  223.0  233.5
> avg_8_8x128_rvv_i32        :   80.0   73.2
> avg_8_16x2_c               :    6.2    6.5
> avg_8_16x2_rvv_i32         :    1.5    1.0
> avg_8_16x4_c               :   12.5   12.7
> avg_8_16x4_rvv_i32         :    2.0    1.2
> avg_8_16x8_c               :   24.7   26.0
> avg_8_16x8_rvv_i32         :    3.2    2.0
> avg_8_16x16_c              :   49.0   51.2
> avg_8_16x16_rvv_i32        :    5.7    3.2
> avg_8_16x32_c              :   97.7  102.5
> avg_8_16x32_rvv_i32        :   10.7    5.7
> avg_8_16x64_c              :  220.5  214.2
> avg_8_16x64_rvv_i32

Re: [FFmpeg-devel] [PATCH v3] lavc/vvc_mc: R-V V avg w_avg

2024-06-11 Thread flow gg

> I think we can drop the 2x2 transforms. In all likelihood, scalar code
will
> end up faster than vector code on future hardware, especially out-of-order
> pipelines.

I want to drop 2x2, but since there's only one function to handle all
situations instead of 7*7 functions, it is not obvious how to drop only 2x2.

> AFAIU, this will generate relocations. I wonder if the linker is smart
enough to
> put that into .data.relro rather than whine that it can't leave it in
.rodata?
>
> In assembler, we can dodge the problem entirely by storing relative
offsets
> rather than addresses. You can also stick to 4- or even 2-byte values
then.

Okay, updated it in the reply

> LLA is an alias for AUIPC; ADD. You can avoid that ADD by folding the low
bits
> into LD. See how ff_h263_loop_filter_strength is addressed in
h263dsp_rvv.S.

With the previous change to use relative offsets in the table,
it seems that the full table start address needs to be stored in a register
once,
so it appears that this situation requires the use of lla.

 于2024年6月12日周三 00:38写道：

> From: sunyuechi 
>
>                                C908    X60
> avg_8_2x2_c                :    1.2    1.0
> avg_8_2x2_rvv_i32          :    1.0    1.0
> avg_8_2x4_c                :    2.0    2.0
> avg_8_2x4_rvv_i32          :    1.5    1.2
> avg_8_2x8_c                :    3.7    4.0
> avg_8_2x8_rvv_i32          :    2.0    2.0
> avg_8_2x16_c               :    7.2    7.7
> avg_8_2x16_rvv_i32         :    3.2    3.0
> avg_8_2x32_c               :   14.5   15.2
> avg_8_2x32_rvv_i32         :    5.7    5.0
> avg_8_2x64_c               :   50.0   45.2
> avg_8_2x64_rvv_i32         :   41.5   32.5
> avg_8_2x128_c              :  101.5   84.2
> avg_8_2x128_rvv_i32        :   89.5   73.2
> avg_8_4x2_c                :    2.0    2.0
> avg_8_4x2_rvv_i32          :    1.0    1.0
> avg_8_4x4_c                :    3.5    3.5
> avg_8_4x4_rvv_i32          :    1.5    1.2
> avg_8_4x8_c                :    6.7    7.0
> avg_8_4x8_rvv_i32          :    2.0    1.7
> avg_8_4x16_c               :   13.2   14.0
> avg_8_4x16_rvv_i32         :    3.2    3.0
> avg_8_4x32_c               :   26.2   27.7
> avg_8_4x32_rvv_i32         :    5.7    5.0
> avg_8_4x64_c               :   75.0   66.0
> avg_8_4x64_rvv_i32         :   40.2   33.0
> avg_8_4x128_c              :  144.5  128.0
> avg_8_4x128_rvv_i32        :   89.5   78.7
> avg_8_8x2_c                :    3.2    3.5
> avg_8_8x2_rvv_i32          :    1.2    1.0
> avg_8_8x4_c                :    6.5    6.7
> avg_8_8x4_rvv_i32          :    1.5    1.5
> avg_8_8x8_c                :   12.7   13.2
> avg_8_8x8_rvv_i32          :    2.2    1.7
> avg_8_8x16_c               :   25.2   26.5
> avg_8_8x16_rvv_i32         :    3.7    2.7
> avg_8_8x32_c               :   50.2   52.7
> avg_8_8x32_rvv_i32         :    6.5    5.0
> avg_8_8x64_c               :  120.2  117.7
> avg_8_8x64_rvv_i32         :   45.2   39.2
> avg_8_8x128_c              :  223.0  233.5
> avg_8_8x128_rvv_i32        :   80.0   73.2
> avg_8_16x2_c               :    6.2    6.5
> avg_8_16x2_rvv_i32         :    1.5    1.0
> avg_8_16x4_c               :   12.5   12.7
> avg_8_16x4_rvv_i32         :    2.0    1.2
> avg_8_16x8_c               :   24.7   26.0
> avg_8_16x8_rvv_i32         :    3.2    2.0
> avg_8_16x16_c              :   49.0   51.2
> avg_8_16x16_rvv_i32        :    5.7    3.2
> avg_8_16x32_c              :   97.7  102.5
> avg_8_16x32_rvv_i32        :   10.7    5.7
> avg_8_16x64_c              :  220.5  214.2
> avg_8_16x64_rvv_i32        :   48.2   39.5
> avg_8_16x128_c             :  436.2  428.0
> avg_8_16x128_rvv_i32       :   97.2   77.0
> avg_8_32x2_c

Re: [FFmpeg-devel] [PATCH v2] lavc/vvc_mc: R-V V avg w_avg

2024-06-01 Thread flow gg

> I think we can drop the 2x2 transforms. In all likelihood, scalar code
will
> end up faster than vector code on future hardware, especially out-of-order
> pipelines.

I want to drop 2x2, but since there's only one function to handle all
situations instead of 7*7 functions, how can I drop only 2x2?

Rémi Denis-Courmont  于2024年6月2日周日 03:54写道：

> Le lauantaina 1. kesäkuuta 2024, 21.01.16 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
> >
> >                                C908    X60
> > avg_8_2x2_c                :    1.0    1.0
> > avg_8_2x2_rvv_i32          :    1.0    1.0
>
> I think we can drop the 2x2 transforms. In all likelihood, scalar code
> will
> end up faster than vector code on future hardware, especially out-of-order
> pipelines.
>
> > avg_8_2x4_c                :    1.7    2.0
> > avg_8_2x4_rvv_i32          :    1.2    1.2
> > avg_8_2x8_c                :    3.7    4.0
> > avg_8_2x8_rvv_i32          :    2.0    2.0
> > avg_8_2x16_c               :    7.2    7.5
> > avg_8_2x16_rvv_i32         :    3.2    3.0
> > avg_8_2x32_c               :   14.2   15.0
> > avg_8_2x32_rvv_i32         :    5.7    5.0
> > avg_8_2x64_c               :   46.7   44.2
> > avg_8_2x64_rvv_i32         :   39.2   36.0
> > avg_8_2x128_c              :   99.7   80.0
> > avg_8_2x128_rvv_i32        :   86.2   65.5
> > avg_8_4x2_c                :    2.0    2.0
> > avg_8_4x2_rvv_i32          :    1.0    1.0
> > avg_8_4x4_c                :    3.5    3.7
> > avg_8_4x4_rvv_i32          :    1.5    1.2
> > avg_8_4x8_c                :    6.5    7.0
> > avg_8_4x8_rvv_i32          :    2.0    1.7
> > avg_8_4x16_c               :   13.5   14.0
> > avg_8_4x16_rvv_i32         :    3.2    2.7
> > avg_8_4x32_c               :   26.2   27.5
> > avg_8_4x32_rvv_i32         :    5.7    5.0
> > avg_8_4x64_c               :   75.0   65.7
> > avg_8_4x64_rvv_i32         :   44.0   32.0
> > avg_8_4x128_c              :  165.2  118.5
> > avg_8_4x128_rvv_i32        :   81.5   71.0
> > avg_8_8x2_c                :    3.2    3.5
> > avg_8_8x2_rvv_i32          :    1.2    1.0
> > avg_8_8x4_c                :    6.5    6.5
> > avg_8_8x4_rvv_i32          :    1.5    1.5
> > avg_8_8x8_c                :   12.5   13.2
> > avg_8_8x8_rvv_i32          :    2.2    1.7
> > avg_8_8x16_c               :   25.2   26.5
> > avg_8_8x16_rvv_i32         :    3.7    2.7
> > avg_8_8x32_c               :   50.0   52.5
> > avg_8_8x32_rvv_i32         :    6.7    5.2
> > avg_8_8x64_c               :  120.7  119.0
> > avg_8_8x64_rvv_i32         :   43.2   33.5
> > avg_8_8x128_c              :  247.5  217.7
> > avg_8_8x128_rvv_i32        :  100.5   74.7
> > avg_8_16x2_c               :    6.2    6.5
> > avg_8_16x2_rvv_i32         :    1.2    1.0
> > avg_8_16x4_c               :   12.2   13.0
> > avg_8_16x4_rvv_i32         :    2.0    1.2
> > avg_8_16x8_c               :   24.5   25.7
> > avg_8_16x8_rvv_i32         :    3.2    2.0
> > avg_8_16x16_c              :   48.7   51.2
> > avg_8_16x16_rvv_i32        :    5.7    3.2
> > avg_8_16x32_c              :   97.5  102.7
> > avg_8_16x32_rvv_i32        :   10.7    6.0
> > avg_8_16x64_c              :  213.0  215.0
> > avg_8_16x64_rvv_i32        :   51.5   33.5
> > avg_8_16x128_c             :  408.5  417.0
> > avg_8_16x128_rvv_i32       :  102.0   71.5
> > avg_8_32x2_c               :   12.2   13.0
> > avg_8_32x2_rvv_i32         :    2.0    1.2
> > avg_8_32x4_c               :   24.5   25.5
> > avg_8_32x4_rvv_i32         :    3.2    1.7
> > avg_8_32x8_c

Re: [FFmpeg-devel] [PATCH v2] lavc/vvc_mc: R-V V avg w_avg

2024-06-01 Thread flow gg

> In keeping in line with the rest of the project, that should probably go
into
> **libavcodec/riscv/vvc/**
> Expanding the macro 49 times, with up to 14 **branches** to get there is
maybe not
> such a great idea. It might look nice on the checkasm µbenchmarks because
the
> branches under test get predicted and cached.
>
> But in real use, branch prediction will not work so well, and the I-cache
will be filled with all variants of the same function.
>
> Indeed, this seems to result in about .5 MiB of code.
>
> Even if only one half is needed (128-bit or 256+-bit variants), that's a
lot.
>
> For comparison, x86 uses just about 10 KiB, also with two variants.
>
> What I make out from the arcane forbidden CISC arts there:
>
> - functions are specialised only in one dimension, not both,
> - dispatch tables avoid multiplying branches.

Referring to x86, the code has been updated. The current code size is 6k,
and a jmp table has been added.

 于2024年6月2日周日 02:01写道：

> From: sunyuechi 
>
>                                C908    X60
> avg_8_2x2_c                :    1.0    1.0
> avg_8_2x2_rvv_i32          :    1.0    1.0
> avg_8_2x4_c                :    1.7    2.0
> avg_8_2x4_rvv_i32          :    1.2    1.2
> avg_8_2x8_c                :    3.7    4.0
> avg_8_2x8_rvv_i32          :    2.0    2.0
> avg_8_2x16_c               :    7.2    7.5
> avg_8_2x16_rvv_i32         :    3.2    3.0
> avg_8_2x32_c               :   14.2   15.0
> avg_8_2x32_rvv_i32         :    5.7    5.0
> avg_8_2x64_c               :   46.7   44.2
> avg_8_2x64_rvv_i32         :   39.2   36.0
> avg_8_2x128_c              :   99.7   80.0
> avg_8_2x128_rvv_i32        :   86.2   65.5
> avg_8_4x2_c                :    2.0    2.0
> avg_8_4x2_rvv_i32          :    1.0    1.0
> avg_8_4x4_c                :    3.5    3.7
> avg_8_4x4_rvv_i32          :    1.5    1.2
> avg_8_4x8_c                :    6.5    7.0
> avg_8_4x8_rvv_i32          :    2.0    1.7
> avg_8_4x16_c               :   13.5   14.0
> avg_8_4x16_rvv_i32         :    3.2    2.7
> avg_8_4x32_c               :   26.2   27.5
> avg_8_4x32_rvv_i32         :    5.7    5.0
> avg_8_4x64_c               :   75.0   65.7
> avg_8_4x64_rvv_i32         :   44.0   32.0
> avg_8_4x128_c              :  165.2  118.5
> avg_8_4x128_rvv_i32        :   81.5   71.0
> avg_8_8x2_c                :    3.2    3.5
> avg_8_8x2_rvv_i32          :    1.2    1.0
> avg_8_8x4_c                :    6.5    6.5
> avg_8_8x4_rvv_i32          :    1.5    1.5
> avg_8_8x8_c                :   12.5   13.2
> avg_8_8x8_rvv_i32          :    2.2    1.7
> avg_8_8x16_c               :   25.2   26.5
> avg_8_8x16_rvv_i32         :    3.7    2.7
> avg_8_8x32_c               :   50.0   52.5
> avg_8_8x32_rvv_i32         :    6.7    5.2
> avg_8_8x64_c               :  120.7  119.0
> avg_8_8x64_rvv_i32         :   43.2   33.5
> avg_8_8x128_c              :  247.5  217.7
> avg_8_8x128_rvv_i32        :  100.5   74.7
> avg_8_16x2_c               :    6.2    6.5
> avg_8_16x2_rvv_i32         :    1.2    1.0
> avg_8_16x4_c               :   12.2   13.0
> avg_8_16x4_rvv_i32         :    2.0    1.2
> avg_8_16x8_c               :   24.5   25.7
> avg_8_16x8_rvv_i32         :    3.2    2.0
> avg_8_16x16_c              :   48.7   51.2
> avg_8_16x16_rvv_i32        :    5.7    3.2
> avg_8_16x32_c              :   97.5  102.7
> avg_8_16x32_rvv_i32        :   10.7    6.0
> avg_8_16x64_c              :  213.0  215.0
> avg_8_16x64_rvv_i32        :   51.5   33.5
> avg_8_16x128_c             :  408.5  417.0
> avg_8_16x128_rvv_i32       :  102.0   71.5
> avg_8_32x2_c               :   12.2   13.0
> av

Re: [FFmpeg-devel] [PATCH] lavc/vp8dsp: R-V V put_bilin_h v unroll

2024-05-30 Thread flow gg

Well.. because scalar registers are limited, the direct unrolling will be
like this for now. We can handle different lengths separately in the future

flow gg  于2024年5月30日周四 23:36写道：

> I directly copied the VP9 modifications over... Since len <= 16, it seems
> like it can be improved a bit more
>
>  于2024年5月30日周四 23:27写道：
>
>> From: sunyuechi 
>>
>> Since len < 64, the registers are sufficient, so it can be
>> directly unrolled (a4 is even).
>>
>> Another benefit of unrolling is that it reduces one load operation
>> vertically compared to horizontally.
>>
>>                                    old              new
>>                                C908    X60     C908    X60
>> vp8_put_bilin4_h_c         :    6.2    5.5 :    6.2    5.5
>> vp8_put_bilin4_h_rvv_i32   :    2.2    2.0 :    1.5    1.5
>> vp8_put_bilin4_v_c         :    6.5    5.7 :    6.2    5.7
>> vp8_put_bilin4_v_rvv_i32   :    2.2    2.0 :    1.2    1.5
>> vp8_put_bilin8_h_c         :   24.2   21.5 :   24.2   21.5
>> vp8_put_bilin8_h_rvv_i32   :    5.2    4.7 :    3.5    3.5
>> vp8_put_bilin8_v_c         :   24.5   21.7 :   24.5   21.7
>> vp8_put_bilin8_v_rvv_i32   :    5.2    4.7 :    3.5    3.2
>> vp8_put_bilin16_h_c        :   48.0   42.7 :   48.0   42.7
>> vp8_put_bilin16_h_rvv_i32  :    5.7    5.0 :    5.2    4.5
>> vp8_put_bilin16_v_c        :   48.2   43.0 :   48.2   42.7
>> vp8_put_bilin16_v_rvv_i32  :    5.7    5.2 :    4.5    4.2
>> ---
>>  libavcodec/riscv/vp8dsp_rvv.S | 34 +-
>>  1 file changed, 29 insertions(+), 5 deletions(-)
>>
>> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
>> index 3360a38cac..5bea6cba9c 100644
>> --- a/libavcodec/riscv/vp8dsp_rvv.S
>> +++ b/libavcodec/riscv/vp8dsp_rvv.S
>> @@ -172,11 +172,35 @@ func ff_put_vp8_bilin4_\type\()_rvv, zve32x
>>  li  t4, 4
>>  sub t1, t1, \mn
>>  1:
>> -addia4, a4, -1
>> -bilin_load  v0, \type, \mn
>> -vse8.v  v0, (a0)
>> -add a2, a2, a3
>> -add a0, a0, a1
>> +add t0, a2, a3
>> +add t2, a0, a1
>> +addia4, a4, -2
>> +.ifc \type,v
>> +add t3, t0, a3
>> +.else
>> +addit5, a2, 1
>> +addit3, t0, 1
>> +vle8.v  v2, (t5)
>> +.endif
>> +vle8.v  v0, (a2)
>> +vle8.v  v4, (t0)
>> +vle8.v  v6, (t3)
>> +vwmulu.vx   v28, v0, t1
>> +vwmulu.vx   v26, v4, t1
>> +.ifc \type,v
>> +vwmaccu.vx  v28, \mn, v4
>> +.else
>> +vwmaccu.vx  v28, \mn, v2
>> +.endif
>> +vwmaccu.vx  v26, \mn, v6
>> +vwaddu.wx   v24, v28, t4
>> +vwaddu.wx   v22, v26, t4
>> +vnsra.wiv30, v24, 3
>> +vnsra.wiv0, v22, 3
>> +vse8.v  v30, (a0)
>> +vse8.v  v0, (t2)
>> +add a2, t0, a3
>> +add a0, t2, a1
>>  bneza4, 1b
>>
>>  ret
>> --
>> 2.45.1
>>
>> ___
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>>
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] lavc/vp8dsp: R-V V put_bilin_h v unroll

2024-05-30 Thread flow gg

I directly copied the VP9 modifications over... Since len <= 16, it seems
like it can be improved a bit more

 于2024年5月30日周四 23:27写道：

> From: sunyuechi 
>
> Since len < 64, the registers are sufficient, so it can be
> directly unrolled (a4 is even).
>
> Another benefit of unrolling is that it reduces one load operation
> vertically compared to horizontally.
>
>                                    old              new
>                                C908    X60     C908    X60
> vp8_put_bilin4_h_c         :    6.2    5.5 :    6.2    5.5
> vp8_put_bilin4_h_rvv_i32   :    2.2    2.0 :    1.5    1.5
> vp8_put_bilin4_v_c         :    6.5    5.7 :    6.2    5.7
> vp8_put_bilin4_v_rvv_i32   :    2.2    2.0 :    1.2    1.5
> vp8_put_bilin8_h_c         :   24.2   21.5 :   24.2   21.5
> vp8_put_bilin8_h_rvv_i32   :    5.2    4.7 :    3.5    3.5
> vp8_put_bilin8_v_c         :   24.5   21.7 :   24.5   21.7
> vp8_put_bilin8_v_rvv_i32   :    5.2    4.7 :    3.5    3.2
> vp8_put_bilin16_h_c        :   48.0   42.7 :   48.0   42.7
> vp8_put_bilin16_h_rvv_i32  :    5.7    5.0 :    5.2    4.5
> vp8_put_bilin16_v_c        :   48.2   43.0 :   48.2   42.7
> vp8_put_bilin16_v_rvv_i32  :    5.7    5.2 :    4.5    4.2
> ---
>  libavcodec/riscv/vp8dsp_rvv.S | 34 +-
>  1 file changed, 29 insertions(+), 5 deletions(-)
>
> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> index 3360a38cac..5bea6cba9c 100644
> --- a/libavcodec/riscv/vp8dsp_rvv.S
> +++ b/libavcodec/riscv/vp8dsp_rvv.S
> @@ -172,11 +172,35 @@ func ff_put_vp8_bilin4_\type\()_rvv, zve32x
>  li  t4, 4
>  sub t1, t1, \mn
>  1:
> -addia4, a4, -1
> -bilin_load  v0, \type, \mn
> -vse8.v  v0, (a0)
> -add a2, a2, a3
> -add a0, a0, a1
> +add t0, a2, a3
> +add t2, a0, a1
> +addia4, a4, -2
> +.ifc \type,v
> +add t3, t0, a3
> +.else
> +addit5, a2, 1
> +addit3, t0, 1
> +vle8.v  v2, (t5)
> +.endif
> +vle8.v  v0, (a2)
> +vle8.v  v4, (t0)
> +vle8.v  v6, (t3)
> +vwmulu.vx   v28, v0, t1
> +vwmulu.vx   v26, v4, t1
> +.ifc \type,v
> +vwmaccu.vx  v28, \mn, v4
> +.else
> +vwmaccu.vx  v28, \mn, v2
> +.endif
> +vwmaccu.vx  v26, \mn, v6
> +vwaddu.wx   v24, v28, t4
> +vwaddu.wx   v22, v26, t4
> +vnsra.wiv30, v24, 3
> +vnsra.wiv0, v22, 3
> +vse8.v  v30, (a0)
> +vse8.v  v0, (t2)
> +add a2, t0, a3
> +add a0, t2, a1
>  bneza4, 1b
>
>  ret
> --
> 2.45.1
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v3 4/5] lavc/vp9dsp: R-V V mc tap h v

2024-05-29 Thread flow gg

A portion has been modified according to the previous review, but there are
still some parts that haven't been updated

> Similarly, it
> should be possible to share most of the horizontal and vertical code
(maybe
> also for bilinear, not just EPel) with separate load/store then inner
> procedures. The H.263 loop filter already does that though with almost no
> overhead, though
> H.263 is obviously simpler than VP9.
>
> A French philosopher famously said that Perfect is the enemy of Good.
> Generally, as with VVC, nested repetition macros for finely specialised
> functions tend to generate way too much byte code, and this ends up being
> worse rather than better in the big picture.

Here, bilin is modified with reference to your vp8 modification method, but
there are some issues with epel. I want to share most of the horizontal and
vertical code like h263, but because there are different types
(op/name/len), such changes seem hard. Trying to make similar modifications
for bilin also seems somewhat hard, so maybe we can leave it for a future optimization
:'(

> It should be possible to spare one ADDI by using just AUIPC here, and
folding
> the immediate offset into the LB's below (see also H.263 loop filter).

I'm not sure where the problem lies: it works for smooth, but for
sharp and regular it gives this error:
dangerous relocation: %pcrel_lo overflow with an addend, the value of
%pcrel_hi is 0xa5000 without any addend, but may be 0xa6000 after adding
the %pcrel_lo addend

 于2024年5月30日周四 01:16写道：

> From: sunyuechi 
>
>                                            C908    X60
> vp9_avg_8tap_smooth_4h_8bpp_c          :   13.0   11.2
> vp9_avg_8tap_smooth_4h_8bpp_rvv_i32    :    5.0    4.2
> vp9_avg_8tap_smooth_4v_8bpp_c          :   13.7   12.5
> vp9_avg_8tap_smooth_4v_8bpp_rvv_i32    :    5.0    4.2
> vp9_avg_8tap_smooth_8h_8bpp_c          :   49.5   42.2
> vp9_avg_8tap_smooth_8h_8bpp_rvv_i32    :    9.2    8.5
> vp9_avg_8tap_smooth_8v_8bpp_c          :   66.5   45.0
> vp9_avg_8tap_smooth_8v_8bpp_rvv_i32    :    9.5    8.5
> vp9_avg_8tap_smooth_16h_8bpp_c         :  192.7  166.5
> vp9_avg_8tap_smooth_16h_8bpp_rvv_i32   :   21.2   18.7
> vp9_avg_8tap_smooth_16v_8bpp_c         :  192.2  175.7
> vp9_avg_8tap_smooth_16v_8bpp_rvv_i32   :   21.5   19.0
> vp9_avg_8tap_smooth_32h_8bpp_c         :  780.2  663.7
> vp9_avg_8tap_smooth_32h_8bpp_rvv_i32   :   83.5   60.0
> vp9_avg_8tap_smooth_32v_8bpp_c         :  770.5  689.2
> vp9_avg_8tap_smooth_32v_8bpp_rvv_i32   :   67.2   60.0
> vp9_avg_8tap_smooth_64h_8bpp_c         : 3115.5 2647.2
> vp9_avg_8tap_smooth_64h_8bpp_rvv_i32   :  283.5  119.2
> vp9_avg_8tap_smooth_64v_8bpp_c         : 3082.2 2729.0
> vp9_avg_8tap_smooth_64v_8bpp_rvv_i32   :  305.2  119.0
> vp9_put_8tap_smooth_4h_8bpp_c          :   11.2    9.7
> vp9_put_8tap_smooth_4h_8bpp_rvv_i32    :    4.2    4.0
> vp9_put_8tap_smooth_4v_8bpp_c          :   11.7   10.7
> vp9_put_8tap_smooth_4v_8bpp_rvv_i32    :    4.2    4.0
> vp9_put_8tap_smooth_8h_8bpp_c          :   42.0   37.5
> vp9_put_8tap_smooth_8h_8bpp_rvv_i32    :    8.5    7.7
> vp9_put_8tap_smooth_8v_8bpp_c          :   44.2   38.7
> vp9_put_8tap_smooth_8v_8bpp_rvv_i32    :    8.5    7.7
> vp9_put_8tap_smooth_16h_8bpp_c         :  165.7  147.2
> vp9_put_8tap_smooth_16h_8bpp_rvv_i32   :   19.5   17.5
> vp9_put_8tap_smooth_16v_8bpp_c         :  169.0  149.7
> vp9_put_8tap_smooth_16v_8bpp_rvv_i32   :   19.7   17.5
> vp9_put_8tap_smooth_32h_8bpp_c         :  659.7  586.7
> vp9_put_8tap_smooth_32h_8bpp_rvv_i32   :   64.2   57.2
> vp9_put_8tap_smooth_32v_8bpp_c         :  680.5  591.2
> vp9_put_8tap_smooth_32v_8bpp_rvv_i32   :   64.2   57.2
> vp9_put_8tap_smooth_64h_8bpp_c         : 2681.5 2339.0
> vp9_put_8tap_smooth_64h_8bpp_rvv_i32   :  255.5  114.2
> vp9_put_8tap_smooth_64v_8bpp_c         : 2709.7 2348.7
> vp9_put_8tap_smooth_64v_8bpp_rvv_i32   :  255.5  114.0
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 204 +
>  libavcodec/riscv/vp9dsp.h  |  72 
>  libavcodec/riscv/vp9dsp_init.c |  37 +-
>  3 files changed, 288 insertions(+), 25 deletions(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 990271736b..53dd833dac 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -36,6 +36,18 @@
>  .endif
>  .endm
>
> +.macro vsetvlstatic16 len
> +.ifc \len,4
> +vsetvli zero, zero, e16, mf2, ta, ma
> +.elseif \len == 8
> +vsetvli zero, zero, e16, m1,

Re: [FFmpeg-devel] [PATCH v2 1/5] lavc/vp9dsp: R-V V mc avg

2024-05-26 Thread flow gg

Hi, maybe we can prioritize this revert:
https://git.ffmpeg.org/gitweb/ffmpeg.git/commit/0c1304ae11b0361ede055ee8ffc6e83529468c73
Using [PATCH v2 1/5] lavc/vp9dsp: R-V V mc avg to avoid conflicts with
other patches.

flow gg  于2024年5月24日周五 14:13写道：

> I want to update the VP9 bilin load, just like you did with VP8, but it
> seems like this patch（[PATCH v2 1/5] lavc/vp9dsp: R-V V mc avg） doesn't
> merge the current updates here but merges the previous version instead, so
> the subsequent patches will have conflicts.
>
> flow gg  于2024年5月22日周三 01:15写道：
>
>> > Please put commas between operands.
>> > This should probably be ff_avg_vp9 or something slightly more specific.
>>
>> Updated here.
>>
>>  于2024年5月22日周三 01:14写道：
>>
>>> From: sunyuechi 
>>>
>>> C908:
>>> vp9_avg4_8bpp_c: 1.2
>>> vp9_avg4_8bpp_rvv_i64: 1.0
>>> vp9_avg8_8bpp_c: 3.7
>>> vp9_avg8_8bpp_rvv_i64: 1.5
>>> vp9_avg16_8bpp_c: 14.7
>>> vp9_avg16_8bpp_rvv_i64: 3.5
>>> vp9_avg32_8bpp_c: 57.7
>>> vp9_avg32_8bpp_rvv_i64: 10.0
>>> vp9_avg64_8bpp_c: 229.0
>>> vp9_avg64_8bpp_rvv_i64: 31.7
>>> ---
>>>  libavcodec/riscv/Makefile  |  3 +-
>>>  libavcodec/riscv/vp9_mc_rvv.S  | 58 ++
>>>  libavcodec/riscv/vp9dsp.h  |  4 +--
>>>  libavcodec/riscv/vp9dsp_init.c | 18 +++
>>>  4 files changed, 80 insertions(+), 3 deletions(-)
>>>  create mode 100644 libavcodec/riscv/vp9_mc_rvv.S
>>>
>>> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
>>> index 07d5c2915d..67e198d754 100644
>>> --- a/libavcodec/riscv/Makefile
>>> +++ b/libavcodec/riscv/Makefile
>>> @@ -69,6 +69,7 @@ RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
>>>  OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
>>>  RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
>>>   riscv/vp9_mc_rvi.o
>>> -RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
>>> +RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o \
>>> +  riscv/vp9_mc_rvv.o
>>>  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
>>>  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
>>> diff --git a/libavcodec/riscv/vp9_mc_rvv.S
>>> b/libavcodec/riscv/vp9_mc_rvv.S
>>> new file mode 100644
>>> index 00..7cb38ec94a
>>> --- /dev/null
>>> +++ b/libavcodec/riscv/vp9_mc_rvv.S
>>> @@ -0,0 +1,58 @@
>>> +/*
>>> + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
>>> (ISCAS).
>>> + *
>>> + * This file is part of FFmpeg.
>>> + *
>>> + * FFmpeg is free software; you can redistribute it and/or
>>> + * modify it under the terms of the GNU Lesser General Public
>>> + * License as published by the Free Software Foundation; either
>>> + * version 2.1 of the License, or (at your option) any later version.
>>> + *
>>> + * FFmpeg is distributed in the hope that it will be useful,
>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> + * Lesser General Public License for more details.
>>> + *
>>> + * You should have received a copy of the GNU Lesser General Public
>>> + * License along with FFmpeg; if not, write to the Free Software
>>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>>> 02110-1301 USA
>>> + */
>>> +
>>> +#include "libavutil/riscv/asm.S"
>>> +
>>> +.macro vsetvlstatic8 len an maxlen mn=m4
>>> +.if \len == 4
>>> +vsetivlizero, \len, e8, mf4, ta, ma
>>> +.elseif \len == 8
>>> +vsetivlizero, \len, e8, mf2, ta, ma
>>> +.elseif \len == 16
>>> +vsetivlizero, \len, e8, m1, ta, ma
>>> +.elseif \len == 32
>>> +li  \an, \len
>>> +vsetvli zero, \an, e8, m2, ta, ma
>>> +.elseif \len == 64
>>> +li  \an, \maxlen
>>> +vsetvli zero, \an, e8, \mn, ta, ma
>>> +.endif
>>> +.endm
>>> +
>>> +.macro copy_avg len
>>> +func ff_vp9_avg\len\()_rvv, zve32x
>>> +csrwi   vxrm, 0
>>> +vsetvlstatic8   \len, t0, 64
>>> +1:
>>> +vle8.v  v8, (a2)
>>> +vle8.v

Re: [FFmpeg-devel] [PATCH 5/5] lavc/vp8dsp: factor R-V V EPEL functions for all lengths

2024-05-25 Thread flow gg

Well, I'm mainly considering that we have added some vset related lines,
but they haven't played a new role for the time being. If it's for future
modifications, it does make sense.

> This is reducing code size by over 2 kib of code, or several hundreds of
instructions.

The reduction in code size seems to be due to switching to using j labels;
it doesn't seem to be about vset, but a separate issue. j labels are indeed
better. I will make similar modifications.

Rémi Denis-Courmont  于2024年5月26日周日 02:29写道：

> Le lauantaina 25. toukokuuta 2024, 21.16.22 EEST flow gg a écrit :
> > Would it be better to replace the two vsetvlstatic8 and vsetvlstatic16
> with
> > two vsetvl?
>
> The other option is to hard-code the most pessimistic multiplier. That
> would
> be easier to read and save two instructions in the head, but it would most
> likely
> end up slower overall, due to increased latency from the vector unit in
> the
> main loop.
>
> On the other hand, with vsetvl, we have the option to adjust the
> multiplier at
> run-time depending on hardware vector size. That will not be possible with
> vsetvli unless we patch the code live (yikes).
>
> > This would require the previous patch and this one to work
> > together,
>
> Yes, patch order matters.
>
> > increasing the number of lines of code
>
> This is reducing code size by over 2 kib of code, or several hundreds of
> instructions.
>
> > Additionally, I have a question about patch 4 'save one R-V GPR' and
> patch
> > 5. Should they be submitted as a single patch? Because patch 4 looks
> > similar to what I initially submitted, and you suggested changing it to
> > save lines of code. If it is only for patch 5, shouldn't they be combined
> > together?
>
> I think people here like to have as small and many patches as possible, as
> is
> generally considered the right way to use Git. Since patch 4 is a very
> minor
> but still independent (from patch 5) improvement, it should be separate,
> as
> far as I understand FFmpeg's practices.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 5/5] lavc/vp8dsp: factor R-V V EPEL functions for all lengths

2024-05-25 Thread flow gg

Would it be better to replace the two vsetvlstatic8 and vsetvlstatic16 with
two vsetvl? This would require the previous patch and this one to work
together, increasing the number of lines of code and making the code a bit
harder to read.
Additionally, I have a question about patch 4 'save one R-V GPR' and patch
5. Should they be submitted as a single patch? Because patch 4 looks
similar to what I initially submitted, and you suggested changing it to
save lines of code. If it is only for patch 5, shouldn't they be combined
together?

Rémi Denis-Courmont  于2024年5月25日周六 23:39写道：

> ---
>  libavcodec/riscv/vp8dsp_rvv.S | 56 ---
>  1 file changed, 32 insertions(+), 24 deletions(-)
>
> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> index a4fcd158a5..002e7f3174 100644
> --- a/libavcodec/riscv/vp8dsp_rvv.S
> +++ b/libavcodec/riscv/vp8dsp_rvv.S
> @@ -32,16 +32,6 @@
>  .endif
>  .endm
>
> -.macro vsetvlstatic16 len
> -.if \len <= 4
> -vsetivlizero, \len, e16, mf2, ta, ma
> -.elseif \len <= 8
> -vsetivlizero, \len, e16, m1, ta, ma
> -.elseif \len <= 16
> -vsetivlizero, \len, e16, m2, ta, ma
> -.endif
> -.endm
> -
>  .macro vp8_idct_dc_add
>  vlse32.v  v0, (a0), a2
>  lha5, 0(a1)
> @@ -181,13 +171,8 @@ const subpel_filters
>  .byte 0,  -1,  12, 123,  -6, 0
>  endconst
>
> -.macro epel len size type
> -func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
> -.ifc \type,v
> -addit0, a6, -1
> -.else
> -addit0, a5, -1
> -.endif
> +.macro epel_common size, type
> +func ff_put_vp8_epel_\type\()\size\().rvv, zve32x
>  lla t2, subpel_filters
>  sh1add  t0, t0, t0
>  sh1add  t0, t0, t2
> @@ -198,7 +183,6 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv,
> zve32x
>  lb  t5, 5(t0)
>  lb  t0, (t0)
>  .endif
> -vsetvlstatic8   \len
>  1:
>  addia4, a4, -1
>  .ifc \type,v
> @@ -236,11 +220,11 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv,
> zve32x
>  vwmaccsu.vx v16, t1, v22
>  vwmaccsu.vx v16, t4, v28
>  vwadd.wxv16, v16, t6
> -vsetvlstatic16  \len
> +vsetvl  zero, zero, a6 # e16
>  vwadd.vvv24, v16, v20
>  vnsra.wiv24, v24, 7
>  vmax.vx v24, v24, zero
> -vsetvlstatic8   \len
> +vsetvl  zero, zero, a5 # e8
>  vnclipu.wi  v30, v24, 0
>  add a2, a2, a3
>  vse8.v  v30, (a0)
> @@ -251,9 +235,33 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv,
> zve32x
>  endfunc
>  .endm
>
> +.macro epel len, size, type
> +func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
> +.ifc \type,v
> +addit0, a6, -1
> +.else
> +addit0, a5, -1
> +.endif
> +.if \len <= 4
> +li  a5, 0306 # e8, mf4, ta, ma
> +li  a6, 0317 # e16, mf2, ta, ma
> +.elseif \len <= 8
> +li  a5, 0307 # e8, mf2, ta, ma
> +li  a6, 0310 # e16, m1, ta, ma
> +.else # if len <= 16
> +li  a5, 0300 # e8, m1, ta, ma
> +li  a6, 0311 # e16, m2, ta, ma
> +.endif
> +vsetvlstatic8 \len
> +j   ff_put_vp8_epel_\type\()\size\().rvv
> +endfunc
> +.endm
> +
> +.irp type,h,v
> +.irp size,4,6
> +epel_common \size, \type
>  .irp len,16,8,4
> -epel \len 6 h
> -epel \len 4 h
> -epel \len 6 v
> -epel \len 4 v
> +epel \len, \size, \type
> +.endr
> +.endr
>  .endr
> --
> 2.45.1
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 3/5] lavc/vp9dsp: R-V V mc tap h v

2024-05-25 Thread flow gg

One more thing I remember is that after adjusting the sign, vmacc can be
used; otherwise, due to the sign, mul + add are needed.

flow gg  于2024年5月25日周六 18:38写道：

> > Is there a reason that you cannot use the tables from C code?
>
> Similar to VP8, to adjust the positive and negative data and prevent small
> probability overflow during calculations.
>
> > AFAICT, regular and sharp are identical, except for the base address of
> > the filter table, so it should be possible to share the byte code
>
> Initially, they used the same code, but after testing hundreds of times,
> there were always a few failures...
>
> Because the data in the table is different, when regular, sharp, and
> smooth use the same code, there will always be a small amount of overflow.
> Different signed and unsigned calculations are needed.
>
> > A French philosopher famously said that Perfect is the enemy of Good.
> > Generally, as with VVC, nested repetition macros for finely specialised
> > functions tend to generate way too much byte code, and this ends up being
> > worse rather than better in the big picture.
>
> Got it, I will try to update.
>
> Rémi Denis-Courmont  于2024年5月25日周六 18:17写道：
>
>> Le tiistaina 21. toukokuuta 2024, 20.13.17 EEST u...@foxmail.com a écrit
>> :
>> > From: sunyuechi 
>> >
>> >  C908   X60
>> > vp9_avg_8tap_smooth_4h_8bpp_c  :   13.0   11.2
>> > vp9_avg_8tap_smooth_4h_8bpp_rvv_i32:5.04.2
>> > vp9_avg_8tap_smooth_4v_8bpp_c  :   13.7   12.5
>> > vp9_avg_8tap_smooth_4v_8bpp_rvv_i32:5.04.2
>> > vp9_avg_8tap_smooth_8h_8bpp_c  :   49.5   42.2
>> > vp9_avg_8tap_smooth_8h_8bpp_rvv_i32:9.28.5
>> > vp9_avg_8tap_smooth_8v_8bpp_c  :   66.5   45.0
>> > vp9_avg_8tap_smooth_8v_8bpp_rvv_i32:9.58.5
>> > vp9_avg_8tap_smooth_16h_8bpp_c :  192.7  166.5
>> > vp9_avg_8tap_smooth_16h_8bpp_rvv_i32   :   21.2   18.7
>> > vp9_avg_8tap_smooth_16v_8bpp_c :  192.2  175.7
>> > vp9_avg_8tap_smooth_16v_8bpp_rvv_i32   :   21.5   19.0
>> > vp9_avg_8tap_smooth_32h_8bpp_c :  780.2  663.7
>> > vp9_avg_8tap_smooth_32h_8bpp_rvv_i32   :   83.5   60.0
>> > vp9_avg_8tap_smooth_32v_8bpp_c :  770.5  689.2
>> > vp9_avg_8tap_smooth_32v_8bpp_rvv_i32   :   67.2   60.0
>> > vp9_avg_8tap_smooth_64h_8bpp_c : 3115.5 2647.2
>> > vp9_avg_8tap_smooth_64h_8bpp_rvv_i32   :  283.5  119.2
>> > vp9_avg_8tap_smooth_64v_8bpp_c : 3082.2 2729.0
>> > vp9_avg_8tap_smooth_64v_8bpp_rvv_i32   :  305.2  119.0
>> > vp9_put_8tap_smooth_4h_8bpp_c  :   11.29.7
>> > vp9_put_8tap_smooth_4h_8bpp_rvv_i32:4.24.0
>> > vp9_put_8tap_smooth_4v_8bpp_c  :   11.7   10.7
>> > vp9_put_8tap_smooth_4v_8bpp_rvv_i32:4.24.0
>> > vp9_put_8tap_smooth_8h_8bpp_c  :   42.0   37.5
>> > vp9_put_8tap_smooth_8h_8bpp_rvv_i32:8.57.7
>> > vp9_put_8tap_smooth_8v_8bpp_c  :   44.2   38.7
>> > vp9_put_8tap_smooth_8v_8bpp_rvv_i32:8.57.7
>> > vp9_put_8tap_smooth_16h_8bpp_c :  165.7  147.2
>> > vp9_put_8tap_smooth_16h_8bpp_rvv_i32   :   19.5   17.5
>> > vp9_put_8tap_smooth_16v_8bpp_c :  169.0  149.7
>> > vp9_put_8tap_smooth_16v_8bpp_rvv_i32   :   19.7   17.5
>> > vp9_put_8tap_smooth_32h_8bpp_c :  659.7  586.7
>> > vp9_put_8tap_smooth_32h_8bpp_rvv_i32   :   64.2   57.2
>> > vp9_put_8tap_smooth_32v_8bpp_c :  680.5  591.2
>> > vp9_put_8tap_smooth_32v_8bpp_rvv_i32   :   64.2   57.2
>> > vp9_put_8tap_smooth_64h_8bpp_c : 2681.5 2339.0
>> > vp9_put_8tap_smooth_64h_8bpp_rvv_i32   :  255.5  114.2
>> > vp9_put_8tap_smooth_64v_8bpp_c : 2709.7 2348.7
>> > vp9_put_8tap_smooth_64v_8bpp_rvv_i32   :  255.5  114.0
>> > ---
>> >  libavcodec/riscv/vp9_mc_rvv.S  | 243 +
>> >  libavcodec/riscv/vp9dsp.h  |  72 ++
>> >  libavcodec/riscv/vp9dsp_init.c |  38 +-
>> >  3 files change

Re: [FFmpeg-devel] [PATCH v2 3/5] lavc/vp9dsp: R-V V mc tap h v

2024-05-25 Thread flow gg

> Is there a reason that you cannot use the tables from C code?

Similar to VP8, to adjust the positive and negative data and prevent small
probability overflow during calculations.

> AFAICT, regular and sharp are identical, except for the base address of
> the filter table, so it should be possible to share the byte code

Initially, they used the same code, but after testing hundreds of times,
there were always a few failures...

Because the data in the table is different, when regular, sharp, and smooth
use the same code, there will always be a small amount of overflow.
Different signed and unsigned calculations are needed.

> A French philosopher famously said that Perfect is the enemy of Good.
> Generally, as with VVC, nested repetition macros for finely specialised
> functions tend to generate way too much byte code, and this ends up being
> worse rather than better in the big picture.

Got it, I will try to update.

Rémi Denis-Courmont  于2024年5月25日周六 18:17写道：

> Le tiistaina 21. toukokuuta 2024, 20.13.17 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
> >
> >  C908   X60
> > vp9_avg_8tap_smooth_4h_8bpp_c  :   13.0   11.2
> > vp9_avg_8tap_smooth_4h_8bpp_rvv_i32:5.04.2
> > vp9_avg_8tap_smooth_4v_8bpp_c  :   13.7   12.5
> > vp9_avg_8tap_smooth_4v_8bpp_rvv_i32:5.04.2
> > vp9_avg_8tap_smooth_8h_8bpp_c  :   49.5   42.2
> > vp9_avg_8tap_smooth_8h_8bpp_rvv_i32:9.28.5
> > vp9_avg_8tap_smooth_8v_8bpp_c  :   66.5   45.0
> > vp9_avg_8tap_smooth_8v_8bpp_rvv_i32:9.58.5
> > vp9_avg_8tap_smooth_16h_8bpp_c :  192.7  166.5
> > vp9_avg_8tap_smooth_16h_8bpp_rvv_i32   :   21.2   18.7
> > vp9_avg_8tap_smooth_16v_8bpp_c :  192.2  175.7
> > vp9_avg_8tap_smooth_16v_8bpp_rvv_i32   :   21.5   19.0
> > vp9_avg_8tap_smooth_32h_8bpp_c :  780.2  663.7
> > vp9_avg_8tap_smooth_32h_8bpp_rvv_i32   :   83.5   60.0
> > vp9_avg_8tap_smooth_32v_8bpp_c :  770.5  689.2
> > vp9_avg_8tap_smooth_32v_8bpp_rvv_i32   :   67.2   60.0
> > vp9_avg_8tap_smooth_64h_8bpp_c : 3115.5 2647.2
> > vp9_avg_8tap_smooth_64h_8bpp_rvv_i32   :  283.5  119.2
> > vp9_avg_8tap_smooth_64v_8bpp_c : 3082.2 2729.0
> > vp9_avg_8tap_smooth_64v_8bpp_rvv_i32   :  305.2  119.0
> > vp9_put_8tap_smooth_4h_8bpp_c  :   11.29.7
> > vp9_put_8tap_smooth_4h_8bpp_rvv_i32:4.24.0
> > vp9_put_8tap_smooth_4v_8bpp_c  :   11.7   10.7
> > vp9_put_8tap_smooth_4v_8bpp_rvv_i32:4.24.0
> > vp9_put_8tap_smooth_8h_8bpp_c  :   42.0   37.5
> > vp9_put_8tap_smooth_8h_8bpp_rvv_i32:8.57.7
> > vp9_put_8tap_smooth_8v_8bpp_c  :   44.2   38.7
> > vp9_put_8tap_smooth_8v_8bpp_rvv_i32:8.57.7
> > vp9_put_8tap_smooth_16h_8bpp_c :  165.7  147.2
> > vp9_put_8tap_smooth_16h_8bpp_rvv_i32   :   19.5   17.5
> > vp9_put_8tap_smooth_16v_8bpp_c :  169.0  149.7
> > vp9_put_8tap_smooth_16v_8bpp_rvv_i32   :   19.7   17.5
> > vp9_put_8tap_smooth_32h_8bpp_c :  659.7  586.7
> > vp9_put_8tap_smooth_32h_8bpp_rvv_i32   :   64.2   57.2
> > vp9_put_8tap_smooth_32v_8bpp_c :  680.5  591.2
> > vp9_put_8tap_smooth_32v_8bpp_rvv_i32   :   64.2   57.2
> > vp9_put_8tap_smooth_64h_8bpp_c : 2681.5 2339.0
> > vp9_put_8tap_smooth_64h_8bpp_rvv_i32   :  255.5  114.2
> > vp9_put_8tap_smooth_64v_8bpp_c : 2709.7 2348.7
> > vp9_put_8tap_smooth_64v_8bpp_rvv_i32   :  255.5  114.0
> > ---
> >  libavcodec/riscv/vp9_mc_rvv.S  | 243 +
> >  libavcodec/riscv/vp9dsp.h  |  72 ++
> >  libavcodec/riscv/vp9dsp_init.c |  38 +-
> >  3 files changed, 328 insertions(+), 25 deletions(-)
> >
> > diff --git a/libavcodec/riscv/vp9_mc_rvv.S
> b/libavcodec/riscv/vp9_mc_rvv.S
> > index 739380d9a9..adba4afb90 100644
> > --- a/libavcodec/riscv/vp9_mc_rvv.S
> > +++ b/libavcodec/riscv/vp9_mc_rvv.S
> > @@ -36,6 +36,18 @@
> >  .endif
> >  .endm
> >
> > +.macro vsetvlstatic16 len
> > +.ifc \len,4
> > +vsetvli zero, zero, e16, mf2, ta, ma
> > +.elseif \len == 8
> > +vsetvli zero, zero, e16, m1, ta, ma
> > +.elseif \len == 16
> > +vsetvli zero, zero, e16, m2, ta, ma
> > +.else
> > +vsetvli zero, zero, e16, m4, ta, ma
> > +.endif
> > +.endm
> > +
> >  .macro copy_avg len
> >  func ff_vp9_avg\len\()_rvv, zve32x
> >  csrwi   vxrm, 0
> > @@ -92,10 +104,241 @@ func ff_\op\()_v

Re: [FFmpeg-devel] [PATCH v2 1/5] lavc/vp9dsp: R-V V mc avg

2024-05-23 Thread flow gg

I want to update the VP9 bilin load, just like you did with VP8, but it
seems that the version of this patch ([PATCH v2 1/5] lavc/vp9dsp: R-V V mc
avg) that was merged is the previous one rather than the update posted here,
so the subsequent patches will have conflicts.

flow gg  于2024年5月22日周三 01:15写道：

> > Please put commas between operands.
> > This should probably be ff_avg_vp9 or something slightly more specific.
>
> Updated here.
>
>  于2024年5月22日周三 01:14写道：
>
>> From: sunyuechi 
>>
>> C908:
>> vp9_avg4_8bpp_c: 1.2
>> vp9_avg4_8bpp_rvv_i64: 1.0
>> vp9_avg8_8bpp_c: 3.7
>> vp9_avg8_8bpp_rvv_i64: 1.5
>> vp9_avg16_8bpp_c: 14.7
>> vp9_avg16_8bpp_rvv_i64: 3.5
>> vp9_avg32_8bpp_c: 57.7
>> vp9_avg32_8bpp_rvv_i64: 10.0
>> vp9_avg64_8bpp_c: 229.0
>> vp9_avg64_8bpp_rvv_i64: 31.7
>> ---
>>  libavcodec/riscv/Makefile  |  3 +-
>>  libavcodec/riscv/vp9_mc_rvv.S  | 58 ++
>>  libavcodec/riscv/vp9dsp.h  |  4 +--
>>  libavcodec/riscv/vp9dsp_init.c | 18 +++
>>  4 files changed, 80 insertions(+), 3 deletions(-)
>>  create mode 100644 libavcodec/riscv/vp9_mc_rvv.S
>>
>> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
>> index 07d5c2915d..67e198d754 100644
>> --- a/libavcodec/riscv/Makefile
>> +++ b/libavcodec/riscv/Makefile
>> @@ -69,6 +69,7 @@ RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
>>  OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
>>  RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
>>   riscv/vp9_mc_rvi.o
>> -RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
>> +RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o \
>> +  riscv/vp9_mc_rvv.o
>>  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
>>  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
>> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
>> new file mode 100644
>> index 00..7cb38ec94a
>> --- /dev/null
>> +++ b/libavcodec/riscv/vp9_mc_rvv.S
>> @@ -0,0 +1,58 @@
>> +/*
>> + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
>> (ISCAS).
>> + *
>> + * This file is part of FFmpeg.
>> + *
>> + * FFmpeg is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2.1 of the License, or (at your option) any later version.
>> + *
>> + * FFmpeg is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with FFmpeg; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>> 02110-1301 USA
>> + */
>> +
>> +#include "libavutil/riscv/asm.S"
>> +
>> +.macro vsetvlstatic8 len an maxlen mn=m4
>> +.if \len == 4
>> +vsetivlizero, \len, e8, mf4, ta, ma
>> +.elseif \len == 8
>> +vsetivlizero, \len, e8, mf2, ta, ma
>> +.elseif \len == 16
>> +vsetivlizero, \len, e8, m1, ta, ma
>> +.elseif \len == 32
>> +li  \an, \len
>> +vsetvli zero, \an, e8, m2, ta, ma
>> +.elseif \len == 64
>> +li  \an, \maxlen
>> +vsetvli zero, \an, e8, \mn, ta, ma
>> +.endif
>> +.endm
>> +
>> +.macro copy_avg len
>> +func ff_vp9_avg\len\()_rvv, zve32x
>> +csrwi   vxrm, 0
>> +vsetvlstatic8   \len, t0, 64
>> +1:
>> +vle8.v  v8, (a2)
>> +vle8.v  v16, (a0)
>> +vaaddu.vv   v8, v8, v16
>> +addia4, a4, -1
>> +vse8.v  v8, (a0)
>> +add a2, a2, a3
>> +add a0, a0, a1
>> +bneza4, 1b
>> +ret
>> +endfunc
>> +.endm
>> +
>> +.irp len, 64, 32, 16, 8, 4
>> +copy_avg \len
>> +.endr
>> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
>> index 79330b4968..ff8431591c 100644
>> --- a/libavcodec/riscv/vp9dsp.h
>> +++ b/libavcodec/riscv/vp9dsp.h
>> @@ -138,11 +138,11 @@ void ff_a

Re: [FFmpeg-devel] [PATCH] lavc/rv34dsp: optimise R-V V idct_dc_add

2024-05-22 Thread flow gg

Unfortunately, I only run the tests to obtain benchmarks and to check basic
correctness. I have always felt that the tests need to be written by a
professional.

Rémi Denis-Courmont  于2024年5月23日周四 04:35写道：

>
>
> Le 22 mai 2024 23:28:54 GMT+03:00, "Rémi Denis-Courmont" 
> a écrit :
> >This removes one stray LI and reworks the vector arithmetic to avoid
> >changing the vector configuration. On K230, this takes the cycle
> >count down from 46.5 to 43.5.
> >---
> > libavcodec/riscv/rv34dsp_rvv.S | 13 ++---
> > 1 file changed, 6 insertions(+), 7 deletions(-)
> >
> >diff --git a/libavcodec/riscv/rv34dsp_rvv.S
> b/libavcodec/riscv/rv34dsp_rvv.S
> >index f1f6345012..e8aff7e570 100644
> >--- a/libavcodec/riscv/rv34dsp_rvv.S
> >+++ b/libavcodec/riscv/rv34dsp_rvv.S
> >@@ -36,16 +36,15 @@ func ff_rv34_idct_dc_add_rvv, zve32x
> > vsetivli  zero, 4, e8, mf4, ta, ma
> > vlse32.v  v0, (a0), a1
> > lit1, 169
> >+lit2, 128
> > mul   t1, t1, a2
> >-lia2, 255
> >+vsetivli  zero, 4*4, e8, m1, ta, ma
> >+vwsubu.vx v2, v0, t2
> > addi  t1, t1, 512
> > srai  t1, t1, 10
> >-vsetivli  zero, 4*4, e16, m2, ta, ma
> >-vzext.vf2 v2, v0
> >-vadd.vx   v2, v2, t1
> >-vmax.vx   v2, v2, zero
> >-vsetvli   zero, zero, e8, m1, ta, ma
> >-vnclipu.wiv0, v2, 0
> >+vwadd.wx  v2, v2, t1
>
> Hmm, this should not work, as t1 has more than 8 bits. Maybe checkasm is
> sloppy here.
>
> >+vnclip.wi v0, v2, 0
> >+vxor.vx   v0, v0, t2
> > vsetivli  zero, 4, e8, mf4, ta, ma
> > vsse32.v  v0, (a0), a1
> >
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg

2024-05-21 Thread flow gg

Reordered some here.

 于2024年5月22日周三 03:24写道：

> From: sunyuechi 
>
>   C908   X60
> avg_8_2x2_c:1.01.0
> avg_8_2x2_rvv_i32  :0.70.7
> avg_8_2x4_c:2.02.0
> avg_8_2x4_rvv_i32  :1.00.7
> avg_8_2x8_c:4.03.7
> avg_8_2x8_rvv_i32  :1.51.2
> avg_8_2x16_c   :7.57.7
> avg_8_2x16_rvv_i32 :2.72.5
> avg_8_2x32_c   :   14.2   15.0
> avg_8_2x32_rvv_i32 :5.04.5
> avg_8_2x64_c   :   28.5   30.2
> avg_8_2x64_rvv_i32 :9.58.7
> avg_8_2x128_c  :   80.0   70.5
> avg_8_2x128_rvv_i32:   50.7   41.2
> avg_8_4x2_c:1.72.0
> avg_8_4x2_rvv_i32  :0.70.7
> avg_8_4x4_c:3.53.7
> avg_8_4x4_rvv_i32  :1.21.0
> avg_8_4x8_c:6.77.0
> avg_8_4x8_rvv_i32  :1.51.2
> avg_8_4x16_c   :   13.2   14.0
> avg_8_4x16_rvv_i32 :2.72.5
> avg_8_4x32_c   :   26.2   27.7
> avg_8_4x32_rvv_i32 :5.04.5
> avg_8_4x64_c   :   52.2   55.0
> avg_8_4x64_rvv_i32 :9.58.7
> avg_8_4x128_c  :  146.0  117.5
> avg_8_4x128_rvv_i32:   53.2   40.5
> avg_8_8x2_c:3.53.5
> avg_8_8x2_rvv_i32  :0.70.7
> avg_8_8x4_c:6.56.5
> avg_8_8x4_rvv_i32  :1.21.0
> avg_8_8x8_c:   12.7   13.2
> avg_8_8x8_rvv_i32  :2.01.5
> avg_8_8x16_c   :   25.2   26.2
> avg_8_8x16_rvv_i32 :3.52.5
> avg_8_8x32_c   :   50.0   52.7
> avg_8_8x32_rvv_i32 :6.54.7
> avg_8_8x64_c   :   99.7  105.0
> avg_8_8x64_rvv_i32 :   12.58.5
> avg_8_8x128_c  :  225.7  218.0
> avg_8_8x128_rvv_i32:   78.0   39.2
> avg_8_16x2_c   :6.26.7
> avg_8_16x2_rvv_i32 :1.20.7
> avg_8_16x4_c   :   12.2   12.7
> avg_8_16x4_rvv_i32 :2.01.2
> avg_8_16x8_c   :   24.7   26.0
> avg_8_16x8_rvv_i32 :3.51.7
> avg_8_16x16_c  :   49.0   51.5
> avg_8_16x16_rvv_i32:6.23.2
> avg_8_16x32_c  :   97.5  102.5
> avg_8_16x32_rvv_i32:   11.55.7
> avg_8_16x64_c  :  212.5  204.7
> avg_8_16x64_rvv_i32:   22.5   11.0
> avg_8_16x128_c :  411.2  418.2
> avg_8_16x128_rvv_i32   :   76.0   47.7
> avg_8_32x2_c   :   12.2   12.7
> avg_8_32x2_rvv_i32 :2.01.2
> avg_8_32x4_c   :   24.2   25.5
> avg_8_32x4_rvv_i32 :3.21.7
> avg_8_32x8_c   :   48.5   50.7
> avg_8_32x8_rvv_i32 :5.73.2
> avg_8_32x16_c  :   96.5  101.2
> avg_8_32x16_rvv_i32:   10.75.7
> avg_8_32x32_c  :  192.5  202.5
> avg_8_32x32_rvv_i32:   20.7   10.5
> avg_8_32x64_c  :  411.2  404.5
> avg_8_32x64_rvv_i32:   41.0   20.5
> avg_8_32x128_c :  834.7  855.2
> avg_8_32x128_rvv_i32   :  151.2  118.7
> avg_8_64x2_c

Re: [FFmpeg-devel] [PATCH v2 2/5] lavc/vp9dsp: R-V V mc bilin h v

2024-05-21 Thread flow gg

Do macro definitions also need commas? I noticed that much of my old code
and SiFive's code doesn't have them.

Rémi Denis-Courmont  于2024年5月22日周三 02:29写道：

> Le tiistaina 21. toukokuuta 2024, 20.13.16 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
>
> > diff --git a/libavcodec/riscv/vp9_mc_rvv.S
> b/libavcodec/riscv/vp9_mc_rvv.S
> > index 7cb38ec94a..739380d9a9 100644
> > --- a/libavcodec/riscv/vp9_mc_rvv.S
> > +++ b/libavcodec/riscv/vp9_mc_rvv.S
> > @@ -53,6 +53,49 @@ func ff_vp9_avg\len\()_rvv, zve32x
> >  endfunc
> >  .endm
> >
> > +.macro bilin_load dst len op type mn
>
> Commas, please.
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg

2024-05-21 Thread flow gg

> I would expect that you can get better performance by interleaving scalar
> and vector stuff, and possibly also vector loads and vector arithmetic.

Okay, I will try

> These labels lead to nowhere? If you actually mean to implicitly fall
> through to the next function, you can use the function name directly rather
> than add odd labels.

These labels are used to convert variable parameters into constants, to
achieve better performance and to prepare for the next .irp. Some of the
names look strange because label names cannot be duplicated. There is only
one function here, and it is meant to be executed after passing through
these labels.

Rémi Denis-Courmont  于2024年5月22日周三 00:04写道：

> Le tiistaina 21. toukokuuta 2024, 10.37.51 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
> > ---
> >  libavcodec/riscv/Makefile  |   2 +
> >  libavcodec/riscv/vvc_mc_rvv.S  | 312 +
> >  libavcodec/riscv/vvcdsp_init.c |  76 
> >  libavcodec/vvc/dsp.c   |   4 +-
> >  libavcodec/vvc/dsp.h   |   1 +
> >  5 files changed, 394 insertions(+), 1 deletion(-)
> >  create mode 100644 libavcodec/riscv/vvc_mc_rvv.S
> >  create mode 100644 libavcodec/riscv/vvcdsp_init.c
> >
> > diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> > index 27b268ae39..6297664fc9 100644
> > --- a/libavcodec/riscv/Makefile
> > +++ b/libavcodec/riscv/Makefile
> > @@ -68,3 +68,5 @@ RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o
> \
> >  RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
> >  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
> >  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> > +OBJS-$(CONFIG_VVC_DECODER) += riscv/vvcdsp_init.o
> > +RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc_mc_rvv.o
> > diff --git a/libavcodec/riscv/vvc_mc_rvv.S
> b/libavcodec/riscv/vvc_mc_rvv.S
> > new file mode 100644
> > index 00..26a6afba1f
> > --- /dev/null
> > +++ b/libavcodec/riscv/vvc_mc_rvv.S
> > @@ -0,0 +1,312 @@
> > +/*
> > + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> > (ISCAS). + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA + */
> > +
> > +#include "libavutil/riscv/asm.S"
> > +
> > +.macro vsetvlstatic8 w vlen is_w
> > +.if \w <= 2
> > +vsetivlizero, \w, e8, mf8, ta, ma
> > +.elseif \w <= 4 && \vlen == 128
> > +vsetivlizero, \w, e8, mf4, ta, ma
> > +.elseif \w <= 4 && \vlen >= 256
> > +vsetivlizero, \w, e8, mf8, ta, ma
> > +.elseif \w <= 8 && \vlen == 128
> > +vsetivlizero, \w, e8, mf2, ta, ma
> > +.elseif \w <= 8 && \vlen >= 256
> > +vsetivlizero, \w, e8, mf4, ta, ma
> > +.elseif \w <= 16 && \vlen == 128
> > +vsetivlizero, \w, e8, m1, ta, ma
> > +.elseif \w <= 16 && \vlen >= 256
> > +vsetivlizero, \w, e8, mf2, ta, ma
> > +.elseif \w <= 32 && \vlen >= 256
> > +li t0, \w
> > +vsetvli zero, t0, e8, m1, ta, ma
> > +.elseif \w <= (\vlen / 4) || \is_w
> > +li t0, 64
> > +vsetvli zero, t0, e8, m2, ta, ma
> > +.else
> > +li t0, \w
> > +vsetvli zero, t0, e8, m4, ta, ma
> > +.endif
> > +.endm
> > +
> > +.macro vsetvlstatic16 w vlen is_w
> > +.if \w <= 2
> > +vsetivlizero, \w, e16, mf4, ta, ma
> > +.elseif \w <= 4 && \vlen == 128
> > +vsetivlizero, \w, e16, mf2, ta, ma
> > +.elseif \w <= 4 && \vlen >= 256
> > +vsetivlizero, \w, e16, mf4, ta, ma
> > +.elseif \w <= 8 && \vlen == 128
> > +vsetivlizero, \w, e16, m1, ta, ma
> > +.elseif \w <= 8 && \vlen >= 256
> > +vsetivlizero, \w, e16, mf2, ta, ma
> > +.elseif \w <= 16 && \vlen == 128
> > +vsetivlizero, \w, e16, m2, ta, ma
> > +.elseif \w <= 16 && \vlen >= 256
> > +vsetivlizero, \w, e16, m1, ta, ma
> > +.elseif \w <= 32 && \vlen >= 25

Re: [FFmpeg-devel] [PATCH v2 1/5] lavc/vp9dsp: R-V V mc avg

2024-05-21 Thread flow gg

> Please put commas between operands.
> This should probably be ff_avg_vp9 or something slightly more specific.

Updated here.

 于2024年5月22日周三 01:14写道：

> From: sunyuechi 
>
> C908:
> vp9_avg4_8bpp_c: 1.2
> vp9_avg4_8bpp_rvv_i64: 1.0
> vp9_avg8_8bpp_c: 3.7
> vp9_avg8_8bpp_rvv_i64: 1.5
> vp9_avg16_8bpp_c: 14.7
> vp9_avg16_8bpp_rvv_i64: 3.5
> vp9_avg32_8bpp_c: 57.7
> vp9_avg32_8bpp_rvv_i64: 10.0
> vp9_avg64_8bpp_c: 229.0
> vp9_avg64_8bpp_rvv_i64: 31.7
> ---
>  libavcodec/riscv/Makefile  |  3 +-
>  libavcodec/riscv/vp9_mc_rvv.S  | 58 ++
>  libavcodec/riscv/vp9dsp.h  |  4 +--
>  libavcodec/riscv/vp9dsp_init.c | 18 +++
>  4 files changed, 80 insertions(+), 3 deletions(-)
>  create mode 100644 libavcodec/riscv/vp9_mc_rvv.S
>
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 07d5c2915d..67e198d754 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -69,6 +69,7 @@ RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
>  OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
>  RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
>   riscv/vp9_mc_rvi.o
> -RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
> +RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o \
> +  riscv/vp9_mc_rvv.o
>  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
>  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> new file mode 100644
> index 00..7cb38ec94a
> --- /dev/null
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -0,0 +1,58 @@
> +/*
> + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +.macro vsetvlstatic8 len an maxlen mn=m4
> +.if \len == 4
> +vsetivlizero, \len, e8, mf4, ta, ma
> +.elseif \len == 8
> +vsetivlizero, \len, e8, mf2, ta, ma
> +.elseif \len == 16
> +vsetivlizero, \len, e8, m1, ta, ma
> +.elseif \len == 32
> +li  \an, \len
> +vsetvli zero, \an, e8, m2, ta, ma
> +.elseif \len == 64
> +li  \an, \maxlen
> +vsetvli zero, \an, e8, \mn, ta, ma
> +.endif
> +.endm
> +
> +.macro copy_avg len
> +func ff_vp9_avg\len\()_rvv, zve32x
> +csrwi   vxrm, 0
> +vsetvlstatic8   \len, t0, 64
> +1:
> +vle8.v  v8, (a2)
> +vle8.v  v16, (a0)
> +vaaddu.vv   v8, v8, v16
> +addia4, a4, -1
> +vse8.v  v8, (a0)
> +add a2, a2, a3
> +add a0, a0, a1
> +bneza4, 1b
> +ret
> +endfunc
> +.endm
> +
> +.irp len, 64, 32, 16, 8, 4
> +copy_avg \len
> +.endr
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 79330b4968..ff8431591c 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -138,11 +138,11 @@ void ff_avg_bilin_##SIZE##hv_rvv(uint8_t *dst,
> ptrdiff_t dststride,\
>   int h, int mx, int my);
>
>  #define VP9_COPY_AVG_RISCV_RVV_FUNC(SIZE)   \
> -void ff_copy##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride,\
> +void ff_vp9_copy##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride,\
>   const uint8_t *src, ptrdiff_t srcstride,  \
>   int h, int mx, int my);   \
> \
> -void ff_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride, \
> +void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride, \
>  const uint8_t *src, ptrdiff_t srcstride,   \
>  int h, int mx, int my);
>
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index ab99294d44..454dcd963f 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -48,6 +48,24 @@ static av_cold void vp9dsp_mc_init_risc

Re: [FFmpeg-devel] [PATCH v4 1/5] lavc/vp9dsp: R-V V mc avg

2024-05-21 Thread flow gg

> Please put commas between operands.

Okay

> This should probably be ff_avg_vp9 or something slightly more specific.

Is it necessary here? Many macros in the C file are copied from MIPS, where
it is called ff_avg4_msa. Here, it has been simply changed to ff_avg4_rvv.

Rémi Denis-Courmont  于2024年5月21日周二 23:24写道：

> Le lauantaina 18. toukokuuta 2024, 21.15.29 EEST u...@foxmail.com a écrit
> :
> > From: sunyuechi 
> >
> > C908:
> > vp9_avg4_8bpp_c: 1.2
> > vp9_avg4_8bpp_rvv_i64: 1.0
> > vp9_avg8_8bpp_c: 3.7
> > vp9_avg8_8bpp_rvv_i64: 1.5
> > vp9_avg16_8bpp_c: 14.7
> > vp9_avg16_8bpp_rvv_i64: 3.5
> > vp9_avg32_8bpp_c: 57.7
> > vp9_avg32_8bpp_rvv_i64: 10.0
> > vp9_avg64_8bpp_c: 229.0
> > vp9_avg64_8bpp_rvv_i64: 31.7
> > ---
> >  libavcodec/riscv/Makefile  |  3 +-
> >  libavcodec/riscv/vp9_mc_rvv.S  | 58 ++
> >  libavcodec/riscv/vp9dsp_init.c | 18 +++
> >  3 files changed, 78 insertions(+), 1 deletion(-)
> >  create mode 100644 libavcodec/riscv/vp9_mc_rvv.S
> >
> > diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> > index 27b268ae39..4739d83522 100644
> > --- a/libavcodec/riscv/Makefile
> > +++ b/libavcodec/riscv/Makefile
> > @@ -65,6 +65,7 @@ RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
> >  OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
> >  RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
> >   riscv/vp9_mc_rvi.o
> > -RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
> > +RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o \
> > +  riscv/vp9_mc_rvv.o
> >  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
> >  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> > diff --git a/libavcodec/riscv/vp9_mc_rvv.S
> b/libavcodec/riscv/vp9_mc_rvv.S
> > new file mode 100644
> > index 00..7811cd9928
> > --- /dev/null
> > +++ b/libavcodec/riscv/vp9_mc_rvv.S
> > @@ -0,0 +1,58 @@
> > +/*
> > + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> > (ISCAS). + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA + */
> > +
> > +#include "libavutil/riscv/asm.S"
> > +
> > +.macro vsetvlstatic8 len an maxlen mn=m4
>
> Please put commas between operands.
>
> > +.if \len == 4
> > +vsetivlizero, \len, e8, mf4, ta, ma
> > +.elseif \len == 8
> > +vsetivlizero, \len, e8, mf2, ta, ma
> > +.elseif \len == 16
> > +vsetivlizero, \len, e8, m1, ta, ma
> > +.elseif \len == 32
> > +li  \an, \len
> > +vsetvli zero, \an, e8, m2, ta, ma
> > +.elseif \len == 64
> > +li  \an, \maxlen
> > +vsetvli zero, \an, e8, \mn, ta, ma
> > +.endif
> > +.endm
> > +
> > +.macro copy_avg len
> > +func ff_avg\len\()_rvv, zve32x
>
> This should probably be ff_avg_vp9 or something slightly more specific.
>
> > +csrwi   vxrm, 0
> > +vsetvlstatic8   \len t0 64
> > +1:
> > +vle8.v  v8, (a2)
> > +vle8.v  v16, (a0)
> > +vaaddu.vv   v8, v8, v16
> > +addia4, a4, -1
> > +vse8.v  v8, (a0)
> > +add a2, a2, a3
> > +add a0, a0, a1
> > +bneza4, 1b
> > +ret
> > +endfunc
> > +.endm
> > +
> > +.irp len, 64, 32, 16, 8, 4
> > +copy_avg \len
> > +.endr
> > diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> > index ab99294d44..6bfe23563a 100644
> > --- a/libavcodec/riscv/vp9dsp_init.c
> > +++ b/libavcodec/riscv/vp9dsp_init.c
> > @@ -48,6 +48,24 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> > *dsp, int bpp) }
> >  # endif
> >
> > +#if HAVE_RVV
> > +if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
> ff_rv_vlen_least(128))
> > { +
> > +#define init_fpel(idx1, sz)   \
> > +dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_avg##sz##_rvv;  \
> > +dsp->mc[idx1][FILTER_8TAP_REGULAR][1][0][0] = ff_avg##sz##_rvv;  \
> > +dsp->mc[idx1][FILTER_8TAP_SHARP  ][1][0][0] = ff_avg##sz##_rvv;  \
> > +dsp->mc[idx1][FILTER_BILINEAR][1][0][0] = f

Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg

2024-05-21 Thread flow gg

There were three unused lines that I forgot to delete before submitting; I
have removed them in the updated patch here.

 于2024年5月21日周二 15:47写道：

> From: sunyuechi 
>
>                             C908     X60
> avg_8_2x2_c            :     1.0     1.0
> avg_8_2x2_rvv_i32      :     0.7     0.7
> avg_8_2x4_c            :     2.0     2.0
> avg_8_2x4_rvv_i32      :     1.0     0.7
> avg_8_2x8_c            :     4.0     3.7
> avg_8_2x8_rvv_i32      :     1.5     1.2
> avg_8_2x16_c           :     7.5     7.7
> avg_8_2x16_rvv_i32     :     2.7     2.5
> avg_8_2x32_c           :    14.2    15.0
> avg_8_2x32_rvv_i32     :     5.0     4.5
> avg_8_2x64_c           :    28.5    30.2
> avg_8_2x64_rvv_i32     :     9.5     8.7
> avg_8_2x128_c          :    80.0    70.5
> avg_8_2x128_rvv_i32    :    50.7    41.2
> avg_8_4x2_c            :     1.7     2.0
> avg_8_4x2_rvv_i32      :     0.7     0.7
> avg_8_4x4_c            :     3.5     3.7
> avg_8_4x4_rvv_i32      :     1.2     1.0
> avg_8_4x8_c            :     6.7     7.0
> avg_8_4x8_rvv_i32      :     1.5     1.2
> avg_8_4x16_c           :    13.2    14.0
> avg_8_4x16_rvv_i32     :     2.7     2.5
> avg_8_4x32_c           :    26.2    27.7
> avg_8_4x32_rvv_i32     :     5.0     4.5
> avg_8_4x64_c           :    52.2    55.0
> avg_8_4x64_rvv_i32     :     9.5     8.7
> avg_8_4x128_c          :   146.0   117.5
> avg_8_4x128_rvv_i32    :    53.2    40.5
> avg_8_8x2_c            :     3.5     3.5
> avg_8_8x2_rvv_i32      :     0.7     0.7
> avg_8_8x4_c            :     6.5     6.5
> avg_8_8x4_rvv_i32      :     1.2     1.0
> avg_8_8x8_c            :    12.7    13.2
> avg_8_8x8_rvv_i32      :     2.0     1.5
> avg_8_8x16_c           :    25.2    26.2
> avg_8_8x16_rvv_i32     :     3.5     2.5
> avg_8_8x32_c           :    50.0    52.7
> avg_8_8x32_rvv_i32     :     6.5     4.7
> avg_8_8x64_c           :    99.7   105.0
> avg_8_8x64_rvv_i32     :    12.5     8.5
> avg_8_8x128_c          :   225.7   218.0
> avg_8_8x128_rvv_i32    :    78.0    39.2
> avg_8_16x2_c           :     6.2     6.7
> avg_8_16x2_rvv_i32     :     1.2     0.7
> avg_8_16x4_c           :    12.2    12.7
> avg_8_16x4_rvv_i32     :     2.0     1.2
> avg_8_16x8_c           :    24.7    26.0
> avg_8_16x8_rvv_i32     :     3.5     1.7
> avg_8_16x16_c          :    49.0    51.5
> avg_8_16x16_rvv_i32    :     6.2     3.2
> avg_8_16x32_c          :    97.5   102.5
> avg_8_16x32_rvv_i32    :    11.5     5.7
> avg_8_16x64_c          :   212.5   204.7
> avg_8_16x64_rvv_i32    :    22.5    11.0
> avg_8_16x128_c         :   411.2   418.2
> avg_8_16x128_rvv_i32   :    76.0    47.7
> avg_8_32x2_c           :    12.2    12.7
> avg_8_32x2_rvv_i32     :     2.0     1.2
> avg_8_32x4_c           :    24.2    25.5
> avg_8_32x4_rvv_i32     :     3.2     1.7
> avg_8_32x8_c           :    48.5    50.7
> avg_8_32x8_rvv_i32     :     5.7     3.2
> avg_8_32x16_c          :    96.5   101.2
> avg_8_32x16_rvv_i32    :    10.7     5.7
> avg_8_32x32_c          :   192.5   202.5
> avg_8_32x32_rvv_i32    :    20.7    10.5
> avg_8_32x64_c          :   411.2   404.5
> avg_8_32x64_rvv_i32    :    41.0    20.5
> avg_8_32x128_c         :   834.7   855.2
> avg_8_32x128_rvv_i32

Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg

2024-05-21 Thread flow gg

To obtain these test results, you need to comment out the `if (w == h)` check
in tests/checkasm/vvc_mc.c.
Because vset has to be issued inside the loop, I wrote a (rather cumbersome)
vset macro by hand.

 于2024年5月21日周二 15:38写道：

> From: sunyuechi 
>
>                             C908     X60
> avg_8_2x2_c            :     1.0     1.0
> avg_8_2x2_rvv_i32      :     0.7     0.7
> avg_8_2x4_c            :     2.0     2.0
> avg_8_2x4_rvv_i32      :     1.0     0.7
> avg_8_2x8_c            :     4.0     3.7
> avg_8_2x8_rvv_i32      :     1.5     1.2
> avg_8_2x16_c           :     7.5     7.7
> avg_8_2x16_rvv_i32     :     2.7     2.5
> avg_8_2x32_c           :    14.2    15.0
> avg_8_2x32_rvv_i32     :     5.0     4.5
> avg_8_2x64_c           :    28.5    30.2
> avg_8_2x64_rvv_i32     :     9.5     8.7
> avg_8_2x128_c          :    80.0    70.5
> avg_8_2x128_rvv_i32    :    50.7    41.2
> avg_8_4x2_c            :     1.7     2.0
> avg_8_4x2_rvv_i32      :     0.7     0.7
> avg_8_4x4_c            :     3.5     3.7
> avg_8_4x4_rvv_i32      :     1.2     1.0
> avg_8_4x8_c            :     6.7     7.0
> avg_8_4x8_rvv_i32      :     1.5     1.2
> avg_8_4x16_c           :    13.2    14.0
> avg_8_4x16_rvv_i32     :     2.7     2.5
> avg_8_4x32_c           :    26.2    27.7
> avg_8_4x32_rvv_i32     :     5.0     4.5
> avg_8_4x64_c           :    52.2    55.0
> avg_8_4x64_rvv_i32     :     9.5     8.7
> avg_8_4x128_c          :   146.0   117.5
> avg_8_4x128_rvv_i32    :    53.2    40.5
> avg_8_8x2_c            :     3.5     3.5
> avg_8_8x2_rvv_i32      :     0.7     0.7
> avg_8_8x4_c            :     6.5     6.5
> avg_8_8x4_rvv_i32      :     1.2     1.0
> avg_8_8x8_c            :    12.7    13.2
> avg_8_8x8_rvv_i32      :     2.0     1.5
> avg_8_8x16_c           :    25.2    26.2
> avg_8_8x16_rvv_i32     :     3.5     2.5
> avg_8_8x32_c           :    50.0    52.7
> avg_8_8x32_rvv_i32     :     6.5     4.7
> avg_8_8x64_c           :    99.7   105.0
> avg_8_8x64_rvv_i32     :    12.5     8.5
> avg_8_8x128_c          :   225.7   218.0
> avg_8_8x128_rvv_i32    :    78.0    39.2
> avg_8_16x2_c           :     6.2     6.7
> avg_8_16x2_rvv_i32     :     1.2     0.7
> avg_8_16x4_c           :    12.2    12.7
> avg_8_16x4_rvv_i32     :     2.0     1.2
> avg_8_16x8_c           :    24.7    26.0
> avg_8_16x8_rvv_i32     :     3.5     1.7
> avg_8_16x16_c          :    49.0    51.5
> avg_8_16x16_rvv_i32    :     6.2     3.2
> avg_8_16x32_c          :    97.5   102.5
> avg_8_16x32_rvv_i32    :    11.5     5.7
> avg_8_16x64_c          :   212.5   204.7
> avg_8_16x64_rvv_i32    :    22.5    11.0
> avg_8_16x128_c         :   411.2   418.2
> avg_8_16x128_rvv_i32   :    76.0    47.7
> avg_8_32x2_c           :    12.2    12.7
> avg_8_32x2_rvv_i32     :     2.0     1.2
> avg_8_32x4_c           :    24.2    25.5
> avg_8_32x4_rvv_i32     :     3.2     1.7
> avg_8_32x8_c           :    48.5    50.7
> avg_8_32x8_rvv_i32     :     5.7     3.2
> avg_8_32x16_c          :    96.5   101.2
> avg_8_32x16_rvv_i32    :    10.7     5.7
> avg_8_32x32_c          :   192.5   202.5
> avg_8_32x32_rvv_i32    :    20.7    10.5
> avg_8_32x64_c          :   411.2   404.5
> avg_8_32x64_rvv_i32    :    41.0    20.5
> avg_8_32x128_c

Re: [FFmpeg-devel] [PATCH 1/4] lavc/vp8dsp: R-V V put_epel hv

2024-05-19 Thread flow gg

fix .irp use

 于2024年5月19日周日 16:18写道：

> From: sunyuechi 
>
> C908:
> vp8_put_epel4_h4v4_c: 20.0
> vp8_put_epel4_h4v4_rvv_i32: 11.0
> vp8_put_epel4_h4v6_c: 25.2
> vp8_put_epel4_h4v6_rvv_i32: 13.5
> vp8_put_epel4_h6v4_c: 22.2
> vp8_put_epel4_h6v4_rvv_i32: 14.5
> vp8_put_epel4_h6v6_c: 29.0
> vp8_put_epel4_h6v6_rvv_i32: 15.7
> vp8_put_epel8_h4v4_c: 73.0
> vp8_put_epel8_h4v4_rvv_i32: 22.2
> vp8_put_epel8_h4v6_c: 90.5
> vp8_put_epel8_h4v6_rvv_i32: 26.7
> vp8_put_epel8_h6v4_c: 85.0
> vp8_put_epel8_h6v4_rvv_i32: 27.2
> vp8_put_epel8_h6v6_c: 104.7
> vp8_put_epel8_h6v6_rvv_i32: 29.5
> vp8_put_epel16_h4v4_c: 145.5
> vp8_put_epel16_h4v4_rvv_i32: 26.5
> vp8_put_epel16_h4v6_c: 190.7
> vp8_put_epel16_h4v6_rvv_i32: 47.5
> vp8_put_epel16_h6v4_c: 173.7
> vp8_put_epel16_h6v4_rvv_i32: 33.2
> vp8_put_epel16_h6v6_c: 222.2
> vp8_put_epel16_h6v6_rvv_i32: 35.5
> ---
>  libavcodec/riscv/vp8dsp_init.c |  13 
>  libavcodec/riscv/vp8dsp_rvv.S  | 123 +++--
>  2 files changed, 115 insertions(+), 21 deletions(-)
>
> diff --git a/libavcodec/riscv/vp8dsp_init.c
> b/libavcodec/riscv/vp8dsp_init.c
> index 31e8227fa4..86927907e0 100644
> --- a/libavcodec/riscv/vp8dsp_init.c
> +++ b/libavcodec/riscv/vp8dsp_init.c
> @@ -97,6 +97,19 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
>  c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
>  c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
>  c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
> +
> +c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_rvv;
> +c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv;
> +c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv;
> +c->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_rvv;
> +c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv;
> +c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv;
> +c->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_rvv;
> +c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv;
> +c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv;
> +c->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_rvv;
> +c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv;
> +c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv;
>  }
>  #endif
>  #endif
> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> index 0ba9fa443d..c79a8afacf 100644
> --- a/libavcodec/riscv/vp8dsp_rvv.S
> +++ b/libavcodec/riscv/vp8dsp_rvv.S
> @@ -161,26 +161,26 @@ const subpel_filters
>  .byte 0,  -1,  12, 123,  -6, 0
>  endconst
>
> -.macro epel_filter size type
> -lla t2, subpel_filters
> +.macro epel_filter size type regtype
> +lla \regtype\()2, subpel_filters
>  .ifc \type,v
> -addit0, a6, -1
> +addi\regtype\()0, a6, -1
>  .else
> -addit0, a5, -1
> +addi\regtype\()0, a5, -1
>  .endif
> -li  t1, 6
> -mul t0, t0, t1
> -add t0, t0, t2
> +li  \regtype\()1, 6
> +mul \regtype\()0, \regtype\()0, \regtype\()1
> +add \regtype\()0, \regtype\()0, \regtype\()2
>  .irp n,1,2,3,4
> -lb  t\n, \n(t0)
> +lb  \regtype\n, \n(\regtype\()0)
>  .endr
>  .ifc \size,6
> -lb  t5, 5(t0)
> -lb  t0, (t0)
> +lb  \regtype\()5, 5(\regtype\()0)
> +lb  \regtype\()0, (\regtype\()0)
>  .endif
>  .endm
>
> -.macro epel_load dst len size type
> +.macro epel_load dst len size type from_mem regtype
>  .ifc \type,v
>  mv  a5, a3
>  .else
> @@ -189,24 +189,35 @@ endconst
>  sub t6, a2, a5
>  add a7, a2, a5
>
> +.if \from_mem
>  vle8.v  v24, (a2)
>  vle8.v  v22, (t6)
>  vle8.v  v26, (a7)
>  add a7, a7, a5
>  vle8.v  v28, (a7)
> -vwmulu.vx   v16, v24, t2
> -vwmulu.vx   v20, v26, t3
> +vwmulu.vx   v16, v24, \regtype\()2
> +vwmulu.vx   v20, v26, \regtype\()3
>  .ifc \size,6
>  sub t6, t6, a5
>  add a7, a7, a5
>  vle8.v  v24, (t6)
>  vle8.v  v26, (a7)
> -vwmaccu.vx  v16, t0, v24
> -vwmaccu.vx  v16, t5, v26
> +vwmaccu.vx  v16, \regtype\()0, v24
> +vwmaccu.vx  v16, \regtype\()5, v26
> +.endif
> +vwmaccsu.vx v16, \regtype\()1, v22
> +vwmaccsu.vx v16, \regtype\()4, v28
> +.else
> +vwmulu.vx   v16, v4, \regtype\()2
> +vwmulu.vx   v20, v6, \regtyp

Re: [FFmpeg-devel] [PATCH v3 6/9] lavc/vp9dsp: R-V V mc bilin h v

2024-05-18 Thread flow gg

fixed in v4

Rémi Denis-Courmont  于2024年5月18日周六 23:56写道：

> Le maanantaina 13. toukokuuta 2024, 19.59.23 EEST u...@foxmail.com a
> écrit :
> > From: sunyuechi 
> >
> > C908:
> > vp9_avg_bilin_4h_8bpp_c: 5.2
> > vp9_avg_bilin_4h_8bpp_rvv_i64: 2.2
> > vp9_avg_bilin_4v_8bpp_c: 5.5
> > vp9_avg_bilin_4v_8bpp_rvv_i64: 2.2
> > vp9_avg_bilin_8h_8bpp_c: 20.0
> > vp9_avg_bilin_8h_8bpp_rvv_i64: 4.5
> > vp9_avg_bilin_8v_8bpp_c: 21.0
> > vp9_avg_bilin_8v_8bpp_rvv_i64: 4.2
> > vp9_avg_bilin_16h_8bpp_c: 78.2
> > vp9_avg_bilin_16h_8bpp_rvv_i64: 9.0
> > vp9_avg_bilin_16v_8bpp_c: 82.0
> > vp9_avg_bilin_16v_8bpp_rvv_i64: 9.0
> > vp9_avg_bilin_32h_8bpp_c: 325.5
> > vp9_avg_bilin_32h_8bpp_rvv_i64: 26.2
> > vp9_avg_bilin_32v_8bpp_c: 326.2
> > vp9_avg_bilin_32v_8bpp_rvv_i64: 26.2
> > vp9_avg_bilin_64h_8bpp_c: 1265.7
> > vp9_avg_bilin_64h_8bpp_rvv_i64: 91.5
> > vp9_avg_bilin_64v_8bpp_c: 1317.0
> > vp9_avg_bilin_64v_8bpp_rvv_i64: 91.2
> > vp9_put_bilin_4h_8bpp_c: 4.5
> > vp9_put_bilin_4h_8bpp_rvv_i64: 1.7
> > vp9_put_bilin_4v_8bpp_c: 4.7
> > vp9_put_bilin_4v_8bpp_rvv_i64: 1.7
> > vp9_put_bilin_8h_8bpp_c: 17.0
> > vp9_put_bilin_8h_8bpp_rvv_i64: 3.5
> > vp9_put_bilin_8v_8bpp_c: 18.0
> > vp9_put_bilin_8v_8bpp_rvv_i64: 3.5
> > vp9_put_bilin_16h_8bpp_c: 65.2
> > vp9_put_bilin_16h_8bpp_rvv_i64: 7.5
> > vp9_put_bilin_16v_8bpp_c: 85.7
> > vp9_put_bilin_16v_8bpp_rvv_i64: 7.5
> > vp9_put_bilin_32h_8bpp_c: 257.5
> > vp9_put_bilin_32h_8bpp_rvv_i64: 23.5
> > vp9_put_bilin_32v_8bpp_c: 274.5
> > vp9_put_bilin_32v_8bpp_rvv_i64: 23.5
> > vp9_put_bilin_64h_8bpp_c: 1040.5
> > vp9_put_bilin_64h_8bpp_rvv_i64: 82.5
> > vp9_put_bilin_64v_8bpp_c: 1108.7
> > vp9_put_bilin_64v_8bpp_rvv_i64: 82.2
> > ---
> >  libavcodec/riscv/vp9_mc_rvv.S  | 43 ++
> >  libavcodec/riscv/vp9dsp_init.c | 21 +
> >  2 files changed, 64 insertions(+)
> >
> > diff --git a/libavcodec/riscv/vp9_mc_rvv.S
> b/libavcodec/riscv/vp9_mc_rvv.S
> > index 5d917e7b98..986cc3760d 100644
> > --- a/libavcodec/riscv/vp9_mc_rvv.S
> > +++ b/libavcodec/riscv/vp9_mc_rvv.S
> > @@ -53,6 +53,49 @@ func ff_avg\len\()_rvv, zve32x
> >  endfunc
> >  .endm
> >
> > +.macro bilin_load dst len op type mn
> > +.ifc \type,v
> > +add t5, a2, a3
> > +.elseif \type == h
> > +addit5, a2, 1
> > +.endif
> > +vle8.v  v8, (a2)
> > +vle8.v  v0, (t5)
> > +vwmulu.vx   v16, v0, \mn
> > +vwmaccsu.vx v16, t1, v8
> > +vwadd.wxv16, v16, t4
> > +vnsra.wiv16, v16, 4
> > +vadd.vv \dst, v16, v8
> > +.ifc \op,avg
> > +vle8.v  v16, (a0)
> > +vaaddu.vv   \dst, \dst, v16
> > +.endif
> > +.endm
> > +
> > +.macro bilin_h_v len op type mn
> > +func ff_\op\()_bilin_\len\()\type\()_rvv, zve32x
> > +.ifc \op,avg
> > +csrwi   vxrm, 0
> > +.endif
> > +vsetvlstatic8   \len t0 64
> > +li  t4, 8
> > +neg t1, \mn
> > +1:
> > +addia4, a4, -1
> > +bilin_load  v0, \len, \op, \type, \mn
> > +vse8.v  v0, (a0)
> > +add a2, a2, a3
> > +add a0, a0, a1
> > +bneza4, 1b
> > +
> > +ret
> > +endfunc
> > +.endm
> > +
> >  .irp len 64, 32, 16, 8, 4
>
> Missing comma after len
>
> >  copy_avg \len
> > +.irp op put avg
> > +bilin_h_v \len \op h a5
> > +bilin_h_v \len \op v a6
> > +.endr
> >  .endr
> > diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> > index 1922484a1d..ec6db51774 100644
> > --- a/libavcodec/riscv/vp9dsp_init.c
> > +++ b/libavcodec/riscv/vp9dsp_init.c
> > @@ -63,6 +63,27 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> > *dsp, int bpp) init_fpel(3, 8);
> >  init_fpel(4, 4);
> >
> > +dsp->mc[0][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_64v_rvv;
> > +dsp->mc[0][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_64h_rvv;
> > +dsp->mc[0][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_64v_rvv;
> > +dsp->mc[0][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_64h_rvv;
> > +dsp->mc[1][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_32v_rvv;
> > +dsp->mc[1][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_32h_rvv;
> > +dsp->mc[1][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_32v_rvv;
> > +dsp->mc[1][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_32h_rvv;
> > +dsp->mc[2][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_16v_rvv;
> > +dsp->mc[2][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_16h_rvv;
> > +dsp->mc[2][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_16v_rvv;
> > +dsp->mc[2][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_16h_rvv;
> > +dsp->mc[3][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_8v_rvv;
> > +dsp->mc[3][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_8h_rvv;
> > +dsp->mc[3][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_8v_rvv;
> > +dsp->mc[3][FILTER_BILINEAR ]

Re: [FFmpeg-devel] [PATCH v4 1/5] lavc/vp9dsp: R-V V mc avg

2024-05-18 Thread flow gg

Fixed the `.irp` and missing-comma issues, as well as the `.ifc` issue (the
same changes that were made earlier for vp8).

 于2024年5月19日周日 02:16写道：

> From: sunyuechi 
>
> C908:
> vp9_avg4_8bpp_c: 1.2
> vp9_avg4_8bpp_rvv_i64: 1.0
> vp9_avg8_8bpp_c: 3.7
> vp9_avg8_8bpp_rvv_i64: 1.5
> vp9_avg16_8bpp_c: 14.7
> vp9_avg16_8bpp_rvv_i64: 3.5
> vp9_avg32_8bpp_c: 57.7
> vp9_avg32_8bpp_rvv_i64: 10.0
> vp9_avg64_8bpp_c: 229.0
> vp9_avg64_8bpp_rvv_i64: 31.7
> ---
>  libavcodec/riscv/Makefile  |  3 +-
>  libavcodec/riscv/vp9_mc_rvv.S  | 58 ++
>  libavcodec/riscv/vp9dsp_init.c | 18 +++
>  3 files changed, 78 insertions(+), 1 deletion(-)
>  create mode 100644 libavcodec/riscv/vp9_mc_rvv.S
>
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 27b268ae39..4739d83522 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -65,6 +65,7 @@ RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
>  OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
>  RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
>   riscv/vp9_mc_rvi.o
> -RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
> +RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o \
> +  riscv/vp9_mc_rvv.o
>  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
>  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> new file mode 100644
> index 00..7811cd9928
> --- /dev/null
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -0,0 +1,58 @@
> +/*
> + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +.macro vsetvlstatic8 len an maxlen mn=m4
> +.if \len == 4
> +vsetivlizero, \len, e8, mf4, ta, ma
> +.elseif \len == 8
> +vsetivlizero, \len, e8, mf2, ta, ma
> +.elseif \len == 16
> +vsetivlizero, \len, e8, m1, ta, ma
> +.elseif \len == 32
> +li  \an, \len
> +vsetvli zero, \an, e8, m2, ta, ma
> +.elseif \len == 64
> +li  \an, \maxlen
> +vsetvli zero, \an, e8, \mn, ta, ma
> +.endif
> +.endm
> +
> +.macro copy_avg len
> +func ff_avg\len\()_rvv, zve32x
> +csrwi   vxrm, 0
> +vsetvlstatic8   \len t0 64
> +1:
> +vle8.v  v8, (a2)
> +vle8.v  v16, (a0)
> +vaaddu.vv   v8, v8, v16
> +addia4, a4, -1
> +vse8.v  v8, (a0)
> +add a2, a2, a3
> +add a0, a0, a1
> +bneza4, 1b
> +ret
> +endfunc
> +.endm
> +
> +.irp len, 64, 32, 16, 8, 4
> +copy_avg \len
> +.endr
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index ab99294d44..6bfe23563a 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -48,6 +48,24 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp)
>  }
>  # endif
>
> +#if HAVE_RVV
> +if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
> ff_rv_vlen_least(128)) {
> +
> +#define init_fpel(idx1, sz)   \
> +dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_avg##sz##_rvv;  \
> +dsp->mc[idx1][FILTER_8TAP_REGULAR][1][0][0] = ff_avg##sz##_rvv;  \
> +dsp->mc[idx1][FILTER_8TAP_SHARP  ][1][0][0] = ff_avg##sz##_rvv;  \
> +dsp->mc[idx1][FILTER_BILINEAR][1][0][0] = ff_avg##sz##_rvv
> +
> +init_fpel(0, 64);
> +init_fpel(1, 32);
> +init_fpel(2, 16);
> +init_fpel(3, 8);
> +init_fpel(4, 4);
> +
> +#undef init_fpel
> +}
> +#endif
>  #endif
>  }
>
> --
> 2.45.1
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/

Re: [FFmpeg-devel] [PATCH v3 5/9] lavc/vp9dsp: R-V V mc avg

2024-05-17 Thread flow gg

yeah, updated it in the reply

Rémi Denis-Courmont  于2024年5月17日周五 23:11写道：

> Le maanantaina 13. toukokuuta 2024, 19.59.22 EEST u...@foxmail.com a
> écrit :
> > From: sunyuechi 
> >
> > C908:
> > vp9_avg4_8bpp_c: 1.2
> > vp9_avg4_8bpp_rvv_i64: 1.0
> > vp9_avg8_8bpp_c: 3.7
> > vp9_avg8_8bpp_rvv_i64: 1.5
> > vp9_avg16_8bpp_c: 14.7
> > vp9_avg16_8bpp_rvv_i64: 3.5
> > vp9_avg32_8bpp_c: 57.7
> > vp9_avg32_8bpp_rvv_i64: 10.0
> > vp9_avg64_8bpp_c: 229.0
> > vp9_avg64_8bpp_rvv_i64: 31.7
> > ---
> >  libavcodec/riscv/Makefile  |  3 +-
> >  libavcodec/riscv/vp9_mc_rvv.S  | 58 ++
> >  libavcodec/riscv/vp9dsp_init.c | 18 +++
> >  3 files changed, 78 insertions(+), 1 deletion(-)
> >  create mode 100644 libavcodec/riscv/vp9_mc_rvv.S
> >
> > diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> > index 0cd900104f..1183357b37 100644
> > --- a/libavcodec/riscv/Makefile
> > +++ b/libavcodec/riscv/Makefile
> > @@ -64,6 +64,7 @@ RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
> >  OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
> >  RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
> >   riscv/vp9_mc_rvi.o
> > -RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
> > +RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o \
> > +  riscv/vp9_mc_rvv.o
> >  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
> >  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> > diff --git a/libavcodec/riscv/vp9_mc_rvv.S
> b/libavcodec/riscv/vp9_mc_rvv.S
> > new file mode 100644
> > index 00..5d917e7b98
> > --- /dev/null
> > +++ b/libavcodec/riscv/vp9_mc_rvv.S
> > @@ -0,0 +1,58 @@
> > +/*
> > + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> > (ISCAS). + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA + */
> > +
> > +#include "libavutil/riscv/asm.S"
> > +
> > +.macro vsetvlstatic8 len an maxlen mn=m4
> > +.if \len == 4
> > +vsetivlizero, \len, e8, mf4, ta, ma
> > +.elseif \len == 8
> > +vsetivlizero, \len, e8, mf2, ta, ma
> > +.elseif \len == 16
> > +vsetivlizero, \len, e8, m1, ta, ma
> > +.elseif \len == 32
> > +li  \an, \len
> > +vsetvli zero, \an, e8, m2, ta, ma
> > +.elseif \len == 64
> > +li  \an, \maxlen
> > +vsetvli zero, \an, e8, \mn, ta, ma
> > +.endif
> > +.endm
> > +
> > +.macro copy_avg len
> > +func ff_avg\len\()_rvv, zve32x
> > +csrwi   vxrm, 0
> > +vsetvlstatic8   \len t0 64
> > +1:
> > +addia4, a4, -1
> > +vle8.v  v8, (a2)
> > +vle8.v  v16, (a0)
> > +vaaddu.vv   v8, v8, v16
> > +vse8.v  v8, (a0)
> > +add a2, a2, a3
> > +add a0, a0, a1
> > +bneza4, 1b
> > +ret
>
> Doesn't this get slightly faster by interleaving scalar and vector
> instructions?
>
> > +endfunc
> > +.endm
> > +
> > +.irp len 64, 32, 16, 8, 4
> > +copy_avg \len
> > +.endr
> > diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> > index 184fadbaf7..1922484a1d 100644
> > --- a/libavcodec/riscv/vp9dsp_init.c
> > +++ b/libavcodec/riscv/vp9dsp_init.c
> > @@ -48,6 +48,24 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> > *dsp, int bpp) }
> >  # endif
> >
> > +#if HAVE_RVV
> > +if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
> ff_rv_vlen_least(128))
> > { +
> > +#define init_fpel(idx1, sz)   \
> > +dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_avg##sz##_rvv;  \
> > +dsp->mc[idx1][FILTER_8TAP_REGULAR][1][0][0] = ff_avg##sz##_rvv;  \
> > +dsp->mc[idx1][FILTER_8TAP_SHARP  ][1][0][0] = ff_avg##sz##_rvv;  \
> > +dsp->mc[idx1][FILTER_BILINEAR][1][0][0] = ff_avg##sz##_rvv
> > +
> > +init_fpel(0, 64);
> > +init_fpel(1, 32);
> > +init_fpel(2, 16);
> > +init_fpel(3, 8);
> > +init_fpel(4, 4);
> > +
> > +#undef init_fpel
> > +}
> > +#endif
> >  #endif
> >  }
>
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>

Re: [FFmpeg-devel] [PATCHv2 2/2] lavc/startcode: add R-V V startcode_find_candidate

2024-05-15 Thread flow gg

Is the test result missing here?

Rémi Denis-Courmont  于2024年5月16日周四 01:11写道：

> ---
>  libavcodec/riscv/Makefile|  1 +
>  libavcodec/riscv/h264dsp_init.c  |  5 
>  libavcodec/riscv/startcode_rvv.S | 44 
>  libavcodec/riscv/vc1dsp_init.c   | 16 +++-
>  4 files changed, 60 insertions(+), 6 deletions(-)
>  create mode 100644 libavcodec/riscv/startcode_rvv.S
>
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index e87a2a9f9f..42665165f0 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -53,6 +53,7 @@ RVV-OBJS-$(CONFIG_RV34DSP) += riscv/rv34dsp_rvv.o
>  OBJS-$(CONFIG_RV40_DECODER) += riscv/rv40dsp_init.o
>  RVV-OBJS-$(CONFIG_RV40_DECODER) += riscv/rv40dsp_rvv.o
>  RV-OBJS-$(CONFIG_STARTCODE) += riscv/startcode_rvb.o
> +RVV-OBJS-$(CONFIG_STARTCODE) += riscv/startcode_rvv.o
>  OBJS-$(CONFIG_SVQ1_ENCODER) += riscv/svqenc_init.o
>  RVV-OBJS-$(CONFIG_SVQ1_ENCODER) += riscv/svqenc_rvv.o
>  OBJS-$(CONFIG_TAK_DECODER) += riscv/takdsp_init.o
> diff --git a/libavcodec/riscv/h264dsp_init.c
> b/libavcodec/riscv/h264dsp_init.c
> index 60c84734cd..dbbf3db400 100644
> --- a/libavcodec/riscv/h264dsp_init.c
> +++ b/libavcodec/riscv/h264dsp_init.c
> @@ -27,6 +27,7 @@
>  #include "libavcodec/h264dsp.h"
>
>  extern int ff_startcode_find_candidate_rvb(const uint8_t *, int);
> +extern int ff_startcode_find_candidate_rvv(const uint8_t *, int);
>
>  av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int
> bit_depth,
> const int chroma_format_idc)
> @@ -36,5 +37,9 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp,
> const int bit_depth,
>
>  if (flags & AV_CPU_FLAG_RVB_BASIC)
>  dsp->startcode_find_candidate = ff_startcode_find_candidate_rvb;
> +# if HAVE_RVV
> +if (flags & AV_CPU_FLAG_RVV_I32)
> +dsp->startcode_find_candidate = ff_startcode_find_candidate_rvv;
> +# endif
>  #endif
>  }
> diff --git a/libavcodec/riscv/startcode_rvv.S
> b/libavcodec/riscv/startcode_rvv.S
> new file mode 100644
> index 00..7c43b1d7f3
> --- /dev/null
> +++ b/libavcodec/riscv/startcode_rvv.S
> @@ -0,0 +1,44 @@
> +/*
> + * Copyright © 2024 Rémi Denis-Courmont.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions are
> met:
> + *
> + * 1. Redistributions of source code must retain the above copyright
> notice,
> + *this list of conditions and the following disclaimer.
> + *
> + * 2. Redistributions in binary form must reproduce the above copyright
> notice,
> + *this list of conditions and the following disclaimer in the
> documentation
> + *and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> "AS IS"
> + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
> THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
> PURPOSE
> + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
> BE
> + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
> + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
> + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
> BUSINESS
> + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
> + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
> + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
> THE
> + * POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +func ff_startcode_find_candidate_rvv, zve32x
> +mv   t0, a0
> +1:
> +vsetvli  t1, a1, e8, m8, ta, ma
> +vle8.v   v8, (t0)
> +sub  a1, a1, t1
> +vmseq.vi v0, v8, 0
> +vfirst.m t2, v0
> +bgez t2, 2f
> +add  t0, t0, t1
> +bnez a1, 1b
> +2:
> +add  t0, t0, t2
> +sub  a0, t0, a0
> +ret
> +endfunc
> diff --git a/libavcodec/riscv/vc1dsp_init.c
> b/libavcodec/riscv/vc1dsp_init.c
> index 844f39b891..71237ae723 100644
> --- a/libavcodec/riscv/vc1dsp_init.c
> +++ b/libavcodec/riscv/vc1dsp_init.c
> @@ -30,6 +30,7 @@ void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest,
> ptrdiff_t stride, int16_t *block
>  void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t
> *block);
>  void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t
> *block);
>  int ff_startcode_find_candidate_rvb(const uint8_t *, int);
> +int ff_startcode_find_candidate_rvv(const uint8_t *, int);
>
>  av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
>  {
> @@ -39,13 +40,16 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
>  if (flags & AV_CPU_FLAG_RVB_BASIC)
>  dsp->startcode_find_candidate = ff_startcode_find_candidate_rvb;
>  # if HAVE_RVV
> -if (flags & AV_C

Re: [FFmpeg-devel] [PATCH 4/9] lavc/vp9dsp: R-V V ipred tm

2024-05-14 Thread flow gg

Updated to clean up the code.

 于2024年5月15日周三 11:56写道：

> From: sunyuechi 
>
> C908:
> vp9_tm_4x4_8bpp_c: 116.5
> vp9_tm_4x4_8bpp_rvv_i32: 43.5
> vp9_tm_8x8_8bpp_c: 416.2
> vp9_tm_8x8_8bpp_rvv_i32: 86.0
> vp9_tm_16x16_8bpp_c: 1665.5
> vp9_tm_16x16_8bpp_rvv_i32: 187.2
> vp9_tm_32x32_8bpp_c: 6974.2
> vp9_tm_32x32_8bpp_rvv_i32: 625.7
> ---
>  libavcodec/riscv/vp9_intra_rvv.S | 118 +++
>  libavcodec/riscv/vp9dsp.h|   8 +++
>  libavcodec/riscv/vp9dsp_init.c   |   4 ++
>  3 files changed, 130 insertions(+)
>
> diff --git a/libavcodec/riscv/vp9_intra_rvv.S
> b/libavcodec/riscv/vp9_intra_rvv.S
> index ca156d65cd..280c497687 100644
> --- a/libavcodec/riscv/vp9_intra_rvv.S
> +++ b/libavcodec/riscv/vp9_intra_rvv.S
> @@ -173,3 +173,121 @@ func ff_h_8x8_rvv, zve32x
>
>  ret
>  endfunc
> +
> +.macro tm_sum4 dst1, dst2, dst3, dst4, top, n1
> +lbu  t1, \n1(a2)
> +lbu  t2, (\n1-1)(a2)
> +lbu  t3, (\n1-2)(a2)
> +lbu  t4, (\n1-3)(a2)
> +sub  t1, t1, a4
> +sub  t2, t2, a4
> +sub  t3, t3, a4
> +sub  t4, t4, a4
> +vadd.vx  \dst1, \top, t1
> +vadd.vx  \dst2, \top, t2
> +vadd.vx  \dst3, \top, t3
> +vadd.vx  \dst4, \top, t4
> +.endm
> +
> +func ff_tm_32x32_rvv, zve32x
> +lbu  a4, -1(a3)
> +li   t5, 32
> +
> +.irp offset 31, 23, 15, 7
> +vsetvli  zero, t5, e16, m4, ta, ma
> +vle8.v   v8, (a3)
> +vzext.vf2v28, v8
> +
> +tm_sum4  v0, v4, v8, v12, v28, \offset
> +tm_sum4  v16, v20, v24, v28, v28, (\offset-4)
> +
> +.irp n 0, 4, 8, 12, 16, 20, 24, 28
> +vmax.vx  v\n, v\n, zero
> +.endr
> +
> +vsetvli  zero, zero, e8, m2, ta, ma
> +.irp n 0, 4, 8, 12, 16, 20, 24, 28
> +vnclipu.wi   v\n, v\n, 0
> +vse8.v   v\n, (a0)
> +add  a0, a0, a1
> +.endr
> +.endr
> +
> +ret
> +endfunc
> +
> +func ff_tm_16x16_rvv, zve32x
> +vsetivli  zero, 16, e16, m2, ta, ma
> +vle8.vv8, (a3)
> +vzext.vf2 v30, v8
> +lbu   a4, -1(a3)
> +
> +tm_sum4   v0, v2, v4, v6, v30, 15
> +tm_sum4   v8, v10, v12, v14, v30, 11
> +tm_sum4   v16, v18, v20, v22, v30, 7
> +tm_sum4   v24, v26, v28, v30, v30, 3
> +
> +.irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
> +vmax.vx  v\n, v\n, zero
> +.endr
> +
> +vsetvli  zero, zero, e8, m1, ta, ma
> +.irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28
> +vnclipu.wi   v\n, v\n, 0
> +vse8.v   v\n, (a0)
> +add  a0, a0, a1
> +.endr
> +vnclipu.wi   v30, v30, 0
> +vse8.v   v30, (a0)
> +
> +ret
> +endfunc
> +
> +func ff_tm_8x8_rvv, zve32x
> +vsetivli zero, 8, e16, m1, ta, ma
> +vle8.v   v8, (a3)
> +vzext.vf2v28, v8
> +lbu  a4, -1(a3)
> +
> +tm_sum4  v16, v17, v18, v19, v28, 7
> +tm_sum4  v20, v21, v22, v23, v28, 3
> +
> +.irp n 16, 17, 18, 19, 20, 21, 22, 23
> +vmax.vx  v\n, v\n, zero
> +.endr
> +
> +vsetvli  zero, zero, e8, mf2, ta, ma
> +.irp n 16, 17, 18, 19, 20, 21, 22
> +vnclipu.wi   v\n, v\n, 0
> +vse8.v   v\n, (a0)
> +add  a0, a0, a1
> +.endr
> +vnclipu.wi   v24, v23, 0
> +vse8.v   v24, (a0)
> +
> +ret
> +endfunc
> +
> +func ff_tm_4x4_rvv, zve32x
> +vsetivli zero, 4, e16, mf2, ta, ma
> +vle8.v   v8, (a3)
> +vzext.vf2v28, v8
> +lbu  a4, -1(a3)
> +
> +tm_sum4  v16, v17, v18, v19, v28, 3
> +
> +.irp n 16, 17, 18, 19
> +vmax.vx  v\n, v\n, zero
> +.endr
> +
> +vsetvli  zero, zero, e8, mf4, ta, ma
> +.irp n 16, 17, 18
> +vnclipu.wi   v\n, v\n, 0
> +vse8.v   v\n, (a0)
> +add  a0, a0, a1
> +.endr
> +vnclipu.wi   v24, v19, 0
> +vse8.v   v24, (a0)
> +
> +ret
> +endfunc
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 0ad961c7e0..79330b4968 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -72,6 +72,14 @@ void ff_h_16x16_rvv(uint8_t *dst, ptrdiff_t stride,
> const uint8_t *l,
>  const uint8_t *a);
>  void ff_h_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
>const uint8_t *a);
> +void ff_tm_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> + const uint8_t *a);
> +void ff_tm_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> + const uint8_t *a);
> +void

Re: [FFmpeg-devel] [PATCH v3 4/9] lavc/vp9dsp: R-V V ipred tm

2024-05-14 Thread flow gg

> The macro saves some copycat code, but it seems to prevent good
> scheduling.
> Consuming t3 right after loading it is not ideal.

> OTOH, it seems that you could just write the tm_sum32 with a single
> parameter, as the other ones are just relative by constant +/-1.

Okay, updated it in the reply

Rémi Denis-Courmont  于2024年5月15日周三 02:08写道：

> Le tiistaina 14. toukokuuta 2024, 20.57.17 EEST flow gg a écrit :
> > Why is it unnecessary to reset the vector configuration every time? I
> think
> > it is necessary to reset e16/e8 each time.
>
> I misread the placement of .endm
>
> OTOH, it seems that you could just write the tm_sum32 with a single
> parameter,
> as the other ones are just relative by constant +/-1.
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v3 4/9] lavc/vp9dsp: R-V V ipred tm

2024-05-14 Thread flow gg

Why is it unnecessary to reset the vector configuration every time? I think
it is necessary to reset e16/e8 each time.

Rémi Denis-Courmont  于2024年5月15日周三 01:46写道：

> Le maanantaina 13. toukokuuta 2024, 19.59.21 EEST u...@foxmail.com a
> écrit :
> > From: sunyuechi 
> >
> > C908:
> > vp9_tm_4x4_8bpp_c: 116.5
> > vp9_tm_4x4_8bpp_rvv_i32: 43.5
> > vp9_tm_8x8_8bpp_c: 416.2
> > vp9_tm_8x8_8bpp_rvv_i32: 86.0
> > vp9_tm_16x16_8bpp_c: 1665.5
> > vp9_tm_16x16_8bpp_rvv_i32: 187.2
> > vp9_tm_32x32_8bpp_c: 6974.2
> > vp9_tm_32x32_8bpp_rvv_i32: 625.7
> > ---
> >  libavcodec/riscv/vp9_intra_rvv.S | 141 +++
> >  libavcodec/riscv/vp9dsp.h|   8 ++
> >  libavcodec/riscv/vp9dsp_init.c   |   4 +
> >  3 files changed, 153 insertions(+)
> >
> > diff --git a/libavcodec/riscv/vp9_intra_rvv.S
> > b/libavcodec/riscv/vp9_intra_rvv.S index ca156d65cd..7e1046bc13 100644
> > --- a/libavcodec/riscv/vp9_intra_rvv.S
> > +++ b/libavcodec/riscv/vp9_intra_rvv.S
> > @@ -173,3 +173,144 @@ func ff_h_8x8_rvv, zve32x
> >
> >  ret
> >  endfunc
> > +
> > +.macro tm_sum dst, top, offset
> > +lbu  t3, \offset(a2)
> > +sub  t3, t3, a4
> > +vadd.vx  \dst, \top, t3
>
> The macro saves some copycat code, but it seems to prevent good
> scheduling.
> Consuming t3 right after loading it is not ideal.
>
> > +.endm
> > +
> > +func ff_tm_32x32_rvv, zve32x
> > +lbu  a4, -1(a3)
> > +li   t5, 32
> > +
> > +.macro tm_sum32 n1,n2,n3,n4,n5,n6,n7,n8
> > +vsetvli  zero, t5, e16, m4, ta, ma
>
> AFAICT, you do not need to reset the vector configuration every time.
>
> > +vle8.v   v8, (a3)
> > +vzext.vf2v28, v8
> > +
> > +tm_sum   v0, v28, \n1
> > +tm_sum   v4, v28, \n2
> > +tm_sum   v8, v28, \n3
> > +tm_sum   v12, v28, \n4
> > +tm_sum   v16, v28, \n5
> > +tm_sum   v20, v28, \n6
> > +tm_sum   v24, v28, \n7
> > +tm_sum   v28, v28, \n8
> > +
> > +.irp n 0, 4, 8, 12, 16, 20, 24, 28
> > +vmax.vx  v\n, v\n, zero
> > +.endr
> > +
> > +vsetvli  zero, zero, e8, m2, ta, ma
> > +.irp n 0, 4, 8, 12, 16, 20, 24, 28
> > +vnclipu.wi   v\n, v\n, 0
> > +vse8.v   v\n, (a0)
> > +add  a0, a0, a1
> > +.endr
> > +.endm
> > +
> > +tm_sum32 31, 30, 29, 28, 27, 26, 25, 24
> > +tm_sum32 23, 22, 21, 20, 19, 18, 17, 16
> > +tm_sum32 15, 14, 13, 12, 11, 10, 9, 8
> > +tm_sum32 7, 6, 5, 4, 3, 2, 1, 0
> > +
> > +ret
> > +endfunc
> > +
> > +func ff_tm_16x16_rvv, zve32x
> > +vsetivli  zero, 16, e16, m2, ta, ma
> > +vle8.vv8, (a3)
> > +vzext.vf2 v30, v8
> > +lbu   a4, -1(a3)
> > +
> > +tm_sum   v0, v30, 15
> > +tm_sum   v2, v30, 14
> > +tm_sum   v4, v30, 13
> > +tm_sum   v6, v30, 12
> > +tm_sum   v8, v30, 11
> > +tm_sum   v10, v30, 10
> > +tm_sum   v12, v30, 9
> > +tm_sum   v14, v30, 8
> > +tm_sum   v16, v30, 7
> > +tm_sum   v18, v30, 6
> > +tm_sum   v20, v30, 5
> > +tm_sum   v22, v30, 4
> > +tm_sum   v24, v30, 3
> > +tm_sum   v26, v30, 2
> > +tm_sum   v28, v30, 1
> > +tm_sum   v30, v30, 0
> > +
> > +.irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
> > +vmax.vx  v\n, v\n, zero
> > +.endr
> > +
> > +vsetvli  zero, zero, e8, m1, ta, ma
> > +.irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28
> > +vnclipu.wi   v\n, v\n, 0
> > +vse8.v   v\n, (a0)
> > +add  a0, a0, a1
> > +.endr
> > +vnclipu.wi   v30, v30, 0
> > +vse8.v   v30, (a0)
> > +
> > +ret
> > +endfunc
> > +
> > +func ff_tm_8x8_rvv, zve32x
> > +vsetivli zero, 8, e16, m1, ta, ma
> > +vle8.v   v8, (a3)
> > +vzext.vf2v28, v8
> > +lbu  a4, -1(a3)
> > +
> > +tm_sum   v16, v28, 7
> > +tm_sum   v17, v28, 6
> > +tm_sum   v18, v28, 5
> > +tm_sum   v19, v28, 4
> > +tm_sum   v20, v28, 3
> > +tm_sum   v21, v28, 2
> > +tm_sum   v22, v28, 1
> > +tm_sum   v23, v28, 0
> > +
> > +.irp n 16, 17, 18, 19, 20, 21, 22, 23
> > +vmax.vx  v\n, v\n, zero
> > +.endr
> > +
> > +vsetvli  zero, zero, e8, mf2, ta, ma
> > +.irp n 16, 17, 18, 19, 20, 21, 22
> > +vnclipu.wi   v\n, v\n, 0
> > +vse8.v   v\n, (a0)
> > +add  a0, a0, a1
> > +.endr
> > +vnclipu.wi   v24, v23, 0
> > +vse8.v   v24, (a0)
> > +
> > +ret
> > +endfunc
> > +

Re: [FFmpeg-devel] [PATCH v3 1/9] lavc/vp9dsp: R-V ipred vert

2024-05-14 Thread flow gg

Okay, learned it

Rémi Denis-Courmont  于2024年5月15日周三 01:00写道：

> Le tiistaina 14. toukokuuta 2024, 7.45.29 EEST flow gg a écrit :
> > I am locally using:
> > if (bpp == 8 && (flags & AV_CPU_FLAG_RVI) && (flags &
> > AV_CPU_FLAG_RVB_ADDR)) {
>
> There is no point testing the I flag if you test any other flag. The I
> flag is
> always set (since we don't, and probably never will, support RV32E) and
> only
> exists for the benefit of checkasm.
>
> > this performs better on k230/banana_f3 than C.
>
> It also performs better than C on SiFive U74, even though that design has
> very slow unaligned access (emulated in SBI). Of course, it could
> just be
> that checkasm only tests aligned accesses and unaligned accesses are
> legal,
> hence my earlier question.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v3 2/9] lavc/vp9dsp: R-V mc copy

2024-05-14 Thread flow gg

Using this condition does produce output: `if (bpp == 8 && (flags & AV_CPU_FLAG_RVI)) {`
Did you comment out the MISALIGNED flag check without adding the RVI check,
resulting in no output?

Rémi Denis-Courmont  于2024年5月15日周三 01:02写道：

> Le tiistaina 14. toukokuuta 2024, 7.44.55 EEST flow gg a écrit :
> > I am locally using:
> > if (bpp == 8 && (flags & AV_CPU_FLAG_RVI)) {
> > this performs better on k230/banana_f3 than C.
> > For email, refer to [FFmpeg-devel] [PATCH 2/2] lavc/vp8dsp: restrict RVI
> > optimisations and change it to
> > if (bpp == 8 && (flags & AV_CPU_FLAG_RV_MISALIGNED)) {
> > So no output, but I think the same modification should be made here?
>
> I just can't get any benchmarks out of checkasm. Even if I comment out the
> MISALIGNED flag check, this is not reporting anything. I tested with only
> patch
> 1/9 and 2/9, not the following. I don't know why.
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v3 1/9] lavc/vp9dsp: R-V ipred vert

2024-05-13 Thread flow gg

I am locally using:
if (bpp == 8 && (flags & AV_CPU_FLAG_RVI) && (flags &
AV_CPU_FLAG_RVB_ADDR)) {
this performs better on k230/banana_f3 than C.
For email, refer to [FFmpeg-devel] [PATCH 2/2] lavc/vp8dsp: restrict RVI
optimisations and change it to
if (bpp == 8 && (flags & AV_CPU_FLAG_RV_MISALIGNED) && (flags &
AV_CPU_FLAG_RVB_ADDR)) {
So no output, but I think the same modification should be made here?

Rémi Denis-Courmont  于2024年5月14日周二 03:54写道：

> Le maanantaina 13. toukokuuta 2024, 19.59.18 EEST u...@foxmail.com a
> écrit :
> > From: sunyuechi 
> >
> > C908:
> > vp9_vert_8x8_8bpp_c: 22.0
> > vp9_vert_8x8_8bpp_rvi: 15.7
> > vp9_vert_16x16_8bpp_c: 71.2
> > vp9_vert_16x16_8bpp_rvi: 39.0
> > vp9_vert_32x32_8bpp_c: 300.2
> > vp9_vert_32x32_8bpp_rvi: 135.2
>
> Not sure how you get that. It should be reported under rvb_a (Zba) or
> misaligned, AFAIU.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v3 2/9] lavc/vp9dsp: R-V mc copy

2024-05-13 Thread flow gg

I am locally using:
if (bpp == 8 && (flags & AV_CPU_FLAG_RVI)) {
this performs better on k230/banana_f3 than C.
For email, refer to [FFmpeg-devel] [PATCH 2/2] lavc/vp8dsp: restrict RVI
optimisations and change it to
if (bpp == 8 && (flags & AV_CPU_FLAG_RV_MISALIGNED)) {
So no output, but I think the same modification should be made here?

Rémi Denis-Courmont  于2024年5月14日周二 03:55写道：

> Le maanantaina 13. toukokuuta 2024, 19.59.19 EEST u...@foxmail.com a
> écrit :
> > From: sunyuechi 
> >
> > C908:
> > vp9_put4_8bpp_c: 0.7
> > vp9_put4_8bpp_rvi: 0.5
> > vp9_put8_8bpp_c: 2.5
> > vp9_put8_8bpp_rvi: 0.5
> > vp9_put16_8bpp_c: 16.7
> > vp9_put16_8bpp_rvi: 1.5
> > vp9_put32_8bpp_c: 37.2
> > vp9_put32_8bpp_rvi: 5.7
> > vp9_put64_8bpp_c: 107.5
> > vp9_put64_8bpp_rvi: 21.7
>
> This patch does not produce any (new) results here though?
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v3 1/9] lavc/vp9dsp: R-V ipred vert

2024-05-13 Thread flow gg

just rebase

 于2024年5月14日周二 01:00写道：

> From: sunyuechi 
>
> C908:
> vp9_vert_8x8_8bpp_c: 22.0
> vp9_vert_8x8_8bpp_rvi: 15.7
> vp9_vert_16x16_8bpp_c: 71.2
> vp9_vert_16x16_8bpp_rvi: 39.0
> vp9_vert_32x32_8bpp_c: 300.2
> vp9_vert_32x32_8bpp_rvi: 135.2
> ---
>  libavcodec/riscv/Makefile|  1 +
>  libavcodec/riscv/vp9_intra_rvi.S | 71 
>  libavcodec/riscv/vp9dsp.h|  6 +++
>  libavcodec/riscv/vp9dsp_init.c   | 15 +--
>  4 files changed, 90 insertions(+), 3 deletions(-)
>  create mode 100644 libavcodec/riscv/vp9_intra_rvi.S
>
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 89273b1cad..ccd060c666 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -62,6 +62,7 @@ OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o
>  RV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvi.o
>  RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
>  OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
> +RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o
>  RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
>  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
>  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> diff --git a/libavcodec/riscv/vp9_intra_rvi.S
> b/libavcodec/riscv/vp9_intra_rvi.S
> new file mode 100644
> index 00..16b6bdb25a
> --- /dev/null
> +++ b/libavcodec/riscv/vp9_intra_rvi.S
> @@ -0,0 +1,71 @@
> +/*
> + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +#if __riscv_xlen >= 64
> +func ff_v_32x32_rvi
> +ld   t0, (a3)
> +ld   t1, 8(a3)
> +ld   t2, 16(a3)
> +ld   t3, 24(a3)
> +.rept 16
> +add  a7, a0, a1
> +sd   t0, (a0)
> +sd   t1, 8(a0)
> +sd   t2, 16(a0)
> +sd   t3, 24(a0)
> +sh1add   a0, a1, a0
> +sd   t0, (a7)
> +sd   t1, 8(a7)
> +sd   t2, 16(a7)
> +sd   t3, 24(a7)
> +.endr
> +
> +ret
> +endfunc
> +
> +func ff_v_16x16_rvi
> +ld   t0, (a3)
> +ld   t1, 8(a3)
> +.rept 8
> +add  a7, a0, a1
> +sd   t0, (a0)
> +sd   t1, 8(a0)
> +sh1add   a0, a1, a0
> +sd   t0, (a7)
> +sd   t1, 8(a7)
> +.endr
> +
> +ret
> +endfunc
> +
> +func ff_v_8x8_rvi
> +ld   t0, (a3)
> +.rept 4
> +add  a7, a0, a1
> +sd   t0, (a0)
> +sh1add   a0, a1, a0
> +sd   t0, (a7)
> +.endr
> +
> +ret
> +endfunc
> +#endif
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 25047ed507..f8bc6563a5 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -60,6 +60,12 @@ void ff_dc_129_16x16_rvv(uint8_t *dst, ptrdiff_t
> stride, const uint8_t *l,
>   const uint8_t *a);
>  void ff_dc_129_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> const uint8_t *a);
> +void ff_v_32x32_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> +const uint8_t *a);
> +void ff_v_16x16_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> +const uint8_t *a);
> +void ff_v_8x8_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> +  const uint8_t *a);
>
>  #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
>  \
>  void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index dd418bd5bf..0f64afc6d2 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -24,11 +24,19 @@
>  #include "libavcodec/vp9dsp.h"
>  #include "vp9dsp.h"
>
> -static av_cold void vp9dsp_intrapred_init_rvv(VP9DSPContext *dsp, int bpp)
> +static av_cold void vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int
> bpp)
>  {
> -#if HAVE_RVV

Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels

2024-05-12 Thread flow gg

It seems like it can't... update using AV_CPU_FLAG_RV_MISALIGNED

Rémi Denis-Courmont  于2024年5月12日周日 19:48写道：

> Le perjantaina 10. toukokuuta 2024, 11.21.14 EEST u...@foxmail.com a
> écrit :
> > From: sunyuechi 
> >
> >   C908 X60
> > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c:  14.7 13.2
> > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32  :   2.5  2.2
> > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c:   3.7  3.5
> > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64  :   1.0  1.2
> > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c:   9.0  8.0
> > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvi  :   1.0  1.0
> > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c:   2.5  2.2
> > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvi  :   0.5  0.5
> > ---
> >  libavcodec/riscv/Makefile  |  1 +
> >  libavcodec/riscv/vc1dsp_init.c | 16 +++-
> >  libavcodec/riscv/vc1dsp_rvi.S  | 48 ++
> >  libavcodec/riscv/vc1dsp_rvv.S  | 48 ++
> >  4 files changed, 112 insertions(+), 1 deletion(-)
> >  create mode 100644 libavcodec/riscv/vc1dsp_rvi.S
> >
> > diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> > index 43b5c21cf4..cd5cc21cfd 100644
> > --- a/libavcodec/riscv/Makefile
> > +++ b/libavcodec/riscv/Makefile
> > @@ -59,6 +59,7 @@ RVV-OBJS-$(CONFIG_TAK_DECODER) += riscv/takdsp_rvv.o
> >  OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_init.o
> >  RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_rvv.o
> >  OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o
> > +RV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvi.o
> >  RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o
> >  OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o
> >  RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
> > diff --git a/libavcodec/riscv/vc1dsp_init.c
> b/libavcodec/riscv/vc1dsp_init.c
> > index e47b644f80..555aa5aea7 100644
> > --- a/libavcodec/riscv/vc1dsp_init.c
> > +++ b/libavcodec/riscv/vc1dsp_init.c
> > @@ -29,19 +29,33 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest,
> > ptrdiff_t stride, int16_t *block void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t
> > *dest, ptrdiff_t stride, int16_t *block); void
> > ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t
> > *block); void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t
> stride,
> > int16_t *block); +void ff_put_pixels16x16_rvi(uint8_t *dst, const uint8_t
> > *src, ptrdiff_t line_size, int rnd); +void ff_put_pixels8x8_rvi(uint8_t
> > *dst, const uint8_t *src, ptrdiff_t line_size, int rnd); +void
> > ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t
> > line_size, int rnd); +void ff_avg_pixels8x8_rvv(uint8_t *dst, const
> uint8_t
> > *src, ptrdiff_t line_size, int rnd);
> >
> >  av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
> >  {
> > -#if HAVE_RVV
> > +#if HAVE_RV
> >  int flags = av_get_cpu_flags();
> >
> > +# if __riscv_xlen >= 64
> > +if (flags & AV_CPU_FLAG_RVI) {
> > +dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvi;
> > +dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvi;
> > +}
> > +# endif
> > +#if HAVE_RVV
> >  if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
> >  dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
> >  dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
> > +dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
> >  if (flags & AV_CPU_FLAG_RVV_I64) {
> >  dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
> >  dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
> > +dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
> >  }
> >  }
> >  #endif
> > +#endif
> >  }
> > diff --git a/libavcodec/riscv/vc1dsp_rvi.S
> b/libavcodec/riscv/vc1dsp_rvi.S
> > new file mode 100644
> > index 00..1d5660316f
> > --- /dev/null
> > +++ b/libavcodec/riscv/vc1dsp_rvi.S
> > @@ -0,0 +1,48 @@
> > +/*
> > + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> > (ISCAS). + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA

Re: [FFmpeg-devel] [PATCH v3 1/9] lavc/vp9dsp: R-V ipred vert

2024-05-12 Thread flow gg

> It should be possible to improve ordering to avoid immediate dependency
> from ADD to SD

Okay, updated it.

Additionally, I improved mc-tap_64 on vlen>=256 and a few other things.

 于2024年5月12日周日 18:04写道：

> From: sunyuechi 
>
> C908:
> vp9_vert_8x8_8bpp_c: 22.0
> vp9_vert_8x8_8bpp_rvi: 15.7
> vp9_vert_16x16_8bpp_c: 71.2
> vp9_vert_16x16_8bpp_rvi: 39.0
> vp9_vert_32x32_8bpp_c: 300.2
> vp9_vert_32x32_8bpp_rvi: 135.2
> ---
>  libavcodec/riscv/Makefile|  1 +
>  libavcodec/riscv/vp9_intra_rvi.S | 71 
>  libavcodec/riscv/vp9dsp.h|  6 +++
>  libavcodec/riscv/vp9dsp_init.c   | 63 
>  4 files changed, 114 insertions(+), 27 deletions(-)
>  create mode 100644 libavcodec/riscv/vp9_intra_rvi.S
>
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 89273b1cad..ccd060c666 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -62,6 +62,7 @@ OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o
>  RV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvi.o
>  RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
>  OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
> +RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o
>  RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
>  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
>  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> diff --git a/libavcodec/riscv/vp9_intra_rvi.S
> b/libavcodec/riscv/vp9_intra_rvi.S
> new file mode 100644
> index 00..16b6bdb25a
> --- /dev/null
> +++ b/libavcodec/riscv/vp9_intra_rvi.S
> @@ -0,0 +1,71 @@
> +/*
> + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +#if __riscv_xlen >= 64
> +func ff_v_32x32_rvi
> +ld   t0, (a3)
> +ld   t1, 8(a3)
> +ld   t2, 16(a3)
> +ld   t3, 24(a3)
> +.rept 16
> +add  a7, a0, a1
> +sd   t0, (a0)
> +sd   t1, 8(a0)
> +sd   t2, 16(a0)
> +sd   t3, 24(a0)
> +sh1add   a0, a1, a0
> +sd   t0, (a7)
> +sd   t1, 8(a7)
> +sd   t2, 16(a7)
> +sd   t3, 24(a7)
> +.endr
> +
> +ret
> +endfunc
> +
> +func ff_v_16x16_rvi
> +ld   t0, (a3)
> +ld   t1, 8(a3)
> +.rept 8
> +add  a7, a0, a1
> +sd   t0, (a0)
> +sd   t1, 8(a0)
> +sh1add   a0, a1, a0
> +sd   t0, (a7)
> +sd   t1, 8(a7)
> +.endr
> +
> +ret
> +endfunc
> +
> +func ff_v_8x8_rvi
> +ld   t0, (a3)
> +.rept 4
> +add  a7, a0, a1
> +sd   t0, (a0)
> +sh1add   a0, a1, a0
> +sd   t0, (a7)
> +.endr
> +
> +ret
> +endfunc
> +#endif
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 25047ed507..f8bc6563a5 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -60,6 +60,12 @@ void ff_dc_129_16x16_rvv(uint8_t *dst, ptrdiff_t
> stride, const uint8_t *l,
>   const uint8_t *a);
>  void ff_dc_129_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> const uint8_t *a);
> +void ff_v_32x32_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> +const uint8_t *a);
> +void ff_v_16x16_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> +const uint8_t *a);
> +void ff_v_8x8_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> +  const uint8_t *a);
>
>  #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
>  \
>  void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index 69ab39004c..e377d377e3 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -24,38 +24,47 @@
>  #include "libavcodec/vp9dsp.h"
>  #include "vp9dsp.h"

Re: [FFmpeg-devel] [PATCH v3 1/9] lavc/vp8dsp: R-V put_vp8_pixels

2024-05-11 Thread flow gg

Wow, got it

Rémi Denis-Courmont  于2024年5月11日周六 22:39写道：

> Le maanantaina 6. toukokuuta 2024, 6.38.01 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
> >
> > C908:
> > vp8_put_pixels4_c: 78.0
> > vp8_put_pixels4_rvi: 33.7
> > vp8_put_pixels8_c: 278.0
> > vp8_put_pixels8_rvi: 55.0
> > vp8_put_pixels16_c: 999.0
> > vp8_put_pixels16_rvi: 86.7
>
> Actually, I do think that the C compiler is doing a better job.
> On SiFive U74, the figures are horrible:
>
> By time:
> vp8_put_pixels4_c: 0.2
> vp8_put_pixels4_rvi:  10.5
> vp8_put_pixels8_c: 1.0
> vp8_put_pixels8_rvi:  30.0
> vp8_put_pixels16_c:2.2
> vp8_put_pixels16_rvi: 60.0
>
> By cycles:
> vp8_put_pixels4_c:   84.0
> vp8_put_pixels4_rvi:   3871.0
> vp8_put_pixels8_c:  331.0
> vp8_put_pixels8_rvi:  11281.2
> vp8_put_pixels16_c: 826.0
> vp8_put_pixels16_rvi: 22502.0
>
> Best guess is that the data is not 64-bit aligned, otherwise I can't
> really
> make sense of those figures.
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels

2024-05-11 Thread flow gg

On banana_f3, further reducing the value of mf resulted in another
performance improvement. I think in the end we might need to use different
functions depending on vlen in init.

Rémi Denis-Courmont  于2024年5月11日周六 18:24写道：

> Le lauantaina 11. toukokuuta 2024, 13.02.02 EEST flow gg a écrit :
> > The test results show that changing mf2 to m1 in ff_avg_pixels8x8_rvv in
> > vc1,
> > or changing mf2/mf4 to m1 in vsetvlstatic8 in vp8,
> > results in a 10-20% performance decrease on both k230 and banana_f3.
>
> The questions remain, how changing from MF2 to MF4 affects performance on
> Zvl256b, and if it does, how to deal with that without breaking support
> for
> Zvl128b.
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v4 6/9] lavc/vp8dsp: R-V V put_epel hv

2024-05-11 Thread flow gg

Okay, updated it in the reply

Rémi Denis-Courmont  于2024年5月10日周五 23:41写道：

> Le tiistaina 7. toukokuuta 2024, 19.54.09 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
> >
> > C908:
> > vp8_put_epel4_h4v4_c: 20.0
> > vp8_put_epel4_h4v4_rvv_i32: 11.0
> > vp8_put_epel4_h4v6_c: 25.2
> > vp8_put_epel4_h4v6_rvv_i32: 13.5
> > vp8_put_epel4_h6v4_c: 22.2
> > vp8_put_epel4_h6v4_rvv_i32: 14.5
> > vp8_put_epel4_h6v6_c: 29.0
> > vp8_put_epel4_h6v6_rvv_i32: 15.7
> > vp8_put_epel8_h4v4_c: 73.0
> > vp8_put_epel8_h4v4_rvv_i32: 22.2
> > vp8_put_epel8_h4v6_c: 90.5
> > vp8_put_epel8_h4v6_rvv_i32: 26.7
> > vp8_put_epel8_h6v4_c: 85.0
> > vp8_put_epel8_h6v4_rvv_i32: 27.2
> > vp8_put_epel8_h6v6_c: 104.7
> > vp8_put_epel8_h6v6_rvv_i32: 29.5
> > vp8_put_epel16_h4v4_c: 145.5
> > vp8_put_epel16_h4v4_rvv_i32: 26.5
> > vp8_put_epel16_h4v6_c: 190.7
> > vp8_put_epel16_h4v6_rvv_i32: 47.5
> > vp8_put_epel16_h6v4_c: 173.7
> > vp8_put_epel16_h6v4_rvv_i32: 33.2
> > vp8_put_epel16_h6v6_c: 222.2
> > vp8_put_epel16_h6v6_rvv_i32: 35.5
> > ---
> >  libavcodec/riscv/vp8dsp_init.c |  13 
> >  libavcodec/riscv/vp8dsp_rvv.S  | 123 +++--
> >  2 files changed, 115 insertions(+), 21 deletions(-)
> >
> > diff --git a/libavcodec/riscv/vp8dsp_init.c
> b/libavcodec/riscv/vp8dsp_init.c
> > index dc3e087f01..463c8fa0a2 100644
> > --- a/libavcodec/riscv/vp8dsp_init.c
> > +++ b/libavcodec/riscv/vp8dsp_init.c
> > @@ -97,6 +97,19 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
> >  c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
> >  c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
> >  c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
> > +
> > +c->put_vp8_epel_pixels_tab[0][2][2] =
> ff_put_vp8_epel16_h6v6_rvv;
> > +c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv;
> > +c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv;
> > +c->put_vp8_epel_pixels_tab[0][2][1] =
> ff_put_vp8_epel16_h4v6_rvv;
> > +c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv;
> > +c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv;
> > +c->put_vp8_epel_pixels_tab[0][1][1] =
> ff_put_vp8_epel16_h4v4_rvv;
> > +c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv;
> > +c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv;
> > +c->put_vp8_epel_pixels_tab[0][1][2] =
> ff_put_vp8_epel16_h6v4_rvv;
> > +c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv;
> > +c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv;
> >  }
> >  #endif
> >  #endif
> > diff --git a/libavcodec/riscv/vp8dsp_rvv.S
> b/libavcodec/riscv/vp8dsp_rvv.S
> > index 4d7a9f6a2d..fba72f8c15 100644
> > --- a/libavcodec/riscv/vp8dsp_rvv.S
> > +++ b/libavcodec/riscv/vp8dsp_rvv.S
> > @@ -161,26 +161,26 @@ const subpel_filters
> >  .byte 0,  -1,  12, 123,  -6, 0
> >  endconst
> >
> > -.macro epel_filter size type
> > -lla t2, subpel_filters
> > +.macro epel_filter size type regtype
> > +lla \regtype\()2, subpel_filters
> >  .ifc \type,v
> > -addit0, a6, -1
> > +addi\regtype\()0, a6, -1
> >  .else
> > -addit0, a5, -1
> > +addi\regtype\()0, a5, -1
> >  .endif
> > -li  t1, 6
> > -mul t0, t0, t1
> > -add t0, t0, t2
> > +li  \regtype\()1, 6
> > +mul \regtype\()0, \regtype\()0, \regtype\()1
> > +add \regtype\()0, \regtype\()0, \regtype\()2
> >  .irp n 1,2,3,4
> > -lb  t\n, \n(t0)
> > +lb  \regtype\n, \n(\regtype\()0)
> >  .endr
> >  .ifc \size,6
> > -lb  t5, 5(t0)
> > -lb  t0, (t0)
> > +lb  \regtype\()5, 5(\regtype\()0)
> > +lb  \regtype\()0, (\regtype\()0)
> >  .endif
> >  .endm
> >
> > -.macro epel_load dst len size type
> > +.macro epel_load dst len size type from_mem regtype
> >  .ifc \type,v
> >  mv  a5, a3
> >  .else
> > @@ -189,24 +189,35 @@ endconst
> >  sub t6, a2, a5
> >  add a7, a2, a5
> >
> > +.if \from_mem
> >  vle8.v  v24, (a2)
> >  vle8.v  v22, (t6)
> >  vle8.v  v26, (a7)
> >  add a7, a7, a5
> >  vle8.v  v28, (a7)
> > -vwmulu.vx   v16, v24, t2
> > -vwmulu.vx   v20, v26, t3
> > +vwmulu.vx   v16, v24, \regtype\()2
> > +vwmulu.vx   v20, v26, \regtype\()3
> >  .ifc \size,6
> >  sub t6, t6, a5
> >  add a7, a7, a5
> >  vle8.v  v24, (t6)
> >  vle8.v  v26, (a7)
> > -vwmaccu.vx  v16,

Re: [FFmpeg-devel] [PATCH 1/3] lavc/vp9dsp: fix indentation

2024-05-11 Thread flow gg

The patch `lavc/vp9dsp: R-V ipred vert` needs to add `#if HAVE_RV`. How
about I modify these `#if HAVE_RVV` indentations together in this patch?

Rémi Denis-Courmont  于2024年5月11日周六 00:39写道：

> ---
>  libavcodec/riscv/vp9dsp_init.c | 50 +-
>  1 file changed, 25 insertions(+), 25 deletions(-)
>
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index 69ab39004c..6863c486c8 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -26,33 +26,33 @@
>
>  static av_cold void vp9dsp_intrapred_init_rvv(VP9DSPContext *dsp, int bpp)
>  {
> -#if HAVE_RVV
> -int flags = av_get_cpu_flags();
> +#if HAVE_RVV
> +int flags = av_get_cpu_flags();
>
> -if (bpp == 8 && flags & AV_CPU_FLAG_RVV_I64 && ff_get_rv_vlenb()
> >= 16) {
> -dsp->intra_pred[TX_8X8][DC_PRED] = ff_dc_8x8_rvv;
> -dsp->intra_pred[TX_8X8][LEFT_DC_PRED] = ff_dc_left_8x8_rvv;
> -dsp->intra_pred[TX_8X8][DC_127_PRED] = ff_dc_127_8x8_rvv;
> -dsp->intra_pred[TX_8X8][DC_128_PRED] = ff_dc_128_8x8_rvv;
> -dsp->intra_pred[TX_8X8][DC_129_PRED] = ff_dc_129_8x8_rvv;
> -dsp->intra_pred[TX_8X8][TOP_DC_PRED] = ff_dc_top_8x8_rvv;
> -}
> +if (bpp == 8 && flags & AV_CPU_FLAG_RVV_I64 && ff_get_rv_vlenb() >=
> 16) {
> +dsp->intra_pred[TX_8X8][DC_PRED] = ff_dc_8x8_rvv;
> +dsp->intra_pred[TX_8X8][LEFT_DC_PRED] = ff_dc_left_8x8_rvv;
> +dsp->intra_pred[TX_8X8][DC_127_PRED] = ff_dc_127_8x8_rvv;
> +dsp->intra_pred[TX_8X8][DC_128_PRED] = ff_dc_128_8x8_rvv;
> +dsp->intra_pred[TX_8X8][DC_129_PRED] = ff_dc_129_8x8_rvv;
> +dsp->intra_pred[TX_8X8][TOP_DC_PRED] = ff_dc_top_8x8_rvv;
> +}
>
> -if (bpp == 8 && flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb()
> >= 16) {
> -dsp->intra_pred[TX_32X32][DC_PRED] = ff_dc_32x32_rvv;
> -dsp->intra_pred[TX_16X16][DC_PRED] = ff_dc_16x16_rvv;
> -dsp->intra_pred[TX_32X32][LEFT_DC_PRED] =
> ff_dc_left_32x32_rvv;
> -dsp->intra_pred[TX_16X16][LEFT_DC_PRED] =
> ff_dc_left_16x16_rvv;
> -dsp->intra_pred[TX_32X32][DC_127_PRED] = ff_dc_127_32x32_rvv;
> -dsp->intra_pred[TX_16X16][DC_127_PRED] = ff_dc_127_16x16_rvv;
> -dsp->intra_pred[TX_32X32][DC_128_PRED] = ff_dc_128_32x32_rvv;
> -dsp->intra_pred[TX_16X16][DC_128_PRED] = ff_dc_128_16x16_rvv;
> -dsp->intra_pred[TX_32X32][DC_129_PRED] = ff_dc_129_32x32_rvv;
> -dsp->intra_pred[TX_16X16][DC_129_PRED] = ff_dc_129_16x16_rvv;
> -dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_dc_top_32x32_rvv;
> -dsp->intra_pred[TX_16X16][TOP_DC_PRED] = ff_dc_top_16x16_rvv;
> -}
> -#endif
> +if (bpp == 8 && flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >=
> 16) {
> +dsp->intra_pred[TX_32X32][DC_PRED] = ff_dc_32x32_rvv;
> +dsp->intra_pred[TX_16X16][DC_PRED] = ff_dc_16x16_rvv;
> +dsp->intra_pred[TX_32X32][LEFT_DC_PRED] = ff_dc_left_32x32_rvv;
> +dsp->intra_pred[TX_16X16][LEFT_DC_PRED] = ff_dc_left_16x16_rvv;
> +dsp->intra_pred[TX_32X32][DC_127_PRED] = ff_dc_127_32x32_rvv;
> +dsp->intra_pred[TX_16X16][DC_127_PRED] = ff_dc_127_16x16_rvv;
> +dsp->intra_pred[TX_32X32][DC_128_PRED] = ff_dc_128_32x32_rvv;
> +dsp->intra_pred[TX_16X16][DC_128_PRED] = ff_dc_128_16x16_rvv;
> +dsp->intra_pred[TX_32X32][DC_129_PRED] = ff_dc_129_32x32_rvv;
> +dsp->intra_pred[TX_16X16][DC_129_PRED] = ff_dc_129_16x16_rvv;
> +dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_dc_top_32x32_rvv;
> +dsp->intra_pred[TX_16X16][TOP_DC_PRED] = ff_dc_top_16x16_rvv;
> +}
> +#endif
>  }
>
>  av_cold void ff_vp9dsp_init_riscv(VP9DSPContext *dsp, int bpp, int
> bitexact)
> --
> 2.43.0
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels

2024-05-11 Thread flow gg

The test results show that changing mf2 to m1 in ff_avg_pixels8x8_rvv in vc1,
or changing mf2/mf4 to m1 in vsetvlstatic8 in vp8,
results in a 10-20% performance decrease on both k230 and banana_f3.

I think we should just continue using it as is...

Rémi Denis-Courmont  于2024年5月10日周五 23:34写道：

> Le perjantaina 10. toukokuuta 2024, 11.22.53 EEST flow gg a écrit :
> > Hi, I got BananaPi F3, made some fixes, updated in reply
>
> So... Does it benefit from halving the logical multiplier to process
> fixed-sized
> block as compared to C908, or can we stick to the same code regardless of
> vector sizes?
>
> Also beware that K60 cores have in-order pipelines, so data dependencies
> will
> probably hurt more than on C908.
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels

2024-05-10 Thread flow gg

Hi, I got BananaPi F3, made some fixes, updated in reply

Rémi Denis-Courmont  于2024年5月6日周一 03:26写道：

> Le sunnuntaina 5. toukokuuta 2024, 12.18.56 EEST flow gg a écrit :
> > > Does MF2 actually improve perfs over M1 here?
> >
> > The difference here seems very small, but when both mf2 and m1 are
> correct,
> > the test results have only shown mf2 to be better, so I want to use mf2.
>
> I can live with that. But this is a slippery slope because large vector
> sizes
> would involve even smaller fractions. Then we would need to compute the
> value
> which might negate the performance gains from fractional multipliers.
>
> The fastest approach that I can think of is a symbolic LA (which expands
> to
> 1xAUIPC + 1xLA) to load a precomputed VTYPE value from a static variable.
> Furthermore, this requires VSETVL, which precludes immediate constant VL.
> Indeed, the VSETIVL instruction does not exist.
>
> AFAIU, BananaPi F3 has 256-bit vectors already now.
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 3/9] lavc/vp9dsp: R-V V ipred hor

2024-05-07 Thread flow gg

> Do you gain much by unrolling all the way to 16x? Given that you have the
> counter value already in t0, it should not make much difference to just
> unroll 2x or maybe 4x and then loop.

I chose this simple method because I think the effect is about the same..
Do I need to change it?

> It might also be faster to use lhu or lwu and shift to reduce scalar
> loads, at least if the vector is suitably aligned.

I just tested ff_h_16x16_rvv, and the lbu version is faster (lbu * 16 version:
80.2, lwu * 4 version: 117.2).


Rémi Denis-Courmont  于2024年5月8日周三 00:10写道：

> Le tiistaina 7. toukokuuta 2024, 10.36.07 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
> >
> > C908:
> > vp9_hor_8x8_8bpp_c: 74.7
> > vp9_hor_8x8_8bpp_rvv_i32: 35.7
> > vp9_hor_16x16_8bpp_c: 175.5
> > vp9_hor_16x16_8bpp_rvv_i32: 80.2
> > vp9_hor_32x32_8bpp_c: 510.2
> > vp9_hor_32x32_8bpp_rvv_i32: 264.0
> > ---
> >  libavcodec/riscv/vp9_intra_rvv.S | 56 
> >  libavcodec/riscv/vp9dsp.h|  6 
> >  libavcodec/riscv/vp9dsp_init.c   |  3 ++
> >  3 files changed, 65 insertions(+)
> >
> > diff --git a/libavcodec/riscv/vp9_intra_rvv.S
> > b/libavcodec/riscv/vp9_intra_rvv.S index db9774c263..dd9bc036e7 100644
> > --- a/libavcodec/riscv/vp9_intra_rvv.S
> > +++ b/libavcodec/riscv/vp9_intra_rvv.S
> > @@ -113,3 +113,59 @@ func_dc dc_left  8   left 3  0  zve64x
> >  func_dc dc_top   32  top  5  1  zve32x
> >  func_dc dc_top   16  top  4  1  zve32x
> >  func_dc dc_top   8   top  3  0  zve64x
> > +
> > +func ff_h_32x32_rvv, zve32x
> > +li   t0, 32
> > +addi a2, a2, 31
> > +vsetvli  zero, t0, e8, m2, ta, ma
> > +
> > +.rept 2
> > +.irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
> > +lbu  t1, (a2)
> > +addi a2, a2, -1
> > +vmv.v.x  v\n, t1
> > +.endr
> > +.irp n 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
> > +vse8.v   v\n, (a0)
> > +add  a0, a0, a1
> > +.endr
> > +.endr
>
> Do you gain much by unrolling all the way to 16x? Given that you have the
> counter value already in t0, it should not make much difference to just
> unroll
> 2x or maybe 4x and then loop.
>
> It might also be faster to use lhu or lwu and shift to reduce scalar
> loads, at
> least if the vector is suitably aligned.
>
> > +
> > +ret
> > +endfunc
> > +
> > +func ff_h_16x16_rvv, zve32x
> > +addi a2, a2, 15
> > +vsetivli zero, 16, e8, m1, ta, ma
> > +
> > +.irp n 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
> 22, 23
> > +lbu  t1, (a2)
> > +addi a2, a2, -1
> > +vmv.v.x  v\n, t1
> > +.endr
> > +.irp n 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
> > +vse8.v   v\n, (a0)
> > +add  a0, a0, a1
> > +.endr
> > +vse8.v   v23, (a0)
> > +
> > +ret
> > +endfunc
> > +
> > +func ff_h_8x8_rvv, zve32x
> > +addi a2, a2, 7
> > +vsetivli zero, 8, e8, mf2, ta, ma
> > +
> > +.irp n 8, 9, 10, 11, 12, 13, 14, 15
> > +lbu  t1, (a2)
> > +addi a2, a2, -1
> > +vmv.v.x  v\n, t1
> > +.endr
> > +.irp n 8, 9, 10, 11, 12, 13, 14
> > +vse8.v   v\n, (a0)
> > +add  a0, a0, a1
> > +.endr
> > +vse8.v   v15, (a0)
> > +
> > +ret
> > +endfunc
> > diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> > index b8ff282f8a..0ad961c7e0 100644
> > --- a/libavcodec/riscv/vp9dsp.h
> > +++ b/libavcodec/riscv/vp9dsp.h
> > @@ -66,6 +66,12 @@ void ff_v_16x16_rvi(uint8_t *dst, ptrdiff_t stride,
> const
> > uint8_t *l, const uint8_t *a);
> >  void ff_v_8x8_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> >const uint8_t *a);
> > +void ff_h_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> > +const uint8_t *a);
> > +void ff_h_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> > +const uint8_t *a);
> > +void ff_h_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> > +  const uint8_t *a);
> >
> >  #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
>
> >   \ void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> > dststride,   \ diff --git a/libavcodec/riscv/vp9dsp_init.c
> > b/libavcodec/riscv/vp9dsp_init.c index c10f8bbe41..7816b13fe0 100644
> > --- a/libavcodec/riscv/vp9dsp_init.c
> > +++ b/libavcodec/riscv/vp9dsp_init.c
> > @@ -59,6 +59,9 @@ static av_cold void
> > vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int bpp)
> > dsp->intra_pred[TX_16X16][DC_129_PRED] = ff_dc_129_16x16_rvv;
> > dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_dc_top_32x32_rvv;
> > dsp->intra_pred[TX_16X16][TOP_DC_PRED] = ff_dc_top_16x16_rvv; +
>
> > dsp->intra_pred[TX_32X32][HOR_P

Re: [FFmpeg-devel] [PATCH v4 2/9] lavc/vp8dsp: R-V V put_bilin_h v

2024-05-07 Thread flow gg

> h is not a number so that's not a valid condition.

Fixed two occurrences of this issue.

 于2024年5月8日周三 00:55写道：

> From: sunyuechi 
>
> C908:
> vp8_put_bilin4_h_c: 367.0
> vp8_put_bilin4_h_rvv_i32: 137.7
> vp8_put_bilin4_v_c: 377.0
> vp8_put_bilin4_v_rvv_i32: 137.7
> vp8_put_bilin8_h_c: 1431.0
> vp8_put_bilin8_h_rvv_i32: 297.5
> vp8_put_bilin8_v_c: 1449.0
> vp8_put_bilin8_v_rvv_i32: 297.5
> vp8_put_bilin16_h_c: 2839.0
> vp8_put_bilin16_h_rvv_i32: 344.7
> vp8_put_bilin16_v_c: 2857.0
> vp8_put_bilin16_v_rvv_i32: 344.7
> ---
>  libavcodec/riscv/vp8dsp_init.c | 21 +++
>  libavcodec/riscv/vp8dsp_rvv.S  | 49 ++
>  2 files changed, 70 insertions(+)
>
> diff --git a/libavcodec/riscv/vp8dsp_init.c
> b/libavcodec/riscv/vp8dsp_init.c
> index fa3feeacf7..afffa6de2f 100644
> --- a/libavcodec/riscv/vp8dsp_init.c
> +++ b/libavcodec/riscv/vp8dsp_init.c
> @@ -34,6 +34,10 @@ VP8_EPEL(16, rvi);
>  VP8_EPEL(8,  rvi);
>  VP8_EPEL(4,  rvi);
>
> +VP8_BILIN(16, rvv);
> +VP8_BILIN(8,  rvv);
> +VP8_BILIN(4,  rvv);
> +
>  av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
>  {
>  #if HAVE_RV
> @@ -48,6 +52,23 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
>  c->put_vp8_epel_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvi;
>  c->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvi;
>  }
> +#if HAVE_RVV
> +if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
> +c->put_vp8_bilinear_pixels_tab[0][0][1] =
> ff_put_vp8_bilin16_h_rvv;
> +c->put_vp8_bilinear_pixels_tab[0][0][2] =
> ff_put_vp8_bilin16_h_rvv;
> +c->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_rvv;
> +c->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_rvv;
> +c->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_rvv;
> +c->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_rvv;
> +
> +c->put_vp8_bilinear_pixels_tab[0][1][0] =
> ff_put_vp8_bilin16_v_rvv;
> +c->put_vp8_bilinear_pixels_tab[0][2][0] =
> ff_put_vp8_bilin16_v_rvv;
> +c->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_rvv;
> +c->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_rvv;
> +c->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_rvv;
> +c->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_rvv;
> +}
> +#endif
>  #endif
>  }
>
> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> index 8a0773f964..ec8ff917b9 100644
> --- a/libavcodec/riscv/vp8dsp_rvv.S
> +++ b/libavcodec/riscv/vp8dsp_rvv.S
> @@ -20,6 +20,18 @@
>
>  #include "libavutil/riscv/asm.S"
>
> +.macro vsetvlstatic8 len
> +.if \len <= 4
> +vsetivlizero, \len, e8, mf4, ta, ma
> +.elseif \len <= 8
> +vsetivlizero, \len, e8, mf2, ta, ma
> +.elseif \len <= 16
> +vsetivlizero, \len, e8, m1, ta, ma
> +.elseif \len <= 31
> +vsetivlizero, \len, e8, m2, ta, ma
> +.endif
> +.endm
> +
>  .macro vp8_idct_dc_add
>  vlse32.v  v0, (a0), a2
>  lha5, 0(a1)
> @@ -71,3 +83,40 @@ func ff_vp8_idct_dc_add4uv_rvv, zve32x
>
>  ret
>  endfunc
> +
> +.macro bilin_load dst len type mn
> +.ifc \type,v
> +add t5, a2, a3
> +.else
> +addit5, a2, 1
> +.endif
> +vle8.v  \dst, (a2)
> +vle8.v  v2, (t5)
> +vwmulu.vx   v28, \dst, t1
> +vwmaccu.vx  v28, \mn, v2
> +vwaddu.wx   v24, v28, t4
> +vnsra.wi\dst, v24, 3
> +.endm
> +
> +.macro put_vp8_bilin_h_v len type mn
> +func ff_put_vp8_bilin\len\()_\type\()_rvv, zve32x
> +vsetvlstatic8   \len
> +li  t1, 8
> +li  t4, 4
> +sub t1, t1, \mn
> +1:
> +addia4, a4, -1
> +bilin_load  v0, \len, \type, \mn
> +vse8.v  v0, (a0)
> +add a2, a2, a3
> +add a0, a0, a1
> +bneza4, 1b
> +
> +ret
> +endfunc
> +.endm
> +
> +.irp len 16,8,4
> +put_vp8_bilin_h_v \len h a5
> +put_vp8_bilin_h_v \len v a6
> +.endr
> --
> 2.45.0
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v3 2/9] lavc/vp8dsp: R-V V put_bilin_h v

2024-05-07 Thread flow gg

I don't understand what you mean... What does checking whether the type is
'h' or 'v' have to do with a number?

Rémi Denis-Courmont  于2024年5月8日周三 00:00写道：

> Le maanantaina 6. toukokuuta 2024, 6.38.02 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
> >
> > C908:
> > vp8_put_bilin4_h_c: 367.0
> > vp8_put_bilin4_h_rvv_i32: 137.7
> > vp8_put_bilin4_v_c: 377.0
> > vp8_put_bilin4_v_rvv_i32: 137.7
> > vp8_put_bilin8_h_c: 1431.0
> > vp8_put_bilin8_h_rvv_i32: 297.5
> > vp8_put_bilin8_v_c: 1449.0
> > vp8_put_bilin8_v_rvv_i32: 297.5
> > vp8_put_bilin16_h_c: 2839.0
> > vp8_put_bilin16_h_rvv_i32: 344.7
> > vp8_put_bilin16_v_c: 2857.0
> > vp8_put_bilin16_v_rvv_i32: 344.7
> > ---
> >  libavcodec/riscv/vp8dsp_init.c | 21 +++
> >  libavcodec/riscv/vp8dsp_rvv.S  | 49 ++
> >  2 files changed, 70 insertions(+)
> >
> > diff --git a/libavcodec/riscv/vp8dsp_init.c
> b/libavcodec/riscv/vp8dsp_init.c
> > index fa3feeacf7..afffa6de2f 100644
> > --- a/libavcodec/riscv/vp8dsp_init.c
> > +++ b/libavcodec/riscv/vp8dsp_init.c
> > @@ -34,6 +34,10 @@ VP8_EPEL(16, rvi);
> >  VP8_EPEL(8,  rvi);
> >  VP8_EPEL(4,  rvi);
> >
> > +VP8_BILIN(16, rvv);
> > +VP8_BILIN(8,  rvv);
> > +VP8_BILIN(4,  rvv);
> > +
> >  av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
> >  {
> >  #if HAVE_RV
> > @@ -48,6 +52,23 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
> >  c->put_vp8_epel_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvi;
> >  c->put_vp8_bilinear_pixels_tab[2][0][0] =
> ff_put_vp8_pixels4_rvi;
> >  }
> > +#if HAVE_RVV
> > +if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
> > +c->put_vp8_bilinear_pixels_tab[0][0][1] =
> ff_put_vp8_bilin16_h_rvv;
> > +c->put_vp8_bilinear_pixels_tab[0][0][2] =
> > ff_put_vp8_bilin16_h_rvv; +
> c->put_vp8_bilinear_pixels_tab[1][0][1]
> > = ff_put_vp8_bilin8_h_rvv; +
> c->put_vp8_bilinear_pixels_tab[1][0][2]
> > = ff_put_vp8_bilin8_h_rvv; +
> c->put_vp8_bilinear_pixels_tab[2][0][1]
> > = ff_put_vp8_bilin4_h_rvv; +
> c->put_vp8_bilinear_pixels_tab[2][0][2]
> > = ff_put_vp8_bilin4_h_rvv; +
> > +c->put_vp8_bilinear_pixels_tab[0][1][0] =
> ff_put_vp8_bilin16_v_rvv;
> > +c->put_vp8_bilinear_pixels_tab[0][2][0] =
> > ff_put_vp8_bilin16_v_rvv; +
> c->put_vp8_bilinear_pixels_tab[1][1][0]
> > = ff_put_vp8_bilin8_v_rvv; +
> c->put_vp8_bilinear_pixels_tab[1][2][0]
> > = ff_put_vp8_bilin8_v_rvv; +
> c->put_vp8_bilinear_pixels_tab[2][1][0]
> > = ff_put_vp8_bilin4_v_rvv; +
> c->put_vp8_bilinear_pixels_tab[2][2][0]
> > = ff_put_vp8_bilin4_v_rvv; +}
> > +#endif
> >  #endif
> >  }
> >
> > diff --git a/libavcodec/riscv/vp8dsp_rvv.S
> b/libavcodec/riscv/vp8dsp_rvv.S
> > index 8a0773f964..9bf969d794 100644
> > --- a/libavcodec/riscv/vp8dsp_rvv.S
> > +++ b/libavcodec/riscv/vp8dsp_rvv.S
> > @@ -20,6 +20,18 @@
> >
> >  #include "libavutil/riscv/asm.S"
> >
> > +.macro vsetvlstatic8 len
> > +.if \len <= 4
> > +vsetivlizero, \len, e8, mf4, ta, ma
> > +.elseif \len <= 8
> > +vsetivlizero, \len, e8, mf2, ta, ma
> > +.elseif \len <= 16
> > +vsetivlizero, \len, e8, m1, ta, ma
> > +.elseif \len <= 31
> > +vsetivlizero, \len, e8, m2, ta, ma
> > +.endif
> > +.endm
> > +
> >  .macro vp8_idct_dc_add
> >  vlse32.v  v0, (a0), a2
> >  lha5, 0(a1)
> > @@ -71,3 +83,40 @@ func ff_vp8_idct_dc_add4uv_rvv, zve32x
> >
> >  ret
> >  endfunc
> > +
> > +.macro bilin_load dst len type mn
> > +.ifc \type,v
> > +add t5, a2, a3
> > +.elseif \type == h
>
> h is not a number so that's not a valid condition.
>
> > +addit5, a2, 1
> > +.endif
> > +vle8.v  \dst, (a2)
> > +vle8.v  v2, (t5)
> > +vwmulu.vx   v28, \dst, t1
> > +vwmaccu.vx  v28, \mn, v2
> > +vwaddu.wx   v24, v28, t4
> > +vnsra.wi\dst, v24, 3
> > +.endm
> > +
> > +.macro put_vp8_bilin_h_v len type mn
> > +func ff_put_vp8_bilin\len\()_\type\()_rvv, zve32x
> > +vsetvlstatic8   \len
> > +li  t1, 8
> > +li  t4, 4
> > +sub t1, t1, \mn
> > +1:
> > +addia4, a4, -1
> > +bilin_load  v0, \len, \type, \mn
> > +vse8.v  v0, (a0)
> > +add a2, a2, a3
> > +add a0, a0, a1
> > +bneza4, 1b
> > +
> > +ret
> > +endfunc
> > +.endm
> > +
> > +.irp len 16,8,4
> > +put_vp8_bilin_h_v \len h a5
> > +put_vp8_bilin_h_v \len v a6
> > +.endr
>
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-dev

Re: [FFmpeg-devel] [PATCH v2 1/9] lavc/vp9dsp: R-V ipred vert

2024-05-07 Thread flow gg

Fixed issues similar to vp8

 于2024年5月7日周二 15:36写道：

> From: sunyuechi 
>
> C908:
> vp9_vert_8x8_8bpp_c: 22.0
> vp9_vert_8x8_8bpp_rvi: 15.7
> vp9_vert_16x16_8bpp_c: 71.2
> vp9_vert_16x16_8bpp_rvi: 39.0
> vp9_vert_32x32_8bpp_c: 300.2
> vp9_vert_32x32_8bpp_rvi: 135.2
> ---
>  libavcodec/riscv/Makefile|  1 +
>  libavcodec/riscv/vp9_intra_rvi.S | 61 
>  libavcodec/riscv/vp9dsp.h|  6 
>  libavcodec/riscv/vp9dsp_init.c   | 15 ++--
>  4 files changed, 80 insertions(+), 3 deletions(-)
>  create mode 100644 libavcodec/riscv/vp9_intra_rvi.S
>
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 050c08ee61..65dd0d656a 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -63,6 +63,7 @@ RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o
>  OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o
>  RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
>  OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
> +RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o
>  RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
>  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
>  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> diff --git a/libavcodec/riscv/vp9_intra_rvi.S
> b/libavcodec/riscv/vp9_intra_rvi.S
> new file mode 100644
> index 00..617f9f55a2
> --- /dev/null
> +++ b/libavcodec/riscv/vp9_intra_rvi.S
> @@ -0,0 +1,61 @@
> +/*
> + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +#if __riscv_xlen >= 64
> +func ff_v_32x32_rvi
> +ld   t0, (a3)
> +ld   t1, 8(a3)
> +ld   t2, 16(a3)
> +ld   t3, 24(a3)
> +.rept 32
> +sd   t0, (a0)
> +sd   t1, 8(a0)
> +sd   t2, 16(a0)
> +sd   t3, 24(a0)
> +add  a0, a0, a1
> +.endr
> +
> +ret
> +endfunc
> +
> +func ff_v_16x16_rvi
> +ld   t0, (a3)
> +ld   t1, 8(a3)
> +.rept 16
> +sd   t0, (a0)
> +sd   t1, 8(a0)
> +add  a0, a0, a1
> +.endr
> +
> +ret
> +endfunc
> +
> +func ff_v_8x8_rvi
> +ld   t0, (a3)
> +.rept 8
> +sd   t0, (a0)
> +add  a0, a0, a1
> +.endr
> +
> +ret
> +endfunc
> +#endif
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 25047ed507..f8bc6563a5 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -60,6 +60,12 @@ void ff_dc_129_16x16_rvv(uint8_t *dst, ptrdiff_t
> stride, const uint8_t *l,
>   const uint8_t *a);
>  void ff_dc_129_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> const uint8_t *a);
> +void ff_v_32x32_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> +const uint8_t *a);
> +void ff_v_16x16_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> +const uint8_t *a);
> +void ff_v_8x8_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> +  const uint8_t *a);
>
>  #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
>  \
>  void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index 69ab39004c..d249dd71b2 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -24,11 +24,19 @@
>  #include "libavcodec/vp9dsp.h"
>  #include "vp9dsp.h"
>
> -static av_cold void vp9dsp_intrapred_init_rvv(VP9DSPContext *dsp, int bpp)
> +static av_cold void vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int
> bpp)
>  {
> -#if HAVE_RVV
> +#if HAVE_RV
>  int flags = av_get_cpu_flags();
>
> +if (bpp == 8 && flags & AV_CPU_FLAG_RVI) {
> +# if __riscv_xlen >= 64
> +dsp->intra_pred[TX_32X32][VERT_PRED] = ff_v_32x32_rvi;
> +dsp->intra_pred[TX_16X16][VERT_PRED] = ff_v_16x16_rvi;
> +dsp->intra_pred[TX

Re: [FFmpeg-devel] [PATCH v3 6/9] lavc/vp8dsp: R-V V put_epel hv

2024-05-06 Thread flow gg

> IMO, passing a complete register name, if you really need to vary it,
> would be simpler and more flexible than an ABI register type prefix.

If the full register name is passed here, some require four parameters,
some require six parameters, and there is often repetition.
I feel it's easy to get confused about the differences between the
parameters passed each time.
If we use a prefix instead, we would only need one parameter, which I think
would be less error-prone.

> This code actually requires ==, not >=.
> You can do that but you only need half the stack space and offsets.

Ok, fixed it

Rémi Denis-Courmont  于2024年5月7日周二 03:25写道：

> Le maanantaina 6. toukokuuta 2024, 6.38.06 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
> >
> > C908:
> > vp8_put_epel4_h4v4_c: 20.0
> > vp8_put_epel4_h4v4_rvv_i32: 11.0
> > vp8_put_epel4_h4v6_c: 25.2
> > vp8_put_epel4_h4v6_rvv_i32: 13.5
> > vp8_put_epel4_h6v4_c: 22.2
> > vp8_put_epel4_h6v4_rvv_i32: 14.5
> > vp8_put_epel4_h6v6_c: 29.0
> > vp8_put_epel4_h6v6_rvv_i32: 15.7
> > vp8_put_epel8_h4v4_c: 73.0
> > vp8_put_epel8_h4v4_rvv_i32: 22.2
> > vp8_put_epel8_h4v6_c: 90.5
> > vp8_put_epel8_h4v6_rvv_i32: 26.7
> > vp8_put_epel8_h6v4_c: 85.0
> > vp8_put_epel8_h6v4_rvv_i32: 27.2
> > vp8_put_epel8_h6v6_c: 104.7
> > vp8_put_epel8_h6v6_rvv_i32: 29.5
> > vp8_put_epel16_h4v4_c: 145.5
> > vp8_put_epel16_h4v4_rvv_i32: 26.5
> > vp8_put_epel16_h4v6_c: 190.7
> > vp8_put_epel16_h4v6_rvv_i32: 47.5
> > vp8_put_epel16_h6v4_c: 173.7
> > vp8_put_epel16_h6v4_rvv_i32: 33.2
> > vp8_put_epel16_h6v6_c: 222.2
> > vp8_put_epel16_h6v6_rvv_i32: 35.5
> > ---
> >  libavcodec/riscv/vp8dsp_init.c |  13 
> >  libavcodec/riscv/vp8dsp_rvv.S  | 117 +++--
> >  2 files changed, 109 insertions(+), 21 deletions(-)
> >
> > diff --git a/libavcodec/riscv/vp8dsp_init.c
> b/libavcodec/riscv/vp8dsp_init.c
> > index dc3e087f01..463c8fa0a2 100644
> > --- a/libavcodec/riscv/vp8dsp_init.c
> > +++ b/libavcodec/riscv/vp8dsp_init.c
> > @@ -97,6 +97,19 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
> >  c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
> >  c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
> >  c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
> > +
> > +c->put_vp8_epel_pixels_tab[0][2][2] =
> ff_put_vp8_epel16_h6v6_rvv;
> > +c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv;
> > +c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv;
> > +c->put_vp8_epel_pixels_tab[0][2][1] =
> ff_put_vp8_epel16_h4v6_rvv;
> > +c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv;
> > +c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv;
> > +c->put_vp8_epel_pixels_tab[0][1][1] =
> ff_put_vp8_epel16_h4v4_rvv;
> > +c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv;
> > +c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv;
> > +c->put_vp8_epel_pixels_tab[0][1][2] =
> ff_put_vp8_epel16_h6v4_rvv;
> > +c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv;
> > +c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv;
> >  }
> >  #endif
> >  #endif
> > diff --git a/libavcodec/riscv/vp8dsp_rvv.S
> b/libavcodec/riscv/vp8dsp_rvv.S
> > index bf268e4d8d..baa8152830 100644
> > --- a/libavcodec/riscv/vp8dsp_rvv.S
> > +++ b/libavcodec/riscv/vp8dsp_rvv.S
> > @@ -161,26 +161,26 @@ const subpel_filters
> >  .byte 0,  -1,  12, 123,  -6, 0
> >  endconst
> >
> > -.macro epel_filter size type
> > -lla t2, subpel_filters
> > +.macro epel_filter size type regtype
> > +lla \regtype\()2, subpel_filters
> >  .ifc \type,v
> > -addit0, a6, -1
> > +addi\regtype\()0, a6, -1
>
> IMO, passing a complete register name, if you really need to vary it,
> would be
> simpler and more flexible than an ABI register type prefix.
>
> >  .elseif \type == h
> > -addit0, a5, -1
> > +addi\regtype\()0, a5, -1
> >  .endif
> > -li  t1, 6
> > -mul t0, t0, t1
> > -add t0, t0, t2
> > +li  \regtype\()1, 6
> > +mul \regtype\()0, \regtype\()0, \regtype\()1
> > +add \regtype\()0, \regtype\()0, \regtype\()2
> >  .irp n 1,2,3,4
> > -lb  t\n, \n(t0)
> > +lb  \regtype\n, \n(\regtype\()0)
> >  .endr
> >  .ifc \size,6
> > -lb  t5, 5(t0)
> > -lb  t0, (t0)
> > +lb  \regtype\()5, 5(\regtype\()0)
> > +lb  \regtype\()0, (\regtype\()0)
> >  .endif
> >  .endm
> >
> > -.macro epel_load dst len size type
> > +.macro epel_load dst len size type from_mem regtype
> >  .ifc \type,v
> >  mv  a5, a3
> >  .else
> > @@

Re: [FFmpeg-devel] [PATCH v3 2/9] lavc/vp8dsp: R-V V put_bilin_h v

2024-05-05 Thread flow gg

> Doesn't this effectively discard the last element, t5?
> Can't we skip the slide and just load the vector at a2+1? Also then, we
> can keep VL=len and halve the multiplier.

Yes, this is better. I remember that using slide1down was better when testing
the initial version, but that has changed now.
I modified it to load from a2+1 and merged h and v.

 于2024年5月6日周一 11:38写道：

> From: sunyuechi 
>
> C908:
> vp8_put_bilin4_h_c: 367.0
> vp8_put_bilin4_h_rvv_i32: 137.7
> vp8_put_bilin4_v_c: 377.0
> vp8_put_bilin4_v_rvv_i32: 137.7
> vp8_put_bilin8_h_c: 1431.0
> vp8_put_bilin8_h_rvv_i32: 297.5
> vp8_put_bilin8_v_c: 1449.0
> vp8_put_bilin8_v_rvv_i32: 297.5
> vp8_put_bilin16_h_c: 2839.0
> vp8_put_bilin16_h_rvv_i32: 344.7
> vp8_put_bilin16_v_c: 2857.0
> vp8_put_bilin16_v_rvv_i32: 344.7
> ---
>  libavcodec/riscv/vp8dsp_init.c | 21 +++
>  libavcodec/riscv/vp8dsp_rvv.S  | 49 ++
>  2 files changed, 70 insertions(+)
>
> diff --git a/libavcodec/riscv/vp8dsp_init.c
> b/libavcodec/riscv/vp8dsp_init.c
> index fa3feeacf7..afffa6de2f 100644
> --- a/libavcodec/riscv/vp8dsp_init.c
> +++ b/libavcodec/riscv/vp8dsp_init.c
> @@ -34,6 +34,10 @@ VP8_EPEL(16, rvi);
>  VP8_EPEL(8,  rvi);
>  VP8_EPEL(4,  rvi);
>
> +VP8_BILIN(16, rvv);
> +VP8_BILIN(8,  rvv);
> +VP8_BILIN(4,  rvv);
> +
>  av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
>  {
>  #if HAVE_RV
> @@ -48,6 +52,23 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
>  c->put_vp8_epel_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvi;
>  c->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvi;
>  }
> +#if HAVE_RVV
> +if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
> +c->put_vp8_bilinear_pixels_tab[0][0][1] =
> ff_put_vp8_bilin16_h_rvv;
> +c->put_vp8_bilinear_pixels_tab[0][0][2] =
> ff_put_vp8_bilin16_h_rvv;
> +c->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_rvv;
> +c->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_rvv;
> +c->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_rvv;
> +c->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_rvv;
> +
> +c->put_vp8_bilinear_pixels_tab[0][1][0] =
> ff_put_vp8_bilin16_v_rvv;
> +c->put_vp8_bilinear_pixels_tab[0][2][0] =
> ff_put_vp8_bilin16_v_rvv;
> +c->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_rvv;
> +c->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_rvv;
> +c->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_rvv;
> +c->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_rvv;
> +}
> +#endif
>  #endif
>  }
>
> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> index 8a0773f964..9bf969d794 100644
> --- a/libavcodec/riscv/vp8dsp_rvv.S
> +++ b/libavcodec/riscv/vp8dsp_rvv.S
> @@ -20,6 +20,18 @@
>
>  #include "libavutil/riscv/asm.S"
>
> +.macro vsetvlstatic8 len
> +.if \len <= 4
> +vsetivlizero, \len, e8, mf4, ta, ma
> +.elseif \len <= 8
> +vsetivlizero, \len, e8, mf2, ta, ma
> +.elseif \len <= 16
> +vsetivlizero, \len, e8, m1, ta, ma
> +.elseif \len <= 31
> +vsetivlizero, \len, e8, m2, ta, ma
> +.endif
> +.endm
> +
>  .macro vp8_idct_dc_add
>  vlse32.v  v0, (a0), a2
>  lha5, 0(a1)
> @@ -71,3 +83,40 @@ func ff_vp8_idct_dc_add4uv_rvv, zve32x
>
>  ret
>  endfunc
> +
> +.macro bilin_load dst len type mn
> +.ifc \type,v
> +add t5, a2, a3
> +.elseif \type == h
> +addit5, a2, 1
> +.endif
> +vle8.v  \dst, (a2)
> +vle8.v  v2, (t5)
> +vwmulu.vx   v28, \dst, t1
> +vwmaccu.vx  v28, \mn, v2
> +vwaddu.wx   v24, v28, t4
> +vnsra.wi\dst, v24, 3
> +.endm
> +
> +.macro put_vp8_bilin_h_v len type mn
> +func ff_put_vp8_bilin\len\()_\type\()_rvv, zve32x
> +vsetvlstatic8   \len
> +li  t1, 8
> +li  t4, 4
> +sub t1, t1, \mn
> +1:
> +addia4, a4, -1
> +bilin_load  v0, \len, \type, \mn
> +vse8.v  v0, (a0)
> +add a2, a2, a3
> +add a0, a0, a1
> +bneza4, 1b
> +
> +ret
> +endfunc
> +.endm
> +
> +.irp len 16,8,4
> +put_vp8_bilin_h_v \len h a5
> +put_vp8_bilin_h_v \len v a6
> +.endr
> --
> 2.45.0
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org w

Re: [FFmpeg-devel] [PATCH 01/10] lavc/vp8dsp: R-V put_vp8_pixels

2024-05-05 Thread flow gg

I made the following changes according to the previous review:
moved the func into a macro, added a vset macro to reduce the if/else
chains, used RVI, and added the missing __riscv_xlen handling.

 于2024年5月6日周一 00:45写道：

> From: sunyuechi 
>
> C908:
> vp8_put_pixels4_c: 78.0
> vp8_put_pixels4_rvi: 33.7
> vp8_put_pixels8_c: 278.0
> vp8_put_pixels8_rvi: 55.0
> vp8_put_pixels16_c: 999.0
> vp8_put_pixels16_rvi: 86.7
> ---
>  libavcodec/riscv/Makefile  |  1 +
>  libavcodec/riscv/vp8dsp.h  | 75 ++
>  libavcodec/riscv/vp8dsp_init.c | 22 ++
>  libavcodec/riscv/vp8dsp_rvi.S  | 61 +++
>  libavcodec/vp8dsp.c|  2 +
>  libavcodec/vp8dsp.h|  1 +
>  6 files changed, 162 insertions(+)
>  create mode 100644 libavcodec/riscv/vp8dsp.h
>  create mode 100644 libavcodec/riscv/vp8dsp_rvi.S
>
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 050c08ee61..526cb5c97c 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -61,6 +61,7 @@ RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) +=
> riscv/utvideodsp_rvv.o
>  OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o
>  RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o
>  OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o
> +RV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvi.o
>  RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
>  OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
>  RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
> diff --git a/libavcodec/riscv/vp8dsp.h b/libavcodec/riscv/vp8dsp.h
> new file mode 100644
> index 00..971c5c0a96
> --- /dev/null
> +++ b/libavcodec/riscv/vp8dsp.h
> @@ -0,0 +1,75 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#ifndef AVCODEC_RISCV_VP8DSP_H
> +#define AVCODEC_RISCV_VP8DSP_H
> +
> +#include "libavcodec/vp8dsp.h"
> +
> +#define VP8_LF_Y(hv, inner, opt)
>\
> +void ff_vp8_##hv##_loop_filter16##inner##_##opt(uint8_t *dst,
> \
> +ptrdiff_t stride,
> \
> +int flim_E, int
> flim_I,  \
> +int hev_thresh)
> +
> +#define VP8_LF_UV(hv, inner, opt)
> \
> +void ff_vp8_##hv##_loop_filter8uv##inner##_##opt(uint8_t *dstU,
> \
> + uint8_t *dstV,
> \
> + ptrdiff_t stride,
>\
> + int flim_E, int
> flim_I, \
> + int hev_thresh)
> +
> +#define VP8_LF_SIMPLE(hv, opt)  \
> +void ff_vp8_##hv##_loop_filter16_simple_##opt(uint8_t *dst, \
> +  ptrdiff_t stride, \
> +  int flim)
> +
> +#define VP8_LF_HV(inner, opt)   \
> +VP8_LF_Y(h,  inner, opt);   \
> +VP8_LF_Y(v,  inner, opt);   \
> +VP8_LF_UV(h, inner, opt);   \
> +VP8_LF_UV(v, inner, opt)
> +
> +#define VP8_LF(opt) \
> +VP8_LF_HV(,   opt); \
> +VP8_LF_HV(_inner, opt); \
> +VP8_LF_SIMPLE(h, opt);  \
> +VP8_LF_SIMPLE(v, opt)
> +
> +#define VP8_MC(n, opt)  \
> +void ff_put_vp8_##n##_##opt(uint8_t *dst, ptrdiff_t dststride,  \
> +const uint8_t *src, ptrdiff_t srcstride,\
> +int h, int x, int y)
> +
> +#define VP8_EPEL(w, opt)\
> +VP8_MC(pixels ## w, opt);   \
> +VP8_MC(epel ## w ## _h4, opt);  \
> +VP8_MC(epel ## w ## _h6, opt);  \
> +VP8_MC(epel ## w ## _v4, opt);  \
> +VP8_MC(epel ## w ## _h4v4, opt);\
> +VP8_MC(epel ## w ## _h6v4, opt);\
> +VP8_MC(epel ## w ## _v6, opt);  \
> +VP8_MC(epel ## w ## _h4v6, opt);\
> +VP8_MC(epel ## w ## _h6v6, opt)
> +
> +#define VP8_BILIN(w, o

Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels

2024-05-05 Thread flow gg

> Is it not faster to compute the address ahead of time, e.g.:
> Ditto below and in other patches.

Yes, updated here, and I will check the other patches.

> Copying 64-bit quantities should not need RVV at all. Maybe the C version
needs to be improved instead, but if that is not possible, then an RVI
version
may be more portable and work just as well.

The same logic in the C version is used in other places, so it might be
difficult to modify. I've updated the patch to use RVI instead.

> Does MF2 actually improve perfs over M1 here?

The difference here seems very small, but in the cases where both mf2 and
m1 are correct, the test results have only ever shown mf2 to be better, so
I want to use mf2.

Rémi Denis-Courmont  于2024年5月5日周日 01:53写道：

> Le lauantaina 4. toukokuuta 2024, 13.01.05 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
> >
> > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.7
> > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 148.7
> > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.5
> > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2
> > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.7
> > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0
> > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.5
> > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 23.7
> > ---
> >  libavcodec/riscv/vc1dsp_init.c |  8 +
> >  libavcodec/riscv/vc1dsp_rvv.S  | 66 ++
> >  2 files changed, 74 insertions(+)
> >
> > diff --git a/libavcodec/riscv/vc1dsp_init.c
> b/libavcodec/riscv/vc1dsp_init.c
> > index e47b644f80..610c43a1a3 100644
> > --- a/libavcodec/riscv/vc1dsp_init.c
> > +++ b/libavcodec/riscv/vc1dsp_init.c
> > @@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest,
> ptrdiff_t
> > stride, int16_t *block void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest,
> > ptrdiff_t stride, int16_t *block); void
> ff_vc1_inv_trans_8x4_dc_rvv(uint8_t
> > *dest, ptrdiff_t stride, int16_t *block); void
> > ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t
> > *block); +void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src,
> > ptrdiff_t line_size, int rnd); +void ff_put_pixels8x8_rvv(uint8_t *dst,
> > const uint8_t *src, ptrdiff_t line_size, int rnd); +void
> > ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t
> > line_size, int rnd); +void ff_avg_pixels8x8_rvv(uint8_t *dst, const
> uint8_t
> > *src, ptrdiff_t line_size, int rnd);
> >
> >  av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
> >  {
> > @@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
> >  if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
> >  dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
> >  dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
> > +dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv;
> > +dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
> >  if (flags & AV_CPU_FLAG_RVV_I64) {
> >  dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
> >  dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
> > +dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv;
> > +dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
> >  }
> >  }
> >  #endif
> > diff --git a/libavcodec/riscv/vc1dsp_rvv.S
> b/libavcodec/riscv/vc1dsp_rvv.S
> > index 4a00945ead..48244f91aa 100644
> > --- a/libavcodec/riscv/vc1dsp_rvv.S
> > +++ b/libavcodec/riscv/vc1dsp_rvv.S
> > @@ -111,3 +111,69 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
> >  vsse32.v  v0, (a0), a1
> >  ret
> >  endfunc
> > +
> > +func ff_put_pixels16x16_rvv, zve32x
> > +vsetivli  zero, 16, e8, m1, ta, ma
> > +.irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
> 30
> > +vle8.vv\n, (a1)
> > +add   a1, a1, a2
> > +.endr
> > +vle8.vv31, (a1)
>
> Is it not faster to compute the address ahead of time, e.g.:
>
> add t1, a2, a1
> vle8.v vN, (a1)
> sh1add a1, a2, a1
> vle8.v vN+1, (t1)
>
> ...and so on? Even on a reordering core, you can't eliminate stall on data
> dependency if there is nothing else to be done.
>
> (Ditto below and in other patches.)
>
> > +.irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
> 30
> > +vse8.vv\n, (a0)
> > +add   a0, a0, a2
> > +.endr
> > +vse8.vv31, (a0)
> > +
> > +ret
> > +endfunc
> > +
> > +func ff_put_pixels8x8_rvv, zve64x
> > +vsetivli  zero, 8, e8, mf2, ta, ma
> > +vlse64.v  v8, (a1), a2
> > +vsse64.v  v8, (a0), a2
>
> Copying 64-bit quantities should not need RVV at all. Maybe the C version
> needs to be improved instead, but if that is not possible, then an RVI
> version
> may be more portable and work just as well.
>
> > +
> > +ret
> > +endfunc
> > +
> > +func ff_avg_pixels16x16_rvv, zve32x
> > +csrwi vxrm,

Re: [FFmpeg-devel] [PATCH 01/10] lavc/vp9dsp: R-V V ipred vert

2024-05-04 Thread flow gg

the github link: https://github.com/hleft/FFmpeg/tree/vp9

 于2024年5月4日周六 23:03写道：

> From: sunyuechi 
>
> C908:
> vp9_vert_8x8_8bpp_c: 22.0
> vp9_vert_8x8_8bpp_rvv_i64: 18.5
> vp9_vert_16x16_8bpp_c: 71.2
> vp9_vert_16x16_8bpp_rvv_i32: 50.7
> vp9_vert_32x32_8bpp_c: 300.2
> vp9_vert_32x32_8bpp_rvv_i32: 136.7
> ---
>  libavcodec/riscv/vp9_intra_rvv.S | 35 
>  libavcodec/riscv/vp9dsp.h|  6 ++
>  libavcodec/riscv/vp9dsp_init.c   |  3 +++
>  3 files changed, 44 insertions(+)
>
> diff --git a/libavcodec/riscv/vp9_intra_rvv.S
> b/libavcodec/riscv/vp9_intra_rvv.S
> index db9774c263..b5f0f9d3c3 100644
> --- a/libavcodec/riscv/vp9_intra_rvv.S
> +++ b/libavcodec/riscv/vp9_intra_rvv.S
> @@ -113,3 +113,38 @@ func_dc dc_left  8   left 3  0  zve64x
>  func_dc dc_top   32  top  5  1  zve32x
>  func_dc dc_top   16  top  4  1  zve32x
>  func_dc dc_top   8   top  3  0  zve64x
> +
> +func ff_v_32x32_rvv, zve32x
> +vsetivli zero, 8, e8, mf2, ta, ma
> +vle32.v  v8, (a3)
> +
> +.rept 31
> +vse32.v  v8, (a0)
> +add  a0, a0, a1
> +.endr
> +vse32.v  v8, (a0)
> +
> +ret
> +endfunc
> +
> +func ff_v_16x16_rvv, zve32x
> +vsetivli zero, 4, e8, mf4, ta, ma
> +vle32.v  v8, (a3)
> +
> +.rept 15
> +vse32.v  v8, (a0)
> +add  a0, a0, a1
> +.endr
> +vse32.v  v8, (a0)
> +
> +ret
> +endfunc
> +
> +func ff_v_8x8_rvv, zve64x
> +ld   t0, (a3)
> +vsetivli zero, 8, e64, m4, ta, ma
> +vmv.v.x  v8, t0
> +vsse64.v v8, (a0), a1
> +
> +ret
> +endfunc
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 25047ed507..113397ce86 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -60,6 +60,12 @@ void ff_dc_129_16x16_rvv(uint8_t *dst, ptrdiff_t
> stride, const uint8_t *l,
>   const uint8_t *a);
>  void ff_dc_129_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> const uint8_t *a);
> +void ff_v_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> +const uint8_t *a);
> +void ff_v_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> +const uint8_t *a);
> +void ff_v_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> +  const uint8_t *a);
>
>  #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
>  \
>  void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index 69ab39004c..9c550d40b5 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -36,6 +36,7 @@ static av_cold void
> vp9dsp_intrapred_init_rvv(VP9DSPContext *dsp, int bpp)
>  dsp->intra_pred[TX_8X8][DC_128_PRED] = ff_dc_128_8x8_rvv;
>  dsp->intra_pred[TX_8X8][DC_129_PRED] = ff_dc_129_8x8_rvv;
>  dsp->intra_pred[TX_8X8][TOP_DC_PRED] = ff_dc_top_8x8_rvv;
> +dsp->intra_pred[TX_8X8][VERT_PRED] = ff_v_8x8_rvv;
>  }
>
>  if (bpp == 8 && flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb()
> >= 16) {
> @@ -51,6 +52,8 @@ static av_cold void
> vp9dsp_intrapred_init_rvv(VP9DSPContext *dsp, int bpp)
>  dsp->intra_pred[TX_16X16][DC_129_PRED] = ff_dc_129_16x16_rvv;
>  dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_dc_top_32x32_rvv;
>  dsp->intra_pred[TX_16X16][TOP_DC_PRED] = ff_dc_top_16x16_rvv;
> +dsp->intra_pred[TX_32X32][VERT_PRED] = ff_v_32x32_rvv;
> +dsp->intra_pred[TX_16X16][VERT_PRED] = ff_v_16x16_rvv;
>  }
>  #endif
>  }
> --
> 2.45.0
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 01/10] lavc/vp8dsp: R-V V put_vp8_pixels

2024-05-04 Thread flow gg

I've reorganized it, and the github link is at :
https://github.com/hleft/FFmpeg/tree/vp8

 于2024年5月4日周六 22:49写道：

> From: sunyuechi 
>
> C908:
> vp8_put_pixels4_c: 87.5
> vp8_put_pixels4_rvv_i32: 42.7
> vp8_put_pixels8_c: 284.5
> vp8_put_pixels8_rvv_i32: 77.7
> vp8_put_pixels16_c: 1087.7
> vp8_put_pixels16_rvv_i32: 108.0
> ---
>  libavcodec/riscv/vp8dsp.h  | 75 ++
>  libavcodec/riscv/vp8dsp_init.c | 22 ++
>  libavcodec/riscv/vp8dsp_rvv.S  | 27 
>  libavcodec/vp8dsp.c|  2 +
>  libavcodec/vp8dsp.h|  1 +
>  5 files changed, 127 insertions(+)
>  create mode 100644 libavcodec/riscv/vp8dsp.h
>
> diff --git a/libavcodec/riscv/vp8dsp.h b/libavcodec/riscv/vp8dsp.h
> new file mode 100644
> index 00..971c5c0a96
> --- /dev/null
> +++ b/libavcodec/riscv/vp8dsp.h
> @@ -0,0 +1,75 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#ifndef AVCODEC_RISCV_VP8DSP_H
> +#define AVCODEC_RISCV_VP8DSP_H
> +
> +#include "libavcodec/vp8dsp.h"
> +
> +#define VP8_LF_Y(hv, inner, opt)
>\
> +void ff_vp8_##hv##_loop_filter16##inner##_##opt(uint8_t *dst,
> \
> +ptrdiff_t stride,
> \
> +int flim_E, int
> flim_I,  \
> +int hev_thresh)
> +
> +#define VP8_LF_UV(hv, inner, opt)
> \
> +void ff_vp8_##hv##_loop_filter8uv##inner##_##opt(uint8_t *dstU,
> \
> + uint8_t *dstV,
> \
> + ptrdiff_t stride,
>\
> + int flim_E, int
> flim_I, \
> + int hev_thresh)
> +
> +#define VP8_LF_SIMPLE(hv, opt)  \
> +void ff_vp8_##hv##_loop_filter16_simple_##opt(uint8_t *dst, \
> +  ptrdiff_t stride, \
> +  int flim)
> +
> +#define VP8_LF_HV(inner, opt)   \
> +VP8_LF_Y(h,  inner, opt);   \
> +VP8_LF_Y(v,  inner, opt);   \
> +VP8_LF_UV(h, inner, opt);   \
> +VP8_LF_UV(v, inner, opt)
> +
> +#define VP8_LF(opt) \
> +VP8_LF_HV(,   opt); \
> +VP8_LF_HV(_inner, opt); \
> +VP8_LF_SIMPLE(h, opt);  \
> +VP8_LF_SIMPLE(v, opt)
> +
> +#define VP8_MC(n, opt)  \
> +void ff_put_vp8_##n##_##opt(uint8_t *dst, ptrdiff_t dststride,  \
> +const uint8_t *src, ptrdiff_t srcstride,\
> +int h, int x, int y)
> +
> +#define VP8_EPEL(w, opt)\
> +VP8_MC(pixels ## w, opt);   \
> +VP8_MC(epel ## w ## _h4, opt);  \
> +VP8_MC(epel ## w ## _h6, opt);  \
> +VP8_MC(epel ## w ## _v4, opt);  \
> +VP8_MC(epel ## w ## _h4v4, opt);\
> +VP8_MC(epel ## w ## _h6v4, opt);\
> +VP8_MC(epel ## w ## _v6, opt);  \
> +VP8_MC(epel ## w ## _h4v6, opt);\
> +VP8_MC(epel ## w ## _h6v6, opt)
> +
> +#define VP8_BILIN(w, opt)   \
> +VP8_MC(bilin ## w ## _h, opt);  \
> +VP8_MC(bilin ## w ## _v, opt);  \
> +VP8_MC(bilin ## w ## _hv, opt)
> +
> +#endif /* AVCODEC_RISCV_VP8DSP_H */
> diff --git a/libavcodec/riscv/vp8dsp_init.c
> b/libavcodec/riscv/vp8dsp_init.c
> index af57aabb71..c364de3dc9 100644
> --- a/libavcodec/riscv/vp8dsp_init.c
> +++ b/libavcodec/riscv/vp8dsp_init.c
> @@ -24,11 +24,33 @@
>  #include "libavutil/cpu.h"
>  #include "libavutil/riscv/cpu.h"
>  #include "libavcodec/vp8dsp.h"
> +#include "vp8dsp.h"
>
>  void ff_vp8_idct_dc_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t
> stride);
>  void ff_vp8_idct_dc_add4y_rvv(uint8_t *dst, int16_t block[4][16],
> ptrdiff_t stride);
>  void ff_vp8_idct_dc_add4uv_rvv(ui

Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels

2024-05-04 Thread flow gg

Hi, it's me. I accidentally repeated it but it seems to be correct.

 于2024年5月4日周六 18:01写道：

> From: sunyuechi 
>
> vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.7
> vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 148.7
> vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.5
> vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2
> vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.7
> vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0
> vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.5
> vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 23.7
> ---
>  libavcodec/riscv/vc1dsp_init.c |  8 +
>  libavcodec/riscv/vc1dsp_rvv.S  | 66 ++
>  2 files changed, 74 insertions(+)
>
> diff --git a/libavcodec/riscv/vc1dsp_init.c
> b/libavcodec/riscv/vc1dsp_init.c
> index e47b644f80..610c43a1a3 100644
> --- a/libavcodec/riscv/vc1dsp_init.c
> +++ b/libavcodec/riscv/vc1dsp_init.c
> @@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest,
> ptrdiff_t stride, int16_t *block
>  void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t
> *block);
>  void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t
> *block);
>  void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t
> *block);
> +void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t
> line_size, int rnd);
> +void ff_put_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t
> line_size, int rnd);
> +void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t
> line_size, int rnd);
> +void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t
> line_size, int rnd);
>
>  av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
>  {
> @@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
>  if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
>  dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
>  dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
> +dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv;
> +dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
>  if (flags & AV_CPU_FLAG_RVV_I64) {
>  dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
>  dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
> +dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv;
> +dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
>  }
>  }
>  #endif
> diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
> index 4a00945ead..48244f91aa 100644
> --- a/libavcodec/riscv/vc1dsp_rvv.S
> +++ b/libavcodec/riscv/vc1dsp_rvv.S
> @@ -111,3 +111,69 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
>  vsse32.v  v0, (a0), a1
>  ret
>  endfunc
> +
> +func ff_put_pixels16x16_rvv, zve32x
> +vsetivli  zero, 16, e8, m1, ta, ma
> +.irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
> +vle8.vv\n, (a1)
> +add   a1, a1, a2
> +.endr
> +vle8.vv31, (a1)
> +.irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
> +vse8.vv\n, (a0)
> +add   a0, a0, a2
> +.endr
> +vse8.vv31, (a0)
> +
> +ret
> +endfunc
> +
> +func ff_put_pixels8x8_rvv, zve64x
> +vsetivli  zero, 8, e8, mf2, ta, ma
> +vlse64.v  v8, (a1), a2
> +vsse64.v  v8, (a0), a2
> +
> +ret
> +endfunc
> +
> +func ff_avg_pixels16x16_rvv, zve32x
> +csrwi vxrm, 0
> +vsetivli  zero, 16, e8, m1, ta, ma
> +lit0, 128
> +
> +.irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
> +vle8.vv\n, (a1)
> +add   a1, a1, a2
> +.endr
> +vle8.vv31, (a1)
> +.irp n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
> +vle8.vv\n, (a0)
> +add   a0, a0, a2
> +.endr
> +vle8.vv15, (a0)
> +vsetvli   zero, t0, e8, m8, ta, ma
> +vaaddu.vv v0, v0, v16
> +vaaddu.vv v8, v8, v24
> +vsetivli  zero, 16, e8, m1, ta, ma
> +.irp n  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
> +vse8.vv\n, (a0)
> +sub   a0, a0, a2
> +.endr
> +vse8.vv0, (a0)
> +
> +ret
> +endfunc
> +
> +func ff_avg_pixels8x8_rvv, zve64x
> +csrwi vxrm, 0
> +lit0, 64
> +vsetivli  zero, 8, e8, mf2, ta, ma
> +vlse64.v  v16, (a1), a2
> +vlse64.v  v8, (a0), a2
> +vsetvli   zero, t0, e8, m4, ta, ma
> +vaaddu.vv v16, v16, v8
> +vsetivli  zero, 8, e8, mf2, ta, ma
> +vsse64.v  v16, (a0), a2
> +
> +ret
> +endfunc
> --
> 2.45.0
>
> ___

Re: [FFmpeg-devel] [RFC] 5 year plan & Inovation

2024-05-03 Thread flow gg

I saw the discussion comparing email with GitLab/GitHub. I have not fully
understood all of their advantages and disadvantages, but I want to say that
I support switching to GitLab/GitHub.

Simple reason:

If git-send-email is required, I may not be able to submit any code.
If git-send-email is not required, the process is troublesome for both the
reviewer and the contributor.

In detail:

I have tried git-send-email, but it failed. You can say that I am stupid,
but I would say that this is due to various reasons such as my region and
the network; it is really not something I can solve.
Maybe I will spend a lot of energy trying it again in the future, but only
because I have already submitted thousands of lines of code and don't want
to give up. A newcomer facing this from the beginning would simply give up.

I may be one of the younger people here in FFmpeg. There are many talented
young people around me; they all use GitHub/GitLab by default, and they
would run into the same problem as I did and give up.

I don't really care which of these tools is better. I think people are what
matter. I just want a tool that I can use and that makes reviewing easier
for the actual reviewers.

I don't know if I can say my personal feelings here, but I will say:

I feel despised by this passage, which makes me uncomfortable. If you are a
reviewer, maybe I have no chance to contribute, but anyway, I have made
some contributions.

> How can anyone use git, but not git send-email? Any decent email provider
has support for external clients over SMTP. And I believe you *can*
actually
dictate that people don't attach patches -- if you have control over the
mailing list software, you can set up a filter that rejects such emails
and auto-replies with instructions on how to send them properly.

I think I should have the right to contribute

Ondřej Fiala  于2024年5月2日周四 22:25写道：

> On Wed May 1, 2024 at 7:27 AM CEST, Rémi Denis-Courmont wrote:
> > Le 30 avril 2024 22:15:10 GMT+03:00, "Ondřej Fiala" 
> a écrit :
> > >On Tue Apr 30, 2024 at 9:06 PM CEST, Hendrik Leppkes wrote:
> > >> I will take the replacement instead, thanks. Email is archaic. The
> > >> entire point is to get away from email, not dress it up.
> > >> SourceHut usage would likely make me even less interested then today.
> > >>
> > >> - Hendrik
> > >I guess that depends on how (and with what) you use it. Using it with
> > >Gmail UI for example is obviously not a great idea. No idea whether you
> > >do, but if you do, you should be upset at Gmail, not email.
> >
> > I don't use Gmail, and using email for review still sucks. No matter how
> you
> > slice it, email was not meant for threaded code reviews.
> Email was not meant for a lot of what it's used for today. Many email
> clients
> have support for threading, and unlike GitHub allow threads of arbitrary
> depth. Using such a client with commands for moving between messages in
> a thread etc. makes threaded code review over email quite usable in my
> opinion.
>
> > Also while I can use git-send-email, not everyone can. And patches as
> > attachments are simply awful. Unfortunately I can't dictate that people
> don't
> > send patches that way.
> How can anyone use git, but not git send-email? Any decent email provider
> has support for external clients over SMTP. And I believe you *can*
> actually
> dictate that people don't attach patches -- if you have control over the
> mailing list software, you can set up a filter that rejects such emails
> and auto-replies with instructions on how to send them properly.
>
> > >But you did not answer my question: which specific code review features
> > >are you missing?
> >
> > Proper threaded reviews with state tracking, ability to collapse and
> expand
> > context and files, and proper listing of open MR (*not* like patchwork).
> I can sort of understand everything except the last one. What is "a proper
> listing of open MR" supposed to mean...? (I know what a merge request is,
> of course, but I don't get how the way GitLab lists them is supposedly
> superior to SourceHut's list of patches.)
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 2/4] lavc/vp9dsp: R-V V ipred vert

2024-05-02 Thread flow gg

Sorry, this was because a 'bpp == 8' check was missing. It has been fixed
at this link

Rémi Denis-Courmont  于2024年5月2日周四 22:11写道：

> Le tiistaina 30. huhtikuuta 2024, 2.36.22 EEST flow gg a écrit :
> > updated it in the reply and https://github.com/hleft/FFmpeg/tree/vp8vp9
>
> VP9 checkasm does not pass on that branch.
>
> > Rémi Denis-Courmont  于2024年4月30日周二 01:57写道：
> >
> > > Le perjantaina 22. maaliskuuta 2024, 8.02.38 EEST flow gg a écrit :
> > > > Because the previous patch was updated, so it was updated in this
> > >
> > > response
> > >
> > > Seemingly needs rebase since April 7.
> > >
> > > --
> > > レミ・デニ-クールモン
> > > http://www.remlab.net/
> > >
> > >
> > >
> > > ___
> > > ffmpeg-devel mailing list
> > > ffmpeg-devel@ffmpeg.org
> > > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> > >
> > > To unsubscribe, visit link above, or email
> > > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/2] lavc/rv40dsp: R-V V chroma_mc

2024-04-30 Thread flow gg


From 3e66b2bbe257cc91a4c2169362163e92aba6760b Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Tue, 30 Apr 2024 18:24:00 +0800
Subject: [PATCH 2/2] lavc/rv40dsp: R-V V chroma_mc

This is similar to h264, but here we use manual_avg instead of vaaddu
because rv40's OP differs from h264. If we use vaaddu,
rv40 would need to repeatedly switch between vxrm=0 and vxrm=2,
and switching vxrm is very slow.

C908:
avg_chroma_mc4_c: 2330.0
avg_chroma_mc4_rvv_i32: 602.7
avg_chroma_mc8_c: 1211.0
avg_chroma_mc8_rvv_i32: 602.7
put_chroma_mc4_c: 1825.0
put_chroma_mc4_rvv_i32: 414.7
put_chroma_mc8_c: 932.0
put_chroma_mc8_rvv_i32: 414.7
---
 libavcodec/riscv/Makefile   |   2 +
 libavcodec/riscv/rv40dsp_init.c |  51 +
 libavcodec/riscv/rv40dsp_rvv.S  | 371 
 libavcodec/rv34dsp.h|   1 +
 libavcodec/rv40dsp.c|   2 +
 5 files changed, 427 insertions(+)
 create mode 100644 libavcodec/riscv/rv40dsp_init.c
 create mode 100644 libavcodec/riscv/rv40dsp_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index dce1236b84..43b5c21cf4 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -50,6 +50,8 @@ RV-OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_rvi.o
 RVV-OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_rvv.o
 OBJS-$(CONFIG_RV34DSP) += riscv/rv34dsp_init.o
 RVV-OBJS-$(CONFIG_RV34DSP) += riscv/rv34dsp_rvv.o
+OBJS-$(CONFIG_RV40_DECODER) += riscv/rv40dsp_init.o
+RVV-OBJS-$(CONFIG_RV40_DECODER) += riscv/rv40dsp_rvv.o
 OBJS-$(CONFIG_SVQ1_ENCODER) += riscv/svqenc_init.o
 RVV-OBJS-$(CONFIG_SVQ1_ENCODER) += riscv/svqenc_rvv.o
 OBJS-$(CONFIG_TAK_DECODER) += riscv/takdsp_init.o
diff --git a/libavcodec/riscv/rv40dsp_init.c b/libavcodec/riscv/rv40dsp_init.c
new file mode 100644
index 00..f5a5510b28
--- /dev/null
+++ b/libavcodec/riscv/rv40dsp_init.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/rv34dsp.h"
+
+void ff_put_rv40_chroma_mc8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_put_rv40_chroma_mc4_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+
+void ff_avg_rv40_chroma_mc8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_avg_rv40_chroma_mc4_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+
+av_cold void ff_rv40dsp_init_riscv(RV34DSPContext *c)
+{
+#if HAVE_RVV
+int flags = av_get_cpu_flags();
+
+if ((flags & AV_CPU_FLAG_RVV_I32) && ff_get_rv_vlenb() >= 16 &&
+(flags & AV_CPU_FLAG_RVB_ADDR)) {
+c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_rvv;
+c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_rvv;
+c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_rvv;
+c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_rvv;
+}
+#endif
+}
diff --git a/libavcodec/riscv/rv40dsp_rvv.S b/libavcodec/riscv/rv40dsp_rvv.S
new file mode 100644
index 00..e49345ef70
--- /dev/null
+++ b/libavcodec/riscv/rv40dsp_rvv.S
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth F

[FFmpeg-devel] [PATCH 1/2] checkasm/rv40dsp: add chroma_mc test

2024-04-30 Thread flow gg


From 07c0b8a26b76e31c46ecabddb251f317c48c73a3 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Tue, 30 Apr 2024 12:43:57 +0800
Subject: [PATCH 1/2] checkasm/rv40dsp: add chroma_mc test

This is similar to h264.
---
 tests/checkasm/Makefile   |  1 +
 tests/checkasm/checkasm.c |  3 ++
 tests/checkasm/checkasm.h |  1 +
 tests/checkasm/rv40dsp.c  | 75 +++
 tests/fate/checkasm.mak   |  1 +
 5 files changed, 81 insertions(+)
 create mode 100644 tests/checkasm/rv40dsp.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index d846a48585..559d88cba4 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -35,6 +35,7 @@ AVCODECOBJS-$(CONFIG_OPUS_DECODER)  += opusdsp.o
 AVCODECOBJS-$(CONFIG_PIXBLOCKDSP)   += pixblockdsp.o
 AVCODECOBJS-$(CONFIG_HEVC_DECODER)  += hevc_add_res.o hevc_deblock.o hevc_idct.o hevc_sao.o hevc_pel.o
 AVCODECOBJS-$(CONFIG_RV34DSP)   += rv34dsp.o
+AVCODECOBJS-$(CONFIG_RV40_DECODER)  += rv40dsp.o
 AVCODECOBJS-$(CONFIG_SVQ1_ENCODER)  += svq1enc.o
 AVCODECOBJS-$(CONFIG_TAK_DECODER)   += takdsp.o
 AVCODECOBJS-$(CONFIG_UTVIDEO_DECODER)   += utvideodsp.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index ffc04f0623..e007cd59a5 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -170,6 +170,9 @@ static const struct {
 #if CONFIG_RV34DSP
 { "rv34dsp", checkasm_check_rv34dsp },
 #endif
+#if CONFIG_RV40_DECODER
+{ "rv40dsp", checkasm_check_rv40dsp },
+#endif
 #if CONFIG_SVQ1_ENCODER
 { "svq1enc", checkasm_check_svq1enc },
 #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 1f31591ac0..3dadbb00ad 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -113,6 +113,7 @@ void checkasm_check_opusdsp(void);
 void checkasm_check_pixblockdsp(void);
 void checkasm_check_sbrdsp(void);
 void checkasm_check_rv34dsp(void);
+void checkasm_check_rv40dsp(void);
 void checkasm_check_svq1enc(void);
 void checkasm_check_synth_filter(void);
 void checkasm_check_sw_gbrp(void);
diff --git a/tests/checkasm/rv40dsp.c b/tests/checkasm/rv40dsp.c
new file mode 100644
index 00..a1a873d430
--- /dev/null
+++ b/tests/checkasm/rv40dsp.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include 
+#include 
+#include "checkasm.h"
+#include "libavcodec/rv40dsp.c"
+#include "libavutil/mem_internal.h"
+
+#define randomize_buffers()  \
+do { \
+for (int i = 0; i < 16*18*2; i++)\
+src[i] = rnd() & 0x3;\
+} while (0)
+
+static void check_chroma_mc(void)
+{
+RV34DSPContext h;
+LOCAL_ALIGNED_32(uint8_t, src,  [16 * 18 * 2]);
+LOCAL_ALIGNED_32(uint8_t, dst0, [16 * 18 * 2]);
+LOCAL_ALIGNED_32(uint8_t, dst1, [16 * 18 * 2]);
+
+declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, const uint8_t *src,
+  ptrdiff_t stride, int h, int x, int y);
+
+ff_rv40dsp_init(&h);
+randomize_buffers();
+for (int size = 0; size < 2; size++) {
+
+#define CHECK_CHROMA_MC(name) \
+do {  \
+if (check_func(h.name## _pixels_tab[size], #name "_mc%d", 1 << (3 - size))) { \
+for (int x = 0; x < 2; x++) { \
+for (int y = 0; y < 2; y++) { \
+memcpy(dst0, src, 16 * 18);   \
+memcpy(dst1, src, 16 * 18);   \
+call_ref(dst0, src, 16, 16, x, y);\
+call_new(dst1, src, 16, 16, x, y);\
+if (memcmp(dst0, dst1, 16 * 16)) {\
+fprintf(stderr, #na

Re: [FFmpeg-devel] [PATCH 2/2] lavc/blockdsp: R-V V fill_block

2024-04-30 Thread flow gg

Since the number of stores is controlled by a3 rather than by zero, it doesn't
have to be exactly 16 bytes, does it?

Rémi Denis-Courmont  于2024年4月30日周二 14:40写道：

>
>
> Le 30 avril 2024 03:26:25 GMT+03:00, flow gg  a
> écrit :
> >Hi, I initially used a loop, but according to libavcodec/blockdsp.h,
> >
> >the maximum is 8x16 = 128 bytes, so using ff_get_rv_vlenb() >= 16 and m8
> >does not require a loop.
>
> It's okay to assume that VLENB is at least 16 bytes (as long as it's
> checked), but the code seems to assume (?) that it's *exactly* 16 bytes,
> which will break on future hardware.
>
> >
> >```
> >/* add and put pixel (decoding)
> > * Block sizes for op_pixels_func are 8x4,8x8 16x8 16x16.
> > * h for op_pixels_func is limited to { width / 2, width },
> > * but never larger than 16 and never smaller than 4. */
> >typedef void (*op_fill_func)(uint8_t *block /* align width (8 or 16) */,
> > uint8_t value, ptrdiff_t line_size, int h);
> >```
> >
> >Rémi Denis-Courmont  于2024年4月30日周二 01:31写道：
> >
> >> Le maanantaina 29. huhtikuuta 2024, 10.09.41 EEST flow gg a écrit :
> >> >
> >>
> >> Are you sure that this works with all vector lengths?
> >> The block8 code looks odd.
> >>
> >> --
> >> レミ・デニ-クールモン
> >> http://www.remlab.net/
> >> ___
> >> ffmpeg-devel mailing list
> >> ffmpeg-devel@ffmpeg.org
> >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >>
> >> To unsubscribe, visit link above, or email
> >> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
> >>
> >___
> >ffmpeg-devel mailing list
> >ffmpeg-devel@ffmpeg.org
> >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> >To unsubscribe, visit link above, or email
> >ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/2] checkasm/blockdsp: add fill_block test

2024-04-29 Thread flow gg

Since there is no 8x16 case, the test no longer covers 8x16; the updated patch is in this reply.

flow gg  于2024年4月29日周一 15:09写道：

>
>
From fc7c28cb78e0c90880f31c0b8d6f2fc16d0fe581 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Mon, 29 Apr 2024 14:18:23 +0800
Subject: [PATCH 1/2] checkasm/blockdsp: add fill_block test

---
 tests/checkasm/blockdsp.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/tests/checkasm/blockdsp.c b/tests/checkasm/blockdsp.c
index 22a2f79455..19d69b8687 100644
--- a/tests/checkasm/blockdsp.c
+++ b/tests/checkasm/blockdsp.c
@@ -29,6 +29,11 @@
 #include "libavutil/intreadwrite.h"
 #include "libavutil/mem_internal.h"
 
+typedef struct {
+const char *name;
+int size;
+} test;
+
 #define randomize_buffers(size) \
 do {\
 int i;  \
@@ -52,6 +57,30 @@ do {\
 }   \
 } while (0)
 
+static void check_fill(BlockDSPContext *h){ /* compare each fill_block_tab[] entry of h against the reference and benchmark it */
+const test tests[] = { /* one entry per fill_block_tab slot: report name and block size */
+{"fill_block_tab[0]", 16},
+{"fill_block_tab[1]", 8},
+};
+LOCAL_ALIGNED_32(uint8_t, buf0, [16 * 16]); /* output of the reference call; sized for the 16x16 case */
+LOCAL_ALIGNED_32(uint8_t, buf1, [16 * 16]); /* output of the implementation under test */
+
+for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
+int n = tests[t].size; /* n is used as both line_size and h below, i.e. a square n x n block */
+declare_func(void, uint8_t *block, uint8_t value,
+ ptrdiff_t line_size, int h);
+if (check_func(h->fill_block_tab[t], "blockdsp.%s", tests[t].name)) {
+uint8_t value = rnd(); /* random fill byte */
+randomize_buffers(tests[t].size); /* macro body is outside this hunk; presumably seeds buf0 and buf1 with matching random data - confirm */
+call_ref(buf0, value, n, n);
+call_new(buf1, value, n, n);
+if (memcmp(buf0, buf1, sizeof(*buf0) * n * n)) /* rows are contiguous because line_size == n, so n * n bytes cover the block */
+fail();
+bench_new(buf0, value, n, n);
+}
+}
+}
+
 void checkasm_check_blockdsp(void)
 {
 LOCAL_ALIGNED_32(uint16_t, buf0, [6 * 8 * 8]);
@@ -64,5 +93,7 @@ void checkasm_check_blockdsp(void)
 check_clear(clear_block,  8 * 8);
 check_clear(clear_blocks, 8 * 8 * 6);
 
+check_fill(&h);
+
 report("blockdsp");
 }
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 2/2] lavc/blockdsp: R-V V fill_block

2024-04-29 Thread flow gg

Since there is no 8x16 case, I changed m8 to m4; the updated patch is in this reply.



flow gg  于2024年4月30日周二 08:26写道：

> Hi, I initially used a loop, but according to libavcodec/blockdsp.h,
>
> the maximum is 8x16 = 128 bytes, so using ff_get_rv_vlenb() >= 16 and m8
> does not require a loop.
>
> ```
> /* add and put pixel (decoding)
>  * Block sizes for op_pixels_func are 8x4,8x8 16x8 16x16.
>  * h for op_pixels_func is limited to { width / 2, width },
>  * but never larger than 16 and never smaller than 4. */
> typedef void (*op_fill_func)(uint8_t *block /* align width (8 or 16) */,
>  uint8_t value, ptrdiff_t line_size, int h);
> ```
>
> Rémi Denis-Courmont  于2024年4月30日周二 01:31写道：
>
>> Le maanantaina 29. huhtikuuta 2024, 10.09.41 EEST flow gg a écrit :
>> >
>>
>> Are you sure that this works with all vector lengths?
>> The block8 code looks odd.
>>
>> --
>> レミ・デニ-クールモン
>> http://www.remlab.net/
>> ___
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>>
>
From 38068cd4c770b24ac494bddab6c3d19149d2f5cb Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Wed, 6 Mar 2024 12:46:03 +0800
Subject: [PATCH 2/2] lavc/blockdsp: R-V V fill_block

C908:
blockdsp.fill_block_tab[0]_c: 549.7
blockdsp.fill_block_tab[0]_rvv_i64: 48.2
blockdsp.fill_block_tab[1]_c: 77.0
blockdsp.fill_block_tab[1]_rvv_i64: 19.7
---
 libavcodec/riscv/blockdsp_init.c |  6 ++
 libavcodec/riscv/blockdsp_rvv.S  | 21 +
 2 files changed, 27 insertions(+)

diff --git a/libavcodec/riscv/blockdsp_init.c b/libavcodec/riscv/blockdsp_init.c
index 59b2f9d47b..42c8e87fa7 100644
--- a/libavcodec/riscv/blockdsp_init.c
+++ b/libavcodec/riscv/blockdsp_init.c
@@ -27,6 +27,10 @@
 
 void ff_clear_block_rvv(int16_t *block);
 void ff_clear_blocks_rvv(int16_t *block);
+void ff_fill_block16_rvv(uint8_t *block, uint8_t value, ptrdiff_t line_size,
+   int h);
+void ff_fill_block8_rvv(uint8_t *block, uint8_t value, ptrdiff_t line_size,
+   int h);
 
 av_cold void ff_blockdsp_init_riscv(BlockDSPContext *c)
 {
@@ -36,6 +40,8 @@ av_cold void ff_blockdsp_init_riscv(BlockDSPContext *c)
 if (flags & AV_CPU_FLAG_RVV_I64 && ff_get_rv_vlenb() >= 16) {
 c->clear_block = ff_clear_block_rvv;
 c->clear_blocks = ff_clear_blocks_rvv;
+c->fill_block_tab[0] = ff_fill_block16_rvv;
+c->fill_block_tab[1] = ff_fill_block8_rvv;
 }
 #endif
 }
diff --git a/libavcodec/riscv/blockdsp_rvv.S b/libavcodec/riscv/blockdsp_rvv.S
index 8bb00bb467..18ab17da00 100644
--- a/libavcodec/riscv/blockdsp_rvv.S
+++ b/libavcodec/riscv/blockdsp_rvv.S
@@ -40,3 +40,24 @@ func ff_clear_blocks_rvv, zve64x
 
 ret
 endfunc
+
+func ff_fill_block16_rvv, zve32x   // fill h rows of 16 bytes: a0 = block, a1 = value, a2 = line_size, a3 = h
+vsetivli  t0, 16, e8, m1, ta, ma   // vl = 16 byte elements, one full row per store (init code requires vlenb >= 16)
+vmv.v.x   v8, a1                   // splat the fill byte into v8
+1:
+addi  a3, a3, -1                   // one row fewer left to write
+vse8.v    v8, (a0)                 // store the current 16-byte row
+add   a0, a0, a2                   // advance to the next row
+bnez  a3, 1b                       // loop while rows remain; the loop body runs before the test, so h must be > 0
+
+ret
+endfunc
+
+func ff_fill_block8_rvv, zve64x    // fill h rows of 8 bytes: a0 = block, a1 = value, a2 = line_size, a3 = h
+vsetvli   t0, zero, e8, m4, ta, ma // vl = VLMAX bytes so the splat covers the whole v8 register group
+vmv.v.x   v8, a1                   // every byte of the group = value
+vsetvli   t0, a3, e64, m4, ta, ma  // view the group as 64-bit elements, one per row; vl = h when h <= VLMAX
+vsse64.v  v8, (a0), a2             // strided store: 8 bytes per row, rows line_size bytes apart
+
+ret                                // NOTE(review): no loop, so rows beyond VLMAX (8 at VLEN=128 with e64/m4) are not written - confirm callers keep h <= 8
+endfunc
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 2/2] lavc/blockdsp: R-V V fill_block

2024-04-29 Thread flow gg

Hi, I initially used a loop, but according to libavcodec/blockdsp.h,

the maximum is 8x16 = 128 bytes, so using ff_get_rv_vlenb() >= 16 and m8
does not require a loop.

```
/* add and put pixel (decoding)
 * Block sizes for op_pixels_func are 8x4,8x8 16x8 16x16.
 * h for op_pixels_func is limited to { width / 2, width },
 * but never larger than 16 and never smaller than 4. */
typedef void (*op_fill_func)(uint8_t *block /* align width (8 or 16) */,
 uint8_t value, ptrdiff_t line_size, int h);
```

Rémi Denis-Courmont  于2024年4月30日周二 01:31写道：

> Le maanantaina 29. huhtikuuta 2024, 10.09.41 EEST flow gg a écrit :
> >
>
> Are you sure that this works with all vector lengths?
> The block8 code looks odd.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 2/4] lavc/vp9dsp: R-V V ipred vert

2024-04-29 Thread flow gg

I have updated it in this reply and at https://github.com/hleft/FFmpeg/tree/vp8vp9

Rémi Denis-Courmont  于2024年4月30日周二 01:57写道：

> Le perjantaina 22. maaliskuuta 2024, 8.02.38 EEST flow gg a écrit :
> > Because the previous patch was updated, so it was updated in this
> response
>
> Seemingly needs rebase since April 7.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
From f9a3d9d10536520c8a0b34de46fd5804796207ac Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Sun, 7 Apr 2024 13:21:02 +0800
Subject: [PATCH 08/20] lavc/vp9dsp: R-V V ipred vert

C908:
vp9_vert_8x8_8bpp_c: 22.0
vp9_vert_8x8_8bpp_rvv_i64: 18.5
vp9_vert_16x16_8bpp_c: 71.2
vp9_vert_16x16_8bpp_rvv_i32: 50.7
vp9_vert_32x32_8bpp_c: 300.2
vp9_vert_32x32_8bpp_rvv_i32: 136.7
---
 libavcodec/riscv/vp9_intra_rvv.S | 35 
 libavcodec/riscv/vp9dsp.h|  6 ++
 libavcodec/riscv/vp9dsp_init.c   |  3 +++
 3 files changed, 44 insertions(+)

diff --git a/libavcodec/riscv/vp9_intra_rvv.S b/libavcodec/riscv/vp9_intra_rvv.S
index db9774c263..b5f0f9d3c3 100644
--- a/libavcodec/riscv/vp9_intra_rvv.S
+++ b/libavcodec/riscv/vp9_intra_rvv.S
@@ -113,3 +113,38 @@ func_dc dc_left  8   left 3  0  zve64x
 func_dc dc_top   32  top  5  1  zve32x
 func_dc dc_top   16  top  4  1  zve32x
 func_dc dc_top   8   top  3  0  zve64x
+
+func ff_v_32x32_rvv, zve32x   // copy the 32-byte row at a into 32 rows of dst: a0 = dst, a1 = stride, a3 = a (a2 = l is not read)
+vsetivli zero, 8, e8, mf2, ta, ma   // vl = 8; the EEW=32 accesses below then move 8 x 4 = 32 bytes
+vle32.v  v8, (a3)   // load the 32 source bytes
+
+.rept 31   // rows 0..30: store, then advance dst
+vse32.v  v8, (a0)
+add  a0, a0, a1
+.endr
+vse32.v  v8, (a0)   // row 31: last store, no pointer update needed
+
+ret
+endfunc
+
+func ff_v_16x16_rvv, zve32x   // copy the 16-byte row at a into 16 rows of dst: a0 = dst, a1 = stride, a3 = a (a2 = l is not read)
+vsetivli zero, 4, e8, mf4, ta, ma   // vl = 4; the EEW=32 accesses below then move 4 x 4 = 16 bytes
+vle32.v  v8, (a3)   // load the 16 source bytes
+
+.rept 15   // rows 0..14: store, then advance dst
+vse32.v  v8, (a0)
+add  a0, a0, a1
+.endr
+vse32.v  v8, (a0)   // row 15: last store, no pointer update needed
+
+ret
+endfunc
+
+func ff_v_8x8_rvv, zve64x   // copy the 8-byte row at a into 8 rows of dst: a0 = dst, a1 = stride, a3 = a (a2 = l is not read)
+ld   t0, (a3)   // fetch all 8 source bytes with one 64-bit scalar load
+vsetivli zero, 8, e64, m4, ta, ma   // 8 elements of 64 bits: one element per output row
+vmv.v.x  v8, t0   // replicate the row into every element
+vsse64.v v8, (a0), a1   // strided store: element i goes to dst + i * stride
+
+ret
+endfunc
diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
index 25047ed507..113397ce86 100644
--- a/libavcodec/riscv/vp9dsp.h
+++ b/libavcodec/riscv/vp9dsp.h
@@ -60,6 +60,12 @@ void ff_dc_129_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
  const uint8_t *a);
 void ff_dc_129_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
const uint8_t *a);
+void ff_v_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
+const uint8_t *a);
+void ff_v_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
+const uint8_t *a);
+void ff_v_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
+  const uint8_t *a);
 
 #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) \
 void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 69ab39004c..9c550d40b5 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -36,6 +36,7 @@ static av_cold void vp9dsp_intrapred_init_rvv(VP9DSPContext *dsp, int bpp)
 dsp->intra_pred[TX_8X8][DC_128_PRED] = ff_dc_128_8x8_rvv;
 dsp->intra_pred[TX_8X8][DC_129_PRED] = ff_dc_129_8x8_rvv;
 dsp->intra_pred[TX_8X8][TOP_DC_PRED] = ff_dc_top_8x8_rvv;
+dsp->intra_pred[TX_8X8][VERT_PRED] = ff_v_8x8_rvv;
 }
 
 if (bpp == 8 && flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
@@ -51,6 +52,8 @@ static av_cold void vp9dsp_intrapred_init_rvv(VP9DSPContext *dsp, int bpp)
 dsp->intra_pred[TX_16X16][DC_129_PRED] = ff_dc_129_16x16_rvv;
 dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_dc_top_32x32_rvv;
 dsp->intra_pred[TX_16X16][TOP_DC_PRED] = ff_dc_top_16x16_rvv;
+dsp->intra_pred[TX_32X32][VERT_PRED] = ff_v_32x32_rvv;
+dsp->intra_pred[TX_16X16][VERT_PRED] = ff_v_16x16_rvv;
 }
 #endif
 }
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/2] lavc/blockdsp: R-V V fill_block

2024-04-29 Thread flow gg


From 4315f4e4774e3006d7cc55b6d235cb80e0173cf9 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Wed, 6 Mar 2024 12:46:03 +0800
Subject: [PATCH 2/2] lavc/blockdsp: R-V V fill_block

C908:
blockdsp.fill_block_tab[0]_c: 550.0
blockdsp.fill_block_tab[0]_rvv_i64: 48.2
blockdsp.fill_block_tab[1]_c: 148.7
blockdsp.fill_block_tab[1]_rvv_i64: 29.7
---
 libavcodec/riscv/blockdsp_init.c |  6 ++
 libavcodec/riscv/blockdsp_rvv.S  | 21 +
 2 files changed, 27 insertions(+)

diff --git a/libavcodec/riscv/blockdsp_init.c b/libavcodec/riscv/blockdsp_init.c
index 59b2f9d47b..42c8e87fa7 100644
--- a/libavcodec/riscv/blockdsp_init.c
+++ b/libavcodec/riscv/blockdsp_init.c
@@ -27,6 +27,10 @@
 
 void ff_clear_block_rvv(int16_t *block);
 void ff_clear_blocks_rvv(int16_t *block);
+void ff_fill_block16_rvv(uint8_t *block, uint8_t value, ptrdiff_t line_size,
+   int h);
+void ff_fill_block8_rvv(uint8_t *block, uint8_t value, ptrdiff_t line_size,
+   int h);
 
 av_cold void ff_blockdsp_init_riscv(BlockDSPContext *c)
 {
@@ -36,6 +40,8 @@ av_cold void ff_blockdsp_init_riscv(BlockDSPContext *c)
 if (flags & AV_CPU_FLAG_RVV_I64 && ff_get_rv_vlenb() >= 16) {
 c->clear_block = ff_clear_block_rvv;
 c->clear_blocks = ff_clear_blocks_rvv;
+c->fill_block_tab[0] = ff_fill_block16_rvv;
+c->fill_block_tab[1] = ff_fill_block8_rvv;
 }
 #endif
 }
diff --git a/libavcodec/riscv/blockdsp_rvv.S b/libavcodec/riscv/blockdsp_rvv.S
index 8bb00bb467..71d72cce56 100644
--- a/libavcodec/riscv/blockdsp_rvv.S
+++ b/libavcodec/riscv/blockdsp_rvv.S
@@ -40,3 +40,24 @@ func ff_clear_blocks_rvv, zve64x
 
 ret
 endfunc
+
+func ff_fill_block16_rvv, zve32x   // fill h rows of 16 bytes: a0 = block, a1 = value, a2 = line_size, a3 = h
+vsetivli  t0, 16, e8, m1, ta, ma   // vl = 16 byte elements, one full row per store (init code requires vlenb >= 16)
+vmv.v.x   v8, a1                   // splat the fill byte into v8
+1:
+addi  a3, a3, -1                   // one row fewer left to write
+vse8.v    v8, (a0)                 // store the current 16-byte row
+add   a0, a0, a2                   // advance to the next row
+bnez  a3, 1b                       // loop while rows remain; the loop body runs before the test, so h must be > 0
+
+ret
+endfunc
+
+func ff_fill_block8_rvv, zve64x    // fill h rows of 8 bytes: a0 = block, a1 = value, a2 = line_size, a3 = h
+vsetvli   t0, zero, e8, m8, ta, ma // vl = VLMAX bytes so the splat covers the whole v8 register group
+vmv.v.x   v8, a1                   // every byte of the group = value
+vsetvli   t0, a3, e64, m8, ta, ma  // view the group as 64-bit elements, one per row; vl = h when h <= VLMAX
+vsse64.v  v8, (a0), a2             // strided store: 8 bytes per row, rows line_size bytes apart
+
+ret                                // NOTE(review): no loop, so rows beyond VLMAX (16 at VLEN=128 with e64/m8) are not written - confirm callers keep h <= 16
+endfunc
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] checkasm/blockdsp: add fill_block test

2024-04-29 Thread flow gg


From 0c196a37cb4036d8c618c06c02a011b910cc56ce Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Mon, 29 Apr 2024 14:18:23 +0800
Subject: [PATCH 1/2] checkasm/blockdsp: add fill_block test

---
 tests/checkasm/blockdsp.c | 32 
 1 file changed, 32 insertions(+)

diff --git a/tests/checkasm/blockdsp.c b/tests/checkasm/blockdsp.c
index 22a2f79455..355e111d43 100644
--- a/tests/checkasm/blockdsp.c
+++ b/tests/checkasm/blockdsp.c
@@ -29,6 +29,11 @@
 #include "libavutil/intreadwrite.h"
 #include "libavutil/mem_internal.h"
 
+typedef struct {
+const char *name;
+int size;
+} test;
+
 #define randomize_buffers(size) \
 do {\
 int i;  \
@@ -52,6 +57,31 @@ do {\
 }   \
 } while (0)
 
+static void check_fill(BlockDSPContext *h){ /* compare each fill_block_tab[] entry of h against the reference and benchmark it */
+const test tests[] = { /* one entry per fill_block_tab slot: report name and block width */
+{"fill_block_tab[0]", 16},
+{"fill_block_tab[1]", 8},
+};
+const int n = 16; /* row count passed as h for both entries */
+
+LOCAL_ALIGNED_32(uint8_t, buf0, [16 * 32]); /* output of the reference call */
+LOCAL_ALIGNED_32(uint8_t, buf1, [16 * 32]); /* output of the implementation under test */
+
+for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
+declare_func(void, uint8_t *block, uint8_t value,
+ ptrdiff_t line_size, int h);
+if (check_func(h->fill_block_tab[t], "blockdsp.%s", tests[t].name)) {
+uint8_t value = rnd(); /* random fill byte */
+randomize_buffers(tests[t].size); /* macro body is outside this hunk; presumably seeds buf0 and buf1 with matching random data - confirm */
+call_ref(buf0, value, 16, n); /* line_size is fixed at 16 regardless of the block width */
+call_new(buf1, value, 16, n);
+if (memcmp(buf0, buf1, sizeof(*buf0) * tests[t].size * n)) /* NOTE(review): with stride 16 the 8-wide entry compares 8 * 16 = 128 bytes, i.e. only the first 8 of the 16 rows - confirm intent */
+fail();
+bench_new(buf0, value, 16, n);
+}
+}
+}
+
 void checkasm_check_blockdsp(void)
 {
 LOCAL_ALIGNED_32(uint16_t, buf0, [6 * 8 * 8]);
@@ -64,5 +94,7 @@ void checkasm_check_blockdsp(void)
 check_clear(clear_block,  8 * 8);
 check_clear(clear_blocks, 8 * 8 * 6);
 
+check_fill(&h);
+
 report("blockdsp");
 }
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels

2024-04-29 Thread flow gg

Happy to see you back :)

Rémi Denis-Courmont  于2024年4月29日周一 02:06写道：

> Le sunnuntaina 7. huhtikuuta 2024, 8.38.54 EEST flow gg a écrit :
> > ping
>
> I have been away for a while, and catching up takes time, sorry.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/3] lavc/vp8dsp: R-V V loop_filter_simple

2024-04-20 Thread flow gg

github link: https://github.com/hleft/FFmpeg/tree/vp8vp9

flow gg  于2024年4月20日周六 23:55写道：

>
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 3/3] lavc/vp8dsp: R-V V loop_filter

2024-04-20 Thread flow gg


From cff79c9500b94f4c0abdd9cd68c91cc736366c78 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Sat, 20 Apr 2024 23:26:58 +0800
Subject: [PATCH 3/3] lavc/vp8dsp: R-V V loop_filter

C908:
vp8_loop_filter8uv_v_c: 745.5
vp8_loop_filter8uv_v_rvv_i32: 467.2
vp8_loop_filter16y_h_c: 674.2
vp8_loop_filter16y_h_rvv_i32: 553.0
vp8_loop_filter16y_v_c: 732.7
vp8_loop_filter16y_v_rvv_i32: 324.5
---
 libavcodec/riscv/vp8dsp_init.c |  4 +++
 libavcodec/riscv/vp8dsp_rvv.S  | 63 ++
 2 files changed, 67 insertions(+)

diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index aa95021df5..597e6acec8 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -123,6 +123,10 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
 c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_rvv;
 }
 
+c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_rvv;
+c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_rvv;
+c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_rvv;
+
 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_rvv;
 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_rvv;
 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_rvv;
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index f10e269d9d..af28ea5258 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -229,6 +229,39 @@ endfunc
 vsra.vi v24, v24, 1  // (f1 + 1) >> 1;
 vadd.vv v8, v18, v24
 vsub.vv v10, v20, v24
+.else
+li  t5, 27
+li  t3, 9
+li  a7, 18
+vwmul.vxv2, v11, t5
+vwmul.vxv6, v11, t3
+vwmul.vxv4, v11, a7
+
+.ifc \len,16
+vsetvli zero, zero, e16, m2, ta, ma
+.else
+vsetvli zero, zero, e16, m1, ta, ma
+.endif
+
+li  a7, 63
+vzext.vf2   v14, v15 // p2
+vzext.vf2   v24, v10 // q2
+vadd.vx v2, v2, a7
+vadd.vx v4, v4, a7
+vadd.vx v6, v6, a7
+vsra.vi v2, v2, 7// a0
+vsra.vi v12, v4, 7   // a1
+vsra.vi v6, v6, 7// a2
+vadd.vv v14, v14, v6 // p2 + a2
+vsub.vv v22, v24, v6 // q2 - a2
+vsub.vv v10, v20, v12// q1 - a1
+vadd.vv v4, v8, v2   // p0 + a0
+vsub.vv v6, v16, v2  // q0 - a0
+vadd.vv v8, v12, v18 // a1 + p1
+vmax.vx v4, v4, zero
+vmax.vx v6, v6, zero
+vmax.vx v14, v14, zero
+vmax.vx v16, v22, zero
 .endif
 
 vmax.vx v8, v8, zero
@@ -253,6 +286,17 @@ endfunc
 vsse8.v v6, (a6), \stride, v0.t
 vsse8.v v7, (t4), \stride, v0.t
 .endif
+.if !\inner
+vnclipu.wi  v14, v14, 0
+vnclipu.wi  v16, v16, 0
+.ifc \type,v
+vse8.v  v14, (t0), v0.t
+vse8.v  v16, (t6), v0.t
+.else
+vsse8.v v14, (t0), \stride, v0.t
+vsse8.v v16, (t6), \stride, v0.t
+.endif
+.endif
 .endif
 .endm
 
@@ -275,6 +319,25 @@ func ff_vp8_v_loop_filter8uv_inner_rvv, zve32x
 ret
 endfunc
 
+func ff_vp8_v_loop_filter16_rvv, zve32x     // entry point: expands the shared filter macro for 16 pixels, type v
+vsetivli        zero, 16, e8, m1, ta, ma    // 16 byte-sized elements per vector operation
+filter 16 v 1 0 a0 a1 a2 a3 a4              // len=16, type=v, inner=0; macro header is outside this hunk, presumably (len, type, normal, inner, dst, stride, fE, fI, thresh) - confirm
+ret
+endfunc
+
+func ff_vp8_h_loop_filter16_rvv, zve32x     // entry point: expands the shared filter macro for 16 pixels, type h
+vsetivli        zero, 16, e8, m1, ta, ma    // 16 byte-sized elements per vector operation
+filter 16 h 1 0 a0 a1 a2 a3 a4              // len=16, type=h (the macro then uses strided stores), inner=0; remaining argument order presumably (dst, stride, fE, fI, thresh) - confirm
+ret
+endfunc
+
+func ff_vp8_v_loop_filter8uv_rvv, zve32x    // entry point: runs the 8-pixel type-v filter twice, once on a0 and once on a1
+vsetivli        zero, 8, e8, mf2, ta, ma    // 8 byte-sized elements per vector operation
+filter 8 v 1 0 a0 a2 a3 a4 a5               // first buffer (a0); presumably the U plane - confirm against the C prototype
+filter 8 v 1 0 a1 a2 a3 a4 a5               // second buffer (a1) with the same remaining arguments; presumably the V plane - confirm
+ret
+endfunc
+
 func ff_vp8_v_loop_filter16_simple_rvv, zve32x
 vsetivlizero, 16, e8, m1, ta, ma
 filter 16 v 0 0 a0 a1 a2 a3 a4
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

1 2 3 >

1 - 100 of 272 matches

Mail list logo