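Merging the two per-block bit masks into a single register (the second mask
is kept in bits 16 and up and tested with bexti) saves one register and one
instruction per transform.
add16 and add16intra thus become stack-less.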
---
 libavcodec/riscv/h264dsp_init.c | 25 ++++++++--------
 libavcodec/riscv/h264idct_rvv.S | 51 ++++++++++++++++-----------------
 2 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c
index 9ae182151c..836c073559 100644
--- a/libavcodec/riscv/h264dsp_init.c
+++ b/libavcodec/riscv/h264dsp_init.c
@@ -98,13 +98,14 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
 
             dsp->h264_idct_add  = ff_h264_idct_add_8_rvv;
             dsp->h264_idct8_add = ff_h264_idct8_add_8_rvv;
+            dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_8_rvv;
+            if (flags & AV_CPU_FLAG_RVB) {
+                dsp->h264_idct_add16      = ff_h264_idct_add16_8_rvv;
+                dsp->h264_idct_add16intra = ff_h264_idct_add16intra_8_rvv;
 #  if __riscv_xlen == 64
-            dsp->h264_idct_add16      = ff_h264_idct_add16_8_rvv;
-            dsp->h264_idct_add16intra = ff_h264_idct_add16intra_8_rvv;
-            dsp->h264_idct8_add4      = ff_h264_idct8_add4_8_rvv;
+                dsp->h264_idct8_add4      = ff_h264_idct8_add4_8_rvv;
 #  endif
-            if (flags & AV_CPU_FLAG_RVV_I32)
-                dsp->h264_idct_dc_add = ff_h264_idct4_dc_add_8_rvv;
+            }
             if (flags & AV_CPU_FLAG_RVV_I64) {
                 dsp->h264_add_pixels8_clear = ff_h264_add_pixels8_8_rvv;
                 dsp->h264_idct8_dc_add      = ff_h264_idct8_dc_add_8_rvv;
@@ -118,16 +119,16 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
                 dsp->h264_idct_add = ff_h264_idct_add_##depth##_rvv; \
             if (flags & AV_CPU_FLAG_RVB_ADDR) \
                 dsp->h264_idct8_add = ff_h264_idct8_add_##depth##_rvv; \
-            if (zvl128b && (flags & AV_CPU_FLAG_RVB_ADDR)) { \
+            if (zvl128b && (flags & AV_CPU_FLAG_RVB)) { \
                 dsp->h264_idct_dc_add  = ff_h264_idct4_dc_add_##depth##_rvv; \
                 dsp->h264_idct8_dc_add = ff_h264_idct8_dc_add_##depth##_rvv; \
+                if (__riscv_xlen == 64) { \
+                    dsp->h264_idct_add16 = ff_h264_idct_add16_##depth##_rvv; \
+                    dsp->h264_idct_add16intra = \
+                        ff_h264_idct_add16intra_##depth##_rvv; \
+                } \
             } \
-            if (__riscv_xlen == 64 && zvl128b) { \
-                dsp->h264_idct_add16 = ff_h264_idct_add16_##depth##_rvv; \
-                dsp->h264_idct_add16intra = \
-                    ff_h264_idct_add16intra_##depth##_rvv; \
-            } \
-            if (__riscv_xlen == 64 && (flags & AV_CPU_FLAG_RVB_ADDR)) \
+            if (__riscv_xlen == 64 && (flags & AV_CPU_FLAG_RVB)) \
                 dsp->h264_idct8_add4 = ff_h264_idct8_add4_##depth##_rvv; \
         }
 
diff --git a/libavcodec/riscv/h264idct_rvv.S b/libavcodec/riscv/h264idct_rvv.S
index 514c849bce..a49a32c47e 100644
--- a/libavcodec/riscv/h264idct_rvv.S
+++ b/libavcodec/riscv/h264idct_rvv.S
@@ -532,16 +532,11 @@ const ff_h264_scan8
         .byte   034, 035, 044, 045, 036, 037, 046, 047
 endconst
 
-#if (__riscv_xlen == 64)
 .macro  idct4_adds type, depth
-func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
+func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x, b
         csrwi   vxrm, 0
-        addi    sp, sp, -16
         lla     t0, ff_h264_scan8
-        sd      s0,   (sp)
         li      t1, 32 * (\depth / 8)
-        mv      s0, sp
-        sd      ra,  8(sp)
         vsetivli  zero, 16, e8, m1, ta, ma
         vle8.v    v8, (t0)
 .if \depth == 8
@@ -567,20 +562,23 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
         vsetvli   zero, zero, e16, m2, ta, ma
         vmv.x.s   a4, v0
         vmv.x.s   a7, v1
+        zext.h  a4, a4
+        slli    a7, a7, 16
         mv      t4, a0
+        or      a4, a4, a7
         mv      t5, a1
         mv      a1, a2
         mv      a2, a3
         li      a3, 16
+        mv      a7, ra
 1:
         andi    t0, a4, 1
         addi    a3, a3, -1
-        srli    a4, a4, 1
 .ifc \type, 16
         beqz    t0, 3f     # if (nnz)
 .endif
         lw      t2, (t5)   # block_offset[i]
-        andi    t1, a7, 1
+        bexti   t1, a4, 16
         add     a0, t4, t2
 .ifc \type, 16
         bnez    t1, 2f     # if (nnz == 1 && block[i * 16])
@@ -595,14 +593,12 @@ func ff_h264_idct_add\type\()_\depth\()_rvv, zve32x
 .endif
         jal     ff_h264_idct4_dc_add_\depth\()_rvv
 3:
-        srli    a7, a7, 1
+        srli    a4, a4, 1
         addi    t5, t5, 4
         addi    a1, a1, 16 * 2 * (\depth / 8)
         bnez    a3, 1b
 
-        ld      ra,  8(sp)
-        ld      s0,  0(sp)
-        addi    sp, sp, 16
+        mv      ra, a7
         ret
 endfunc
 .endm
@@ -611,9 +607,10 @@ endfunc
 idct4_adds 16, \depth
 idct4_adds 16intra, \depth
 
-func ff_h264_idct8_add4_\depth\()_rvv, zve32x
+#if (__riscv_xlen == 64)
+func ff_h264_idct8_add4_\depth\()_rvv, zve32x, b
         csrwi       vxrm, 0
-        addi    sp, sp, -64
+        addi    sp, sp, -48
         lla     t0, ff_h264_scan8
         sd      s0,   (sp)
         li      t1, 4 * 32 * (\depth / 8)
@@ -622,9 +619,8 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
         sd      ra,  8(sp)
         sd      s1, 16(sp)
         sd      s2, 24(sp)
-        sd      s3, 32(sp)
-        sd      s4, 40(sp)
-        sd      s5, 48(sp)
+        sd      s4, 32(sp)
+        sd      s5, 40(sp)
         vsetivli  zero, 4, e8, mf4, ta, ma
         vlse8.v   v8, (t0), t2
 .if \depth == 8
@@ -644,8 +640,11 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
         vmsne.vi  v0, v12, 0
         vmand.mm  v1, v1, v2
         vmv.x.s   s2, v0
-        vmv.x.s   s3, v1
+        vmv.x.s   a7, v1
+        zext.h  s2, s2
+        slli    a7, a7, 16
         li      s1, 4
+        or      s2, s2, a7
         mv      s4, a0
         mv      s5, a1
         mv      a1, a2
@@ -653,10 +652,9 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
 1:
         andi    t0, s2, 1
         addi    s1, s1, -1
-        srli    s2, s2, 1
         beqz    t0, 3f     # if (nnz)
         lw      t2, (s5)   # block_offset[i]
-        andi    t1, s3, 1
+        bexti   t1, s2, 16
         add     a0, s4, t2
         bnez    t1, 2f    # if (nnz == 1 && block[i * 16])
         jal     .Lidct8_add_\depth\()_rvv
@@ -670,20 +668,20 @@ func ff_h264_idct8_add4_\depth\()_rvv, zve32x
 3:
         addi    a1, a1, 4 * 16 * 2 * (\depth / 8)
 4:
-        srli    s3, s3, 1
+        srli    s2, s2, 1
         addi    s5, s5, 4 * 4
         bnez    s1, 1b
 
-        ld      s5, 48(sp)
-        ld      s4, 40(sp)
-        ld      s3, 32(sp)
+        ld      s5, 40(sp)
+        ld      s4, 32(sp)
         ld      s2, 24(sp)
         ld      s1, 16(sp)
         ld      ra,  8(sp)
         ld      s0,  0(sp)
-        addi    sp, sp, 64
+        addi    sp, sp, 48
         ret
 endfunc
+#endif
 .endr
 
 .irp    depth, 9, 10, 12, 14
@@ -697,9 +695,10 @@ func ff_h264_idct_add16intra_\depth\()_rvv, zve32x
         j       ff_h264_idct_add16intra_16_rvv
 endfunc
 
+#if (__riscv_xlen == 64)
 func ff_h264_idct8_add4_\depth\()_rvv, zve32x
         li      a5, (1 << \depth) - 1
         j       ff_h264_idct8_add4_16_rvv
 endfunc
-.endr
 #endif
+.endr
-- 
2.45.2

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".

Reply via email to