This patch adds an RVV-optimized implementation of DCT 32x32 for RISC-V.

The current implementation in the repository is written with the assumption of 
a 128-bit VLEN and does not account for wider vector lengths. Therefore, 
initial testing was performed on a 128-bit platform, allowing the results to 
directly reflect the advantages of the optimized code over the existing 
implementation.

**SG2044 (128-bit VLEN):**

```
dct32x32 | 5.14x | 1800.12 | 9247.73
dct32x32 | 9.85x |  935.26 | 9214.26
```

Building on this, the new implementation adopts a Vector-Length Agnostic (VLA) 
design. Additional testing on a 256-bit platform demonstrates good scalability 
and further performance gains.

**Banana Pi F3 (256-bit VLEN):**

```
dct32x32 | 5.59x | 2222.48 | 12420.64
dct32x32 | 13.28x |  935.97 | 12431.17
```

To simplify comparison with the existing implementation, this patch introduces 
an `RVV_DCT32_OPT` CMake option (enabled by default; it defines `HAVE_RVV_OPT`). 
The optimization can be disabled at configure time using:

```
-DRVV_DCT32_OPT=0
```

allowing straightforward A/B performance testing.

Signed-off-by: daichengrong <[email protected]>
---
 source/CMakeLists.txt                    |   6 +
 source/common/CMakeLists.txt             |   2 +-
 source/common/riscv64/asm-primitives.cpp |   3 +
 source/common/riscv64/dct-32dct.S        | 714 +++++++++++++++++++++++
 source/common/riscv64/fun-decls.h        |   1 +
 5 files changed, 725 insertions(+), 1 deletion(-)
 mode change 100755 => 100644 source/CMakeLists.txt
 create mode 100644 source/common/riscv64/dct-32dct.S

diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
old mode 100755
new mode 100644
index 9f93b6ec2..fd91da702
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -512,6 +512,11 @@ int main() {
             message(STATUS "Found RVV")
             add_definitions(-DHAVE_RVV=1)
 
+            option(RVV_DCT32_OPT "Enable use of RVV DCT32 OPT" ON)  # OFF keeps the baseline RVV 32x32 DCT for A/B testing
+            if(RVV_DCT32_OPT)
+                add_definitions(-DHAVE_RVV_OPT=1)  # checked by #if defined(HAVE_RVV_OPT) in riscv64/asm-primitives.cpp
+            endif()
+
             set(RVV_INTRINSIC_TEST [[
 #include <riscv_vector.h>
 #include <stdint.h>
@@ -947,6 +952,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
         enable_language(ASM)
         foreach(ASM ${RISCV64_ASMS})
             set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/riscv64/${ASM})
+            message(STATUS "Adding RISC-V assembly source: ${ASM_SRC}")
             list(APPEND ASM_SRCS ${ASM_SRC})
             list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
             add_custom_command(
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index 69125c3cb..4945af009 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -185,7 +185,7 @@ if(ENABLE_ASSEMBLY AND (RISCV64 OR CROSS_COMPILE_RISCV64))
     source_group(Assembly FILES ${ASM_PRIMITIVES})
 
     # Add riscv64 assembly files here.
-    set(A_SRCS asm.S blockcopy8.S dct.S sad-a.S ssd-a.S pixel-util.S mc-a.S p2s.S sao.S loopfilter.S intrapred.S riscv64_utils.S)
+    set(A_SRCS asm.S blockcopy8.S dct.S sad-a.S ssd-a.S pixel-util.S mc-a.S p2s.S sao.S loopfilter.S intrapred.S riscv64_utils.S dct-32dct.S)
     set(VEC_PRIMITIVES)
 
     if(CPU_HAS_RVV)
diff --git a/source/common/riscv64/asm-primitives.cpp 
b/source/common/riscv64/asm-primitives.cpp
index ce03288f9..7bd017cf8 100644
--- a/source/common/riscv64/asm-primitives.cpp
+++ b/source/common/riscv64/asm-primitives.cpp
@@ -234,6 +234,9 @@ void setupRVVPrimitives(EncoderPrimitives &p)
     p.dst4x4                = PFX(dst4_v);
 
     ALL_LUMA_TU_S(dct, dct, v);
+#if defined(HAVE_RVV_OPT)
+    p.cu[BLOCK_32x32].dct = PFX(dct_32_v_opt);
+#endif
     ALL_LUMA_TU_S(idct, idct, v);
 
     ALL_LUMA_TU_L(nonPsyRdoQuant, nonPsyRdoQuant, v);
diff --git a/source/common/riscv64/dct-32dct.S 
b/source/common/riscv64/dct-32dct.S
new file mode 100644
index 000000000..a25521706
--- /dev/null
+++ b/source/common/riscv64/dct-32dct.S
@@ -0,0 +1,714 @@
+/*****************************************************************************
+ * Copyright (C) 2026 MulticoreWare, Inc
+ *
+ * Authors: daichengrong <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.set dct32_shift_1, 4 + BIT_DEPTH - 8   // narrowing right-shift passed to the "firstpass" transform
+.set dct32_shift_2, 11                  // narrowing right-shift passed to the "secondpass" transform
+
+.text
+
+#define DCT32_O_CONSTANT_1_0 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4
+#define DCT32_O_CONSTANT_3_1  90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13
+#define DCT32_O_CONSTANT_5_2  88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22
+#define DCT32_O_CONSTANT_7_3  85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31
+#define DCT32_O_CONSTANT_9_4  82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38
+#define DCT32_O_CONSTANT_11_5  78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46
+#define DCT32_O_CONSTANT_13_6  73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54
+#define DCT32_O_CONSTANT_15_7  67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61
+#define DCT32_O_CONSTANT_17_8  61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67
+#define DCT32_O_CONSTANT_19_9  54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73
+#define DCT32_O_CONSTANT_21_10  46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82,  4, 78
+#define DCT32_O_CONSTANT_23_11  38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82
+#define DCT32_O_CONSTANT_25_12  31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85
+#define DCT32_O_CONSTANT_27_13  22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88
+#define DCT32_O_CONSTANT_29_14  13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90
+#define DCT32_O_CONSTANT_31_15  4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
+
+
+#define DCT32_EO_CONSTANT_2_0  90, 87, 80, 70, 57, 43, 25, 9
+#define DCT32_EO_CONSTANT_6_1  87, 57,  9, -43, -80, -90, -70, -25
+#define DCT32_EO_CONSTANT_10_2  80,  9, -70, -87, -25, 57, 90, 43
+#define DCT32_EO_CONSTANT_14_3  70, -43, -87,  9, 90, 25, -80, -57
+
+#define DCT32_EO_CONSTANT_18_4  57, -80, -25, 90, -9, -87, 43, 70
+#define DCT32_EO_CONSTANT_22_5  43, -90, 57, 25, -87, 70,  9, -80
+#define DCT32_EO_CONSTANT_26_6  25, -70, 90, -80, 43,  9, -57, 87
+#define DCT32_EO_CONSTANT_30_7  9, -25, 43, -57, 70, -80, 87, -90
+
+.macro  lx rd, addr                     // XLEN-wide integer load: \rd <- mem[\addr]
+#if (__riscv_xlen == 64)
+        ld      \rd, \addr
+#elif (__riscv_xlen == 32)
+        lw      \rd, \addr
+#else
+        lq      \rd, \addr              // any other XLEN falls back to the quad-word load
+#endif
+.endm
+
+.macro  sx rd, addr                     // XLEN-wide integer store: mem[\addr] <- \rd
+#if (__riscv_xlen == 64)
+        sd      \rd, \addr
+#elif (__riscv_xlen == 32)
+        sw      \rd, \addr
+#else
+        sq      \rd, \addr              // any other XLEN falls back to the quad-word store
+#endif
+.endm
+
+.macro butterfly e, o, tmp_p, tmp_m     // same-width butterfly: tmp_p = e + o, tmp_m = e - o
+        vadd.vv         \tmp_p, \e, \o  // sum is written first, so tmp_p must not alias e or o
+        vsub.vv         \tmp_m, \e, \o  // tmp_m may alias e (callers in this file do that)
+.endm
+
+.macro butterfly_widen e, o, tmp_p, tmp_m   // widening butterfly: both results are 2*SEW wide
+        vwadd.vv         \tmp_p, \e, \o     // tmp_p = e + o (signed widening add)
+        vwsub.vv         \tmp_m, \e, \o     // tmp_m = e - o (signed widening subtract)
+.endm
+
+.macro DCT32_EEO_CAL dst, m1, m2, m3, m4, s1, s2, s3, s4, line, shift  // dst = m1*s1 + m2*s2 + m3*s3 + m4*s4
+    li              a2, \m1             // clobbers a2-a5 with the four scalar coefficients
+    li              a3, \m2
+    li              a4, \m3
+    li              a5, \m4             // note: the line and shift parameters are not used by this macro
+    vmul.vx         \dst, \s1, a2       // first term initialises dst
+    vmacc.vx        \dst, a3, \s2       // remaining terms accumulate into dst
+    vmacc.vx        \dst, a4, \s3
+    vmacc.vx        \dst, a5, \s4
+.endm
+
+.macro DCT32_4_DST_ADD_1_MEMBER first, in, dst_start_index, dst1, dst2, dst3, dst4, t0, t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14,t15
+.if \dst_start_index == 0               // pick four consecutive coefficients from the t0..t15 list
+    li              a2, \t0             // clobbers a2-a5
+    li              a3, \t1
+    li              a4, \t2
+    li              a5, \t3
+.elseif \dst_start_index == 4
+    li              a2, \t4
+    li              a3, \t5
+    li              a4, \t6
+    li              a5, \t7
+.elseif \dst_start_index == 8           // t8..t15 are only supplied by the 16-entry DCT32_O_CONSTANT_* lists
+    li              a2, \t8
+    li              a3, \t9
+    li              a4, \t10
+    li              a5, \t11
+.else                                   // any other index (callers pass 12) selects t12..t15
+    li              a2, \t12
+    li              a3, \t13
+    li              a4, \t14
+    li              a5, \t15
+.endif
+
+.if \first == 1                         // first == 1: initialise the four accumulators with in * coeff
+    vmul.vx        \dst1, \in, a2
+    vmul.vx        \dst2, \in, a3
+    vmul.vx        \dst3, \in, a4
+    vmul.vx        \dst4, \in, a5
+.else                                   // otherwise: accumulate, dstN += coeff * in
+    vmacc.vx       \dst1, a2, \in
+    vmacc.vx       \dst2, a3, \in
+    vmacc.vx       \dst3, a4, \in
+    vmacc.vx       \dst4, a5, \in
+.endif
+.endm
+
+.macro DCT32_STORE_L line, shift, in    // narrow \in to 16 bits and store it as row \line of the buffer at a1
+    vnclip.wi           \in, \in, \shift        // in-place narrowing right shift by \shift (rounding mode taken from vxrm)
+    addi                t0, a1, 32 * 2 * \line  // t0 = a1 + \line * 64 bytes (32 int16 per row); clobbers t0
+    vse16.v             \in, (t0)
+.endm
+
+.macro tr_32xN_rvv name, shift
+function func_tr_32xN_\name\()_rvv
+        .option arch, +zba
+        // E saved from tmp stack
+        mv              a7, t5
+        // one vector bytes after widen
+        slli            t2, t4, 2
+        // O saved from tmp stack + 16xE
+        slli            t0, t2, 4
+        add             a6, t5, t0
+
+        // load 0-3 28-31
+        add             t0, a0, 2*0
+        vlsseg4e16.v        v0,(a0), t3
+        add             t0, a0, 2*28
+        vlsseg4e16.v       v4,(t0), t3
+
+        butterfly_widen     v0, v7, v8, v16
+        butterfly_widen     v1, v6, v10, v18
+        butterfly_widen     v2, v5, v12, v20
+        butterfly_widen     v3, v4, v14, v22
+
+        // load 4-7 24-27
+        add             t0, a0, 2*4
+        vlsseg4e16.v       v0,(t0), t3
+        add             t0, a0, 2*24
+        vlsseg4e16.v       v4,(t0), t3
+
+        // save E 0 1 2 3
+        vse32.v         v8, (a7)
+        add             a7, a7, t2
+        vse32.v         v10, (a7)
+        add             a7, a7, t2
+        vse32.v         v12, (a7)
+        add             a7, a7, t2
+        vse32.v         v14, (a7)
+
+        // save O 0 1 2 3
+        vse32.v         v16, (a6)
+        add             a6, a6, t2
+        vse32.v         v18, (a6)
+        add             a6, a6, t2
+        vse32.v         v20, (a6)
+        add             a6, a6, t2
+        vse32.v         v22, (a6)
+
+        vsetvli zero, zero, e32, m2, ta, ma
+        DCT32_4_DST_ADD_1_MEMBER     1, v16, 0, v24, v26, v28, v30, 
DCT32_O_CONSTANT_1_0
+        DCT32_4_DST_ADD_1_MEMBER     0, v18, 0, v24, v26, v28, v30, 
DCT32_O_CONSTANT_3_1
+
+        DCT32_4_DST_ADD_1_MEMBER     0, v20, 0, v24, v26, v28, v30, 
DCT32_O_CONSTANT_5_2
+        DCT32_4_DST_ADD_1_MEMBER     0, v22, 0, v24, v26, v28, v30, 
DCT32_O_CONSTANT_7_3
+
+        vsetvli zero, zero, e16, m1, ta, ma
+        butterfly_widen     v0, v7, v8, v16
+        butterfly_widen     v1, v6, v10, v18
+        butterfly_widen     v2, v5, v12, v20
+        butterfly_widen     v3, v4, v14, v22
+
+        // load 8-11 20-23
+        add             t0, a0, 2*8
+        vlsseg4e16.v       v0,(t0), t3
+        add             t0, a0, 2*20
+        vlsseg4e16.v       v4,(t0), t3
+
+        // save E 4 5 6 7
+        add             a7, a7, t2
+        vse32.v         v8, (a7)
+        add             a7, a7, t2
+        vse32.v         v10, (a7)
+        add             a7, a7, t2
+        vse32.v         v12, (a7)
+        add             a7, a7, t2
+        vse32.v         v14, (a7)
+
+        // save O 4 5 6 7
+        add             a6, a6, t2
+        vse32.v         v16, (a6)
+        add             a6, a6, t2
+        vse32.v         v18, (a6)
+        add             a6, a6, t2
+        vse32.v         v20, (a6)
+        add             a6, a6, t2
+        vse32.v         v22, (a6)
+
+        vsetvli zero, zero, e32, m2, ta, ma
+        DCT32_4_DST_ADD_1_MEMBER      0, v16, 0, v24, v26, v28, v30, 
DCT32_O_CONSTANT_9_4
+        DCT32_4_DST_ADD_1_MEMBER      0, v18, 0, v24, v26, v28, v30, 
DCT32_O_CONSTANT_11_5
+
+        DCT32_4_DST_ADD_1_MEMBER      0, v20, 0, v24, v26, v28, v30, 
DCT32_O_CONSTANT_13_6
+        DCT32_4_DST_ADD_1_MEMBER      0, v22, 0, v24, v26, v28, v30, 
DCT32_O_CONSTANT_15_7
+
+        vsetvli zero, zero, e16, m1, ta, ma
+        butterfly_widen     v0, v7, v8, v16
+        butterfly_widen     v1, v6, v10, v18
+        butterfly_widen     v2, v5, v12, v20
+        butterfly_widen     v3, v4, v14, v22
+
+        // load 12-15 16-19
+        add             t0, a0, 2*12
+        vlsseg4e16.v       v0,(t0), t3
+        add             t0, a0, 2*16
+        vlsseg4e16.v       v4,(t0), t3
+
+        // save E 8 9 10 11
+        add             a7, a7, t2
+        vse32.v         v8, (a7)
+        add             a7, a7, t2
+        vse32.v         v10, (a7)
+        add             a7, a7, t2
+        vse32.v         v12, (a7)
+        add             a7, a7, t2
+        vse32.v         v14, (a7)
+
+        // save O 8 9 10 11
+        add             a6, a6, t2
+        vse32.v         v16, (a6)
+        add             a6, a6, t2
+        vse32.v         v18, (a6)
+        add             a6, a6, t2
+        vse32.v         v20, (a6)
+        add             a6, a6, t2
+        vse32.v         v22, (a6)
+
+        vsetvli zero, zero, e32, m2, ta, ma
+        DCT32_4_DST_ADD_1_MEMBER      0, v16, 0, v24, v26, v28, v30, 
DCT32_O_CONSTANT_17_8
+        DCT32_4_DST_ADD_1_MEMBER      0, v18, 0, v24, v26, v28, v30, 
DCT32_O_CONSTANT_19_9
+
+       DCT32_4_DST_ADD_1_MEMBER      0, v20, 0, v24, v26, v28, v30, 
DCT32_O_CONSTANT_21_10
+       DCT32_4_DST_ADD_1_MEMBER      0, v22, 0, v24, v26, v28, v30, 
DCT32_O_CONSTANT_23_11
+
+        vsetvli zero, zero, e16, m1, ta, ma
+        butterfly_widen     v0, v7, v8, v16
+        butterfly_widen     v1, v6, v10, v18
+        butterfly_widen     v2, v5, v12, v20
+        butterfly_widen     v3, v4, v14, v22
+
+        // save E 12 13 14 15
+        add             a7, a7, t2
+        vse32.v         v8, (a7)
+        add             a7, a7, t2
+        vse32.v         v10, (a7)
+        add             a7, a7, t2
+        vse32.v         v12, (a7)
+        add             a7, a7, t2
+        vse32.v         v14, (a7)
+
+        vsetvli zero, zero, e32, m2, ta, ma
+       DCT32_4_DST_ADD_1_MEMBER      0, v16, 0, v24, v26, v28, v30, 
DCT32_O_CONSTANT_25_12
+       DCT32_4_DST_ADD_1_MEMBER      0, v18, 0, v24, v26, v28, v30, 
DCT32_O_CONSTANT_27_13
+
+       DCT32_4_DST_ADD_1_MEMBER      0, v20, 0, v24, v26, v28, v30, 
DCT32_O_CONSTANT_29_14
+       DCT32_4_DST_ADD_1_MEMBER      0, v22, 0, v24, v26, v28, v30, 
DCT32_O_CONSTANT_31_15
+
+        vsetvli zero, zero, e16, m1, ta, ma
+        DCT32_STORE_L   1, \shift, v24
+        DCT32_STORE_L   3, \shift, v26
+        DCT32_STORE_L   5, \shift, v28
+        DCT32_STORE_L   7, \shift, v30
+
+
+        // cal odd outputs 4-15 (stored below as dst rows 9, 11, ..., 31)
+        vsetvli zero, zero, e32, m2, ta, ma
+        // 12
+       DCT32_4_DST_ADD_1_MEMBER      1, v16, 4, v0, v2, v4, v6, 
DCT32_O_CONSTANT_25_12
+       DCT32_4_DST_ADD_1_MEMBER      1, v16, 8, v8, v10, v12, v14, 
DCT32_O_CONSTANT_25_12
+       DCT32_4_DST_ADD_1_MEMBER      1, v16, 12, v24, v26, v28, v30, 
DCT32_O_CONSTANT_25_12
+        // reload O0 to v16
+        slli                        t0, t2, 4
+        add                         a6, t5, t0
+        vle32.v                     v16, (a6)
+
+        // 13
+       DCT32_4_DST_ADD_1_MEMBER      0, v18, 4, v0, v2, v4, v6, 
DCT32_O_CONSTANT_27_13
+       DCT32_4_DST_ADD_1_MEMBER      0, v18, 8, v8, v10, v12, v14, 
DCT32_O_CONSTANT_27_13
+       DCT32_4_DST_ADD_1_MEMBER      0, v18, 12, v24, v26, v28, v30, 
DCT32_O_CONSTANT_27_13
+        // reload O1 to v18
+        add                         a6, a6, t2
+        vle32.v                     v18, (a6)
+
+        // 14
+       DCT32_4_DST_ADD_1_MEMBER      0, v20, 4, v0, v2, v4, v6, 
DCT32_O_CONSTANT_29_14
+       DCT32_4_DST_ADD_1_MEMBER      0, v20, 8, v8, v10, v12, v14, 
DCT32_O_CONSTANT_29_14
+       DCT32_4_DST_ADD_1_MEMBER      0, v20, 12, v24, v26, v28, v30, 
DCT32_O_CONSTANT_29_14
+        // reload O2 to v20
+        add                         a6, a6, t2
+        vle32.v                     v20, (a6)
+
+        // 15
+       DCT32_4_DST_ADD_1_MEMBER      0, v22, 4, v0, v2, v4, v6, 
DCT32_O_CONSTANT_31_15
+       DCT32_4_DST_ADD_1_MEMBER      0, v22, 8, v8, v10, v12, v14, 
DCT32_O_CONSTANT_31_15
+       DCT32_4_DST_ADD_1_MEMBER      0, v22, 12, v24, v26, v28, v30, 
DCT32_O_CONSTANT_31_15
+        // reload O3 to v22
+        add                         a6, a6, t2
+        vle32.v                     v22, (a6)
+
+        DCT32_4_DST_ADD_1_MEMBER      0, v16, 4, v0, v2, v4, v6, 
DCT32_O_CONSTANT_1_0
+        DCT32_4_DST_ADD_1_MEMBER      0, v16, 8, v8, v10, v12, v14, 
DCT32_O_CONSTANT_1_0
+        DCT32_4_DST_ADD_1_MEMBER      0, v16, 12, v24, v26, v28, v30, 
DCT32_O_CONSTANT_1_0
+        // reload O4 to v16
+        add                         a6, a6, t2
+        vle32.v                     v16, (a6)
+
+        DCT32_4_DST_ADD_1_MEMBER      0, v18, 4, v0, v2, v4, v6, 
DCT32_O_CONSTANT_3_1
+        DCT32_4_DST_ADD_1_MEMBER      0, v18, 8, v8, v10, v12, v14, 
DCT32_O_CONSTANT_3_1
+        DCT32_4_DST_ADD_1_MEMBER      0, v18, 12, v24, v26, v28, v30, 
DCT32_O_CONSTANT_3_1
+        // reload O5 to v18
+        add                         a6, a6, t2
+        vle32.v                     v18, (a6)
+
+        DCT32_4_DST_ADD_1_MEMBER      0, v20, 4, v0, v2, v4, v6, 
DCT32_O_CONSTANT_5_2
+        DCT32_4_DST_ADD_1_MEMBER      0, v20, 8, v8, v10, v12, v14, 
DCT32_O_CONSTANT_5_2
+        DCT32_4_DST_ADD_1_MEMBER      0, v20, 12, v24, v26, v28, v30, 
DCT32_O_CONSTANT_5_2
+        // reload O6 to v20
+        add                         a6, a6, t2
+        vle32.v                     v20, (a6)
+
+        DCT32_4_DST_ADD_1_MEMBER      0, v22, 4, v0, v2, v4, v6, 
DCT32_O_CONSTANT_7_3
+        DCT32_4_DST_ADD_1_MEMBER      0, v22, 8, v8, v10, v12, v14, 
DCT32_O_CONSTANT_7_3
+        DCT32_4_DST_ADD_1_MEMBER      0, v22, 12, v24, v26, v28, v30, 
DCT32_O_CONSTANT_7_3
+        // reload O7 to v22
+        add                         a6, a6, t2
+        vle32.v                     v22, (a6)
+
+        DCT32_4_DST_ADD_1_MEMBER      0, v16, 4, v0, v2, v4, v6, 
DCT32_O_CONSTANT_9_4
+        DCT32_4_DST_ADD_1_MEMBER      0, v16, 8, v8, v10, v12, v14, 
DCT32_O_CONSTANT_9_4
+        DCT32_4_DST_ADD_1_MEMBER      0, v16, 12, v24, v26, v28, v30, 
DCT32_O_CONSTANT_9_4
+        // reload O8 to v16
+        add                         a6, a6, t2
+        vle32.v                     v16, (a6)
+
+        DCT32_4_DST_ADD_1_MEMBER      0, v18, 4, v0, v2, v4, v6, 
DCT32_O_CONSTANT_11_5
+        DCT32_4_DST_ADD_1_MEMBER      0, v18, 8, v8, v10, v12, v14, 
DCT32_O_CONSTANT_11_5
+        DCT32_4_DST_ADD_1_MEMBER      0, v18, 12, v24, v26, v28, v30, 
DCT32_O_CONSTANT_11_5
+        // reload O9 to v18
+        add                         a6, a6, t2
+        vle32.v                     v18, (a6)
+
+        DCT32_4_DST_ADD_1_MEMBER      0, v20, 4, v0, v2, v4, v6, 
DCT32_O_CONSTANT_13_6
+        DCT32_4_DST_ADD_1_MEMBER      0, v20, 8, v8, v10, v12, v14, 
DCT32_O_CONSTANT_13_6
+        DCT32_4_DST_ADD_1_MEMBER      0, v20, 12, v24, v26, v28, v30, 
DCT32_O_CONSTANT_13_6
+        // reload O10 to v20
+        add                         a6, a6, t2
+        vle32.v                     v20, (a6)
+
+        DCT32_4_DST_ADD_1_MEMBER      0, v22, 4, v0, v2, v4, v6, 
DCT32_O_CONSTANT_15_7
+        DCT32_4_DST_ADD_1_MEMBER      0, v22, 8, v8, v10, v12, v14, 
DCT32_O_CONSTANT_15_7
+        DCT32_4_DST_ADD_1_MEMBER      0, v22, 12, v24, v26, v28, v30, 
DCT32_O_CONSTANT_15_7
+        // reload O11 to v22
+        add                         a6, a6, t2
+        vle32.v                     v22, (a6)
+
+
+        DCT32_4_DST_ADD_1_MEMBER      0, v16, 4, v0, v2, v4, v6, 
DCT32_O_CONSTANT_17_8
+        DCT32_4_DST_ADD_1_MEMBER      0, v16, 8, v8, v10, v12, v14, 
DCT32_O_CONSTANT_17_8
+        DCT32_4_DST_ADD_1_MEMBER      0, v16, 12, v24, v26, v28, v30, 
DCT32_O_CONSTANT_17_8
+
+        // reload   E 0 to v16
+        add                             a7, t5, zero
+        vle32.v                         v16, (a7)
+
+        DCT32_4_DST_ADD_1_MEMBER      0, v18, 4, v0, v2, v4, v6, 
DCT32_O_CONSTANT_19_9
+        DCT32_4_DST_ADD_1_MEMBER      0, v18, 8, v8, v10, v12, v14, 
DCT32_O_CONSTANT_19_9
+        DCT32_4_DST_ADD_1_MEMBER      0, v18, 12, v24, v26, v28, v30, 
DCT32_O_CONSTANT_19_9
+        // reload   E1 to v18
+        add                             a7, a7, t2
+        vle32.v                         v18, (a7)
+
+       DCT32_4_DST_ADD_1_MEMBER      0, v20, 4, v0, v2, v4, v6, 
DCT32_O_CONSTANT_21_10
+       DCT32_4_DST_ADD_1_MEMBER      0, v20, 8, v8, v10, v12, v14, 
DCT32_O_CONSTANT_21_10
+       DCT32_4_DST_ADD_1_MEMBER      0, v20, 12, v24, v26, v28, v30, 
DCT32_O_CONSTANT_21_10
+        // reload   E2 to v20
+        add                             a7, a7, t2
+        vle32.v                         v20, (a7)
+
+       DCT32_4_DST_ADD_1_MEMBER      0, v22, 4, v0, v2, v4, v6, 
DCT32_O_CONSTANT_23_11
+
+        vsetvli zero, zero, e16, m1, ta, ma
+        // write 9 11 13 15
+        DCT32_STORE_L   9, \shift, v0
+        DCT32_STORE_L   11, \shift, v2
+        DCT32_STORE_L   13, \shift, v4
+        DCT32_STORE_L   15, \shift, v6
+
+        // reload   E3 to v0
+        add                             a7, a7, t2
+        vle32.v                         v0, (a7)
+        // reload   E12 to v2
+        add                             a7, a7, t2
+        sh3add                          a7, t2, a7
+        vle32.v                         v2, (a7)
+        // reload   E13 to v4
+        add                             a7, a7, t2
+        vle32.v                         v4, (a7)
+        // reload   E14 to v6
+        add                             a7, a7, t2
+        vle32.v                         v6, (a7)
+
+        vsetvli zero, zero, e32, m2, ta, ma
+       DCT32_4_DST_ADD_1_MEMBER      0, v22, 8, v8, v10, v12, v14, 
DCT32_O_CONSTANT_23_11
+        // write 17 19 21 23
+        vsetvli zero, zero, e16, m1, ta, ma
+        DCT32_STORE_L   17, \shift, v8
+        DCT32_STORE_L   19, \shift, v10
+        DCT32_STORE_L   21, \shift, v12
+        DCT32_STORE_L   23, \shift, v14
+
+        // reload   E15 to v8
+        add                             a7, a7, t2
+        vle32.v                         v8, (a7)
+
+        vsetvli zero, zero, e32, m2, ta, ma
+       DCT32_4_DST_ADD_1_MEMBER      0, v22, 12, v24, v26, v28, v30, 
DCT32_O_CONSTANT_23_11
+        vsetvli zero, zero, e16, m1, ta, ma
+        // write 25 27 29 31
+        DCT32_STORE_L   25, \shift, v24
+        DCT32_STORE_L   27, \shift, v26
+        DCT32_STORE_L   29, \shift, v28
+        DCT32_STORE_L   31, \shift, v30
+
+        vsetvli zero, zero, e32, m2, ta, ma
+        // cal  E 3 12  EE EO  3
+        butterfly v0, v2, v10, v0
+        // save EE 3
+        slli            t0, t2, 4
+        add             a6, t5, t0
+        vse32.v         v10, (a6)
+        // reload E 4
+        sh2add          a7, t2, t5
+        vle32.v         v10, (a7)
+
+        // cal dst 2 6 10 14
+        DCT32_4_DST_ADD_1_MEMBER      1, v0, 0, v24 v26 v28 v30, 
DCT32_EO_CONSTANT_14_3
+
+        // cal  E 2 13  EE EO  2
+        butterfly v20, v4, v12, v20
+        // save EE 2
+        add             a6, a6, t2
+        vse32.v         v12, (a6)
+        // reload E 5
+        add             a7, a7, t2
+        vle32.v         v12, (a7)
+
+        DCT32_4_DST_ADD_1_MEMBER      0, v20, 0, v24 v26 v28 v30, 
DCT32_EO_CONSTANT_10_2
+
+        // cal E 1 14  EE EO  1
+        butterfly v18, v6, v14, v18
+        // save EE 1
+        add             a6, a6, t2
+        vse32.v         v14, (a6)
+        // reload E 6
+        add             a7, a7, t2
+        vle32.v         v14, (a7)
+        DCT32_4_DST_ADD_1_MEMBER      0, v18, 0, v24 v26 v28 v30, 
DCT32_EO_CONSTANT_6_1
+
+        // cal  E 0 15  EE EO  0
+        butterfly v16, v8, v22, v16
+        // save EE 0
+        add             a6, a6, t2
+        vse32.v         v22, (a6)
+        // reload E 7
+        add             a7, a7, t2
+        vle32.v         v22, (a7)
+
+        DCT32_4_DST_ADD_1_MEMBER      0, v16, 0, v24 v26 v28 v30, 
DCT32_EO_CONSTANT_2_0
+
+        // cal dst 18 22 26 30
+        DCT32_4_DST_ADD_1_MEMBER      1, v0, 4, v2 v4 v6 v8, 
DCT32_EO_CONSTANT_14_3
+        // reload E 8   v0
+        add             a7, a7, t2
+        vle32.v         v0, (a7)
+
+        DCT32_4_DST_ADD_1_MEMBER      0, v20, 4, v2 v4 v6 v8, 
DCT32_EO_CONSTANT_10_2
+        // reload E 9     v20
+        add             a7, a7, t2
+        vle32.v         v20, (a7)
+
+        DCT32_4_DST_ADD_1_MEMBER      0, v18, 4, v2 v4 v6 v8, 
DCT32_EO_CONSTANT_6_1
+        // reload E 10     v18
+        add             a7, a7, t2
+        vle32.v         v18, (a7)
+
+        DCT32_4_DST_ADD_1_MEMBER      0, v16, 4, v2 v4 v6 v8, 
DCT32_EO_CONSTANT_2_0
+
+
+        // cal  E 7 8  EE EO  7
+        butterfly v22, v0, v16, v22
+        // reload E 11     v0
+        add             a7, a7, t2
+        vle32.v         v0, (a7)
+        DCT32_4_DST_ADD_1_MEMBER      0, v22, 0, v24 v26 v28 v30, 
DCT32_EO_CONSTANT_30_7
+        DCT32_4_DST_ADD_1_MEMBER      0, v22, 4, v2 v4 v6 v8, 
DCT32_EO_CONSTANT_30_7
+
+        // cal  E 6 9  EE EO  6
+        butterfly v14, v20, v22, v14
+        // reload EE 0  v20
+        vle32.v         v20, (a6)
+
+        DCT32_4_DST_ADD_1_MEMBER      0, v14, 0, v24 v26 v28 v30, 
DCT32_EO_CONSTANT_26_6
+        DCT32_4_DST_ADD_1_MEMBER      0, v14, 4, v2 v4 v6 v8, 
DCT32_EO_CONSTANT_26_6
+
+        // cal E 5 10  EE EO  5
+        butterfly v12, v18, v14, v12
+
+        // reload EE 1  v18
+        sub             a6, a6, t2
+        vle32.v         v18, (a6)
+
+        DCT32_4_DST_ADD_1_MEMBER      0, v12, 0, v24 v26 v28 v30, 
DCT32_EO_CONSTANT_22_5
+        DCT32_4_DST_ADD_1_MEMBER      0, v12, 4, v2 v4 v6 v8, 
DCT32_EO_CONSTANT_22_5
+        // load EE 1  v18
+
+        // cal  E 4 11  EE EO  4
+        butterfly v10, v0, v12, v10
+        // reload EE 2  v0
+        sub             a6, a6, t2
+        vle32.v         v0, (a6)
+        DCT32_4_DST_ADD_1_MEMBER      0, v10, 0, v24 v26 v28 v30, 
DCT32_EO_CONSTANT_18_4
+        DCT32_4_DST_ADD_1_MEMBER      0, v10, 4, v2 v4 v6 v8, 
DCT32_EO_CONSTANT_18_4
+        // reload EE 3  v10
+        sub             a6, a6, t2
+        vle32.v         v10, (a6)
+
+        //write dst 2 6 10 14 18 22 26 30
+        vsetvli zero, zero, e16, m1, ta, ma
+        DCT32_STORE_L   2, \shift, v24
+        DCT32_STORE_L   6, \shift, v26
+        DCT32_STORE_L   10, \shift, v28
+        DCT32_STORE_L   14, \shift, v30
+
+        DCT32_STORE_L   18, \shift, v2
+        DCT32_STORE_L   22, \shift, v4
+        DCT32_STORE_L   26, \shift, v6
+        DCT32_STORE_L   30, \shift, v8
+
+        vsetvli zero, zero, e32, m2, ta, ma
+        //  EE 0-7 ready in register
+
+        // EE 3 4 EEE EEO 3
+        butterfly       v10, v12, v28, v26
+        // EE 1 6 EEE EEO 1
+        butterfly       v18, v22, v24, v22
+        // EE 2 5 EEE EEO 2
+        butterfly       v0, v14, v30, v10
+        // EE 0 7       EEE EEO 0
+        butterfly v20, v16, v14, v12
+
+
+        // EEO[0-3] v12 v22 v10 v26
+        //dst 4 12 20 28
+        DCT32_EEO_CAL   v4, 89, 75, 50, 18, v12, v22, v10, v26, 4, \shift
+        DCT32_EEO_CAL   v8, 75, -18, -89, -50, v12, v22, v10, v26, 12, \shift
+        DCT32_EEO_CAL   v6, 50, -89, 18, 75, v12, v22, v10, v26, 20, \shift
+        DCT32_EEO_CAL   v16, 18, -50, 75, -89, v12, v22, v10, v26, 28, \shift
+
+        vsetvli         zero, zero, e16, m1, ta, ma
+
+        DCT32_STORE_L   4, \shift, v4
+        DCT32_STORE_L   12, \shift, v8
+        DCT32_STORE_L   20, \shift, v6
+        DCT32_STORE_L   28, \shift, v16
+
+        vsetvli         zero, zero, e32, m2, ta, ma
+        # EEEE[0] = EEE[0] + EEE[3];
+        # EEEO[0] = EEE[0] - EEE[3];
+        butterfly       v14, v28, v16, v20
+        # EEEE[1] = EEE[1] + EEE[2];
+        # EEEO[1] = EEE[1] - EEE[2];
+        butterfly       v24, v30, v2, v4
+
+
+        # dst[0] = (int16_t)((g_t32[0][0] * EEEE[0] + g_t32[0][1] * EEEE[1] + 
add) >> shift);
+        // 64 64
+        li              a2, 64
+        li              a3, 64
+        vmul.vx         v18, v16, a2
+        vmacc.vx        v18, a3, v2
+        # dst[8 * line] = (int16_t)((g_t32[8][0] * EEEO[0] + g_t32[8][1] * 
EEEO[1] + add) >> shift);
+        // 83  36
+        li              a2, 83
+        li              a3, 36
+        vmul.vx         v6, v20, a2
+        vmacc.vx        v6, a3, v4
+        # dst[16 * line] = (int16_t)((g_t32[16][0] * EEEE[0] + g_t32[16][1] * 
EEEE[1] + add) >> shift);
+        // 64  -64
+        li              a2, 64
+        li              a3, -64
+        vmul.vx         v8, v16, a2
+        vmacc.vx        v8, a3, v2
+        # dst[24 * line] = (int16_t)((g_t32[24][0] * EEEO[0] + g_t32[24][1] * 
EEEO[1] + add) >> shift);
+        // 36 -83
+        li              a2, 36
+        li              a3, -83
+        vmul.vx         v10, v20, a2
+        vmacc.vx        v10, a3, v4
+
+        //write dst 0 8 16 24
+        vsetvli         zero, zero, e16, m1, ta, ma
+        DCT32_STORE_L   0, \shift, v18
+        DCT32_STORE_L   8, \shift, v6
+        DCT32_STORE_L   16, \shift, v8
+        DCT32_STORE_L   24, \shift, v10
+
+        ret
+endfunc
+.endm
+
+tr_32xN_rvv firstpass, dct32_shift_1    // emits func_tr_32xN_firstpass_rvv
+tr_32xN_rvv secondpass, dct32_shift_2   // emits func_tr_32xN_secondpass_rvv
+
+.macro DCT_N size                       // emits PFX(dct_<size>_v_opt); the body is hard-wired to the 32-point passes
+function PFX(dct_\size\()_v_opt)        // declared as (const int16_t* src, int16_t* dst, intptr_t srcStride) in fun-decls.h
+        .option arch, +zba
+
+        addi    sp, sp, -16
+        sx      ra, (sp)                // ra is overwritten by the jal calls below
+
+        mv      t6, a1                  // keep the caller's a1 for the second pass
+        csrwi   vxrm, 0                 // fixed-point rounding mode 0 for the vnclip in DCT32_STORE_L
+
+        li     t1, 32                   // t1 = number of lines still to process
+        vsetvli t4, t1, e16, m1, ta, ma // t4 = vl = lines handled per call
+
+        li      t0, 4096
+        // t5 = base of a 4096-byte scratch area used by the pass functions
+        sub     t5, sp, t0
+        li      t0, 2048
+        sub     sp, t5, t0              // sp = base of a 2048-byte (32*32 int16) intermediate buffer
+
+        // first pass: read from a0, write to the intermediate buffer
+        mv      a1, sp
+        slli    t3, a2, 1               // t3 = a2 * 2 = row stride in bytes for the segment loads
+1:
+        jal     func_tr_32xN_firstpass_rvv
+        mul     t0, t4, t3
+        add     a0, a0, t0              // advance the input by vl rows
+        slli    t0, t4, 1
+        add     a1, a1, t0              // advance the output by vl int16 elements
+        sub     t1, t1, t4
+        bnez    t1, 1b                  // NOTE(review): terminates only if vl divides 32 - confirm
+
+        li      t1, 32
+        mv      a0, sp                  // second pass reads the intermediate buffer
+        mv      a1, t6                  // and writes to the caller's destination
+        li      t3, 64                  // intermediate rows are 32 int16 = 64 bytes apart
+1:
+        jal     func_tr_32xN_secondpass_rvv
+        slli    t0, t4, 6
+        add     a0, a0, t0              // advance the input by vl rows of 64 bytes
+        slli    t0, t4, 1
+        add     a1, a1, t0
+        sub     t1, t1, t4
+        bnez    t1, 1b
+
+2:                                      // label is not referenced inside this function
+        li      t0, 4096+2048
+        add     sp, sp, t0              // release both scratch areas
+        lx      ra, (sp)
+        addi    sp, sp, 16
+
+        ret
+endfunc
+.endm
+
+DCT_N 32
diff --git a/source/common/riscv64/fun-decls.h 
b/source/common/riscv64/fun-decls.h
index ec04d9968..7ffb32e65 100644
--- a/source/common/riscv64/fun-decls.h
+++ b/source/common/riscv64/fun-decls.h
@@ -123,6 +123,7 @@ FUNCDEF_TU_S(void, cpy1Dto2D_shr, v, int16_t* dst, const 
int16_t* src, intptr_t
 FUNCDEF_TU_S(void, ssimDist, v, const pixel *fenc, uint32_t fStride, const 
pixel *recon, intptr_t rstride, uint64_t *ssBlock, int shift, uint64_t *ac_k);
 FUNCDEF_TU_S(void, idct, v, const int16_t* src, int16_t* dst, intptr_t 
dstStride);
 FUNCDEF_TU_S(void, dct, v, const int16_t* src, int16_t* dst, intptr_t 
srcStride);
+FUNCDEF_TU_S(void, dct, v_opt, const int16_t* src, int16_t* dst, intptr_t 
srcStride);
 FUNCDEF_TU_S(void, getResidual, v, const pixel* fenc, const pixel* pred, 
int16_t* residual, intptr_t stride);
 
 FUNCDEF_TU_S2(void, intra_pred_planar, rvv, pixel* dst, intptr_t dstride, 
const pixel* srcPix, int, int);
-- 
2.34.1

_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel

Reply via email to