PR #23504 opened by Felix-Gong
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23504
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23504.patch

## Summary

- Re-enable the 8 RVV range convert functions that were disabled with `#if 0`
  due to API changes
- Adapt the functions to accept runtime coefficients and offsets (matching the
  updated function pointer types in SwsInternal)
- Fix a sign-extension issue in the 8-bit "To" paths: narrowing from e32 to e16
  via `vnsra.wi` makes values >32767 wrap negative, so the result is now clamped
  in the e32 domain to the threshold `((1<<15)-1)<<14` before narrowing

Functions covered:
- `ff_lumRangeToJpeg8_rvv`, `ff_lumRangeFromJpeg8_rvv`
- `ff_chrRangeToJpeg8_rvv`, `ff_chrRangeFromJpeg8_rvv`
- `ff_lumRangeToJpeg16_rvv`, `ff_lumRangeFromJpeg16_rvv`
- `ff_chrRangeToJpeg16_rvv`, `ff_chrRangeFromJpeg16_rvv`

## Test plan

- [x] fate-sws-yuv-range passes on QEMU and SG2044
- [x] fate-sws-yuv-colorspace passes on QEMU and SG2044
- [x] fate-sws-unscaled passes on QEMU and SG2044
- [x] All 8 functions verified correct via disassembly and functional tests
- [x] Performance: ~4x average speedup over C on width-1920 benchmarks (SG2044)


From 4286cf7c4863e6a61a6b268aea95739f1c64cc10 Mon Sep 17 00:00:00 2001
From: yudong <[email protected]>
Date: Tue, 16 Jun 2026 21:30:26 +0800
Subject: [PATCH] libswscale/riscv: re-enable and fix RVV range convert
 functions

Re-enable the 8 RVV range convert functions that were disabled with
#if 0 due to API changes. Adapt the functions to accept runtime
coefficients and offsets (matching the updated function pointer
types in SwsInternal), and fix a sign-extension issue in the 8-bit
"To" paths where narrowing from e32 to e16 via vnsra.wi can cause
values >32767 to wrap negative. The fix clamps in the e32 domain
to the threshold ((1<<15)-1)<<14 before narrowing.

Functions covered:
  ff_lumRangeToJpeg8_rvv,     ff_lumRangeFromJpeg8_rvv
  ff_chrRangeToJpeg8_rvv,     ff_chrRangeFromJpeg8_rvv
  ff_lumRangeToJpeg16_rvv,    ff_lumRangeFromJpeg16_rvv
  ff_chrRangeToJpeg16_rvv,    ff_chrRangeFromJpeg16_rvv

Tested on SG2044 with fate-sws-yuv-range, fate-sws-yuv-colorspace
and fate-sws-unscaled. Performance improvement of ~4x over C on
width-1920 benchmarks.

Signed-off-by: Felix-Gong <[email protected]>
---
 libswscale/riscv/range_rvv.S | 168 +++++++++++++++++++++--------------
 libswscale/riscv/swscale.c   |  57 +++++++-----
 2 files changed, 136 insertions(+), 89 deletions(-)

diff --git a/libswscale/riscv/range_rvv.S b/libswscale/riscv/range_rvv.S
index f2c20627c5..bb27db8ba3 100644
--- a/libswscale/riscv/range_rvv.S
+++ b/libswscale/riscv/range_rvv.S
@@ -1,5 +1,6 @@
 /*
  * Copyright © 2024 Rémi Denis-Courmont.
+ * Copyright © 2026 Felix Gong.
  *
  * This file is part of FFmpeg.
  *
@@ -20,19 +21,49 @@
 
 #include "libavutil/riscv/asm.S"
 
-func ff_range_lum_to_jpeg_16_rvv, zve32x, zba
+.macro lumConvertRange fromto, bit_depth
+func ff_lumRange\fromto\()Jpeg\bit_depth\()_rvv, zve32x, zba
         lpad    0
-        li       t1, 30189
-        li       t2, 19077
-        li       t3, -39057361
+.if \bit_depth == 16
+        slli     a2, a2, 32
+        srli     a2, a2, 32
+.ifc \fromto, To
+        li       t2, (1 << 19) - 1
+.endif
+1:
+        vsetvli  t0, a1, e32, m4, ta, ma
+        vle32.v  v0, (a0)
+        sub      a1, a1, t0
+        vwmul.vx v8, v0, a2
+        vsetvli  zero, zero, e64, m8, ta, ma
+        vadd.vx  v8, v8, a3
+        vsetvli  zero, zero, e32, m4, ta, ma
+        vnsra.wi v0, v8, 18
+.ifc \fromto, To
+        vmin.vx  v0, v0, t2
+.endif
+        vse32.v  v0, (a0)
+        slli     t1, t0, 2
+        add      a0, a0, t1
+        bnez     a1, 1b
+
+        ret
+.else
+        slli     a2, a2, 48
+        srli     a2, a2, 48
+.ifc \fromto, To
+        li       t2, ((1 << 15) - 1) << 14
+.endif
 1:
         vsetvli  t0, a1, e16, m4, ta, ma
-        vle16.v  v0, (a0)
+        vle16.v  v4, (a0)
         sub      a1, a1, t0
-        vmin.vx  v0, v0, t1
-        vwmul.vx v8, v0, t2
+        vwmul.vx v8, v4, a2
         vsetvli  zero, zero, e32, m8, ta, ma
-        vadd.vx  v8, v8, t3
+        vadd.vx  v8, v8, a3
+.ifc \fromto, To
+        vmin.vx  v8, v8, t2
+.endif
         vsetvli  zero, zero, e16, m4, ta, ma
         vnsra.wi v0, v8, 14
         vse16.v  v0, (a0)
@@ -40,79 +71,84 @@ func ff_range_lum_to_jpeg_16_rvv, zve32x, zba
         bnez     a1, 1b
 
         ret
+.endif
 endfunc
+.endm
 
-func ff_range_lum_from_jpeg_16_rvv, zve32x, zba
+.macro chrConvertRange fromto, bit_depth
+func ff_chrRange\fromto\()Jpeg\bit_depth\()_rvv, zve32x, zba
         lpad    0
-        li       t1, 14071
-        li       t2, 33561947
+.if \bit_depth == 16
+        slli     a3, a3, 32
+        srli     a3, a3, 32
+.ifc \fromto, To
+        li       t2, (1 << 19) - 1
+.endif
 1:
-        vsetvli  t0, a1, e16, m4, ta, ma
-        vle16.v  v0, (a0)
-        sub      a1, a1, t0
-        vwmul.vx v8, v0, t1
-        vsetvli  zero, zero, e32, m8, ta, ma
-        vadd.vx  v8, v8, t2
-        vsetvli  zero, zero, e16, m4, ta, ma
-        vnsra.wi v0, v8, 14
-        vse16.v  v0, (a0)
-        sh1add   a0, t0, a0
-        bnez     a1, 1b
+        vsetvli  t0, a2, e32, m4, ta, ma
+        vle32.v  v0, (a0)
+        sub      a2, a2, t0
+        vle32.v  v4, (a1)
+        vwmul.vx v8, v0, a3
+        vwmul.vx v16, v4, a3
+        vsetvli  zero, zero, e64, m8, ta, ma
+        vadd.vx  v8, v8, a4
+        vadd.vx  v16, v16, a4
+        vsetvli  zero, zero, e32, m4, ta, ma
+        vnsra.wi v0, v8, 18
+        vnsra.wi v4, v16, 18
+.ifc \fromto, To
+        vmin.vx  v0, v0, t2
+        vmin.vx  v4, v4, t2
+.endif
+        vse32.v  v0, (a0)
+        slli     t1, t0, 2
+        add      a0, a0, t1
+        vse32.v  v4, (a1)
+        slli     t1, t0, 2
+        add      a1, a1, t1
+        bnez     a2, 1b
 
         ret
-endfunc
-
-func ff_range_chr_to_jpeg_16_rvv, zve32x, zba
-        lpad    0
-        li      t1, 30775
-        li      t2, 4663
-        li      t3, -9289992
+.else
+        slli     a3, a3, 48
+        srli     a3, a3, 48
+.ifc \fromto, To
+        li       t2, ((1 << 15) - 1) << 14
+.endif
 1:
         vsetvli  t0, a2, e16, m4, ta, ma
-        vle16.v  v0, (a0)
+        vle16.v  v4, (a0)
         sub      a2, a2, t0
-        vle16.v  v4, (a1)
-        vmin.vx  v0, v0, t1
-        vmin.vx  v4, v4, t1
-        vwmul.vx v8, v0, t2
-        vwmul.vx v16, v4, t2
+        vle16.v  v0, (a1)
+        vwmul.vx v8, v4, a3
+        vwmul.vx v16, v0, a3
         vsetvli  zero, zero, e32, m8, ta, ma
-        vadd.vx  v8, v8, t3
-        vadd.vx  v16, v16, t3
+        vadd.vx  v8, v8, a4
+        vadd.vx  v16, v16, a4
+.ifc \fromto, To
+        vmin.vx  v8, v8, t2
+        vmin.vx  v16, v16, t2
+.endif
         vsetvli  zero, zero, e16, m4, ta, ma
-        vnsra.wi v0, v8, 12
-        vnsra.wi v4, v16, 12
-        vse16.v  v0, (a0)
+        vnsra.wi v4, v8, 14
+        vnsra.wi v0, v16, 14
+        vse16.v  v4, (a0)
         sh1add   a0, t0, a0
-        vse16.v  v4, (a1)
+        vse16.v  v0, (a1)
         sh1add   a1, t0, a1
         bnez     a2, 1b
 
         ret
+.endif
 endfunc
+.endm
 
-func ff_range_chr_from_jpeg_16_rvv, zve32x, zba
-        lpad    0
-        li      t1, 1799
-        li      t2, 4081085
-1:
-        vsetvli  t0, a2, e16, m4, ta, ma
-        vle16.v  v0, (a0)
-        sub      a2, a2, t0
-        vle16.v  v4, (a1)
-        vwmul.vx v8, v0, t1
-        vwmul.vx v16, v4, t1
-        vsetvli  zero, zero, e32, m8, ta, ma
-        vadd.vx  v8, v8, t2
-        vadd.vx  v16, v16, t2
-        vsetvli  zero, zero, e16, m4, ta, ma
-        vnsra.wi v0, v8, 11
-        vnsra.wi v4, v16, 11
-        vse16.v  v0, (a0)
-        sh1add   a0, t0, a0
-        vse16.v  v4, (a1)
-        sh1add   a1, t0, a1
-        bnez     a2, 1b
-
-        ret
-endfunc
+lumConvertRange To,    8
+lumConvertRange To,   16
+chrConvertRange To,    8
+chrConvertRange To,   16
+lumConvertRange From,  8
+lumConvertRange From, 16
+chrConvertRange From,  8
+chrConvertRange From, 16
diff --git a/libswscale/riscv/swscale.c b/libswscale/riscv/swscale.c
index 49c492f153..3a12dfc2c9 100644
--- a/libswscale/riscv/swscale.c
+++ b/libswscale/riscv/swscale.c
@@ -21,37 +21,48 @@
 #include "libavutil/riscv/cpu.h"
 #include "libswscale/swscale_internal.h"
 
-void ff_range_lum_to_jpeg_16_rvv(int16_t *, int);
-void ff_range_chr_to_jpeg_16_rvv(int16_t *, int16_t *, int);
-void ff_range_lum_from_jpeg_16_rvv(int16_t *, int);
-void ff_range_chr_from_jpeg_16_rvv(int16_t *, int16_t *, int);
+void ff_lumRangeToJpeg8_rvv(int16_t *dst, int width,
+                            uint32_t coeff, int64_t offset);
+void ff_chrRangeToJpeg8_rvv(int16_t *dstU, int16_t *dstV, int width,
+                            uint32_t coeff, int64_t offset);
+void ff_lumRangeFromJpeg8_rvv(int16_t *dst, int width,
+                              uint32_t coeff, int64_t offset);
+void ff_chrRangeFromJpeg8_rvv(int16_t *dstU, int16_t *dstV, int width,
+                              uint32_t coeff, int64_t offset);
+void ff_lumRangeToJpeg16_rvv(int16_t *dst, int width,
+                             uint32_t coeff, int64_t offset);
+void ff_chrRangeToJpeg16_rvv(int16_t *dstU, int16_t *dstV, int width,
+                             uint32_t coeff, int64_t offset);
+void ff_lumRangeFromJpeg16_rvv(int16_t *dst, int width,
+                               uint32_t coeff, int64_t offset);
+void ff_chrRangeFromJpeg16_rvv(int16_t *dstU, int16_t *dstV, int width,
+                               uint32_t coeff, int64_t offset);
 
 av_cold void ff_sws_init_range_convert_riscv(SwsInternal *c)
 {
-    /* This code is currently disabled because of changes in the base
-     * implementation of these functions. This code should be enabled
-     * again once those changes are ported to this architecture. */
-#if 0
 #if HAVE_RVV
     int flags = av_get_cpu_flags();
 
-    static const struct {
-        void (*lum)(int16_t *, int);
-        void (*chr)(int16_t *, int16_t *, int);
-    } convs[2] = {
-        { ff_range_lum_to_jpeg_16_rvv, ff_range_chr_to_jpeg_16_rvv },
-        { ff_range_lum_from_jpeg_16_rvv, ff_range_chr_from_jpeg_16_rvv },
-    };
-
-    if (c->dstBpc <= 14 &&
-        (flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB)) {
-        bool from = c->opts.src_range != 0;
-
-        c->lumConvertRange = convs[from].lum;
-        c->chrConvertRange = convs[from].chr;
+    if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB)) {
+        if (c->dstBpc <= 14) {
+            if (c->opts.src_range) {
+                c->lumConvertRange = ff_lumRangeFromJpeg8_rvv;
+                c->chrConvertRange = ff_chrRangeFromJpeg8_rvv;
+            } else {
+                c->lumConvertRange = ff_lumRangeToJpeg8_rvv;
+                c->chrConvertRange = ff_chrRangeToJpeg8_rvv;
+            }
+        } else {
+            if (c->opts.src_range) {
+                c->lumConvertRange = ff_lumRangeFromJpeg16_rvv;
+                c->chrConvertRange = ff_chrRangeFromJpeg16_rvv;
+            } else {
+                c->lumConvertRange = ff_lumRangeToJpeg16_rvv;
+                c->chrConvertRange = ff_chrRangeToJpeg16_rvv;
+            }
+        }
     }
 #endif
-#endif
 }
 
 #define RVV_INPUT(name) \
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to