PR #23504 opened by Felix-Gong
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23504
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23504.patch
## Summary

- Re-enable the 8 RVV range convert functions that were disabled with `#if 0` due to API changes
- Adapt functions to accept runtime coefficients and offsets (matching updated function pointer types in SwsInternal)
- Fix a sign-extension issue in the 8-bit "To" paths where narrowing from e32 to e16 via `vnsra.wi` causes values >32767 to wrap negative; the fix clamps in the e32 domain with threshold `((1<<15)-1)<<14` before narrowing

Functions covered:

- `ff_lumRangeToJpeg8_rvv`, `ff_lumRangeFromJpeg8_rvv`
- `ff_chrRangeToJpeg8_rvv`, `ff_chrRangeFromJpeg8_rvv`
- `ff_lumRangeToJpeg16_rvv`, `ff_lumRangeFromJpeg16_rvv`
- `ff_chrRangeToJpeg16_rvv`, `ff_chrRangeFromJpeg16_rvv`

## Test plan

- [x] fate-sws-yuv-range passes on QEMU and SG2044
- [x] fate-sws-yuv-colorspace passes on QEMU and SG2044
- [x] fate-sws-unscaled passes on QEMU and SG2044
- [x] All 8 functions verified correct via disassembly and functional tests
- [x] Performance: ~4x average speedup over C on width-1920 benchmarks (SG2044)

From 4286cf7c4863e6a61a6b268aea95739f1c64cc10 Mon Sep 17 00:00:00 2001
From: yudong <[email protected]>
Date: Tue, 16 Jun 2026 21:30:26 +0800
Subject: [PATCH] libswscale/riscv: re-enable and fix RVV range convert functions

Re-enable the 8 RVV range convert functions that were disabled with #if 0 due to API changes. Adapt the functions to accept runtime coefficients and offsets (matching the updated function pointer types in SwsInternal), and fix a sign-extension issue in the 8-bit "To" paths where narrowing from e32 to e16 via vnsra.wi can cause values >32767 to wrap negative. The fix clamps in the e32 domain with threshold ((1<<15)-1)<<14 before narrowing.

Functions covered:
  ff_lumRangeToJpeg8_rvv, ff_lumRangeFromJpeg8_rvv
  ff_chrRangeToJpeg8_rvv, ff_chrRangeFromJpeg8_rvv
  ff_lumRangeToJpeg16_rvv, ff_lumRangeFromJpeg16_rvv
  ff_chrRangeToJpeg16_rvv, ff_chrRangeFromJpeg16_rvv

Tested on SG2044 with fate-sws-yuv-range, fate-sws-yuv-colorspace and fate-sws-unscaled.
Performance improvement of ~4x over C on width-1920 benchmarks. Signed-off-by: Felix-Gong <[email protected]> --- libswscale/riscv/range_rvv.S | 168 +++++++++++++++++++++-------------- libswscale/riscv/swscale.c | 57 +++++++----- 2 files changed, 136 insertions(+), 89 deletions(-) diff --git a/libswscale/riscv/range_rvv.S b/libswscale/riscv/range_rvv.S index f2c20627c5..bb27db8ba3 100644 --- a/libswscale/riscv/range_rvv.S +++ b/libswscale/riscv/range_rvv.S @@ -1,5 +1,6 @@ /* * Copyright © 2024 Rémi Denis-Courmont. + * Copyright © 2026 Felix Gong. * * This file is part of FFmpeg. * @@ -20,19 +21,49 @@ #include "libavutil/riscv/asm.S" -func ff_range_lum_to_jpeg_16_rvv, zve32x, zba +.macro lumConvertRange fromto, bit_depth +func ff_lumRange\fromto\()Jpeg\bit_depth\()_rvv, zve32x, zba lpad 0 - li t1, 30189 - li t2, 19077 - li t3, -39057361 +.if \bit_depth == 16 + slli a2, a2, 32 + srli a2, a2, 32 +.ifc \fromto, To + li t2, (1 << 19) - 1 +.endif +1: + vsetvli t0, a1, e32, m4, ta, ma + vle32.v v0, (a0) + sub a1, a1, t0 + vwmul.vx v8, v0, a2 + vsetvli zero, zero, e64, m8, ta, ma + vadd.vx v8, v8, a3 + vsetvli zero, zero, e32, m4, ta, ma + vnsra.wi v0, v8, 18 +.ifc \fromto, To + vmin.vx v0, v0, t2 +.endif + vse32.v v0, (a0) + slli t1, t0, 2 + add a0, a0, t1 + bnez a1, 1b + + ret +.else + slli a2, a2, 48 + srli a2, a2, 48 +.ifc \fromto, To + li t2, ((1 << 15) - 1) << 14 +.endif 1: vsetvli t0, a1, e16, m4, ta, ma - vle16.v v0, (a0) + vle16.v v4, (a0) sub a1, a1, t0 - vmin.vx v0, v0, t1 - vwmul.vx v8, v0, t2 + vwmul.vx v8, v4, a2 vsetvli zero, zero, e32, m8, ta, ma - vadd.vx v8, v8, t3 + vadd.vx v8, v8, a3 +.ifc \fromto, To + vmin.vx v8, v8, t2 +.endif vsetvli zero, zero, e16, m4, ta, ma vnsra.wi v0, v8, 14 vse16.v v0, (a0) @@ -40,79 +71,84 @@ func ff_range_lum_to_jpeg_16_rvv, zve32x, zba bnez a1, 1b ret +.endif endfunc +.endm -func ff_range_lum_from_jpeg_16_rvv, zve32x, zba +.macro chrConvertRange fromto, bit_depth +func ff_chrRange\fromto\()Jpeg\bit_depth\()_rvv, zve32x, 
zba lpad 0 - li t1, 14071 - li t2, 33561947 +.if \bit_depth == 16 + slli a3, a3, 32 + srli a3, a3, 32 +.ifc \fromto, To + li t2, (1 << 19) - 1 +.endif 1: - vsetvli t0, a1, e16, m4, ta, ma - vle16.v v0, (a0) - sub a1, a1, t0 - vwmul.vx v8, v0, t1 - vsetvli zero, zero, e32, m8, ta, ma - vadd.vx v8, v8, t2 - vsetvli zero, zero, e16, m4, ta, ma - vnsra.wi v0, v8, 14 - vse16.v v0, (a0) - sh1add a0, t0, a0 - bnez a1, 1b + vsetvli t0, a2, e32, m4, ta, ma + vle32.v v0, (a0) + sub a2, a2, t0 + vle32.v v4, (a1) + vwmul.vx v8, v0, a3 + vwmul.vx v16, v4, a3 + vsetvli zero, zero, e64, m8, ta, ma + vadd.vx v8, v8, a4 + vadd.vx v16, v16, a4 + vsetvli zero, zero, e32, m4, ta, ma + vnsra.wi v0, v8, 18 + vnsra.wi v4, v16, 18 +.ifc \fromto, To + vmin.vx v0, v0, t2 + vmin.vx v4, v4, t2 +.endif + vse32.v v0, (a0) + slli t1, t0, 2 + add a0, a0, t1 + vse32.v v4, (a1) + slli t1, t0, 2 + add a1, a1, t1 + bnez a2, 1b ret -endfunc - -func ff_range_chr_to_jpeg_16_rvv, zve32x, zba - lpad 0 - li t1, 30775 - li t2, 4663 - li t3, -9289992 +.else + slli a3, a3, 48 + srli a3, a3, 48 +.ifc \fromto, To + li t2, ((1 << 15) - 1) << 14 +.endif 1: vsetvli t0, a2, e16, m4, ta, ma - vle16.v v0, (a0) + vle16.v v4, (a0) sub a2, a2, t0 - vle16.v v4, (a1) - vmin.vx v0, v0, t1 - vmin.vx v4, v4, t1 - vwmul.vx v8, v0, t2 - vwmul.vx v16, v4, t2 + vle16.v v0, (a1) + vwmul.vx v8, v4, a3 + vwmul.vx v16, v0, a3 vsetvli zero, zero, e32, m8, ta, ma - vadd.vx v8, v8, t3 - vadd.vx v16, v16, t3 + vadd.vx v8, v8, a4 + vadd.vx v16, v16, a4 +.ifc \fromto, To + vmin.vx v8, v8, t2 + vmin.vx v16, v16, t2 +.endif vsetvli zero, zero, e16, m4, ta, ma - vnsra.wi v0, v8, 12 - vnsra.wi v4, v16, 12 - vse16.v v0, (a0) + vnsra.wi v4, v8, 14 + vnsra.wi v0, v16, 14 + vse16.v v4, (a0) sh1add a0, t0, a0 - vse16.v v4, (a1) + vse16.v v0, (a1) sh1add a1, t0, a1 bnez a2, 1b ret +.endif endfunc +.endm -func ff_range_chr_from_jpeg_16_rvv, zve32x, zba - lpad 0 - li t1, 1799 - li t2, 4081085 -1: - vsetvli t0, a2, e16, m4, ta, ma - vle16.v v0, (a0) - 
sub a2, a2, t0 - vle16.v v4, (a1) - vwmul.vx v8, v0, t1 - vwmul.vx v16, v4, t1 - vsetvli zero, zero, e32, m8, ta, ma - vadd.vx v8, v8, t2 - vadd.vx v16, v16, t2 - vsetvli zero, zero, e16, m4, ta, ma - vnsra.wi v0, v8, 11 - vnsra.wi v4, v16, 11 - vse16.v v0, (a0) - sh1add a0, t0, a0 - vse16.v v4, (a1) - sh1add a1, t0, a1 - bnez a2, 1b - - ret -endfunc +lumConvertRange To, 8 +lumConvertRange To, 16 +chrConvertRange To, 8 +chrConvertRange To, 16 +lumConvertRange From, 8 +lumConvertRange From, 16 +chrConvertRange From, 8 +chrConvertRange From, 16 diff --git a/libswscale/riscv/swscale.c b/libswscale/riscv/swscale.c index 49c492f153..3a12dfc2c9 100644 --- a/libswscale/riscv/swscale.c +++ b/libswscale/riscv/swscale.c @@ -21,37 +21,48 @@ #include "libavutil/riscv/cpu.h" #include "libswscale/swscale_internal.h" -void ff_range_lum_to_jpeg_16_rvv(int16_t *, int); -void ff_range_chr_to_jpeg_16_rvv(int16_t *, int16_t *, int); -void ff_range_lum_from_jpeg_16_rvv(int16_t *, int); -void ff_range_chr_from_jpeg_16_rvv(int16_t *, int16_t *, int); +void ff_lumRangeToJpeg8_rvv(int16_t *dst, int width, + uint32_t coeff, int64_t offset); +void ff_chrRangeToJpeg8_rvv(int16_t *dstU, int16_t *dstV, int width, + uint32_t coeff, int64_t offset); +void ff_lumRangeFromJpeg8_rvv(int16_t *dst, int width, + uint32_t coeff, int64_t offset); +void ff_chrRangeFromJpeg8_rvv(int16_t *dstU, int16_t *dstV, int width, + uint32_t coeff, int64_t offset); +void ff_lumRangeToJpeg16_rvv(int16_t *dst, int width, + uint32_t coeff, int64_t offset); +void ff_chrRangeToJpeg16_rvv(int16_t *dstU, int16_t *dstV, int width, + uint32_t coeff, int64_t offset); +void ff_lumRangeFromJpeg16_rvv(int16_t *dst, int width, + uint32_t coeff, int64_t offset); +void ff_chrRangeFromJpeg16_rvv(int16_t *dstU, int16_t *dstV, int width, + uint32_t coeff, int64_t offset); av_cold void ff_sws_init_range_convert_riscv(SwsInternal *c) { - /* This code is currently disabled because of changes in the base - * implementation of these 
functions. This code should be enabled - * again once those changes are ported to this architecture. */ -#if 0 #if HAVE_RVV int flags = av_get_cpu_flags(); - static const struct { - void (*lum)(int16_t *, int); - void (*chr)(int16_t *, int16_t *, int); - } convs[2] = { - { ff_range_lum_to_jpeg_16_rvv, ff_range_chr_to_jpeg_16_rvv }, - { ff_range_lum_from_jpeg_16_rvv, ff_range_chr_from_jpeg_16_rvv }, - }; - - if (c->dstBpc <= 14 && - (flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB)) { - bool from = c->opts.src_range != 0; - - c->lumConvertRange = convs[from].lum; - c->chrConvertRange = convs[from].chr; + if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB)) { + if (c->dstBpc <= 14) { + if (c->opts.src_range) { + c->lumConvertRange = ff_lumRangeFromJpeg8_rvv; + c->chrConvertRange = ff_chrRangeFromJpeg8_rvv; + } else { + c->lumConvertRange = ff_lumRangeToJpeg8_rvv; + c->chrConvertRange = ff_chrRangeToJpeg8_rvv; + } + } else { + if (c->opts.src_range) { + c->lumConvertRange = ff_lumRangeFromJpeg16_rvv; + c->chrConvertRange = ff_chrRangeFromJpeg16_rvv; + } else { + c->lumConvertRange = ff_lumRangeToJpeg16_rvv; + c->chrConvertRange = ff_chrRangeToJpeg16_rvv; + } + } } #endif -#endif } #define RVV_INPUT(name) \ -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
