Add NEON unscaled converters for {yuv420p, yuv422p, yuva420p, nv12, nv21}
to {rgb565le, bgr565le, rgb555le, bgr555le}. The 16bpp packing uses v8/v9 as the output accumulator. Since AAPCS-64 requires d8-d15 to be callee-saved, declare_func now wraps an stp d8, d9 / ldp d8, d9 pair around 16bpp paths only (gated by .ifc on the output format). Pattern matches libswscale/aarch64/hscale.S. yuva420p -> 16bpp drops alpha and routes through the yuv420p wrappers, mirroring how yuva420p -> rgb24/bgr24 already work in tree. Verified with checkasm --test=sw_yuv2rgb (110/110) and the full checkasm regression (7657/7657) on Apple M1. Cycle counts and the speedup table are in the cover letter. Signed-off-by: DROOdotFOO <[email protected]> --- libswscale/aarch64/swscale_unscaled.c | 47 ++++++++ libswscale/aarch64/yuv2rgb_neon.S | 147 ++++++++++++++++++++++++++ tests/checkasm/sw_yuv2rgb.c | 13 ++- 3 files changed, 205 insertions(+), 2 deletions(-) diff --git a/libswscale/aarch64/swscale_unscaled.c b/libswscale/aarch64/swscale_unscaled.c index ba24775210..aa23f9c955 100644 --- a/libswscale/aarch64/swscale_unscaled.c +++ b/libswscale/aarch64/swscale_unscaled.c @@ -95,6 +95,15 @@ DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgr24) DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv420p) DECLARE_FF_YUVX_TO_ALL_RGBX_FUNCS(yuv422p) +#define DECLARE_FF_YUVX_TO_ALL_RGB16_FUNCS(yuvx) \ +DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgb565le) \ +DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgr565le) \ +DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, rgb555le) \ +DECLARE_FF_YUVX_TO_RGBX_FUNCS(yuvx, bgr555le) \ + +DECLARE_FF_YUVX_TO_ALL_RGB16_FUNCS(yuv420p) +DECLARE_FF_YUVX_TO_ALL_RGB16_FUNCS(yuv422p) + #define DECLARE_FF_YUVA420P_TO_RGBX_FUNCS(ofmt) \ int ff_yuva420p_to_##ofmt##_neon(int w, int h, \ uint8_t *dst, int linesize, \ @@ -217,6 +226,15 @@ DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgr24) DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv12) DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21) +#define DECLARE_FF_NVX_TO_ALL_RGB16_FUNCS(nvx) \ +DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgb565le) \ +DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgr565le) \ 
+DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, rgb555le) \ +DECLARE_FF_NVX_TO_RGBX_FUNCS(nvx, bgr555le) \ + +DECLARE_FF_NVX_TO_ALL_RGB16_FUNCS(nv12) +DECLARE_FF_NVX_TO_ALL_RGB16_FUNCS(nv21) + /* We need a 16 pixel width alignment. This constraint can easily be removed * for input reading but for the output which is 4-bytes per pixel (RGBA) the * assembly might be writing as much as 4*15=60 extra bytes at the end of the @@ -240,6 +258,13 @@ DECLARE_FF_NVX_TO_ALL_RGBX_FUNCS(nv21) SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgr24, BGR24, accurate_rnd); \ } while (0) +#define SET_FF_NVX_TO_ALL_RGB16_FUNC(nvx, NVX, accurate_rnd) do { \ + SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgb565le, RGB565LE, accurate_rnd); \ + SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgr565le, BGR565LE, accurate_rnd); \ + SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, rgb555le, RGB555LE, accurate_rnd); \ + SET_FF_NVX_TO_RGBX_FUNC(nvx, NVX, bgr555le, BGR555LE, accurate_rnd); \ +} while (0) + static void get_unscaled_swscale_neon(SwsInternal *c) { int accurate_rnd = c->opts.flags & SWS_ACCURATE_RND; @@ -247,6 +272,10 @@ static void get_unscaled_swscale_neon(SwsInternal *c) { SET_FF_NVX_TO_ALL_RGBX_FUNC(nv21, NV21, accurate_rnd); SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv420p, YUV420P, accurate_rnd); SET_FF_NVX_TO_ALL_RGBX_FUNC(yuv422p, YUV422P, accurate_rnd); + SET_FF_NVX_TO_ALL_RGB16_FUNC(nv12, NV12, accurate_rnd); + SET_FF_NVX_TO_ALL_RGB16_FUNC(nv21, NV21, accurate_rnd); + SET_FF_NVX_TO_ALL_RGB16_FUNC(yuv420p, YUV420P, accurate_rnd); + SET_FF_NVX_TO_ALL_RGB16_FUNC(yuv422p, YUV422P, accurate_rnd); SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, argb, ARGB, accurate_rnd); SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, rgba, RGBA, accurate_rnd); SET_FF_NVX_TO_RGBX_FUNC(yuva420p, YUVA420P, abgr, ABGR, accurate_rnd); @@ -254,6 +283,11 @@ static void get_unscaled_swscale_neon(SwsInternal *c) { SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb24, RGB24, accurate_rnd); SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr24, BGR24, accurate_rnd); SET_FF_NVX_TO_RGBX_FUNC(yuv420p, 
YUVA420P, gbrp, GBRP, accurate_rnd); + /* yuva420p -> 16bpp: alpha is dropped, route through yuv420p NEON path */ + SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb565le, RGB565LE, accurate_rnd); + SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr565le, BGR565LE, accurate_rnd); + SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, rgb555le, RGB555LE, accurate_rnd); + SET_FF_NVX_TO_RGBX_FUNC(yuv420p, YUVA420P, bgr555le, BGR555LE, accurate_rnd); if (c->opts.dst_format == AV_PIX_FMT_YUV420P && (c->opts.src_format == AV_PIX_FMT_NV24 || c->opts.src_format == AV_PIX_FMT_NV42) && @@ -285,6 +319,10 @@ av_cold SwsFunc ff_yuv2rgb_init_aarch64(SwsInternal *c) case AV_PIX_FMT_RGB24: return yuv420p_to_rgb24_neon_wrapper; case AV_PIX_FMT_BGR24: return yuv420p_to_bgr24_neon_wrapper; case AV_PIX_FMT_GBRP: return yuv420p_to_gbrp_neon_wrapper; + case AV_PIX_FMT_RGB565LE: return yuv420p_to_rgb565le_neon_wrapper; + case AV_PIX_FMT_BGR565LE: return yuv420p_to_bgr565le_neon_wrapper; + case AV_PIX_FMT_RGB555LE: return yuv420p_to_rgb555le_neon_wrapper; + case AV_PIX_FMT_BGR555LE: return yuv420p_to_bgr555le_neon_wrapper; } } else if (c->opts.src_format == AV_PIX_FMT_YUVA420P) { switch (c->opts.dst_format) { @@ -297,6 +335,11 @@ av_cold SwsFunc ff_yuv2rgb_init_aarch64(SwsInternal *c) case AV_PIX_FMT_RGB24: return yuv420p_to_rgb24_neon_wrapper; case AV_PIX_FMT_BGR24: return yuv420p_to_bgr24_neon_wrapper; case AV_PIX_FMT_GBRP: return yuv420p_to_gbrp_neon_wrapper; + /* 16bpp targets drop alpha, share yuv420p path */ + case AV_PIX_FMT_RGB565LE: return yuv420p_to_rgb565le_neon_wrapper; + case AV_PIX_FMT_BGR565LE: return yuv420p_to_bgr565le_neon_wrapper; + case AV_PIX_FMT_RGB555LE: return yuv420p_to_rgb555le_neon_wrapper; + case AV_PIX_FMT_BGR555LE: return yuv420p_to_bgr555le_neon_wrapper; } } else if (c->opts.src_format == AV_PIX_FMT_YUV422P) { switch (c->opts.dst_format) { @@ -307,6 +350,10 @@ av_cold SwsFunc ff_yuv2rgb_init_aarch64(SwsInternal *c) case AV_PIX_FMT_RGB24: return yuv422p_to_rgb24_neon_wrapper; 
case AV_PIX_FMT_BGR24: return yuv422p_to_bgr24_neon_wrapper; case AV_PIX_FMT_GBRP: return yuv422p_to_gbrp_neon_wrapper; + case AV_PIX_FMT_RGB565LE: return yuv422p_to_rgb565le_neon_wrapper; + case AV_PIX_FMT_BGR565LE: return yuv422p_to_bgr565le_neon_wrapper; + case AV_PIX_FMT_RGB555LE: return yuv422p_to_rgb555le_neon_wrapper; + case AV_PIX_FMT_BGR555LE: return yuv422p_to_bgr555le_neon_wrapper; } } return NULL; diff --git a/libswscale/aarch64/yuv2rgb_neon.S b/libswscale/aarch64/yuv2rgb_neon.S index 19f750545f..cf4b08351a 100644 --- a/libswscale/aarch64/yuv2rgb_neon.S +++ b/libswscale/aarch64/yuv2rgb_neon.S @@ -63,7 +63,23 @@ add w17, w0, w0, lsl #1 sub w3, w3, w17 // w3 = linesize - width * 3 (padding) .else + .ifc \ofmt,rgb565le + sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding) + .else + .ifc \ofmt,bgr565le + sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding) + .else + .ifc \ofmt,rgb555le + sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding) + .else + .ifc \ofmt,bgr555le + sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding) + .else sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding) + .endif + .endif + .endif + .endif .endif .endif .endif @@ -96,7 +112,23 @@ add w17, w0, w0, lsl #1 sub w3, w3, w17 // w3 = linesize - width * 3 (padding) .else + .ifc \ofmt,rgb565le + sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding) + .else + .ifc \ofmt,bgr565le + sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding) + .else + .ifc \ofmt,rgb555le + sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding) + .else + .ifc \ofmt,bgr555le + sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding) + .else sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding) + .endif + .endif + .endif + .endif .endif .endif .endif @@ -139,7 +171,23 @@ add w17, w0, w0, lsl #1 sub w3, w3, w17 // w3 = linesize - width * 3 (padding) .else + .ifc \ofmt,rgb565le + sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 
(padding) + .else + .ifc \ofmt,bgr565le + sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding) + .else + .ifc \ofmt,rgb555le + sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding) + .else + .ifc \ofmt,bgr555le + sub w3, w3, w0, lsl #1 // w3 = linesize - width * 2 (padding) + .else sub w3, w3, w0, lsl #2 // w3 = linesize - width * 4 (padding) + .endif + .endif + .endif + .endif .endif .endif .endif @@ -230,9 +278,63 @@ mov \a2, v29.8b // real alpha (next 8 pixels) .endm +// The 16bpp output paths use v8/v9 to assemble packed pixels before the +// final st1. v8/v9 are AAPCS callee-saved (low 64 bits must be preserved), +// so each function spills d8/d9 to the stack on entry and reloads on exit. +// Other output formats don't touch v8-v15, so the save/restore is gated. +.macro save_d8_d9_if_16bpp ofmt +.ifc \ofmt,rgb565le + stp d8, d9, [sp, #-0x10]! +.endif +.ifc \ofmt,bgr565le + stp d8, d9, [sp, #-0x10]! +.endif +.ifc \ofmt,rgb555le + stp d8, d9, [sp, #-0x10]! +.endif +.ifc \ofmt,bgr555le + stp d8, d9, [sp, #-0x10]! +.endif +.endm + +.macro restore_d8_d9_if_16bpp ofmt +.ifc \ofmt,rgb565le + ldp d8, d9, [sp], #0x10 +.endif +.ifc \ofmt,bgr565le + ldp d8, d9, [sp], #0x10 +.endif +.ifc \ofmt,rgb555le + ldp d8, d9, [sp], #0x10 +.endif +.ifc \ofmt,bgr555le + ldp d8, d9, [sp], #0x10 +.endif +.endm + +// Pack 8 pixels of 16bpp output. The three channels are extracted via ushr, +// widened to u16, then merged via shift-left-insert: +// dst = (high << high_shl) | (mid << 5) | low +// For RGB565LE pass (B, G, R) as (low, mid, high), g_shr=2, high_shl=11. +// For BGR565LE pass (R, G, B), g_shr=2, high_shl=11. +// For RGB555LE pass (B, G, R), g_shr=3, high_shl=10. +// For BGR555LE pass (R, G, B), g_shr=3, high_shl=10. +// Clobbers v20-v23. 
+.macro pack_rgb16 dst, low_ch, mid_ch, high_ch, g_shr, high_shl + ushr v20.8b, \high_ch\().8b, #3 + ushr v21.8b, \mid_ch\().8b, #\g_shr + ushr v22.8b, \low_ch\().8b, #3 + uxtl \dst\().8h, v22.8b + uxtl v23.8h, v21.8b + sli \dst\().8h, v23.8h, #5 + uxtl v23.8h, v20.8b + sli \dst\().8h, v23.8h, #\high_shl +.endm + .macro declare_func ifmt ofmt function ff_\ifmt\()_to_\ofmt\()_neon, export=1 load_args_\ifmt \ofmt + save_d8_d9_if_16bpp \ofmt movi v31.8h, #4, lsl #8 // 128 * (1<<3) (loop-invariant) movi v30.8b, #255 // alpha = 255 (loop-invariant) @@ -313,8 +415,40 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1 st1 { v6.8b, v7.8b }, [x10], #16 st1 { v18.8b, v19.8b }, [x15], #16 .else + .ifc \ofmt,rgb565le + compute_rgb v4.8b,v5.8b,v6.8b, v16.8b,v17.8b,v18.8b + // RGB565 LE: (R[7:3] << 11) | (G[7:2] << 5) | B[7:3] + pack_rgb16 v8, v6, v5, v4, 2, 11 + pack_rgb16 v9, v18, v17, v16, 2, 11 + st1 { v8.8h, v9.8h}, [x2], #32 + .else + .ifc \ofmt,bgr565le + compute_rgb v4.8b,v5.8b,v6.8b, v16.8b,v17.8b,v18.8b + // BGR565 LE: (B[7:3] << 11) | (G[7:2] << 5) | R[7:3] + pack_rgb16 v8, v4, v5, v6, 2, 11 + pack_rgb16 v9, v16, v17, v18, 2, 11 + st1 { v8.8h, v9.8h}, [x2], #32 + .else + .ifc \ofmt,rgb555le + compute_rgb v4.8b,v5.8b,v6.8b, v16.8b,v17.8b,v18.8b + // RGB555 LE: (R[7:3] << 10) | (G[7:3] << 5) | B[7:3] + pack_rgb16 v8, v6, v5, v4, 3, 10 + pack_rgb16 v9, v18, v17, v16, 3, 10 + st1 { v8.8h, v9.8h}, [x2], #32 + .else + .ifc \ofmt,bgr555le + compute_rgb v4.8b,v5.8b,v6.8b, v16.8b,v17.8b,v18.8b + // BGR555 LE: (B[7:3] << 10) | (G[7:3] << 5) | R[7:3] + pack_rgb16 v8, v4, v5, v6, 3, 10 + pack_rgb16 v9, v16, v17, v18, 3, 10 + st1 { v8.8h, v9.8h}, [x2], #32 + .else st4 { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32 st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32 + .endif + .endif + .endif + .endif .endif .endif .endif @@ -330,6 +464,7 @@ function ff_\ifmt\()_to_\ofmt\()_neon, export=1 subs w1, w1, #1 // height -= 1 b.gt 1b mov w0, w9 + restore_d8_d9_if_16bpp \ofmt ret endfunc .endm @@ 
-349,6 +484,18 @@ declare_rgb_funcs nv21 declare_rgb_funcs yuv420p declare_rgb_funcs yuv422p +.macro declare_rgb16_funcs ifmt + declare_func \ifmt, rgb565le + declare_func \ifmt, bgr565le + declare_func \ifmt, rgb555le + declare_func \ifmt, bgr555le +.endm + +declare_rgb16_funcs nv12 +declare_rgb16_funcs nv21 +declare_rgb16_funcs yuv420p +declare_rgb16_funcs yuv422p + .macro declare_yuva_funcs ifmt declare_func \ifmt, argb declare_func \ifmt, rgba diff --git a/tests/checkasm/sw_yuv2rgb.c b/tests/checkasm/sw_yuv2rgb.c index 07b967b168..2b3b1eec61 100644 --- a/tests/checkasm/sw_yuv2rgb.c +++ b/tests/checkasm/sw_yuv2rgb.c @@ -19,6 +19,7 @@ #include <string.h> #include "libavutil/common.h" +#include "libavutil/imgutils.h" #include "libavutil/intreadwrite.h" #include "libavutil/mem_internal.h" #include "libavutil/pixdesc.h" @@ -144,10 +145,14 @@ static void check_yuv2rgb(int src_pix_fmt) int width = input_sizes[isi]; int srcSliceY = 0; int srcSliceH = NUM_LINES; + /* Use av_image_get_linesize so that semi-planar formats (NV12, + * NV21) get the correct interleaved-UV stride (= width bytes), + * not (width >> log2_chroma_w) which would only count UV pairs. */ + int chroma_linesize = av_image_get_linesize(src_pix_fmt, width, 1); int srcStride[4] = { width + SRC_STRIDE_PAD, - (width >> src_desc->log2_chroma_w) + SRC_STRIDE_PAD, - (width >> src_desc->log2_chroma_w) + SRC_STRIDE_PAD, + chroma_linesize + SRC_STRIDE_PAD, + chroma_linesize + SRC_STRIDE_PAD, width + SRC_STRIDE_PAD, }; int dstStride[4] = { @@ -239,4 +244,8 @@ void checkasm_check_sw_yuv2rgb(void) report("yuv422p"); check_yuv2rgb(AV_PIX_FMT_YUVA420P); report("yuva420p"); + check_yuv2rgb(AV_PIX_FMT_NV12); + report("nv12"); + check_yuv2rgb(AV_PIX_FMT_NV21); + report("nv21"); } -- 2.50.1 (Apple Git-155) _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
