PR #23523 opened by Ramiro Polla (ramiro)
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23523
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/23523.patch
>From e6b8fa45e091d3ecec53a4fc336ccce00a57649c Mon Sep 17 00:00:00 2001 From: Ramiro Polla <[email protected]> Date: Sat, 13 Jun 2026 00:51:50 +0200 Subject: [PATCH 1/2] swscale/uops: skip offset from unity detection for linear There is no easy optimization that can be triggered by knowing that the offset is exactly 1. This led to identical functions being instantiated for different params. --- libswscale/uops.c | 2 +- libswscale/uops_macros.h | 34 ++++++++++------------------------ tests/ref/fate/sws-ops-list | 2 +- 3 files changed, 12 insertions(+), 26 deletions(-) diff --git a/libswscale/uops.c b/libswscale/uops.c index b73aedb6e1..f527afb0ef 100644 --- a/libswscale/uops.c +++ b/libswscale/uops.c @@ -712,7 +712,7 @@ static int translate_linear_op(SwsContext *ctx, SwsUOpList *ops, uop.data.mat4[i][j] = px; if (k.num == 0) uop.par.lin.zero |= SWS_MASK(i, j); - else if (k.num == k.den) + else if (j < 4 && k.num == k.den) uop.par.lin.one |= SWS_MASK(i, j); else if (j < 4 && (!bitexact || exact_prod(uop.type, px, input, j))) exact |= SWS_MASK(i, j); diff --git a/libswscale/uops_macros.h b/libswscale/uops_macros.h index f63d046aa3..3a7ca8ece9 100644 --- a/libswscale/uops_macros.h +++ b/libswscale/uops_macros.h @@ -1142,18 +1142,15 @@ #define SWS_FOR_F32_CLEAR(MACRO, ...) #define SWS_FOR_STRUCT_F32_CLEAR(MACRO, ...) #define SWS_FOR_F32_LINEAR(MACRO, ...) 
\ + MACRO(__VA_ARGS__, f32_linear_x_xxx0x , SWS_PIXEL_F32, SWS_UOP_LINEAR , 0x1, 0x41040, 0xbefa8) \ MACRO(__VA_ARGS__, f32_linear_x_x000x , SWS_PIXEL_F32, SWS_UOP_LINEAR , 0x1, 0x41040, 0xbefae) \ MACRO(__VA_ARGS__, f32_linear_x_xxx00 , SWS_PIXEL_F32, SWS_UOP_LINEAR , 0x1, 0x41040, 0xbefb8) \ - MACRO(__VA_ARGS__, f32_linear_x_xxx01 , SWS_PIXEL_F32, SWS_UOP_LINEAR , 0x1, 0x41050, 0xbefa8) \ - MACRO(__VA_ARGS__, f32_linear_x_x0001 , SWS_PIXEL_F32, SWS_UOP_LINEAR , 0x1, 0x41050, 0xbefae) \ MACRO(__VA_ARGS__, f32_linear_y_0x000 , SWS_PIXEL_F32, SWS_UOP_LINEAR , 0x2, 0x41001, 0xbefbe) \ MACRO(__VA_ARGS__, f32_linear_xyz_xxx0x_xxx0x_xxx0x , SWS_PIXEL_F32, SWS_UOP_LINEAR , 0x7, 0x40000, 0xba108) \ MACRO(__VA_ARGS__, f32_linear_xyz_x0x0x_xxx0x_xx00x , SWS_PIXEL_F32, SWS_UOP_LINEAR , 0x7, 0x40000, 0xbb10a) \ MACRO(__VA_ARGS__, f32_linear_xyz_xxx00_xxx0x_xxx0x , SWS_PIXEL_F32, SWS_UOP_LINEAR , 0x7, 0x40000, 0xba118) \ MACRO(__VA_ARGS__, f32_linear_xyz_x000x_0x00x_00x0x , SWS_PIXEL_F32, SWS_UOP_LINEAR , 0x7, 0x40000, 0xbadae) \ MACRO(__VA_ARGS__, f32_linear_xyz_x0000_0x000_00x00 , SWS_PIXEL_F32, SWS_UOP_LINEAR , 0x7, 0x40000, 0xbefbe) \ - MACRO(__VA_ARGS__, f32_linear_xyz_x0001_0x00x_00x01 , SWS_PIXEL_F32, SWS_UOP_LINEAR , 0x7, 0x44010, 0xbadae) \ - MACRO(__VA_ARGS__, f32_linear_xyz_x0001_0x001_00x01 , SWS_PIXEL_F32, SWS_UOP_LINEAR , 0x7, 0x44210, 0xbadae) \ MACRO(__VA_ARGS__, f32_linear_xyz_10x0x_1xx0x_1x00x , SWS_PIXEL_F32, SWS_UOP_LINEAR , 0x7, 0x40421, 0xbb10a) \ MACRO(__VA_ARGS__, f32_linear_w_000x0 , SWS_PIXEL_F32, SWS_UOP_LINEAR , 0x8, 0x01041, 0xbefbe) \ MACRO(__VA_ARGS__, f32_linear_xw_x000x_000x0 , SWS_PIXEL_F32, SWS_UOP_LINEAR , 0x9, 0x01040, 0xbefae) \ @@ -1162,18 +1159,15 @@ MACRO(__VA_ARGS__, f32_linear_xyzw_x0x0x_xxx0x_xx00x_000x0 , SWS_PIXEL_F32, SWS_UOP_LINEAR , 0xf, 0x00000, 0xbb10a) \ MACRO(__VA_ARGS__, f32_linear_xyzw_x0000_0x000_00x00_000x0 , SWS_PIXEL_F32, SWS_UOP_LINEAR , 0xf, 0x00000, 0xbefbe) #define SWS_FOR_STRUCT_F32_LINEAR(MACRO, ...) 
\ + MACRO(__VA_ARGS__, f32_linear_x_xxx0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefa8) \ MACRO(__VA_ARGS__, f32_linear_x_x000x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefae) \ MACRO(__VA_ARGS__, f32_linear_x_xxx00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefb8) \ - MACRO(__VA_ARGS__, f32_linear_x_xxx01 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR , .mask = 0x1, .par.lin.one = 0x41050, .par.lin.zero = 0xbefa8) \ - MACRO(__VA_ARGS__, f32_linear_x_x0001 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR , .mask = 0x1, .par.lin.one = 0x41050, .par.lin.zero = 0xbefae) \ MACRO(__VA_ARGS__, f32_linear_y_0x000 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR , .mask = 0x2, .par.lin.one = 0x41001, .par.lin.zero = 0xbefbe) \ MACRO(__VA_ARGS__, f32_linear_xyz_xxx0x_xxx0x_xxx0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba108) \ MACRO(__VA_ARGS__, f32_linear_xyz_x0x0x_xxx0x_xx00x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xbb10a) \ MACRO(__VA_ARGS__, f32_linear_xyz_xxx00_xxx0x_xxx0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba118) \ MACRO(__VA_ARGS__, f32_linear_xyz_x000x_0x00x_00x0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xbadae) \ MACRO(__VA_ARGS__, f32_linear_xyz_x0000_0x000_00x00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xbefbe) \ - MACRO(__VA_ARGS__, f32_linear_xyz_x0001_0x00x_00x01 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR , .mask = 0x7, .par.lin.one = 0x44010, .par.lin.zero = 0xbadae) \ - MACRO(__VA_ARGS__, f32_linear_xyz_x0001_0x001_00x01 , .type = SWS_PIXEL_F32, .uop = 
SWS_UOP_LINEAR , .mask = 0x7, .par.lin.one = 0x44210, .par.lin.zero = 0xbadae) \ MACRO(__VA_ARGS__, f32_linear_xyz_10x0x_1xx0x_1x00x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR , .mask = 0x7, .par.lin.one = 0x40421, .par.lin.zero = 0xbb10a) \ MACRO(__VA_ARGS__, f32_linear_w_000x0 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR , .mask = 0x8, .par.lin.one = 0x1041, .par.lin.zero = 0xbefbe) \ MACRO(__VA_ARGS__, f32_linear_xw_x000x_000x0 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR , .mask = 0x9, .par.lin.one = 0x1040, .par.lin.zero = 0xbefae) \ @@ -1182,6 +1176,10 @@ MACRO(__VA_ARGS__, f32_linear_xyzw_x0x0x_xxx0x_xx00x_000x0 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR , .mask = 0xf, .par.lin.one = 0x0, .par.lin.zero = 0xbb10a) \ MACRO(__VA_ARGS__, f32_linear_xyzw_x0000_0x000_00x00_000x0 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR , .mask = 0xf, .par.lin.one = 0x0, .par.lin.zero = 0xbefbe) #define SWS_FOR_F32_LINEAR_FMA(MACRO, ...) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_xxx0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefa8, 0xfffe8) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_XXx0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefa8, 0xfffeb) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_xXX0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefa8, 0xfffee) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_XXX0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefa8, 0xfffef) \ MACRO(__VA_ARGS__, f32_linear_fma_x_x000x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefae, 0xfffee) \ MACRO(__VA_ARGS__, f32_linear_fma_x_X000x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefae, 0xfffef) \ MACRO(__VA_ARGS__, f32_linear_fma_x_xxx00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefb8, 0xffff8) \ @@ -1189,12 +1187,6 @@ MACRO(__VA_ARGS__, f32_linear_fma_x_XxX00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefb8, 0xffffd) \ MACRO(__VA_ARGS__, f32_linear_fma_x_xXX00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 
0xbefb8, 0xffffe) \ MACRO(__VA_ARGS__, f32_linear_fma_x_XXX00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefb8, 0xfffff) \ - MACRO(__VA_ARGS__, f32_linear_fma_x_xxx01 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41050, 0xbefa8, 0xffff8) \ - MACRO(__VA_ARGS__, f32_linear_fma_x_XXx01 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41050, 0xbefa8, 0xffffb) \ - MACRO(__VA_ARGS__, f32_linear_fma_x_xXX01 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41050, 0xbefa8, 0xffffe) \ - MACRO(__VA_ARGS__, f32_linear_fma_x_XXX01 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41050, 0xbefa8, 0xfffff) \ - MACRO(__VA_ARGS__, f32_linear_fma_x_x0001 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41050, 0xbefae, 0xffffe) \ - MACRO(__VA_ARGS__, f32_linear_fma_x_X0001 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41050, 0xbefae, 0xfffff) \ MACRO(__VA_ARGS__, f32_linear_fma_y_0x000 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x2, 0x41001, 0xbefbe, 0xfffbf) \ MACRO(__VA_ARGS__, f32_linear_fma_y_0X000 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x2, 0x41001, 0xbefbe, 0xfffff) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_xxx0x_xxx0x_xxx0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xba108, 0xfa108) \ @@ -1215,8 +1207,6 @@ MACRO(__VA_ARGS__, f32_linear_fma_xyz_x0000_0X000_00X00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xbefbe, 0xffffe) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0000_0X000_00x00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xbefbe, 0xfefff) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0000_0X000_00X00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xbefbe, 0xfffff) \ - MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0001_0X00x_00X01 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x44010, 0xbadae, 0xffdff) \ - MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0001_0X001_00X01 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x44210, 0xbadae, 0xfffff) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_10X0x_1XX0x_1X00x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40421, 0xbb10a, 0xfbdef) \ MACRO(__VA_ARGS__, 
f32_linear_fma_w_000X0 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x8, 0x01041, 0xbefbe, 0xfffff) \ MACRO(__VA_ARGS__, f32_linear_fma_xw_x000x_000x0 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x9, 0x01040, 0xbefae, 0xbffee) \ @@ -1228,6 +1218,10 @@ MACRO(__VA_ARGS__, f32_linear_fma_xyzw_X0X0x_XXX0x_XX00x_000X0, SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0xf, 0x00000, 0xbb10a, 0xfbdef) \ MACRO(__VA_ARGS__, f32_linear_fma_xyzw_X0000_0X000_00X00_000X0, SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0xf, 0x00000, 0xbefbe, 0xfffff) #define SWS_FOR_STRUCT_F32_LINEAR_FMA(MACRO, ...) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_xxx0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefa8, .par.lin.exact = 0xfffe8) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_XXx0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefa8, .par.lin.exact = 0xfffeb) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_xXX0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefa8, .par.lin.exact = 0xfffee) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_XXX0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefa8, .par.lin.exact = 0xfffef) \ MACRO(__VA_ARGS__, f32_linear_fma_x_x000x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefae, .par.lin.exact = 0xfffee) \ MACRO(__VA_ARGS__, f32_linear_fma_x_X000x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefae, .par.lin.exact = 0xfffef) \ MACRO(__VA_ARGS__, f32_linear_fma_x_xxx00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefb8, .par.lin.exact = 0xffff8) \ @@ -1235,12 +1229,6 @@ MACRO(__VA_ARGS__, f32_linear_fma_x_XxX00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, 
.par.lin.one = 0x41040, .par.lin.zero = 0xbefb8, .par.lin.exact = 0xffffd) \ MACRO(__VA_ARGS__, f32_linear_fma_x_xXX00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefb8, .par.lin.exact = 0xffffe) \ MACRO(__VA_ARGS__, f32_linear_fma_x_XXX00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefb8, .par.lin.exact = 0xfffff) \ - MACRO(__VA_ARGS__, f32_linear_fma_x_xxx01 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41050, .par.lin.zero = 0xbefa8, .par.lin.exact = 0xffff8) \ - MACRO(__VA_ARGS__, f32_linear_fma_x_XXx01 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41050, .par.lin.zero = 0xbefa8, .par.lin.exact = 0xffffb) \ - MACRO(__VA_ARGS__, f32_linear_fma_x_xXX01 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41050, .par.lin.zero = 0xbefa8, .par.lin.exact = 0xffffe) \ - MACRO(__VA_ARGS__, f32_linear_fma_x_XXX01 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41050, .par.lin.zero = 0xbefa8, .par.lin.exact = 0xfffff) \ - MACRO(__VA_ARGS__, f32_linear_fma_x_x0001 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41050, .par.lin.zero = 0xbefae, .par.lin.exact = 0xffffe) \ - MACRO(__VA_ARGS__, f32_linear_fma_x_X0001 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41050, .par.lin.zero = 0xbefae, .par.lin.exact = 0xfffff) \ MACRO(__VA_ARGS__, f32_linear_fma_y_0x000 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x2, .par.lin.one = 0x41001, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xfffbf) \ MACRO(__VA_ARGS__, f32_linear_fma_y_0X000 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x2, .par.lin.one = 0x41001, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xfffff) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_xxx0x_xxx0x_xxx0x , 
.type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba108, .par.lin.exact = 0xfa108) \ @@ -1261,8 +1249,6 @@ MACRO(__VA_ARGS__, f32_linear_fma_xyz_x0000_0X000_00X00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xffffe) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0000_0X000_00x00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xfefff) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0000_0X000_00X00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xfffff) \ - MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0001_0X00x_00X01 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x44010, .par.lin.zero = 0xbadae, .par.lin.exact = 0xffdff) \ - MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0001_0X001_00X01 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x44210, .par.lin.zero = 0xbadae, .par.lin.exact = 0xfffff) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_10X0x_1XX0x_1X00x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40421, .par.lin.zero = 0xbb10a, .par.lin.exact = 0xfbdef) \ MACRO(__VA_ARGS__, f32_linear_fma_w_000X0 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x8, .par.lin.one = 0x1041, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xfffff) \ MACRO(__VA_ARGS__, f32_linear_fma_xw_x000x_000x0 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x9, .par.lin.one = 0x1040, .par.lin.zero = 0xbefae, .par.lin.exact = 0xbffee) \ diff --git a/tests/ref/fate/sws-ops-list b/tests/ref/fate/sws-ops-list index 68a1fc1105..6b4003121a 100644 --- a/tests/ref/fate/sws-ops-list +++ b/tests/ref/fate/sws-ops-list @@ -1 +1 @@ -e2f26cb6df5c11015e613016bb1a004a +bbe27c8c324f08d933f6397f5fb96650 -- 
2.52.0 >From 3f7421f89f3679ae1354039567966582e96d4546 Mon Sep 17 00:00:00 2001 From: Ramiro Polla <[email protected]> Date: Sat, 13 Jun 2026 01:41:47 +0200 Subject: [PATCH 2/2] swscale/uops: relax detection of exact computations in linear The first computation in a linear row doesn't have anything to accumulate to, so a multiply-accumulate instruction won't be used either way. This led to identical functions being instantiated for different params. --- libswscale/uops.c | 5 +++- libswscale/uops_macros.h | 54 ++++++++++++++++------------------------ 2 files changed, 26 insertions(+), 33 deletions(-) diff --git a/libswscale/uops.c b/libswscale/uops.c index f527afb0ef..1bd3e2f763 100644 --- a/libswscale/uops.c +++ b/libswscale/uops.c @@ -706,6 +706,7 @@ static int translate_linear_op(SwsContext *ctx, SwsUOpList *ops, for (int i = 0; i < 4; i++) { if (SWS_OP_NEEDED(op, i) && (op->lin.mask & SWS_MASK_ROW(i))) uop.mask |= SWS_COMP(i); + bool nonzero = (op->lin.m[i][4].num != 0); for (int j = 0; j < 5; j++) { const AVRational k = op->lin.m[i][j]; const SwsPixel px = Q2PIXEL(k); @@ -714,8 +715,10 @@ static int translate_linear_op(SwsContext *ctx, SwsUOpList *ops, uop.par.lin.zero |= SWS_MASK(i, j); else if (j < 4 && k.num == k.den) uop.par.lin.one |= SWS_MASK(i, j); - else if (j < 4 && (!bitexact || exact_prod(uop.type, px, input, j))) + else if (j < 4 && nonzero && (!bitexact || exact_prod(uop.type, px, input, j))) exact |= SWS_MASK(i, j); + if (k.num != 0) + nonzero = true; } } diff --git a/libswscale/uops_macros.h b/libswscale/uops_macros.h index 3a7ca8ece9..3c4d6b6a3e 100644 --- a/libswscale/uops_macros.h +++ b/libswscale/uops_macros.h @@ -1183,12 +1183,10 @@ MACRO(__VA_ARGS__, f32_linear_fma_x_x000x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefae, 0xfffee) \ MACRO(__VA_ARGS__, f32_linear_fma_x_X000x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefae, 0xfffef) \ MACRO(__VA_ARGS__, f32_linear_fma_x_xxx00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 
0x41040, 0xbefb8, 0xffff8) \ - MACRO(__VA_ARGS__, f32_linear_fma_x_XXx00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefb8, 0xffffb) \ - MACRO(__VA_ARGS__, f32_linear_fma_x_XxX00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefb8, 0xffffd) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_xXx00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefb8, 0xffffa) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_xxX00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefb8, 0xffffc) \ MACRO(__VA_ARGS__, f32_linear_fma_x_xXX00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefb8, 0xffffe) \ - MACRO(__VA_ARGS__, f32_linear_fma_x_XXX00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x1, 0x41040, 0xbefb8, 0xfffff) \ MACRO(__VA_ARGS__, f32_linear_fma_y_0x000 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x2, 0x41001, 0xbefbe, 0xfffbf) \ - MACRO(__VA_ARGS__, f32_linear_fma_y_0X000 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x2, 0x41001, 0xbefbe, 0xfffff) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_xxx0x_xxx0x_xxx0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xba108, 0xfa108) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXx0x_XXx0x_XXx0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xba108, 0xfad6b) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXX0x_XxX0x_XXX0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xba108, 0xfbdaf) \ @@ -1197,26 +1195,23 @@ MACRO(__VA_ARGS__, f32_linear_fma_xyz_x0x0x_xxx0x_xx00x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xbb10a, 0xfb10a) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0X0x_XXX0x_XX00x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xbb10a, 0xfbdef) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_xxx00_xxx0x_xxx0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xba118, 0xfa118) \ - MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXx00_XXx0x_XXx0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xba118, 0xfad7b) \ - MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXX00_XxX0x_XXX0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xba118, 
0xfbdbf) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_xXx00_XXx0x_XXx0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xba118, 0xfad7a) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_xXX00_XxX0x_XXX0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xba118, 0xfbdbe) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_xXX00_xXX0x_xXX0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xba118, 0xfb9de) \ - MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXX00_XXX0x_XXX0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xba118, 0xfbdff) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_xXX00_XXX0x_XXX0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xba118, 0xfbdfe) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_x000x_0x00x_00x0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xbadae, 0xfadae) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_X000x_0X00x_00X0x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xbadae, 0xfbdef) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_x0000_0x000_00x00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xbefbe, 0xfefbe) \ - MACRO(__VA_ARGS__, f32_linear_fma_xyz_x0000_0X000_00X00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xbefbe, 0xffffe) \ - MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0000_0X000_00x00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xbefbe, 0xfefff) \ - MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0000_0X000_00X00 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40000, 0xbefbe, 0xfffff) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_10X0x_1XX0x_1X00x , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x7, 0x40421, 0xbb10a, 0xfbdef) \ - MACRO(__VA_ARGS__, f32_linear_fma_w_000X0 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x8, 0x01041, 0xbefbe, 0xfffff) \ + MACRO(__VA_ARGS__, f32_linear_fma_w_000x0 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x8, 0x01041, 0xbefbe, 0xbffff) \ MACRO(__VA_ARGS__, f32_linear_fma_xw_x000x_000x0 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x9, 0x01040, 0xbefae, 0xbffee) \ - MACRO(__VA_ARGS__, f32_linear_fma_xw_X000x_000X0 , SWS_PIXEL_F32, 
SWS_UOP_LINEAR_FMA , 0x9, 0x01040, 0xbefae, 0xfffef) \ + MACRO(__VA_ARGS__, f32_linear_fma_xw_X000x_000x0 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x9, 0x01040, 0xbefae, 0xbffef) \ MACRO(__VA_ARGS__, f32_linear_fma_xw_xxx00_000x0 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x9, 0x01040, 0xbefb8, 0xbfff8) \ - MACRO(__VA_ARGS__, f32_linear_fma_xw_XXX00_000X0 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x9, 0x01040, 0xbefb8, 0xfffff) \ + MACRO(__VA_ARGS__, f32_linear_fma_xw_xXX00_000x0 , SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0x9, 0x01040, 0xbefb8, 0xbfffe) \ MACRO(__VA_ARGS__, f32_linear_fma_xyzw_xxx0x_xxx0x_xxx0x_000x0, SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0xf, 0x00000, 0xba108, 0xba108) \ - MACRO(__VA_ARGS__, f32_linear_fma_xyzw_XXX0x_XXX0x_XXX0x_000X0, SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0xf, 0x00000, 0xba108, 0xfbdef) \ - MACRO(__VA_ARGS__, f32_linear_fma_xyzw_X0X0x_XXX0x_XX00x_000X0, SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0xf, 0x00000, 0xbb10a, 0xfbdef) \ - MACRO(__VA_ARGS__, f32_linear_fma_xyzw_X0000_0X000_00X00_000X0, SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0xf, 0x00000, 0xbefbe, 0xfffff) + MACRO(__VA_ARGS__, f32_linear_fma_xyzw_XXX0x_XXX0x_XXX0x_000x0, SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0xf, 0x00000, 0xba108, 0xbbdef) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyzw_X0X0x_XXX0x_XX00x_000x0, SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0xf, 0x00000, 0xbb10a, 0xbbdef) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyzw_x0000_0x000_00x00_000x0, SWS_PIXEL_F32, SWS_UOP_LINEAR_FMA , 0xf, 0x00000, 0xbefbe, 0xbefbe) #define SWS_FOR_STRUCT_F32_LINEAR_FMA(MACRO, ...) 
\ MACRO(__VA_ARGS__, f32_linear_fma_x_xxx0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefa8, .par.lin.exact = 0xfffe8) \ MACRO(__VA_ARGS__, f32_linear_fma_x_XXx0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefa8, .par.lin.exact = 0xfffeb) \ @@ -1225,12 +1220,10 @@ MACRO(__VA_ARGS__, f32_linear_fma_x_x000x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefae, .par.lin.exact = 0xfffee) \ MACRO(__VA_ARGS__, f32_linear_fma_x_X000x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefae, .par.lin.exact = 0xfffef) \ MACRO(__VA_ARGS__, f32_linear_fma_x_xxx00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefb8, .par.lin.exact = 0xffff8) \ - MACRO(__VA_ARGS__, f32_linear_fma_x_XXx00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefb8, .par.lin.exact = 0xffffb) \ - MACRO(__VA_ARGS__, f32_linear_fma_x_XxX00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefb8, .par.lin.exact = 0xffffd) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_xXx00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefb8, .par.lin.exact = 0xffffa) \ + MACRO(__VA_ARGS__, f32_linear_fma_x_xxX00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefb8, .par.lin.exact = 0xffffc) \ MACRO(__VA_ARGS__, f32_linear_fma_x_xXX00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, .par.lin.one = 0x41040, .par.lin.zero = 0xbefb8, .par.lin.exact = 0xffffe) \ - MACRO(__VA_ARGS__, f32_linear_fma_x_XXX00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x1, 
.par.lin.one = 0x41040, .par.lin.zero = 0xbefb8, .par.lin.exact = 0xfffff) \ MACRO(__VA_ARGS__, f32_linear_fma_y_0x000 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x2, .par.lin.one = 0x41001, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xfffbf) \ - MACRO(__VA_ARGS__, f32_linear_fma_y_0X000 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x2, .par.lin.one = 0x41001, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xfffff) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_xxx0x_xxx0x_xxx0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba108, .par.lin.exact = 0xfa108) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXx0x_XXx0x_XXx0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba108, .par.lin.exact = 0xfad6b) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXX0x_XxX0x_XXX0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba108, .par.lin.exact = 0xfbdaf) \ @@ -1239,26 +1232,23 @@ MACRO(__VA_ARGS__, f32_linear_fma_xyz_x0x0x_xxx0x_xx00x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xbb10a, .par.lin.exact = 0xfb10a) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0X0x_XXX0x_XX00x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xbb10a, .par.lin.exact = 0xfbdef) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_xxx00_xxx0x_xxx0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba118, .par.lin.exact = 0xfa118) \ - MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXx00_XXx0x_XXx0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba118, .par.lin.exact = 0xfad7b) \ - MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXX00_XxX0x_XXX0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, 
.par.lin.one = 0x40000, .par.lin.zero = 0xba118, .par.lin.exact = 0xfbdbf) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_xXx00_XXx0x_XXx0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba118, .par.lin.exact = 0xfad7a) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_xXX00_XxX0x_XXX0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba118, .par.lin.exact = 0xfbdbe) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_xXX00_xXX0x_xXX0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba118, .par.lin.exact = 0xfb9de) \ - MACRO(__VA_ARGS__, f32_linear_fma_xyz_XXX00_XXX0x_XXX0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba118, .par.lin.exact = 0xfbdff) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyz_xXX00_XXX0x_XXX0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xba118, .par.lin.exact = 0xfbdfe) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_x000x_0x00x_00x0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xbadae, .par.lin.exact = 0xfadae) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_X000x_0X00x_00X0x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xbadae, .par.lin.exact = 0xfbdef) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_x0000_0x000_00x00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xfefbe) \ - MACRO(__VA_ARGS__, f32_linear_fma_xyz_x0000_0X000_00X00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xffffe) \ - MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0000_0X000_00x00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 
0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xfefff) \ - MACRO(__VA_ARGS__, f32_linear_fma_xyz_X0000_0X000_00X00 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40000, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xfffff) \ MACRO(__VA_ARGS__, f32_linear_fma_xyz_10X0x_1XX0x_1X00x , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x7, .par.lin.one = 0x40421, .par.lin.zero = 0xbb10a, .par.lin.exact = 0xfbdef) \ - MACRO(__VA_ARGS__, f32_linear_fma_w_000X0 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x8, .par.lin.one = 0x1041, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xfffff) \ + MACRO(__VA_ARGS__, f32_linear_fma_w_000x0 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x8, .par.lin.one = 0x1041, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xbffff) \ MACRO(__VA_ARGS__, f32_linear_fma_xw_x000x_000x0 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x9, .par.lin.one = 0x1040, .par.lin.zero = 0xbefae, .par.lin.exact = 0xbffee) \ - MACRO(__VA_ARGS__, f32_linear_fma_xw_X000x_000X0 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x9, .par.lin.one = 0x1040, .par.lin.zero = 0xbefae, .par.lin.exact = 0xfffef) \ + MACRO(__VA_ARGS__, f32_linear_fma_xw_X000x_000x0 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x9, .par.lin.one = 0x1040, .par.lin.zero = 0xbefae, .par.lin.exact = 0xbffef) \ MACRO(__VA_ARGS__, f32_linear_fma_xw_xxx00_000x0 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x9, .par.lin.one = 0x1040, .par.lin.zero = 0xbefb8, .par.lin.exact = 0xbfff8) \ - MACRO(__VA_ARGS__, f32_linear_fma_xw_XXX00_000X0 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x9, .par.lin.one = 0x1040, .par.lin.zero = 0xbefb8, .par.lin.exact = 0xfffff) \ + MACRO(__VA_ARGS__, f32_linear_fma_xw_xXX00_000x0 , .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0x9, .par.lin.one = 0x1040, .par.lin.zero = 0xbefb8, .par.lin.exact = 
0xbfffe) \ MACRO(__VA_ARGS__, f32_linear_fma_xyzw_xxx0x_xxx0x_xxx0x_000x0, .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0xf, .par.lin.one = 0x0, .par.lin.zero = 0xba108, .par.lin.exact = 0xba108) \ - MACRO(__VA_ARGS__, f32_linear_fma_xyzw_XXX0x_XXX0x_XXX0x_000X0, .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0xf, .par.lin.one = 0x0, .par.lin.zero = 0xba108, .par.lin.exact = 0xfbdef) \ - MACRO(__VA_ARGS__, f32_linear_fma_xyzw_X0X0x_XXX0x_XX00x_000X0, .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0xf, .par.lin.one = 0x0, .par.lin.zero = 0xbb10a, .par.lin.exact = 0xfbdef) \ - MACRO(__VA_ARGS__, f32_linear_fma_xyzw_X0000_0X000_00X00_000X0, .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0xf, .par.lin.one = 0x0, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xfffff) + MACRO(__VA_ARGS__, f32_linear_fma_xyzw_XXX0x_XXX0x_XXX0x_000x0, .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0xf, .par.lin.one = 0x0, .par.lin.zero = 0xba108, .par.lin.exact = 0xbbdef) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyzw_X0X0x_XXX0x_XX00x_000x0, .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0xf, .par.lin.one = 0x0, .par.lin.zero = 0xbb10a, .par.lin.exact = 0xbbdef) \ + MACRO(__VA_ARGS__, f32_linear_fma_xyzw_x0000_0x000_00x00_000x0, .type = SWS_PIXEL_F32, .uop = SWS_UOP_LINEAR_FMA , .mask = 0xf, .par.lin.one = 0x0, .par.lin.zero = 0xbefbe, .par.lin.exact = 0xbefbe) #define SWS_FOR_F32_DITHER(MACRO, ...) \ MACRO(__VA_ARGS__, f32_dither_x_0_16x16 , SWS_PIXEL_F32, SWS_UOP_DITHER , 0x1, 0, 0, 0, 0, 4) \ MACRO(__VA_ARGS__, f32_dither_y_3_16x16 , SWS_PIXEL_F32, SWS_UOP_DITHER , 0x2, 0, 3, 0, 0, 4) \ -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
