This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit 9cbd8896708050f12ccfcb1dd9aefda9e57dc349
Author:     Niklas Haas <[email protected]>
AuthorDate: Tue Jun 9 12:49:20 2026 +0200
Commit:     Niklas Haas <[email protected]>
CommitDate: Sat Jun 20 14:08:49 2026 +0000

    swscale/x86/ops: add AVX2/SSE4 path for SWS_UOP_READ_PALETTE
    
    The AVX2 path is a fairly straightforward vpgatherdd + 4x4 transpose. The
    SSE4 fallback is an unrolled scalar loop, for lack of anything better to do.
    
    checkasm:
     - CPU: AMD Ryzen 9 9950X3D 16-Core Processor (00B40F40)
     - Timing source: x86 (rdtsc)
     - Bench duration: 10000 µs per function (45898205 cycles)
     - Random seed: 2518020648
    
    Benchmark results:
      name                             cycles (vs ref)
      u8_read_palette_xyzw_c:          2877.5
      u8_read_palette_xyzw_x86_sse4:   1951.9 ( 1.47x)
      u8_read_palette_xyzw_x86_avx2:   1051.6 ( 2.74x)
    
    Sponsored-by: Sovereign Tech Fund
    Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/x86/ops.c             |  2 ++
 libswscale/x86/ops_int.asm       | 57 ++++++++++++++++++++++++++++++++++++++++
 libswscale/x86/uops_macros.asm.h |  1 +
 3 files changed, 60 insertions(+)

diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index b522ca06e3..31c2199622 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -312,6 +312,7 @@ static bool uop_is_type_invariant(const SwsUOpType uop)
 SWS_FOR_STRUCT(TYPE, READ_PACKED,     DECL_ENTRY, EXT, NULL, setup_rw_packed)   \
 SWS_FOR_STRUCT(TYPE, READ_NIBBLE,     DECL_ENTRY, EXT, NULL, NULL)              \
 SWS_FOR_STRUCT(TYPE, READ_BIT,        DECL_ENTRY, EXT, NULL, NULL)              \
+SWS_FOR_STRUCT(TYPE, READ_PALETTE,    DECL_ENTRY, EXT, NULL, NULL)              \
 SWS_FOR_STRUCT(TYPE, WRITE_PACKED,    DECL_ENTRY, EXT, NULL, setup_rw_packed)   \
 SWS_FOR_STRUCT(TYPE, WRITE_NIBBLE,    DECL_ENTRY, EXT, NULL, NULL)              \
 SWS_FOR_STRUCT(TYPE, WRITE_BIT,       DECL_ENTRY, EXT, NULL, NULL)              \
@@ -334,6 +335,7 @@ SWS_FOR_STRUCT(TYPE, DITHER,          DECL_ENTRY, EXT, NULL, setup_dither)
     SWS_FOR(TYPE, READ_PACKED,    REF_ENTRY, EXT)                               \
     SWS_FOR(TYPE, READ_NIBBLE,    REF_ENTRY, EXT)                               \
     SWS_FOR(TYPE, READ_BIT,       REF_ENTRY, EXT)                               \
+    SWS_FOR(TYPE, READ_PALETTE,   REF_ENTRY, EXT)                               \
     SWS_FOR(TYPE, WRITE_PACKED,   REF_ENTRY, EXT)                               \
     SWS_FOR(TYPE, WRITE_NIBBLE,   REF_ENTRY, EXT)                               \
     SWS_FOR(TYPE, WRITE_BIT,      REF_ENTRY, EXT)                               \
diff --git a/libswscale/x86/ops_int.asm b/libswscale/x86/ops_int.asm
index 111e6d0796..ce9ab1fdc9 100644
--- a/libswscale/x86/ops_int.asm
+++ b/libswscale/x86/ops_int.asm
@@ -282,6 +282,62 @@ IF1 V2, read_packed34 mx2, my2, mz2, mw2, in0q + mmsize * COMPS
         CONTINUE tmp0q
 %endmacro
 
+%macro read_pal8 6 ; x, y, z, w, palette, index -- look up one 4-byte palette entry per index byte, split into x/y/z/w
+%if cpuflag(avx2)
+        pmovzxbd %1, [%6 + 0]   ; zero-extend index bytes to dwords so they can serve as gather offsets
+        pmovzxbd %2, [%6 + 8]
+        pmovzxbd %3, [%6 + 16]
+        pmovzxbd %4, [%6 + 24]
+        vperm2i128 m8,  %1, %3, q0200 ; m8  = { low  half of %1 | low  half of %3 } (m8-m11 are scratch)
+        vperm2i128 m9,  %1, %3, q0301 ; m9  = { high half of %1 | high half of %3 }
+        vperm2i128 m10, %2, %4, q0200 ; m10 = { low  half of %2 | low  half of %4 }
+        vperm2i128 m11, %2, %4, q0301 ; m11 = { high half of %2 | high half of %4 }
+        pcmpeqb m14, m14        ; all-ones gather masks (m14/m15 are clobbered)
+        pcmpeqb m15, m15
+        vpgatherdd %1, [%5 + 4 * m8], m14 ; fetch the dword at palette + 4 * index for every lane
+        vpgatherdd %2, [%5 + 4 * m9], m15
+        pcmpeqb m14, m14        ; masks must be set again: vpgatherdd zeroes its mask operand
+        pcmpeqb m15, m15
+        vpgatherdd %3, [%5 + 4 * m10], m14
+        vpgatherdd %4, [%5 + 4 * m11], m15
+        pshufb %1, m12          ; byte shuffle by m12 (read8_unpack4, loaded by READ_PALETTE); presumably groups bytes by component -- table not visible here
+        pshufb %2, m12
+        pshufb %3, m12
+        pshufb %4, m12
+        punpckldq m8,  %1, %2   ; dword/qword interleaves below form the 4x4 transpose, again via scratch m8-m11
+        punpckldq m9,  %3, %4
+        punpckhdq m10, %1, %2
+        punpckhdq m11, %3, %4
+        punpcklqdq %1, m8, m9   ; final results are written back to the four output registers
+        punpckhqdq %2, m8, m9
+        punpcklqdq %3, m10, m11
+        punpckhqdq %4, m10, m11
+%else ; !cpuflag(avx2)
+    %assign i 0
+    %rep 16
+        movzx tmp1d, byte [%6 + i]          ; scalar fallback, unrolled 16x: load index byte i (clobbers tmp1)
+        pinsrb %1, [%5 + 4 * tmp1q + 0], i  ; byte 0 of the palette entry -> byte lane i of x
+        pinsrb %2, [%5 + 4 * tmp1q + 1], i  ; byte 1 -> y
+        pinsrb %3, [%5 + 4 * tmp1q + 2], i  ; byte 2 -> z
+        pinsrb %4, [%5 + 4 * tmp1q + 3], i  ; byte 3 -> w
+    %assign i i+1
+    %endrep
+%endif
+%endmacro
+
+%macro READ_PALETTE 0 ; palette read: index bytes at in0q, palette at in1q, results in mx/my/mz/mw
+assert COMPS == 4
+assert BITS  == 8
+    %if cpuflag(avx2)
+        VBROADCASTI128 m12, [read8_unpack4] ; pshufb control used by the AVX2 path of read_pal8
+    %endif
+        LOAD_CONT tmp0q         ; helper defined elsewhere; presumably loads the next op's address into tmp0q
+        read_pal8 mx,  my,  mz,  mw,  in1q, in0q ; in1q = palette, in0q = index bytes
+IF1 V2, read_pal8 mx2, my2, mz2, mw2, in1q, in0q + mmsize
+        add in0q, BLOCK_SIZE    ; step the index pointer past the bytes consumed by this block
+        CONTINUE tmp0q          ; helper defined elsewhere; presumably transfers control to the address in tmp0q
+%endmacro
+
 %macro write_packed2 0
     %if cpuflag(avx2)
         vpermq mx, mx, q3120       ; { X0 X2 | X1 X3 }
@@ -716,6 +772,7 @@ assert 0, SWS_UOP_DITHER is not implemented for integer types
     DECL_%1_READ_PACKED     (READ_PACKED)
     DECL_%1_READ_NIBBLE     (READ_NIBBLE)
     DECL_%1_READ_BIT        (READ_BIT)
+    DECL_%1_READ_PALETTE    (READ_PALETTE)
     DECL_%1_WRITE_PACKED    (WRITE_PACKED)
     DECL_%1_WRITE_NIBBLE    (WRITE_NIBBLE)
     DECL_%1_WRITE_BIT       (WRITE_BIT)
diff --git a/libswscale/x86/uops_macros.asm.h b/libswscale/x86/uops_macros.asm.h
index d9565d12f2..fce08f320f 100644
--- a/libswscale/x86/uops_macros.asm.h
+++ b/libswscale/x86/uops_macros.asm.h
@@ -57,6 +57,7 @@
     {DEF_MACRO(READ_PLANAR_FH,      TYPE)}, \
     {DEF_MACRO(READ_PLANAR_FV,      TYPE)}, \
     {DEF_MACRO(READ_PLANAR_FV_FMA,  TYPE)}, \
+    {DEF_MACRO(READ_PALETTE,        TYPE)}, \
     {DEF_MACRO(WRITE_BIT,           TYPE)}, \
     {DEF_MACRO(WRITE_NIBBLE,        TYPE)}, \
     {DEF_MACRO(WRITE_PACKED,        TYPE)}, \

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to