PR #20919 opened by brad
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20919
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20919.patch

Co-authored-by: Sebastien Marie <[email protected]>


From c6926060a98f335e069c3e631b96cc4e0b8e758e Mon Sep 17 00:00:00 2001
From: Brad Smith <[email protected]>
Date: Fri, 14 Nov 2025 05:01:17 -0500
Subject: [PATCH] lavu/x86: add Intel CET support

Co-authored-by: Sebastien Marie <[email protected]>
---
 configure                  |  6 ++++++
 libavutil/x86/tx_float.asm | 24 ++++++++++++++++++++++++
 libavutil/x86/x86inc.asm   |  7 +++++++
 3 files changed, 37 insertions(+)

diff --git a/configure b/configure
index 659b428cfc..f1bcfcf729 100755
--- a/configure
+++ b/configure
@@ -2258,6 +2258,7 @@ ARCH_EXT_LIST_X86_SIMD="
     avx2
     avx512
     avx512icl
+    cet
     fma3
     fma4
     mmx
@@ -6639,6 +6640,11 @@ EOF
     enabled ssse3  && check_inline_asm ssse3_inline  '"pabsw %xmm0, %xmm0"'
     enabled mmxext && check_inline_asm mmxext_inline '"pmaxub %mm0, %mm1"'
 
+    # check whether Intel CET is in use
+    if enabled x86_64; then
+        check_cpp_condition cet "stddef.h" "defined(__CET__)"
+    fi
+
     probe_x86asm(){
         x86asmexe_probe=$1
         if test_cmd $x86asmexe_probe -v; then
diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm
index c030147ce8..7f31b05c60 100644
--- a/libavutil/x86/tx_float.asm
+++ b/libavutil/x86/tx_float.asm
@@ -135,6 +135,7 @@ SECTION .text
 ; %1 - coefficients (r0.reim, r1.reim)
 ; %2 - temporary
 %macro FFT2 2
+    _CET_ENDBR
     shufps   %2, %1, %1, q3322
     shufps   %1, %1, %1, q1100
 
@@ -148,6 +149,7 @@ SECTION .text
 ; %2 - odd coefficients  (r1.reim, r3.reim, r5.reim, r7.reim)
 ; %3 - temporary
 %macro FFT4 3
+    _CET_ENDBR
     subps  %3, %1, %2         ;  r1234, [r5678]
     addps  %1, %1, %2         ;  t1234, [t5678]
 
@@ -171,6 +173,7 @@ SECTION .text
 ; %5 - temporary
 ; %6 - temporary
 %macro FFT8 6
+    _CET_ENDBR
     addps    %5, %1, %3               ; q1-8
     addps    %6, %2, %4               ; k1-8
 
@@ -212,6 +215,7 @@ SECTION .text
 ; %3 - temporary
 ; %4 - temporary
 %macro FFT8_AVX 4
+    _CET_ENDBR
     subps      %3, %1, %2               ;  r1234, r5678
     addps      %1, %1, %2               ;  q1234, q5678
 
@@ -251,6 +255,7 @@ SECTION .text
 ; %5, %6 - temporary
 ; %7, %8 - temporary (optional)
 %macro FFT16 6-8
+    _CET_ENDBR
     FFT4       %3, %4, %5
 %if %0 > 7
     FFT8_AVX   %1, %2, %6, %7
@@ -320,6 +325,7 @@ SECTION .text
 ; xm14 - out[0]
 ; xm15 - out[10, 5]
 %macro FFT15 0
+    _CET_ENDBR
     shufps xm1, xm0, xm0, q3223      ; in[1].imrereim
     shufps xm0, xm0, xm0, q1001      ; in[0].imrereim
 
@@ -438,6 +444,7 @@ SECTION .text
 ; Output is slightly permuted such that tx2,3's coefficients are interleaved
 ; on a 2-point basis (look at `doc/transforms.md`)
 %macro SPLIT_RADIX_COMBINE 17
+    _CET_ENDBR
 %if %1 && mmsize == 32
     vperm2f128 %14, %6, %7, 0x20     ; m2[0], m2[1], m3[0], m3[1] even
     vperm2f128 %16, %9, %8, 0x20     ; m2[0], m2[1], m3[0], m3[1] odd
@@ -517,6 +524,7 @@ SECTION .text
 ; however, if the twiddles aren't needed after this, the registers they use
 ; can be used as any of the temporary registers.
 %macro SPLIT_RADIX_COMBINE_HALF 10
+    _CET_ENDBR
 %if %1
     shufps     %8, %6, %6, q2200     ; cos00224466
     shufps     %9, %7, %7, q1133     ; wim77553311
@@ -559,6 +567,7 @@ SECTION .text
 
 ; Same as above, tries REALLY hard to use 2 temporary registers.
 %macro SPLIT_RADIX_COMBINE_LITE 9
+    _CET_ENDBR
 %if %1
     shufps     %8, %6, %6, q2200        ; cos00224466
     shufps     %9, %7, %7, q1133        ; wim77553311
@@ -607,6 +616,7 @@ SECTION .text
 %endmacro
 
 %macro SPLIT_RADIX_COMBINE_64 0
+    _CET_ENDBR
     SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2
 
     movaps [outq +  0*mmsize], m0
@@ -648,6 +658,7 @@ SECTION .text
 ; combine loop
 ; %1 must contain len*2, %2 must contain len*4, %3 must contain len*6
 %macro SPLIT_RADIX_LOAD_COMBINE_4 8
+    _CET_ENDBR
     movaps m8,         [rtabq + (%5)*mmsize + %7]
     vperm2f128 m9, m9, [itabq - (%5)*mmsize + %8], 0x23
 
@@ -693,6 +704,7 @@ SECTION .text
 %else
 %define offset_i 0
 %endif
+    _CET_ENDBR
 
     SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 0, 0, offset_c, offset_r, offset_i
     SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 1, 1, offset_c, offset_r, offset_i
@@ -705,6 +717,7 @@ SECTION .text
 ; a full combine+deinterleave loop
 ; %3 must contain len*2, %4 must contain len*4, %5 must contain len*6
 %macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 6
+    _CET_ENDBR
     movaps m8,         [rtabq + (0 + %2)*mmsize]
     vperm2f128 m9, m9, [itabq - (0 + %2)*mmsize], 0x23
 
@@ -807,6 +820,7 @@ SECTION .text
 %else
 %define offset 0
 %endif
+    _CET_ENDBR
     SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 0, 0, %1, %1*2, %2, offset
     SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 4, 2, %1, %1*2, %2, offset
 %endmacro
@@ -826,6 +840,7 @@ cglobal fft2_float, 4, 4, 2, ctx, out, in, stride
 
 %macro FFT4_FN 3
 INIT_XMM sse2
+    _CET_ENDBR
 %if %3
 cglobal fft4_ %+ %1 %+ _asm_float, 0, 0, 0, ctx, out, in, stride
 %else
@@ -862,6 +877,7 @@ FFT4_FN inv, 1, 1
 
 %macro FFT8_SSE_FN 1
 INIT_XMM sse3
+    _CET_ENDBR
 %if %1
 cglobal fft8_asm_float, 0, 0, 0, ctx, out, in, stride, tmp
     movaps m0, [inq + 0*mmsize]
@@ -907,6 +923,7 @@ FFT8_SSE_FN 1
 
 %macro FFT8_AVX_FN 1
 INIT_YMM avx
+    _CET_ENDBR
 %if %1
 cglobal fft8_asm_float, 0, 0, 0, ctx, out, in, stride, tmp
     movaps m0, [inq + 0*mmsize]
@@ -947,6 +964,7 @@ FFT8_AVX_FN 1
 
 %macro FFT16_FN 2
 INIT_YMM %1
+    _CET_ENDBR
 %if %2
 cglobal fft16_asm_float, 0, 0, 0, ctx, out, in, stride, tmp
     movaps m0, [inq + 0*mmsize]
@@ -998,6 +1016,7 @@ FFT16_FN fma3, 1
 
 %macro FFT32_FN 2
 INIT_YMM %1
+    _CET_ENDBR
 %if %2
 cglobal fft32_asm_float, 0, 0, 0, ctx, out, in, stride, tmp
     movaps m4, [inq + 4*mmsize]
@@ -1084,6 +1103,7 @@ FFT32_FN fma3, 1
 
 %macro FFT_SPLIT_RADIX_DEF 1-2
 ALIGN 16
+    _CET_ENDBR
 .%1 %+ pt:
     PUSH lenq
     mov lenq, (%1/4)
@@ -1122,6 +1142,7 @@ ALIGN 16
 
 %macro FFT_SPLIT_RADIX_FN 2
 INIT_YMM %1
+    _CET_ENDBR
 %if %2
 cglobal fft_sr_asm_float, 0, 0, 0, ctx, out, in, stride, len, lut, itab, rtab, tgt, tmp
 %else
@@ -1523,6 +1544,7 @@ FFT_SPLIT_RADIX_FN avx2, 1
 
 %macro FFT15_FN 2
 INIT_YMM avx2
+    _CET_ENDBR
 cglobal fft15_ %+ %2, 4, 10, 16, ctx, out, in, stride, len, lut, tmp, tgt5, stride3, stride5
     mov lutq, [ctxq + AVTXContext.map]
 
@@ -1586,6 +1608,7 @@ FFT15_FN 1, ns_float
 
 %macro IMDCT_FN 1
 INIT_YMM %1
+    _CET_ENDBR
 cglobal mdct_inv_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, exp, t1, t2, t3, \
                                         t4, t5, btmp
     movsxd lenq, dword [ctxq + AVTXContext.len]
@@ -1773,6 +1796,7 @@ IMDCT_FN avx2
 
 %macro PFA_15_FN 2
 INIT_YMM %1
+    _CET_ENDBR
 %if %2
 cglobal fft_pfa_15xM_asm_float, 0, 0, 0, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \
                                          tgt5, stride3, stride5, btmp
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index e61d924bc1..069a11433e 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -46,6 +46,12 @@
     %endif
 %endif
 
+%if HAVE_CET
+    %define _CET_ENDBR endbr64
+%else
+    %define _CET_ENDBR
+%endif
+
 %define WIN64  0
 %define UNIX64 0
 %if ARCH_X86_64
@@ -849,6 +855,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     %endif
     align function_align
     %2:
+    _CET_ENDBR
     RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
     %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
     %assign stack_offset 0      ; stack pointer offset relative to the return address
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to