PR #20919 opened by brad
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20919
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20919.patch
Co-authored-by: Sebastien Marie <[email protected]>

From c6926060a98f335e069c3e631b96cc4e0b8e758e Mon Sep 17 00:00:00 2001
From: Brad Smith <[email protected]>
Date: Fri, 14 Nov 2025 05:01:17 -0500
Subject: [PATCH] lavu/x86: add Intel CET support

Co-authored-by: Sebastien Marie <[email protected]>
---
 configure                  |  6 ++++++
 libavutil/x86/tx_float.asm | 24 ++++++++++++++++++++++++
 libavutil/x86/x86inc.asm   |  7 +++++++
 3 files changed, 37 insertions(+)

diff --git a/configure b/configure
index 659b428cfc..f1bcfcf729 100755
--- a/configure
+++ b/configure
@@ -2258,6 +2258,7 @@ ARCH_EXT_LIST_X86_SIMD="
     avx2
     avx512
     avx512icl
+    cet
     fma3
     fma4
     mmx
@@ -6639,6 +6640,11 @@ EOF
 enabled ssse3  && check_inline_asm ssse3_inline  '"pabsw %xmm0, %xmm0"'
 enabled mmxext && check_inline_asm mmxext_inline '"pmaxub %mm0, %mm1"'
 
+# check whether Intel CET is in use
+if enabled x86_64; then
+    check_cpp_condition cet "stddef.h" "defined(__CET__)"
+fi
+
 probe_x86asm(){
     x86asmexe_probe=$1
     if test_cmd $x86asmexe_probe -v; then
diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm
index c030147ce8..7f31b05c60 100644
--- a/libavutil/x86/tx_float.asm
+++ b/libavutil/x86/tx_float.asm
@@ -135,6 +135,7 @@ SECTION .text
 ; %1 - coefficients (r0.reim, r1.reim)
 ; %2 - temporary
 %macro FFT2 2
+    _CET_ENDBR
     shufps %2, %1, %1, q3322
     shufps %1, %1, %1, q1100
 
@@ -148,6 +149,7 @@ SECTION .text
 ; %2 - odd coefficients (r1.reim, r3.reim, r5.reim, r7.reim)
 ; %3 - temporary
 %macro FFT4 3
+    _CET_ENDBR
     subps  %3, %1, %2 ; r1234, [r5678]
     addps  %1, %1, %2 ; t1234, [t5678]
 
@@ -171,6 +173,7 @@ SECTION .text
 ; %5 - temporary
 ; %6 - temporary
 %macro FFT8 6
+    _CET_ENDBR
     addps  %5, %1, %3 ; q1-8
     addps  %6, %2, %4 ; k1-8
 
@@ -212,6 +215,7 @@ SECTION .text
 ; %3 - temporary
 ; %4 - temporary
 %macro FFT8_AVX 4
+    _CET_ENDBR
     subps  %3, %1, %2 ; r1234, r5678
     addps  %1, %1, %2 ; q1234, q5678
 
@@ -251,6 +255,7 @@ SECTION .text
 ; %5, %6 - temporary
 ; %7, %8 - temporary (optional)
 %macro FFT16 6-8
+    _CET_ENDBR
     FFT4 %3, %4, %5
 %if %0 > 7
     FFT8_AVX %1, %2, %6, %7
@@ -320,6 +325,7 @@ SECTION .text
 ; xm14 - out[0]
 ; xm15 - out[10, 5]
 %macro FFT15 0
+    _CET_ENDBR
     shufps xm1, xm0, xm0, q3223 ; in[1].imrereim
     shufps xm0, xm0, xm0, q1001 ; in[0].imrereim
 
@@ -438,6 +444,7 @@ SECTION .text
 ; Output is slightly permuted such that tx2,3's coefficients are interleaved
 ; on a 2-point basis (look at `doc/transforms.md`)
 %macro SPLIT_RADIX_COMBINE 17
+    _CET_ENDBR
 %if %1 && mmsize == 32
     vperm2f128 %14, %6, %7, 0x20 ; m2[0], m2[1], m3[0], m3[1] even
     vperm2f128 %16, %9, %8, 0x20 ; m2[0], m2[1], m3[0], m3[1] odd
 
@@ -517,6 +524,7 @@ SECTION .text
 ; however, if the twiddles aren't needed after this, the registers they use
 ; can be used as any of the temporary registers.
 %macro SPLIT_RADIX_COMBINE_HALF 10
+    _CET_ENDBR
 %if %1
     shufps %8, %6, %6, q2200 ; cos00224466
     shufps %9, %7, %7, q1133 ; wim77553311
@@ -559,6 +567,7 @@ SECTION .text
 
 ; Same as above, tries REALLY hard to use 2 temporary registers.
 %macro SPLIT_RADIX_COMBINE_LITE 9
+    _CET_ENDBR
 %if %1
     shufps %8, %6, %6, q2200 ; cos00224466
     shufps %9, %7, %7, q1133 ; wim77553311
@@ -607,6 +616,7 @@ SECTION .text
 %endmacro
 
 %macro SPLIT_RADIX_COMBINE_64 0
+    _CET_ENDBR
     SPLIT_RADIX_COMBINE_LITE 1, m0, m1, tx1_e0, tx2_e0, tw_e, tw_o, tmp1, tmp2
 
     movaps [outq + 0*mmsize], m0
@@ -648,6 +658,7 @@ SECTION .text
 ; combine loop
 ; %1 must contain len*2, %2 must contain len*4, %3 must contain len*6
 %macro SPLIT_RADIX_LOAD_COMBINE_4 8
+    _CET_ENDBR
     movaps m8,         [rtabq + (%5)*mmsize + %7]
     vperm2f128 m9, m9, [itabq - (%5)*mmsize + %8], 0x23
 
@@ -693,6 +704,7 @@ SECTION .text
 %else
 %define offset_i 0
 %endif
+    _CET_ENDBR
     SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 0, 0, offset_c, offset_r, offset_i
     SPLIT_RADIX_LOAD_COMBINE_4 %1, 2*%1, %2, 1, 1, offset_c, offset_r, offset_i
 
@@ -705,6 +717,7 @@ SECTION .text
 ; a full combine+deinterleave loop
 ; %3 must contain len*2, %4 must contain len*4, %5 must contain len*6
 %macro SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 6
+    _CET_ENDBR
     movaps m8,         [rtabq + (0 + %2)*mmsize]
     vperm2f128 m9, m9, [itabq - (0 + %2)*mmsize], 0x23
 
@@ -807,6 +820,7 @@ SECTION .text
 %else
 %define offset 0
 %endif
+    _CET_ENDBR
     SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 0, 0, %1, %1*2, %2, offset
     SPLIT_RADIX_COMBINE_DEINTERLEAVE_2 4, 2, %1, %1*2, %2, offset
 %endmacro
@@ -826,6 +840,7 @@ cglobal fft2_float, 4, 4, 2, ctx, out, in, stride
 
 %macro FFT4_FN 3
 INIT_XMM sse2
+    _CET_ENDBR
 %if %3
 cglobal fft4_ %+ %1 %+ _asm_float, 0, 0, 0, ctx, out, in, stride
 %else
@@ -862,6 +877,7 @@ FFT4_FN inv, 1, 1
 
 %macro FFT8_SSE_FN 1
 INIT_XMM sse3
+    _CET_ENDBR
 %if %1
 cglobal fft8_asm_float, 0, 0, 0, ctx, out, in, stride, tmp
     movaps m0, [inq + 0*mmsize]
@@ -907,6 +923,7 @@ FFT8_SSE_FN 1
 
 %macro FFT8_AVX_FN 1
 INIT_YMM avx
+    _CET_ENDBR
 %if %1
 cglobal fft8_asm_float, 0, 0, 0, ctx, out, in, stride, tmp
     movaps m0, [inq + 0*mmsize]
@@ -947,6 +964,7 @@ FFT8_AVX_FN 1
 
 %macro FFT16_FN 2
 INIT_YMM %1
+    _CET_ENDBR
 %if %2
 cglobal fft16_asm_float, 0, 0, 0, ctx, out, in, stride, tmp
     movaps m0, [inq + 0*mmsize]
@@ -998,6 +1016,7 @@ FFT16_FN fma3, 1
 
 %macro FFT32_FN 2
 INIT_YMM %1
+    _CET_ENDBR
 %if %2
 cglobal fft32_asm_float, 0, 0, 0, ctx, out, in, stride, tmp
     movaps m4, [inq + 4*mmsize]
@@ -1084,6 +1103,7 @@ FFT32_FN fma3, 1
 
 %macro FFT_SPLIT_RADIX_DEF 1-2
 ALIGN 16
+    _CET_ENDBR
 .%1 %+ pt:
     PUSH lenq
     mov lenq, (%1/4)
@@ -1122,6 +1142,7 @@ ALIGN 16
 
 %macro FFT_SPLIT_RADIX_FN 2
 INIT_YMM %1
+    _CET_ENDBR
 %if %2
 cglobal fft_sr_asm_float, 0, 0, 0, ctx, out, in, stride, len, lut, itab, rtab, tgt, tmp
 %else
@@ -1523,6 +1544,7 @@ FFT_SPLIT_RADIX_FN avx2, 1
 
 %macro FFT15_FN 2
 INIT_YMM avx2
+    _CET_ENDBR
 cglobal fft15_ %+ %2, 4, 10, 16, ctx, out, in, stride, len, lut, tmp, tgt5, stride3, stride5
     mov lutq, [ctxq + AVTXContext.map]
 
@@ -1586,6 +1608,7 @@ FFT15_FN 1, ns_float
 
 %macro IMDCT_FN 1
 INIT_YMM %1
+    _CET_ENDBR
 cglobal mdct_inv_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, exp, t1, t2, t3, \
                                         t4, t5, btmp
     movsxd lenq, dword [ctxq + AVTXContext.len]
@@ -1773,6 +1796,7 @@ IMDCT_FN avx2
 
 %macro PFA_15_FN 2
 INIT_YMM %1
+    _CET_ENDBR
 %if %2
 cglobal fft_pfa_15xM_asm_float, 0, 0, 0, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \
                                 tgt5, stride3, stride5, btmp
diff --git a/libavutil/x86/x86inc.asm b/libavutil/x86/x86inc.asm
index e61d924bc1..069a11433e 100644
--- a/libavutil/x86/x86inc.asm
+++ b/libavutil/x86/x86inc.asm
@@ -46,6 +46,12 @@
     %endif
 %endif
 
+%if HAVE_CET
+    %define _CET_ENDBR endbr64
+%else
+    %define _CET_ENDBR
+%endif
+
 %define WIN64 0
 %define UNIX64 0
 %if ARCH_X86_64
@@ -849,6 +855,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
     %endif
     align function_align
     %2:
+        _CET_ENDBR
         RESET_MM_PERMUTATION        ; needed for x86-64, also makes disassembly somewhat nicer
         %xdefine rstk rsp           ; copy of the original stack pointer, used when greater alignment than the known stack alignment is required
         %assign stack_offset 0      ; stack pointer offset relative to the return address
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]
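
A note on the configure probe in the patch: GCC and Clang predefine the __CET__ macro when building
with -fcf-protection, which is what check_cpp_condition cet "stddef.h" "defined(__CET__)" tests and
what ultimately gates the _CET_ENDBR/endbr64 expansion in x86inc.asm. The standalone C sketch below
only illustrates that compiler behaviour; the file name and printed messages are hypothetical and
not part of the patch.

    /* cet_probe.c - illustrative only: shows the __CET__ macro the configure
     * check relies on. Build with and without -fcf-protection to compare. */
    #include <stdio.h>

    int main(void)
    {
    #if defined(__CET__)
        /* GCC/Clang define __CET__ as a bitmask: bit 0 = indirect-branch
         * tracking (the endbr64 landing pads), bit 1 = shadow stack. */
        printf("__CET__ = %d: compiler emits CET instrumentation\n", __CET__);
    #else
        printf("__CET__ not defined: no CET instrumentation\n");
    #endif
        return 0;
    }

Built as, say, cc -fcf-protection=full cet_probe.c, the macro is typically reported as 3 (both bits
set); without the flag, or on targets where CET does not apply, the fallback branch prints instead.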
