Hi, With -mapxf enabled, a call to _mm256_broadcastsi128_si256 may produce an illegal vbroadcasti128 whose memory operand uses an EGPR under high register pressure, because the VEX-encoded form cannot address the extended registers.
See https://godbolt.org/z/jbWTPfn1f. Restrict the pattern to use the "jm" constraint and the gpr16 addr attribute for the AVX2 alternative. Bootstrapped & regtested on x86_64-pc-linux-gnu. OK for trunk and for backporting down to GCC 14? gcc/ChangeLog: * config/i386/sse.md (avx2_vbroadcasti128_<mode>): Constrain alternative 0 with jm and add gpr16 attr to avoid egpr usage. gcc/testsuite/ChangeLog: * gcc.target/i386/apx-broadcast.c: New test. --- gcc/config/i386/sse.md | 5 ++-- gcc/testsuite/gcc.target/i386/apx-broadcast.c | 24 +++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/apx-broadcast.c diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 7bab6eb3b97..4bfbd3c59a9 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -28332,17 +28332,18 @@ (define_insn "*vec_dupv2di" (define_insn "avx2_vbroadcasti128_<mode>" [(set (match_operand:VI_256 0 "register_operand" "=x,v,v") (vec_concat:VI_256 - (match_operand:<ssehalfvecmode> 1 "memory_operand" "m,m,m") + (match_operand:<ssehalfvecmode> 1 "memory_operand" "jm,m,m") (match_dup 1)))] "TARGET_AVX2" "@ vbroadcasti128\t{%1, %0|%0, %1} vbroadcast<i128vldq>\t{%1, %0|%0, %1} vbroadcast<shuffletype>32x4\t{%1, %0|%0, %1}" - [(set_attr "isa" "*,avx512dq,avx512vl") + [(set_attr "isa" "noavx512vl,avx512dq,avx512vl") (set_attr "type" "ssemov") (set_attr "prefix_extra" "1") (set_attr "prefix" "vex,evex,evex") + (set_attr "addr" "gpr16,*,*") (set_attr "mode" "OI")]) ;; optimize vlddqu + vinserti128 to vbroadcasti128, the former will use diff --git a/gcc/testsuite/gcc.target/i386/apx-broadcast.c b/gcc/testsuite/gcc.target/i386/apx-broadcast.c new file mode 100644 index 00000000000..7678dcd4550 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/apx-broadcast.c @@ -0,0 +1,24 @@ +/* { dg-do assemble { target { apxf && { ! 
ia32 } } } } */ +/* { dg-options "-mavx512vl -mapxf -O2" } */ + +#include <stdint.h> +#include <immintrin.h> + +void broadcast_avx2(int *sx, __m256i *coeff, __m128i *temp) { + + __m256i semp[8]; + + + for (int i = 0; i < 8; i++) + { + asm volatile ("" : : : "r8", "r9", "r10", "r11", "r12", "r13", + "r14", "r15", "rax", "rcx", "rsi", "rdi", "rdx"); + register volatile uint64_t sm asm ("%r16") = i; + semp[i] = _mm256_broadcastsi128_si256(temp[sm]); + } + + coeff[0] = _mm256_unpacklo_epi64(semp[0], semp[1]); + coeff[1] = _mm256_unpackhi_epi64(semp[2], semp[3]); + coeff[2] = _mm256_unpacklo_epi64(semp[4], semp[5]); + coeff[3] = _mm256_unpackhi_epi64(semp[6], semp[7]); +} -- 2.31.1
