On Fri, Aug 30, 2024 at 6:49 AM liuhongt <hongtao....@intel.com> wrote: > > > Can the above loop be a part of ix86_check_avx_upper_register, so this > > function would scan the full RTX for avx upper register? > Changed; also adjusted ix86_check_avx_upper_stores and ix86_avx_u128_mode_needed > to either inline the old ix86_check_avx_upper_register or replace > FOR_EACH_SUBRTX > with the new ix86_check_avx_upper_register. > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > Ok for trunk and backport? > > For function arguments/return, when the mode is BLK, the value is put in a > parallel with an expr_list, and the expr_list contains the real mode > and registers. > The current ix86_check_avx_upper_register only checks for SSE_REG_P, and > fails to handle that. The patch extends the check to each subrtx. > > gcc/ChangeLog: > > PR target/116512 > * config/i386/i386.cc (ix86_check_avx_upper_register): Iterate > subrtx to scan for avx upper register. > (ix86_check_avx_upper_stores): Inline old > ix86_check_avx_upper_register. > (ix86_avx_u128_mode_needed): Ditto, and replace > FOR_EACH_SUBRTX with call to new > ix86_check_avx_upper_register. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/pr116512.c: New test.
OK for all branches. Perhaps we could put the repeated condition in a macro, but this could be an eventual follow-up patch. Thanks, Uros. > --- > gcc/config/i386/i386.cc | 36 +++++++++++++++--------- > gcc/testsuite/gcc.target/i386/pr116512.c | 26 +++++++++++++++++ > 2 files changed, 49 insertions(+), 13 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr116512.c > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc > index 224a78cc832..c40cee5b885 100644 > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -14881,9 +14881,19 @@ ix86_dirflag_mode_needed (rtx_insn *insn) > static bool > ix86_check_avx_upper_register (const_rtx exp) > { > - return (SSE_REG_P (exp) > - && !EXT_REX_SSE_REG_P (exp) > - && GET_MODE_BITSIZE (GET_MODE (exp)) > 128); > + /* construct_container may return a parallel with expr_list > + which contains the real reg and mode */ > + subrtx_iterator::array_type array; > + FOR_EACH_SUBRTX (iter, array, exp, NONCONST) > + { > + const_rtx x = *iter; > + if (SSE_REG_P (x) > + && !EXT_REX_SSE_REG_P (x) > + && GET_MODE_BITSIZE (GET_MODE (x)) > 128) > + return true; > + } > + > + return false; > } > > /* Check if a 256bit or 512bit AVX register is referenced in stores. 
*/ > @@ -14891,7 +14901,9 @@ ix86_check_avx_upper_register (const_rtx exp) > static void > ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data) > { > - if (ix86_check_avx_upper_register (dest)) > + if (SSE_REG_P (dest) > + && !EXT_REX_SSE_REG_P (dest) > + && GET_MODE_BITSIZE (GET_MODE (dest)) > 128) > { > bool *used = (bool *) data; > *used = true; > @@ -14950,14 +14962,14 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) > return AVX_U128_CLEAN; > } > > - subrtx_iterator::array_type array; > - > rtx set = single_set (insn); > if (set) > { > rtx dest = SET_DEST (set); > rtx src = SET_SRC (set); > - if (ix86_check_avx_upper_register (dest)) > + if (SSE_REG_P (dest) > + && !EXT_REX_SSE_REG_P (dest) > + && GET_MODE_BITSIZE (GET_MODE (dest)) > 128) > { > /* This is an YMM/ZMM load. Return AVX_U128_DIRTY if the > source isn't zero. */ > @@ -14968,9 +14980,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) > } > else > { > - FOR_EACH_SUBRTX (iter, array, src, NONCONST) > - if (ix86_check_avx_upper_register (*iter)) > - return AVX_U128_DIRTY; > + if (ix86_check_avx_upper_register (src)) > + return AVX_U128_DIRTY; > } > > /* This isn't YMM/ZMM load/store. */ > @@ -14981,9 +14992,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) > Hardware changes state only when a 256bit register is written to, > but we need to prevent the compiler from moving optimal insertion > point above eventual read from 256bit or 512 bit register. 
*/ > - FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST) > - if (ix86_check_avx_upper_register (*iter)) > - return AVX_U128_DIRTY; > + if (ix86_check_avx_upper_register (PATTERN (insn))) > + return AVX_U128_DIRTY; > > return AVX_U128_ANY; > } > diff --git a/gcc/testsuite/gcc.target/i386/pr116512.c > b/gcc/testsuite/gcc.target/i386/pr116512.c > new file mode 100644 > index 00000000000..c2bc6c91b64 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr116512.c > @@ -0,0 +1,26 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64-v4 -O2" } */ > +/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */ > + > +#include <immintrin.h> > + > +struct B { > + union { > + __m512 f; > + __m512i s; > + }; > +}; > + > +struct B foo(int n) { > + struct B res; > + res.s = _mm512_set1_epi32(n); > + > + return res; > +} > + > +__m512i bar(int n) { > + struct B res; > + res.s = _mm512_set1_epi32(n); > + > + return res.s; > +} > -- > 2.31.1 >