On Mon, Jan 7, 2019 at 6:40 PM H.J. Lu <hongjiu...@intel.com> wrote:
>
> There is no need to generate vzeroupper if caller uses upper bits of
> AVX/AVX512 registers,  We track caller's avx_u128_state and avoid
> vzeroupper when caller's avx_u128_state is AVX_U128_DIRTY.
>
> Tested on i686 and x86-64 with and without --with-arch=native.
>
> OK for trunk?

In principle OK, but I think we don't have to cache the result of
ix86_avx_u128_mode_entry. Simply call the function from
ix86_avx_u128_mode_exit; it is a simple function, so I guess we can
afford to re-call it one more time per function.

Uros.

> Thanks.
>
> H.J.
> ---
> gcc/
>
>         PR target/88717
>         * config/i386/i386.c (ix86_avx_u128_mode_entry): Set
>         caller_avx_u128_dirty to true when caller is AVX_U128_DIRTY.
>         (ix86_avx_u128_mode_exit): Set exit mode to AVX_U128_DIRTY if
>         caller is AVX_U128_DIRTY.
>         * config/i386/i386.h (machine_function): Add
>         caller_avx_u128_dirty.
>
> gcc/testsuite/
>
>         PR target/88717
>         * gcc.target/i386/pr88717.c: New test.
> ---
>  gcc/config/i386/i386.c                  | 10 +++++++++-
>  gcc/config/i386/i386.h                  |  3 +++
>  gcc/testsuite/gcc.target/i386/pr88717.c | 24 ++++++++++++++++++++++++
>  3 files changed, 36 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr88717.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index d01278d866f..9b49a2c1d9c 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -19100,7 +19100,11 @@ ix86_avx_u128_mode_entry (void)
>        rtx incoming = DECL_INCOMING_RTL (arg);
>
>        if (incoming && ix86_check_avx_upper_register (incoming))
> -       return AVX_U128_DIRTY;
> +       {
> +         /* Caller is AVX_U128_DIRTY.  */
> +         cfun->machine->caller_avx_u128_dirty = true;
> +         return AVX_U128_DIRTY;
> +       }
>      }
>
>    return AVX_U128_CLEAN;
> @@ -19130,6 +19134,10 @@ ix86_mode_entry (int entity)
>  static int
>  ix86_avx_u128_mode_exit (void)
>  {
> +  /* Exit mode is set to AVX_U128_DIRTY if caller is AVX_U128_DIRTY.  */
> +  if (cfun->machine->caller_avx_u128_dirty)
> +    return AVX_U128_DIRTY;
> +
>    rtx reg = crtl->return_rtx;
>
>    /* Exit mode is set to AVX_U128_DIRTY if there are 256bit
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index 83b025e0cf5..c053b657a55 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -2747,6 +2747,9 @@ struct GTY(()) machine_function {
>    /* If true, ENDBR is queued at function entrance.  */
>    BOOL_BITFIELD endbr_queued_at_entrance : 1;
>
> +  /* If true, caller is AVX_U128_DIRTY.  */
> +  BOOL_BITFIELD caller_avx_u128_dirty : 1;
> +
>    /* The largest alignment, in bytes, of stack slot actually used.  */
>    unsigned int max_used_stack_alignment;
>
> diff --git a/gcc/testsuite/gcc.target/i386/pr88717.c 
> b/gcc/testsuite/gcc.target/i386/pr88717.c
> new file mode 100644
> index 00000000000..01680998f1b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr88717.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512f -mvzeroupper" } */
> +
> +#include <immintrin.h>
> +
> +__m128
> +foo1 (__m256 x)
> +{
> +  return _mm256_castps256_ps128 (x);
> +}
> +
> +void
> +foo2 (float *p, __m256 x)
> +{
> +  *p = ((__v8sf)x)[0];
> +}
> +
> +void
> +foo3 (float *p, __m512 x)
> +{
> +  *p = ((__v16sf)x)[0];
> +}
> +
> +/* { dg-final { scan-assembler-not "vzeroupper" } } */
> --
> 2.20.1
>

Reply via email to