On Tue, Oct 27, 2020 at 5:10 PM Qing Zhao <[email protected]> wrote:
>
> Uros,
>
> The following is the change compared to version 4 after fix all the issues
> you raised in the previous email.
>
> Let me know if there is any other issue.
LGTM for x86 part, with a couple of small review comments inline.
Thanks,
Uros.
> Thanks.
>
> Qing
>
> ---
> gcc/config/i386/i386.c | 162
> +++++++++++++++++----
> .../gcc.target/i386/zero-scratch-regs-28.c | 16 ++
> .../gcc.target/i386/zero-scratch-regs-29.c | 10 ++
> .../gcc.target/i386/zero-scratch-regs-30.c | 11 ++
> .../gcc.target/i386/zero-scratch-regs-31.c | 16 ++
> 5 files changed, 188 insertions(+), 27 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c
> create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c
> create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratchregs-30.c
> create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-31.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index e66dcf0d587..e6c5001b11e 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -3554,17 +3554,17 @@ ix86_function_value_regno_p (const unsigned int regno)
> /* Check whether the register REGNO should be zeroed on X86.
> When ALL_SSE_ZEROED is true, all SSE registers have been zeroed
> together, no need to zero it again.
> - Stack registers (st0-st7) and mm0-mm7 are aliased with each other.
> - very hard to be zeroed individually, don't zero individual st or
> - mm registgers. */
> + When NEED_ZERO_MMX is true, MMX registers should be cleared. */
>
> static bool
> zero_call_used_regno_p (const unsigned int regno,
> - bool all_sse_zeroed)
> + bool all_sse_zeroed,
> + bool need_zero_mmx)
> {
> return GENERAL_REGNO_P (regno)
> || (!all_sse_zeroed && SSE_REGNO_P (regno))
> - || MASK_REGNO_P (regno);
> + || MASK_REGNO_P (regno)
> + || (need_zero_mmx && MMX_REGNO_P (regno));
> }
>
> /* Return the machine_mode that is used to zero register REGNO. */
> @@ -3579,8 +3579,12 @@ zero_call_used_regno_mode (const unsigned int regno)
> return SImode;
> else if (SSE_REGNO_P (regno))
> return V4SFmode;
> - else
> + else if (MASK_REGNO_P (regno))
> return HImode;
> + else if (MMX_REGNO_P (regno))
> + return V4HImode;
> + else
> + gcc_unreachable ();
> }
>
> /* Generate a rtx to zero all vector registers together if possible,
> @@ -3603,7 +3607,7 @@ zero_all_vector_registers (HARD_REG_SET
> need_zeroed_hardregs)
> return gen_avx_vzeroall ();
> }
>
> -/* Generate insns to zero all st/mm registers together.
> +/* Generate insns to zero all st registers together.
> Return true when zeroing instructions are generated.
> Assume the number of st registers that are zeroed is num_of_st,
> we will emit the following sequence to zero them together:
> @@ -3616,23 +3620,49 @@ zero_all_vector_registers (HARD_REG_SET
> need_zeroed_hardregs)
> ...
> fstp %%st(0);
> i.e., num_of_st fldz followed by num_of_st fstp to clear the stack
> - mark stack slots empty. */
> + mark stack slots empty.
> +
> + How to compute the num_of_st?
> + There is no direct mapping from stack registers to hard register
> + numbers. If one stack register need to be cleared, we don't know
needs
> + where in the stack the value remains. So, if any stack register
> + need to be cleared, the whole stack should be cleared. However,
needs
> + x87 stack registers that hold the return value should be excluded.
> + x87 returns in the top (two for complex values) register, so
> + num_of_st should be 7/6 when x87 returns, otherwise it will be 8. */
> +
>
> static bool
> -zero_all_st_mm_registers (HARD_REG_SET need_zeroed_hardregs)
> +zero_all_st_registers (HARD_REG_SET need_zeroed_hardregs)
> {
> unsigned int num_of_st = 0;
> for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
> - if (STACK_REGNO_P (regno)
> - && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno)
> - /* When the corresponding mm register also need to be cleared too. */
> - && TEST_HARD_REG_BIT (need_zeroed_hardregs,
> - (regno - FIRST_STACK_REG + FIRST_MMX_REG)))
> - num_of_st++;
> + if ((STACK_REGNO_P (regno) || MMX_REGNO_P (regno))
> + && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
> + {
> + num_of_st++;
> + break;
> + }
>
> if (num_of_st == 0)
> return false;
>
> + bool return_with_x87 = false;
> + return_with_x87 = (crtl->return_rtx
> + && (STACK_REG_P (crtl->return_rtx)));
> +
> + bool complex_return = false;
> + complex_return = (crtl->return_rtx
> + && COMPLEX_MODE_P (GET_MODE (crtl->return_rtx)));
> +
> + if (return_with_x87)
> + if (complex_return)
> + num_of_st = 6;
> + else
> + num_of_st = 7;
> + else
> + num_of_st = 8;
> +
> rtx st_reg = gen_rtx_REG (XFmode, FIRST_STACK_REG);
> for (unsigned int i = 0; i < num_of_st; i++)
> emit_insn (gen_rtx_SET (st_reg, CONST0_RTX (XFmode)));
> @@ -3646,6 +3676,43 @@ zero_all_st_mm_registers (HARD_REG_SET
> need_zeroed_hardregs)
> return true;
> }
>
> +
> +/* When the routine exit with MMX mode, if there is any ST registers
... exits in MMX mode, if any ST register needs ...
> + need to be zeroed, we should clear all MMX registers except the
> + one that holds the return value RET_MMX_REGNO. */
... except the RET_MMX_REGNO that holds the return value.
> +static bool
> +zero_all_mm_registers (HARD_REG_SET need_zeroed_hardregs,
> + unsigned int ret_mmx_regno)
> +{
> + bool need_zero_all_mm = false;
> + for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
> + if (STACK_REGNO_P (regno)
> + && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
> + {
> + need_zero_all_mm = true;
> + break;
> + }
> +
> + if (!need_zero_all_mm)
> + return false;
> +
> + rtx zero_mmx = NULL_RTX;
> + machine_mode mode = V4HImode;
> + for (unsigned int regno = FIRST_MMX_REG; regno <= LAST_MMX_REG; regno++)
> + if (regno != ret_mmx_regno)
> + {
> + rtx reg = gen_rtx_REG (mode, regno);
> + if (zero_mmx == NULL_RTX)
> + {
> + zero_mmx = reg;
> + emit_insn (gen_rtx_SET (reg, CONST0_RTX(mode)));
space after CONST0_RTX
> + }
> + else
> + emit_move_insn (reg, zero_mmx);
> + }
> + return true;
> +}
> +
> /* TARGET_ZERO_CALL_USED_REGS. */
> /* Generate a sequence of instructions that zero registers specified by
> NEED_ZEROED_HARDREGS. Return the ZEROED_HARDREGS that are actually
> @@ -3655,7 +3722,10 @@ ix86_zero_call_used_regs (HARD_REG_SET
> need_zeroed_hardregs)
> {
> HARD_REG_SET zeroed_hardregs;
> bool all_sse_zeroed = false;
> - bool st_zeroed = false;
> + bool all_st_zeroed = false;
> + bool all_mm_zeroed = false;
> +
> + CLEAR_HARD_REG_SET (zeroed_hardregs);
>
> /* first, let's see whether we can zero all vector registers together. */
> rtx zero_all_vec_insn = zero_all_vector_registers (need_zeroed_hardregs);
> @@ -3665,38 +3735,67 @@ ix86_zero_call_used_regs (HARD_REG_SET
> need_zeroed_hardregs)
> all_sse_zeroed = true;
> }
>
> - /* then, let's see whether we can zero all st+mm registers togeter. */
> - st_zeroed = zero_all_st_mm_registers (need_zeroed_hardregs);
> + /* mm/st registers are shared registers set, we should follow the following
> + rules to clear them:
> + MMX exit mode x87 exit mode
> + -------------|----------------------|---------------
> + uses x87 reg | clear all MMX | clear all x87
> + uses MMX reg | clear individual MMX | clear all x87
> + x87 + MMX | clear all MMX | clear all x87
>
> - /* Now, generate instructions to zero all the registers. */
> + first, we should decide which mode (MMX mode or x87 mode) the function
> + exit with. */
>
> - CLEAR_HARD_REG_SET (zeroed_hardregs);
> - if (st_zeroed)
> - SET_HARD_REG_BIT (zeroed_hardregs, FIRST_STACK_REG);
> + bool exit_with_mmx_mode = (crtl->return_rtx
> + && (MMX_REG_P (crtl->return_rtx)));
> +
> + if (!exit_with_mmx_mode)
> + /* x87 exit mode, we should zero all st registers together. */
> + {
> + all_st_zeroed = zero_all_st_registers (need_zeroed_hardregs);
> + if (all_st_zeroed)
> + SET_HARD_REG_BIT (zeroed_hardregs, FIRST_STACK_REG);
> + }
> + else
> + /* MMX exit mode, check whether we can zero all mm registers. */
> + {
> + unsigned int exit_mmx_regno = REGNO (crtl->return_rtx);
> + all_mm_zeroed = zero_all_mm_registers (need_zeroed_hardregs,
> + exit_mmx_regno);
> + if (all_mm_zeroed)
> + for (unsigned int regno = FIRST_MMX_REG; regno <= LAST_MMX_REG;
> regno++)
> + if (regno != exit_mmx_regno)
> + SET_HARD_REG_BIT (zeroed_hardregs, regno);
> + }
> +
> + /* Now, generate instructions to zero all the other registers. */
>
> rtx zero_gpr = NULL_RTX;
> rtx zero_vector = NULL_RTX;
> rtx zero_mask = NULL_RTX;
> + rtx zero_mmx = NULL_RTX;
>
> for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
> {
> if (!TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
> continue;
> - if (!zero_call_used_regno_p (regno, all_sse_zeroed))
> + if (!zero_call_used_regno_p (regno, all_sse_zeroed,
> + exit_with_mmx_mode && !all_mm_zeroed))
> continue;
>
> SET_HARD_REG_BIT (zeroed_hardregs, regno);
>
> - rtx reg, tmp;
> + rtx reg, tmp, zero_rtx;
> machine_mode mode = zero_call_used_regno_mode (regno);
>
> reg = gen_rtx_REG (mode, regno);
> + zero_rtx = CONST0_RTX (mode);
>
> if (mode == SImode)
> if (zero_gpr == NULL_RTX)
> {
> zero_gpr = reg;
> - tmp = gen_rtx_SET (reg, const0_rtx);
> + tmp = gen_rtx_SET (reg, zero_rtx);
> if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
> {
> rtx clob = gen_rtx_CLOBBER (VOIDmode,
> @@ -3714,7 +3813,7 @@ ix86_zero_call_used_regs (HARD_REG_SET
> need_zeroed_hardregs)
> if (zero_vector == NULL_RTX)
> {
> zero_vector = reg;
> - tmp = gen_rtx_SET (reg, const0_rtx);
> + tmp = gen_rtx_SET (reg, zero_rtx);
> emit_insn (tmp);
> }
> else
> @@ -3723,11 +3822,20 @@ ix86_zero_call_used_regs (HARD_REG_SET
> need_zeroed_hardregs)
> if (zero_mask == NULL_RTX)
> {
> zero_mask = reg;
> - tmp = gen_rtx_SET (reg, const0_rtx);
> + tmp = gen_rtx_SET (reg, zero_rtx);
> emit_insn (tmp);
> }
> else
> emit_move_insn (reg, zero_mask);
> + else if (mode == V4HImode)
> + if (zero_mmx == NULL_RTX)
> + {
> + zero_mmx = reg;
> + tmp = gen_rtx_SET (reg, zero_rtx);
> + emit_insn (tmp);
> + }
> + else
> + emit_move_insn (reg, zero_mmx);
> else
> gcc_unreachable ();
> }
> diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c
> b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c
> new file mode 100644
> index 00000000000..48b1f019a28
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile { target *-*-linux* } } */
> +/* { dg-options "-O2 -mmmx -fzero-call-used-regs=all" } */
> +/* { dg-require-effective-target ia32 } */
> +
> +__v2si ret_mmx (void)
> +{
> + return (__v2si) { 123, 345 };
> +}
> +
> +/* { dg-final { scan-assembler "pxor\[ \t\]*%mm1, %mm1" } } */
> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm2" } } */
> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm3" } } */
> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm4" } } */
> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm5" } } */
> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm6" } } */
> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm7" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c
> b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c
> new file mode 100644
> index 00000000000..8b5e1cd1602
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c
> @@ -0,0 +1,10 @@
> +/* { dg-do compile { target *-*-linux* } } */
> +/* { dg-options "-O2 -fzero-call-used-regs=all" } */
> +
> +long double ret_x87 (void)
> +{
> + return 1.1L;
> +}
> +
> +/* { dg-final { scan-assembler-times "fldz" 7 } } */
> +/* { dg-final { scan-assembler-times "fstp" 7 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c
> b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c
> new file mode 100644
> index 00000000000..e6fb4acf0fa
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile { target *-*-linux* } } */
> +/* { dg-options "-O2 -fzero-call-used-regs=all" } */
> +/* { dg-require-effective-target lp64} */
Please remove the above line, we can get better test coverage by
conditional scans below.
> +
> +_Complex long double ret_x87_cplx (void)
> +{
> + return 1.1L + 1.2iL;
> +}
> +
> +/* { dg-final { scan-assembler-times "fldz" 6 } } */
> +/* { dg-final { scan-assembler-times "fstp" 6 } } */
/* { dg-final { scan-assembler-times "fldz" 8 { target ia32 } } } */
/* { dg-final { scan-assembler-times "fstp" 8 { target ia32 } } } */
/* { dg-final { scan-assembler-times "fldz" 6 { target { ! ia32 } } } } */
/* { dg-final { scan-assembler-times "fstp" 6 { target { ! ia32 } } } } */
> diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-31.c
> b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-31.c
> new file mode 100644
> index 00000000000..943508d1d26
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-31.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile { target *-*-linux* } } */
> +/* { dg-options "-O2 -mmmx -fzero-call-used-regs=all-arg" } */
> +/* { dg-require-effective-target ia32 } */
> +
> +__v2si ret_mmx (void)
> +{
> + return (__v2si) { 123, 345 };
> +}
> +
> +/* { dg-final { scan-assembler "pxor\[ \t\]*%mm1, %mm1" } } */
> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm2" } } */
> +/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm3" } } */
/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm\[34567\]" } } */
should achieve the same with one regexp.
> +/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm4" } } */
> +/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm5" } } */
> +/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm6" } } */
> +/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm7" } } */
> --
> 2.11.0