On Tue, Oct 20, 2020 at 4:01 PM Qing Zhao <qing.z...@oracle.com> wrote:
>
> Hi, Uros,
>
> Thanks a lot for your comments.
>
> On Oct 19, 2020, at 2:30 PM, Uros Bizjak <ubiz...@gmail.com> wrote:
>
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index f684954..620114f 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -3551,6 +3551,161 @@ ix86_function_value_regno_p (const unsigned int regno)
>  return false;
> }
>
> +/* Check whether the register REGNO should be zeroed on X86.
> +   When ALL_SSE_ZEROED is true, all SSE registers have been zeroed
> +   together, no need to zero it again.
> +   Stack registers (st0-st7) and mm0-mm7 are aliased with each other.
> +   very hard to be zeroed individually, don't zero individual st or
> +   mm registgers at this time.  */
> +
> +static bool
> +zero_call_used_regno_p (const unsigned int regno,
> + bool all_sse_zeroed)
> +{
> +  return GENERAL_REGNO_P (regno)
> +  || (!all_sse_zeroed && SSE_REGNO_P (regno))
> +  || MASK_REGNO_P (regno);
> +}
> +
> +/* Return the machine_mode that is used to zero register REGNO.  */
> +
> +static machine_mode
> +zero_call_used_regno_mode (const unsigned int regno)
> +{
> +  /* NB: We only need to zero the lower 32 bits for integer registers
> +     and the lower 128 bits for vector registers since destination are
> +     zero-extended to the full register width.  */
> +  if (GENERAL_REGNO_P (regno))
> +    return SImode;
> +  else if (SSE_REGNO_P (regno))
> +    return V4SFmode;
> +  else
> +    return HImode;
> +}
> +
> +/* Generate a rtx to zero all vector registers togetehr if possible,
> +   otherwise, return NULL.  */
> +
> +static rtx
> +zero_all_vector_registers (HARD_REG_SET need_zeroed_hardregs)
> +{
> +  if (!TARGET_AVX)
> +    return NULL;
> +
> +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
> +    if ((IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG)
> +  || (TARGET_64BIT
> +      && (REX_SSE_REGNO_P (regno)
> +  || (TARGET_AVX512F && EXT_REX_SSE_REGNO_P (regno)))))
> + && !TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
> +      return NULL;
> +
> +  return gen_avx_vzeroall ();
> +}
> +
> +/* Generate a rtx to zero all st and mm registers togetehr if possible,
> +   otherwise, return NULL.  */
> +
> +static rtx
> +zero_all_st_mm_registers (HARD_REG_SET need_zeroed_hardregs)
> +{
> +  if (!TARGET_MMX)
> +    return NULL;
> +
> +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
> +    if ((STACK_REGNO_P (regno) || MMX_REGNO_P (regno))
> + && !TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
> +      return NULL;
> +
> +  return gen_mmx_emms ();
>
>
> emms is not clearing any register, it only loads x87FPUTagWord with
> FFFFH. So I think, the above is useless, as far as register clearing
> is concerned.
>
>
> Thanks for the info.
>
> So, for mm and st registers, should we clear them, and how?

I don't know.

Please note that %mm and %st share the same register file, and
touching %mm registers will block access to %st until emms is emitted.
You can't just blindly load 0 to %st registers, because the register
file can be in MMX mode and vice versa. For 32bit targets, function
can also  return a value in the %mm0.

>
>
> +}
> +
> +/* TARGET_ZERO_CALL_USED_REGS.  */
> +/* Generate a sequence of instructions that zero registers specified by
> +   NEED_ZEROED_HARDREGS.  Return the ZEROED_HARDREGS that are actually
> +   zeroed.  */
> +static HARD_REG_SET
> +ix86_zero_call_used_regs (HARD_REG_SET need_zeroed_hardregs)
> +{
> +  HARD_REG_SET zeroed_hardregs;
> +  bool all_sse_zeroed = false;
> +
> +  /* first, let's see whether we can zero all vector registers together.  */
> +  rtx zero_all_vec_insn = zero_all_vector_registers (need_zeroed_hardregs);
> +  if (zero_all_vec_insn)
> +    {
> +      emit_insn (zero_all_vec_insn);
> +      all_sse_zeroed = true;
> +    }
> +
> +  /* then, let's see whether we can zero all st+mm registers togeter.  */
> +  rtx zero_all_st_mm_insn = zero_all_st_mm_registers (need_zeroed_hardregs);
> +  if (zero_all_st_mm_insn)
> +    emit_insn (zero_all_st_mm_insn);
> +
> +  /* Now, generate instructions to zero all the registers.  */
> +
> +  CLEAR_HARD_REG_SET (zeroed_hardregs);
> +  rtx zero_gpr = NULL_RTX;
> +  rtx zero_vector = NULL_RTX;
> +  rtx zero_mask = NULL_RTX;
> +
> +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
> +    {
> +      if (!TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
> + continue;
> +      if (!zero_call_used_regno_p (regno, all_sse_zeroed))
> + continue;
> +
> +      SET_HARD_REG_BIT (zeroed_hardregs, regno);
> +
> +      rtx reg, tmp;
> +      machine_mode mode = zero_call_used_regno_mode (regno);
> +
> +      reg = gen_rtx_REG (mode, regno);
> +
> +      if (mode == SImode)
> + if (zero_gpr == NULL_RTX)
> +   {
> +     zero_gpr = reg;
> +     tmp = gen_rtx_SET (reg, const0_rtx);
> +     if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
>
>
> No need to complicate here, there is a peephole2 pattern that will perform:
>
> ;; Attempt to always use XOR for zeroing registers (including FP modes).
> (define_peephole2
>  [(set (match_operand 0 "general_reg_operand")
>    (match_operand 1 "const0_operand"))]
>
> So, simply load a register with 0 and leave to the peephole to do its magic.
>
>
> Since the new register zeroing pass is after peephole2 pass, the above 
> peephole optimization cannot be applied.
>
>           NEXT_PASS (pass_peephole2);   ====> peephole2
>           NEXT_PASS (pass_if_after_reload);
>           NEXT_PASS (pass_regrename);
>           NEXT_PASS (pass_cprop_hardreg);
>           NEXT_PASS (pass_fast_rtl_dce);
>           NEXT_PASS (pass_reorder_blocks);
>           NEXT_PASS (pass_leaf_regs);
>           NEXT_PASS (pass_split_before_sched2);
>           NEXT_PASS (pass_sched2);
>           NEXT_PASS (pass_stack_regs);
>           PUSH_INSERT_PASSES_WITHIN (pass_stack_regs)
>               NEXT_PASS (pass_split_before_regstack);
>               NEXT_PASS (pass_stack_regs_run);
>           POP_INSERT_PASSES ()
>       POP_INSERT_PASSES ()
>       NEXT_PASS (pass_late_compilation);
>       PUSH_INSERT_PASSES_WITHIN (pass_late_compilation)
>           NEXT_PASS (pass_zero_call_used_regs);   ====> new zero registers 
> pass.
>           NEXT_PASS (pass_compute_alignments);
>           NEXT_PASS (pass_variable_tracking);
>
> So, the current code should still be necessary?

Yes, I was not aware that the pass is after peephole2.

Uros.

Reply via email to