Uros,

The following is the change compared to version 4 after fix all the issues you 
raised in the previous email.

Let me know if there is any other issue.

Thanks.

Qing

---
 gcc/config/i386/i386.c                             | 162 +++++++++++++++++----
 .../gcc.target/i386/zero-scratch-regs-28.c         |  16 ++
 .../gcc.target/i386/zero-scratch-regs-29.c         |  10 ++
 .../gcc.target/i386/zero-scratch-regs-30.c         |  11 ++
 .../gcc.target/i386/zero-scratch-regs-31.c         |  16 ++
 5 files changed, 188 insertions(+), 27 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c
 create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c
 create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c
 create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-31.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index e66dcf0d587..e6c5001b11e 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -3554,17 +3554,17 @@ ix86_function_value_regno_p (const unsigned int regno)
 /* Check whether the register REGNO should be zeroed on X86.
    When ALL_SSE_ZEROED is true, all SSE registers have been zeroed
    together, no need to zero it again.
-   Stack registers (st0-st7) and mm0-mm7 are aliased with each other.
-   very hard to be zeroed individually, don't zero individual st or
-   mm registgers.  */
+   When NEED_ZERO_MMX is true, MMX registers should be cleared.  */
 
 static bool
 zero_call_used_regno_p (const unsigned int regno,
-                       bool all_sse_zeroed)
+                       bool all_sse_zeroed,
+                       bool need_zero_mmx)
 {
   return GENERAL_REGNO_P (regno)
         || (!all_sse_zeroed && SSE_REGNO_P (regno))
-        || MASK_REGNO_P (regno);
+        || MASK_REGNO_P (regno)
+        || (need_zero_mmx && MMX_REGNO_P (regno));
 }
 
 /* Return the machine_mode that is used to zero register REGNO.  */
@@ -3579,8 +3579,12 @@ zero_call_used_regno_mode (const unsigned int regno)
     return SImode;
   else if (SSE_REGNO_P (regno))
     return V4SFmode;
-  else
+  else if (MASK_REGNO_P (regno))
     return HImode;
+  else if (MMX_REGNO_P (regno))
+    return V4HImode;
+  else
+    gcc_unreachable ();
 }
 
 /* Generate a rtx to zero all vector registers together if possible,
@@ -3603,7 +3607,7 @@ zero_all_vector_registers (HARD_REG_SET 
need_zeroed_hardregs)
   return gen_avx_vzeroall ();
 }
 
-/* Generate insns to zero all st/mm registers together.
+/* Generate insns to zero all st registers together.
    Return true when zeroing instructions are generated.
    Assume the number of st registers that are zeroed is num_of_st,
    we will emit the following sequence to zero them together:
@@ -3616,23 +3620,49 @@ zero_all_vector_registers (HARD_REG_SET 
need_zeroed_hardregs)
                  ...
                  fstp %%st(0);
    i.e., num_of_st fldz followed by num_of_st fstp to clear the stack
-   mark stack slots empty.  */
+   mark stack slots empty.
+
+   How to compute the num_of_st?
+   There is no direct mapping from stack registers to hard register
+   numbers.  If one stack register need to be cleared, we don't know
+   where in the stack the value remains.  So, if any stack register 
+   need to be cleared, the whole stack should be cleared.  However,
+   x87 stack registers that hold the return value should be excluded.
+   x87 returns in the top (two for complex values) register, so
+   num_of_st should be 7/6 when x87 returns, otherwise it will be 8.  */
+
 
 static bool
-zero_all_st_mm_registers (HARD_REG_SET need_zeroed_hardregs)
+zero_all_st_registers (HARD_REG_SET need_zeroed_hardregs)
 {
   unsigned int num_of_st = 0;
   for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
-    if (STACK_REGNO_P (regno)
-       && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno)
-       /* When the corresponding mm register also need to be cleared too.  */
-       && TEST_HARD_REG_BIT (need_zeroed_hardregs,
-                             (regno - FIRST_STACK_REG + FIRST_MMX_REG)))
-      num_of_st++;
+    if ((STACK_REGNO_P (regno) || MMX_REGNO_P (regno))
+       && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
+      {
+       num_of_st++;
+       break;
+      }
 
   if (num_of_st == 0)
     return false;
 
+  bool return_with_x87 = false;
+  return_with_x87 = (crtl->return_rtx
+                    && (STACK_REG_P (crtl->return_rtx)));
+
+  bool complex_return = false;
+  complex_return = (crtl->return_rtx
+                   && COMPLEX_MODE_P (GET_MODE (crtl->return_rtx)));
+
+  if (return_with_x87)
+    if (complex_return)
+      num_of_st = 6;
+    else
+      num_of_st = 7;
+  else
+    num_of_st = 8;
+
   rtx st_reg = gen_rtx_REG (XFmode, FIRST_STACK_REG);
   for (unsigned int i = 0; i < num_of_st; i++)
     emit_insn (gen_rtx_SET (st_reg, CONST0_RTX (XFmode)));
@@ -3646,6 +3676,43 @@ zero_all_st_mm_registers (HARD_REG_SET 
need_zeroed_hardregs)
   return true;
 }
 
+
+/* When the routine exit with MMX mode, if there is any ST registers
+   need to be zeroed, we should clear all MMX registers except the
+   one that holds the return value RET_MMX_REGNO.  */
+static bool
+zero_all_mm_registers (HARD_REG_SET need_zeroed_hardregs,
+                      unsigned int ret_mmx_regno)
+{
+  bool need_zero_all_mm = false;
+  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+    if (STACK_REGNO_P (regno)
+       && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
+      {
+       need_zero_all_mm = true;
+       break;
+      }
+
+  if (!need_zero_all_mm)
+    return false;
+
+  rtx zero_mmx = NULL_RTX;
+  machine_mode mode = V4HImode;
+  for (unsigned int regno = FIRST_MMX_REG; regno <= LAST_MMX_REG; regno++)
+    if (regno != ret_mmx_regno)
+      {
+       rtx reg = gen_rtx_REG (mode, regno);
+       if (zero_mmx == NULL_RTX)
+         {
+           zero_mmx = reg;
+           emit_insn (gen_rtx_SET (reg, CONST0_RTX(mode)));
+         }
+       else
+         emit_move_insn (reg, zero_mmx);
+      }
+  return true;
+}
+
 /* TARGET_ZERO_CALL_USED_REGS.  */
 /* Generate a sequence of instructions that zero registers specified by
    NEED_ZEROED_HARDREGS.  Return the ZEROED_HARDREGS that are actually
@@ -3655,7 +3722,10 @@ ix86_zero_call_used_regs (HARD_REG_SET 
need_zeroed_hardregs)
 {
   HARD_REG_SET zeroed_hardregs;
   bool all_sse_zeroed = false;
-  bool st_zeroed = false;
+  bool all_st_zeroed = false;
+  bool all_mm_zeroed = false;
+
+  CLEAR_HARD_REG_SET (zeroed_hardregs);
 
   /* first, let's see whether we can zero all vector registers together.  */
   rtx zero_all_vec_insn = zero_all_vector_registers (need_zeroed_hardregs);
@@ -3665,38 +3735,67 @@ ix86_zero_call_used_regs (HARD_REG_SET 
need_zeroed_hardregs)
       all_sse_zeroed = true;
     }
 
-  /* then, let's see whether we can zero all st+mm registers togeter.  */
-  st_zeroed = zero_all_st_mm_registers (need_zeroed_hardregs);
+  /* mm/st registers are shared registers set, we should follow the following
+     rules to clear them:
+                       MMX exit mode       x87 exit mode
+       -------------|----------------------|---------------
+       uses x87 reg | clear all MMX        | clear all x87
+       uses MMX reg | clear individual MMX | clear all x87
+       x87 + MMX    | clear all MMX        | clear all x87
 
-  /* Now, generate instructions to zero all the registers.  */
+     first, we should decide which mode (MMX mode or x87 mode) the function 
+     exit with.  */
 
-  CLEAR_HARD_REG_SET (zeroed_hardregs);
-  if (st_zeroed)
-    SET_HARD_REG_BIT (zeroed_hardregs, FIRST_STACK_REG);
+  bool exit_with_mmx_mode = (crtl->return_rtx
+                            && (MMX_REG_P (crtl->return_rtx)));
+
+  if (!exit_with_mmx_mode)
+    /* x87 exit mode, we should zero all st registers together.  */
+    {
+      all_st_zeroed = zero_all_st_registers (need_zeroed_hardregs);
+      if (all_st_zeroed)
+       SET_HARD_REG_BIT (zeroed_hardregs, FIRST_STACK_REG);
+    }
+  else 
+    /* MMX exit mode, check whether we can zero all mm registers.  */
+    {
+      unsigned int exit_mmx_regno = REGNO (crtl->return_rtx);
+      all_mm_zeroed = zero_all_mm_registers (need_zeroed_hardregs, 
+                                            exit_mmx_regno);
+      if (all_mm_zeroed)
+       for (unsigned int regno = FIRST_MMX_REG; regno <= LAST_MMX_REG; regno++)
+         if (regno != exit_mmx_regno)
+           SET_HARD_REG_BIT (zeroed_hardregs, regno);
+    }
+
+  /* Now, generate instructions to zero all the other registers.  */
 
   rtx zero_gpr = NULL_RTX;
   rtx zero_vector = NULL_RTX;
   rtx zero_mask = NULL_RTX;
+  rtx zero_mmx = NULL_RTX;
 
   for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
     {
       if (!TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
        continue;
-      if (!zero_call_used_regno_p (regno, all_sse_zeroed))
+      if (!zero_call_used_regno_p (regno, all_sse_zeroed, 
+                                  exit_with_mmx_mode && !all_mm_zeroed))
        continue;
 
       SET_HARD_REG_BIT (zeroed_hardregs, regno);
 
-      rtx reg, tmp;
+      rtx reg, tmp, zero_rtx;
       machine_mode mode = zero_call_used_regno_mode (regno);
 
       reg = gen_rtx_REG (mode, regno);
+      zero_rtx = CONST0_RTX (mode);
 
       if (mode == SImode)
        if (zero_gpr == NULL_RTX)
          {
            zero_gpr = reg;
-           tmp = gen_rtx_SET (reg, const0_rtx);
+           tmp = gen_rtx_SET (reg, zero_rtx);
            if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
              {
                rtx clob = gen_rtx_CLOBBER (VOIDmode,
@@ -3714,7 +3813,7 @@ ix86_zero_call_used_regs (HARD_REG_SET 
need_zeroed_hardregs)
        if (zero_vector == NULL_RTX)
          {
            zero_vector = reg;
-           tmp = gen_rtx_SET (reg, const0_rtx);
+           tmp = gen_rtx_SET (reg, zero_rtx);
            emit_insn (tmp);
          }
        else
@@ -3723,11 +3822,20 @@ ix86_zero_call_used_regs (HARD_REG_SET 
need_zeroed_hardregs)
        if (zero_mask == NULL_RTX)
          {
            zero_mask = reg;
-           tmp = gen_rtx_SET (reg, const0_rtx);
+           tmp = gen_rtx_SET (reg, zero_rtx);
            emit_insn (tmp);
          }
        else
          emit_move_insn (reg, zero_mask);
+      else if (mode == V4HImode)
+       if (zero_mmx == NULL_RTX)
+         {
+           zero_mmx = reg;
+           tmp = gen_rtx_SET (reg, zero_rtx);
+           emit_insn (tmp);
+         }
+       else
+         emit_move_insn (reg, zero_mmx);
       else
        gcc_unreachable ();
     }
diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c 
b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c
new file mode 100644
index 00000000000..48b1f019a28
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c
@@ -0,0 +1,16 @@
+/* { dg-do compile { target *-*-linux* } } */
+/* { dg-options "-O2 -mmmx -fzero-call-used-regs=all" } */
+/* { dg-require-effective-target ia32 } */
+
+__v2si ret_mmx (void)
+{
+  return (__v2si) { 123, 345 };
+}
+
+/* { dg-final { scan-assembler "pxor\[ \t\]*%mm1, %mm1" } } */
+/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm2" } } */
+/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm3" } } */
+/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm4" } } */
+/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm5" } } */
+/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm6" } } */
+/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm7" } } */
diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c 
b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c
new file mode 100644
index 00000000000..8b5e1cd1602
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c
@@ -0,0 +1,10 @@
+/* { dg-do compile { target *-*-linux* } } */
+/* { dg-options "-O2 -fzero-call-used-regs=all" } */
+
+long double ret_x87 (void)
+{
+  return 1.1L;
+}
+
+/* { dg-final { scan-assembler-times "fldz" 7 } } */
+/* { dg-final { scan-assembler-times "fstp" 7 } } */
diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c 
b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c
new file mode 100644
index 00000000000..e6fb4acf0fa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target *-*-linux* } } */
+/* { dg-options "-O2  -fzero-call-used-regs=all" } */
+/* { dg-require-effective-target lp64} */
+
+_Complex long double ret_x87_cplx (void)
+{
+  return 1.1L + 1.2iL;
+}
+
+/* { dg-final { scan-assembler-times "fldz" 6 } } */
+/* { dg-final { scan-assembler-times "fstp" 6 } } */
diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-31.c 
b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-31.c
new file mode 100644
index 00000000000..943508d1d26
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-31.c
@@ -0,0 +1,16 @@
+/* { dg-do compile { target *-*-linux* } } */
+/* { dg-options "-O2 -mmmx -fzero-call-used-regs=all-arg" } */
+/* { dg-require-effective-target ia32 } */
+
+__v2si ret_mmx (void)
+{
+  return (__v2si) { 123, 345 };
+}
+
+/* { dg-final { scan-assembler "pxor\[ \t\]*%mm1, %mm1" } } */
+/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm2" } } */
+/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm3" } } */
+/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm4" } } */
+/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm5" } } */
+/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm6" } } */
+/* { dg-final { scan-assembler-not "movq\[ \t\]*%mm1, %mm7" } } */
-- 
2.11.0





> On Oct 27, 2020, at 8:55 AM, Qing Zhao via Gcc-patches 
> <gcc-patches@gcc.gnu.org> wrote:
> 
> 
> 
>> On Oct 27, 2020, at 3:09 AM, Uros Bizjak <ubiz...@gmail.com> wrote:
>> 
>> On Tue, Oct 27, 2020 at 12:08 AM Qing Zhao <qing.z...@oracle.com 
>> <mailto:qing.z...@oracle.com>> wrote:
>>> 
>>> Hi, Uros,
>>> 
>>> Could you please check the change compared to the previous version for 
>>> i386.c as following:
>>> Let me know any issue there.
>> 
>> It looks that the combination when the function only touches MMX
>> registers (so, no x87 register is touched) and exits in MMX mode is
>> not handled in the optimal way.
> 
> My current code should handle this in the expected way already, as following:
> 
> 
>  /* Then, decide which mode (MMX mode or x87 mode) the function exit with.
>     In order to decide whether we need to clear the MMX registers or the
>     stack registers.  */
> 
>  bool exit_with_mmx_mode = (crtl->return_rtx
>                             && (GET_CODE (crtl->return_rtx) == REG)
>                             && (MMX_REG_P (crtl->return_rtx)));
> 
>  /* then, let's see whether we can zero all st registers together.  */
>  if (!exit_with_mmx_mode)
>    all_st_zeroed = zero_all_st_registers (need_zeroed_hardregs);
>  /* Or should we zero all MMX registers.  */
>  else
>    {
>      unsigned int exit_mmx_regno = REGNO (crtl->return_rtx);
>      all_mm_zeroed = zero_all_mm_registers (need_zeroed_hardregs,
>                                             exit_mmx_regno);
>    }
> 
> 
> “Zero_all_mm_registers” only zero all MM registers when any ST register need 
> to be cleared. Otherwise, it will not clear all MM registers.
> And individual MM registers will be cleared in the regular loop as all other 
> registers.
> 
>> In this case, MMX registers should be
>> handled in the same way as XMM registers, where only used/arg/all regs
>> can be cleared.
>> 
>>                 MMX exit mode       x87 exit mode
>> -------------|----------------------|---------------
>> uses x87 reg | clear all MMX        | clear all x87
>> uses MMX reg | clear individual MMX | clear all x87
>> x87 + MMX    | clear all MMX        | clear all x87
>> 
>> IOW, if x87 is used, we don't know where in the stack (or in which MMX
>> "register") the value lies. But when the function uses only MMX
>> registers and exits in MMX mode, we know which register was used, and
>> we *can* access them individually.
> 
> I will add the above table to the comment part of the implementation. 
>> 
>> Also, do we want to handle only arg/used registers?
> 
> Yes.  Arg/used register handling has been done in middle end.  (In 
> gcc/function.c) as following:
> 
>  /* For each of the hard registers, check to see whether we should zero it if:
>     1. it is a call-used-registers;
> and 2. it is not a fixed-registers;
> and 3. it is not live at the return of the routine;
> and 4. it is general registor if gpr_only is true;
> and 5. it is used in the routine if used_only is true;
> and 6. it is a register that passes parameter if arg_only is true;
>   */
> 
> The register set that i386 backend gets already satisfied all the above 
> requirement. 
> 
>> x87 has no arg
>> registers, so there is no need to clear anything. MMX has 3 argument
>> registers for 32bit targets, and is possible to clear them
>> individually when the function exits in MMX mode.
> 
> The above information should already been covered by :
> 
>     if (arg_only && !FUNCTION_ARG_REGNO_P (regno))
> 
> Right?
> 
> 
>> 
>> Please note review comments inline.
>> 
>> Uros.
>> 
>>> Thanks a lot.
>>> 
>>> Qing
>>> 
>>> ---
>>> gcc/config/i386/i386.c                             | 136 
>>> ++++++++++++++++++---
>>> .../gcc.target/i386/zero-scratch-regs-28.c         |  17 +++
>>> .../gcc.target/i386/zero-scratch-regs-29.c         |  11 ++
>>> .../gcc.target/i386/zero-scratch-regs-30.c         |  11 ++
>>> 4 files changed, 155 insertions(+), 20 deletions(-)
>>> create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c
>>> create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c
>>> create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c
>>> 
>>> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
>>> index e66dcf0d587..65f778112d9 100644
>>> --- a/gcc/config/i386/i386.c
>>> +++ b/gcc/config/i386/i386.c
>>> @@ -3554,17 +3554,17 @@ ix86_function_value_regno_p (const unsigned int 
>>> regno)
>>> /* Check whether the register REGNO should be zeroed on X86.
>>>   When ALL_SSE_ZEROED is true, all SSE registers have been zeroed
>>>   together, no need to zero it again.
>>> -   Stack registers (st0-st7) and mm0-mm7 are aliased with each other.
>>> -   very hard to be zeroed individually, don't zero individual st or
>>> -   mm registgers.  */
>>> +   When NEED_ZERO_MMX is true, MMX registers should be cleared.  */
>>> 
>>> static bool
>>> zero_call_used_regno_p (const unsigned int regno,
>>> - bool all_sse_zeroed)
>>> + bool all_sse_zeroed,
>>> + bool need_zero_mmx)
>>> {
>>>  return GENERAL_REGNO_P (regno)
>>>  || (!all_sse_zeroed && SSE_REGNO_P (regno))
>>> -  || MASK_REGNO_P (regno);
>>> +  || MASK_REGNO_P (regno)
>>> +  || (need_zero_mmx && MMX_REGNO_P (regno));
>>> }
>>> 
>>> /* Return the machine_mode that is used to zero register REGNO.  */
>>> @@ -3579,8 +3579,12 @@ zero_call_used_regno_mode (const unsigned int regno)
>>>    return SImode;
>>>  else if (SSE_REGNO_P (regno))
>>>    return V4SFmode;
>>> -  else
>>> +  else if (MASK_REGNO_P (regno))
>>>    return HImode;
>>> +  else if (MMX_REGNO_P (regno))
>>> +    return DImode;
>> 
>> Why DImode instead of V4HImode?
> 
> I tried  V4HImode, and V2SImode in the beginning, all failed during 
> compilation time with “unrecognized inns” error, so, I have to use “DImode”. 
> 
>> DImode is "natural" for integer
>> registers, and we risk moves from integer to MMX regs.
> 
> So, does this mean using DImode is not correct? 
>> 
>>> +  else
>>> +    gcc_unreachable ();
>>> }
>>> 
>>> /* Generate a rtx to zero all vector registers together if possible,
>>> @@ -3603,7 +3607,7 @@ zero_all_vector_registers (HARD_REG_SET 
>>> need_zeroed_hardregs)
>>>  return gen_avx_vzeroall ();
>>> }
>>> 
>>> -/* Generate insns to zero all st/mm registers together.
>>> +/* Generate insns to zero all st registers together.
>>>   Return true when zeroing instructions are generated.
>>>   Assume the number of st registers that are zeroed is num_of_st,
>>>   we will emit the following sequence to zero them together:
>>> @@ -3616,23 +3620,50 @@ zero_all_vector_registers (HARD_REG_SET 
>>> need_zeroed_hardregs)
>>>   ...
>>>   fstp %%st(0);
>>>   i.e., num_of_st fldz followed by num_of_st fstp to clear the stack
>>> -   mark stack slots empty.  */
>>> +   mark stack slots empty.
>>> +
>>> +   How to compute the num_of_st?
>>> +   There is no direct mapping from stack registers to hard register
>>> +   numbers.  If one stack register need to be cleared, we don't know
>>> +   where in the stack the value remains.  So, if any stack register
>>> +   need to be cleared, the whole stack should be cleared.  However,
>>> +   x87 stack registers that hold the return value should be excluded.
>>> +   x87 returns in the top (two for complex values) register, so
>>> +   num_of_st should be 7/6 when x87 returns, otherwise it will be 8.  */
>>> +
>>> 
>>> static bool
>>> -zero_all_st_mm_registers (HARD_REG_SET need_zeroed_hardregs)
>>> +zero_all_st_registers (HARD_REG_SET need_zeroed_hardregs)
>>> {
>>>  unsigned int num_of_st = 0;
>>>  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
>>> -    if (STACK_REGNO_P (regno)
>>> - && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno)
>>> - /* When the corresponding mm register also need to be cleared too.  */
>>> - && TEST_HARD_REG_BIT (need_zeroed_hardregs,
>>> -       (regno - FIRST_STACK_REG + FIRST_MMX_REG)))
>>> -      num_of_st++;
>>> +    if ((STACK_REGNO_P (regno) || MMX_REGNO_P (regno))
>>> + && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
>>> +      {
>>> + num_of_st++;
>>> + break;
>>> +      }
>>> 
>>>  if (num_of_st == 0)
>>>    return false;
>>> 
>>> +  bool return_with_x87 = false;
>>> +  return_with_x87 = (crtl->return_rtx
>>> +      && (GET_CODE (crtl->return_rtx) == REG)
>>> +      && (STACK_REG_P (crtl->return_rtx)));
>> 
>> STACK_REG_P already checks for REG, no need for separate check.
> 
> Okay.
> 
>> 
>>> +
>>> +  bool complex_return = false;
>>> +  complex_return = (crtl->return_rtx
>>> +     && COMPLEX_MODE_P (GET_MODE (crtl->return_rtx)));
>>> +
>>> +  if (return_with_x87)
>>> +    if (complex_return)
>>> +      num_of_st = 6;
>>> +    else
>>> +      num_of_st = 7;
>>> +  else
>>> +    num_of_st = 8;
>>> +
>>>  rtx st_reg = gen_rtx_REG (XFmode, FIRST_STACK_REG);
>>>  for (unsigned int i = 0; i < num_of_st; i++)
>>>    emit_insn (gen_rtx_SET (st_reg, CONST0_RTX (XFmode)));
>>> @@ -3646,6 +3677,43 @@ zero_all_st_mm_registers (HARD_REG_SET 
>>> need_zeroed_hardregs)
>>>  return true;
>>> }
>>> 
>>> +
>>> +/* When the routine exit with MMX mode, if there is any ST registers
>>> +   need to be zeroed, we should clear all MMX registers except the
>>> +   one that holds the return value RET_MMX_REGNO.  */
>>> +static bool
>>> +zero_all_mm_registers (HARD_REG_SET need_zeroed_hardregs,
>>> +        unsigned int ret_mmx_regno)
>>> +{
>>> +  bool need_zero_all_mm = false;
>>> +  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
>>> +    if (STACK_REGNO_P (regno)
>>> + && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
>>> +      {
>>> + need_zero_all_mm = true;
>>> + break;
>>> +      }
>>> +
>>> +  if (!need_zero_all_mm)
>>> +    return false;
>>> +
>>> +  rtx zero_mmx = NULL_RTX;
>>> +  machine_mode mode = DImode;
>>> +  for (unsigned int regno = FIRST_MMX_REG; regno <= LAST_MMX_REG; regno++)
>>> +    if (regno != ret_mmx_regno)
>>> +      {
>>> + rtx reg = gen_rtx_REG (mode, regno);
>>> + if (zero_mmx == NULL_RTX)
>>> +   {
>>> +     zero_mmx = reg;
>>> +     emit_insn (gen_rtx_SET (reg, const0_rtx));
>> 
>> Use CONST0_RTX (mode), and you will be able to use V4HImode instead of 
>> DImode.
> Will try this.
>> 
>>> +   }
>>> + else
>>> +   emit_move_insn (reg, zero_mmx);
>>> +      }
>>> +  return true;
>>> +}
>>> +
>>> /* TARGET_ZERO_CALL_USED_REGS.  */
>>> /* Generate a sequence of instructions that zero registers specified by
>>>   NEED_ZEROED_HARDREGS.  Return the ZEROED_HARDREGS that are actually
>>> @@ -3655,7 +3723,8 @@ ix86_zero_call_used_regs (HARD_REG_SET 
>>> need_zeroed_hardregs)
>>> {
>>>  HARD_REG_SET zeroed_hardregs;
>>>  bool all_sse_zeroed = false;
>>> -  bool st_zeroed = false;
>>> +  bool all_st_zeroed = false;
>>> +  bool all_mm_zeroed = false;
>>> 
>>>  /* first, let's see whether we can zero all vector registers together.  */
>>>  rtx zero_all_vec_insn = zero_all_vector_registers (need_zeroed_hardregs);
>>> @@ -3665,24 +3734,42 @@ ix86_zero_call_used_regs (HARD_REG_SET 
>>> need_zeroed_hardregs)
>>>      all_sse_zeroed = true;
>>>    }
>>> 
>>> -  /* then, let's see whether we can zero all st+mm registers togeter.  */
>>> -  st_zeroed = zero_all_st_mm_registers (need_zeroed_hardregs);
>>> +  /* Then, decide which mode (MMX mode or x87 mode) the function exit with.
>>> +     In order to decide whether we need to clear the MMX registers or the
>>> +     stack registers.  */
>>> +
>>> +  bool exit_with_mmx_mode = (crtl->return_rtx
>>> +      && (GET_CODE (crtl->return_rtx) == REG)
>>> +      && (MMX_REG_P (crtl->return_rtx)));
>> 
>> MMX_REG_P also checks for REG internally.
> 
> Okay, will update.
>> 
>>> +
>>> +  /* then, let's see whether we can zero all st registers together.  */
>>> +  if (!exit_with_mmx_mode)
>>> +    all_st_zeroed = zero_all_st_registers (need_zeroed_hardregs);
>>> +  /* Or should we zero all MMX registers.  */
>>> +  else
>>> +    {
>>> +      unsigned int exit_mmx_regno = REGNO (crtl->return_rtx);
>>> +      all_mm_zeroed = zero_all_mm_registers (need_zeroed_hardregs,
>>> +      exit_mmx_regno);
>>> +    }
>>> 
>>>  /* Now, generate instructions to zero all the registers.  */
>>> 
>>>  CLEAR_HARD_REG_SET (zeroed_hardregs);
>>> -  if (st_zeroed)
>>> +  if (all_st_zeroed)
>>>    SET_HARD_REG_BIT (zeroed_hardregs, FIRST_STACK_REG);
>>> 
>>>  rtx zero_gpr = NULL_RTX;
>>>  rtx zero_vector = NULL_RTX;
>>>  rtx zero_mask = NULL_RTX;
>>> +  rtx zero_mmx = NULL_RTX;
>>> 
>>>  for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
>>>    {
>>>      if (!TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
>>> continue;
>>> -      if (!zero_call_used_regno_p (regno, all_sse_zeroed))
>>> +      if (!zero_call_used_regno_p (regno, all_sse_zeroed,
>>> +    exit_with_mmx_mode && !all_mm_zeroed))
>>> continue;
>>> 
>>>      SET_HARD_REG_BIT (zeroed_hardregs, regno);
>>> @@ -3728,6 +3815,15 @@ ix86_zero_call_used_regs (HARD_REG_SET 
>>> need_zeroed_hardregs)
>>>   }
>>> else
>>>   emit_move_insn (reg, zero_mask);
>>> +      else if (mode == DImode)
>>> + if (zero_mmx == NULL_RTX)
>>> +   {
>>> +     zero_mmx = reg;
>>> +     tmp = gen_rtx_SET (reg, const0_rtx);
>> 
>> CONST0_RTX (mode), and you will be able to use V4HImode.
> 
> Okay, will try this.
>> 
>>> +     emit_insn (tmp);
>>> +   }
>>> + else
>>> +   emit_move_insn (reg, zero_mmx);
>>>      else
>>> gcc_unreachable ();
>>>    }
>>> diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c 
>>> b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c
>>> new file mode 100644
>>> index 00000000000..61c0bb7a35c
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c
>>> @@ -0,0 +1,17 @@
>>> +/* { dg-do compile { target *-*-linux* } } */
>>> +/* { dg-options "-O2 -m32 -mmmx -fzero-call-used-regs=all" } */
>> 
>> -m32 should not be used explicitly. Use:
>> 
>> /* { dg-require-effective-target ia32 } */
>> 
>> instead.
>> 
>> Also, can we test -fzero-call-used-regs=used and
>> -fzero-call-used-regs=arg with MMX regs? As said above, when function
>> exits in MMX mode, and no x87 is touched, we can clear separate MMX
>> registers.
> 
> I will try to add these new testing.
>> 
>>> +
>>> +typedef int __v2si __attribute__ ((vector_size (8)));
>>> +
>>> +__v2si ret_mmx (void)
>>> +{
>>> +  return (__v2si) { 123, 345 };
>>> +}
>>> +
>>> +/* { dg-final { scan-assembler "pxor\[ \t\]*%mm1, %mm1" } } */
>>> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm2" } } */
>>> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm3" } } */
>>> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm4" } } */
>>> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm5" } } */
>>> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm6" } } */
>>> +/* { dg-final { scan-assembler "movq\[ \t\]*%mm1, %mm7" } } */
>>> diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c 
>>> b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c
>>> new file mode 100644
>>> index 00000000000..db636654e70
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c
>>> @@ -0,0 +1,11 @@
>>> +/* { dg-do compile { target *-*-linux* } } */
>>> +/* { dg-options "-O2 -m32 -mmmx -fzero-call-used-regs=all" } */
>> 
>> No need for "-m32 -mmmx", this test works the same for 32bit and 64bit 
>> targets..
> 
> Will fix this.
>> 
>>> +typedef int __v2si __attribute__ ((vector_size (8)));
>> 
>> The above is not needed in this test.
> 
> Okay. 
>>> +
>>> +long double ret_x87 (void)
>>> +{
>>> +  return 1.1L;
>>> +}
>>> +
>>> +/* { dg-final { scan-assembler-times "fldz" 7 } } */
>>> +/* { dg-final { scan-assembler-times "fstp" 7 } } */
>>> diff --git a/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c 
>>> b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c
>>> new file mode 100644
>>> index 00000000000..7c20b569bfa
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c
>>> @@ -0,0 +1,11 @@
>>> +/* { dg-do compile { target *-*-linux* } } */
>>> +/* { dg-options "-O2  -fzero-call-used-regs=all" } */
>>> +typedef int __v2si __attribute__ ((vector_size (8)));
>> 
>> The above line is not needed.
> Okay.
> 
>> 
>>> +
>>> +_Complex long double ret_x87_cplx (void)
>>> +{
>>> +  return 1.1L + 1.2iL;
>>> +}
>>> +
>>> +/* { dg-final { scan-assembler-times "fldz" 6 } } */
>>> +/* { dg-final { scan-assembler-times "fstp" 6 } } */
>> 
>> The above applies only to 64bit target. 32bit targets pass complex
>> value via memory and should clear all 8 registers.
> 
> Will fix this.
> 
> Thanks.
> 
> Qing
> 

Reply via email to