Re: [i386] Rotate stack checking loop

2015-11-12 Thread Uros Bizjak
Hello!

> this patch rotates the loop generated in the prologue to do stack checking
> when -fstack-check is specified, thereby saving one branch instruction.  It
> was initially implemented as a WHILE loop to match the generic implementation
> but can be turned into a DO-WHILE loop because the amount of stack to be
> checked is known at compile time (since it's the static part of the frame).
>
> The patch also changes a mov+sub pair into an lea in the common case on Linux,
> saving one more instruction in the process.
>
> Tested on x86/Linux & x86-64/Linux (ix86_adjust_stack_and_probe path) and
> x86/Solaris (ix86_emit_probe_stack_range path).  OK for the mainline?
>
>
> 2015-11-12  Eric Botcazou  
>
> * config/i386/i386.c (ix86_adjust_stack_and_probe): Adjust and use
> an lea instruction when possible.
> (output_adjust_stack_and_probe): Rotate the loop and simplify.
> (ix86_emit_probe_stack_range): Adjust.
> (output_probe_stack_range): Rotate the loop and simplify.

OK.

Thanks,
Uros.


[i386] Rotate stack checking loop

2015-11-12 Thread Eric Botcazou
Hi,

this patch rotates the loop generated in the prologue to do stack checking 
when -fstack-check is specified, thereby saving one branch instruction.  It 
was initially implemented as a WHILE loop to match the generic implementation 
but can be turned into a DO-WHILE loop because the amount of stack to be 
checked is known at compile time (since it's the static part of the frame).

The patch also changes a mov+sub pair into an lea in the common case on Linux, 
saving one more instruction in the process.

Tested on x86/Linux & x86-64/Linux (ix86_adjust_stack_and_probe path) and 
x86/Solaris (ix86_emit_probe_stack_range path).  OK for the mainline?


2015-11-12  Eric Botcazou  

* config/i386/i386.c (ix86_adjust_stack_and_probe): Adjust and use
an lea instruction when possible.
(output_adjust_stack_and_probe): Rotate the loop and simplify.
(ix86_emit_probe_stack_range): Adjust.
(output_probe_stack_range): Rotate the loop and simplify.

-- 
Eric Botcazou
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 230245)
+++ config/i386/i386.c	(working copy)
@@ -12137,10 +12137,10 @@ ix86_adjust_stack_and_probe (const HOST_
   rtx size_rtx = GEN_INT (size), last;
 
   /* See if we have a constant small number of probes to generate.  If so,
- that's the easy case.  The run-time loop is made up of 11 insns in the
+ that's the easy case.  The run-time loop is made up of 9 insns in the
  generic case while the compile-time loop is made up of 3+2*(n-1) insns
  for n # of intervals.  */
-  if (size <= 5 * PROBE_INTERVAL)
+  if (size <= 4 * PROBE_INTERVAL)
 {
   HOST_WIDE_INT i, adjust;
   bool first_probe = true;
@@ -12207,19 +12207,27 @@ ix86_adjust_stack_and_probe (const HOST_
 	 - (PROBE_INTERVAL + dope))));
 
   /* LAST_ADDR = SP_0 + PROBE_INTERVAL + ROUNDED_SIZE.  */
-  emit_move_insn (sr.reg, GEN_INT (-rounded_size));
-  emit_insn (gen_rtx_SET (sr.reg,
-			  gen_rtx_PLUS (Pmode, sr.reg,
-	stack_pointer_rtx)));
+  if (rounded_size <= (HOST_WIDE_INT_1 << 31))
+	emit_insn (gen_rtx_SET (sr.reg,
+plus_constant (Pmode, stack_pointer_rtx,
+	   -rounded_size)));
+  else
+	{
+	  emit_move_insn (sr.reg, GEN_INT (-rounded_size));
+	  emit_insn (gen_rtx_SET (sr.reg,
+  gen_rtx_PLUS (Pmode, sr.reg,
+		stack_pointer_rtx)));
+	}
 
 
   /* Step 3: the loop
 
-	 while (SP != LAST_ADDR)
+	 do
 	   {
 	 SP = SP + PROBE_INTERVAL
 	 probe at SP
 	   }
+	 while (SP != LAST_ADDR)
 
 	 adjusts SP and probes to PROBE_INTERVAL + N * PROBE_INTERVAL for
 	 values of N from 1 until it is equal to ROUNDED_SIZE.  */
@@ -12275,23 +12283,16 @@ const char *
 output_adjust_stack_and_probe (rtx reg)
 {
   static int labelno = 0;
-  char loop_lab[32], end_lab[32];
+  char loop_lab[32];
   rtx xops[2];
 
-  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno);
-  ASM_GENERATE_INTERNAL_LABEL (end_lab, "LPSRE", labelno++);
+  ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
 
+  /* Loop.  */
   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
 
-  /* Jump to END_LAB if SP == LAST_ADDR.  */
-  xops[0] = stack_pointer_rtx;
-  xops[1] = reg;
-  output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
-  fputs ("\tje\t", asm_out_file);
-  assemble_name_raw (asm_out_file, end_lab);
-  fputc ('\n', asm_out_file);
-
   /* SP = SP + PROBE_INTERVAL.  */
+  xops[0] = stack_pointer_rtx;
   xops[1] = GEN_INT (PROBE_INTERVAL);
   output_asm_insn ("sub%z0\t{%1, %0|%0, %1}", xops);
 
@@ -12299,12 +12300,16 @@ output_adjust_stack_and_probe (rtx reg)
   xops[1] = const0_rtx;
   output_asm_insn ("or%z0\t{%1, (%0)|DWORD PTR [%0], %1}", xops);
 
-  fprintf (asm_out_file, "\tjmp\t");
+  /* Test if SP == LAST_ADDR.  */
+  xops[0] = stack_pointer_rtx;
+  xops[1] = reg;
+  output_asm_insn ("cmp%z0\t{%1, %0|%0, %1}", xops);
+
+  /* Branch.  */
+  fputs ("\tjne\t", asm_out_file);
   assemble_name_raw (asm_out_file, loop_lab);
   fputc ('\n', asm_out_file);
 
-  ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, end_lab);
-
   return "";
 }
 
@@ -12315,10 +12320,10 @@ static void
 ix86_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
 {
   /* See if we have a constant small number of probes to generate.  If so,
- that's the easy case.  The run-time loop is made up of 7 insns in the
+ that's the easy case.  The run-time loop is made up of 6 insns in the
  generic case while the compile-time loop is made up of n insns for n #
  of intervals.  */
-  if (size <= 7 * PROBE_INTERVAL)
+  if (size <= 6 * PROBE_INTERVAL)
 {
   HOST_WIDE_INT i;
 
@@ -12362,11 +12367,12 @@ ix86_emit_probe_stack_range (HOST_WIDE_I
 
   /* Step 3: the loop
 
-	 while (TEST_ADDR != LAST_ADDR)
+	 do
 	   {
 	 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
 	 probe at TEST_ADDR
 	   }
+	 while (TEST_ADDR != LAST_ADDR)
 
  probes at FIRST + N * PROBE