The attached patch removes excess stack space allocation with
alloca in some situations.  Please check the commit message in the
patch for details.

Ciao

Dominik ^_^  ^_^

-- 

Dominik Vogt
IBM Germany
gcc/ChangeLog

        * explow.c (round_push): Use known adjustment.
        (allocate_dynamic_stack_space): Pass known adjustment to round_push.
>From 9ea451aef0f1f2fb0a36a7b718f910cfe285541d Mon Sep 17 00:00:00 2001
From: Dominik Vogt <v...@linux.vnet.ibm.com>
Date: Fri, 29 Apr 2016 08:36:59 +0100
Subject: [PATCH] Drop excess size used for run time allocated stack
 variables.

The present calculation sometimes led to more stack memory being used than
necessary with alloca.  First, (STACK_BOUNDARY - 1) would be added to the
allocated size:

  size = plus_constant (Pmode, size, extra);
  size = force_operand (size, NULL_RTX);

Then round_push was called and added another (STACK_BOUNDARY - 1) before
rounding down to a multiple of STACK_BOUNDARY.  On s390x this resulted in
adding 14 before rounding down for "x" in the test case pr36728-1.c.

round_push() now takes an argument to inform it about what has already been
added to size.
---
 gcc/explow.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/gcc/explow.c b/gcc/explow.c
index e0ce201..a039295 100644
--- a/gcc/explow.c
+++ b/gcc/explow.c
@@ -949,24 +949,30 @@ anti_adjust_stack (rtx adjust)
 }
 
 /* Round the size of a block to be pushed up to the boundary required
-   by this machine.  SIZE is the desired size, which need not be constant.  */
+   by this machine.  SIZE is the desired size, which need not be constant.
+   ALREADY_ADDED is the number of units that have already been added to SIZE for
+   other alignment reasons.
+*/
 
 static rtx
-round_push (rtx size)
+round_push (rtx size, int already_added)
 {
-  rtx align_rtx, alignm1_rtx;
+  rtx align_rtx, add_rtx;
 
   if (!SUPPORTS_STACK_ALIGNMENT
       || crtl->preferred_stack_boundary == MAX_SUPPORTED_STACK_ALIGNMENT)
     {
       int align = crtl->preferred_stack_boundary / BITS_PER_UNIT;
+      int add;
 
       if (align == 1)
 	return size;
 
+      add = (align > already_added) ? align - already_added - 1 : 0;
+
       if (CONST_INT_P (size))
 	{
-	  HOST_WIDE_INT new_size = (INTVAL (size) + align - 1) / align * align;
+	  HOST_WIDE_INT new_size = (INTVAL (size) + add) / align * align;
 
 	  if (INTVAL (size) != new_size)
 	    size = GEN_INT (new_size);
@@ -974,7 +980,7 @@ round_push (rtx size)
 	}
 
       align_rtx = GEN_INT (align);
-      alignm1_rtx = GEN_INT (align - 1);
+      add_rtx = (add > 0) ? GEN_INT (add) : const0_rtx;
     }
   else
     {
@@ -983,15 +989,15 @@ round_push (rtx size)
 	 substituted by the right value in vregs pass and optimized
 	 during combine.  */
       align_rtx = virtual_preferred_stack_boundary_rtx;
-      alignm1_rtx = force_operand (plus_constant (Pmode, align_rtx, -1),
-				   NULL_RTX);
+      add_rtx = force_operand (plus_constant (Pmode, align_rtx, -1), NULL_RTX);
     }
 
   /* CEIL_DIV_EXPR needs to worry about the addition overflowing,
      but we know it can't.  So add ourselves and then do
      TRUNC_DIV_EXPR.  */
-  size = expand_binop (Pmode, add_optab, size, alignm1_rtx,
-		       NULL_RTX, 1, OPTAB_LIB_WIDEN);
+  if (add_rtx != const0_rtx)
+    size = expand_binop (Pmode, add_optab, size, add_rtx,
+			 NULL_RTX, 1, OPTAB_LIB_WIDEN);
   size = expand_divmod (0, TRUNC_DIV_EXPR, Pmode, size, align_rtx,
 			NULL_RTX, 1);
   size = expand_mult (Pmode, size, align_rtx, NULL_RTX, 1);
@@ -1175,6 +1181,7 @@ allocate_dynamic_stack_space (rtx size, unsigned size_align,
   rtx_code_label *final_label;
   rtx final_target, target;
   unsigned extra_align = 0;
+  unsigned extra = 0;
   bool must_align;
 
   /* If we're asking for zero bytes, it doesn't matter what we point
@@ -1275,9 +1282,9 @@ allocate_dynamic_stack_space (rtx size, unsigned size_align,
   extra_align = BITS_PER_UNIT;
 #endif
 
-  if (must_align)
+  if (must_align && required_align > extra_align)
     {
-      unsigned extra = (required_align - extra_align) / BITS_PER_UNIT;
+      extra = (required_align - extra_align) / BITS_PER_UNIT;
 
       size = plus_constant (Pmode, size, extra);
       size = force_operand (size, NULL_RTX);
@@ -1285,7 +1292,7 @@ allocate_dynamic_stack_space (rtx size, unsigned size_align,
       if (flag_stack_usage_info)
 	stack_usage_size += extra;
 
-      if (extra && size_align > extra_align)
+      if (size_align > extra_align)
 	size_align = extra_align;
     }
 
@@ -1304,7 +1311,7 @@ allocate_dynamic_stack_space (rtx size, unsigned size_align,
      momentarily mis-aligning the stack.  */
   if (size_align % MAX_SUPPORTED_STACK_ALIGNMENT != 0)
     {
-      size = round_push (size);
+      size = round_push (size, extra);
 
       if (flag_stack_usage_info)
 	{
-- 
2.3.0

Reply via email to