Hi

I would like to have a stack check for threads that each have only a small amount of stack space. (I'm using an ARM Cortex-M3 microcontroller with a stack size of 1 KByte per thread.)
Each thread has its own limit address.
The thread scheduler can then calculate the limit and store this value in a global variable. The compiler can generate code to check the stack for overflow at function entry.
In principle this can be done this way:
 - push registers as usual
 - figure out whether one or two work registers can be used directly without an extra push
 - if not enough registers are found, push the required work registers to the stack
 - load the limit address into the first work register
 - load the value at the limit address (into the same register)
 - if the function will extend the stack (e.g. for local variables), load this size value too
   (here the second work register can be used)
 - compare for overflow
 - if an overflow occurs, "call" the stack_failure function
 - pop the work registers that were pushed before
 - continue the function prologue as usual, e.g. extend the stack pointer

The ARM target has an option "-mapcs-stack-check", but it is more or less not working (the implementation is missing).
There are also architecture independent options like
"-fstack-check=generic", "-fstack-limit-symbol=current_stack_limit" or "-fstack-limit-register=r6"
that can be used.

The generic stack check does a probe at the end of the function prologue
(e.g. by writing 12K ahead of the current stack pointer position).
If this stack space is not available, the probe may generate a fault.
This requires that the CPU has an MPU or an MMU.
For machines with a small memory space an additional mechanism should be available.

The option "-fstack-check" could be extended by the switches "direct" and "indirect" to emit compare code in the function prologue. If the switch "direct" is given, the address of the "-fstack-limit-symbol" symbol is the limit itself. If the switch "indirect" is given, the "-fstack-limit-symbol" symbol is a kind of global variable that needs to be read before the comparison.

I have added a proposal to show how this behavior can be integrated for the ARM architecture.

Is there interest in having such a feature in GCC?
Is there someone with write permission who is willing to volunteer for this task? Is the code still small enough to be acceptable, or is additional paperwork required first?
The generated code itself will be small
e.g. if using "-fstack-check=indirect -fstack-limit-symbol=stack_limit_var"
->    push    {r0}
->    ldr    r0, =stack_limit_var
->    ldr    r0, [r0]
->    cmp    sp, r0
->    bhs    1f
->    push    {lr}
->    bl    __thumb_stack_failure    @ stack check
->.align
->.ltorg
->1:
->    pop    {r0}
The rest of the implementation overhead is only GCC specific.

Regards
 Thomas Klein

PS
Here are some more implementation hints.
introduce new parameters "direct" and "indirect" in gcc/opts.c and gcc/flag-types.h

gcc/explow.c function allocate_dynamic_stack_space:
- suppress stack probing if parameter "direct" or "indirect" is given, or if a stack limit is given
- do an additional read of the limit value if parameter "indirect" and a stack-limit symbol are given

gcc/config/arm/arm.c
- new function "stack_check_output_function" to write the stack check to the assembler file
- new function "stack_check_work_registers" to find possible work registers (only used by the stack check)
 - integration for ARM and Thumb-2 in function arm_expand_prologue
 - integration for Thumb-1 in function thumb1_output_function_prologue

gcc/config/arm/arm.md
- probe_stack: do not emit code when parameters "direct" or "indirect" given
                emit code as in gcc/explow.c
- probe_stack_done: dummy to make sure probe_stack insns are not optimized away
- check_stack: if a stack limit and parameter "generic" are given, use the limit the same way as in function allocate_dynamic_stack_space
- stack_check: ARM/Thumb-2 insn whose output is produced by function stack_check_output_function
 - trap: failure call used in function allocate_dynamic_stack_space


Index: gcc/opts.c
===================================================================
--- gcc/opts.c    (revision 168762)
+++ gcc/opts.c    (working copy)
@@ -1616,6 +1616,12 @@ common_handle_option (struct gcc_options *opts,
                : STACK_CHECK_STATIC_BUILTIN
                  ? STATIC_BUILTIN_STACK_CHECK
                  : GENERIC_STACK_CHECK;
+      else if (!strcmp (arg, "indirect"))
+    /* This is an other stack checking method.  */
+    opts->x_flag_stack_check = INDIRECT_STACK_CHECK;
+      else if (!strcmp (arg, "direct"))
+    /* This is an other stack checking method.  */
+    opts->x_flag_stack_check = DIRECT_STACK_CHECK;
       else
     warning_at (loc, 0, "unknown stack check parameter \"%s\"", arg);
       break;
Index: gcc/flag-types.h
===================================================================
--- gcc/flag-types.h    (revision 168762)
+++ gcc/flag-types.h    (working copy)
@@ -153,7 +153,11 @@ enum stack_check_type

   /* Check the stack and entirely rely on the target configuration
      files, i.e. do not use the generic mechanism at all.  */
-  FULL_BUILTIN_STACK_CHECK
+  FULL_BUILTIN_STACK_CHECK,
+
+  /* Check the stack after allocation at each function entry */
+  DIRECT_STACK_CHECK,
+  INDIRECT_STACK_CHECK
 };

 /* Names for the different levels of -Wstrict-overflow=N.  The numeric
Index: gcc/explow.c
===================================================================
--- gcc/explow.c    (revision 168762)
+++ gcc/explow.c    (working copy)
@@ -1395,7 +1395,12 @@ allocate_dynamic_stack_space (rtx size, unsigned s

/* If needed, check that we have the required amount of stack. Take into
      account what has already been checked.  */
-  if (STACK_CHECK_MOVING_SP)
+  if (  STACK_CHECK_MOVING_SP
+#ifdef HAVE_check_stack
+     || crtl->limit_stack
+#endif
+     || flag_stack_check == DIRECT_STACK_CHECK
+     || flag_stack_check == INDIRECT_STACK_CHECK)
     ;
   else if (flag_stack_check == GENERIC_STACK_CHECK)
probe_stack_range (STACK_OLD_CHECK_PROTECT + STACK_CHECK_MAX_FRAME_SIZE,
@@ -1437,15 +1442,23 @@ allocate_dynamic_stack_space (rtx size, unsigned s
       /* Check stack bounds if necessary.  */
       if (crtl->limit_stack)
     {
+          rtx limit_rtx;
       rtx available;
       rtx space_available = gen_label_rtx ();
+          if (  GET_CODE (stack_limit_rtx) == SYMBOL_REF
+ && flag_stack_check == INDIRECT_STACK_CHECK)
+            limit_rtx = expand_unop (Pmode, mov_optab,
+                    gen_rtx_MEM (Pmode, stack_limit_rtx),
+                    NULL_RTX, 1);
+          else
+            limit_rtx = stack_limit_rtx;
 #ifdef STACK_GROWS_DOWNWARD
       available = expand_binop (Pmode, sub_optab,
-                    stack_pointer_rtx, stack_limit_rtx,
+                    stack_pointer_rtx, limit_rtx,
                     NULL_RTX, 1, OPTAB_WIDEN);
 #else
       available = expand_binop (Pmode, sub_optab,
-                    stack_limit_rtx, stack_pointer_rtx,
+                    limit_rtx, stack_pointer_rtx,
                     NULL_RTX, 1, OPTAB_WIDEN);
 #endif
       emit_cmp_and_jump_insns (available, size, GEU, NULL_RTX, Pmode, 1,
Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c    (revision 168762)
+++ gcc/config/arm/arm.c    (working copy)
@@ -14472,6 +14472,136 @@ arm_output_function_prologue (FILE *f, HOST_WIDE_I

 }

+/* Write the stack check to F: trap unless SP >= limit + AMOUNT.  REG0 and
+   REG1 are work registers; the first NUMREGS of them are free, any others
+   are saved and restored around the check.  */
+void
+stack_check_output_function (FILE *f, int reg0, int reg1, unsigned amount,
+                             unsigned numregs)
+{
+  unsigned amount_needsreg, amount_const_ok, issym=0;
+
+  if (TARGET_THUMB1)
+    amount_const_ok = (amount < 256);
+  else
+    amount_const_ok = const_ok_for_arm (amount);
+
+  if(GET_CODE (stack_limit_rtx) == SYMBOL_REF) /*stack_limit_rtx*/
+    {
+      issym = 1;
+      amount_needsreg = !amount_const_ok;
+    }
+  else
+    amount_needsreg = (amount > 0);
+
+  if (issym && amount) /*need temp regs for limit and amount*/
+    {
+      if (numregs >= 2)
+        ; /*no need to push*/
+      else if (numregs == 1)
+        {
+          if (amount_needsreg)
+            {
+              /*push temp reg for amount*/
+              if (TARGET_ARM)
+                asm_fprintf (f, "\tstr\t%r, [%r, #-4]!\n", reg1, SP_REGNUM);
+              else
+                asm_fprintf (f, "\tpush\t{%r}\n", reg1);
+            }
+        }
+      else
+        {
+          /*push temp regs for limit and amount; "!" makes SP write back*/
+          if (TARGET_ARM)
+            asm_fprintf (f, "\tstmfd\t%r!, {%r,%r}\n", SP_REGNUM, reg0, reg1);
+          else
+            asm_fprintf (f, "\tpush\t{%r,%r}\n", reg0, reg1);
+        }
+    }
+  else if ((issym || amount_needsreg) && numregs == 0)
+    { /*push temp reg either for limit or amount*/
+      if (TARGET_ARM)
+        asm_fprintf (f, "\tstr\t%r, [%r, #-4]!\n", reg0, SP_REGNUM);
+      else
+        asm_fprintf (f, "\tpush\t{%r}\n", reg0);
+    }
+
+  if (issym)
+    {
+      const char *str = XSTR (stack_limit_rtx, 0);
+
+      asm_fprintf (f, "\tldr\t%r, =%s\n", reg0, str);
+      if (flag_stack_check == INDIRECT_STACK_CHECK)
+        asm_fprintf (f, "\tldr\t%r, [%r]\n", reg0, reg0);
+      if (amount)
+        {
+          if (amount_const_ok)
+            asm_fprintf (f, "\tadds\t%r, %r, #%d\n", reg0, reg0, amount);
+          else
+            {
+              asm_fprintf (f, "\tldr\t%r, =%d\n", reg1, amount);
+              asm_fprintf (f, "\tadd\t%r, %r, %r\n", reg0, reg0, reg1);
+            }
+        }
+      asm_fprintf (f, "\tcmp\t%r, %r\n", SP_REGNUM, reg0);
+    }
+  else if (amount)
+    {
+      if (amount_const_ok)
+        asm_fprintf (f, "\tmov\t%r, #%d\n", reg0, amount);
+      else
+        asm_fprintf (f, "\tldr\t%r, =%d\n", reg0, amount);
+      asm_fprintf (f, "\tadd\t%r, %r, %r\n", reg0, reg0, REGNO(stack_limit_rtx));
+      asm_fprintf (f, "\tcmp\t%r, %r\n", SP_REGNUM, reg0);
+    }
+  else
+    asm_fprintf (f, "\tcmp\t%r, %r\n", SP_REGNUM, REGNO(stack_limit_rtx));
+  asm_fprintf (f, "\tbhs\t1f\n");
+  if (TARGET_ARM)
+    {
+      asm_fprintf (f, "\tstr\t%r, [%r, #-4]!\n", LR_REGNUM, SP_REGNUM);
+      asm_fprintf (f, "\tbl\t__arm_stack_failure\t%@ stack check\n");
+    }
+  else
+    {
+      asm_fprintf (f, "\tpush\t{%r}\n", LR_REGNUM);
+      asm_fprintf (f, "\tbl\t__thumb_stack_failure\t%@ stack check\n");
+    }
+
+  if (issym || amount) /*temp regs: collect values from here*/
+    asm_fprintf (f, ".align\n.ltorg\n");
+  asm_fprintf (f, "1:\n");
+  if (issym && amount) /*pop temp regs used by limit and amount*/
+    {
+      if (numregs >= 2)
+        ; /*no need to pop*/
+      else if (numregs == 1)
+        {
+          if (amount_needsreg)
+            {
+              if (TARGET_ARM)
+                asm_fprintf (f, "\tldr\t%r, [%r], #4\n", reg1, SP_REGNUM);
+              else
+                asm_fprintf (f, "\tpop\t{%r}\n", reg1);
+            }
+        }
+      else
+        {
+          if (TARGET_ARM)
+            asm_fprintf (f, "\tldmfd\t%r!, {%r,%r}\n", SP_REGNUM, reg0, reg1);
+          else
+            asm_fprintf (f, "\tpop\t{%r,%r}\n", reg0, reg1);
+        }
+    }
+  else if ((issym || amount_needsreg) && numregs == 0)
+    { /*pop temp reg used by limit or amount*/
+      if (TARGET_ARM)
+        asm_fprintf (f, "\tldr\t%r, [%r], #4\n", reg0, SP_REGNUM);
+      else
+        asm_fprintf (f, "\tpop\t{%r}\n", reg0);
+    }
+}
+
 const char *
 arm_output_epilogue (rtx sibling)
 {
@@ -15616,6 +15746,72 @@ thumb_set_frame_pointer (arm_stack_offsets *offset
   RTX_FRAME_RELATED_P (insn) = 1;
 }

+/* Fill WORKREG[0..1] with two work registers for the prologue stack check.
+   Return how many of them (0..2) can be used without an extra push/pop.  */
+
+static int
+stack_check_work_registers (rtx *workreg)
+{
+  int reg, i, k, n, nregs;
+
+  if (crtl->args.info.pcs_variant <= ARM_PCS_AAPCS_LOCAL)
+    {
+      nregs = crtl->args.info.aapcs_next_ncrn;
+    }
+  else
+    nregs = crtl->args.info.nregs;
+
+
+  n = 0;
+  i = 0;
+ /* First choice: argument registers r0..LAST_ARG_REGNUM that are never live
+    or, per the argument bookkeeping tested below, hold no incoming data.  */
+  for (reg = 0; reg <= LAST_ARG_REGNUM && i < 2; reg++)
+    {
+      if (  !df_regs_ever_live_p (reg)
+ || (cfun->machine->uses_anonymous_args && crtl->args.pretend_args_size
+ > (LAST_ARG_REGNUM - reg) * UNITS_PER_WORD)
+         || (!cfun->machine->uses_anonymous_args && nregs < reg + 1)
+         )
+        {
+      workreg[i++] = gen_rtx_REG (SImode, reg);
+      n = (reg + 1) % 4; /* fallback candidates start after REG, wrapping */
+        }
+    }
+
+  /* Otherwise try live, non-fixed r4..r7 (presumably already saved - confirm).  */
+  for (reg = LAST_ARG_REGNUM + 1; reg <= LAST_LO_REGNUM && i < 2; reg++)
+    {
+      if (  df_regs_ever_live_p (reg)
+ && !fixed_regs[reg]
+ && reg != FP_REGNUM )
+        {
+      workreg[i++] = gen_rtx_REG (SImode, reg);
+        }
+    }
+
+  if (TARGET_32BIT)
+    {
+      /* ARM and Thumb-2 can use high regs.  */
+      for (reg = FIRST_HI_REGNUM; reg <= LAST_HI_REGNUM && i < 2; reg ++)
+        if (  df_regs_ever_live_p (reg)
+ && !fixed_regs[reg]
+ && reg != FP_REGNUM )
+          {
+        workreg[i++] = gen_rtx_REG (SImode, reg);
+          }
+    }
+
+  k = i; /* number of registers found that need no extra push */
+  /* If fewer than two were found, fill the remaining slots from r0..r3
+   * starting at N; these are not counted in the return value.  */
+  for ( ; i<2; i++)
+    workreg[i] = gen_rtx_REG (SImode, n++);
+
+  return k;
+}
+
+
 /* Generate the prologue instructions for entry into an ARM or Thumb-2
    function.  */
 void
@@ -15865,6 +16061,24 @@ arm_expand_prologue (void)
     current_function_static_stack_size
       = offsets->outgoing_args - offsets->saved_args;

+  if (  crtl->limit_stack
+ && !(IS_INTERRUPT (func_type))
+ && (  flag_stack_check == DIRECT_STACK_CHECK
+        || flag_stack_check == INDIRECT_STACK_CHECK)
+ && (offsets->outgoing_args - offsets->saved_args) > 0
+     )
+    {
+      rtx reg[2], num_temp_regs;
+
+      amount = GEN_INT (offsets->outgoing_args - saved_regs
+            - offsets->saved_args);
+      num_temp_regs = GEN_INT (stack_check_work_registers(reg));
+      insn = gen_stack_check (stack_pointer_rtx,
+                              reg[0], reg[1], stack_limit_rtx,
+                              amount, num_temp_regs);
+      insn = emit_insn (insn);
+    }
+
   if (offsets->outgoing_args != offsets->saved_args + saved_regs)
     {
       /* This add can produce multiple insns for a large constant, so we
@@ -21050,6 +21264,26 @@ thumb1_output_function_prologue (FILE *f, HOST_WID
         thumb_pushpop (f, pushable_regs, 1, &cfa_offset, real_regs_mask);
     }
     }
+
+  if(  crtl->limit_stack
+ && (  flag_stack_check == DIRECT_STACK_CHECK
+       || flag_stack_check == INDIRECT_STACK_CHECK)
+ && (offsets->outgoing_args - offsets->saved_args)
+    )
+    {
+      unsigned amount, numregs;
+      int reg0, reg1;
+      rtx reg[2];
+
+      amount = offsets->outgoing_args - offsets->saved_regs;
+      amount -= 4 * thumb1_extra_regs_pushed (offsets, true);
+
+      numregs = stack_check_work_registers(reg);
+      reg0 = REGNO (reg[0]);
+      reg1 = REGNO (reg[1]);
+
+      stack_check_output_function  (f, reg0, reg1, amount, numregs);
+    }
 }

 /* Handle the case of a double word load into a low register from
Index: gcc/config/arm/arm.md
===================================================================
--- gcc/config/arm/arm.md    (revision 168762)
+++ gcc/config/arm/arm.md    (working copy)
@@ -104,6 +104,7 @@
    (UNSPEC_SYMBOL_OFFSET 27) ; The offset of the start of the symbol from
                              ; another symbolic address.
    (UNSPEC_MEMORY_BARRIER 28) ; Represent a memory barrier.
+   (UNSPEC_PROBE_STACK      29) ; probe stack memory reference
   ]
 )

@@ -10581,6 +10582,108 @@
   [(set_attr "conds" "clob")]
 )

+(define_expand "probe_stack"  ; probe by storing zero to memory operand 0
+  [(match_operand 0 "memory_operand" "")]
+  "TARGET_EITHER"
+{
+  if (  flag_stack_check == DIRECT_STACK_CHECK
+     || flag_stack_check == INDIRECT_STACK_CHECK)
+    ; /* no probe is emitted for the direct/indirect modes */
+  else
+    {
+      emit_move_insn (operands[0], const0_rtx);
+      emit_insn (gen_probe_stack_done (operands[0]));
+      emit_insn (gen_blockage ());
+    }
+  DONE;
+}
+)
+
+(define_insn "probe_stack_done"  ; dummy insn: template is only an asm comment
+  [(set (match_operand 0 "memory_operand" "=m")
+        (unspec [(const_int 0)] UNSPEC_PROBE_STACK))]
+  "TARGET_EITHER"
+  "@ probe stack done"
+  [(set_attr "type" "store1")]
+)
+
+(define_expand "check_stack"  ; trap unless stack_limit_rtx <= operand 0 (unsigned)
+  [(match_operand 0 "memory_operand" "")]
+  "crtl->limit_stack
+ && flag_stack_check != DIRECT_STACK_CHECK
+ && flag_stack_check != INDIRECT_STACK_CHECK"
+{
+  rtx label = gen_label_rtx ();
+  rtx addr = copy_rtx (operands[0]);
+  addr = gen_rtx_fmt_ee (MINUS, Pmode, addr, GEN_INT (0)); /* NOTE(review): subtracting 0 looks like a placeholder - confirm */
+  addr = force_operand (addr, NULL_RTX);
+  emit_insn (gen_blockage ());
+  emit_cmp_and_jump_insns (stack_limit_rtx, addr, LEU, NULL_RTX, Pmode, 1,
+                           label); /* limit <= addr: skip the trap */
+  emit_insn (gen_trap ());
+  emit_label (label);
+  emit_insn (gen_blockage ());
+  DONE;
+}
+)
+
+
+(define_insn "stack_check"  ; prologue stack-limit check, text written by stack_check_output_function
+  [(set
+   (match_operand:SI 0 "register_operand" "=k")  ; stack pointer
+   (match_operand:SI 3 "general_operand"  "sr")  ; stack limit (must be stack_limit_rtx)
+   )
+   (match_operand:SI 1 "register_operand" "r")   ; first work register
+   (match_operand:SI 2 "register_operand" "r")   ; second work register
+   (match_operand:SI 4 "general_operand"  "i")   ; amount added to the limit before the compare
+   (match_operand:SI 5 "general_operand"  "i")   ; number of work registers usable without push
+   (clobber (reg:CC CC_REGNUM))
+  ]
+  "TARGET_32BIT
+ && (operands[3] == stack_limit_rtx)
+ && (GET_CODE (operands[4]) == CONST_INT)
+ && (GET_CODE (operands[5]) == CONST_INT)"
+  "*
+  {
+    int reg0, reg1;
+    unsigned amount, numregs;
+
+    reg0 = REGNO (operands[1]);
+    reg1 = REGNO (operands[2]);
+    amount = INTVAL (operands[4]);
+    numregs = INTVAL (operands[5]);
+
+ stack_check_output_function (asm_out_file, reg0, reg1, amount, numregs);
+  }
+  return \"\";
+  "
+  [(set_attr "conds" "clob")]
+)
+
+(define_insn "trap"  ; unconditional trap: push LR, then call the stack-failure routine
+  [(trap_if (const_int 1) (const_int 0))]
+  "TARGET_EITHER"
+  "*
+  {
+    rtx ops[2];
+
+    ops[0] = stack_pointer_rtx;
+    ops[1] = gen_rtx_REG (SImode, LR_REGNUM);
+    if (TARGET_ARM)
+      {
+        output_asm_insn (\"str\\t%1, [%0, #-4]!\", ops);
+        output_asm_insn (\"bl\\t__arm_stack_failure\\t%@ trap call\", ops);
+      }
+    else
+      {
+        output_asm_insn (\"push\\t{%1}\", ops);
+ output_asm_insn (\"bl\\t__thumb_stack_failure\\t%@ trap call\", ops);
+      }
+  }
+  return \"\";
+  "
+)
+
 (define_insn "*arm_movtas_ze"
   [(set (zero_extract:SI (match_operand:SI 0 "s_register_operand" "+r")
                    (const_int 16)

Reply via email to