On 07/11/2016 16:59, Adhemerval Zanella wrote:
> On 14/10/2016 15:59, Wilco Dijkstra wrote:

> There is no limit afaik on gold split stack allocation handling,
> and I think one could be added for each backend (in the method
> override require to implement it).
> 
> In fact it is not really required to tie the nop generation with the
> instruction generated by 'aarch64_internal_mov_immediate', it is
> just a matter to simplify linker code.  

If there is no easy limit and you'll still require a nop, I think it is best 
then
to emit mov N+movk #0. Then the scheduler won't be able to reorder
them with the add/sub.

>> Is there any need to detect underflow of x10 or is there a guarantee that 
>> stacks are
>> never allocated in the low 2GB (given the maximum adjustment is 2GB)? It's 
>> safe
>> to do a signed comparison.
> 
> I do not think so, at least none of current backend that implements
> split stack do so.

OK, well a signed comparison like in your new version works for underflow.

Now to the patch:


@@ -3316,6 +3339,28 @@ aarch64_expand_prologue (void)
   aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
                             callee_adjust != 0 || frame_pointer_needed);
   aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
+
+  if (split_stack_arg_pointer_used_p ())
+    {
+      /* Setup the argument pointer (x10) for -fsplit-stack code.  If
+        __morestack was called, it will left the arg pointer to the
+        old stack in x28.  Otherwise, the argument pointer is the top
+        of current frame.  */
+      rtx x11 = gen_rtx_REG (Pmode, R11_REGNUM);
+      rtx x28 = gen_rtx_REG (Pmode, R28_REGNUM);
+      rtx cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
+
+      rtx not_more = gen_label_rtx ();
+
+      rtx cmp = gen_rtx_fmt_ee (LT, VOIDmode, cc_reg, const0_rtx);
+      rtx jump = emit_jump_insn (gen_condjump (cmp, cc_reg, not_more));
+      JUMP_LABEL (jump) = not_more;
+      LABEL_NUSES (not_more) += 1;
+
+      emit_move_insn (x11, x28);
+
+      emit_label (not_more);
+    }

If you pass the old sp in x11 when called from __morestack you can remove
the above thunk completely.

+  /* It limits total maximum stack allocation on 2G so its value can be
+     materialized using two instructions at most (movn/movk).  It might be
+     used by the linker to add some extra space for split calling non split
+     stack functions.  */
+  allocate = cfun->machine->frame.frame_size;
+  if (allocate > ((HOST_WIDE_INT) 1 << 31))
+    {
+      sorry ("Stack frame larger than 2G is not supported for -fsplit-stack");
+      return;
+    }

Note a 2-instruction mov/movk can generate any immediate up to 4GB and if
we need even large sizes, we could round up to a multiple of 64KB so that 2
instructions are enough for a 48-bit stack size...

+  int ninsn = aarch64_internal_mov_immediate (reg10, GEN_INT (-allocate),
+                                             true, Pmode);
+  gcc_assert (ninsn == 1 || ninsn == 2);
+  if (ninsn == 1)
+    emit_insn (gen_nop ());

To avoid any issues with the nop being scheduled, it's best to emit an explicit 
movk
here (0xffff0000 if allocate > 0, or 0 if zero) using gen_insv_immdi.

+void
+aarch64_split_stack_space_check (rtx size, rtx label)

Isn't very similar code used in aarch64_expand_split_stack_prologue? Any 
possibility
to share/reuse?

+static void
+aarch64_live_on_entry (bitmap regs)
+{
+  if (flag_split_stack)
+    bitmap_set_bit (regs, R11_REGNUM);
+}

I'm wondering whether you need extra code in aarch64_can_eliminate to deal
with the argument pointer? Also do we need to define a fixed register, or will 
GCC
automatically allocate it to a callee-save if necessary?

+++ b/libgcc/config/aarch64/morestack.S

+/* Offset from __morestack frame where the arguments size saved and
+   passed to __generic_morestack.  */
+#define ARGS_SIZE_SAVE         80

This define is unused.

+# The normal function prologue follows here, with a small addition at the
+# end to set up the argument pointer if required (the prolog):
+#
+#       [...]                  # default function prologue
+#      b.lt   function:
+#      mov    x11, x28

We don't need this if we pass sp in x11 when calling back to the original 
function.

+       stp     x8, x10, [sp, 80]
+       stp     x11, x12, [sp, 96]

No need to save x11 - it just contains original sp.

+       str     x28, [sp, 112]
+       .cfi_offset 28, -112
+
+       # Setup on x28 the function initial frame pointer.
+       add     x28, sp, MORESTACK_FRAMESIZE

Why save x28 when x28 = x29 + MORESTACK_FRAMESIZE? You can use x29
throughout the code as it is preserved by calls.

+       # Start using new stack
+       str     x29, [x0, -16]!

This has no use.

+       mov     sp, x0
+
+       # Set __private_ss stack guard for the new stack.
+       ldr     x9, [x28, STACKFRAME_BASE + NEWSTACK_SAVE]
+       add     x0, x0, BACKOFF

+       sub     x0, x0, 16

Neither has this.

+       ldp     x11, x12, [x28, STACKFRAME_BASE + 96]
+       # Indicate __morestack was called.
+       cmp     x12, 0
+       blr     x12

There is no need to restore x11, all we need to do is restore x12 and branch:

ldr x12, [x28, STACKFRAME_BASE + ...]
add x11, x29, MORESTACK_FRAMESIZE
blx x12

+       # Use old stack again.
+       #sub    sp, x28, 16
+       mov     sp, x28

Use:

add sp, x29, MORESTACK_FRAMESIZE

+       ldp     x0, x1, [x28, STACKFRAME_BASE + 16]
+       ldp     x2, x3, [x28, STACKFRAME_BASE + 32]
+       ldp     x4, x5, [x28, STACKFRAME_BASE + 48]
+       ldp     x6, x7, [x28, STACKFRAME_BASE + 64]
+       ldp     x29, x30, [x28, STACKFRAME_BASE]
+       ldr     x28, [x28, STACKFRAME_BASE + 112]
+
+       .cfi_remember_state
+       .cfi_restore 30
+       .cfi_restore 29
+       .cfi_restore 28
+       .cfi_def_cfa 31, 0

This needs to restore x29/x30 last to get correct unwinding:

ldp     x29, x30, [sp], MORESTACK_FRAMESIZE
        .cfi_remember_state
        .cfi_restore 30
        .cfi_restore 29
        .cfi_def_cfa 31, 0

Wilco

Reply via email to