On 07/11/2016 16:59, Adhemerval Zanella wrote: > On 14/10/2016 15:59, Wilco Dijkstra wrote:
> There is no limit afaik on gold split stack allocation handling, > and I think one could be added for each backend (in the method > override require to implement it). > > In fact it is not really required to tie the nop generation with the > instruction generated by 'aarch64_internal_mov_immediate', it is > just a matter to simplify linker code. If there is no easy limit and you'll still require a nop, I think it is best then to emit mov N+movk #0. Then the scheduler won't be able to reorder them with the add/sub. >> Is there any need to detect underflow of x10 or is there a guarantee that >> stacks are >> never allocated in the low 2GB (given the maximum adjustment is 2GB)? It's >> safe >> to do a signed comparison. > > I do not think so, at least none of current backend that implements > split stack do so. OK, well a signed comparison like in your new version works for underflow. Now to the patch: @@ -3316,6 +3339,28 @@ aarch64_expand_prologue (void) aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM, callee_adjust != 0 || frame_pointer_needed); aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed); + + if (split_stack_arg_pointer_used_p ()) + { + /* Setup the argument pointer (x10) for -fsplit-stack code. If + __morestack was called, it will left the arg pointer to the + old stack in x28. Otherwise, the argument pointer is the top + of current frame. */ + rtx x11 = gen_rtx_REG (Pmode, R11_REGNUM); + rtx x28 = gen_rtx_REG (Pmode, R28_REGNUM); + rtx cc_reg = gen_rtx_REG (CCmode, CC_REGNUM); + + rtx not_more = gen_label_rtx (); + + rtx cmp = gen_rtx_fmt_ee (LT, VOIDmode, cc_reg, const0_rtx); + rtx jump = emit_jump_insn (gen_condjump (cmp, cc_reg, not_more)); + JUMP_LABEL (jump) = not_more; + LABEL_NUSES (not_more) += 1; + + emit_move_insn (x11, x28); + + emit_label (not_more); + } If you pass the old sp in x11 when called from __morestack you can remove the above thunk completely. + /* It limits total maximum stack allocation on 2G so its value can be + materialized using two instructions at most (movn/movk). It might be + used by the linker to add some extra space for split calling non split + stack functions. */ + allocate = cfun->machine->frame.frame_size; + if (allocate > ((HOST_WIDE_INT) 1 << 31)) + { + sorry ("Stack frame larger than 2G is not supported for -fsplit-stack"); + return; + } Note a 2-instruction mov/movk can generate any immediate up to 4GB and if we need even large sizes, we could round up to a multiple of 64KB so that 2 instructions are enough for a 48-bit stack size... + int ninsn = aarch64_internal_mov_immediate (reg10, GEN_INT (-allocate), + true, Pmode); + gcc_assert (ninsn == 1 || ninsn == 2); + if (ninsn == 1) + emit_insn (gen_nop ()); To avoid any issues with the nop being scheduled, it's best to emit an explicit movk here (0xffff0000 if allocate > 0, or 0 if zero) using gen_insv_immdi. +void +aarch64_split_stack_space_check (rtx size, rtx label) Isn't very similar code used in aarch64_expand_split_stack_prologue? Any possibility to share/reuse? +static void +aarch64_live_on_entry (bitmap regs) +{ + if (flag_split_stack) + bitmap_set_bit (regs, R11_REGNUM); +} I'm wondering whether you need extra code in aarch64_can_eliminate to deal with the argument pointer? Also do we need to define a fixed register, or will GCC automatically allocate it to a callee-save if necessary? +++ b/libgcc/config/aarch64/morestack.S +/* Offset from __morestack frame where the arguments size saved and + passed to __generic_morestack. */ +#define ARGS_SIZE_SAVE 80 This define is unused. +# The normal function prologue follows here, with a small addition at the +# end to set up the argument pointer if required (the prolog): +# +# [...] # default function prologue +# b.lt function: +# mov x11, x28 We don't need this if we pass sp in x11 when calling back to the original function. + stp x8, x10, [sp, 80] + stp x11, x12, [sp, 96] No need to save x11 - it just contains original sp. + str x28, [sp, 112] + .cfi_offset 28, -112 + + # Setup on x28 the function initial frame pointer. + add x28, sp, MORESTACK_FRAMESIZE Why save x28 when x28 = x29 + MORESTACK_FRAMESIZE? You can use x29 throughout the code as it is preserved by calls. + # Start using new stack + str x29, [x0, -16]! This has no use. + mov sp, x0 + + # Set __private_ss stack guard for the new stack. + ldr x9, [x28, STACKFRAME_BASE + NEWSTACK_SAVE] + add x0, x0, BACKOFF + sub x0, x0, 16 Neither has this. + ldp x11, x12, [x28, STACKFRAME_BASE + 96] + # Indicate __morestack was called. + cmp x12, 0 + blr x12 There is no need to restore x11, all we need to do is restore x12 and branch: ldr x12, [x28, STACKFRAME_BASE + ...] add x11, x29, MORESTACK_FRAMESIZE blx x12 + # Use old stack again. + #sub sp, x28, 16 + mov sp, x28 Use: add sp, x29, MORESTACK_FRAMESIZE + ldp x0, x1, [x28, STACKFRAME_BASE + 16] + ldp x2, x3, [x28, STACKFRAME_BASE + 32] + ldp x4, x5, [x28, STACKFRAME_BASE + 48] + ldp x6, x7, [x28, STACKFRAME_BASE + 64] + ldp x29, x30, [x28, STACKFRAME_BASE] + ldr x28, [x28, STACKFRAME_BASE + 112] + + .cfi_remember_state + .cfi_restore 30 + .cfi_restore 29 + .cfi_restore 28 + .cfi_def_cfa 31, 0 This needs to restore x29/x30 last to get correct unwinding: ldp x29, x30, [sp], MORESTACK_FRAMESIZE .cfi_remember_state .cfi_restore 30 .cfi_restore 29 .cfi_def_cfa 31, 0 Wilco