Adhemerval Zanella writes:
>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
>> index e56398a..2cf239f 100644
>> --- a/gcc/config/aarch64/aarch64.c
>> +++ b/gcc/config/aarch64/aarch64.c
>> @@ -3227,6 +3227,34 @@ aarch64_expand_prologue (void)
>> RTX_FRAME_RELATED_P (insn) = 1;
>> }
>> }
>> +
>> + if (flag_split_stack && offset)
>> + {
>> + /* Setup the argument pointer (x10) for -fsplit-stack code. If
>> + __morestack was called, it will left the arg pointer to the
>> + old stack in x28. Otherwise, the argument pointer is the top
>> + of current frame. */
>> + rtx x10 = gen_rtx_REG (Pmode, R10_REGNUM);
>> + rtx x11 = gen_rtx_REG (Pmode, R11_REGNUM);
>> + rtx x28 = gen_rtx_REG (Pmode, R28_REGNUM);
>> + rtx x29 = gen_rtx_REG (Pmode, R29_REGNUM);
>> + rtx not_more = gen_label_rtx ();
>> + rtx cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
>> + rtx jump;
>> +
>> + emit_move_insn (x11, GEN_INT (hard_fp_offset));
>> + emit_insn (gen_add3_insn (x10, x29, x11));
>> + jump = gen_rtx_IF_THEN_ELSE (VOIDmode,
>> + gen_rtx_GEU (VOIDmode, cc_reg,
>> + const0_rtx),
>> + gen_rtx_LABEL_REF (VOIDmode, not_more),
>> + pc_rtx);
>> + jump = emit_jump_insn (gen_rtx_SET (pc_rtx, jump));
>> + JUMP_LABEL (jump) = not_more;
>> + LABEL_NUSES (not_more) += 1;
>> + emit_move_insn (x10, x28);
>> + emit_label (not_more);
>> + }
>> }
This part needs a rebase; there have been major changes to the AArch64
prologue code recently.
>>
>> /* Return TRUE if we can use a simple_return insn.
>> @@ -3303,6 +3331,7 @@ aarch64_expand_epilogue (bool for_sibcall)
>> offset = offset - fp_offset;
>> }
>>
>> +
Unnecessary new line.
>> if (offset > 0)
>> {
>> unsigned reg1 = cfun->machine->frame.wb_candidate1;
>> @@ -9648,7 +9677,7 @@ aarch64_expand_builtin_va_start (tree valist, rtx
>> nextarg ATTRIBUTE_UNUSED)
>> /* Emit code to initialize STACK, which points to the next varargs stack
>> argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
>> by named arguments. STACK is 8-byte aligned. */
>> - t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
>> + t = make_tree (TREE_TYPE (stack), crtl->args.internal_arg_pointer);
>> if (cum->aapcs_stack_size > 0)
>> t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size *
>> UNITS_PER_WORD);
>> t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
>> @@ -14010,6 +14039,196 @@ aarch64_optab_supported_p (int op, machine_mode
>> mode1, machine_mode,
>> }
>> }
>>
>> +/* -fsplit-stack support. */
>> +
>> +/* A SYMBOL_REF for __morestack. */
>> +static GTY(()) rtx morestack_ref;
>> +
>> +/* Emit -fsplit-stack prologue, which goes before the regular function
>> + prologue. */
>> +void
>> +aarch64_expand_split_stack_prologue (void)
>> +{
>> + HOST_WIDE_INT frame_size, args_size;
>> + rtx_code_label *ok_label = NULL;
>> + rtx mem, ssvalue, compare, jump, insn, call_fusage;
>> + rtx reg11, reg30, temp;
>> + rtx new_cfa, cfi_ops = NULL;
>> + /* Offset from thread pointer to __private_ss. */
>> + int psso = 0x10;
>> + int ninsn;
>> +
>> + gcc_assert (flag_split_stack && reload_completed);
>> +
>> + /* It limits total maximum stack allocation on 2G so its value can be
>> + materialized with two instruction at most (movn/movk). It might be
>> + used by the linker to add some extra space for split calling non split
>> + stack functions. */
>> + frame_size = cfun->machine->frame.frame_size;
>> + if (frame_size > ((HOST_WIDE_INT) 1 << 31))
>> + {
>> + sorry ("Stack frame larger than 2G is not supported for
>> -fsplit-stack");
>> + return;
>> + }
>> +
>> + if (morestack_ref == NULL_RTX)
>> + {
>> + morestack_ref = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
>> + SYMBOL_REF_FLAGS (morestack_ref) |= (SYMBOL_FLAG_LOCAL
>> + | SYMBOL_FLAG_FUNCTION);
>> + }
>> +
>> + /* Load __private_ss from TCB. */
>> + ssvalue = gen_rtx_REG (Pmode, R9_REGNUM);
>> + emit_insn (gen_aarch64_load_tp_hard (ssvalue));
>> + mem = gen_rtx_MEM (Pmode, plus_constant (Pmode, ssvalue, psso));
>> + emit_move_insn (ssvalue, mem);
>> +
>> + temp = gen_rtx_REG (Pmode, R10_REGNUM);
>> +
>> + /* Always emit two insns to calculate the requested stack, so the linker
>> + can edit them when adjusting size for calling non-split-stack code. */
>> + ninsn = aarch64_internal_mov_immediate (temp, GEN_INT (-frame_size), true,
>> + Pmode);
>> + gcc_assert (ninsn == 1 || ninsn == 2);
>> + if (ninsn == 1)
>> + emit_insn (gen_nop ());
Will this cause trouble for the linker if the following add is scheduled
before the nop?
>> diff --git a/libgcc/config/aarch64/morestack.S
>> b/libgcc/config/aarch64/morestack.S
>> new file mode 100644
>> ...
>> + # Set __private_ss stack guard for the new stack.
>> + ldr x9, [x28, STACKFRAME_BASE + NEWSTACK_SAVE]
>> + add x0, x0, BACKOFF
>> + sub x0, x0, 16
>> + sub x0, x0, x9
>> +.LEHB0:
>> + mrs x1, tpidr_el0
>> + str x0, [x1, TCB_PRIVATE_SS]
>> +
>> + # void __morestack_unblock_signals (void)
>> + bl __morestack_unblock_signals
>> +
>> + # Set up for a call to the target function.
>> + #ldp x29, x30, [x28, STACKFRAME_BASE]
>> + ldr x30, [x28, STACKFRAME_BASE + 8]
>> + ldp x0, x1, [x28, STACKFRAME_BASE + 16]
>> + ldp x2, x3, [x28, STACKFRAME_BASE + 32]
>> + ldp x4, x5, [x28, STACKFRAME_BASE + 48]
>> + ldp x6, x7, [x28, STACKFRAME_BASE + 64]
>> + add x9, x30, 8
>> + cmp x30, x9
Can you explain why we need this "cmp" before jumping to the target
function?
>> + blr x9
>> +
>> + stp x0, x1, [x28, STACKFRAME_BASE + 16]
>> + stp x2, x3, [x28, STACKFRAME_BASE + 32]
>> + stp x4, x5, [x28, STACKFRAME_BASE + 48]
>> + stp x6, x7, [x28, STACKFRAME_BASE + 64]
>> +
>> + bl __morestack_block_signals
>> +
>> + # void *__generic_releasestack (size_t *pavailable)
>> + add x0, x28, STACKFRAME_BASE + NEWSTACK_SAVE
>> + bl __generic_releasestack
>> +
>> + # Reset __private_ss stack guard to value for old stack
>> + ldr x9, [x28, STACKFRAME_BASE + NEWSTACK_SAVE]
>> + add x0, x0, BACKOFF
>> + sub x0, x0, x9
>> +
>> + # Update TCB split stack field
>> +.LEHE0:
>> + mrs x1, tpidr_el0
>> + str x0, [x1, TCB_PRIVATE_SS]
>> +
>> + bl __morestack_unblock_signals
>> +
>> + # Use old stack again.
>> + sub sp, x28, 16
>> +
>> + ldp x0, x1, [x28, STACKFRAME_BASE + 16]
>> + ldp x2, x3, [x28, STACKFRAME_BASE + 32]
>> + ldp x4, x5, [x28, STACKFRAME_BASE + 48]
>> + ldp x6, x7, [x28, STACKFRAME_BASE + 64]
>> + ldp x29, x30, [x28, STACKFRAME_BASE]
>> + ldr x28, [x28, STACKFRAME_BASE + 96]
>> +
>> + .cfi_remember_state
>> + .cfi_restore 30
>> + .cfi_restore 29
>> + .cfi_def_cfa 31, 0
>> +
>> + ret
>> +
>> +# This is the cleanup code called by the stack unwinder when
>> +# unwinding through code between .LEHB0 and .LEHE0 above.
>> +cleanup:
>> + .cfi_restore_state
>> + str x0, [x28, STACKFRAME_BASE]
>> + # size_t __generic_findstack (void *stack)
>> + mov x0, x28
>> + bl __generic_findstack
>> + sub x0, x28, x0
>> + add x0, x0, BACKOFF
>> + # Restore tcbhead_t.__private_ss
>> + mrs x1, tpidr_el0
>> + str x0, [x1, TCB_PRIVATE_SS]
>> + ldr x0, [x28, STACKFRAME_BASE]
>> + b _Unwind_Resume
>> + .cfi_endproc
>> +END(__morestack)
>> +
>> + .section .gcc_except_table,"a",@progbits
>> + .align 4
>> +.LLSDA1:
>> + # @LPStart format (omit)
>> + .byte 0xff
>> + # @TType format (omit)
>> + .byte 0xff
>> + # Call-site format (uleb128)
>> + .byte 0x1
>> + # Call-site table length
>> + .uleb128 .LLSDACSE1-.LLSDACSB1
>> +.LLSDACSB1:
>> + # region 0 start
>> + .uleb128 .LEHB0-.LFB1
>> + # length
>> + .uleb128 .LEHE0-.LEHB0
>> + # landing pad
>> + .uleb128 cleanup-.LFB1
>> + # no action (ie a cleanup)
>> + .uleb128 0
>> +.LLSDACSE1:
>> +
>> +
>> + .global __gcc_personality_v0
>> +#ifdef __PIC__
>> + # Build a position independent reference to the personality function.
>> + .hidden DW.ref.__gcc_personality_v0
>> + .weak DW.ref.__gcc_personality_v0
>> + .section
>> .data.DW.ref.__gcc_personality_v0,"awG",@progbits,DW.ref.__gcc_personality_v0,comdat
>> + .type DW.ref.__gcc_personality_v0, @object
>> + .align 3
>> +DW.ref.__gcc_personality_v0:
>> + .size DW.ref.__gcc_personality_v0, 8
>> + .quad __gcc_personality_v0
>> +#endif
>> +
>> + .section .note.GNU-stack,"",@progbits
>> + .section .note.GNU-split-stack,"",@progbits
>> + .section .note.GNU-no-split-stack,"",@progbits
>> diff --git a/libgcc/config/aarch64/t-stack-aarch64
>> b/libgcc/config/aarch64/t-stack-aarch64
>> new file mode 100644
>> index 0000000..4babb4e
>> --- /dev/null
>> +++ b/libgcc/config/aarch64/t-stack-aarch64
>> @@ -0,0 +1,3 @@
>> +# Makefile fragment to support -fsplit-stack for aarch64.
>> +LIB2ADD_ST += $(srcdir)/config/aarch64/morestack.S \
>> + $(srcdir)/config/aarch64/morestack-c.c
>> diff --git a/libgcc/generic-morestack.c b/libgcc/generic-morestack.c
>> index b8eec4e..fe7092b 100644
>> --- a/libgcc/generic-morestack.c
>> +++ b/libgcc/generic-morestack.c
>> @@ -943,6 +943,7 @@ __splitstack_find (void *segment_arg, void *sp, size_t
>> *len,
>> nsp -= 2 * 160;
>> #elif defined __s390__
>> nsp -= 2 * 96;
>> +#elif defined __aarch64__
>> #else
>> #error "unrecognized target"
>> #endif
>>
--
Regards,
Jiong