Re: [PATCH] aarch64: Add split-stack initial support

Jiong Wang Mon, 08 Aug 2016 04:01:13 -0700

Adhemerval Zanella writes:

>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
>> index e56398a..2cf239f 100644
>> --- a/gcc/config/aarch64/aarch64.c
>> +++ b/gcc/config/aarch64/aarch64.c
>> @@ -3227,6 +3227,34 @@ aarch64_expand_prologue (void)
>>        RTX_FRAME_RELATED_P (insn) = 1;
>>      }
>>      }
>> +
>> +  if (flag_split_stack && offset)
>> +    {
>> +      /* Setup the argument pointer (x10) for -fsplit-stack code.  If
>> +     __morestack was called, it will left the arg pointer to the
>> +     old stack in x28.  Otherwise, the argument pointer is the top
>> +     of current frame.  */
>> +      rtx x10 = gen_rtx_REG (Pmode, R10_REGNUM);
>> +      rtx x11 = gen_rtx_REG (Pmode, R11_REGNUM);
>> +      rtx x28 = gen_rtx_REG (Pmode, R28_REGNUM);
>> +      rtx x29 = gen_rtx_REG (Pmode, R29_REGNUM);
>> +      rtx not_more = gen_label_rtx ();
>> +      rtx cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
>> +      rtx jump;
>> +
>> +      emit_move_insn (x11, GEN_INT (hard_fp_offset));
>> +      emit_insn (gen_add3_insn (x10, x29, x11));
>> +      jump = gen_rtx_IF_THEN_ELSE (VOIDmode,
>> +                               gen_rtx_GEU (VOIDmode, cc_reg,
>> +                                            const0_rtx),
>> +                               gen_rtx_LABEL_REF (VOIDmode, not_more),
>> +                               pc_rtx);
>> +      jump = emit_jump_insn (gen_rtx_SET (pc_rtx, jump));
>> +      JUMP_LABEL (jump) = not_more;
>> +      LABEL_NUSES (not_more) += 1;
>> +      emit_move_insn (x10, x28);
>> +      emit_label (not_more);
>> +    }
>>  }


This part needs a rebase; there have been major changes in the AArch64
prologue code recently.

>>  
>>  /* Return TRUE if we can use a simple_return insn.
>> @@ -3303,6 +3331,7 @@ aarch64_expand_epilogue (bool for_sibcall)
>>        offset = offset - fp_offset;
>>      }
>>  
>> +

Unnecessary new line.

>>    if (offset > 0)
>>      {
>>        unsigned reg1 = cfun->machine->frame.wb_candidate1;
>> @@ -9648,7 +9677,7 @@ aarch64_expand_builtin_va_start (tree valist, rtx 
>> nextarg ATTRIBUTE_UNUSED)
>>    /* Emit code to initialize STACK, which points to the next varargs stack
>>       argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
>>       by named arguments.  STACK is 8-byte aligned.  */
>> -  t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
>> +  t = make_tree (TREE_TYPE (stack), crtl->args.internal_arg_pointer);
>>    if (cum->aapcs_stack_size > 0)
>>      t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * 
>> UNITS_PER_WORD);
>>    t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
>> @@ -14010,6 +14039,196 @@ aarch64_optab_supported_p (int op, machine_mode 
>> mode1, machine_mode,
>>      }
>>  }
>>  
>> +/* -fsplit-stack support.  */
>> +
>> +/* A SYMBOL_REF for __morestack.  */
>> +static GTY(()) rtx morestack_ref;
>> +
>> +/* Emit -fsplit-stack prologue, which goes before the regular function
>> +   prologue.  */
>> +void
>> +aarch64_expand_split_stack_prologue (void)
>> +{
>> +  HOST_WIDE_INT frame_size, args_size;
>> +  rtx_code_label *ok_label = NULL;
>> +  rtx mem, ssvalue, compare, jump, insn, call_fusage;
>> +  rtx reg11, reg30, temp;
>> +  rtx new_cfa, cfi_ops = NULL;
>> +  /* Offset from thread pointer to __private_ss.  */
>> +  int psso = 0x10;
>> +  int ninsn;
>> +
>> +  gcc_assert (flag_split_stack && reload_completed);
>> +
>> +  /* It limits total maximum stack allocation on 2G so its value can be
>> +     materialized with two instruction at most (movn/movk).  It might be
>> +     used by the linker to add some extra space for split calling non split
>> +     stack functions.  */
>> +  frame_size = cfun->machine->frame.frame_size;
>> +  if (frame_size > ((HOST_WIDE_INT) 1 << 31))
>> +    {
>> +      sorry ("Stack frame larger than 2G is not supported for 
>> -fsplit-stack");
>> +      return;
>> +    }
>> +
>> +  if (morestack_ref == NULL_RTX)
>> +    {
>> +      morestack_ref = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
>> +      SYMBOL_REF_FLAGS (morestack_ref) |= (SYMBOL_FLAG_LOCAL
>> +                                       | SYMBOL_FLAG_FUNCTION);
>> +    }
>> +
>> +  /* Load __private_ss from TCB.  */
>> +  ssvalue = gen_rtx_REG (Pmode, R9_REGNUM);
>> +  emit_insn (gen_aarch64_load_tp_hard (ssvalue));
>> +  mem = gen_rtx_MEM (Pmode, plus_constant (Pmode, ssvalue, psso));
>> +  emit_move_insn (ssvalue, mem);
>> +
>> +  temp = gen_rtx_REG (Pmode, R10_REGNUM);
>> +
>> +  /* Always emit two insns to calculate the requested stack, so the linker
>> +     can edit them when adjusting size for calling non-split-stack code.  */
>> +  ninsn = aarch64_internal_mov_immediate (temp, GEN_INT (-frame_size), true,
>> +                                      Pmode);
>> +  gcc_assert (ninsn == 1 || ninsn == 2);
>> +  if (ninsn == 1)
>> +    emit_insn (gen_nop ());

Will this cause trouble for the linker if the following add is scheduled
before the nop?

>> diff --git a/libgcc/config/aarch64/morestack.S 
>> b/libgcc/config/aarch64/morestack.S
>> new file mode 100644
>> ...
>> +    # Set __private_ss stack guard for the new stack.
>> +    ldr     x9, [x28, STACKFRAME_BASE + NEWSTACK_SAVE]
>> +    add     x0, x0, BACKOFF
>> +    sub     x0, x0, 16
>> +    sub     x0, x0, x9
>> +.LEHB0:
>> +    mrs     x1, tpidr_el0
>> +    str     x0, [x1, TCB_PRIVATE_SS]
>> +
>> +    # void __morestack_unblock_signals (void)
>> +    bl      __morestack_unblock_signals
>> +
>> +    # Set up for a call to the target function.
>> +    #ldp    x29, x30, [x28, STACKFRAME_BASE]
>> +    ldr     x30, [x28, STACKFRAME_BASE + 8]
>> +    ldp     x0, x1, [x28, STACKFRAME_BASE + 16]
>> +    ldp     x2, x3, [x28, STACKFRAME_BASE + 32]
>> +    ldp     x4, x5, [x28, STACKFRAME_BASE + 48]
>> +    ldp     x6, x7, [x28, STACKFRAME_BASE + 64]
>> +    add     x9, x30, 8
>> +    cmp     x30, x9

Can you explain why we need this "cmp" before jumping to the target
function?

>> +    blr     x9
>> +
>> +    stp     x0, x1, [x28, STACKFRAME_BASE + 16]
>> +    stp     x2, x3, [x28, STACKFRAME_BASE + 32]
>> +    stp     x4, x5, [x28, STACKFRAME_BASE + 48]
>> +    stp     x6, x7, [x28, STACKFRAME_BASE + 64]
>> +
>> +    bl      __morestack_block_signals
>> +
>> +    # void *__generic_releasestack (size_t *pavailable)
>> +    add     x0, x28, STACKFRAME_BASE + NEWSTACK_SAVE
>> +    bl      __generic_releasestack
>> +
>> +    # Reset __private_ss stack guard to value for old stack
>> +    ldr     x9, [x28, STACKFRAME_BASE + NEWSTACK_SAVE]
>> +    add     x0, x0, BACKOFF
>> +    sub     x0, x0, x9
>> +
>> +    # Update TCB split stack field
>> +.LEHE0:
>> +    mrs     x1, tpidr_el0
>> +    str     x0, [x1, TCB_PRIVATE_SS]
>> +
>> +    bl __morestack_unblock_signals
>> +
>> +    # Use old stack again.
>> +    sub     sp, x28, 16
>> +
>> +    ldp     x0, x1, [x28, STACKFRAME_BASE + 16]
>> +    ldp     x2, x3, [x28, STACKFRAME_BASE + 32]
>> +    ldp     x4, x5, [x28, STACKFRAME_BASE + 48]
>> +    ldp     x6, x7, [x28, STACKFRAME_BASE + 64]
>> +    ldp     x29, x30, [x28, STACKFRAME_BASE]
>> +    ldr     x28, [x28, STACKFRAME_BASE + 96]
>> +
>> +    .cfi_remember_state
>> +    .cfi_restore 30
>> +    .cfi_restore 29
>> +    .cfi_def_cfa 31, 0
>> +
>> +    ret
>> +
>> +# This is the cleanup code called by the stack unwinder when
>> +# unwinding through code between .LEHB0 and .LEHE0 above.
>> +cleanup:
>> +    .cfi_restore_state
>> +    str     x0, [x28, STACKFRAME_BASE]
>> +    # size_t __generic_findstack (void *stack)
>> +    mov     x0, x28
>> +    bl      __generic_findstack
>> +    sub     x0, x28, x0
>> +    add     x0, x0, BACKOFF
>> +    # Restore tcbhead_t.__private_ss
>> +    mrs     x1, tpidr_el0
>> +    str     x0, [x1, TCB_PRIVATE_SS]
>> +    ldr     x0, [x28, STACKFRAME_BASE]
>> +    b       _Unwind_Resume
>> +        .cfi_endproc
>> +END(__morestack)
>> +
>> +    .section .gcc_except_table,"a",@progbits
>> +    .align 4
>> +.LLSDA1:
>> +    # @LPStart format (omit)
>> +        .byte   0xff
>> +    # @TType format (omit)
>> +        .byte   0xff
>> +    # Call-site format (uleb128)
>> +        .byte   0x1
>> +    # Call-site table length
>> +        .uleb128 .LLSDACSE1-.LLSDACSB1
>> +.LLSDACSB1:
>> +    # region 0 start
>> +        .uleb128 .LEHB0-.LFB1
>> +    # length
>> +        .uleb128 .LEHE0-.LEHB0
>> +    # landing pad
>> +        .uleb128 cleanup-.LFB1
>> +    # no action (ie a cleanup)
>> +        .uleb128 0
>> +.LLSDACSE1:
>> +
>> +
>> +    .global __gcc_personality_v0
>> +#ifdef __PIC__
>> +    # Build a position independent reference to the personality function.
>> +    .hidden DW.ref.__gcc_personality_v0
>> +    .weak   DW.ref.__gcc_personality_v0
>> +    .section 
>> .data.DW.ref.__gcc_personality_v0,"awG",@progbits,DW.ref.__gcc_personality_v0,comdat
>> +    .type   DW.ref.__gcc_personality_v0, @object
>> +    .align 3
>> +DW.ref.__gcc_personality_v0:
>> +    .size   DW.ref.__gcc_personality_v0, 8
>> +    .quad   __gcc_personality_v0
>> +#endif
>> +
>> +    .section .note.GNU-stack,"",@progbits
>> +    .section .note.GNU-split-stack,"",@progbits
>> +    .section .note.GNU-no-split-stack,"",@progbits
>> diff --git a/libgcc/config/aarch64/t-stack-aarch64 
>> b/libgcc/config/aarch64/t-stack-aarch64
>> new file mode 100644
>> index 0000000..4babb4e
>> --- /dev/null
>> +++ b/libgcc/config/aarch64/t-stack-aarch64
>> @@ -0,0 +1,3 @@
>> +# Makefile fragment to support -fsplit-stack for aarch64.
>> +LIB2ADD_ST += $(srcdir)/config/aarch64/morestack.S \
>> +          $(srcdir)/config/aarch64/morestack-c.c
>> diff --git a/libgcc/generic-morestack.c b/libgcc/generic-morestack.c
>> index b8eec4e..fe7092b 100644
>> --- a/libgcc/generic-morestack.c
>> +++ b/libgcc/generic-morestack.c
>> @@ -943,6 +943,7 @@ __splitstack_find (void *segment_arg, void *sp, size_t 
>> *len,
>>        nsp -= 2 * 160;
>>  #elif defined __s390__
>>        nsp -= 2 * 96;
>> +#elif defined __aarch64__
>>  #else
>>  #error "unrecognized target"
>>  #endif
>> 

-- 
Regards,
Jiong

Re: [PATCH] aarch64: Add split-stack initial support

Reply via email to