This version now generates RTL with the appropriate stack restore and leave patterns. The code that calculates the number of vector elements has also been cleaned up slightly for clarity.
Tests are good when rebased onto gcc-7_1_0-release as HEAD currently fails to bootstrap. Signed-off-by: Daniel Santos <daniel.san...@pobox.com> --- gcc/config/i386/i386.c | 287 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 278 insertions(+), 9 deletions(-) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index f2772b2d10e..e43dc819f9a 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -14148,6 +14148,78 @@ ix86_elim_entry_set_got (rtx reg) } } +static rtx +gen_frame_set (rtx reg, rtx frame_reg, int offset, bool store) +{ + rtx addr, mem; + + if (offset) + addr = gen_rtx_PLUS (Pmode, frame_reg, GEN_INT (offset)); + mem = gen_frame_mem (GET_MODE (reg), offset ? addr : frame_reg); + return gen_rtx_SET (store ? mem : reg, store ? reg : mem); +} + +static inline rtx +gen_frame_load (rtx reg, rtx frame_reg, int offset) +{ + return gen_frame_set (reg, frame_reg, offset, false); +} + +static inline rtx +gen_frame_store (rtx reg, rtx frame_reg, int offset) +{ + return gen_frame_set (reg, frame_reg, offset, true); +} + +static void +ix86_emit_outlined_ms2sysv_save (const struct ix86_frame &frame) +{ + struct machine_function *m = cfun->machine; + const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS + + m->call_ms2sysv_extra_regs; + rtvec v = rtvec_alloc (ncregs + 1); + unsigned int align, i, vi = 0; + rtx_insn *insn; + rtx sym, addr; + rtx rax = gen_rtx_REG (word_mode, AX_REG); + const struct xlogue_layout &xlogue = xlogue_layout::get_instance (); + HOST_WIDE_INT rax_offset = xlogue.get_stub_ptr_offset () + m->fs.sp_offset; + HOST_WIDE_INT stack_alloc_size = frame.stack_pointer_offset - m->fs.sp_offset; + HOST_WIDE_INT stack_align_off_in = xlogue.get_stack_align_off_in (); + + /* Verify that the incoming stack 16-byte alignment offset matches the + layout we're using. */ + gcc_assert (stack_align_off_in == (m->fs.sp_offset & UNITS_PER_WORD)); + + /* Get the stub symbol. */ + sym = xlogue.get_stub_rtx (frame_pointer_needed ? 
XLOGUE_STUB_SAVE_HFP + : XLOGUE_STUB_SAVE); + RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym); + + /* Setup RAX as the stub's base pointer. */ + align = GET_MODE_ALIGNMENT (V4SFmode); + addr = choose_baseaddr (rax_offset, &align); + gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode)); + insn = emit_insn (gen_rtx_SET (rax, addr)); + + gcc_assert (stack_alloc_size >= xlogue.get_stack_space_used ()); + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (-stack_alloc_size), -1, + m->fs.cfa_reg == stack_pointer_rtx); + for (i = 0; i < ncregs; ++i) + { + const xlogue_layout::reginfo &r = xlogue.get_reginfo (i); + rtx reg = gen_rtx_REG ((SSE_REGNO_P (r.regno) ? V4SFmode : word_mode), + r.regno); + RTVEC_ELT (v, vi++) = gen_frame_store (reg, rax, -r.offset);; + } + + gcc_assert (vi == (unsigned)GET_NUM_ELEM (v)); + + insn = emit_insn (gen_rtx_PARALLEL (VOIDmode, v)); + RTX_FRAME_RELATED_P (insn) = true; +} + /* Expand the prologue into a bunch of separate insns. */ void @@ -14395,7 +14467,7 @@ ix86_expand_prologue (void) performing the actual alignment. Otherwise we cannot guarantee that there's enough storage above the realignment point. 
*/ allocate = frame.stack_realign_allocate_offset - m->fs.sp_offset; - if (allocate) + if (allocate && !m->call_ms2sysv) pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, GEN_INT (-allocate), -1, false); @@ -14403,7 +14475,6 @@ ix86_expand_prologue (void) insn = emit_insn (ix86_gen_andsp (stack_pointer_rtx, stack_pointer_rtx, GEN_INT (-align_bytes))); - /* For the purposes of register save area addressing, the stack pointer can no longer be used to access anything in the frame below m->fs.sp_realigned_offset and the frame pointer cannot be @@ -14420,6 +14491,9 @@ ix86_expand_prologue (void) m->fs.sp_valid = false; } + if (m->call_ms2sysv) + ix86_emit_outlined_ms2sysv_save (frame); + allocate = frame.stack_pointer_offset - m->fs.sp_offset; if (flag_stack_usage_info) @@ -14740,17 +14814,19 @@ ix86_emit_restore_regs_using_pop (void) unsigned int regno; for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) - if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false)) + if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, false, true)) ix86_emit_restore_reg_using_pop (gen_rtx_REG (word_mode, regno)); } -/* Emit code and notes for the LEAVE instruction. */ +/* Emit code and notes for the LEAVE instruction. If insn is non-null, + omits the emit and only attaches the notes. 
*/ static void -ix86_emit_leave (void) +ix86_emit_leave (rtx_insn *insn) { struct machine_function *m = cfun->machine; - rtx_insn *insn = emit_insn (ix86_gen_leave ()); + if (!insn) + insn = emit_insn (ix86_gen_leave ()); ix86_add_queued_cfa_restore_notes (insn); @@ -14844,6 +14920,164 @@ ix86_emit_restore_sse_regs_using_mov (HOST_WIDE_INT cfa_offset, } } +static void +ix86_emit_outlined_ms2sysv_restore (const struct ix86_frame &frame, + bool use_call, int style) +{ + struct machine_function *m = cfun->machine; + const unsigned ncregs = NUM_X86_64_MS_CLOBBERED_REGS + + m->call_ms2sysv_extra_regs; + rtvec v; + unsigned int elems_needed, align, i, vi = 0; + rtx_insn *insn; + rtx sym, tmp; + rtx rsi = gen_rtx_REG (word_mode, SI_REG); + rtx r10 = NULL_RTX; + const struct xlogue_layout &xlogue = xlogue_layout::get_instance (); + HOST_WIDE_INT stub_ptr_offset = xlogue.get_stub_ptr_offset (); + HOST_WIDE_INT rsi_offset = frame.stack_realign_offset + stub_ptr_offset; + rtx rsi_frame_load = NULL_RTX; + HOST_WIDE_INT rsi_restore_offset = (HOST_WIDE_INT)-1; + enum xlogue_stub stub; + + gcc_assert (!m->fs.fp_valid || frame_pointer_needed); + + /* If using a realigned stack, we should never start with padding. */ + gcc_assert (!stack_realign_fp || !xlogue.get_stack_align_off_in ()); + + /* Setup RSI as the stub's base pointer. */ + align = GET_MODE_ALIGNMENT (V4SFmode); + tmp = choose_baseaddr (rsi_offset, &align); + gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode)); + emit_insn (gen_rtx_SET (rsi, tmp)); + + /* Get a symbol for the stub. */ + if (frame_pointer_needed) + stub = use_call ? XLOGUE_STUB_RESTORE_HFP + : XLOGUE_STUB_RESTORE_HFP_TAIL; + else + stub = use_call ? XLOGUE_STUB_RESTORE + : XLOGUE_STUB_RESTORE_TAIL; + sym = xlogue.get_stub_rtx (stub); + + elems_needed = ncregs; + if (use_call) + elems_needed += 1; + else + elems_needed += frame_pointer_needed ? 
5 : 3; + v = rtvec_alloc (elems_needed); + + /* We call the epilogue stub when we need to pop incoming args or we are + doing a sibling call as the tail. Otherwise, we will emit a jmp to the + epilogue stub and it is the tail-call. */ + if (use_call) + RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym); + else + { + RTVEC_ELT (v, vi++) = ret_rtx; + RTVEC_ELT (v, vi++) = gen_rtx_USE (VOIDmode, sym); + if (frame_pointer_needed) + { + rtx rbp = gen_rtx_REG (DImode, BP_REG); + gcc_assert (m->fs.fp_valid); + gcc_assert (m->fs.cfa_reg == hard_frame_pointer_rtx); + + tmp = gen_rtx_PLUS (DImode, rbp, GEN_INT (8)); + RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, tmp); + RTVEC_ELT (v, vi++) = gen_rtx_SET (rbp, gen_rtx_MEM (DImode, rbp)); + tmp = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode)); + RTVEC_ELT (v, vi++) = gen_rtx_CLOBBER (VOIDmode, tmp); + } + else + { + /* If no hard frame pointer, we set R10 to the SP restore value. */ + gcc_assert (!m->fs.fp_valid); + gcc_assert (m->fs.cfa_reg == stack_pointer_rtx); + gcc_assert (m->fs.sp_valid); + + r10 = gen_rtx_REG (DImode, R10_REG); + tmp = gen_rtx_PLUS (Pmode, rsi, GEN_INT (stub_ptr_offset)); + emit_insn (gen_rtx_SET (r10, tmp)); + + RTVEC_ELT (v, vi++) = gen_rtx_SET (stack_pointer_rtx, r10); + } + } + + /* Generate frame load insns and restore notes. */ + for (i = 0; i < ncregs; ++i) + { + const xlogue_layout::reginfo &r = xlogue.get_reginfo (i); + enum machine_mode mode = SSE_REGNO_P (r.regno) ? V4SFmode : word_mode; + rtx reg, frame_load; + + reg = gen_rtx_REG (mode, r.regno); + frame_load = gen_frame_load (reg, rsi, r.offset); + + /* Save RSI frame load insn & note to add last. */ + if (r.regno == SI_REG) + { + gcc_assert (!rsi_frame_load); + rsi_frame_load = frame_load; + rsi_restore_offset = r.offset; + } + else + { + RTVEC_ELT (v, vi++) = frame_load; + ix86_add_cfa_restore_note (NULL, reg, r.offset); + } + } + + /* Add RSI frame load & restore note at the end. 
*/ + gcc_assert (rsi_frame_load); + gcc_assert (rsi_restore_offset != (HOST_WIDE_INT)-1); + RTVEC_ELT (v, vi++) = rsi_frame_load; + ix86_add_cfa_restore_note (NULL, gen_rtx_REG (DImode, SI_REG), + rsi_restore_offset); + + /* Finally, for tail-call w/o a hard frame pointer, set SP to R10. */ + if (!use_call && !frame_pointer_needed) + { + gcc_assert (m->fs.sp_valid); + gcc_assert (!m->fs.sp_realigned); + + /* At this point, R10 should point to frame.stack_realign_offset. */ + if (m->fs.cfa_reg == stack_pointer_rtx) + m->fs.cfa_offset += m->fs.sp_offset - frame.stack_realign_offset; + m->fs.sp_offset = frame.stack_realign_offset; + } + + gcc_assert (vi == (unsigned int)GET_NUM_ELEM (v)); + tmp = gen_rtx_PARALLEL (VOIDmode, v); + if (use_call) + insn = emit_insn (tmp); + else + { + insn = emit_jump_insn (tmp); + JUMP_LABEL (insn) = ret_rtx; + + if (frame_pointer_needed) + ix86_emit_leave (insn); + else + { + /* Need CFA adjust note. */ + tmp = gen_rtx_SET (stack_pointer_rtx, r10); + add_reg_note (insn, REG_CFA_ADJUST_CFA, tmp); + } + } + + RTX_FRAME_RELATED_P (insn) = true; + ix86_add_queued_cfa_restore_notes (insn); + + /* If we're not doing a tail-call, we need to adjust the stack. */ + if (use_call && m->fs.sp_valid) + { + HOST_WIDE_INT dealloc = m->fs.sp_offset - frame.stack_realign_offset; + pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, + GEN_INT (dealloc), style, + m->fs.cfa_reg == stack_pointer_rtx); + } +} + /* Restore function stack, frame, and registers. 
*/ void @@ -14854,6 +15088,7 @@ ix86_expand_epilogue (int style) struct ix86_frame frame; bool restore_regs_via_mov; bool using_drap; + bool restore_stub_is_tail = false; ix86_finalize_stack_realign_flags (); ix86_compute_frame_layout (&frame); @@ -14956,7 +15191,37 @@ ix86_expand_epilogue (int style) ix86_emit_restore_sse_regs_using_mov (frame.sse_reg_save_offset, style == 2); - if (restore_regs_via_mov) + if (m->call_ms2sysv) + { + int pop_incoming_args = crtl->args.pops_args && crtl->args.size; + + /* We cannot use a tail-call for the stub if: + 1. We have to pop incoming args, + 2. We have additional int regs to restore, + 3. A sibling call will be the tail-call, or + 4. We are emitting an eh_return_internal epilogue. + + TODO: Item 4 has not yet been tested! + + If any of the above are true, we will call the stub rather than + jump to it. */ + restore_stub_is_tail = !(pop_incoming_args || frame.nregs || style != 1); + ix86_emit_outlined_ms2sysv_restore (frame, !restore_stub_is_tail, style); + } + + /* If using out-of-line stub that is a tail-call, then...*/ + if (m->call_ms2sysv && restore_stub_is_tail) + { + /* TODO: paranoid tests. 
(remove eventually) */ + gcc_assert (m->fs.sp_valid); + gcc_assert (!m->fs.sp_realigned); + gcc_assert (!m->fs.fp_valid); + gcc_assert (!m->fs.realigned); + gcc_assert (m->fs.sp_offset == UNITS_PER_WORD); + gcc_assert (!crtl->drap_reg); + gcc_assert (!frame.nregs); + } + else if (restore_regs_via_mov) { rtx t; @@ -15087,7 +15352,7 @@ ix86_expand_epilogue (int style) else if (TARGET_USE_LEAVE || optimize_bb_for_size_p (EXIT_BLOCK_PTR_FOR_FN (cfun)) || !cfun->machine->use_fast_prologue_epilogue) - ix86_emit_leave (); + ix86_emit_leave (NULL); else { pro_epilogue_adjust_stack (stack_pointer_rtx, @@ -15198,7 +15463,7 @@ ix86_expand_epilogue (int style) else emit_jump_insn (gen_simple_return_pop_internal (popc)); } - else + else if (!m->call_ms2sysv || !restore_stub_is_tail) emit_jump_insn (gen_simple_return_internal ()); /* Restore the state back to the state from the prologue, @@ -28927,6 +29192,10 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, clobber_reg (&use, gen_rtx_REG (mode, regno)); } + + /* Set here, but it may get cleared later. */ + if (TARGET_CALL_MS2SYSV_XLOGUES) + cfun->machine->call_ms2sysv = true; } if (vec_len > 1) -- 2.11.0