On Tue, Oct 31, 2017 at 3:09 AM, Daniel Santos <daniel.san...@pobox.com> wrote: > When we are realigning the stack pointer, making an ms_abi to sysv_abi > call and alllocating 2GiB or more on the stack we end up with an invalid > INSN due to a non-immediate offset. This occurs both with and without > -mcall-ms2sysv-xlogues. Additionally, I've discovered that the stack > allocation with -mcall-ms2sysv-xlogues is incorrect as it ignores stack > checking, stack clash checking and probing. > > This patch fixes these problems by > > 1. No longer allocate stack space in ix86_emit_outlined_ms2sysv_save. > 2. Rearrange where we emit SSE saves or stub call: > a. Before frame allocation when offset from frame to save area is >= 2GiB. > b. After frame allocation when frame is < 2GiB. (Stack allocations > prior to the stub call can't be combined with those afterwards, so > this is better when possible.) > 3. Modify choose_baseaddr to take an optional scratch_regno argument > and never return rtx that cannot be used as an immediate. > > gcc: > config/i386/i386.c (choose_basereg): Use optional scratch > register and add assertion. > (x86_emit_outlined_ms2sysv_save): use scratch register when > needed, and don't allocate stack. > (ix86_expand_prologue): Rearrange where SSE saves/stub call is > emitted, correct wrong allocation with -mcall-ms2sysv-xlogues. > (ix86_emit_outlined_ms2sysv_restore): Fix non-immediate offsets. > > gcc/testsuite: > gcc.target/i386/pr82002-2a.c: Change from xfail to fail. > gcc.target/i386/pr82002-2b.c: Likewise. > > Signed-off-by: Daniel Santos <daniel.san...@pobox.com> > --- > gcc/config/i386/i386.c | 76 > ++++++++++++++++++++++++------ > gcc/testsuite/gcc.target/i386/pr82002-2a.c | 2 - > gcc/testsuite/gcc.target/i386/pr82002-2b.c | 2 - > 3 files changed, 62 insertions(+), 18 deletions(-) > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index 83a07afb3e1..abd8e937e0d 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -11520,7 +11520,8 @@ choose_basereg (HOST_WIDE_INT cfa_offset, rtx > &base_reg, > The valid base registers are taken from CFUN->MACHINE->FS. */ > > static rtx > -choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align) > +choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align, > + int scratch_regno = -1) > { > rtx base_reg = NULL; > HOST_WIDE_INT base_offset = 0; > @@ -11534,6 +11535,28 @@ choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned > int *align) > choose_basereg (cfa_offset, base_reg, base_offset, 0, align); > > gcc_assert (base_reg != NULL); > + > + if (TARGET_64BIT) > + { > + rtx base_offset_rtx = GEN_INT (base_offset); > + > + if (scratch_regno >= 0) > + { > + if (!x86_64_immediate_operand (base_offset_rtx, DImode)) > + { > + rtx tmp; > + rtx scratch_reg = gen_rtx_REG (DImode, scratch_regno); > + > + emit_insn (gen_rtx_SET (scratch_reg, base_offset_rtx)); > + tmp = gen_rtx_PLUS (DImode, scratch_reg, base_reg); > + emit_insn (gen_rtx_SET (scratch_reg, tmp)); > + return scratch_reg; > + } > + } > + else > + gcc_assert (x86_64_immediate_operand (base_offset_rtx, DImode)); > + } > + > return plus_constant (Pmode, base_reg, base_offset); > }
This function doesn't need to return a register, it can return plus RTX. I'd suggest the following implementation: --cut here-- Index: i386.c =================================================================== --- i386.c (revision 254243) +++ i386.c (working copy) @@ -11520,7 +11520,8 @@ The valid base registers are taken from CFUN->MACHINE->FS. */ static rtx -choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align) +choose_baseaddr (HOST_WIDE_INT cfa_offset, unsigned int *align, + unsigned int scratch_regno = INVALID_REGNUM) { rtx base_reg = NULL; HOST_WIDE_INT base_offset = 0; @@ -11534,6 +11535,19 @@ choose_basereg (cfa_offset, base_reg, base_offset, 0, align); gcc_assert (base_reg != NULL); + + rtx base_offset_rtx = GEN_INT (base_offset); + + if (!x86_64_immediate_operand (base_offset_rtx, Pmode)) + { + gcc_assert (scratch_regno != INVALID_REGNUM); + + rtx scratch_reg = gen_rtx_REG (Pmode, scratch_regno); + emit_move_insn (scratch_reg, base_offset_rtx); + + return gen_rtx_PLUS (Pmode, base_reg, scratch_reg); + } + return plus_constant (Pmode, base_reg, base_offset); } --cut here-- You have to always return Pmode, otherwise x32 will complain (you may try with -maddress-mode=short). Also, the above will immediately ICE when too large base_offset is used without the scratch, so one can backtrace to offending function. > @@ -12793,23 +12816,22 @@ ix86_emit_outlined_ms2sysv_save (const struct > ix86_frame &frame) > rtx sym, addr; > rtx rax = gen_rtx_REG (word_mode, AX_REG); > const struct xlogue_layout &xlogue = xlogue_layout::get_instance (); > - HOST_WIDE_INT allocate = frame.stack_pointer_offset - m->fs.sp_offset; > > /* AL should only be live with sysv_abi. */ > gcc_assert (!ix86_eax_live_at_start_p ()); > + gcc_assert (m->fs.sp_offset >= frame.sse_reg_save_offset); > > /* Setup RAX as the stub's base pointer. We use stack_realign_offset > rather > we've actually realigned the stack or not. */ > align = GET_MODE_ALIGNMENT (V4SFmode); > addr = choose_baseaddr (frame.stack_realign_offset > - + xlogue.get_stub_ptr_offset (), &align); > + + xlogue.get_stub_ptr_offset (), &align, AX_REG); > gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode)); > - emit_insn (gen_rtx_SET (rax, addr)); > > - /* Allocate stack if not already done. */ > - if (allocate > 0) > - pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx, > - GEN_INT (-allocate), -1, false); > + /* If choose_baseaddr returned our scratch register, then we don't need to > + do another SET. */ > + if (!REG_P (addr) || REGNO (addr) != AX_REG) > + emit_insn (gen_rtx_SET (rax, addr)); You won't need the above change with a choose_baseaddr that returns PLUS RTX. > /* Get the stub symbol. */ > sym = xlogue.get_stub_rtx (frame_pointer_needed ? XLOGUE_STUB_SAVE_HFP > @@ -12841,6 +12863,7 @@ ix86_expand_prologue (void) > HOST_WIDE_INT allocate; > bool int_registers_saved; > bool sse_registers_saved; > + bool save_stub_call_needed; > rtx static_chain = NULL_RTX; > > if (ix86_function_naked (current_function_decl)) > @@ -13016,6 +13039,8 @@ ix86_expand_prologue (void) > > int_registers_saved = (frame.nregs == 0); > sse_registers_saved = (frame.nsseregs == 0); > + save_stub_call_needed = (m->call_ms2sysv); > + gcc_assert (!(!sse_registers_saved && save_stub_call_needed)); Oooh, double negation :( > if (frame_pointer_needed && !m->fs.fp_valid) > { > @@ -13110,10 +13135,27 @@ ix86_expand_prologue (void) > target. */ > if (TARGET_SEH) > m->fs.sp_valid = false; > - } > > - if (m->call_ms2sysv) > - ix86_emit_outlined_ms2sysv_save (frame); > + /* If SP offset is non-immediate after allocation of the stack frame, > + then emit SSE saves or stub call prior to allocating the rest of the > + stack frame. This is less efficient for the out-of-line stub because > + we can't combine allocations across the call barrier, but it's better > + than using a scratch register. */ > + else if (frame.stack_pointer_offset - m->fs.sp_realigned_offset > + > 0x7fffffff) Should we use x86_64_immediate_operand here that betters document the limitation instead of using magic constants? > + { > + if (!sse_registers_saved) > + { > + ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); > + sse_registers_saved = true; > + } > + else if (save_stub_call_needed) > + { > + ix86_emit_outlined_ms2sysv_save (frame); > + save_stub_call_needed = false; > + } > + } > + } > > allocate = frame.stack_pointer_offset - m->fs.sp_offset; > > @@ -13337,6 +13379,8 @@ ix86_expand_prologue (void) > ix86_emit_save_regs_using_mov (frame.reg_save_offset); > if (!sse_registers_saved) > ix86_emit_save_sse_regs_using_mov (frame.sse_reg_save_offset); > + else if (save_stub_call_needed) > + ix86_emit_outlined_ms2sysv_save (frame); > > /* For the mcount profiling on 32 bit PIC mode we need to emit SET_GOT > in PROLOGUE. */ > @@ -13560,7 +13604,7 @@ ix86_emit_outlined_ms2sysv_restore (const struct > ix86_frame &frame, > rtvec v; > unsigned int elems_needed, align, i, vi = 0; > rtx_insn *insn; > - rtx sym, tmp; > + rtx sym, addr, tmp; > rtx rsi = gen_rtx_REG (word_mode, SI_REG); > rtx r10 = NULL_RTX; > const struct xlogue_layout &xlogue = xlogue_layout::get_instance (); > @@ -13577,9 +13621,13 @@ ix86_emit_outlined_ms2sysv_restore (const struct > ix86_frame &frame, > > /* Setup RSI as the stub's base pointer. */ > align = GET_MODE_ALIGNMENT (V4SFmode); > - tmp = choose_baseaddr (rsi_offset, &align); > + addr = choose_baseaddr (rsi_offset, &align, SI_REG); > gcc_assert (align >= GET_MODE_ALIGNMENT (V4SFmode)); > - emit_insn (gen_rtx_SET (rsi, tmp)); > + > + /* If choose_baseaddr returned our scratch register, then we don't need to > + do another SET. */ > + if (!REG_P (addr) || REGNO (addr) != SI_REG) > + emit_insn (gen_rtx_SET (rsi, addr)); Again, no need for these changes with the above implementation of choose_baseaddr. > /* Get a symbol for the stub. */ > if (frame_pointer_needed) > diff --git a/gcc/testsuite/gcc.target/i386/pr82002-2a.c > b/gcc/testsuite/gcc.target/i386/pr82002-2a.c > index bc85080ba8e..c31440debe2 100644 > --- a/gcc/testsuite/gcc.target/i386/pr82002-2a.c > +++ b/gcc/testsuite/gcc.target/i386/pr82002-2a.c > @@ -1,7 +1,5 @@ > /* { dg-do compile { target lp64 } } */ > /* { dg-options "-Ofast -mstackrealign -mabi=ms" } */ > -/* { dg-xfail-if "" { *-*-* } } */ > -/* { dg-xfail-run-if "" { *-*-* } } */ > > void __attribute__((sysv_abi)) a (char *); > void > diff --git a/gcc/testsuite/gcc.target/i386/pr82002-2b.c > b/gcc/testsuite/gcc.target/i386/pr82002-2b.c > index 10e44cd7b1d..939e069517d 100644 > --- a/gcc/testsuite/gcc.target/i386/pr82002-2b.c > +++ b/gcc/testsuite/gcc.target/i386/pr82002-2b.c > @@ -1,7 +1,5 @@ > /* { dg-do compile { target lp64 } } */ > /* { dg-options "-Ofast -mstackrealign -mabi=ms -mcall-ms2sysv-xlogues" } */ > -/* { dg-xfail-if "" { *-*-* } } */ > -/* { dg-xfail-run-if "" { *-*-* } } */ > > void __attribute__((sysv_abi)) a (char *); > void > -- > 2.14.3 >