From: Lili Cui <[email protected]>
Hi,
This patch enables separate shrink wrapping for x86.
Bootstrapped & regtested on x86_64-pc-linux-gnu.
Ok for trunk?
This commit implements the target macros (TARGET_SHRINK_WRAP_*) that
enable separate shrink wrapping for function prologues/epilogues in
x86.
When performing separate shrink wrapping, we use mov instead of
push/pop: with push/pop the rsp adjustment is more complicated to
handle and performance may suffer.  Using mov has a small impact on
code size but preserves performance.
Tested against SPEC CPU 2017, this change always has a net-positive
effect on the dynamic instruction count. See the following table for
the breakdown of how this reduces the number of dynamic instructions
per workload on a like-for-like basis (with/without this commit):
instruction count base with commit (commit-base)/base
502.gcc_r 98666845943 96891561634 -1.80%
526.blender_r 6.21226E+11 6.12992E+11 -1.33%
520.omnetpp_r 1.1241E+11 1.11093E+11 -1.17%
500.perlbench_r 1271558717 1263268350 -0.65%
523.xalancbmk_r 2.20103E+11 2.18836E+11 -0.58%
531.deepsjeng_r 2.73591E+11 2.72114E+11 -0.54%
500.perlbench_r 64195557393 63881512409 -0.49%
541.leela_r 2.99097E+11 2.98245E+11 -0.29%
548.exchange2_r 1.27976E+11 1.27784E+11 -0.15%
527.cam4_r 88981458425 88887334679 -0.11%
554.roms_r 2.60072E+11 2.59809E+11 -0.10%
gcc/ChangeLog:
* config/i386/i386-protos.h (ix86_get_separate_components):
New function.
(ix86_components_for_bb): Likewise.
(ix86_disqualify_components): Likewise.
(ix86_emit_prologue_components): Likewise.
(ix86_emit_epilogue_components): Likewise.
(ix86_set_handled_components): Likewise.
* config/i386/i386.cc (save_regs_using_push_pop):
Encapsulate code.
(ix86_compute_frame_layout):
Handle save_regs_using_push_pop.
(ix86_emit_save_regs_using_mov):
Skip registers that are wrapped separately.
(ix86_expand_prologue): Likewise.
(ix86_emit_restore_regs_using_mov): Likewise.
(ix86_expand_epilogue): Likewise.
(ix86_get_separate_components): New function.
(ix86_components_for_bb): Likewise.
(ix86_disqualify_components): Likewise.
(ix86_emit_prologue_components): Likewise.
(ix86_emit_epilogue_components): Likewise.
(ix86_set_handled_components): Likewise.
(TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS): Define.
(TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB): Likewise.
(TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS): Likewise.
(TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS): Likewise.
(TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS): Likewise.
(TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS): Likewise.
* config/i386/i386.h (struct machine_function): Add
reg_is_wrapped_separately array for register wrapping
information.
gcc/testsuite/ChangeLog:
* gcc.target/x86_64/abi/callabi/leaf-2.c: Adjust the test.
* gcc.target/i386/interrupt-16.c: Likewise.
* g++.target/i386/shrink_wrap_separate.c: New test.
---
gcc/config/i386/i386-protos.h | 7 +
gcc/config/i386/i386.cc | 261 +++++++++++++++---
gcc/config/i386/i386.h | 1 +
.../g++.target/i386/shrink_wrap_separate.c | 24 ++
gcc/testsuite/gcc.target/i386/interrupt-16.c | 4 +-
.../gcc.target/x86_64/abi/callabi/leaf-2.c | 2 +-
6 files changed, 257 insertions(+), 42 deletions(-)
create mode 100644 gcc/testsuite/g++.target/i386/shrink_wrap_separate.c
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index e85b925704b..11d26e93973 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -436,6 +436,13 @@ extern rtl_opt_pass *make_pass_align_tight_loops
(gcc::context *);
extern bool ix86_has_no_direct_extern_access;
extern bool ix86_rpad_gate ();
+extern sbitmap ix86_get_separate_components (void);
+extern sbitmap ix86_components_for_bb (basic_block);
+extern void ix86_disqualify_components (sbitmap, edge, sbitmap, bool);
+extern void ix86_emit_prologue_components (sbitmap);
+extern void ix86_emit_epilogue_components (sbitmap);
+extern void ix86_set_handled_components (sbitmap);
+
/* In i386-expand.cc. */
bool ix86_check_builtin_isa_match (unsigned int, HOST_WIDE_INT*,
HOST_WIDE_INT*);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 3d629b06094..2e8485ddb8b 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -6909,6 +6909,26 @@ ix86_pro_and_epilogue_can_use_push2pop2 (int nregs)
&& (nregs + aligned) >= 3;
}
+/* Return true if push/pop should be used to save registers. */
+static bool
+save_regs_using_push_pop (HOST_WIDE_INT to_allocate)
+{
+ return ((!to_allocate && cfun->machine->frame.nregs <= 1)
+ || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000))
+ /* If static stack checking is enabled and done with probes,
+ the registers need to be saved before allocating the frame. */
+ || flag_stack_check == STATIC_BUILTIN_STACK_CHECK
+ /* If stack clash probing needs a loop, then it needs a
+ scratch register. But the returned register is only guaranteed
+ to be safe to use after register saves are complete. So if
+ stack clash protections are enabled and the allocated frame is
+ larger than the probe interval, then use pushes to save
+ callee saved registers. */
+ || (flag_stack_clash_protection
+ && !ix86_target_stack_probe ()
+ && to_allocate > get_probe_interval ()));
+}
+
/* Fill structure ix86_frame about frame of currently computed function. */
static void
@@ -7193,20 +7213,7 @@ ix86_compute_frame_layout (void)
/* Size prologue needs to allocate. */
to_allocate = offset - frame->sse_reg_save_offset;
- if ((!to_allocate && frame->nregs <= 1)
- || (TARGET_64BIT && to_allocate >= HOST_WIDE_INT_C (0x80000000))
- /* If static stack checking is enabled and done with probes,
- the registers need to be saved before allocating the frame. */
- || flag_stack_check == STATIC_BUILTIN_STACK_CHECK
- /* If stack clash probing needs a loop, then it needs a
- scratch register. But the returned register is only guaranteed
- to be safe to use after register saves are complete. So if
- stack clash protections are enabled and the allocated frame is
- larger than the probe interval, then use pushes to save
- callee saved registers. */
- || (flag_stack_clash_protection
- && !ix86_target_stack_probe ()
- && to_allocate > get_probe_interval ()))
+ if (save_regs_using_push_pop (to_allocate))
frame->save_regs_using_mov = false;
if (ix86_using_red_zone ()
@@ -7664,7 +7671,9 @@ ix86_emit_save_regs_using_mov (HOST_WIDE_INT cfa_offset)
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
{
- ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
+ /* Skip registers already handled by separate shrink wrapping. */
+ if (!cfun->machine->reg_is_wrapped_separately[regno])
+ ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
cfa_offset -= UNITS_PER_WORD;
}
}
@@ -9227,6 +9236,18 @@ ix86_expand_prologue (void)
&& (! TARGET_STACK_PROBE
|| frame.stack_pointer_offset < CHECK_STACK_LIMIT))
{
+ HOST_WIDE_INT allocate_offset;
+ /* When separate shrink wrapping is in use, adjust rsp by the total
+ offset up front, before saving registers with mov. */
+ if (crtl->shrink_wrapped_separate)
+ {
+ allocate_offset = m->fs.sp_offset - frame.stack_pointer_offset;
+ pro_epilogue_adjust_stack (stack_pointer_rtx, stack_pointer_rtx,
+ GEN_INT (allocate_offset), -1,
+ m->fs.cfa_reg == stack_pointer_rtx);
+ m->fs.sp_offset = cfun->machine->frame.stack_pointer_offset;
+ }
+
ix86_emit_save_regs_using_mov (frame.reg_save_offset);
cfun->machine->red_zone_used = true;
int_registers_saved = true;
@@ -9806,30 +9827,36 @@ ix86_emit_restore_regs_using_mov (HOST_WIDE_INT
cfa_offset,
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, maybe_eh_return,
true))
{
- rtx reg = gen_rtx_REG (word_mode, regno);
- rtx mem;
- rtx_insn *insn;
-
- mem = choose_baseaddr (cfa_offset, NULL);
- mem = gen_frame_mem (word_mode, mem);
- insn = emit_move_insn (reg, mem);
- if (m->fs.cfa_reg == crtl->drap_reg && regno == REGNO (crtl->drap_reg))
+ /* Skip registers already handled by separate shrink wrapping. */
+ if (!cfun->machine->reg_is_wrapped_separately[regno])
{
- /* Previously we'd represented the CFA as an expression
- like *(%ebp - 8). We've just popped that value from
- the stack, which means we need to reset the CFA to
- the drap register. This will remain until we restore
- the stack pointer. */
- add_reg_note (insn, REG_CFA_DEF_CFA, reg);
- RTX_FRAME_RELATED_P (insn) = 1;
+ rtx reg = gen_rtx_REG (word_mode, regno);
+ rtx mem;
+ rtx_insn *insn;
- /* This means that the DRAP register is valid for addressing. */
- m->fs.drap_valid = true;
- }
- else
- ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
+ mem = choose_baseaddr (cfa_offset, NULL);
+ mem = gen_frame_mem (word_mode, mem);
+ insn = emit_move_insn (reg, mem);
+ if (m->fs.cfa_reg == crtl->drap_reg
+ && regno == REGNO (crtl->drap_reg))
+ {
+ /* Previously we'd represented the CFA as an expression
+ like *(%ebp - 8). We've just popped that value from
+ the stack, which means we need to reset the CFA to
+ the drap register. This will remain until we restore
+ the stack pointer. */
+ add_reg_note (insn, REG_CFA_DEF_CFA, reg);
+ RTX_FRAME_RELATED_P (insn) = 1;
+
+ /* This means that the DRAP register is valid for addressing.
+ */
+ m->fs.drap_valid = true;
+ }
+ else
+ ix86_add_cfa_restore_note (NULL, reg, cfa_offset);
+ }
cfa_offset -= UNITS_PER_WORD;
}
}
@@ -10108,10 +10135,11 @@ ix86_expand_epilogue (int style)
less work than reloading sp and popping the register. */
else if (!sp_valid_at (frame.hfp_save_offset) && frame.nregs <= 1)
restore_regs_via_mov = true;
- else if (TARGET_EPILOGUE_USING_MOVE
- && cfun->machine->use_fast_prologue_epilogue
- && (frame.nregs > 1
- || m->fs.sp_offset != reg_save_offset))
+ else if (crtl->shrink_wrapped_separate
+ || (TARGET_EPILOGUE_USING_MOVE
+ && cfun->machine->use_fast_prologue_epilogue
+ && (frame.nregs > 1
+ || m->fs.sp_offset != reg_save_offset)))
restore_regs_via_mov = true;
else if (frame_pointer_needed
&& !frame.nregs
@@ -28065,6 +28093,161 @@ ix86_cannot_copy_insn_p (rtx_insn *insn)
#undef TARGET_DOCUMENTATION_NAME
#define TARGET_DOCUMENTATION_NAME "x86"
+/* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
+sbitmap
+ix86_get_separate_components (void)
+{
+ HOST_WIDE_INT offset, to_allocate;
+ sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
+ bitmap_clear (components);
+
+ offset = cfun->machine->frame.stack_pointer_offset;
+ to_allocate = offset - cfun->machine->frame.sse_reg_save_offset;
+ /* When PPX is enabled, disable shrink wrap separate.
+ It is a trade-off. */
+ if (TARGET_APX_PPX && !crtl->calls_eh_return)
+ return components;
+
+ /* Separate shrink wrapping uses MOV instead of PUSH/POP, so it must
+ be disabled when registers have to be saved using PUSH/POP. */
+ if (save_regs_using_push_pop (to_allocate))
+ return components;
+
+ for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
+ {
+ /* We can only wrap registers that have small operand offsets.
+ For large offsets a pseudo register might be needed which
+ cannot be created during the shrink wrapping pass. */
+ if (IN_RANGE (offset, -0x8000, 0x7fff))
+ bitmap_set_bit (components, regno);
+ offset += UNITS_PER_WORD;
+ }
+
+ /* Don't mess with the following registers. */
+ if (frame_pointer_needed)
+ bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
+
+ if (crtl->drap_reg)
+ bitmap_clear_bit (components, REGNO (crtl->drap_reg));
+
+ if (pic_offset_table_rtx)
+ bitmap_clear_bit (components, REAL_PIC_OFFSET_TABLE_REGNUM);
+
+ return components;
+}
+
+/* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
+sbitmap
+ix86_components_for_bb (basic_block bb)
+{
+ bitmap in = DF_LIVE_IN (bb);
+ bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
+ bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
+
+ sbitmap components = sbitmap_alloc (FIRST_PSEUDO_REGISTER);
+ bitmap_clear (components);
+
+ function_abi_aggregator callee_abis;
+ rtx_insn *insn;
+ FOR_BB_INSNS (bb, insn)
+ if (CALL_P (insn))
+ callee_abis.note_callee_abi (insn_callee_abi (insn));
+ HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
+
+ /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
+ for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (!fixed_regs[regno]
+ && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
+ || bitmap_bit_p (in, regno)
+ || bitmap_bit_p (gen, regno)
+ || bitmap_bit_p (kill, regno)))
+ bitmap_set_bit (components, regno);
+
+ return components;
+}
+
+/* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS. */
+void
+ix86_disqualify_components (sbitmap, edge, sbitmap, bool)
+{
+ /* Nothing to do for i386. */
+}
+
+/* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
+void
+ix86_emit_prologue_components (sbitmap components)
+{
+ HOST_WIDE_INT cfa_offset;
+ struct machine_function *m = cfun->machine;
+
+ cfa_offset = m->frame.reg_save_offset + m->fs.sp_offset
+ - m->frame.stack_pointer_offset;
+ for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
+ {
+ if (bitmap_bit_p (components, regno))
+ ix86_emit_save_reg_using_mov (word_mode, regno, cfa_offset);
+ cfa_offset -= UNITS_PER_WORD;
+ }
+}
+
+/* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
+void
+ix86_emit_epilogue_components (sbitmap components)
+{
+ HOST_WIDE_INT cfa_offset;
+ struct machine_function *m = cfun->machine;
+ cfa_offset = m->frame.reg_save_offset + m->fs.sp_offset
+ - m->frame.stack_pointer_offset;
+
+ for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (GENERAL_REGNO_P (regno) && ix86_save_reg (regno, true, true))
+ {
+ if (bitmap_bit_p (components, regno))
+ {
+ rtx reg = gen_rtx_REG (word_mode, regno);
+ rtx mem;
+ rtx_insn *insn;
+
+ mem = choose_baseaddr (cfa_offset, NULL);
+ mem = gen_frame_mem (word_mode, mem);
+ insn = emit_move_insn (reg, mem);
+
+ RTX_FRAME_RELATED_P (insn) = 1;
+ add_reg_note (insn, REG_CFA_RESTORE, reg);
+ }
+ cfa_offset -= UNITS_PER_WORD;
+ }
+}
+
+void
+ix86_set_handled_components (sbitmap components)
+{
+ for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
+ if (bitmap_bit_p (components, regno))
+ {
+ cfun->machine->reg_is_wrapped_separately[regno] = true;
+ cfun->machine->use_fast_prologue_epilogue = true;
+ cfun->machine->frame.save_regs_using_mov = true;
+ }
+}
+
+#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
+#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS ix86_get_separate_components
+#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
+#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB ix86_components_for_bb
+#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
+#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS ix86_disqualify_components
+#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
+#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
+ ix86_emit_prologue_components
+#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
+#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
+ ix86_emit_epilogue_components
+#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
+#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS ix86_set_handled_components
+
struct gcc_target targetm = TARGET_INITIALIZER;
#include "gt-i386.h"
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 18fa97a9eb0..e89e57f8278 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -2817,6 +2817,7 @@ struct GTY(()) machine_function {
/* Cached initial frame layout for the current function. */
struct ix86_frame frame;
+ bool reg_is_wrapped_separately[FIRST_PSEUDO_REGISTER];
/* For -fsplit-stack support: A stack local which holds a pointer to
the stack arguments for a function with a variable number of
arguments. This is set at the start of the function and is used
diff --git a/gcc/testsuite/g++.target/i386/shrink_wrap_separate.c
b/gcc/testsuite/g++.target/i386/shrink_wrap_separate.c
new file mode 100644
index 00000000000..8be04822ac3
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/shrink_wrap_separate.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-pro_and_epilogue" } */
+typedef struct a b;
+typedef double c;
+struct a {
+ b *d;
+ b *e;
+};
+struct f {
+ c g;
+};
+inline bool h(c i, b *m) {
+ b *j = m->e;
+ for (; m->e; j = j->d)
+ if (h(i, j))
+ return 0;
+ return m;
+}
+bool k() {
+ f *l;
+ b *n;
+ h(l->g, n);
+}
+/* { dg-final { scan-rtl-dump "The components we wrap separately are \[sep 40
41 42 43\]" "pro_and_epilogue" } } */
diff --git a/gcc/testsuite/gcc.target/i386/interrupt-16.c
b/gcc/testsuite/gcc.target/i386/interrupt-16.c
index cb45ba54e3d..ca4441b3aee 100644
--- a/gcc/testsuite/gcc.target/i386/interrupt-16.c
+++ b/gcc/testsuite/gcc.target/i386/interrupt-16.c
@@ -18,5 +18,5 @@ foo (int i)
/* { dg-final { scan-assembler-not "(push|pop)(l|q)\[\\t \]*%(r|e)bp" } } */
/* { dg-final { scan-assembler-not "(push|pop)l\[\\t \]*%edi" { target ia32 }
} } */
/* { dg-final { scan-assembler-not "(push|pop)q\[\\t \]*%r\[0-9\]+" { target {
! ia32 } } } } */
-/* { dg-final { scan-assembler-times "pushq\[\\t \]*%rdi" 1 { target { ! ia32
} } } } */
-/* { dg-final { scan-assembler-times "popq\[\\t \]*%rdi" 1 { target { ! ia32 }
} } } */
+/* { dg-final { scan-assembler-times "(pushq.*%rdi|subq.*\\\$8,.*%rsp)" 1 {
target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "(popq.*%rdi|addq.*\\\$8,.*%rsp)" 1 {
target { ! ia32 } } } } */
diff --git a/gcc/testsuite/gcc.target/x86_64/abi/callabi/leaf-2.c
b/gcc/testsuite/gcc.target/x86_64/abi/callabi/leaf-2.c
index 5f3d3e166af..46fc4648dbd 100644
--- a/gcc/testsuite/gcc.target/x86_64/abi/callabi/leaf-2.c
+++ b/gcc/testsuite/gcc.target/x86_64/abi/callabi/leaf-2.c
@@ -1,5 +1,5 @@
/* { dg-do compile } */
-/* { dg-options "-O2 -fno-tree-vectorize -mabi=sysv" } */
+/* { dg-options "-O2 -fno-tree-vectorize -mabi=sysv -fno-shrink-wrap-separate"
} */
extern int glb1, gbl2, gbl3;
--
2.34.1