On Thu, Dec 4, 2014 at 2:55 PM, Brendan Gregg <[email protected]> wrote:
> G'Day, > > I've hacked hotspot to return the frame pointer, in part to see what this > involves, and also to have a working prototype for analysis. Along with an > agent to resolve symbols, this has allowed full stack profiling using Linux > perf_events. The following flame graphs show the resulting profiles. > > A mixed mode CPU flame graph of a vert.x benchmark (click to zoom): > > http://www.brendangregg.com/FlameGraphs/cpu-mixedmode-vertx.svg > > Same thing, but this time disabling inlining, to show more frames: > > http://www.brendangregg.com/FlameGraphs/cpu-mixedmode-flamegraph.svg > > As expected, performance is worse without inlining. You can compare the > flame graphs side by side to see why. Less time spent doing work / I/O! > > https://github.com/brendangregg/Misc/blob/master/java/openjdk8_b132-fp.diff > is my patch, > [...] > In case there's problems with the patch URL, the patch is: --- openjdk8clean/hotspot/src/cpu/x86/vm/x86_64.ad 2014-03-04 02:52:11.000000000 +0000 +++ openjdk8/hotspot/src/cpu/x86/vm/x86_64.ad 2014-11-08 01:10:49.686044933 +0000 @@ -166,10 +166,9 @@ // 3) reg_class stack_slots( /* one chunk of stack-based "registers" */ ) // -// Class for all pointer registers (including RSP) +// Class for all pointer registers (including RSP, excluding RBP) reg_class any_reg(RAX, RAX_H, RDX, RDX_H, - RBP, RBP_H, RDI, RDI_H, RSI, RSI_H, RCX, RCX_H, @@ -184,10 +183,9 @@ R14, R14_H, R15, R15_H); -// Class for all pointer registers except RSP +// Class for all pointer registers except RSP and RBP reg_class ptr_reg(RAX, RAX_H, RDX, RDX_H, - RBP, RBP_H, RDI, RDI_H, RSI, RSI_H, RCX, RCX_H, @@ -199,9 +197,8 @@ R13, R13_H, R14, R14_H); -// Class for all pointer registers except RAX and RSP +// Class for all pointer registers except RAX, RSP and RBP reg_class ptr_no_rax_reg(RDX, RDX_H, - RBP, RBP_H, RDI, RDI_H, RSI, RSI_H, RCX, RCX_H, @@ -226,9 +223,8 @@ R13, R13_H, R14, R14_H); -// Class for all pointer registers except RAX, RBX and RSP +// Class for all pointer registers except RAX, RBX, RSP and RBP reg_class ptr_no_rax_rbx_reg(RDX, RDX_H, - RBP, RBP_H, RDI, RDI_H, RSI, RSI_H, RCX, RCX_H, @@ -260,10 +256,9 @@ // Singleton class for TLS pointer reg_class ptr_r15_reg(R15, R15_H); -// Class for all long registers (except RSP) +// Class for all long registers (except RSP and RBP) reg_class long_reg(RAX, RAX_H, RDX, RDX_H, - RBP, RBP_H, RDI, RDI_H, RSI, RSI_H, RCX, RCX_H, @@ -275,9 +270,8 @@ R13, R13_H, R14, R14_H); -// Class for all long registers except RAX, RDX (and RSP) -reg_class long_no_rax_rdx_reg(RBP, RBP_H, - RDI, RDI_H, +// Class for all long registers except RAX, RDX (and RSP, RBP) +reg_class long_no_rax_rdx_reg(RDI, RDI_H, RSI, RSI_H, RCX, RCX_H, RBX, RBX_H, @@ -288,9 +282,8 @@ R13, R13_H, R14, R14_H); -// Class for all long registers except RCX (and RSP) -reg_class long_no_rcx_reg(RBP, RBP_H, - RDI, RDI_H, +// Class for all long registers except RCX (and RSP, RBP) +reg_class long_no_rcx_reg(RDI, RDI_H, RSI, RSI_H, RAX, RAX_H, RDX, RDX_H, @@ -302,9 +295,8 @@ R13, R13_H, R14, R14_H); -// Class for all long registers except RAX (and RSP) -reg_class long_no_rax_reg(RBP, RBP_H, - RDX, RDX_H, +// Class for all long registers except RAX (and RSP, RBP) +reg_class long_no_rax_reg(RDX, RDX_H, RDI, RDI_H, RSI, RSI_H, RCX, RCX_H, @@ -325,10 +317,9 @@ // Singleton class for RDX long register reg_class long_rdx_reg(RDX, RDX_H); -// Class for all int registers (except RSP) +// Class for all int registers (except RSP and RBP) reg_class int_reg(RAX, RDX, - RBP, RDI, RSI, RCX, @@ -340,10 +331,9 @@ R13, R14); -// Class for all int registers except RCX (and RSP) +// Class for all int registers except RCX (and RSP, RBP) reg_class int_no_rcx_reg(RAX, RDX, - RBP, RDI, RSI, RBX, @@ -355,8 +345,7 @@ R14); // Class for all int registers except RAX, RDX (and RSP) -reg_class int_no_rax_rdx_reg(RBP, - RDI, +reg_class int_no_rax_rdx_reg(RDI, RSI, RCX, RBX, @@ -718,6 +707,7 @@ st->print("# stack bang"); st->print("\n\t"); st->print("pushq rbp\t# Save rbp"); + // BDG consider: st->print("movq rbp, rsp\t# "); if (framesize) { st->print("\n\t"); st->print("subq rsp, #%d\t# Create frame",framesize); --- openjdk8clean/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp 2014-03-04 02:52:11.000000000 +0000 +++ openjdk8/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp 2014-11-07 23:57:11.589593723 +0000 @@ -5236,6 +5236,7 @@ // We always push rbp, so that on return to interpreter rbp, will be // restored correctly and we can correct the stack. push(rbp); + mov(rbp, rsp); // Remove word for ebp framesize -= wordSize; --- openjdk8clean/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp 2014-03-04 02:52:10.000000000 +0000 +++ openjdk8/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp 2014-11-07 23:57:21.933257882 +0000 @@ -358,6 +358,7 @@ generate_stack_overflow_check(frame_size_in_bytes); push(rbp); + mov(rbp, rsp); #ifdef TIERED // c2 leaves fpu stack dirty. Clean it on entry if (UseSSE < 2 ) { Brendan
