From: Andi Kleen <a...@linux.intel.com>

Implement printing full disassembled sequences for branch stacks in perf
script. This allows hot paths for individual samples to be printed directly,
together with branch misprediction and even cycle count information.

% perf record -b ...
% perf script -F brstackasm
...
        00007f0668d54e88        movsx (%rsi), %ecx
        00007f0668d54e8b        lea -0x30(%rcx), %eax
        00007f0668d54e8e        cmp $0x9, %al
        00007f0668d54e90        jbe 0x68d54eaf
        00007f0668d54e92        cmp %cl, %dl
        00007f0668d54e94        jnz 0x68d54eb5
        00007f0668d54e96        add $0x1, %rdi
        00007f0668d54e9a        movsx (%rdi), %edx
        00007f0668d54e9d        add $0x1, %rsi
        00007f0668d54ea1        test %dl, %dl
        00007f0668d54ea3        jnz _dl_cache_libcmp+11       # PRED 21 cycles
        00007f0668d54dfb        lea -0x30(%rdx), %eax
        00007f0668d54dfe        cmp $0x9, %al
        00007f0668d54e00        ja _dl_cache_libcmp+152       # PRED 2 cycles
        00007f0668d54e88        movsx (%rsi), %ecx
        00007f0668d54e8b        lea -0x30(%rcx), %eax
        00007f0668d54e8e        cmp $0x9, %al
        00007f0668d54e90        jbe 0x68d54eaf
        00007f0668d54e92        cmp %cl, %dl
        00007f0668d54e94        jnz 0x68d54eb5                # PRED 3 cycles
        00007f0668d54eb5        movsx %dl, %eax
        00007f0668d54eb8        sub %ecx, %eax
        00007f0668d54eba        ret                           # PRED 1 cycles
        00007f0668d54fae        test %eax, %eax
        00007f0668d54fb0        jz _dl_load_cache_lookup+688
        00007f0668d54fb6        jns 0x68d54f70
        00007f0668d54fb8        lea 0x1(%r14), %ebx
        00007f0668d54fbc        cmp %r15d, %ebx
        00007f0668d54fbf        nop
        00007f0668d54fc0        jle 0x68d54f79                # PRED 2 cycles

Open issues:
- Occasionally the path does not reach up to the sample IP, as the LBRs
may be frozen earlier. Use precise events to avoid that.

v2: Remove bogus hunk. Document --max-blocks. Fix some printfs.
Port to latest tree.
Signed-off-by: Andi Kleen <a...@linux.intel.com>
---
 tools/perf/Documentation/perf-script.txt |  14 ++-
 tools/perf/builtin-script.c              | 183 +++++++++++++++++++++++++++++++
 2 files changed, 195 insertions(+), 2 deletions(-)

diff --git a/tools/perf/Documentation/perf-script.txt 
b/tools/perf/Documentation/perf-script.txt
index f2b81d8..0903985 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -116,7 +116,7 @@ OPTIONS
 --fields::
         Comma separated list of fields to print. Options are:
         comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
-       srcline, period, iregs, brstack, brstacksym, flags, asm.
+       srcline, period, iregs, brstack, brstacksym, flags, asm, brstackasm.
         Field list can be prepended with the type, trace, sw or hw,
         to indicate to which event type the field list applies.
         e.g., -f sw:comm,tid,time,ip,sym  and -f trace:time,cpu,trace
@@ -176,17 +176,24 @@ OPTIONS
        i.e., -f "" is not allowed.
 
        The brstack output includes branch related information with raw 
addresses using the
-       /v/v/v/v/ syntax in the following order:
+       /v/v/v/v/cycles syntax in the following order:
        FROM: branch source instruction
        TO  : branch target instruction
         M/P/-: M=branch target mispredicted or branch direction was 
mispredicted, P=target predicted or direction predicted, -=not supported
        X/- : X=branch inside a transactional region, -=not in transaction 
region or not supported
        A/- : A=TSX abort entry, -=not aborted region or not supported
+       cycles
 
        The brstacksym is identical to brstack, except that the FROM and TO 
addresses are printed in a symbolic form if possible.
 
        When asm is specified the assembler instruction of each sample is 
printed in disassembled form.
 
+       When brstackasm is specified the full assembler sequences of branch blocks for each sample
+       are printed (a branch block is a sequence of instructions not containing taken branches).
+       This is the full execution path leading to the sample. This is only 
supported when the
+       sample was recorded with perf record -b or -j any.
+       The maximum number of branch blocks to print can be configured with the 
--max-blocks option.
+
 -k::
 --vmlinux=<file>::
         vmlinux pathname
@@ -268,6 +275,9 @@ include::itrace.txt[]
 --force::
        Don't do ownership validation.
 
+--max-blocks=N::
+       Maximum number of branch blocks to print with -F brstackasm.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-script-perl[1],
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index 323572e..1072cbb 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -42,6 +42,7 @@ static bool                   nanosecs;
 static const char              *cpu_list;
 static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 static struct perf_stat_config stat_config;
+static int                     max_blocks;
 
 unsigned int scripting_max_stack = PERF_MAX_STACK_DEPTH;
 
@@ -67,6 +68,7 @@ enum perf_output_field {
        PERF_OUTPUT_WEIGHT          = 1U << 18,
        PERF_OUTPUT_BPF_OUTPUT      = 1U << 19,
        PERF_OUTPUT_ASM             = 1U << 20,
+       PERF_OUTPUT_BRSTACKASM      = 1U << 21,
 };
 
 struct output_option {
@@ -94,6 +96,7 @@ struct output_option {
        {.str = "weight",   .field = PERF_OUTPUT_WEIGHT},
        {.str = "bpf-output",   .field = PERF_OUTPUT_BPF_OUTPUT},
        {.str = "asm", .field = PERF_OUTPUT_ASM},
+       {.str = "brstackasm", .field = PERF_OUTPUT_BRSTACKASM},
 };
 
 /* default set to maintain compatibility with current format */
@@ -293,6 +296,13 @@ static int perf_evsel__check_attr(struct perf_evsel *evsel,
                       "selected.\n");
                return -EINVAL;
        }
+       if (PRINT_FIELD(BRSTACKASM) &&
+           !(perf_evlist__combined_branch_type(session->evlist) &
+             PERF_SAMPLE_BRANCH_ANY)) {
+               pr_err("Display of branch stack assembler requested, but non 
all-branch filter set\n");
+               return -EINVAL;
+       }
+
        if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) &&
                perf_evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID",
                                        PERF_OUTPUT_TID|PERF_OUTPUT_PID))
@@ -621,6 +631,175 @@ static void print_sample_brstacksym(struct perf_sample 
*sample,
        }
 }
 
+#ifdef HAVE_UDIS86
+#define MAXBB 16384UL
+#define MAXINSN 16
+
+/*
+ * Read the code bytes for the address range starting at @start into @buffer.
+ *
+ * @buffer:  destination; callers pass a buffer of MAXBB bytes
+ * @start:   first address of the block
+ * @end:     last address of interest; MAXINSN extra bytes are read past it
+ * @is64bit: set from the backing dso's is_64_bit flag on a successful lookup
+ * @cpumode: set to PERF_RECORD_MISC_KERNEL or PERF_RECORD_MISC_USER
+ *           depending on whether @start is a kernel address
+ *
+ * Returns the result of dso__data_read_offset(), or 0 when either address
+ * is zero, the range crosses the user/kernel boundary, the range is too
+ * long for the buffer, or the address cannot be resolved to a readable dso.
+ * Callers treat any value <= 0 as failure.
+ */
+static int grab_bb(char *buffer, u64 start, u64 end,
+                   struct machine *machine, struct thread *thread,
+                   bool *is64bit, u8 *cpumode)
+{
+       /* map_ip() yields a u64; keep the full width for the read offset */
+       u64 offset;
+       int len;
+       struct addr_location al;
+       bool kernel;
+
+       if (!start || !end)
+               return 0;
+
+       kernel = machine__kernel_ip(machine, start);
+       if (kernel)
+               *cpumode = PERF_RECORD_MISC_KERNEL;
+       else
+               *cpumode = PERF_RECORD_MISC_USER;
+       /* Both ends must be on the same side of the user/kernel split */
+       if (kernel != machine__kernel_ip(machine, end))
+               return 0;
+
+       memset(&al, 0, sizeof(al));
+       /*
+        * The read below fetches end - start + MAXINSN bytes, so reject
+        * anything that would not fit into a MAXBB sized buffer. An end
+        * below start wraps around and is rejected here as well.
+        */
+       if (end - start > MAXBB - MAXINSN) {
+               printf("\tbasic block %" PRIx64 "-%" PRIx64 " (%" PRId64 ") too long to dump\n",
+                      start, end, (s64)(end - start));
+               return 0;
+       }
+
+       thread__find_addr_map(thread, *cpumode, MAP__FUNCTION, start, &al);
+       if (!al.map || !al.map->dso) {
+               printf("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n",
+                               start, end);
+               return 0;
+       }
+       if (al.map->dso->data.status == DSO_DATA_STATUS_ERROR) {
+               printf("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n",
+                               start, end);
+               return 0;
+       }
+
+       /* Load maps to ensure dso->is_64_bit has been updated */
+       map__load(al.map, machine->symbol_filter);
+
+       offset = al.map->map_ip(al.map, start);
+       len = dso__data_read_offset(al.map->dso, machine,
+                                   offset, (u8 *)buffer,
+                                   end - start + MAXINSN);
+
+       *is64bit = al.map->dso->is_64_bit;
+       return len;
+}
+#endif
+
+/*
+ * Print the disassembled instruction sequences between the branches recorded
+ * in the sample's branch stack, followed by the block leading up to the
+ * sample IP.
+ *
+ * For each pair of adjacent entries the code from entries[i + 1].to up to
+ * the branch at entries[i].from is disassembled; the branch instruction
+ * itself is annotated with the PRED/MISPRED/INTX/ABORT flags and the cycle
+ * count of entries[i]. entries[0] is handled as the most recent branch: the
+ * final block runs from its target to sample->ip.
+ *
+ * At most max_blocks blocks are walked when max_blocks is non-zero.
+ * Without HAVE_UDIS86 this function prints nothing.
+ */
+static void print_sample_brstackasm(struct perf_sample *sample __maybe_unused,
+                                   struct thread *thread __maybe_unused,
+                                   struct perf_event_attr *attr __maybe_unused,
+                                   struct machine *machine __maybe_unused)
+{
+#ifdef HAVE_UDIS86
+       struct branch_stack *br = sample->branch_stack;
+       u64 start, end;
+       int i;
+       static bool ud_initialized = false;
+       static struct perf_ud ud;
+       char buffer[MAXBB];
+       int len;
+       bool last;
+       bool is64bit;
+       int nr;
+
+       if (!(br && br->nr))
+               return;
+       nr = br->nr;
+       if (max_blocks && nr > max_blocks + 1)
+               nr = max_blocks + 1;
+
+       /* The disassembler object is set up once and reused for all samples */
+       if (!ud_initialized) {
+               ud_initialized = true;
+               ud_init(&ud.ud_obj);
+               ud_set_syntax(&ud.ud_obj, UD_SYN_ATT);
+               ud_set_sym_resolver(&ud.ud_obj, dis_resolve);
+       }
+       ud.thread = thread;
+       ud.cpu = sample->cpu;
+
+       putchar('\n');
+       for (i = nr - 2; i >= 0; i--) {
+               /*
+                * NOTE(review): this per-entry line does not appear in the
+                * sample output of the changelog; it may be leftover debug
+                * output. Kept, but printed with the proper u64 format.
+                */
+               if (br->entries[i].from || br->entries[i].to)
+                       printf("%d: %" PRIx64 "-%" PRIx64 "\n", i,
+                               br->entries[i].from,
+                               br->entries[i].to);
+               start = br->entries[i + 1].to;
+               end = br->entries[i].from;
+
+               /*
+                * Leave extra bytes for the final jump instruction for
+                * which we don't know the length
+                */
+               len = grab_bb(buffer, start, end + MAXINSN,
+                               machine, thread, &is64bit,
+                               &ud.cpumode);
+               if (len <= 0)
+                       continue;
+
+               ud_set_mode(&ud.ud_obj, is64bit ? 64 : 32);
+               ud_set_pc(&ud.ud_obj, start);
+               ud_set_input_buffer(&ud.ud_obj, (uint8_t *)buffer, len);
+               last = false;
+               /* Stop right after the branch instruction at 'end' */
+               while (!last && ud_disassemble(&ud.ud_obj)) {
+                       if (ud_insn_ptr(&ud.ud_obj) ==
+                                       (uint8_t *)buffer + end - start) {
+                               printf("\t%016" PRIx64 "\t%-30s\t#%s%s%s%s",
+                                       ud_insn_off(&ud.ud_obj),
+                                       ud_insn_asm(&ud.ud_obj),
+                                       br->entries[i].flags.predicted ? " PRED" : "",
+                                       br->entries[i].flags.mispred ? " MISPRED" : "",
+                                       br->entries[i].flags.in_tx ? " INTX" : "",
+                                       br->entries[i].flags.abort ? " ABORT" : "");
+                               if (br->entries[i].flags.cycles)
+                                       printf(" %d cycles",
+                                              br->entries[i].flags.cycles);
+                               /* Terminate the line only after the cycle count */
+                               putchar('\n');
+                               last = true;
+                       } else {
+                               printf("\t%016" PRIx64 "\t%s\n",
+                                       ud_insn_off(&ud.ud_obj),
+                                       ud_insn_asm(&ud.ud_obj));
+                       }
+               }
+       }
+
+       /*
+        * Hit the branch? In this case we are already done, and the target
+        * has not been executed yet.
+        */
+       if (br->entries[0].from == sample->ip)
+               return;
+       if (br->entries[0].flags.abort)
+               return;
+
+       /*
+        * Print final block up to sample
+        */
+       start = br->entries[0].to;
+       end = sample->ip;
+       len = grab_bb(buffer, start, end, machine, thread, &is64bit,
+                       &ud.cpumode);
+       if (len <= 0) {
+               /* Print at least last IP if basic block did not work */
+               len = grab_bb(buffer, sample->ip, sample->ip + MAXINSN,
+                               machine, thread, &is64bit, &ud.cpumode);
+               if (len <= 0)
+                       return;
+               /* Arm the input only once a valid length is known */
+               ud_set_input_buffer(&ud.ud_obj, (uint8_t *)buffer, len);
+               ud_set_mode(&ud.ud_obj, is64bit ? 64 : 32);
+               ud_set_pc(&ud.ud_obj, sample->ip);
+               if (ud_disassemble(&ud.ud_obj))
+                       printf("\t%016" PRIx64 "\t%s\n",
+                              ud_insn_off(&ud.ud_obj),
+                              ud_insn_asm(&ud.ud_obj));
+               return;
+       }
+       ud_set_input_buffer(&ud.ud_obj, (uint8_t *)buffer, len);
+       ud_set_mode(&ud.ud_obj, is64bit ? 64 : 32);
+       ud_set_pc(&ud.ud_obj, start);
+       while (ud_disassemble(&ud.ud_obj) &&
+               ud_insn_ptr(&ud.ud_obj) <= (uint8_t *)buffer + end - start)
+               printf("\t%016" PRIx64 "\t%s\n", ud_insn_off(&ud.ud_obj),
+                              ud_insn_asm(&ud.ud_obj));
+#endif
+}
 
 static void print_sample_addr(struct perf_sample *sample,
                          struct thread *thread,
@@ -898,6 +1077,8 @@ print_rest:
        if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT))
                print_sample_bpf_output(sample);
 
+       if (PRINT_FIELD(BRSTACKASM))
+               print_sample_brstackasm(sample, thread, attr, machine);
        if (PRINT_FIELD(ASM))
                print_sample_asm(sample, thread, attr, al, machine);
 
@@ -2129,6 +2310,8 @@ int cmd_script(int argc, const char **argv, const char 
*prefix __maybe_unused)
                    "Show the mmap events"),
        OPT_BOOLEAN('\0', "show-switch-events", &script.show_switch_events,
                    "Show context switch events (if recorded)"),
+       OPT_INTEGER(0, "max-blocks", &max_blocks,
+                   "Maximum number of code blocks to dump with brstackasm"),
        OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
        OPT_BOOLEAN(0, "ns", &nanosecs,
                    "Use 9 decimal places when displaying time"),
-- 
2.5.5

Reply via email to