From: David Ahern <dsah...@gmail.com> 'perf sched timehist' provides an analysis of scheduling events.
Example usage: perf sched record -- sleep 1 perf sched timehist By default it shows the individual schedule events, including the wait time (time between sched-out and next sched-in events for the task), the task scheduling delay (time between wakeup and actually running) and run time for the task: time cpu task name wait time sch delay run time [tid/pid] (msec) (msec) (msec) -------------- ------ -------------------- --------- --------- --------- 79371.874569 [0011] gcc[31949] 0.014 0.000 1.148 79371.874591 [0010] gcc[31951] 0.000 0.000 0.024 79371.874603 [0010] migration/10[59] 3.350 0.004 0.011 79371.874604 [0011] <idle> 1.148 0.000 0.035 79371.874723 [0005] <idle> 0.016 0.000 1.383 79371.874746 [0005] gcc[31949] 0.153 0.078 0.022 ... Times are in msec.usec. Signed-off-by: David Ahern <dsah...@gmail.com> Signed-off-by: Namhyung Kim <namhy...@kernel.org> --- tools/perf/builtin-sched.c | 594 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 589 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index fb3441211e4b..c0ac0c9557e8 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -13,12 +13,14 @@ #include "util/cloexec.h" #include "util/thread_map.h" #include "util/color.h" +#include "util/stat.h" #include <subcmd/parse-options.h> #include "util/trace-event.h" #include "util/debug.h" +#include <linux/log2.h> #include <sys/prctl.h> #include <sys/resource.h> @@ -194,6 +196,29 @@ struct perf_sched { struct perf_sched_map map; }; +/* per thread run time data */ +struct thread_runtime { + u64 last_time; /* time of previous sched in/out event */ + u64 dt_run; /* run time */ + u64 dt_wait; /* time between CPU access (off cpu) */ + u64 dt_delay; /* time between wakeup and sched-in */ + u64 ready_to_run; /* time of wakeup */ + + struct stats run_stats; + u64 total_run_time; +}; + +/* per event run time data */ +struct evsel_runtime { + u64 *last_time; /* time this event was last seen per cpu */ + u32 ncpu; /* highest cpu slot allocated */ +}; + +/* track idle times per cpu */ +static struct thread **idle_threads; +static int idle_max_cpu; +static char idle_comm[] = "<idle>"; + static u64 get_nsecs(void) { struct timespec ts; @@ -1654,6 +1679,546 @@ static int perf_sched__read_events(struct perf_sched *sched) return rc; } +/* + * scheduling times are printed as msec.usec + */ +static inline void print_sched_time(unsigned long long nsecs, int width) +{ + unsigned long msecs; + unsigned long usecs; + + msecs = nsecs / NSEC_PER_MSEC; + nsecs -= msecs * NSEC_PER_MSEC; + usecs = nsecs / NSEC_PER_USEC; + printf("%*lu.%03lu ", width, msecs, usecs); +} + +/* + * returns runtime data for event, allocating memory for it the + * first time it is used. + */ +static struct evsel_runtime *perf_evsel__get_runtime(struct perf_evsel *evsel) +{ + struct evsel_runtime *r = evsel->priv; + + if (r == NULL) { + r = zalloc(sizeof(struct evsel_runtime)); + evsel->priv = r; + } + + return r; +} + +/* + * save last time event was seen per cpu + */ +static void perf_evsel__save_time(struct perf_evsel *evsel, + u64 timestamp, u32 cpu) +{ + struct evsel_runtime *r = perf_evsel__get_runtime(evsel); + + if (r == NULL) + return; + + if ((cpu >= r->ncpu) || (r->last_time == NULL)) { + int i, n = __roundup_pow_of_two(cpu+1); + void *p = r->last_time; + + p = realloc(r->last_time, n * sizeof(u64)); + if (!p) + return; + + r->last_time = p; + for (i = r->ncpu; i < n; ++i) + r->last_time[i] = (u64) 0; + + r->ncpu = n; + } + + r->last_time[cpu] = timestamp; +} + +/* returns last time this event was seen on the given cpu */ +static u64 perf_evsel__get_time(struct perf_evsel *evsel, u32 cpu) +{ + struct evsel_runtime *r = perf_evsel__get_runtime(evsel); + + if ((r == NULL) || (r->last_time == NULL) || (cpu >= r->ncpu)) + return 0; + + return r->last_time[cpu]; +} + +static int comm_width = 20; + +static char *timehist_get_commstr(struct thread *thread) +{ + static char str[32]; + const char *comm = thread__comm_str(thread); + pid_t tid = thread->tid; + pid_t pid = thread->pid_; + int n; + + if (pid == 0) + n = scnprintf(str, sizeof(str), "%s", comm); + + else if (tid != pid) + n = scnprintf(str, sizeof(str), "%s[%d/%d]", comm, tid, pid); + + else + n = scnprintf(str, sizeof(str), "%s[%d]", comm, tid); + + if (n > comm_width) + comm_width = n; + + return str; +} + +static void timehist_header(void) +{ + printf("%15s %6s ", "time", "cpu"); + + printf(" %-20s %9s %9s %9s", + "task name", "wait time", "sch delay", "run time"); + + printf("\n"); + + /* + * units row + */ + printf("%15s %-6s ", "", ""); + + printf(" %-20s %9s %9s %9s\n", "[tid/pid]", "(msec)", "(msec)", "(msec)"); + + /* + * separator + */ + printf("%.15s %.6s ", graph_dotted_line, graph_dotted_line); + + printf(" %.20s %.9s %.9s %.9s", + graph_dotted_line, graph_dotted_line, graph_dotted_line, + graph_dotted_line); + + printf("\n"); +} + +static void timehist_print_sample(struct perf_sample *sample, + struct thread *thread) +{ + struct thread_runtime *tr = thread__priv(thread); + char tstr[64]; + + timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr)); + printf("%15s [%04d] ", tstr, sample->cpu); + + printf(" %-*s ", comm_width, timehist_get_commstr(thread)); + + print_sched_time(tr->dt_wait, 6); + print_sched_time(tr->dt_delay, 6); + print_sched_time(tr->dt_run, 6); + printf("\n"); +} + +/* + * Explanation of delta-time stats: + * + * t = time of current schedule out event + * tprev = time of previous sched out event + * also time of schedule-in event for current task + * last_time = time of last sched change event for current task + * (i.e, time process was last scheduled out) + * ready_to_run = time of wakeup for current task + * + * -----|------------|------------|------------|------ + * last ready tprev t + * time to run + * + * |-------- dt_wait --------| + * |- dt_delay -|-- dt_run --| + * + * dt_run = run time of current task + * dt_wait = time between last schedule out event for task and tprev + * represents time spent off the cpu + * dt_delay = time between wakeup and schedule-in of task + */ + +static void timehist_update_runtime_stats(struct thread_runtime *r, + u64 t, u64 tprev) +{ + r->dt_delay = 0; + r->dt_wait = 0; + r->dt_run = 0; + if (tprev) { + r->dt_run = t - tprev; + if (r->ready_to_run) { + if (r->ready_to_run > tprev) + pr_debug("time travel: wakeup time for task > previous sched_switch event\n"); + else + r->dt_delay = tprev - r->ready_to_run; + } + + if (r->last_time > tprev) + pr_debug("time travel: last sched out time for task > previous sched_switch event\n"); + else if (r->last_time) + r->dt_wait = tprev - r->last_time; + } + + update_stats(&r->run_stats, r->dt_run); + r->total_run_time += r->dt_run; +} + +static bool is_idle_sample(struct perf_sample *sample, + struct perf_evsel *evsel) +{ + /* pid 0 == swapper == idle task */ + if (sample->pid == 0) + return true; + + if (strcmp(perf_evsel__name(evsel), "sched:sched_switch") == 0) { + if (perf_evsel__intval(evsel, sample, "prev_pid") == 0) + return true; + } + return false; +} + +/* + * Track idle stats per cpu by maintaining a local thread + * struct for the idle task on each cpu. + */ +static int init_idle_threads(int ncpu) +{ + int i; + + idle_threads = zalloc(ncpu * sizeof(struct thread *)); + if (!idle_threads) + return -ENOMEM; + + idle_max_cpu = ncpu - 1; + + /* allocate the actual thread struct if needed */ + for (i = 0; i < ncpu; ++i) { + idle_threads[i] = thread__new(0, 0); + if (idle_threads[i] == NULL) + return -ENOMEM; + + thread__set_comm(idle_threads[i], idle_comm, 0); + } + + return 0; +} + +static void free_idle_threads(void) +{ + int i; + + if (idle_threads == NULL) + return; + + for (i = 0; i <= idle_max_cpu; ++i) { + if ((idle_threads[i])) + thread__delete(idle_threads[i]); + } + + free(idle_threads); +} + +static struct thread *get_idle_thread(int cpu) +{ + /* + * expand/allocate array of pointers to local thread + * structs if needed + */ + if ((cpu >= idle_max_cpu) || (idle_threads == NULL)) { + int i, j = __roundup_pow_of_two(cpu+1); + void *p; + + p = realloc(idle_threads, j * sizeof(struct thread *)); + if (!p) + return NULL; + + idle_threads = (struct thread **) p; + i = idle_max_cpu ? idle_max_cpu + 1 : 0; + for (; i < j; ++i) + idle_threads[i] = NULL; + + idle_max_cpu = j; + } + + /* allocate a new thread struct if needed */ + if (idle_threads[cpu] == NULL) { + idle_threads[cpu] = thread__new(0, 0); + + if (idle_threads[cpu]) { + idle_threads[cpu]->tid = 0; + thread__set_comm(idle_threads[cpu], idle_comm, 0); + } + } + + return idle_threads[cpu]; +} + +/* + * handle runtime stats saved per thread + */ +static struct thread_runtime *thread__init_runtime(struct thread *thread) +{ + struct thread_runtime *r; + + r = zalloc(sizeof(struct thread_runtime)); + if (!r) + return NULL; + + init_stats(&r->run_stats); + thread__set_priv(thread, r); + + return r; +} + +static struct thread_runtime *thread__get_runtime(struct thread *thread) +{ + struct thread_runtime *tr; + + tr = thread__priv(thread); + if (tr == NULL) { + tr = thread__init_runtime(thread); + if (tr == NULL) + pr_debug("Failed to malloc memory for runtime data.\n"); + } + + return tr; +} + +static struct thread *timehist_get_thread(struct perf_sample *sample, + struct machine *machine, + struct perf_evsel *evsel) +{ + struct thread *thread; + + if (is_idle_sample(sample, evsel)) { + thread = get_idle_thread(sample->cpu); + if (thread == NULL) + pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu); + + } else { + thread = machine__findnew_thread(machine, sample->pid, sample->tid); + if (thread == NULL) { + pr_debug("Failed to get thread for tid %d. skipping sample.\n", + sample->tid); + } + } + + return thread; +} + +static bool timehist_skip_sample(struct thread *thread) +{ + bool rc = false; + + if (thread__is_filtered(thread)) + rc = true; + + return rc; +} + +static int timehist_sched_wakeup_event(struct perf_tool *tool __maybe_unused, + union perf_event *event __maybe_unused, + struct perf_evsel *evsel, + struct perf_sample *sample, + struct machine *machine) +{ + struct thread *thread; + struct thread_runtime *tr = NULL; + /* want pid of awakened task not pid in sample */ + const u32 pid = perf_evsel__intval(evsel, sample, "pid"); + + thread = machine__findnew_thread(machine, 0, pid); + if (thread == NULL) + return -1; + + tr = thread__get_runtime(thread); + if (tr == NULL) + return -1; + + if (tr->ready_to_run == 0) + tr->ready_to_run = sample->time; + + return 0; +} + +static int timehist_sched_change_event(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_evsel *evsel, + struct perf_sample *sample, + struct machine *machine) +{ + struct addr_location al; + struct thread *thread; + struct thread_runtime *tr = NULL; + u64 tprev; + int rc = 0; + + if (machine__resolve(machine, &al, sample) < 0) { + pr_err("problem processing %d event. skipping it\n", + event->header.type); + rc = -1; + goto out; + } + + thread = timehist_get_thread(sample, machine, evsel); + if (thread == NULL) { + rc = -1; + goto out; + } + + if (timehist_skip_sample(thread)) + goto out; + + tr = thread__get_runtime(thread); + if (tr == NULL) { + rc = -1; + goto out; + } + + tprev = perf_evsel__get_time(evsel, sample->cpu); + + timehist_update_runtime_stats(tr, sample->time, tprev); + timehist_print_sample(sample, thread); + +out: + if (tr) { + /* time of this sched_switch event becomes last time task seen */ + tr->last_time = sample->time; + + /* sched out event for task so reset ready to run time */ + tr->ready_to_run = 0; + } + + perf_evsel__save_time(evsel, sample->time, sample->cpu); + + return rc; +} + +static int timehist_sched_switch_event(struct perf_tool *tool, + union perf_event *event, + struct perf_evsel *evsel, + struct perf_sample *sample, + struct machine *machine __maybe_unused) +{ + return timehist_sched_change_event(tool, event, evsel, sample, machine); +} + +static int process_lost(struct perf_tool *tool __maybe_unused, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine __maybe_unused) +{ + char tstr[64]; + + timestamp__scnprintf_usec(sample->time, tstr, sizeof(tstr)); + printf("%15s ", tstr); + printf("lost %" PRIu64 " events on cpu %d\n", event->lost.lost, sample->cpu); + + return 0; +} + + +typedef int (*sched_handler)(struct perf_tool *tool, + union perf_event *event, + struct perf_evsel *evsel, + struct perf_sample *sample, + struct machine *machine); + +static int perf_timehist__process_sample(struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct perf_evsel *evsel, + struct machine *machine) +{ + struct perf_sched *sched = container_of(tool, struct perf_sched, tool); + int err = 0; + int this_cpu = sample->cpu; + + if (this_cpu > sched->max_cpu) + sched->max_cpu = this_cpu; + + if (evsel->handler != NULL) { + sched_handler f = evsel->handler; + + err = f(tool, event, evsel, sample, machine); + } + + return err; +} + +static int perf_sched__timehist(struct perf_sched *sched) +{ + const struct perf_evsel_str_handler handlers[] = { + { "sched:sched_switch", timehist_sched_switch_event, }, + { "sched:sched_wakeup", timehist_sched_wakeup_event, }, + { "sched:sched_wakeup_new", timehist_sched_wakeup_event, }, + }; + struct perf_data_file file = { + .path = input_name, + .mode = PERF_DATA_MODE_READ, + }; + + struct perf_session *session; + int err = -1; + + /* + * event handlers for timehist option + */ + sched->tool.sample = perf_timehist__process_sample; + sched->tool.mmap = perf_event__process_mmap; + sched->tool.comm = perf_event__process_comm; + sched->tool.exit = perf_event__process_exit; + sched->tool.fork = perf_event__process_fork; + sched->tool.lost = process_lost; + sched->tool.attr = perf_event__process_attr; + sched->tool.tracing_data = perf_event__process_tracing_data; + sched->tool.build_id = perf_event__process_build_id; + + sched->tool.ordered_events = true; + sched->tool.ordering_requires_timestamps = true; + + session = perf_session__new(&file, false, &sched->tool); + if (session == NULL) + return -ENOMEM; + + symbol__init(&session->header.env); + + setup_pager(); + + /* setup per-evsel handlers */ + if (perf_session__set_tracepoints_handlers(session, handlers)) + goto out; + + if (!perf_session__has_traces(session, "record -R")) + goto out; + + /* pre-allocate struct for per-CPU idle stats */ + sched->max_cpu = session->header.env.nr_cpus_online; + if (sched->max_cpu == 0) + sched->max_cpu = 4; + if (init_idle_threads(sched->max_cpu)) + goto out; + + timehist_header(); + + err = perf_session__process_events(session); + if (err) { + pr_err("Failed to process events, error %d", err); + goto out; + } + +out: + free_idle_threads(); + perf_session__delete(session); + + return err; +} + + static void print_bad_events(struct perf_sched *sched) { if (sched->nr_unordered_timestamps && sched->nr_timestamps) { @@ -1970,8 +2535,6 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) const struct option latency_options[] = { OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]", "sort by key(s): runtime, switch, avg, max"), - OPT_INCR('v', "verbose", &verbose, - "be more verbose (show symbol address, etc)"), OPT_INTEGER('C', "CPU", &sched.profile_cpu, "CPU to profile on"), OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, @@ -1983,8 +2546,6 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) const struct option replay_options[] = { OPT_UINTEGER('r', "repeat", &sched.replay_repeat, "repeat the workload replay N times (-1: infinite)"), - OPT_INCR('v', "verbose", &verbose, - "be more verbose (show symbol address, etc)"), OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"), OPT_BOOLEAN('f', "force", &sched.force, "don't complain, do it"), @@ -2001,6 +2562,16 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) "display given CPUs in map"), OPT_PARENT(sched_options) }; + const struct option timehist_options[] = { + OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name, + "file", "vmlinux pathname"), + OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name, + "file", "kallsyms pathname"), + OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory", + "Look for files with symbols relative to this directory"), + OPT_PARENT(sched_options) + }; + const char * const latency_usage[] = { "perf sched latency [<options>]", NULL @@ -2013,8 +2584,13 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) "perf sched map [<options>]", NULL }; + const char * const timehist_usage[] = { + "perf sched timehist [<options>]", + NULL + }; const char *const sched_subcommands[] = { "record", "latency", "map", - "replay", "script", NULL }; + "replay", "script", + "timehist", NULL }; const char *sched_usage[] = { NULL, NULL @@ -2077,6 +2653,14 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused) usage_with_options(replay_usage, replay_options); } return perf_sched__replay(&sched); + } else if (!strcmp(argv[0], "timehist")) { + if (argc) { + argc = parse_options(argc, argv, timehist_options, + timehist_usage, 0); + if (argc) + usage_with_options(timehist_usage, timehist_options); + } + return perf_sched__timehist(&sched); } else { usage_with_options(sched_usage, sched_options); } -- 2.10.1