Eelco Chaudron <[email protected]> writes:
> This addition uses the existing syscall probes to record statistics
> related to the OVS 'Unreasonably long ... ms poll interval' message.
> Basically, it records the min/max/average time between system poll
> calls. This can be used to determine if a long poll event has
> occurred during the capture.
>
> Signed-off-by: Eelco Chaudron <[email protected]>
> ---
> The long line warning can be ignored. I'm keeping the original
> output in the documentation.
>
> v2: - Calculate the average on display, not per sample.
> - Enhanced the documentation with additional log details.
> ---
Just some nits below, and otherwise looks good. Thanks!
> utilities/usdt-scripts/kernel_delay.py | 70 ++++++++++++++++++++++++-
> utilities/usdt-scripts/kernel_delay.rst | 17 +++++-
> 2 files changed, 83 insertions(+), 4 deletions(-)
>
> diff --git a/utilities/usdt-scripts/kernel_delay.py b/utilities/usdt-scripts/kernel_delay.py
> index 367d27e34..cbca7ef8b 100755
> --- a/utilities/usdt-scripts/kernel_delay.py
> +++ b/utilities/usdt-scripts/kernel_delay.py
> @@ -182,11 +182,27 @@ struct syscall_data_key_t {
> u32 syscall;
> };
>
> +struct long_poll_data_key_t {
> + u32 pid;
> + u32 tid;
> +};
> +
> +struct long_poll_data_t {
> + u64 count;
> + u64 total_ns;
> + u64 min_ns;
> + u64 max_ns;
> +};
> +
> BPF_HASH(syscall_start, u64, u64);
> BPF_HASH(syscall_data, struct syscall_data_key_t, struct syscall_data_t);
> +BPF_HASH(long_poll_start, u64, u64);
> +BPF_HASH(long_poll_data, struct long_poll_data_key_t, struct long_poll_data_t);
>
> TRACEPOINT_PROBE(raw_syscalls, sys_enter) {
> u64 pid_tgid = bpf_get_current_pid_tgid();
> + struct long_poll_data_t *val, init_val = {.min_ns = U64_MAX};
> + struct long_poll_data_key_t key;
>
> if (!capture_enabled(pid_tgid))
> return 0;
> @@ -194,6 +210,29 @@ TRACEPOINT_PROBE(raw_syscalls, sys_enter) {
> u64 t = bpf_ktime_get_ns();
> syscall_start.update(&pid_tgid, &t);
>
> + /* Do long poll handling from here on. */
> + if (args->id != <SYSCALL_POLL_ID>)
> + return 0;
> +
> + u64 *start_ns = long_poll_start.lookup(&pid_tgid);
> +
> + if (!start_ns || *start_ns == 0)
> + return 0;
> +
> + key.pid = pid_tgid >> 32;
> + key.tid = (u32)pid_tgid;
> +
> + val = long_poll_data.lookup_or_try_init(&key, &init_val);
> + if (val) {
> + u64 delta = t - *start_ns;
> + val->count++;
> + val->total_ns += delta;
> + if (delta > val->max_ns)
> + val->max_ns = delta;
> + else if (delta < val->min_ns)
Should this really be an else? It's possible that min will still be
U64_MAX here, so we might want to drop the 'else' and make sure min gets
populated on the first time through.

That said, I can also get behind the idea that after the first couple of
samples it likely won't matter, because we'll have dropped into the 'min'
case anyway.
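Untested, but something like this is what I had in mind, i.e. the same
update logic with the 'else' dropped:

    val = long_poll_data.lookup_or_try_init(&key, &init_val);
    if (val) {
        u64 delta = t - *start_ns;
        val->count++;
        val->total_ns += delta;
        if (delta > val->max_ns)
            val->max_ns = delta;
        /* No 'else', so min also gets set on the first sample. */
        if (delta < val->min_ns)
            val->min_ns = delta;
    }

That way the very first sample initializes both min and max.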
> + val->min_ns = delta;
> + }
> +
> return 0;
> }
>
> @@ -206,6 +245,12 @@ TRACEPOINT_PROBE(raw_syscalls, sys_exit) {
> if (!capture_enabled(pid_tgid))
> return 0;
>
> + u64 t = bpf_ktime_get_ns();
> +
> + if (args->id == <SYSCALL_POLL_ID>) {
> + long_poll_start.update(&pid_tgid, &t);
> + }
> +
> key.pid = pid_tgid >> 32;
> key.tid = (u32)pid_tgid;
> key.syscall = args->id;
> @@ -217,7 +262,7 @@ TRACEPOINT_PROBE(raw_syscalls, sys_exit) {
>
> val = syscall_data.lookup_or_try_init(&key, &zero);
> if (val) {
> - u64 delta = bpf_ktime_get_ns() - *start_ns;
> + u64 delta = t - *start_ns;
> val->count++;
> val->total_ns += delta;
> if (delta > val->worst_ns)
> @@ -1039,6 +1084,9 @@ def process_results(syscall_events=None, trigger_delta=None):
> threads_syscall = {k.tid for k, _ in bpf["syscall_data"].items()
> if k.syscall != 0xffffffff}
>
> + threads_long_poll = {k.tid for k, _ in bpf["long_poll_data"].items()
> + if k.pid != 0xffffffff}
> +
> threads_run = {k.tid for k, _ in bpf["run_data"].items()
> if k.pid != 0xffffffff}
>
> @@ -1055,7 +1103,8 @@ def process_results(syscall_events=None, trigger_delta=None):
> if k.pid != 0xffffffff}
>
> threads = sorted(threads_syscall | threads_run | threads_ready |
> - threads_stopped | threads_hardirq | threads_softirq,
> + threads_stopped | threads_hardirq | threads_softirq |
> + threads_long_poll,
> key=lambda x: get_thread_name(options.pid, x))
>
> #
> @@ -1099,6 +1148,21 @@ def process_results(syscall_events=None, trigger_delta=None):
> print("{}{:20.20} {:6} {:10} {:16,}".format(
> indent, "TOTAL( - poll):", "", total_count, total_ns))
>
> + #
> + # LONG POLL STATISTICS
> + #
> + for k, v in filter(lambda t: t[0].tid == thread,
> + bpf["long_poll_data"].items()):
> +
> + print("\n{:10} {:16} {}\n{}{:10} {:>16} {:>16} {:>16}".format(
> + "", "", "[LONG POLL STATISTICS]", indent,
> + "COUNT", "AVERAGE ns", "MIN ns", "MAX ns"))
> +
> + print("{}{:10} {:16,} {:16,} {:16,}".format(
> + indent, v.count, int(v.total_ns / v.count),
> + v.min_ns, v.max_ns))
On the off chance that count == 0 (which should *NEVER* happen), should
we add some kind of try/except? It's only theoretical, I think, because
count should be populated as long as items() returns an entry... so
probably not a high priority.
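If we do want to be defensive here, an untested alternative that avoids
the division without a try/except could be something like:

    avg_ns = int(v.total_ns / v.count) if v.count else 0

    print("{}{:10} {:16,} {:16,} {:16,}".format(
        indent, v.count, avg_ns, v.min_ns, v.max_ns))

But again, it's only theoretical, so feel free to leave it as is.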
> + break
> +
> #
> # THREAD RUN STATISTICS
> #
> @@ -1429,6 +1493,8 @@ def main():
> source = source.replace("<STACK_TRACE_ENABLED>", "true"
> if options.stack_trace_size > 0 else "false")
>
> + source = source.replace("<SYSCALL_POLL_ID>", str(poll_id))
> +
> source = source.replace("<INSTALL_OVS_DP_UPCALL_PROBE>", "0"
> if options.skip_upcall_stats else "1")
>
> diff --git a/utilities/usdt-scripts/kernel_delay.rst b/utilities/usdt-scripts/kernel_delay.rst
> index 41620d760..0a2a430a5 100644
> --- a/utilities/usdt-scripts/kernel_delay.rst
> +++ b/utilities/usdt-scripts/kernel_delay.rst
> @@ -67,13 +67,17 @@ with the ``--pid`` option.
> read 0 1 1,292 1,292
> TOTAL( - poll): 519 144,405,334
>
> + [LONG POLL STATISTICS]
> + COUNT AVERAGE ns MIN ns MAX ns
> + 58 76,773 7,388 234,129
> +
> [THREAD RUN STATISTICS]
> SCHED_CNT TOTAL ns MIN ns MAX ns
> - 6 136,764,071 1,480 115,146,424
> + 6 136,764,071 1,480 115,146,424
>
> [THREAD READY STATISTICS]
> SCHED_CNT TOTAL ns MAX ns
> - 7 11,334 6,636
> + 7 11,334 6,636
>
> [THREAD STOPPED STATISTICS]
> STOP_CNT TOTAL ns MAX ns
> @@ -104,6 +108,7 @@ For this, it displays the thread's id (``TID``) and name (``THREAD``),
> followed by resource-specific data. Which are:
>
> - ``SYSCALL STATISTICS``
> +- ``LONG POLL STATISTICS``
> - ``THREAD RUN STATISTICS``
> - ``THREAD READY STATISTICS``
> - ``THREAD STOPPED STATISTICS``
> @@ -129,6 +134,14 @@ Note that it only counts calls that started and stopped during the
> measurement interval!
>
>
> +``LONG POLL STATISTICS``
> +~~~~~~~~~~~~~~~~~~~~~~
> +``LONG POLL STATISTICS`` tell you how long the thread was running between two
> +poll system calls. This relates to the 'Unreasonably long ... ms poll interval'
> +message reported by ovs-vswitchd. More details about this message can be found
> +in the example section.
> +
> +
> ``THREAD RUN STATISTICS``
> ~~~~~~~~~~~~~~~~~~~~~~~~~
> ``THREAD RUN STATISTICS`` tell you how long the thread was running on a CPU