On Tue, Nov 15, 2022 at 2:04 AM TATSUKAWA KOSUKE(立川 江介) <[email protected]> wrote: > > Hi, > > I was using nfsslower in bcc tools and noticed that it doesn't print out > slow commit operations, which sometimes cause long delays in NFS writes. > I tried adding code to trace slow commit operations and created the > attached patch.
The concept and code looks okay to me. One thing is you could use RAW_TRACEPOINT_PROBE macro to be consistent with some other bcc tools raw tracepoint usages. Please submit a pull request. > > Please let me hear your opinions. > > NFS commits are quite slow and are not performed often, so the patch > shouldn't incur visible overhead. > > Below is a sample output from the modified tool when writing a large > file using dd command. Most commits are issued from kworker threads, > so the commit output may be mostly dropped when filtered by PID. > > TIME COMM PID T BYTES OFF_KB LAT(ms) FILENAME > ... > 16:46:11 dd 1212 W 1048576 651264 46.58 testfile > 16:46:11 dd 1212 W 1048576 653312 54.41 testfile > 16:46:11 dd 1212 W 1048576 654336 18.96 testfile > 16:46:11 dd 1212 W 1048576 655360 49.05 testfile > 16:46:11 dd 1212 W 1048576 657408 82.96 testfile > 16:46:11 dd 1212 W 1048576 659456 109.25 testfile > 16:46:12 dd 1212 W 1048576 660480 163.55 testfile > 16:46:12 dd 1212 W 1048576 662528 205.44 testfile > 16:46:13 dd 1212 W 1048576 663552 751.02 testfile > 16:46:37 kworker/u8:5 1207 C 0 0 27449.05 testfile > 16:46:37 kworker/u8:5 1207 C 0 0 26725.16 testfile > 16:46:37 kworker/u8:5 1207 C 0 0 27592.04 testfile > 16:46:37 kworker/u8:4 1206 C 0 0 22188.75 testfile > 16:46:37 kworker/u8:4 1206 C 0 0 26092.59 testfile > 16:46:37 kworker/u8:4 1206 C 0 0 27268.90 testfile > 16:46:37 kworker/u8:4 1206 C 0 0 22303.24 testfile > 16:46:37 kworker/u8:4 1206 C 0 0 27081.34 testfile > 16:46:37 dd 1212 W 1048576 664576 24337.80 testfile > 16:46:38 dd 1212 W 1048576 958464 61.77 testfile > 16:46:38 dd 1212 W 1048576 960512 56.60 testfile > 16:46:38 dd 1212 W 1048576 963584 55.75 testfile > 16:46:38 dd 1212 W 1048576 965632 54.84 testfile > ... > > Best regards. > --- > Kosuke TATSUKAWA | 1st Platform Software Division > | NEC Solution Innovators > | [email protected] > > ------------------------------------------------------------------------ > diff -rup a/tools/nfsslower.py b/tools/nfsslower.py > --- a/tools/nfsslower 2022-11-14 12:39:23.677073240 +0900 > +++ b/tools/nfsslower 2022-11-14 14:44:49.960073240 +0900 > @@ -9,6 +9,8 @@ > # This script traces some common NFS operations: read, write, opens and > # getattr. It measures the time spent in these operations, and prints details > # for each that exceeded a threshold. > +# The script also traces commit operations, which is specific to nfs and > could > +# be pretty slow. > # > # WARNING: This adds low-overhead instrumentation to these NFS operations, > # including reads and writes from the file system cache. Such reads and > writes > @@ -25,6 +27,8 @@ > # Currently there aren't any tracepoints available for nfs_read_file, > # nfs_write_file and nfs_open_file, nfs_getattr does have entry and exit > # tracepoints but we chose to use kprobes for consistency > +# Raw tracepoints are used to trace nfs:nfs_initiate_commit and > +# nfs:nfs_commit_done. > # > # 31-Aug-2017 Samuel Nair created this. Should work with NFSv{3,4} > > @@ -41,8 +45,8 @@ examples = """ > ./nfsslower -p 121 # trace pid 121 only > """ > parser = argparse.ArgumentParser( > - description="""Trace READ, WRITE, OPEN \ > -and GETATTR NFS calls slower than a threshold,\ > + description="""Trace READ, WRITE, OPEN, GETATTR \ > +and COMMIT NFS calls slower than a threshold,\ > supports NFSv{3,4}""", > formatter_class=argparse.RawDescriptionHelpFormatter, > epilog=examples) > @@ -66,11 +70,13 @@ bpf_text = """ > #include <linux/fs.h> > #include <linux/sched.h> > #include <linux/dcache.h> > +#include <linux/nfs_fs.h> > > #define TRACE_READ 0 > #define TRACE_WRITE 1 > #define TRACE_OPEN 2 > #define TRACE_GETATTR 3 > +#define TRACE_COMMIT 4 > > struct val_t { > u64 ts; > @@ -79,6 +85,12 @@ struct val_t { > struct dentry *d; > }; > > +struct commit_t { > + u64 ts; > + u64 offset; > + u64 count; > +}; > + > struct data_t { > // XXX: switch some to u32's when supported > u64 ts_us; > @@ -93,6 +105,7 @@ struct data_t { > > BPF_HASH(entryinfo, u64, struct val_t); > BPF_PERF_OUTPUT(events); > +BPF_HASH(commitinfo, u64, struct commit_t); > > int trace_rw_entry(struct pt_regs *ctx, struct kiocb *iocb, > struct iov_iter *data) > @@ -227,6 +240,56 @@ int trace_getattr_return(struct pt_regs > return trace_exit(ctx, TRACE_GETATTR); > } > > +int raw_nfs_initiate_commit(struct bpf_raw_tracepoint_args *ctx) { > + u64 key = (u64)ctx->args[0]; > + struct nfs_commit_data *cd = (struct nfs_commit_data *)key; > + struct commit_t c = { 0 }; > + > + c.ts = bpf_ktime_get_ns(); > + bpf_probe_read_kernel(&c.offset, sizeof(cd->args.offset), > + &cd->args.offset); > + bpf_probe_read_kernel(&c.count, sizeof(cd->args.count), &cd->args.count); > + commitinfo.update(&key, &c); > + return 0; > +} > + > +int raw_nfs_commit_done(struct bpf_raw_tracepoint_args *ctx) { > + u64 key = (u64)ctx->args[1]; > + struct commit_t *cp = commitinfo.lookup(&key); > + > + if (cp) { > + struct nfs_commit_data *cd = (struct nfs_commit_data *)key; > + struct nfs_open_context *p; > + struct dentry *de; > + struct qstr qs; > + u64 ts = bpf_ktime_get_ns(); > + u64 delta_us = (ts - cp->ts) / 1000; > + u32 pid = bpf_get_current_pid_tgid() >> 32; > + struct data_t data = {.type = TRACE_COMMIT, .offset = cp->offset, > + .size = cp->count, .ts_us = ts/1000, .delta_us = delta_us, > + .pid = pid}; > + > + commitinfo.delete(&key); > + bpf_get_current_comm(&data.task, sizeof(data.task)); > + > + if(FILTER_PID) > + return 0; > + > + if (FILTER_US) > + return 0; > + > + bpf_probe_read_kernel(&p, sizeof(p), &cd->context); > + bpf_probe_read_kernel(&de, sizeof(de), &p->dentry); > + bpf_probe_read_kernel(&qs, sizeof(qs), &de->d_name); > + if (qs.len) { > + bpf_probe_read_kernel(&data.file, sizeof(data.file), > + (void *)qs.name); > + events.perf_submit(ctx, &data, sizeof(data)); > + } > + } > + return 0; > +} > + > """ > if min_ms == 0: > bpf_text = bpf_text.replace('FILTER_US', '0') > @@ -253,6 +316,8 @@ def print_event(cpu, data, size): > type = 'O' > elif event.type == 3: > type = 'G' > + elif event.type == 4: > + type = 'C' > > if(csv): > print("%d,%s,%d,%s,%d,%d,%d,%s" % ( > @@ -273,7 +338,18 @@ def print_event(cpu, data, size): > # Currently specifically works for NFSv4, the other kprobes are generic > # so it should work with earlier NFS versions > > -b = BPF(text=bpf_text) > +# The following warning is shown on kernels after linux-5.18 when using bcc. > +# Add compile option to silence it. > +# In file included from /virtual/main.c:7: > +# In file included from include/linux/nfs_fs.h:31: > +# In file included from include/linux/sunrpc/auth.h:13: > +# In file included from include/linux/sunrpc/sched.h:19: > +# include/linux/sunrpc/xdr.h:751:10: warning: result of comparison of > constant 4611686018427387903 with expression of type '__u32' (aka 'unsigned > int') is always false [-Wtautological-constant-out-of-range-compare] > +# if (len > SIZE_MAX / sizeof(*p)) > +# ~~~ ^ ~~~~~~~~~~~~~~~~~~~~~ > +# 1 warning generated. > +b = BPF(text=bpf_text, > + cflags=["-Wno-tautological-constant-out-of-range-compare"]) > b.attach_kprobe(event="nfs_file_read", fn_name="trace_rw_entry") > b.attach_kprobe(event="nfs_file_write", fn_name="trace_rw_entry") > b.attach_kprobe(event="nfs4_file_open", fn_name="trace_file_open_entry") > @@ -286,6 +362,9 @@ b.attach_kretprobe(event="nfs4_file_open > b.attach_kretprobe(event="nfs_file_open", fn_name="trace_file_open_return") > b.attach_kretprobe(event="nfs_getattr", fn_name="trace_getattr_return") > > +b.attach_raw_tracepoint("nfs_initiate_commit", "raw_nfs_initiate_commit") > +b.attach_raw_tracepoint("nfs_commit_done", "raw_nfs_commit_done") > + > if(csv): > print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE") > else: > > > > -=-=-=-=-=-=-=-=-=-=-=- Links: You receive all messages sent to this group. View/Reply Online (#2024): https://lists.iovisor.org/g/iovisor-dev/message/2024 Mute This Topic: https://lists.iovisor.org/mt/95039760/21656 Group Owner: [email protected] Unsubscribe: https://lists.iovisor.org/g/iovisor-dev/unsub [[email protected]] -=-=-=-=-=-=-=-=-=-=-=-
