Hi,

I was using nfsslower in bcc tools and noticed that it doesn't print out
slow commit operations, which sometimes cause long delays in NFS writes.
I tried adding code to trace slow commit operations and created the
attached patch.

Please let me hear your opinions.

NFS commits are quite slow and are not performed often, so the patch
shouldn't incur visible overhead.

Below is a sample output from the modified tool when writing a large
file using dd command.  Most commits are issued from kworker threads,
so the commit output may be mostly dropped when filtered by PID.

TIME     COMM           PID    T BYTES   OFF_KB   LAT(ms) FILENAME
...
16:46:11 dd             1212   W 1048576 651264     46.58 testfile
16:46:11 dd             1212   W 1048576 653312     54.41 testfile
16:46:11 dd             1212   W 1048576 654336     18.96 testfile
16:46:11 dd             1212   W 1048576 655360     49.05 testfile
16:46:11 dd             1212   W 1048576 657408     82.96 testfile
16:46:11 dd             1212   W 1048576 659456    109.25 testfile
16:46:12 dd             1212   W 1048576 660480    163.55 testfile
16:46:12 dd             1212   W 1048576 662528    205.44 testfile
16:46:13 dd             1212   W 1048576 663552    751.02 testfile
16:46:37 kworker/u8:5   1207   C 0       0        27449.05 testfile
16:46:37 kworker/u8:5   1207   C 0       0        26725.16 testfile
16:46:37 kworker/u8:5   1207   C 0       0        27592.04 testfile
16:46:37 kworker/u8:4   1206   C 0       0        22188.75 testfile
16:46:37 kworker/u8:4   1206   C 0       0        26092.59 testfile
16:46:37 kworker/u8:4   1206   C 0       0        27268.90 testfile
16:46:37 kworker/u8:4   1206   C 0       0        22303.24 testfile
16:46:37 kworker/u8:4   1206   C 0       0        27081.34 testfile
16:46:37 dd             1212   W 1048576 664576   24337.80 testfile
16:46:38 dd             1212   W 1048576 958464     61.77 testfile
16:46:38 dd             1212   W 1048576 960512     56.60 testfile
16:46:38 dd             1212   W 1048576 963584     55.75 testfile
16:46:38 dd             1212   W 1048576 965632     54.84 testfile
...

Best regards.
---
Kosuke TATSUKAWA  | 1st Platform Software Division
                  | NEC Solution Innovators
                  | tatsu-...@nec.com

------------------------------------------------------------------------
diff -rup a/tools/nfsslower.py b/tools/nfsslower.py
--- a/tools/nfsslower   2022-11-14 12:39:23.677073240 +0900
+++ b/tools/nfsslower   2022-11-14 14:44:49.960073240 +0900
@@ -9,6 +9,8 @@
 # This script traces some common NFS operations: read, write, opens and
 # getattr. It measures the time spent in these operations, and prints details
 # for each that exceeded a threshold.
+# The script also traces commit operations, which is specific to nfs and could
+# be pretty slow.
 #
 # WARNING: This adds low-overhead instrumentation to these NFS operations,
 # including reads and writes from the file system cache. Such reads and writes
@@ -25,6 +27,8 @@
 # Currently there aren't any tracepoints available for nfs_read_file,
 # nfs_write_file and nfs_open_file, nfs_getattr does have entry and exit
 # tracepoints but we chose to use kprobes for consistency
+# Raw tracepoints are used to trace nfs:nfs_initiate_commit and
+# nfs:nfs_commit_done.
 #
 # 31-Aug-2017   Samuel Nair created this. Should work with NFSv{3,4}
 
@@ -41,8 +45,8 @@ examples = """
     ./nfsslower -p 121  # trace pid 121 only
 """
 parser = argparse.ArgumentParser(
-    description="""Trace READ, WRITE, OPEN \
-and GETATTR NFS calls slower than a threshold,\
+    description="""Trace READ, WRITE, OPEN, GETATTR \
+and COMMIT NFS calls slower than a threshold,\
 supports NFSv{3,4}""",
     formatter_class=argparse.RawDescriptionHelpFormatter,
     epilog=examples)
@@ -66,11 +70,13 @@ bpf_text = """
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/dcache.h>
+#include <linux/nfs_fs.h>
 
 #define TRACE_READ 0
 #define TRACE_WRITE 1
 #define TRACE_OPEN 2
 #define TRACE_GETATTR 3
+#define TRACE_COMMIT 4
 
 struct val_t {
     u64 ts;
@@ -79,6 +85,12 @@ struct val_t {
     struct dentry *d;
 };
 
+struct commit_t {
+    u64 ts;
+    u64 offset;
+    u64 count;
+};
+
 struct data_t {
     // XXX: switch some to u32's when supported
     u64 ts_us;
@@ -93,6 +105,7 @@ struct data_t {
 
 BPF_HASH(entryinfo, u64, struct val_t);
 BPF_PERF_OUTPUT(events);
+BPF_HASH(commitinfo, u64, struct commit_t);
 
 int trace_rw_entry(struct pt_regs *ctx, struct kiocb *iocb,
                                 struct iov_iter *data)
@@ -227,6 +240,56 @@ int trace_getattr_return(struct pt_regs
     return trace_exit(ctx, TRACE_GETATTR);
 }
 
+int raw_nfs_initiate_commit(struct bpf_raw_tracepoint_args *ctx) {
+    u64 key = (u64)ctx->args[0];
+    struct nfs_commit_data *cd = (struct nfs_commit_data *)key;
+    struct commit_t c = { 0 };
+
+    c.ts = bpf_ktime_get_ns();
+    bpf_probe_read_kernel(&c.offset, sizeof(cd->args.offset),
+        &cd->args.offset);
+    bpf_probe_read_kernel(&c.count, sizeof(cd->args.count), &cd->args.count);
+    commitinfo.update(&key, &c);
+    return 0;
+}
+
+int raw_nfs_commit_done(struct bpf_raw_tracepoint_args *ctx) {
+    u64 key = (u64)ctx->args[1];
+    struct commit_t *cp = commitinfo.lookup(&key);
+
+    if (cp) {
+        struct nfs_commit_data *cd = (struct nfs_commit_data *)key;
+        struct nfs_open_context *p;
+        struct dentry *de;
+        struct qstr qs;
+        u64 ts = bpf_ktime_get_ns();
+        u64 delta_us = (ts - cp->ts) / 1000;
+        u32 pid = bpf_get_current_pid_tgid() >> 32;
+        struct data_t data = {.type = TRACE_COMMIT, .offset = cp->offset,
+            .size = cp->count, .ts_us = ts/1000, .delta_us = delta_us,
+            .pid = pid};
+
+        commitinfo.delete(&key);
+        bpf_get_current_comm(&data.task, sizeof(data.task));
+
+        if(FILTER_PID)
+            return 0;
+
+        if (FILTER_US)
+            return 0;
+
+        bpf_probe_read_kernel(&p, sizeof(p), &cd->context);
+        bpf_probe_read_kernel(&de, sizeof(de), &p->dentry);
+        bpf_probe_read_kernel(&qs, sizeof(qs), &de->d_name);
+        if (qs.len) {
+            bpf_probe_read_kernel(&data.file, sizeof(data.file),
+                (void *)qs.name);
+            events.perf_submit(ctx, &data, sizeof(data));
+        }
+    }
+    return 0;
+}
+
 """
 if min_ms == 0:
     bpf_text = bpf_text.replace('FILTER_US', '0')
@@ -253,6 +316,8 @@ def print_event(cpu, data, size):
         type = 'O'
     elif event.type == 3:
         type = 'G'
+    elif event.type == 4:
+        type = 'C'
 
     if(csv):
         print("%d,%s,%d,%s,%d,%d,%d,%s" % (
@@ -273,7 +338,18 @@ def print_event(cpu, data, size):
 # Currently specifically works for NFSv4, the other kprobes are generic
 # so it should work with earlier NFS versions
 
-b = BPF(text=bpf_text)
+# The following warning is shown on kernels after linux-5.18 when using bcc.
+# Add compile option to silence it.
+#   In file included from /virtual/main.c:7:
+#   In file included from include/linux/nfs_fs.h:31:
+#   In file included from include/linux/sunrpc/auth.h:13:
+#   In file included from include/linux/sunrpc/sched.h:19:
+#   include/linux/sunrpc/xdr.h:751:10: warning: result of comparison of 
constant 4611686018427387903 with expression of type '__u32' (aka 'unsigned 
int') is always false [-Wtautological-constant-out-of-range-compare]
+#           if (len > SIZE_MAX / sizeof(*p))
+#               ~~~ ^ ~~~~~~~~~~~~~~~~~~~~~
+#   1 warning generated.
+b = BPF(text=bpf_text,
+    cflags=["-Wno-tautological-constant-out-of-range-compare"])
 b.attach_kprobe(event="nfs_file_read", fn_name="trace_rw_entry")
 b.attach_kprobe(event="nfs_file_write", fn_name="trace_rw_entry")
 b.attach_kprobe(event="nfs4_file_open", fn_name="trace_file_open_entry")
@@ -286,6 +362,9 @@ b.attach_kretprobe(event="nfs4_file_open
 b.attach_kretprobe(event="nfs_file_open", fn_name="trace_file_open_return")
 b.attach_kretprobe(event="nfs_getattr", fn_name="trace_getattr_return")
 
+b.attach_raw_tracepoint("nfs_initiate_commit", "raw_nfs_initiate_commit")
+b.attach_raw_tracepoint("nfs_commit_done", "raw_nfs_commit_done")
+
 if(csv):
     print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE")
 else:

-=-=-=-=-=-=-=-=-=-=-=-
Links: You receive all messages sent to this group.
View/Reply Online (#2023): https://lists.iovisor.org/g/iovisor-dev/message/2023
Mute This Topic: https://lists.iovisor.org/mt/95039760/21656
Group Owner: iovisor-dev+ow...@lists.iovisor.org
Unsubscribe: https://lists.iovisor.org/g/iovisor-dev/unsub 
[arch...@mail-archive.com]
-=-=-=-=-=-=-=-=-=-=-=-


Reply via email to