On Thu, 2025-09-25 at 12:45 +0200, Max Kellermann wrote: > This patch adds trace points to the Ceph filesystem MDS client: >
It's a great idea. We really need in this. > - request submission (CEPH_MSG_CLIENT_REQUEST) and completion > (CEPH_MSG_CLIENT_REPLY) > - capabilities (CEPH_MSG_CLIENT_CAPS) > > These are the central pieces that are useful for analyzing MDS > latency/performance problems from the client's perspective. > > In the long run, all doutc() calls should be replaced with > tracepoints. This way, the Ceph filesystem can be traced at any time I don't think that we need to remove the doutc. It has different goal and it need for deeper debug and development. Also, it's not practical to remove all of it. > (without spamming the kernel log). Additionally, trace points can be If you don't enable the dynamic debug, then you have no output from doutc. > used in BPF programs (which can even deference the pointer parameters > and extract more values). Totally agreed. > > Signed-off-by: Max Kellermann <[email protected]> > --- > fs/ceph/caps.c | 4 + > fs/ceph/mds_client.c | 20 ++- > fs/ceph/super.c | 3 + > include/trace/events/ceph.h | 234 ++++++++++++++++++++++++++++++++++++ > 4 files changed, 259 insertions(+), 2 deletions(-) > create mode 100644 include/trace/events/ceph.h > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c > index b1a8ff612c41..2f663972da99 100644 > --- a/fs/ceph/caps.c > +++ b/fs/ceph/caps.c > @@ -18,6 +18,7 @@ > #include "crypto.h" > #include <linux/ceph/decode.h> > #include <linux/ceph/messenger.h> > +#include <trace/events/ceph.h> > > /* > * Capability management > @@ -4452,6 +4453,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, > session->s_mds, ceph_cap_op_name(op), vino.ino, vino.snap, inode, > seq, issue_seq, mseq); > > + trace_ceph_handle_caps(mdsc, session, op, &vino, ceph_inode(inode), > + seq, issue_seq, mseq); > + > mutex_lock(&session->s_mutex); > > if (!inode) { > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > index 3bc72b47fe4d..90e4268b24f9 100644 > --- a/fs/ceph/mds_client.c > +++ b/fs/ceph/mds_client.c > @@ -24,6 +24,7 @@ > #include <linux/ceph/pagelist.h> > #include <linux/ceph/auth.h> > #include <linux/ceph/debugfs.h> > +#include <trace/events/ceph.h> > > #define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE) > > @@ -3282,6 +3283,8 @@ static void complete_request(struct ceph_mds_client > *mdsc, > { > req->r_end_latency = ktime_get(); > > + trace_ceph_mdsc_complete_request(mdsc, req); > + > if (req->r_callback) > req->r_callback(mdsc, req); > complete_all(&req->r_completion); > @@ -3413,6 +3416,8 @@ static int __send_request(struct ceph_mds_session > *session, > { > int err; > > + trace_ceph_mdsc_send_request(session, req); > + > err = __prepare_send_request(session, req, drop_cap_releases); > if (!err) { > ceph_msg_get(req->r_request); > @@ -3464,6 +3469,8 @@ static void __do_request(struct ceph_mds_client *mdsc, > } > if (mdsc->mdsmap->m_epoch == 0) { > doutc(cl, "no mdsmap, waiting for map\n"); > + trace_ceph_mdsc_suspend_request(mdsc, session, req, > + > ceph_mdsc_suspend_reason_no_mdsmap); > list_add(&req->r_wait, &mdsc->waiting_for_map); > return; > } > @@ -3485,6 +3492,8 @@ static void __do_request(struct ceph_mds_client *mdsc, > goto finish; > } > doutc(cl, "no mds or not active, waiting for map\n"); > + trace_ceph_mdsc_suspend_request(mdsc, session, req, > + > ceph_mdsc_suspend_reason_no_active_mds); > list_add(&req->r_wait, &mdsc->waiting_for_map); > return; > } > @@ -3530,9 +3539,11 @@ static void __do_request(struct ceph_mds_client *mdsc, > * it to the mdsc queue. > */ > if (session->s_state == CEPH_MDS_SESSION_REJECTED) { > - if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER)) > + if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER)) { > + trace_ceph_mdsc_suspend_request(mdsc, session, > req, > + > ceph_mdsc_suspend_reason_rejected); > list_add(&req->r_wait, &mdsc->waiting_for_map); > - else > + } else > err = -EACCES; > goto out_session; > } > @@ -3546,6 +3557,8 @@ static void __do_request(struct ceph_mds_client *mdsc, > if (random) > req->r_resend_mds = mds; > } > + trace_ceph_mdsc_suspend_request(mdsc, session, req, > + > ceph_mdsc_suspend_reason_session); > list_add(&req->r_wait, &session->s_waiting); > goto out_session; > } > @@ -3646,6 +3659,7 @@ static void __wake_requests(struct ceph_mds_client > *mdsc, > list_del_init(&req->r_wait); > doutc(cl, " wake request %p tid %llu\n", req, > req->r_tid); > + trace_ceph_mdsc_resume_request(mdsc, req); > __do_request(mdsc, req); > } > } > @@ -3672,6 +3686,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, > int mds) > req->r_session->s_mds == mds) { > doutc(cl, " kicking tid %llu\n", req->r_tid); > list_del_init(&req->r_wait); > + trace_ceph_mdsc_resume_request(mdsc, req); > __do_request(mdsc, req); > } > } > @@ -3718,6 +3733,7 @@ int ceph_mdsc_submit_request(struct ceph_mds_client > *mdsc, struct inode *dir, > doutc(cl, "submit_request on %p for inode %p\n", req, dir); > mutex_lock(&mdsc->mutex); > __register_request(mdsc, req, dir); > + trace_ceph_mdsc_submit_request(mdsc, req); > __do_request(mdsc, req); > err = req->r_err; > mutex_unlock(&mdsc->mutex); > diff --git a/fs/ceph/super.c b/fs/ceph/super.c > index c3eb651862c5..f119d7260496 100644 > --- a/fs/ceph/super.c > +++ b/fs/ceph/super.c > @@ -30,6 +30,9 @@ > > #include <uapi/linux/magic.h> > > +#define CREATE_TRACE_POINTS > +#include <trace/events/ceph.h> > + > static DEFINE_SPINLOCK(ceph_fsc_lock); > static LIST_HEAD(ceph_fsc_list); > > diff --git a/include/trace/events/ceph.h b/include/trace/events/ceph.h > new file mode 100644 > index 000000000000..08cb0659fbfc > --- /dev/null > +++ b/include/trace/events/ceph.h > @@ -0,0 +1,234 @@ > +/* SPDX-License-Identifier: GPL-2.0-or-later */ > +/* Ceph filesystem support module tracepoints > + * > + * Copyright (C) 2025 IONOS SE. All Rights Reserved. > + * Written by Max Kellermann ([email protected]) > + */ > +#undef TRACE_SYSTEM > +#define TRACE_SYSTEM ceph > + > +#if !defined(_TRACE_CEPH_H) || defined(TRACE_HEADER_MULTI_READ) > +#define _TRACE_CEPH_H > + > +#include <linux/tracepoint.h> > + > +#define ceph_mdsc_suspend_reasons > \ > + EM(ceph_mdsc_suspend_reason_no_mdsmap, "no-mdsmap") > \ > + EM(ceph_mdsc_suspend_reason_no_active_mds, "no-active-mds") > \ > + EM(ceph_mdsc_suspend_reason_rejected, "rejected") > \ > + E_(ceph_mdsc_suspend_reason_session, "session") > + > +#ifndef __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY > +#define __NETFS_DECLARE_TRACE_ENUMS_ONCE_ONLY > + > +#undef EM > +#undef E_ > +#define EM(a, b) a, > +#define E_(a, b) a > + > +enum ceph_mdsc_suspend_reason { ceph_mdsc_suspend_reasons } __mode(byte); It's strange to see the enum with only one item. Are you going to extend it in the near future? > + > +#endif > + > +/* > + * Export enum symbols via userspace. > + */ > +#undef EM > +#undef E_ > +#define EM(a, b) TRACE_DEFINE_ENUM(a); > +#define E_(a, b) TRACE_DEFINE_ENUM(a); > + > +ceph_mdsc_suspend_reasons; > + > +/* > + * Now redefine the EM() and E_() macros to map the enums to the strings that > + * will be printed in the output. > + */ > +#undef EM > +#undef E_ > +#define EM(a, b) { a, b }, > +#define E_(a, b) { a, b } > + > +TRACE_EVENT(ceph_mdsc_submit_request, > + TP_PROTO(struct ceph_mds_client *mdsc, > + struct ceph_mds_request *req), > + > + TP_ARGS(mdsc, req), > + > + TP_STRUCT__entry( > + __field(u64, tid) > + __field(int, op) > + __field(u64, ino) > + __field(u64, snap) > + ), > + > + TP_fast_assign( > + struct inode *inode; > + > + __entry->tid = req->r_tid; > + __entry->op = req->r_op; > + > + inode = req->r_inode; > + if (inode == NULL && req->r_dentry) > + inode = d_inode(req->r_dentry); > + > + if (inode) { > + __entry->ino = ceph_ino(inode); > + __entry->snap = ceph_snap(inode); > + } else { > + __entry->ino = __entry->snap = 0; > + } > + ), > + > + TP_printk("R=%llu op=%s ino=%llx,%llx", > + __entry->tid, > + ceph_mds_op_name(__entry->op), > + __entry->ino, __entry->snap) > +); > + > +TRACE_EVENT(ceph_mdsc_suspend_request, > + TP_PROTO(struct ceph_mds_client *mdsc, > + struct ceph_mds_session *session, > + struct ceph_mds_request *req, > + enum ceph_mdsc_suspend_reason reason), > + > + TP_ARGS(mdsc, session, req, reason), > + > + TP_STRUCT__entry( > + __field(u64, tid) > + __field(int, op) > + __field(int, mds) > + __field(enum ceph_mdsc_suspend_reason, reason) > + ), > + > + TP_fast_assign( > + __entry->tid = req->r_tid; > + __entry->op = req->r_op; > + __entry->mds = session ? session->s_mds : -1; > + __entry->reason = reason; > + ), > + > + TP_printk("R=%llu op=%s reason=%s", > + __entry->tid, > + ceph_mds_op_name(__entry->op), > + __print_symbolic(__entry->reason, ceph_mdsc_suspend_reasons)) > +); > + > +TRACE_EVENT(ceph_mdsc_resume_request, > + TP_PROTO(struct ceph_mds_client *mdsc, > + struct ceph_mds_request *req), > + > + TP_ARGS(mdsc, req), > + > + TP_STRUCT__entry( > + __field(u64, tid) > + __field(int, op) > + ), > + > + TP_fast_assign( > + __entry->tid = req->r_tid; > + __entry->op = req->r_op; > + ), > + > + TP_printk("R=%llu op=%s", > + __entry->tid, > + ceph_mds_op_name(__entry->op)) > +); > + > +TRACE_EVENT(ceph_mdsc_send_request, > + TP_PROTO(struct ceph_mds_session *session, > + struct ceph_mds_request *req), > + > + TP_ARGS(session, req), > + > + TP_STRUCT__entry( > + __field(u64, tid) > + __field(int, op) > + __field(int, mds) > + ), > + > + TP_fast_assign( > + __entry->tid = req->r_tid; > + __entry->op = req->r_op; > + __entry->mds = session->s_mds; > + ), > + > + TP_printk("R=%llu op=%s mds=%d", > + __entry->tid, > + ceph_mds_op_name(__entry->op), > + __entry->mds) > +); > + > +TRACE_EVENT(ceph_mdsc_complete_request, > + TP_PROTO(struct ceph_mds_client *mdsc, > + struct ceph_mds_request *req), > + > + TP_ARGS(mdsc, req), > + > + TP_STRUCT__entry( > + __field(u64, tid) > + __field(int, op) > + __field(int, err) > + __field(unsigned long, latency_ns) > + ), > + > + TP_fast_assign( > + __entry->tid = req->r_tid; > + __entry->op = req->r_op; > + __entry->err = req->r_err; > + __entry->latency_ns = req->r_end_latency - req->r_start_latency; > + ), > + > + TP_printk("R=%llu op=%s err=%d latency_ns=%lu", > + __entry->tid, > + ceph_mds_op_name(__entry->op), > + __entry->err, > + __entry->latency_ns) > +); > + > +TRACE_EVENT(ceph_handle_caps, > + TP_PROTO(struct ceph_mds_client *mdsc, > + struct ceph_mds_session *session, > + int op, > + const struct ceph_vino *vino, > + struct ceph_inode_info *inode, > + u32 seq, u32 mseq, u32 issue_seq), > + > + TP_ARGS(mdsc, session, op, vino, inode, seq, mseq, issue_seq), > + > + TP_STRUCT__entry( > + __field(int, mds) > + __field(int, op) > + __field(u64, ino) > + __field(u64, snap) > + __field(u32, seq) > + __field(u32, mseq) > + __field(u32, issue_seq) > + ), > + > + TP_fast_assign( > + __entry->mds = session->s_mds; > + __entry->op = op; > + __entry->ino = vino->ino; > + __entry->snap = vino->snap; > + __entry->seq = seq; > + __entry->mseq = mseq; > + __entry->issue_seq = issue_seq; > + ), > + > + TP_printk("mds=%d op=%s vino=%llx.%llx seq=%u iseq=%u mseq=%u", > + __entry->mds, > + ceph_cap_op_name(__entry->op), > + __entry->ino, > + __entry->snap, > + __entry->seq, > + __entry->issue_seq, > + __entry->mseq) > +); > + > +#undef EM > +#undef E_ > +#endif /* _TRACE_CEPH_H */ > + > +/* This part must be outside protection */ > +#include <trace/define_trace.h> Looks good. Reviewed-by: Viacheslav Dubeyko <[email protected]> Thanks, Slava.
