[PATCH V4 08/12] blktrace: export cgroup info in trace

2017-06-28 Thread Shaohua Li
From: Shaohua Li 

Currently blktrace isn't cgroup aware. blktrace prints out task name of
current context, but the task of current context isn't always in the
cgroup where the BIO comes from. We can't use task name to find out IO
cgroup. For example, writeback BIOs always come from the flusher thread, but
the BIOs are for different blk cgroups. Requests could be requeued and
dispatched from completely different tasks. MD/DM are other examples.

This patch tries to fix the gap. We print out cgroup fhandle info in
blktrace. Userspace can use open_by_handle_at() syscall to find the
cgroup by fhandle. Or userspace can use name_to_handle_at() syscall to
find fhandle for a cgroup and use a BPF program to filter out blktrace
for a specific cgroup.

We add a new 'blk_cgroup' trace option for blk tracer. It is off by default.
Applications that don't know the new option aren't affected.  When it's
on, we output fhandle info right after blk_io_trace with an extra bit
set in event action. So from application point of view, blktrace with
the option will output new actions.

I didn't change blk trace event yet, since I'm not sure if changing the
trace event output is an ABI issue. If not, I'll do it later.

Signed-off-by: Shaohua Li 
---
 include/uapi/linux/blktrace_api.h |   3 +
 kernel/trace/blktrace.c   | 231 ++
 2 files changed, 161 insertions(+), 73 deletions(-)

diff --git a/include/uapi/linux/blktrace_api.h b/include/uapi/linux/blktrace_api.h
index c590ca6..9cdaede 100644
--- a/include/uapi/linux/blktrace_api.h
+++ b/include/uapi/linux/blktrace_api.h
@@ -52,6 +52,7 @@ enum blktrace_act {
__BLK_TA_REMAP, /* bio was remapped */
__BLK_TA_ABORT, /* request aborted */
__BLK_TA_DRV_DATA,  /* driver-specific binary data */
+   __BLK_TA_CGROUP = 1 << 8,   /* from a cgroup*/
 };
 
 /*
@@ -61,6 +62,7 @@ enum blktrace_notify {
__BLK_TN_PROCESS = 0,   /* establish pid/name mapping */
__BLK_TN_TIMESTAMP, /* include system clock */
__BLK_TN_MESSAGE,   /* Character string message */
+   __BLK_TN_CGROUP = __BLK_TA_CGROUP, /* from a cgroup */
 };
 
 
@@ -107,6 +109,7 @@ struct blk_io_trace {
__u32 cpu;  /* on what cpu did it happen */
__u16 error;/* completion error */
__u16 pdu_len;  /* length of data after this trace */
+   /* cgroup id will be stored here if exists */
 };
 
 /*
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index bc364f8..f393d7a 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "../../block/blk.h"
 
@@ -46,10 +47,14 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(running_trace_lock);
 
 /* Select an alternative, minimalistic output than the original one */
 #define TRACE_BLK_OPT_CLASSIC  0x1
+#define TRACE_BLK_OPT_CGROUP   0x2
 
 static struct tracer_opt blk_tracer_opts[] = {
/* Default disable the minimalistic output */
{ TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
+#ifdef CONFIG_BLK_CGROUP
+   { TRACER_OPT(blk_cgroup, TRACE_BLK_OPT_CGROUP) },
+#endif
{ }
 };
 
@@ -68,7 +73,8 @@ static void blk_unregister_tracepoints(void);
  * Send out a notify message.
  */
 static void trace_note(struct blk_trace *bt, pid_t pid, int action,
-  const void *data, size_t len)
+  const void *data, size_t len,
+  union kernfs_node_id *cgid)
 {
struct blk_io_trace *t;
struct ring_buffer_event *event = NULL;
@@ -76,12 +82,13 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
int pc = 0;
int cpu = smp_processor_id();
bool blk_tracer = blk_tracer_enabled;
+   ssize_t cgid_len = cgid ? sizeof(*cgid) : 0;
 
if (blk_tracer) {
buffer = blk_tr->trace_buffer.buffer;
pc = preempt_count();
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
- sizeof(*t) + len,
+ sizeof(*t) + len + cgid_len,
  0, pc);
if (!event)
return;
@@ -92,17 +99,19 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
if (!bt->rchan)
return;
 
-   t = relay_reserve(bt->rchan, sizeof(*t) + len);
+   t = relay_reserve(bt->rchan, sizeof(*t) + len + cgid_len);
if (t) {
t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
t->time = ktime_to_ns(ktime_get());
 record_it:
t->device = bt->dev;
-   t->action = action;
+   t->action = action | (cgid ? __BLK_TN_CGROUP : 0);
t->pid = pid;

Re: [PATCH V4 08/12] blktrace: export cgroup info in trace

2017-06-28 Thread Steven Rostedt
On Wed, 28 Jun 2017 09:29:58 -0700
Shaohua Li  wrote:

> From: Shaohua Li 
> 
> Currently blktrace isn't cgroup aware. blktrace prints out task name of
> current context, but the task of current context isn't always in the
> cgroup where the BIO comes from. We can't use task name to find out IO
> cgroup. For example, Writeback BIOs always comes from flusher thread but
> the BIOs are for different blk cgroups. Request could be requeued and
> dispatched from completely different tasks. MD/DM are another examples.
> 
> This patch tries to fix the gap. We print out cgroup fhandle info in
> blktrace. Userspace can use open_by_handle_at() syscall to find the
> cgroup by fhandle. Or userspace can use name_to_handle_at() syscall to
> find fhandle for a cgroup and use a BPF program to filter out blktrace
> for a specific cgroup.
> 
> We add a new 'blk_cgroup' trace option for blk tracer. It's default off.
> Application which doesn't know the new option isn't affected.  When it's
> on, we output fhandle info right after blk_io_trace with an extra bit
> set in event action. So from application point of view, blktrace with
> the option will output new actions.
> 
> I didn't change blk trace event yet, since I'm not sure if changing the
> trace event output is an ABI issue. If not, I'll do it later.
> 
> Signed-off-by: Shaohua Li 
> ---
>  include/uapi/linux/blktrace_api.h |   3 +
>  kernel/trace/blktrace.c   | 231 ++
>  2 files changed, 161 insertions(+), 73 deletions(-)

Doing a quick scan of the patch, nothing sticks out as an issue to me.

Acked-by: Steven Rostedt (VMware) 

-- Steve