Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
On Fri, 22 Apr 2016 11:45:30 +0900 Namhyung Kim wrote: > > + pid_list->pid_max = READ_ONCE(pid_max); > > + /* Only truncating will shrink pid_max */ > > + if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) > > + pid_list->pid_max = filtered_pids->pid_max; > > + pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); > > + if (!pid_list->pids) { > > + kfree(pid_list); > > + read = -ENOMEM; > > + goto out; > > + } > > + if (filtered_pids) { > > + /* copy the current bits to the new max */ > > + pid = find_first_bit(filtered_pids->pids, > > +filtered_pids->pid_max); > > + while (pid < filtered_pids->pid_max) { > > + set_bit(pid, pid_list->pids); > > + pid = find_next_bit(filtered_pids->pids, > > + filtered_pids->pid_max, > > + pid + 1); > > + nr_pids++; > > + } > > Why not just use memcpy and keep nr_pids in the pid_list? This is the slow path (very slow ;-), and this was the first method that came to my mind (while I programmed this during a conference). I could use memcpy, or simply one of the bitmask copies, and then get the nr_pids from bitmask_weight(). I would not keep nr_pids in pid_list because that would mean that I would have to manage it in the fast path. Maybe later I'll convert it to bitmap_copy(), but for now I'll just keep it as is. I move this code in my queue for 4.8, and don't want to deal with conflicts unless there's a real bug discovered. Thanks for looking at this code! -- Steve
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
On Fri, 22 Apr 2016 11:45:30 +0900 Namhyung Kim wrote: > > + pid_list->pid_max = READ_ONCE(pid_max); > > + /* Only truncating will shrink pid_max */ > > + if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) > > + pid_list->pid_max = filtered_pids->pid_max; > > + pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); > > + if (!pid_list->pids) { > > + kfree(pid_list); > > + read = -ENOMEM; > > + goto out; > > + } > > + if (filtered_pids) { > > + /* copy the current bits to the new max */ > > + pid = find_first_bit(filtered_pids->pids, > > +filtered_pids->pid_max); > > + while (pid < filtered_pids->pid_max) { > > + set_bit(pid, pid_list->pids); > > + pid = find_next_bit(filtered_pids->pids, > > + filtered_pids->pid_max, > > + pid + 1); > > + nr_pids++; > > + } > > Why not just use memcpy and keep nr_pids in the pid_list? This is the slow path (very slow ;-), and this was the first method that came to my mind (while I programmed this during a conference). I could use memcpy, or simply one of the bitmask copies, and then get the nr_pids from bitmask_weight(). I would not keep nr_pids in pid_list because that would mean that I would have to manage it in the fast path. Maybe later I'll convert it to bitmap_copy(), but for now I'll just keep it as is. I move this code in my queue for 4.8, and don't want to deal with conflicts unless there's a real bug discovered. Thanks for looking at this code! -- Steve
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
Hi Steve, On Tue, Apr 19, 2016 at 10:34:23AM -0400, Steven Rostedt wrote: > From: Steven Rostedt> > In order to add the ability to let tasks that are filtered by the events > have their children also be traced on fork (and then not traced on exit), > convert the array into a pid bitmask. Most of the time the number of pids is > only 32768 pids or a 4k bitmask, which is the same size as the default list > currently is, and that list could grow if more pids are listed. > > This also greatly simplifies the code. > > Suggested-by: "H. Peter Anvin" > Signed-off-by: Steven Rostedt > --- [SNIP] > @@ -1571,7 +1572,7 @@ ftrace_event_pid_write(struct file *filp, const char > __user *ubuf, > struct seq_file *m = filp->private_data; > struct trace_array *tr = m->private; > struct trace_pid_list *filtered_pids = NULL; > - struct trace_pid_list *pid_list = NULL; > + struct trace_pid_list *pid_list; > struct trace_event_file *file; > struct trace_parser parser; > unsigned long val; > @@ -1579,7 +1580,7 @@ ftrace_event_pid_write(struct file *filp, const char > __user *ubuf, > ssize_t read = 0; > ssize_t ret = 0; > pid_t pid; > - int i; > + int nr_pids = 0; > > if (!cnt) > return 0; > @@ -1592,10 +1593,43 @@ ftrace_event_pid_write(struct file *filp, const char > __user *ubuf, > return -ENOMEM; > > mutex_lock(_mutex); > + filtered_pids = rcu_dereference_protected(tr->filtered_pids, > + lockdep_is_held(_mutex)); > + > /* > - * Load as many pids into the array before doing a > - * swap from the tr->filtered_pids to the new list. > + * Always recreate a new array. The write is an all or nothing > + * operation. Always create a new array when adding new pids by > + * the user. If the operation fails, then the current list is > + * not modified. 
>*/ > + pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); > + if (!pid_list) { > + read = -ENOMEM; > + goto out; > + } > + pid_list->pid_max = READ_ONCE(pid_max); > + /* Only truncating will shrink pid_max */ > + if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) > + pid_list->pid_max = filtered_pids->pid_max; > + pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); > + if (!pid_list->pids) { > + kfree(pid_list); > + read = -ENOMEM; > + goto out; > + } > + if (filtered_pids) { > + /* copy the current bits to the new max */ > + pid = find_first_bit(filtered_pids->pids, > + filtered_pids->pid_max); > + while (pid < filtered_pids->pid_max) { > + set_bit(pid, pid_list->pids); > + pid = find_next_bit(filtered_pids->pids, > + filtered_pids->pid_max, > + pid + 1); > + nr_pids++; > + } Why not just use memcpy and keep nr_pids in the pid_list? Thanks, Namhyung > + } > + > while (cnt > 0) { > > this_pos = 0; > @@ -1613,92 +1647,35 @@ ftrace_event_pid_write(struct file *filp, const char > __user *ubuf, > ret = -EINVAL; > if (kstrtoul(parser.buffer, 0, )) > break; > - if (val > INT_MAX) > + if (val >= pid_list->pid_max) > break; > > pid = (pid_t)val; > > - ret = -ENOMEM; > - if (!pid_list) { > - pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); > - if (!pid_list) > - break; > - > - filtered_pids = > rcu_dereference_protected(tr->filtered_pids, > - > lockdep_is_held(_mutex)); > - if (filtered_pids) > - pid_list->order = filtered_pids->order; > - else > - pid_list->order = 0; > - > - pid_list->pids = (void *)__get_free_pages(GFP_KERNEL, > - > pid_list->order); > - if (!pid_list->pids) > - break; > - > - if (filtered_pids) { > - pid_list->nr_pids = filtered_pids->nr_pids; > - memcpy(pid_list->pids, filtered_pids->pids, > -pid_list->nr_pids * sizeof(pid_t)); > - } else > - pid_list->nr_pids = 0; > - } > - > - if (pid_list->nr_pids >= max_pids(pid_list)) { > -
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
Hi Steve, On Tue, Apr 19, 2016 at 10:34:23AM -0400, Steven Rostedt wrote: > From: Steven Rostedt > > In order to add the ability to let tasks that are filtered by the events > have their children also be traced on fork (and then not traced on exit), > convert the array into a pid bitmask. Most of the time the number of pids is > only 32768 pids or a 4k bitmask, which is the same size as the default list > currently is, and that list could grow if more pids are listed. > > This also greatly simplifies the code. > > Suggested-by: "H. Peter Anvin" > Signed-off-by: Steven Rostedt > --- [SNIP] > @@ -1571,7 +1572,7 @@ ftrace_event_pid_write(struct file *filp, const char > __user *ubuf, > struct seq_file *m = filp->private_data; > struct trace_array *tr = m->private; > struct trace_pid_list *filtered_pids = NULL; > - struct trace_pid_list *pid_list = NULL; > + struct trace_pid_list *pid_list; > struct trace_event_file *file; > struct trace_parser parser; > unsigned long val; > @@ -1579,7 +1580,7 @@ ftrace_event_pid_write(struct file *filp, const char > __user *ubuf, > ssize_t read = 0; > ssize_t ret = 0; > pid_t pid; > - int i; > + int nr_pids = 0; > > if (!cnt) > return 0; > @@ -1592,10 +1593,43 @@ ftrace_event_pid_write(struct file *filp, const char > __user *ubuf, > return -ENOMEM; > > mutex_lock(_mutex); > + filtered_pids = rcu_dereference_protected(tr->filtered_pids, > + lockdep_is_held(_mutex)); > + > /* > - * Load as many pids into the array before doing a > - * swap from the tr->filtered_pids to the new list. > + * Always recreate a new array. The write is an all or nothing > + * operation. Always create a new array when adding new pids by > + * the user. If the operation fails, then the current list is > + * not modified. 
>*/ > + pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); > + if (!pid_list) { > + read = -ENOMEM; > + goto out; > + } > + pid_list->pid_max = READ_ONCE(pid_max); > + /* Only truncating will shrink pid_max */ > + if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) > + pid_list->pid_max = filtered_pids->pid_max; > + pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); > + if (!pid_list->pids) { > + kfree(pid_list); > + read = -ENOMEM; > + goto out; > + } > + if (filtered_pids) { > + /* copy the current bits to the new max */ > + pid = find_first_bit(filtered_pids->pids, > + filtered_pids->pid_max); > + while (pid < filtered_pids->pid_max) { > + set_bit(pid, pid_list->pids); > + pid = find_next_bit(filtered_pids->pids, > + filtered_pids->pid_max, > + pid + 1); > + nr_pids++; > + } Why not just use memcpy and keep nr_pids in the pid_list? Thanks, Namhyung > + } > + > while (cnt > 0) { > > this_pos = 0; > @@ -1613,92 +1647,35 @@ ftrace_event_pid_write(struct file *filp, const char > __user *ubuf, > ret = -EINVAL; > if (kstrtoul(parser.buffer, 0, )) > break; > - if (val > INT_MAX) > + if (val >= pid_list->pid_max) > break; > > pid = (pid_t)val; > > - ret = -ENOMEM; > - if (!pid_list) { > - pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); > - if (!pid_list) > - break; > - > - filtered_pids = > rcu_dereference_protected(tr->filtered_pids, > - > lockdep_is_held(_mutex)); > - if (filtered_pids) > - pid_list->order = filtered_pids->order; > - else > - pid_list->order = 0; > - > - pid_list->pids = (void *)__get_free_pages(GFP_KERNEL, > - > pid_list->order); > - if (!pid_list->pids) > - break; > - > - if (filtered_pids) { > - pid_list->nr_pids = filtered_pids->nr_pids; > - memcpy(pid_list->pids, filtered_pids->pids, > -pid_list->nr_pids * sizeof(pid_t)); > - } else > - pid_list->nr_pids = 0; > - } > - > - if (pid_list->nr_pids >= max_pids(pid_list)) { > - pid_t *pid_page; > - > - pid_page =
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
On Tue, 19 Apr 2016 22:59:14 +0000 (UTC) Mathieu Desnoyers wrote: > - On Apr 19, 2016, at 6:49 PM, rostedt rost...@goodmis.org wrote: > > > On Tue, 19 Apr 2016 21:22:21 +0000 (UTC) > > Mathieu Desnoyers wrote: > > > >> It makes sense. Anyway, looking back at my own implementation, I have > >> an array of 64 hlist_head entries (64 * 8 = 512 bytes), typically > >> populated by NULL pointers. It's only a factor 8 smaller than the > >> bitmap, so it's not a huge gain. > > > > Actually we talked about a second level bitmap for quicker searchers. I > > can't remember what it was called, but I'm sure HPA can ;-) > > > > Basically it was a much smaller bitmap, where each bit represents a > > number of bits in the main bitmap. When a bit is set in the main > > bitmap, its corresponding bit is set in the smaller one. This means > > that if you don't have many PIDs, the smaller bitmap wont have many > > bits set either, and you keep all the checks very cache local, because > > you are checking the smaller bitmap most of the time. But this too > > makes things more complex, especially when clearing a bit (although, > > that only happens on exit, where speed isn't a big deal). But we > > decided it still wasn't worth it. > > Seems like an interesting possible improvement if ever needed. > One reason we decided against it is, if you think about use cases; if you are tracing a single task, and other tasks are created around it in the same time period, you will have pids of tasks running that are close to the pid of the traced task. That means the bit in the smaller array will most likely be always set, and now you are taking two cache hits to find the pid you are looking for, and not gaining anything out of it. -- Steve
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
On Tue, 19 Apr 2016 22:59:14 + (UTC) Mathieu Desnoyers wrote: > - On Apr 19, 2016, at 6:49 PM, rostedt rost...@goodmis.org wrote: > > > On Tue, 19 Apr 2016 21:22:21 + (UTC) > > Mathieu Desnoyers wrote: > > > >> It makes sense. Anyway, looking back at my own implementation, I have > >> an array of 64 hlist_head entries (64 * 8 = 512 bytes), typically > >> populated by NULL pointers. It's only a factor 8 smaller than the > >> bitmap, so it's not a huge gain. > > > > Actually we talked about a second level bitmap for quicker searchers. I > > can't remember what it was called, but I'm sure HPA can ;-) > > > > Basically it was a much smaller bitmap, where each bit represents a > > number of bits in the main bitmap. When a bit is set in the main > > bitmap, its corresponding bit is set in the smaller one. This means > > that if you don't have many PIDs, the smaller bitmap wont have many > > bits set either, and you keep all the checks very cache local, because > > you are checking the smaller bitmap most of the time. But this too > > makes things more complex, especially when clearing a bit (although, > > that only happens on exit, where speed isn't a big deal). But we > > decided it still wasn't worth it. > > Seems like an interesting possible improvement if ever needed. > One reason we decided against it is, if you think about use cases; if you are tracing a single task, and other tasks are created around it in the same time period, you will have pids of tasks running that are close to the pid of the traced task. That means the bit in the smaller array will most likely be always set, and now you are taking two cache hits to find the pid you are looking for, and not gaining anything out of it. -- Steve
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
- On Apr 19, 2016, at 6:49 PM, rostedt rost...@goodmis.org wrote: > On Tue, 19 Apr 2016 21:22:21 +0000 (UTC) > Mathieu Desnoyers wrote: > >> It makes sense. Anyway, looking back at my own implementation, I have >> an array of 64 hlist_head entries (64 * 8 = 512 bytes), typically >> populated by NULL pointers. It's only a factor 8 smaller than the >> bitmap, so it's not a huge gain. > > Actually we talked about a second level bitmap for quicker searchers. I > can't remember what it was called, but I'm sure HPA can ;-) > > Basically it was a much smaller bitmap, where each bit represents a > number of bits in the main bitmap. When a bit is set in the main > bitmap, its corresponding bit is set in the smaller one. This means > that if you don't have many PIDs, the smaller bitmap wont have many > bits set either, and you keep all the checks very cache local, because > you are checking the smaller bitmap most of the time. But this too > makes things more complex, especially when clearing a bit (although, > that only happens on exit, where speed isn't a big deal). But we > decided it still wasn't worth it. Seems like an interesting possible improvement if ever needed. > >> >> One alternative approach would be to keep a few entries (e.g. 16 PIDs) >> in a fast-path lookup array that fits in a single-cache line. When the >> number of PIDs to track go beyond that, fall-back to the bitmap instead. >> >> > >> > Note, that the check of the bitmap to trace a task or not is not done >> > at every tracepoint. It's only done at sched_switch, and then an >> > internal flag is set. That flag will determine if the event should be >> > traced, and that is a single bit checked all the time (very good for >> > cache). >> >> Could this be used by multiple tracers, and used in a multi-session scheme ? >> In lttng, one user may want to track a set of PIDs, whereas another user may >> be concurrently interested by another set. 
> > I should specify, the bit isn't in the task struct, because different > trace instances may have different criteria to what task may be traced. > That is, you can have multiple buffers tracing multiple tasks. The > tracepoint has a private data structure attached to it that is added > when a tracepoint is registered. This data is a descriptor that > represents the trace instance. This instance descriptor has a flag to > ignore or trace the task. > > >> >> Currently, in the lttng kernel tracer, we do the hash table query for >> each tracepoint hit, which is clearly not as efficient as checking a >> task struct flag. One option I see would be to set the task struct flag >> whenever there is at least one tracer/tracing session that is interested >> in this event (this would end up being a reference count on the flag). Then, >> for every flag check that passes, lttng could do HT/bitmap lookups to see if >> the event needs to go to each specific session. >> >> Is this task struct "trace" flag currently exposed to tracers through a >> reference-counting enable/disable API ? If not, do you think it would make >> sense ? >> > > Nope. As I said, it's on my own descriptor that is passed through the > tracepoint private data. > > If you look at my code, you'll notice that I pass around a > "trace_array" struct (tr), which represents all the information about a > single trace instance. What tracer is running, what events are enabled, > and even keeps track of the file descriptors holding the trace event > information. This "tr" has a per_cpu "buffer" section that contains per > cpu data (like the ring buffer). It also has a "data" section for > miscellaneous fields. One of them is now "ignore" which is set when > filtering is on and the sched_switch event noticed that the new task > shouldn't be traced for this instance. > > If there's multiple instances, then there will be multiple callbacks > done at each sched_switch. One for each instance. Got it. 
I'd have to extend the buffer structures available within each session to add this extra flag, and update it for each session from the fork/exit events to match the per-session bitmap, as well as whenever the bitmap is modified. Thanks for the explanation! :) Mathieu > > -- Steve -- Mathieu Desnoyers EfficiOS Inc. http://www.efficios.com
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
- On Apr 19, 2016, at 6:49 PM, rostedt rost...@goodmis.org wrote: > On Tue, 19 Apr 2016 21:22:21 + (UTC) > Mathieu Desnoyers wrote: > >> It makes sense. Anyway, looking back at my own implementation, I have >> an array of 64 hlist_head entries (64 * 8 = 512 bytes), typically >> populated by NULL pointers. It's only a factor 8 smaller than the >> bitmap, so it's not a huge gain. > > Actually we talked about a second level bitmap for quicker searchers. I > can't remember what it was called, but I'm sure HPA can ;-) > > Basically it was a much smaller bitmap, where each bit represents a > number of bits in the main bitmap. When a bit is set in the main > bitmap, its corresponding bit is set in the smaller one. This means > that if you don't have many PIDs, the smaller bitmap wont have many > bits set either, and you keep all the checks very cache local, because > you are checking the smaller bitmap most of the time. But this too > makes things more complex, especially when clearing a bit (although, > that only happens on exit, where speed isn't a big deal). But we > decided it still wasn't worth it. Seems like an interesting possible improvement if ever needed. > >> >> One alternative approach would be to keep a few entries (e.g. 16 PIDs) >> in a fast-path lookup array that fits in a single-cache line. When the >> number of PIDs to track go beyond that, fall-back to the bitmap instead. >> >> > >> > Note, that the check of the bitmap to trace a task or not is not done >> > at every tracepoint. It's only done at sched_switch, and then an >> > internal flag is set. That flag will determine if the event should be >> > traced, and that is a single bit checked all the time (very good for >> > cache). >> >> Could this be used by multiple tracers, and used in a multi-session scheme ? >> In lttng, one user may want to track a set of PIDs, whereas another user may >> be concurrently interested by another set. 
> > I should specify, the bit isn't in the task struct, because different > trace instances may have different criteria to what task may be traced. > That is, you can have multiple buffers tracing multiple tasks. The > tracepoint has a private data structure attached to it that is added > when a tracepoint is registered. This data is a descriptor that > represents the trace instance. This instance descriptor has a flag to > ignore or trace the task. > > >> >> Currently, in the lttng kernel tracer, we do the hash table query for >> each tracepoint hit, which is clearly not as efficient as checking a >> task struct flag. One option I see would be to set the task struct flag >> whenever there is at least one tracer/tracing session that is interested >> in this event (this would end up being a reference count on the flag). Then, >> for every flag check that passes, lttng could do HT/bitmap lookups to see if >> the event needs to go to each specific session. >> >> Is this task struct "trace" flag currently exposed to tracers through a >> reference-counting enable/disable API ? If not, do you think it would make >> sense ? >> > > Nope. As I said, it's on my own descriptor that is passed through the > tracepoint private data. > > If you look at my code, you'll notice that I pass around a > "trace_array" struct (tr), which represents all the information about a > single trace instance. What tracer is running, what events are enabled, > and even keeps track of the file descriptors holding the trace event > information. This "tr" has a per_cpu "buffer" section that contains per > cpu data (like the ring buffer). It also has a "data" section for > miscellaneous fields. One of them is now "ignore" which is set when > filtering is on and the sched_switch event noticed that the new task > shouldn't be traced for this instance. > > If there's multiple instances, then there will be multiple callbacks > done at each sched_switch. One for each instance. Got it. 
I'd have to extend the buffer structures available within each session to add this extra flag, and update it for each session from the fork/exit events to match the per-session bitmap, as well as whenever the bitmap is modified. Thanks for the explanation! :) Mathieu > > -- Steve -- Mathieu Desnoyers EfficiOS Inc. http://www.efficios.com
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
On Tue, 19 Apr 2016 21:22:21 +0000 (UTC) Mathieu Desnoyers wrote: > It makes sense. Anyway, looking back at my own implementation, I have > an array of 64 hlist_head entries (64 * 8 = 512 bytes), typically > populated by NULL pointers. It's only a factor 8 smaller than the > bitmap, so it's not a huge gain. Actually we talked about a second level bitmap for quicker searchers. I can't remember what it was called, but I'm sure HPA can ;-) Basically it was a much smaller bitmap, where each bit represents a number of bits in the main bitmap. When a bit is set in the main bitmap, its corresponding bit is set in the smaller one. This means that if you don't have many PIDs, the smaller bitmap wont have many bits set either, and you keep all the checks very cache local, because you are checking the smaller bitmap most of the time. But this too makes things more complex, especially when clearing a bit (although, that only happens on exit, where speed isn't a big deal). But we decided it still wasn't worth it. > > One alternative approach would be to keep a few entries (e.g. 16 PIDs) > in a fast-path lookup array that fits in a single-cache line. When the > number of PIDs to track go beyond that, fall-back to the bitmap instead. > > > > > Note, that the check of the bitmap to trace a task or not is not done > > at every tracepoint. It's only done at sched_switch, and then an > > internal flag is set. That flag will determine if the event should be > > traced, and that is a single bit checked all the time (very good for > > cache). > > Could this be used by multiple tracers, and used in a multi-session scheme ? > In lttng, one user may want to track a set of PIDs, whereas another user may > be concurrently interested by another set. I should specify, the bit isn't in the task struct, because different trace instances may have different criteria to what task may be traced. That is, you can have multiple buffers tracing multiple tasks. 
The tracepoint has a private data structure attached to it that is added when a tracepoint is registered. This data is a descriptor that represents the trace instance. This instance descriptor has a flag to ignore or trace the task. > > Currently, in the lttng kernel tracer, we do the hash table query for > each tracepoint hit, which is clearly not as efficient as checking a > task struct flag. One option I see would be to set the task struct flag > whenever there is at least one tracer/tracing session that is interested > in this event (this would end up being a reference count on the flag). Then, > for every flag check that passes, lttng could do HT/bitmap lookups to see if > the event needs to go to each specific session. > > Is this task struct "trace" flag currently exposed to tracers through a > reference-counting enable/disable API ? If not, do you think it would make > sense ? > Nope. As I said, it's on my own descriptor that is passed through the tracepoint private data. If you look at my code, you'll notice that I pass around a "trace_array" struct (tr), which represents all the information about a single trace instance. What tracer is running, what events are enabled, and even keeps track of the file descriptors holding the trace event information. This "tr" has a per_cpu "buffer" section that contains per cpu data (like the ring buffer). It also has a "data" section for miscellaneous fields. One of them is now "ignore" which is set when filtering is on and the sched_switch event noticed that the new task shouldn't be traced for this instance. If there's multiple instances, then there will be multiple callbacks done at each sched_switch. One for each instance. -- Steve
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
On Tue, 19 Apr 2016 21:22:21 + (UTC) Mathieu Desnoyers wrote: > It makes sense. Anyway, looking back at my own implementation, I have > an array of 64 hlist_head entries (64 * 8 = 512 bytes), typically > populated by NULL pointers. It's only a factor 8 smaller than the > bitmap, so it's not a huge gain. Actually we talked about a second level bitmap for quicker searchers. I can't remember what it was called, but I'm sure HPA can ;-) Basically it was a much smaller bitmap, where each bit represents a number of bits in the main bitmap. When a bit is set in the main bitmap, its corresponding bit is set in the smaller one. This means that if you don't have many PIDs, the smaller bitmap wont have many bits set either, and you keep all the checks very cache local, because you are checking the smaller bitmap most of the time. But this too makes things more complex, especially when clearing a bit (although, that only happens on exit, where speed isn't a big deal). But we decided it still wasn't worth it. > > One alternative approach would be to keep a few entries (e.g. 16 PIDs) > in a fast-path lookup array that fits in a single-cache line. When the > number of PIDs to track go beyond that, fall-back to the bitmap instead. > > > > > Note, that the check of the bitmap to trace a task or not is not done > > at every tracepoint. It's only done at sched_switch, and then an > > internal flag is set. That flag will determine if the event should be > > traced, and that is a single bit checked all the time (very good for > > cache). > > Could this be used by multiple tracers, and used in a multi-session scheme ? > In lttng, one user may want to track a set of PIDs, whereas another user may > be concurrently interested by another set. I should specify, the bit isn't in the task struct, because different trace instances may have different criteria to what task may be traced. That is, you can have multiple buffers tracing multiple tasks. 
The tracepoint has a private data structure attached to it that is added when a tracepoint is registered. This data is a descriptor that represents the trace instance. This instance descriptor has a flag to ignore or trace the task. > > Currently, in the lttng kernel tracer, we do the hash table query for > each tracepoint hit, which is clearly not as efficient as checking a > task struct flag. One option I see would be to set the task struct flag > whenever there is at least one tracer/tracing session that is interested > in this event (this would end up being a reference count on the flag). Then, > for every flag check that passes, lttng could do HT/bitmap lookups to see if > the event needs to go to each specific session. > > Is this task struct "trace" flag currently exposed to tracers through a > reference-counting enable/disable API ? If not, do you think it would make > sense ? > Nope. As I said, it's on my own descriptor that is passed through the tracepoint private data. If you look at my code, you'll notice that I pass around a "trace_array" struct (tr), which represents all the information about a single trace instance. What tracer is running, what events are enabled, and even keeps track of the file descriptors holding the trace event information. This "tr" has a per_cpu "buffer" section that contains per cpu data (like the ring buffer). It also has a "data" section for miscellaneous fields. One of them is now "ignore" which is set when filtering is on and the sched_switch event noticed that the new task shouldn't be traced for this instance. If there's multiple instances, then there will be multiple callbacks done at each sched_switch. One for each instance. -- Steve
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
- On Apr 19, 2016, at 4:50 PM, rostedt rost...@goodmis.org wrote: > On Tue, 19 Apr 2016 20:17:29 +0000 (UTC) > Mathieu Desnoyers wrote: > > >> Ah indeed, since there is a hard limit to 4194304, that makes the >> worse case bitmap 512k. > > Yep. > >> >> We could argue that given a sparse dataset in the PID table (typical >> in our use-cases), a small hash table would have better cache locality >> than the bitmap. But I agree that the hash table does add a bit of >> complexity, so it becomes a complexity vs cache locality tradeoff. >> So I understand why you would want to go for the simpler bitmap >> solution, unless the hash table would prove to bring a measurable >> performance improvement. > > We discussed this too (cache locality), and came to the same conclusion > that a bitmask would still be better. If you think about it, if you > have a lot of CPUs and lots of PIDs, tasks don't migrate as much, and > if they do, cache locality of this bitmap will be the least of the > performance issues. Then you have a limited amount of PIDs per CPU, and > thus those PIDs will probably be in the CPU cache for the bitmap. It makes sense. Anyway, looking back at my own implementation, I have an array of 64 hlist_head entries (64 * 8 = 512 bytes), typically populated by NULL pointers. It's only a factor 8 smaller than the bitmap, so it's not a huge gain. One alternative approach would be to keep a few entries (e.g. 16 PIDs) in a fast-path lookup array that fits in a single-cache line. When the number of PIDs to track go beyond that, fall-back to the bitmap instead. > > Note, that the check of the bitmap to trace a task or not is not done > at every tracepoint. It's only done at sched_switch, and then an > internal flag is set. That flag will determine if the event should be > traced, and that is a single bit checked all the time (very good for > cache). Could this be used by multiple tracers, and used in a multi-session scheme ? 
In lttng, one user may want to track a set of PIDs, whereas another user may be concurrently interested by another set. Currently, in the lttng kernel tracer, we do the hash table query for each tracepoint hit, which is clearly not as efficient as checking a task struct flag. One option I see would be to set the task struct flag whenever there is at least one tracer/tracing session that is interested in this event (this would end up being a reference count on the flag). Then, for every flag check that passes, lttng could do HT/bitmap lookups to see if the event needs to go to each specific session. Is this task struct "trace" flag currently exposed to tracers through a reference-counting enable/disable API ? If not, do you think it would make sense ? Thanks, Mathieu > > -- Steve > > > -- > To unsubscribe from this list: send the line "unsubscribe linux-trace-users" > in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- Mathieu Desnoyers EfficiOS Inc. http://www.efficios.com
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
- On Apr 19, 2016, at 4:50 PM, rostedt rost...@goodmis.org wrote: > On Tue, 19 Apr 2016 20:17:29 + (UTC) > Mathieu Desnoyers wrote: > > >> Ah indeed, since there is a hard limit to 4194304, that makes the >> worse case bitmap 512k. > > Yep. > >> >> We could argue that given a sparse dataset in the PID table (typical >> in our use-cases), a small hash table would have better cache locality >> than the bitmap. But I agree that the hash table does add a bit of >> complexity, so it becomes a complexity vs cache locality tradeoff. >> So I understand why you would want to go for the simpler bitmap >> solution, unless the hash table would prove to bring a measurable >> performance improvement. > > We discussed this too (cache locality), and came to the same conclusion > that a bitmask would still be better. If you think about it, if you > have a lot of CPUs and lots of PIDs, tasks don't migrate as much, and > if they do, cache locality of this bitmap will be the least of the > performance issues. Then you have a limited amount of PIDs per CPU, and > thus those PIDs will probably be in the CPU cache for the bitmap. It makes sense. Anyway, looking back at my own implementation, I have an array of 64 hlist_head entries (64 * 8 = 512 bytes), typically populated by NULL pointers. It's only a factor 8 smaller than the bitmap, so it's not a huge gain. One alternative approach would be to keep a few entries (e.g. 16 PIDs) in a fast-path lookup array that fits in a single-cache line. When the number of PIDs to track go beyond that, fall-back to the bitmap instead. > > Note, that the check of the bitmap to trace a task or not is not done > at every tracepoint. It's only done at sched_switch, and then an > internal flag is set. That flag will determine if the event should be > traced, and that is a single bit checked all the time (very good for > cache). Could this be used by multiple tracers, and used in a multi-session scheme ? 
In lttng, one user may want to track a set of PIDs, whereas another user may be concurrently interested by another set. Currently, in the lttng kernel tracer, we do the hash table query for each tracepoint hit, which is clearly not as efficient as checking a task struct flag. One option I see would be to set the task struct flag whenever there is at least one tracer/tracing session that is interested in this event (this would end up being a reference count on the flag). Then, for every flag check that passes, lttng could do HT/bitmap lookups to see if the event needs to go to each specific session. Is this task struct "trace" flag currently exposed to tracers through a reference-counting enable/disable API ? If not, do you think it would make sense ? Thanks, Mathieu > > -- Steve > > > -- > To unsubscribe from this list: send the line "unsubscribe linux-trace-users" > in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- Mathieu Desnoyers EfficiOS Inc. http://www.efficios.com
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
On Tue, 19 Apr 2016 20:17:29 +0000 (UTC) Mathieu Desnoyers wrote: > Ah indeed, since there is a hard limit to 4194304, that makes the > worse case bitmap 512k. Yep. > > We could argue that given a sparse dataset in the PID table (typical > in our use-cases), a small hash table would have better cache locality > than the bitmap. But I agree that the hash table does add a bit of > complexity, so it becomes a complexity vs cache locality tradeoff. > So I understand why you would want to go for the simpler bitmap > solution, unless the hash table would prove to bring a measurable > performance improvement. We discussed this too (cache locality), and came to the same conclusion that a bitmask would still be better. If you think about it, if you have a lot of CPUs and lots of PIDs, tasks don't migrate as much, and if they do, cache locality of this bitmap will be the least of the performance issues. Then you have a limited amount of PIDs per CPU, and thus those PIDs will probably be in the CPU cache for the bitmap. Note, that the check of the bitmap to trace a task or not is not done at every tracepoint. It's only done at sched_switch, and then an internal flag is set. That flag will determine if the event should be traced, and that is a single bit checked all the time (very good for cache). -- Steve
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
On Tue, 19 Apr 2016 20:17:29 + (UTC) Mathieu Desnoyers wrote: > Ah indeed, since there is a hard limit to 4194304, that makes the > worse case bitmap 512k. Yep. > > We could argue that given a sparse dataset in the PID table (typical > in our use-cases), a small hash table would have better cache locality > than the bitmap. But I agree that the hash table does add a bit of > complexity, so it becomes a complexity vs cache locality tradeoff. > So I understand why you would want to go for the simpler bitmap > solution, unless the hash table would prove to bring a measurable > performance improvement. We discussed this too (cache locality), and came to the same conclusion that a bitmask would still be better. If you think about it, if you have a lot of CPUs and lots of PIDs, tasks don't migrate as much, and if they do, cache locality of this bitmap will be the least of the performance issues. Then you have a limited amount of PIDs per CPU, and thus those PIDs will probably be in the CPU cache for the bitmap. Note, that the check of the bitmap to trace a task or not is not done at every tracepoint. It's only done at sched_switch, and then an internal flag is set. That flag will determine if the event should be traced, and that is a single bit checked all the time (very good for cache). -- Steve
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
- On Apr 19, 2016, at 3:41 PM, rostedt rost...@goodmis.org wrote: > On Tue, 19 Apr 2016 11:57:32 -0700 > "H. Peter Anvin"wrote: > >> Also, I understand there is one of these bitmaps per ring buffer, and >> the ring buffer is in the tens of megabytes. > > Right, there's only one bitmap per tracing instance, which in most > cases is just one (I know of people who make more). And by default, the > tracing buffer is 1.4 megs per CPU. > > If you have a pid_max of the max size, I highly doubt you will be doing > that on a single CPU machine. If you have 48 CPUs, the ring buffer will > be 1.4 * 48 megs, making the 1/2 meg bitmap a nit. > > I will say, there may be two bitmaps soon, because I plan on adding > this same code to the function tracer logic. Ah indeed, since there is a hard limit to 4194304, that makes the worse case bitmap 512k. We could argue that given a sparse dataset in the PID table (typical in our use-cases), a small hash table would have better cache locality than the bitmap. But I agree that the hash table does add a bit of complexity, so it becomes a complexity vs cache locality tradeoff. So I understand why you would want to go for the simpler bitmap solution, unless the hash table would prove to bring a measurable performance improvement. Thanks, Mathieu -- Mathieu Desnoyers EfficiOS Inc. http://www.efficios.com
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
- On Apr 19, 2016, at 3:41 PM, rostedt rost...@goodmis.org wrote: > On Tue, 19 Apr 2016 11:57:32 -0700 > "H. Peter Anvin" wrote: > >> Also, I understand there is one of these bitmaps per ring buffer, and >> the ring buffer is in the tens of megabytes. > > Right, there's only one bitmap per tracing instance, which in most > cases is just one (I know of people who make more). And by default, the > tracing buffer is 1.4 megs per CPU. > > If you have a pid_max of the max size, I highly doubt you will be doing > that on a single CPU machine. If you have 48 CPUs, the ring buffer will > be 1.4 * 48 megs, making the 1/2 meg bitmap a nit. > > I will say, there may be two bitmaps soon, because I plan on adding > this same code to the function tracer logic. Ah indeed, since there is a hard limit to 4194304, that makes the worse case bitmap 512k. We could argue that given a sparse dataset in the PID table (typical in our use-cases), a small hash table would have better cache locality than the bitmap. But I agree that the hash table does add a bit of complexity, so it becomes a complexity vs cache locality tradeoff. So I understand why you would want to go for the simpler bitmap solution, unless the hash table would prove to bring a measurable performance improvement. Thanks, Mathieu -- Mathieu Desnoyers EfficiOS Inc. http://www.efficios.com
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
On Tue, 19 Apr 2016 11:57:32 -0700 "H. Peter Anvin" wrote: > Also, I understand there is one of these bitmaps per ring buffer, and > the ring buffer is in the tens of megabytes. Right, there's only one bitmap per tracing instance, which in most cases is just one (I know of people who make more). And by default, the tracing buffer is 1.4 megs per CPU. If you have a pid_max of the max size, I highly doubt you will be doing that on a single CPU machine. If you have 48 CPUs, the ring buffer will be 1.4 * 48 megs, making the 1/2 meg bitmap a nit. I will say, there may be two bitmaps soon, because I plan on adding this same code to the function tracer logic. -- Steve
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
On Tue, 19 Apr 2016 11:57:32 -0700 "H. Peter Anvin" wrote: > Also, I understand there is one of these bitmaps per ring buffer, and > the ring buffer is in the tens of megabytes. Right, there's only one bitmap per tracing instance, which in most cases is just one (I know of people who make more). And by default, the tracing buffer is 1.4 megs per CPU. If you have a pid_max of the max size, I highly doubt you will be doing that on a single CPU machine. If you have 48 CPUs, the ring buffer will be 1.4 * 48 megs, making the 1/2 meg bitmap a nit. I will say, there may be two bitmaps soon, because I plan on adding this same code to the function tracer logic. -- Steve
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
On April 19, 2016 10:19:47 AM PDT, Steven Rostedtwrote: >On Tue, 19 Apr 2016 16:55:28 + (UTC) >Mathieu Desnoyers wrote: > >> - On Apr 19, 2016, at 10:34 AM, rostedt rost...@goodmis.org >wrote: >> >> > From: Steven Rostedt >> > >> > In order to add the ability to let tasks that are filtered by the >events >> > have their children also be traced on fork (and then not traced on >exit), >> > convert the array into a pid bitmask. Most of the time the number >of pids is >> > only 32768 pids or a 4k bitmask, which is the same size as the >default list >> > currently is, and that list could grow if more pids are listed. >> > >> > This also greatly simplifies the code. >> >> The maximum PID number can be increased with sysctl. >> >> See "pid_max" in Documentation/sysctl/kernel.txt >> >> What happens when you have a very large pid_max set ? > >I discussed this with HPA, and it appears that the pid_max max would >require a bitmap of about 1/2 meg (the current default is 8k). This is >also why I chose to keep the bitmap as vmalloc and not a continuous >page allocation. > >> >> You say "most of the time" as if this was a fast-path vs a slow-path, >> but it is not the case here. > >I meant "most of the time" as "default". Yes, you can make the pid_max >really big, but in that case you better have enough memory in your >system to handle that many threads. Thus a 1/2 meg used for tracking >pids shouldn't be an issue. > >> >> This is a configuration option that can significantly hurt memory >usage >> in configurations using a large pid_max. > >No, it is created dynamically. If you never write anything into the >set_event_pid file, then you have nothing to worry about, as nothing >is allocated. It creates the array when a pid is added to the file, and >only then. If it fails to allocate, the write will return -ENOMEM as >the >errno. 
> >Again, if you have a large pid_max your box had better have a lot of >memory to begin with, because this array will be negligible compared to >the memory required to handle large number of tasks. > >> >> FWIW, I implement a similar feature with a hash table in >lttng-modules. >> I don't have the child process tracking though, which is a neat >improvement. > >I originally had a complex hash algorithm because I too was worried >about the size of pid_max and using a bitmap, but HPA convinced me it >was the way to go. > >-- Steve Also, I understand there is one of these bitmaps per ring buffer, and the ring buffer is in the tens of megabytes. -- Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
On April 19, 2016 10:19:47 AM PDT, Steven Rostedt wrote: >On Tue, 19 Apr 2016 16:55:28 + (UTC) >Mathieu Desnoyers wrote: > >> - On Apr 19, 2016, at 10:34 AM, rostedt rost...@goodmis.org >wrote: >> >> > From: Steven Rostedt >> > >> > In order to add the ability to let tasks that are filtered by the >events >> > have their children also be traced on fork (and then not traced on >exit), >> > convert the array into a pid bitmask. Most of the time the number >of pids is >> > only 32768 pids or a 4k bitmask, which is the same size as the >default list >> > currently is, and that list could grow if more pids are listed. >> > >> > This also greatly simplifies the code. >> >> The maximum PID number can be increased with sysctl. >> >> See "pid_max" in Documentation/sysctl/kernel.txt >> >> What happens when you have a very large pid_max set ? > >I discussed this with HPA, and it appears that the pid_max max would >require a bitmap of about 1/2 meg (the current default is 8k). This is >also why I chose to keep the bitmap as vmalloc and not a continuous >page allocation. > >> >> You say "most of the time" as if this was a fast-path vs a slow-path, >> but it is not the case here. > >I meant "most of the time" as "default". Yes, you can make the pid_max >really big, but in that case you better have enough memory in your >system to handle that many threads. Thus a 1/2 meg used for tracking >pids shouldn't be an issue. > >> >> This is a configuration option that can significantly hurt memory >usage >> in configurations using a large pid_max. > >No, it is created dynamically. If you never write anything into the >set_event_pid file, then you have nothing to worry about, as nothing >is allocated. It creates the array when a pid is added to the file, and >only then. If it fails to allocate, the write will return -ENOMEM as >the >errno. 
> >Again, if you have a large pid_max your box had better have a lot of >memory to begin with, because this array will be negligible compared to >the memory required to handle large number of tasks. > >> >> FWIW, I implement a similar feature with a hash table in >lttng-modules. >> I don't have the child process tracking though, which is a neat >improvement. > >I originally had a complex hash algorithm because I too was worried >about the size of pid_max and using a bitmap, but HPA convinced me it >was the way to go. > >-- Steve Also, I understand there is one of these bitmaps per ring buffer, and the ring buffer is in the tens of megabytes. -- Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
On Tue, 19 Apr 2016 16:55:28 + (UTC) Mathieu Desnoyerswrote: > - On Apr 19, 2016, at 10:34 AM, rostedt rost...@goodmis.org wrote: > > > From: Steven Rostedt > > > > In order to add the ability to let tasks that are filtered by the events > > have their children also be traced on fork (and then not traced on exit), > > convert the array into a pid bitmask. Most of the time the number of pids is > > only 32768 pids or a 4k bitmask, which is the same size as the default list > > currently is, and that list could grow if more pids are listed. > > > > This also greatly simplifies the code. > > The maximum PID number can be increased with sysctl. > > See "pid_max" in Documentation/sysctl/kernel.txt > > What happens when you have a very large pid_max set ? I discussed this with HPA, and it appears that the pid_max max would require a bitmap of about 1/2 meg (the current default is 8k). This is also why I chose to keep the bitmap as vmalloc and not a continuous page allocation. > > You say "most of the time" as if this was a fast-path vs a slow-path, > but it is not the case here. I meant "most of the time" as "default". Yes, you can make the pid_max really big, but in that case you better have enough memory in your system to handle that many threads. Thus a 1/2 meg used for tracking pids shouldn't be an issue. > > This is a configuration option that can significantly hurt memory usage > in configurations using a large pid_max. No, it is created dynamically. If you never write anything into the set_event_pid file, then you have nothing to worry about, as nothing is allocated. It creates the array when a pid is added to the file, and only then. If it fails to allocate, the write will return -ENOMEM as the errno. Again, if you have a large pid_max your box had better have a lot of memory to begin with, because this array will be negligible compared to the memory required to handle large number of tasks. 
> > FWIW, I implement a similar feature with a hash table in lttng-modules. > I don't have the child process tracking though, which is a neat improvement. I originally had a complex hash algorithm because I too was worried about the size of pid_max and using a bitmap, but HPA convinced me it was the way to go. -- Steve
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
On Tue, 19 Apr 2016 16:55:28 + (UTC) Mathieu Desnoyers wrote: > - On Apr 19, 2016, at 10:34 AM, rostedt rost...@goodmis.org wrote: > > > From: Steven Rostedt > > > > In order to add the ability to let tasks that are filtered by the events > > have their children also be traced on fork (and then not traced on exit), > > convert the array into a pid bitmask. Most of the time the number of pids is > > only 32768 pids or a 4k bitmask, which is the same size as the default list > > currently is, and that list could grow if more pids are listed. > > > > This also greatly simplifies the code. > > The maximum PID number can be increased with sysctl. > > See "pid_max" in Documentation/sysctl/kernel.txt > > What happens when you have a very large pid_max set ? I discussed this with HPA, and it appears that the pid_max max would require a bitmap of about 1/2 meg (the current default is 8k). This is also why I chose to keep the bitmap as vmalloc and not a continuous page allocation. > > You say "most of the time" as if this was a fast-path vs a slow-path, > but it is not the case here. I meant "most of the time" as "default". Yes, you can make the pid_max really big, but in that case you better have enough memory in your system to handle that many threads. Thus a 1/2 meg used for tracking pids shouldn't be an issue. > > This is a configuration option that can significantly hurt memory usage > in configurations using a large pid_max. No, it is created dynamically. If you never write anything into the set_event_pid file, then you have nothing to worry about, as nothing is allocated. It creates the array when a pid is added to the file, and only then. If it fails to allocate, the write will return -ENOMEM as the errno. Again, if you have a large pid_max your box had better have a lot of memory to begin with, because this array will be negligible compared to the memory required to handle large number of tasks. 
> > FWIW, I implement a similar feature with a hash table in lttng-modules. > I don't have the child process tracking though, which is a neat improvement. I originally had a complex hash algorithm because I too was worried about the size of pid_max and using a bitmap, but HPA convinced me it was the way to go. -- Steve
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
- On Apr 19, 2016, at 10:34 AM, rostedt rost...@goodmis.org wrote: > From: Steven Rostedt> > In order to add the ability to let tasks that are filtered by the events > have their children also be traced on fork (and then not traced on exit), > convert the array into a pid bitmask. Most of the time the number of pids is > only 32768 pids or a 4k bitmask, which is the same size as the default list > currently is, and that list could grow if more pids are listed. > > This also greatly simplifies the code. The maximum PID number can be increased with sysctl. See "pid_max" in Documentation/sysctl/kernel.txt What happens when you have a very large pid_max set ? You say "most of the time" as if this was a fast-path vs a slow-path, but it is not the case here. This is a configuration option that can significantly hurt memory usage in configurations using a large pid_max. FWIW, I implement a similar feature with a hash table in lttng-modules. I don't have the child process tracking though, which is a neat improvement. Thanks, Mathieu > > Suggested-by: "H. 
Peter Anvin" > Signed-off-by: Steven Rostedt > --- > kernel/trace/trace.h| 5 +- > kernel/trace/trace_events.c | 221 > 2 files changed, 102 insertions(+), 124 deletions(-) > > diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h > index 3fff4adfd431..68cbb8e10aea 100644 > --- a/kernel/trace/trace.h > +++ b/kernel/trace/trace.h > @@ -177,9 +177,8 @@ struct trace_options { > }; > > struct trace_pid_list { > - unsigned intnr_pids; > - int order; > - pid_t *pids; > + int pid_max; > + unsigned long *pids; > }; > > /* > diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c > index 598a18675a6b..45f7cc72bf25 100644 > --- a/kernel/trace/trace_events.c > +++ b/kernel/trace/trace_events.c > @@ -15,7 +15,7 @@ > #include > #include > #include > -#include > +#include > #include > #include > #include > @@ -471,23 +471,13 @@ static void ftrace_clear_events(struct trace_array *tr) > mutex_unlock(_mutex); > } > > -static int cmp_pid(const void *key, const void *elt) > -{ > - const pid_t *search_pid = key; > - const pid_t *pid = elt; > - > - if (*search_pid == *pid) > - return 0; > - if (*search_pid < *pid) > - return -1; > - return 1; > -} > +/* Shouldn't this be in a header? */ > +extern int pid_max; > > static bool > ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct > *task) > { > - pid_t search_pid; > - pid_t *pid; > + pid_t pid; > > /* >* Return false, because if filtered_pids does not exist, > @@ -496,15 +486,16 @@ ignore_this_task(struct trace_pid_list *filtered_pids, > struct task_struct *task) > if (!filtered_pids) > return false; > > - search_pid = task->pid; > + pid = task->pid; > > - pid = bsearch(_pid, filtered_pids->pids, > - filtered_pids->nr_pids, sizeof(pid_t), > - cmp_pid); > - if (!pid) > + /* > + * If pid_max changed after filtered_pids was created, we > + * by default ignore all pids greater than the previous pid_max. 
> + */ > + if (task->pid >= filtered_pids->pid_max) > return true; > > - return false; > + return !test_bit(task->pid, filtered_pids->pids); > } > > static void > @@ -602,7 +593,7 @@ static void __ftrace_clear_event_pids(struct trace_array > *tr) > /* Wait till all users are no longer using pid filtering */ > synchronize_sched(); > > - free_pages((unsigned long)pid_list->pids, pid_list->order); > + vfree(pid_list->pids); > kfree(pid_list); > } > > @@ -946,11 +937,32 @@ static void t_stop(struct seq_file *m, void *p) > mutex_unlock(_mutex); > } > > +static void * > +p_next(struct seq_file *m, void *v, loff_t *pos) > +{ > + struct trace_array *tr = m->private; > + struct trace_pid_list *pid_list = > rcu_dereference_sched(tr->filtered_pids); > + unsigned long pid = (unsigned long)v; > + > + (*pos)++; > + > + /* pid already is +1 of the actual prevous bit */ > + pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); > + > + /* Return pid + 1 to allow zero to be represented */ > + if (pid < pid_list->pid_max) > + return (void *)(pid + 1); > + > + return NULL; > +} > + > static void *p_start(struct seq_file *m, loff_t *pos) > __acquires(RCU) > { > struct trace_pid_list *pid_list; > struct trace_array *tr = m->private; > + unsigned long pid; > + loff_t l = 0; > > /* >* Grab the mutex, to keep calls to p_next() having the same > @@ -963,10 +975,18 @@ static void *p_start(struct seq_file *m, loff_t *pos)
Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
- On Apr 19, 2016, at 10:34 AM, rostedt rost...@goodmis.org wrote: > From: Steven Rostedt > > In order to add the ability to let tasks that are filtered by the events > have their children also be traced on fork (and then not traced on exit), > convert the array into a pid bitmask. Most of the time the number of pids is > only 32768 pids or a 4k bitmask, which is the same size as the default list > currently is, and that list could grow if more pids are listed. > > This also greatly simplifies the code. The maximum PID number can be increased with sysctl. See "pid_max" in Documentation/sysctl/kernel.txt What happens when you have a very large pid_max set ? You say "most of the time" as if this was a fast-path vs a slow-path, but it is not the case here. This is a configuration option that can significantly hurt memory usage in configurations using a large pid_max. FWIW, I implement a similar feature with a hash table in lttng-modules. I don't have the child process tracking though, which is a neat improvement. Thanks, Mathieu > > Suggested-by: "H. 
Peter Anvin" > Signed-off-by: Steven Rostedt > --- > kernel/trace/trace.h| 5 +- > kernel/trace/trace_events.c | 221 > 2 files changed, 102 insertions(+), 124 deletions(-) > > diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h > index 3fff4adfd431..68cbb8e10aea 100644 > --- a/kernel/trace/trace.h > +++ b/kernel/trace/trace.h > @@ -177,9 +177,8 @@ struct trace_options { > }; > > struct trace_pid_list { > - unsigned intnr_pids; > - int order; > - pid_t *pids; > + int pid_max; > + unsigned long *pids; > }; > > /* > diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c > index 598a18675a6b..45f7cc72bf25 100644 > --- a/kernel/trace/trace_events.c > +++ b/kernel/trace/trace_events.c > @@ -15,7 +15,7 @@ > #include > #include > #include > -#include > +#include > #include > #include > #include > @@ -471,23 +471,13 @@ static void ftrace_clear_events(struct trace_array *tr) > mutex_unlock(_mutex); > } > > -static int cmp_pid(const void *key, const void *elt) > -{ > - const pid_t *search_pid = key; > - const pid_t *pid = elt; > - > - if (*search_pid == *pid) > - return 0; > - if (*search_pid < *pid) > - return -1; > - return 1; > -} > +/* Shouldn't this be in a header? */ > +extern int pid_max; > > static bool > ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct > *task) > { > - pid_t search_pid; > - pid_t *pid; > + pid_t pid; > > /* >* Return false, because if filtered_pids does not exist, > @@ -496,15 +486,16 @@ ignore_this_task(struct trace_pid_list *filtered_pids, > struct task_struct *task) > if (!filtered_pids) > return false; > > - search_pid = task->pid; > + pid = task->pid; > > - pid = bsearch(_pid, filtered_pids->pids, > - filtered_pids->nr_pids, sizeof(pid_t), > - cmp_pid); > - if (!pid) > + /* > + * If pid_max changed after filtered_pids was created, we > + * by default ignore all pids greater than the previous pid_max. 
> + */ > + if (task->pid >= filtered_pids->pid_max) > return true; > > - return false; > + return !test_bit(task->pid, filtered_pids->pids); > } > > static void > @@ -602,7 +593,7 @@ static void __ftrace_clear_event_pids(struct trace_array > *tr) > /* Wait till all users are no longer using pid filtering */ > synchronize_sched(); > > - free_pages((unsigned long)pid_list->pids, pid_list->order); > + vfree(pid_list->pids); > kfree(pid_list); > } > > @@ -946,11 +937,32 @@ static void t_stop(struct seq_file *m, void *p) > mutex_unlock(_mutex); > } > > +static void * > +p_next(struct seq_file *m, void *v, loff_t *pos) > +{ > + struct trace_array *tr = m->private; > + struct trace_pid_list *pid_list = > rcu_dereference_sched(tr->filtered_pids); > + unsigned long pid = (unsigned long)v; > + > + (*pos)++; > + > + /* pid already is +1 of the actual prevous bit */ > + pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); > + > + /* Return pid + 1 to allow zero to be represented */ > + if (pid < pid_list->pid_max) > + return (void *)(pid + 1); > + > + return NULL; > +} > + > static void *p_start(struct seq_file *m, loff_t *pos) > __acquires(RCU) > { > struct trace_pid_list *pid_list; > struct trace_array *tr = m->private; > + unsigned long pid; > + loff_t l = 0; > > /* >* Grab the mutex, to keep calls to p_next() having the same > @@ -963,10 +975,18 @@ static void *p_start(struct seq_file *m, loff_t *pos) > > pid_list =
[RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
From: Steven RostedtIn order to add the ability to let tasks that are filtered by the events have their children also be traced on fork (and then not traced on exit), convert the array into a pid bitmask. Most of the time the number of pids is only 32768 pids or a 4k bitmask, which is the same size as the default list currently is, and that list could grow if more pids are listed. This also greatly simplifies the code. Suggested-by: "H. Peter Anvin" Signed-off-by: Steven Rostedt --- kernel/trace/trace.h| 5 +- kernel/trace/trace_events.c | 221 2 files changed, 102 insertions(+), 124 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3fff4adfd431..68cbb8e10aea 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -177,9 +177,8 @@ struct trace_options { }; struct trace_pid_list { - unsigned intnr_pids; - int order; - pid_t *pids; + int pid_max; + unsigned long *pids; }; /* diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 598a18675a6b..45f7cc72bf25 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include @@ -471,23 +471,13 @@ static void ftrace_clear_events(struct trace_array *tr) mutex_unlock(_mutex); } -static int cmp_pid(const void *key, const void *elt) -{ - const pid_t *search_pid = key; - const pid_t *pid = elt; - - if (*search_pid == *pid) - return 0; - if (*search_pid < *pid) - return -1; - return 1; -} +/* Shouldn't this be in a header? 
*/ +extern int pid_max; static bool ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) { - pid_t search_pid; - pid_t *pid; + pid_t pid; /* * Return false, because if filtered_pids does not exist, @@ -496,15 +486,16 @@ ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) if (!filtered_pids) return false; - search_pid = task->pid; + pid = task->pid; - pid = bsearch(_pid, filtered_pids->pids, - filtered_pids->nr_pids, sizeof(pid_t), - cmp_pid); - if (!pid) + /* +* If pid_max changed after filtered_pids was created, we +* by default ignore all pids greater than the previous pid_max. +*/ + if (task->pid >= filtered_pids->pid_max) return true; - return false; + return !test_bit(task->pid, filtered_pids->pids); } static void @@ -602,7 +593,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr) /* Wait till all users are no longer using pid filtering */ synchronize_sched(); - free_pages((unsigned long)pid_list->pids, pid_list->order); + vfree(pid_list->pids); kfree(pid_list); } @@ -946,11 +937,32 @@ static void t_stop(struct seq_file *m, void *p) mutex_unlock(_mutex); } +static void * +p_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_array *tr = m->private; + struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); + unsigned long pid = (unsigned long)v; + + (*pos)++; + + /* pid already is +1 of the actual prevous bit */ + pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); + + /* Return pid + 1 to allow zero to be represented */ + if (pid < pid_list->pid_max) + return (void *)(pid + 1); + + return NULL; +} + static void *p_start(struct seq_file *m, loff_t *pos) __acquires(RCU) { struct trace_pid_list *pid_list; struct trace_array *tr = m->private; + unsigned long pid; + loff_t l = 0; /* * Grab the mutex, to keep calls to p_next() having the same @@ -963,10 +975,18 @@ static void *p_start(struct seq_file *m, loff_t *pos) pid_list = 
rcu_dereference_sched(tr->filtered_pids); - if (!pid_list || *pos >= pid_list->nr_pids) + if (!pid_list) + return NULL; + + pid = find_first_bit(pid_list->pids, pid_list->pid_max); + if (pid >= pid_list->pid_max) return NULL; - return (void *)&pid_list->pids[*pos]; + /* Return pid + 1 so that zero can be the exit value */ + for (pid++; pid && l < *pos; +pid = (unsigned long)p_next(m, (void *)pid, &l)) + ; + return (void *)pid; } static void p_stop(struct seq_file *m, void *p) @@ -976,25 +996,11 @@ static void p_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } -static void * -p_next(struct
[RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid
From: Steven Rostedt In order to add the ability to let tasks that are filtered by the events have their children also be traced on fork (and then not traced on exit), convert the array into a pid bitmask. Most of the time the number of pids is only 32768 pids or a 4k bitmask, which is the same size as the default list currently is, and that list could grow if more pids are listed. This also greatly simplifies the code. Suggested-by: "H. Peter Anvin" Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 5 +- kernel/trace/trace_events.c | 221 2 files changed, 102 insertions(+), 124 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3fff4adfd431..68cbb8e10aea 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -177,9 +177,8 @@ struct trace_options { }; struct trace_pid_list { - unsigned int nr_pids; - int order; - pid_t *pids; + int pid_max; + unsigned long *pids; }; /* diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 598a18675a6b..45f7cc72bf25 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include @@ -471,23 +471,13 @@ static void ftrace_clear_events(struct trace_array *tr) mutex_unlock(&event_mutex); } -static int cmp_pid(const void *key, const void *elt) -{ - const pid_t *search_pid = key; - const pid_t *pid = elt; - - if (*search_pid == *pid) - return 0; - if (*search_pid < *pid) - return -1; - return 1; -} +/* Shouldn't this be in a header? 
*/ +extern int pid_max; static bool ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) { - pid_t search_pid; - pid_t *pid; + pid_t pid; /* * Return false, because if filtered_pids does not exist, @@ -496,15 +486,16 @@ ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task) if (!filtered_pids) return false; - search_pid = task->pid; + pid = task->pid; - pid = bsearch(&search_pid, filtered_pids->pids, - filtered_pids->nr_pids, sizeof(pid_t), - cmp_pid); - if (!pid) + /* +* If pid_max changed after filtered_pids was created, we +* by default ignore all pids greater than the previous pid_max. +*/ + if (task->pid >= filtered_pids->pid_max) return true; - return false; + return !test_bit(task->pid, filtered_pids->pids); } static void @@ -602,7 +593,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr) /* Wait till all users are no longer using pid filtering */ synchronize_sched(); - free_pages((unsigned long)pid_list->pids, pid_list->order); + vfree(pid_list->pids); kfree(pid_list); } @@ -946,11 +937,32 @@ static void t_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } +static void * +p_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_array *tr = m->private; + struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids); + unsigned long pid = (unsigned long)v; + + (*pos)++; + + /* pid already is +1 of the actual prevous bit */ + pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); + + /* Return pid + 1 to allow zero to be represented */ + if (pid < pid_list->pid_max) + return (void *)(pid + 1); + + return NULL; +} + static void *p_start(struct seq_file *m, loff_t *pos) __acquires(RCU) { struct trace_pid_list *pid_list; struct trace_array *tr = m->private; + unsigned long pid; + loff_t l = 0; /* * Grab the mutex, to keep calls to p_next() having the same @@ -963,10 +975,18 @@ static void *p_start(struct seq_file *m, loff_t *pos) pid_list = 
rcu_dereference_sched(tr->filtered_pids); - if (!pid_list || *pos >= pid_list->nr_pids) + if (!pid_list) + return NULL; + + pid = find_first_bit(pid_list->pids, pid_list->pid_max); + if (pid >= pid_list->pid_max) return NULL; - return (void *)&pid_list->pids[*pos]; + /* Return pid + 1 so that zero can be the exit value */ + for (pid++; pid && l < *pos; +pid = (unsigned long)p_next(m, (void *)pid, &l)) + ; + return (void *)pid; } static void p_stop(struct seq_file *m, void *p) @@ -976,25 +996,11 @@ static void p_stop(struct seq_file *m, void *p) mutex_unlock(&event_mutex); } -static void * -p_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct trace_array