Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid

2016-04-22 Thread Steven Rostedt
On Fri, 22 Apr 2016 11:45:30 +0900
Namhyung Kim  wrote:

> > +   pid_list->pid_max = READ_ONCE(pid_max);
> > +   /* Only truncating will shrink pid_max */
> > +   if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
> > +   pid_list->pid_max = filtered_pids->pid_max;
> > +   pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
> > +   if (!pid_list->pids) {
> > +   kfree(pid_list);
> > +   read = -ENOMEM;
> > +   goto out;
> > +   }
> > +   if (filtered_pids) {
> > +   /* copy the current bits to the new max */
> > +   pid = find_first_bit(filtered_pids->pids,
> > +filtered_pids->pid_max);
> > +   while (pid < filtered_pids->pid_max) {
> > +   set_bit(pid, pid_list->pids);
> > +   pid = find_next_bit(filtered_pids->pids,
> > +   filtered_pids->pid_max,
> > +   pid + 1);
> > +   nr_pids++;
> > +   }  
> 
> Why not just use memcpy and keep nr_pids in the pid_list?

This is the slow path (very slow ;-), and this was the first method
that came to my mind (while I programmed this during a conference). I
could use memcpy(), or simply one of the bitmap copy helpers, and then
get nr_pids from bitmap_weight(). I would not keep nr_pids in pid_list
because that would mean I would have to manage it in the fast path.

Maybe later I'll convert it to bitmap_copy(), but for now I'll just
keep it as is. I've moved this code into my queue for 4.8, and don't
want to deal with conflicts unless a real bug is discovered.
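
Something like this (an untested sketch, using the <linux/bitmap.h>
helpers):

	/* Copy the old filtered pids into the new bitmap, then count them */
	bitmap_copy(pid_list->pids, filtered_pids->pids,
		    filtered_pids->pid_max);
	nr_pids = bitmap_weight(pid_list->pids, filtered_pids->pid_max);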

Thanks for looking at this code!

-- Steve



Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid

2016-04-21 Thread Namhyung Kim
Hi Steve,

On Tue, Apr 19, 2016 at 10:34:23AM -0400, Steven Rostedt wrote:
> From: Steven Rostedt 
> 
> In order to add the ability to let tasks that are filtered by the events
> have their children also be traced on fork (and then not traced on exit),
> convert the array into a pid bitmask. Most of the time the number of pids is
> only 32768 pids or a 4k bitmask, which is the same size as the default list
> currently is, and that list could grow if more pids are listed.
> 
> This also greatly simplifies the code.
> 
> Suggested-by: "H. Peter Anvin" 
> Signed-off-by: Steven Rostedt 
> ---

[SNIP]
> @@ -1571,7 +1572,7 @@ ftrace_event_pid_write(struct file *filp, const char 
> __user *ubuf,
>   struct seq_file *m = filp->private_data;
>   struct trace_array *tr = m->private;
>   struct trace_pid_list *filtered_pids = NULL;
> - struct trace_pid_list *pid_list = NULL;
> + struct trace_pid_list *pid_list;
>   struct trace_event_file *file;
>   struct trace_parser parser;
>   unsigned long val;
> @@ -1579,7 +1580,7 @@ ftrace_event_pid_write(struct file *filp, const char 
> __user *ubuf,
>   ssize_t read = 0;
>   ssize_t ret = 0;
>   pid_t pid;
> - int i;
> + int nr_pids = 0;
>  
>   if (!cnt)
>   return 0;
> @@ -1592,10 +1593,43 @@ ftrace_event_pid_write(struct file *filp, const char 
> __user *ubuf,
>   return -ENOMEM;
>  
>   mutex_lock(&event_mutex);
> + filtered_pids = rcu_dereference_protected(tr->filtered_pids,
> +  lockdep_is_held(&event_mutex));
> +
>   /*
> -  * Load as many pids into the array before doing a
> -  * swap from the tr->filtered_pids to the new list.
> +  * Always recreate a new array. The write is an all or nothing
> +  * operation. Always create a new array when adding new pids by
> +  * the user. If the operation fails, then the current list is
> +  * not modified.
>*/
> + pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
> + if (!pid_list) {
> + read = -ENOMEM;
> + goto out;
> + }
> + pid_list->pid_max = READ_ONCE(pid_max);
> + /* Only truncating will shrink pid_max */
> + if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max)
> + pid_list->pid_max = filtered_pids->pid_max;
> + pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
> + if (!pid_list->pids) {
> + kfree(pid_list);
> + read = -ENOMEM;
> + goto out;
> + }
> + if (filtered_pids) {
> + /* copy the current bits to the new max */
> + pid = find_first_bit(filtered_pids->pids,
> +  filtered_pids->pid_max);
> + while (pid < filtered_pids->pid_max) {
> + set_bit(pid, pid_list->pids);
> + pid = find_next_bit(filtered_pids->pids,
> + filtered_pids->pid_max,
> + pid + 1);
> + nr_pids++;
> + }

Why not just use memcpy and keep nr_pids in the pid_list?
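
Concretely, the suggestion amounts to something like this sketch
(assuming nr_pids were kept in the struct as proposed; BITS_TO_LONGS()
rounds the bit count up to whole longs):

	/* Whole-word copy of the old bitmap instead of a bit-by-bit loop */
	memcpy(pid_list->pids, filtered_pids->pids,
	       BITS_TO_LONGS(filtered_pids->pid_max) * sizeof(long));
	pid_list->nr_pids = filtered_pids->nr_pids;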

Thanks,
Namhyung



> + }
> +
>   while (cnt > 0) {
>  
>   this_pos = 0;
> @@ -1613,92 +1647,35 @@ ftrace_event_pid_write(struct file *filp, const char 
> __user *ubuf,
>   ret = -EINVAL;
>   if (kstrtoul(parser.buffer, 0, &val))
>   break;
> - if (val > INT_MAX)
> + if (val >= pid_list->pid_max)
>   break;
>  
>   pid = (pid_t)val;
>  
> - ret = -ENOMEM;
> - if (!pid_list) {
> - pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
> - if (!pid_list)
> - break;
> -
> - filtered_pids = rcu_dereference_protected(tr->filtered_pids,
> - lockdep_is_held(&event_mutex));
> - if (filtered_pids)
> - pid_list->order = filtered_pids->order;
> - else
> - pid_list->order = 0;
> -
> - pid_list->pids = (void *)__get_free_pages(GFP_KERNEL,
> -   pid_list->order);
> - if (!pid_list->pids)
> - break;
> -
> - if (filtered_pids) {
> - pid_list->nr_pids = filtered_pids->nr_pids;
> - memcpy(pid_list->pids, filtered_pids->pids,
> -pid_list->nr_pids * sizeof(pid_t));
> - } else
> - pid_list->nr_pids = 0;
> - }
> -
> - if (pid_list->nr_pids >= max_pids(pid_list)) {
> - pid_t *pid_page;
> -
> - pid_page = 

Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid

2016-04-19 Thread Steven Rostedt
On Tue, 19 Apr 2016 22:59:14 + (UTC)
Mathieu Desnoyers  wrote:

> - On Apr 19, 2016, at 6:49 PM, rostedt rost...@goodmis.org wrote:
> 
> > On Tue, 19 Apr 2016 21:22:21 + (UTC)
> > Mathieu Desnoyers  wrote:
> >   
> >> It makes sense. Anyway, looking back at my own implementation, I have
> >> an array of 64 hlist_head entries (64 * 8 = 512 bytes), typically
> >> populated by NULL pointers. It's only a factor 8 smaller than the
> >> bitmap, so it's not a huge gain.  
> > 
> > Actually we talked about a second level bitmap for quicker searchers. I
> > can't remember what it was called, but I'm sure HPA can ;-)
> > 
> > Basically it was a much smaller bitmap, where each bit represents a
> > number of bits in the main bitmap. When a bit is set in the main
> > bitmap, its corresponding bit is set in the smaller one. This means
> > that if you don't have many PIDs, the smaller bitmap wont have many
> > bits set either, and you keep all the checks very cache local, because
> > you are checking the smaller bitmap most of the time. But this too
> > makes things more complex, especially when clearing a bit (although,
> > that only happens on exit, where speed isn't a big deal). But we
> > decided it still wasn't worth it.  
> 
> Seems like an interesting possible improvement if ever needed.
> 

One reason we decided against it is the typical use case: if you are
tracing a single task, and other tasks are created around it in the
same time period, the pids of those running tasks will be close to the
pid of the traced task. That means the corresponding bit in the smaller
array will most likely always be set, and now you are taking two cache
hits to find the pid you are looking for, without gaining anything.

-- Steve


Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid

2016-04-19 Thread Mathieu Desnoyers
- On Apr 19, 2016, at 6:49 PM, rostedt rost...@goodmis.org wrote:

> On Tue, 19 Apr 2016 21:22:21 + (UTC)
> Mathieu Desnoyers  wrote:
> 
>> It makes sense. Anyway, looking back at my own implementation, I have
>> an array of 64 hlist_head entries (64 * 8 = 512 bytes), typically
>> populated by NULL pointers. It's only a factor 8 smaller than the
>> bitmap, so it's not a huge gain.
> 
> Actually we talked about a second level bitmap for quicker searchers. I
> can't remember what it was called, but I'm sure HPA can ;-)
> 
> Basically it was a much smaller bitmap, where each bit represents a
> number of bits in the main bitmap. When a bit is set in the main
> bitmap, its corresponding bit is set in the smaller one. This means
> that if you don't have many PIDs, the smaller bitmap wont have many
> bits set either, and you keep all the checks very cache local, because
> you are checking the smaller bitmap most of the time. But this too
> makes things more complex, especially when clearing a bit (although,
> that only happens on exit, where speed isn't a big deal). But we
> decided it still wasn't worth it.

Seems like an interesting possible improvement if ever needed.

> 
>> 
>> One alternative approach would be to keep a few entries (e.g. 16 PIDs)
>> in a fast-path lookup array that fits in a single-cache line. When the
>> number of PIDs to track go beyond that, fall-back to the bitmap instead.
>> 
>> > 
>> > Note, that the check of the bitmap to trace a task or not is not done
>> > at every tracepoint. It's only done at sched_switch, and then an
>> > internal flag is set. That flag will determine if the event should be
>> > traced, and that is a single bit checked all the time (very good for
>> > cache).
>> 
>> Could this be used by multiple tracers, and used in a multi-session scheme ?
>> In lttng, one user may want to track a set of PIDs, whereas another user may
>> be concurrently interested by another set.
> 
> I should specify, the bit isn't in the task struct, because different
> trace instances may have different criteria to what task may be traced.
> That is, you can have multiple buffers tracing multiple tasks. The
> tracepoint has a private data structure attached to it that is added
> when a tracepoint is registered. This data is a descriptor that
> represents the trace instance. This instance descriptor has a flag to
> ignore or trace the task.
> 
> 
>> 
>> Currently, in the lttng kernel tracer, we do the hash table query for
>> each tracepoint hit, which is clearly not as efficient as checking a
>> task struct flag. One option I see would be to set the task struct flag
>> whenever there is at least one tracer/tracing session that is interested
>> in this event (this would end up being a reference count on the flag). Then,
>> for every flag check that passes, lttng could do HT/bitmap lookups to see if
>> the event needs to go to each specific session.
>> 
>> Is this task struct "trace" flag currently exposed to tracers through a
>> reference-counting enable/disable API ? If not, do you think it would make
>> sense ?
>> 
> 
> Nope. As I said, it's on my own descriptor that is passed through the
> tracepoint private data.
> 
> If you look at my code, you'll notice that I pass around a
> "trace_array" struct (tr), which represents all the information about a
> single trace instance. What tracer is running, what events are enabled,
> and even keeps track of the file descriptors holding the trace event
> information. This "tr" has a per_cpu "buffer" section that contains per
> cpu data (like the ring buffer). It also has a "data" section for
> miscellaneous fields. One of them is now "ignore" which is set when
> filtering is on and the sched_switch event noticed that the new task
> shouldn't be traced for this instance.
> 
> If there's multiple instances, then there will be multiple callbacks
> done at each sched_switch. One for each instance.

Got it. I'd have to extend the buffer structures available within each
session to add this extra flag, and update it for each session from the
fork/exit events to match the per-session bitmap, as well as whenever the
bitmap is modified.

Thanks for the explanation! :)

Mathieu

> 
> -- Steve

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid

2016-04-19 Thread Steven Rostedt
On Tue, 19 Apr 2016 21:22:21 + (UTC)
Mathieu Desnoyers  wrote:

> It makes sense. Anyway, looking back at my own implementation, I have
> an array of 64 hlist_head entries (64 * 8 = 512 bytes), typically
> populated by NULL pointers. It's only a factor 8 smaller than the
> bitmap, so it's not a huge gain.

Actually we talked about a second-level bitmap for quicker searches. I
can't remember what it was called, but I'm sure HPA can ;-)

Basically it was a much smaller bitmap, where each bit represents a
number of bits in the main bitmap. When a bit is set in the main
bitmap, its corresponding bit is set in the smaller one. This means
that if you don't have many PIDs, the smaller bitmap won't have many
bits set either, and you keep all the checks very cache local, because
you are checking the smaller bitmap most of the time. But this too
makes things more complex, especially when clearing a bit (although
that only happens on exit, where speed isn't a big deal). But we
decided it still wasn't worth it.
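
As a rough sketch of the idea (hypothetical; this was never
implemented), with one summary bit per word of the main bitmap:

	struct two_level_pids {
		unsigned long *summary;	/* one bit per word of @pids */
		unsigned long *pids;	/* the main pid bitmap */
	};

	static bool two_level_test(struct two_level_pids *map, pid_t pid)
	{
		/* The small summary bitmap stays cache local */
		if (!test_bit(pid / BITS_PER_LONG, map->summary))
			return false;
		/* Touch the big bitmap only when the word may be occupied */
		return test_bit(pid, map->pids);
	}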

> 
> One alternative approach would be to keep a few entries (e.g. 16 PIDs)
> in a fast-path lookup array that fits in a single-cache line. When the
> number of PIDs to track go beyond that, fall-back to the bitmap instead.
> 
> > 
> > Note, that the check of the bitmap to trace a task or not is not done
> > at every tracepoint. It's only done at sched_switch, and then an
> > internal flag is set. That flag will determine if the event should be
> > traced, and that is a single bit checked all the time (very good for
> > cache).  
> 
> Could this be used by multiple tracers, and used in a multi-session scheme ?
> In lttng, one user may want to track a set of PIDs, whereas another user may
> be concurrently interested by another set.

I should specify, the bit isn't in the task struct, because different
trace instances may have different criteria for which tasks may be
traced. That is, you can have multiple buffers tracing multiple tasks.
The tracepoint has a private data structure attached to it that is
added when a tracepoint is registered. This data is a descriptor that
represents the trace instance. This instance descriptor has a flag to
ignore or trace the task.


> 
> Currently, in the lttng kernel tracer, we do the hash table query for
> each tracepoint hit, which is clearly not as efficient as checking a
> task struct flag. One option I see would be to set the task struct flag
> whenever there is at least one tracer/tracing session that is interested
> in this event (this would end up being a reference count on the flag). Then,
> for every flag check that passes, lttng could do HT/bitmap lookups to see if
> the event needs to go to each specific session.
> 
> Is this task struct "trace" flag currently exposed to tracers through a
> reference-counting enable/disable API ? If not, do you think it would make
> sense ?
> 

Nope. As I said, it's on my own descriptor that is passed through the
tracepoint private data.

If you look at my code, you'll notice that I pass around a
"trace_array" struct (tr), which represents all the information about
a single trace instance: what tracer is running, what events are
enabled, and even the file descriptors holding the trace event
information. This "tr" has a per_cpu "buffer" section that contains
per-cpu data (like the ring buffer). It also has a "data" section for
miscellaneous fields. One of them is now "ignore", which is set when
filtering is on and the sched_switch event notices that the new task
shouldn't be traced for this instance.

If there are multiple instances, then there will be multiple callbacks
done at each sched_switch, one for each instance.
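
In pseudo form, the per-event fast path then reduces to roughly this
(the "ignore" field name comes from the description above; the exact
code differs):

	/* Per-event fast path: test only the cached per-instance flag */
	if (this_cpu_read(data->ignore))
		return;		/* skip recording for this instance */
	/* ... record the event into this instance's ring buffer ... */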

-- Steve


Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid

2016-04-19 Thread Mathieu Desnoyers
- On Apr 19, 2016, at 4:50 PM, rostedt rost...@goodmis.org wrote:

> On Tue, 19 Apr 2016 20:17:29 + (UTC)
> Mathieu Desnoyers  wrote:
> 
> 
>> Ah indeed, since there is a hard limit to 4194304, that makes the
>> worse case bitmap 512k.
> 
> Yep.
> 
>> 
>> We could argue that given a sparse dataset in the PID table (typical
>> in our use-cases), a small hash table would have better cache locality
>> than the bitmap. But I agree that the hash table does add a bit of
>> complexity, so it becomes a complexity vs cache locality tradeoff.
>> So I understand why you would want to go for the simpler bitmap
>> solution, unless the hash table would prove to bring a measurable
>> performance improvement.
> 
> We discussed this too (cache locality), and came to the same conclusion
> that a bitmask would still be better. If you think about it, if you
> have a lot of CPUs and lots of PIDs, tasks don't migrate as much, and
> if they do, cache locality of this bitmap will be the least of the
> performance issues. Then you have a limited amount of PIDs per CPU, and
> thus those PIDs will probably be in the CPU cache for the bitmap.

It makes sense. Anyway, looking back at my own implementation, I have
an array of 64 hlist_head entries (64 * 8 = 512 bytes), typically
populated by NULL pointers. It's only a factor of 8 smaller than the
bitmap, so it's not a huge gain.

One alternative approach would be to keep a few entries (e.g. 16 PIDs)
in a fast-path lookup array that fits in a single cache line. When the
number of PIDs to track goes beyond that, fall back to the bitmap
instead.
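
A sketch of that hybrid (hypothetical, not implemented anywhere):

	#define NR_FAST_PIDS	16	/* 16 * sizeof(pid_t) = 64 bytes */

	struct pid_filter {
		int nr_fast;		/* set to -1 once spilled */
		pid_t fast[NR_FAST_PIDS];
		unsigned long *bitmap;	/* fallback, sized by pid_max */
	};

	static bool pid_filter_test(struct pid_filter *f, pid_t pid)
	{
		int i;

		if (f->nr_fast >= 0) {
			for (i = 0; i < f->nr_fast; i++)
				if (f->fast[i] == pid)
					return true;
			return false;
		}
		return test_bit(pid, f->bitmap);
	}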

> 
> Note, that the check of the bitmap to trace a task or not is not done
> at every tracepoint. It's only done at sched_switch, and then an
> internal flag is set. That flag will determine if the event should be
> traced, and that is a single bit checked all the time (very good for
> cache).

Could this be used by multiple tracers, and used in a multi-session
scheme? In lttng, one user may want to track a set of PIDs, whereas
another user may be concurrently interested in another set.

Currently, in the lttng kernel tracer, we do the hash table query for
each tracepoint hit, which is clearly not as efficient as checking a
task struct flag. One option I see would be to set the task struct flag
whenever there is at least one tracer/tracing session that is interested
in this event (this would end up being a reference count on the flag).
Then, for every flag check that passes, lttng could do HT/bitmap lookups
to see if the event needs to go to each specific session.

Is this task struct "trace" flag currently exposed to tracers through a
reference-counting enable/disable API? If not, do you think it would
make sense?

Thanks,

Mathieu

> 
> -- Steve
> 

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid

2016-04-19 Thread Steven Rostedt
On Tue, 19 Apr 2016 20:17:29 + (UTC)
Mathieu Desnoyers  wrote:


> Ah indeed, since there is a hard limit to 4194304, that makes the
> worse case bitmap 512k.

Yep.

> 
> We could argue that given a sparse dataset in the PID table (typical
> in our use-cases), a small hash table would have better cache locality
> than the bitmap. But I agree that the hash table does add a bit of
> complexity, so it becomes a complexity vs cache locality tradeoff.
> So I understand why you would want to go for the simpler bitmap
> solution, unless the hash table would prove to bring a measurable
> performance improvement.

We discussed this too (cache locality), and came to the same conclusion
that a bitmask would still be better. If you think about it, if you
have a lot of CPUs and lots of PIDs, tasks don't migrate as much, and
if they do, cache locality of this bitmap will be the least of the
performance issues. Then you have a limited number of PIDs per CPU, and
thus those PIDs will probably be in the CPU cache for the bitmap.

Note that the check of the bitmap to trace a task or not is not done
at every tracepoint. It's only done at sched_switch, and then an
internal flag is set. That flag determines whether the event should be
traced, and that is a single bit checked all the time (very good for
cache).
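
The sched_switch side is conceptually just this (a sketch; the probe
name and exact signature here are illustrative):

	static void event_filter_pid_sched_switch(void *data,
						  struct task_struct *prev,
						  struct task_struct *next)
	{
		struct trace_array *tr = data;
		struct trace_pid_list *pid_list;

		pid_list = rcu_dereference_sched(tr->filtered_pids);
		/* Cache the decision so each event tests only one flag */
		this_cpu_write(tr->trace_buffer.data->ignore_pid,
			       ignore_this_task(pid_list, next));
	}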

-- Steve




Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid

2016-04-19 Thread Mathieu Desnoyers
- On Apr 19, 2016, at 3:41 PM, rostedt rost...@goodmis.org wrote:

> On Tue, 19 Apr 2016 11:57:32 -0700
> "H. Peter Anvin"  wrote:
> 
>> Also, I understand there is one of these bitmaps per ring buffer, and
>> the ring buffer is in the tens of megabytes.
> 
> Right, there's only one bitmap per tracing instance, which in most
> cases is just one (I know of people who make more). And by default, the
> tracing buffer is 1.4 megs per CPU.
> 
> If you have a pid_max of the max size, I highly doubt you will be doing
> that on a single CPU machine. If you have 48 CPUs, the ring buffer will
> be 1.4 * 48 megs, making the 1/2 meg bitmap a nit.
> 
> I will say, there may be two bitmaps soon, because I plan on adding
> this same code to the function tracer logic.

Ah indeed, since there is a hard limit of 4194304, that makes the
worst-case bitmap 512k (4194304 bits / 8 = 524288 bytes).

We could argue that, given a sparse dataset in the PID table (typical
in our use cases), a small hash table would have better cache locality
than the bitmap. But I agree that the hash table does add a bit of
complexity, so it becomes a complexity vs. cache-locality tradeoff.
So I understand why you would want to go for the simpler bitmap
solution, unless the hash table proves to bring a measurable
performance improvement.

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid

2016-04-19 Thread Steven Rostedt
On Tue, 19 Apr 2016 11:57:32 -0700
"H. Peter Anvin"  wrote:

> Also, I understand there is one of these bitmaps per ring buffer, and
> the ring buffer is in the tens of megabytes.

Right, there's only one bitmap per tracing instance, and in most cases
there is just one instance (I know of people who create more). And by
default, the tracing buffer is 1.4 megs per CPU.

If you have a pid_max of the max size, I highly doubt you will be doing
that on a single CPU machine. If you have 48 CPUs, the ring buffer will
be 1.4 * 48 megs, making the 1/2 meg bitmap a nit.

I will say, there may be two bitmaps soon, because I plan on adding
this same code to the function tracer logic.

-- Steve


Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid

2016-04-19 Thread H. Peter Anvin
On April 19, 2016 10:19:47 AM PDT, Steven Rostedt  wrote:
>On Tue, 19 Apr 2016 16:55:28 + (UTC)
>Mathieu Desnoyers  wrote:
>
>> - On Apr 19, 2016, at 10:34 AM, rostedt rost...@goodmis.org
>wrote:
>> 
>> > From: Steven Rostedt 
>> > 
>> > In order to add the ability to let tasks that are filtered by the
>events
>> > have their children also be traced on fork (and then not traced on
>exit),
>> > convert the array into a pid bitmask. Most of the time the number
>of pids is
>> > only 32768 pids or a 4k bitmask, which is the same size as the
>default list
>> > currently is, and that list could grow if more pids are listed.
>> > 
>> > This also greatly simplifies the code.  
>> 
>> The maximum PID number can be increased with sysctl.
>> 
>> See "pid_max" in Documentation/sysctl/kernel.txt
>> 
>> What happens when you have a very large pid_max set ?
>
>I discussed this with HPA, and it appears that the pid_max max would
>require a bitmap of about 1/2 meg (the current default is 8k). This is
>also why I chose to keep the bitmap as vmalloc and not a continuous
>page allocation.
>
>> 
>> You say "most of the time" as if this was a fast-path vs a slow-path,
>> but it is not the case here.
>
>I meant "most of the time" as "default". Yes, you can make the pid_max
>really big, but in that case you better have enough memory in your
>system to handle that many threads. Thus a 1/2 meg used for tracking
>pids shouldn't be an issue.
>
>> 
>> This is a configuration option that can significantly hurt memory
>usage
>> in configurations using a large pid_max.
>
>No, it is created dynamically. If you never write anything into the
>set_event_pid file, then you have nothing to worry about, as nothing
>is allocated. It creates the array when a pid is added to the file, and
>only then. If it fails to allocate, the write will return -ENOMEM as
>the
>errno.
>
>Again, if you have a large pid_max your box had better have a lot of
>memory to begin with, because this array will be negligible compared to
>the memory required to handle large number of tasks.
>
>> 
>> FWIW, I implement a similar feature with a hash table in
>lttng-modules.
>> I don't have the child process tracking though, which is a neat
>improvement.
>
>I originally had a complex hash algorithm because I too was worried
>about the size of pid_max and using a bitmap, but HPA convinced me it
>was the way to go.
>
>-- Steve

Also, I understand there is one of these bitmaps per ring buffer, and the ring 
buffer is in the tens of megabytes.
-- 
Sent from my Android device with K-9 Mail. Please excuse brevity and formatting.


Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid

2016-04-19 Thread Steven Rostedt
On Tue, 19 Apr 2016 16:55:28 + (UTC)
Mathieu Desnoyers  wrote:

> - On Apr 19, 2016, at 10:34 AM, rostedt rost...@goodmis.org wrote:
> 
> > From: Steven Rostedt 
> > 
> > In order to add the ability to let tasks that are filtered by the events
> > have their children also be traced on fork (and then not traced on exit),
> > convert the array into a pid bitmask. Most of the time the number of pids is
> > only 32768 pids or a 4k bitmask, which is the same size as the default list
> > currently is, and that list could grow if more pids are listed.
> > 
> > This also greatly simplifies the code.  
> 
> The maximum PID number can be increased with sysctl.
> 
> See "pid_max" in Documentation/sysctl/kernel.txt
> 
> What happens when you have a very large pid_max set ?

I discussed this with HPA, and it appears that the maximum pid_max
would require a bitmap of about 1/2 meg (the current default is 4k).
This is also why I chose to keep the bitmap as vmalloc and not a
contiguous page allocation.
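
From the patch itself, the allocation side is simply (one bit per
possible pid, rounded up to bytes):

	pid_list->pid_max = READ_ONCE(pid_max);
	/* 4194304 bits at the maximum pid_max -> 512 KiB worst case */
	pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
	if (!pid_list->pids) {
		/* sketch: the actual code also frees pid_list here */
		return -ENOMEM;
	}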

> 
> You say "most of the time" as if this was a fast-path vs a slow-path,
> but it is not the case here.

I meant "most of the time" as "default". Yes, you can make the pid_max
really big, but in that case you better have enough memory in your
system to handle that many threads. Thus a 1/2 meg used for tracking
pids shouldn't be an issue.

> 
> This is a configuration option that can significantly hurt memory usage
> in configurations using a large pid_max.

No, it is created dynamically. If you never write anything into the
set_event_pid file, then you have nothing to worry about, as nothing
is allocated. It creates the array when a pid is added to the file, and
only then. If it fails to allocate, the write will return -ENOMEM as the
errno.

Again, if you have a large pid_max your box had better have a lot of
memory to begin with, because this array will be negligible compared to
the memory required to handle a large number of tasks.

> 
> FWIW, I implement a similar feature with a hash table in lttng-modules.
> I don't have the child process tracking though, which is a neat improvement.

I originally had a complex hash algorithm because I too was worried
about the size of pid_max and using a bitmap, but HPA convinced me it
was the way to go.

-- Steve


Re: [RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid

2016-04-19 Thread Mathieu Desnoyers


- On Apr 19, 2016, at 10:34 AM, rostedt rost...@goodmis.org wrote:

> From: Steven Rostedt 
> 
> In order to add the ability to let tasks that are filtered by the events
> have their children also be traced on fork (and then not traced on exit),
> convert the array into a pid bitmask. Most of the time the number of pids is
> only 32768 pids or a 4k bitmask, which is the same size as the default list
> currently is, and that list could grow if more pids are listed.
> 
> This also greatly simplifies the code.

The maximum PID number can be increased with sysctl.

See "pid_max" in Documentation/sysctl/kernel.txt

What happens when you have a very large pid_max set?

You say "most of the time" as if this were a fast path vs a slow path,
but it is not the case here.

This is a configuration option that can significantly hurt memory usage
in configurations using a large pid_max.

FWIW, I implement a similar feature with a hash table in lttng-modules.
I don't have the child process tracking though, which is a neat improvement.

Thanks,

Mathieu

> 
> Suggested-by: "H. Peter Anvin" 
> Signed-off-by: Steven Rostedt 
> ---
> kernel/trace/trace.h|   5 +-
> kernel/trace/trace_events.c | 221 
> 2 files changed, 102 insertions(+), 124 deletions(-)
> 
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index 3fff4adfd431..68cbb8e10aea 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -177,9 +177,8 @@ struct trace_options {
> };
> 
> struct trace_pid_list {
> - unsigned intnr_pids;
> - int order;
> - pid_t   *pids;
> + int pid_max;
> + unsigned long   *pids;
> };
> 
> /*
> diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
> index 598a18675a6b..45f7cc72bf25 100644
> --- a/kernel/trace/trace_events.c
> +++ b/kernel/trace/trace_events.c
> @@ -15,7 +15,7 @@
> #include 
> #include 
> #include 
> -#include 
> +#include 
> #include 
> #include 
> #include 
> @@ -471,23 +471,13 @@ static void ftrace_clear_events(struct trace_array *tr)
>   mutex_unlock(&event_mutex);
> }
> 
> -static int cmp_pid(const void *key, const void *elt)
> -{
> - const pid_t *search_pid = key;
> - const pid_t *pid = elt;
> -
> - if (*search_pid == *pid)
> - return 0;
> - if (*search_pid < *pid)
> - return -1;
> - return 1;
> -}
> +/* Shouldn't this be in a header? */
> +extern int pid_max;
> 
> static bool
> ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
> {
> - pid_t search_pid;
> - pid_t *pid;
> + pid_t pid;
> 
>   /*
>* Return false, because if filtered_pids does not exist,
> @@ -496,15 +486,16 @@ ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
>   if (!filtered_pids)
>   return false;
> 
> - search_pid = task->pid;
> + pid = task->pid;
> 
> - pid = bsearch(&search_pid, filtered_pids->pids,
> -   filtered_pids->nr_pids, sizeof(pid_t),
> -   cmp_pid);
> - if (!pid)
> + /*
> +  * If pid_max changed after filtered_pids was created, we
> +  * by default ignore all pids greater than the previous pid_max.
> +  */
> + if (task->pid >= filtered_pids->pid_max)
>   return true;
> 
> - return false;
> + return !test_bit(task->pid, filtered_pids->pids);
> }
> 
> static void
> @@ -602,7 +593,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr)
>   /* Wait till all users are no longer using pid filtering */
>   synchronize_sched();
> 
> - free_pages((unsigned long)pid_list->pids, pid_list->order);
> + vfree(pid_list->pids);
>   kfree(pid_list);
> }
> 
> @@ -946,11 +937,32 @@ static void t_stop(struct seq_file *m, void *p)
>   mutex_unlock(&event_mutex);
> }
> 
> +static void *
> +p_next(struct seq_file *m, void *v, loff_t *pos)
> +{
> + struct trace_array *tr = m->private;
> + struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
> + unsigned long pid = (unsigned long)v;
> +
> + (*pos)++;
> +
> + /* pid already is +1 of the actual previous bit */
> + pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
> +
> + /* Return pid + 1 to allow zero to be represented */
> + if (pid < pid_list->pid_max)
> + return (void *)(pid + 1);
> +
> + return NULL;
> +}
> +
> static void *p_start(struct seq_file *m, loff_t *pos)
>   __acquires(RCU)
> {
>   struct trace_pid_list *pid_list;
>   struct trace_array *tr = m->private;
> + unsigned long pid;
> + loff_t l = 0;
> 
>   /*
>* Grab the mutex, to keep calls to p_next() having the same
> @@ -963,10 +975,18 @@ static void *p_start(struct seq_file *m, loff_t *pos)


[RFC][PATCH 2/4] tracing: Use pid bitmap instead of a pid array for set_event_pid

2016-04-19 Thread Steven Rostedt
From: Steven Rostedt 

In order to add the ability to let tasks that are filtered by the events
have their children also be traced on fork (and then not traced on exit),
convert the array into a pid bitmask. Most of the time the maximum number of
pids is 32768, which means only a 4k bitmask; that is the same size as the
default list currently is, and the list could grow if more pids are listed.

This also greatly simplifies the code.

Suggested-by: "H. Peter Anvin" 
Signed-off-by: Steven Rostedt 
---
 kernel/trace/trace.h|   5 +-
 kernel/trace/trace_events.c | 221 
 2 files changed, 102 insertions(+), 124 deletions(-)

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3fff4adfd431..68cbb8e10aea 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -177,9 +177,8 @@ struct trace_options {
 };
 
 struct trace_pid_list {
-   unsigned intnr_pids;
-   int order;
-   pid_t   *pids;
+   int pid_max;
+   unsigned long   *pids;
 };
 
 /*
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 598a18675a6b..45f7cc72bf25 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,7 +15,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 #include 
 #include 
 #include 
@@ -471,23 +471,13 @@ static void ftrace_clear_events(struct trace_array *tr)
	mutex_unlock(&event_mutex);
 }
 
-static int cmp_pid(const void *key, const void *elt)
-{
-   const pid_t *search_pid = key;
-   const pid_t *pid = elt;
-
-   if (*search_pid == *pid)
-   return 0;
-   if (*search_pid < *pid)
-   return -1;
-   return 1;
-}
+/* Shouldn't this be in a header? */
+extern int pid_max;
 
 static bool
 ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
 {
-   pid_t search_pid;
-   pid_t *pid;
+   pid_t pid;
 
/*
 * Return false, because if filtered_pids does not exist,
@@ -496,15 +486,16 @@ ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct *task)
if (!filtered_pids)
return false;
 
-   search_pid = task->pid;
+   pid = task->pid;
 
-   pid = bsearch(&search_pid, filtered_pids->pids,
- filtered_pids->nr_pids, sizeof(pid_t),
- cmp_pid);
-   if (!pid)
+   /*
+* If pid_max changed after filtered_pids was created, we
+* by default ignore all pids greater than the previous pid_max.
+*/
+   if (task->pid >= filtered_pids->pid_max)
return true;
 
-   return false;
+   return !test_bit(task->pid, filtered_pids->pids);
 }
 
 static void
@@ -602,7 +593,7 @@ static void __ftrace_clear_event_pids(struct trace_array *tr)
/* Wait till all users are no longer using pid filtering */
synchronize_sched();
 
-   free_pages((unsigned long)pid_list->pids, pid_list->order);
+   vfree(pid_list->pids);
kfree(pid_list);
 }
 
@@ -946,11 +937,32 @@ static void t_stop(struct seq_file *m, void *p)
	mutex_unlock(&event_mutex);
 }
 
+static void *
+p_next(struct seq_file *m, void *v, loff_t *pos)
+{
+   struct trace_array *tr = m->private;
+   struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
+   unsigned long pid = (unsigned long)v;
+
+   (*pos)++;
+
+   /* pid already is +1 of the actual previous bit */
+   pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid);
+
+   /* Return pid + 1 to allow zero to be represented */
+   if (pid < pid_list->pid_max)
+   return (void *)(pid + 1);
+
+   return NULL;
+}
+
 static void *p_start(struct seq_file *m, loff_t *pos)
__acquires(RCU)
 {
struct trace_pid_list *pid_list;
struct trace_array *tr = m->private;
+   unsigned long pid;
+   loff_t l = 0;
 
/*
 * Grab the mutex, to keep calls to p_next() having the same
@@ -963,10 +975,18 @@ static void *p_start(struct seq_file *m, loff_t *pos)
 
pid_list = rcu_dereference_sched(tr->filtered_pids);
 
-   if (!pid_list || *pos >= pid_list->nr_pids)
+   if (!pid_list)
+   return NULL;
+
+   pid = find_first_bit(pid_list->pids, pid_list->pid_max);
+   if (pid >= pid_list->pid_max)
return NULL;
 
-   return (void *)&pid_list->pids[*pos];
+   /* Return pid + 1 so that zero can be the exit value */
+   for (pid++; pid && l < *pos;
+	     pid = (unsigned long)p_next(m, (void *)pid, &l))
+   ;
+   return (void *)pid;
 }
 
 static void p_stop(struct seq_file *m, void *p)
@@ -976,25 +996,11 @@ static void p_stop(struct seq_file *m, void *p)
	mutex_unlock(&event_mutex);
 }
 
-static void *
-p_next(struct 
