Re: [PATCH 3/9] perf sched replay: Alloc the memory of pid_to_task dynamically to adapt to the unexpected change of pid_max

Yunlong Song Tue, 07 Apr 2015 06:33:34 -0700

On 2015/4/1 15:06, Yunlong Song wrote:
> On 2015/3/31 23:56, Arnaldo Carvalho de Melo wrote:
>> Em Tue, Mar 31, 2015 at 08:32:37AM -0600, David Ahern escreveu:
>>> On 3/31/15 7:46 AM, Yunlong Song wrote:
>>>> -  BUG_ON(pid >= MAX_PID);
>>>> +  if (sched->pid_to_task == NULL) {
>>>> +          if (sysctl__read_int("kernel/pid_max", &pid_max) < 0)
>>>> +                  pid_max = MAX_PID;
>>>> +          BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct 
>>>> task_desc *))) == NULL);
>>>> +  }
>>>> +  BUG_ON(pid >= (unsigned long)pid_max);
>>  
>>> so why the previous patch bumping the MAX_PID count if you move to dynamic
>>> here? And shouldn't MAX_PID get dropped here as well?
>>  
>>> So attached is what i put together last week; just have not had time to send
>>> it out.
>>
>> Yunlong, can you please check/Ack this?
>>
>> - Arnaldo
>>
>>> >From 159dc732e0ad66d9151e93761bc9c685872e9fa4 Mon Sep 17 00:00:00 2001
>>> From: David Ahern <[email protected]>
>>> Date: Tue, 24 Mar 2015 16:57:10 -0400
>>> Subject: [PATCH 3/5] perf sched: Remove max pid assumption from perf-sched
>>>
>>> 'perf sched replay' currently fails on sparc64:
>>>     $ perf sched replay
>>>     run measurement overhead: 2475 nsecs
>>>     sleep measurement overhead: 56165 nsecs
>>>     the run test took 999705 nsecs
>>>     the sleep test took 1059270 nsecs
>>>     perf: builtin-sched.c:384: register_pid: Assertion `!(pid >= 65536)' 
>>> failed.
>>>     Aborted
>>>
>>> The max pid limitation is removed by converting pid_to_task from a
>>> pid based array to an intlist (rblist) with the pid as the index
>>> and task_desc stored in the priv element.
>>>
>>> In the process pid is converted from a long int to int.
>>>
>>> Signed-off-by: David Ahern <[email protected]>
>>> ---
>>>  tools/perf/builtin-sched.c | 30 ++++++++++++++++++++----------
>>>  1 file changed, 20 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
>>> index cc52c993a1fa..858d85396d81 100644
>>> --- a/tools/perf/builtin-sched.c
>>> +++ b/tools/perf/builtin-sched.c
>>> @@ -33,13 +33,12 @@
>>>  #define MAX_CPUS           4096
>>>  #define COMM_LEN           20
>>>  #define SYM_LEN                    129
>>> -#define MAX_PID                    65536
>>>  
>>>  struct sched_atom;
>>>  
>>>  struct task_desc {
>>>     unsigned long           nr;
>>> -   unsigned long           pid;
>>> +   int                     pid;
>>>     char                    comm[COMM_LEN];
>>>  
>>>     unsigned long           nr_events;
>>> @@ -129,7 +128,7 @@ struct perf_sched {
>>>     struct perf_tool tool;
>>>     const char       *sort_order;
>>>     unsigned long    nr_tasks;
>>> -   struct task_desc *pid_to_task[MAX_PID];
>>> +   struct intlist   *pid_to_task;
>>>     struct task_desc **tasks;
>>>     const struct trace_sched_handler *tp_handler;
>>>     pthread_mutex_t  start_work_mutex;
>>> @@ -377,14 +376,18 @@ static void add_sched_event_sleep(struct perf_sched 
>>> *sched, struct task_desc *ta
>>>  }
>>>  
>>>  static struct task_desc *register_pid(struct perf_sched *sched,
>>> -                                 unsigned long pid, const char *comm)
>>> +                                 int pid, const char *comm)
>>>  {
>>> -   struct task_desc *task;
>>>  
>>> -   BUG_ON(pid >= MAX_PID);
>>> +   struct int_node *node = intlist__findnew(sched->pid_to_task, pid);
>>> +   struct task_desc *task;
>>>  
>>> -   task = sched->pid_to_task[pid];
>>> +   if (node == NULL) {
>>> +           pr_err("Failed to allocate entry for task\n");
>>> +           return NULL;
>>> +   }
>>>  
>>> +   task = (struct task_desc *) node->priv;
>>>     if (task)
>>>             return task;
>>>  
>>> @@ -392,20 +395,21 @@ static struct task_desc *register_pid(struct 
>>> perf_sched *sched,
>>>     task->pid = pid;
>>>     task->nr = sched->nr_tasks;
>>>     strcpy(task->comm, comm);
>>> +
>>>     /*
>>>      * every task starts in sleeping state - this gets ignored
>>>      * if there's no wakeup pointing to this sleep state:
>>>      */
>>>     add_sched_event_sleep(sched, task, 0, 0);
>>>  
>>> -   sched->pid_to_task[pid] = task;
>>> +   node->priv = task;
>>>     sched->nr_tasks++;
>>>     sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct 
>>> task_task *));
>>>     BUG_ON(!sched->tasks);
>>>     sched->tasks[task->nr] = task;
>>>  
>>>     if (verbose)
>>> -           printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, 
>>> pid, comm);
>>> +           printf("registered task #%ld, PID %d (%s)\n", sched->nr_tasks, 
>>> pid, comm);
>>>  
>>>     return task;
>>>  }
>>> @@ -418,7 +422,7 @@ static void print_task_traces(struct perf_sched *sched)
>>>  
>>>     for (i = 0; i < sched->nr_tasks; i++) {
>>>             task = sched->tasks[i];
>>> -           printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
>>> +           printf("task %6ld (%20s:%10d), nr_events: %ld\n",
>>>                     task->nr, task->comm, task->pid, task->nr_events);
>>>     }
>>>  }
>>> @@ -2981,6 +2985,12 @@ int cmd_sched(int argc, const char **argv, const 
>>> char *prefix __maybe_unused)
>>>     };
>>>     unsigned int i;
>>>  
>>> +   sched.pid_to_task = intlist__new(NULL);
>>> +   if (sched.pid_to_task == NULL) {
>>> +           pr_err("Failed to allocate intlist for tracking tasks\n");
>>> +           return -ENOMEM;
>>> +   }
>>> +
>>>     for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++)
>>>             sched.curr_pid[i] = -1;
>>>  
>>> -- 
>>> 2.3.0
>>>
>>
>>
>> .
>>
> 
> I have checked David's patch, the difference with my patch is that David's 
> patch
> uses a rblist to sort and search the pid's task with a time complexity of 
> O(log n),
> while my patch uses the array (same as the original design) dynamically 
> created with
> a time complexity of O(1). For every simple, my patch does not need to 
> traverse a
> list and has nothing to do with the total number of tasks. The maximum value 
> of
> pid_max is PID_MAX_LIMIT, which equals to 4*1024*1024 in x86_64. Then for 
> each simple,
> David's patch has to cost O(log 4*1024*1024) in searching a pid's task, vs 
> O(1) of
> my patch. So I suggest to use array instead of rblist to solve this problem 
> and save
> unnecessary waste of time.
> 
> However, if you finally decide to take David's patch rather than my patch, 
> please ignore
> the following 3 patches:
> 
> [PATCH 2/9] perf sched replay: Increase the MAX_PID value to fix assertion 
> failure problem
> [PATCH 3/9] perf sched replay: Alloc the memory of pid_to_task dynamically to 
> adapt to the
> unexpected change of pid_max
> [PATCH 4/9] perf sched replay: Realloc the memory of pid_to_task stepwise to 
> adapt to the
> different pid_max configurations
> 
> The other 6 patches in these patch sets still need to be applied to make 
> other improvements
> and bug fixes.
>


These bugs in 'perf sched replay' reproduce one after another in x86_64 (with 
many cores), and
really need urgent fix.

-- 
Thanks,
Yunlong Song

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/9] perf sched replay: Alloc the memory of pid_to_task dynamically to adapt to the unexpected change of pid_max

Reply via email to