> Em Fri, Oct 13, 2017 at 07:09:26AM -0700, kan.li...@intel.com escreveu:
> > From: Kan Liang <kan.li...@intel.com>
> >
> > The process function process_synthesized_event writes the process
> > result to perf.data, which is not multithreading friendly.
> >
> > Realloc a buffer for each thread to temporarily keep the processing
> > results. Write them to perf.data at the end of event synthesis.
> > The new method doesn't impact the final result, because
> >  - The order of the synthesized events is not important.
> >  - The number of synthesized events is limited. Usually, it only needs
> >    hundreds of kilobytes to store all the synthesized events.
> >    It's unlikely to fail because of lack of memory.
> 
> Why not write to a per cpu file and then at the end merge them? 

I just thought merging from memory should be faster than merging from files.

> Leave
> the memory management to the kernel, i.e. in most cases you may not even
> end up touching the disk when memory is plentiful; just rewind the per
> event files and go on dumping to the main perf.data file.
>

Agreed. I will do it.

> At some point we may just not do this merging, and keep per cpu files
> all the way to perf report, etc. This would be a first foray into
> that...
>

Yes, but it's a little bit complex.
I think I will do it in a separate improvement patch series later.
For now, I will do the file merge.

Thanks,
Kan
 
> - Arnaldo
> 
> > The number of threads is hard-coded to the number of online CPUs. The
> > following patch will introduce an option to set it.
> >
> > Multithreaded synthesis is only available for per cpu monitoring.
> >
> > Signed-off-by: Kan Liang <kan.li...@intel.com>
> > ---
> >  tools/perf/builtin-record.c | 86
> ++++++++++++++++++++++++++++++++++++++++++---
> >  1 file changed, 81 insertions(+), 5 deletions(-)
> >
> > diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
> > index 4ede9bf..1978021 100644
> > --- a/tools/perf/builtin-record.c
> > +++ b/tools/perf/builtin-record.c
> > @@ -62,6 +62,11 @@ struct switch_output {
> >     bool             set;
> >  };
> >
> > +struct synthesize_buf {
> > +   void            *entry;
> > +   size_t          size;
> > +};
> > +
> >  struct record {
> >     struct perf_tool        tool;
> >     struct record_opts      opts;
> > @@ -80,6 +85,7 @@ struct record {
> >     bool                    timestamp_filename;
> >     struct switch_output    switch_output;
> >     unsigned long long      samples;
> > +   struct synthesize_buf   *buf;
> >  };
> >
> >  static volatile int auxtrace_record__snapshot_started;
> > @@ -120,14 +126,35 @@ static int record__write(struct record *rec, void
> *bf, size_t size)
> >     return 0;
> >  }
> >
> > +static int buf__write(struct record *rec, int idx, void *bf, size_t size)
> > +{
> > +   size_t target_size = rec->buf[idx].size;
> > +   void *target;
> > +
> > +   target = realloc(rec->buf[idx].entry, target_size + size);
> > +   if (target == NULL)
> > +           return -ENOMEM;
> > +
> > +   memcpy(target + target_size, bf, size);
> > +
> > +   rec->buf[idx].entry = target;
> > +   rec->buf[idx].size += size;
> > +
> > +   return 0;
> > +}
> > +
> >  static int process_synthesized_event(struct perf_tool *tool,
> >                                  union perf_event *event,
> >                                  struct perf_sample *sample
> __maybe_unused,
> >                                  struct machine *machine
> __maybe_unused,
> > -                                struct thread_info *thread
> __maybe_unused)
> > +                                struct thread_info *thread)
> >  {
> >     struct record *rec = container_of(tool, struct record, tool);
> > -   return record__write(rec, event, event->header.size);
> > +
> > +   if (!perf_singlethreaded && thread)
> > +           return buf__write(rec, thread->idx, event, event-
> >header.size);
> > +   else
> > +           return record__write(rec, event, event->header.size);
> >  }
> >
> >  static int record__pushfn(void *to, void *bf, size_t size)
> > @@ -690,6 +717,48 @@ static const struct perf_event_mmap_page
> *record__pick_pc(struct record *rec)
> >     return NULL;
> >  }
> >
> > +static int record__multithread_synthesize(struct record *rec,
> > +                                     struct machine *machine,
> > +                                     struct perf_tool *tool,
> > +                                     struct record_opts *opts)
> > +{
> > +   int i, err, nr_thread = sysconf(_SC_NPROCESSORS_ONLN);
> > +
> > +   if (nr_thread > 1) {
> > +           perf_set_multithreaded();
> > +
> > +           rec->buf = calloc(nr_thread, sizeof(struct synthesize_buf));
> > +           if (rec->buf == NULL) {
> > +                   pr_err("Could not do multithread synthesize\n");
> > +                   nr_thread = 1;
> > +                   perf_set_singlethreaded();
> > +           }
> > +   }
> > +
> > +   err = __machine__synthesize_threads(machine, tool, &opts->target,
> > +                                       rec->evlist->threads,
> > +                                       process_synthesized_event,
> > +                                       opts->sample_address,
> > +                                       opts->proc_map_timeout,
> nr_thread);
> > +   if (err < 0)
> > +           goto free;
> > +
> > +   if (nr_thread > 1) {
> > +           for (i = 0; i < nr_thread; i++)
> > +                   record__write(rec, rec->buf[i].entry, rec->buf[i].size);
> > +   }
> > +
> > +free:
> > +   if (nr_thread > 1) {
> > +           for (i = 0; i < nr_thread; i++)
> > +                   free(rec->buf[i].entry);
> > +           free(rec->buf);
> > +           perf_set_singlethreaded();
> > +   }
> > +
> > +   return err;
> > +}
> > +
> >  static int record__synthesize(struct record *rec, bool tail)
> >  {
> >     struct perf_session *session = rec->session;
> > @@ -766,9 +835,16 @@ static int record__synthesize(struct record *rec,
> bool tail)
> >                                      perf_event__synthesize_guest_os,
> tool);
> >     }
> >
> > -   err = __machine__synthesize_threads(machine, tool, &opts->target,
> rec->evlist->threads,
> > -                                       process_synthesized_event, opts-
> >sample_address,
> > -                                       opts->proc_map_timeout, 1);
> > +   /* multithreading synthesize is only available for cpu monitoring */
> > +   if (target__has_cpu(&opts->target))
> > +           err = record__multithread_synthesize(rec, machine, tool,
> opts);
> > +   else
> > +           err = __machine__synthesize_threads(machine, tool,
> > +                                               &opts->target,
> > +                                               rec->evlist->threads,
> > +                                               process_synthesized_event,
> > +                                               opts->sample_address,
> > +                                               opts->proc_map_timeout,
> 1);
> >  out:
> >     return err;
> >  }
> > --
> > 2.7.4

Reply via email to