Implement the -f,--mmap-flush option that specifies the minimal size of a
data chunk that is extracted from the mmaped kernel buffer to be stored
into a trace. The default option value is 1 byte, which means that every
time the trace writing thread finds some new data in the mmaped buffer,
the data is extracted, possibly compressed and written to a trace.

  $ tools/perf/perf record -f 1024 -e cycles -- matrix.gcc
  $ tools/perf/perf record --aio -f 1K -e cycles -- matrix.gcc

The option is independent of the -z setting, doesn't vary with the
compression level and can serve two purposes.

The first purpose is to increase the compression ratio of trace data.
Larger data chunks are compressed more effectively, so the implemented
option allows specifying the size of the data chunk to compress. Also, in
some cases executing more write syscalls with smaller data size can take
longer than executing fewer write syscalls with bigger data size due to
syscall overhead, so extracting bigger data chunks, as specified by the
option value, could additionally decrease runtime overhead.

The second purpose is to avoid a self-monitoring live-lock issue in system
wide (-a) profiling mode. Profiling in system wide mode with compression
(-a -z) can additionally induce data into the kernel buffers along with
the data from the monitored processes. If the performance data rate and
volume from the monitored processes are high, then trace streaming and
compression activity in the tool is also high. High tool process activity
can lead to a subtle live-lock effect where compression of a single new
byte from one of the mmaped kernel buffers leads to generation of the next
single byte in some mmaped buffer. So the perf tool process ends up in
endless self monitoring.

The implemented sync parameter is the means to force data to be moved
regardless of the specified flush threshold value. Whatever flush value is
provided, the tool needs the capability to unconditionally drain the
memory buffers, at least at the end of the collection.

Signed-off-by: Alexey Budankov <alexey.budan...@linux.intel.com>
---
 tools/perf/Documentation/perf-record.txt | 13 +++++
 tools/perf/builtin-record.c              | 65 +++++++++++++++++++++---
 tools/perf/perf.h                        |  1 +
 tools/perf/util/evlist.c                 |  6 +--
 tools/perf/util/evlist.h                 |  3 +-
 tools/perf/util/mmap.c                   |  4 +-
 tools/perf/util/mmap.h                   |  3 +-
 7 files changed, 83 insertions(+), 12 deletions(-)

diff --git a/tools/perf/Documentation/perf-record.txt 
b/tools/perf/Documentation/perf-record.txt
index 8f0c2be34848..d1e6c1fd7387 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -459,6 +459,19 @@ Set affinity mask of trace reading thread according to the 
policy defined by 'mo
   node - thread affinity mask is set to NUMA node cpu mask of the processed 
mmap buffer
   cpu  - thread affinity mask is set to cpu of the processed mmap buffer
 
+-f::
+--mmap-flush=n::
+Specify the minimal number of bytes that is extracted from mmap data pages and stored
+into a trace. The number can be specified using B/K/M/G suffixes. The maximal allowed
+value is a quarter of the size of mmaped data pages. The default option value is 1 byte,
+which means that every time the trace writing thread finds some new data in the mmaped
+buffer the data is extracted, possibly compressed (-z) and written to a trace. Larger data
+chunks are compressed more effectively than smaller chunks, so extraction of larger
+chunks from the mmap data pages is preferable from the perspective of trace size reduction.
+Also, in some cases executing fewer trace write syscalls with bigger data size can take
+less time than executing more trace write syscalls with smaller data size, thus lowering
+runtime profiling overhead.
+
 --all-kernel::
 Configure all used events to run in kernel space.
 
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index a468d882e74f..736a0f008959 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -334,6 +334,41 @@ static int record__aio_enabled(struct record *rec)
        return rec->opts.nr_cblocks > 0;
 }
 
+#define MMAP_FLUSH_DEFAULT 1 /* bytes; also used when the value parses to 0 */
+/* Parse -f/--mmap-flush; returns 0 on success, -1 on a negative value. */
+static int record__mmap_flush_parse(const struct option *opt,
+                                   const char *str,
+                                   int unset)
+{
+       int flush_max;
+       struct record_opts *opts = (struct record_opts *)opt->value;
+       long val = opts->mmap_flush; /* long keeps the sign visible for checks */
+       static struct parse_tag tags[] = {
+                       { .tag  = 'B', .mult = 1       },
+                       { .tag  = 'K', .mult = 1 << 10 },
+                       { .tag  = 'M', .mult = 1 << 20 },
+                       { .tag  = 'G', .mult = 1 << 30 },
+                       { .tag  = 0 },
+       };

+       if (unset)
+               return 0;
+       if (str) {
+               val = (long)parse_tag_value(str, tags);
+               if (val == -1) /* tag parse failed: try a plain number */
+                       val = strtol(str, NULL, 0);
+       }
+       /* map->flush is u64: a negative value would block all non-sync reads */
+       if (val < 0)
+               return -1;
+       if (!val)
+               val = MMAP_FLUSH_DEFAULT;
+       /* cap at a quarter of the mmaped data area, as documented for -f */
+       flush_max = perf_evlist__mmap_size(opts->mmap_pages) / 4;
+       opts->mmap_flush = val > flush_max ? flush_max : val;
+       return 0;
+}
+
 static int process_synthesized_event(struct perf_tool *tool,
                                     union perf_event *event,
                                     struct perf_sample *sample __maybe_unused,
@@ -543,7 +578,8 @@ static int record__mmap_evlist(struct record *rec,
        if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
                                 opts->auxtrace_mmap_pages,
                                 opts->auxtrace_snapshot_mode,
-                                opts->nr_cblocks, opts->affinity) < 0) {
+                                opts->nr_cblocks, opts->affinity,
+                                opts->mmap_flush) < 0) {
                if (errno == EPERM) {
                        pr_err("Permission error mapping pages.\n"
                               "Consider increasing "
@@ -733,7 +769,7 @@ static void record__adjust_affinity(struct record *rec, 
struct perf_mmap *map)
 }
 
 static int record__mmap_read_evlist(struct record *rec, struct perf_evlist 
*evlist,
-                                   bool overwrite)
+                                   bool overwrite, bool sync)
 {
        u64 bytes_written = rec->bytes_written;
        int i;
@@ -756,12 +792,19 @@ static int record__mmap_read_evlist(struct record *rec, 
struct perf_evlist *evli
                off = record__aio_get_pos(trace_fd);
 
        for (i = 0; i < evlist->nr_mmaps; i++) {
+               u64 flush = 0;
                struct perf_mmap *map = &maps[i];
 
                if (map->base) {
                        record__adjust_affinity(rec, map);
+                       if (sync) {
+                               flush = map->flush;
+                               map->flush = 1;
+                       }
                        if (!record__aio_enabled(rec)) {
                                if (perf_mmap__push(map, rec, record__pushfn) 
!= 0) {
+                                       if (sync)
+                                               map->flush = flush;
                                        rc = -1;
                                        goto out;
                                }
@@ -774,10 +817,14 @@ static int record__mmap_read_evlist(struct record *rec, 
struct perf_evlist *evli
                                idx = record__aio_sync(map, false);
                                if (perf_mmap__aio_push(map, rec, idx, 
record__aio_pushfn, &off) != 0) {
                                        record__aio_set_pos(trace_fd, off);
+                                       if (sync)
+                                               map->flush = flush;
                                        rc = -1;
                                        goto out;
                                }
                        }
+                       if (sync)
+                               map->flush = flush;
                }
 
                if (map->auxtrace_mmap.base && 
!rec->opts.auxtrace_snapshot_mode &&
@@ -803,15 +850,15 @@ static int record__mmap_read_evlist(struct record *rec, 
struct perf_evlist *evli
        return rc;
 }
 
-static int record__mmap_read_all(struct record *rec)
+static int record__mmap_read_all(struct record *rec, bool sync)
 {
        int err;
 
-       err = record__mmap_read_evlist(rec, rec->evlist, false);
+       err = record__mmap_read_evlist(rec, rec->evlist, false, sync);
        if (err)
                return err;
 
-       return record__mmap_read_evlist(rec, rec->evlist, true);
+       return record__mmap_read_evlist(rec, rec->evlist, true, sync);
 }
 
 static void record__init_features(struct record *rec)
@@ -1312,7 +1359,7 @@ static int __cmd_record(struct record *rec, int argc, 
const char **argv)
                if (trigger_is_hit(&switch_output_trigger) || done || draining)
                        perf_evlist__toggle_bkw_mmap(rec->evlist, 
BKW_MMAP_DATA_PENDING);
 
-               if (record__mmap_read_all(rec) < 0) {
+               if (record__mmap_read_all(rec, false) < 0) {
                        trigger_error(&auxtrace_snapshot_trigger);
                        trigger_error(&switch_output_trigger);
                        err = -1;
@@ -1413,6 +1460,7 @@ static int __cmd_record(struct record *rec, int argc, 
const char **argv)
                record__synthesize_workload(rec, true);
 
 out_child:
+       record__mmap_read_all(rec, true);
        record__aio_mmap_read_sync(rec);
 
        if (forks) {
@@ -1815,6 +1863,7 @@ static struct record record = {
                        .uses_mmap   = true,
                        .default_per_cpu = true,
                },
+               .mmap_flush          = MMAP_FLUSH_DEFAULT,
        },
        .tool = {
                .sample         = process_sample_event,
@@ -1881,6 +1930,9 @@ static struct option __record_options[] = {
        OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
                     "number of mmap data pages and AUX area tracing mmap 
pages",
                     record__parse_mmap_pages),
+       OPT_CALLBACK('f', "mmap-flush", &record.opts, "bytes",
+                    "Minimal number of bytes that is extracted from mmap data 
pages (default: 1)",
+                    record__mmap_flush_parse),
        OPT_BOOLEAN(0, "group", &record.opts.group,
                    "put the counters into a counter group"),
        OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
@@ -2184,6 +2236,7 @@ int cmd_record(int argc, const char **argv)
                pr_info("nr_cblocks: %d\n", rec->opts.nr_cblocks);
 
        pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
+       pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
 
        err = __cmd_record(&record, argc, argv);
 out:
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index b120e547ddc7..7886cc9771cf 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -85,6 +85,7 @@ struct record_opts {
        u64          clockid_res_ns;
        int          nr_cblocks;
        int          affinity;
+       int          mmap_flush;
 };
 
 enum perf_affinity {
diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c
index ed20f4379956..8858d829983b 100644
--- a/tools/perf/util/evlist.c
+++ b/tools/perf/util/evlist.c
@@ -1037,7 +1037,7 @@ int perf_evlist__parse_mmap_pages(const struct option 
*opt, const char *str,
  */
 int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
                         unsigned int auxtrace_pages,
-                        bool auxtrace_overwrite, int nr_cblocks, int affinity)
+                        bool auxtrace_overwrite, int nr_cblocks, int affinity, 
int flush)
 {
        struct perf_evsel *evsel;
        const struct cpu_map *cpus = evlist->cpus;
@@ -1047,7 +1047,7 @@ int perf_evlist__mmap_ex(struct perf_evlist *evlist, 
unsigned int pages,
         * Its value is decided by evsel's write_backward.
         * So &mp should not be passed through const pointer.
         */
-       struct mmap_params mp = { .nr_cblocks = nr_cblocks, .affinity = 
affinity };
+       struct mmap_params mp = { .nr_cblocks = nr_cblocks, .affinity = 
affinity, .flush = flush };
 
        if (!evlist->mmap)
                evlist->mmap = perf_evlist__alloc_mmap(evlist, false);
@@ -1079,7 +1079,7 @@ int perf_evlist__mmap_ex(struct perf_evlist *evlist, 
unsigned int pages,
 
 int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages)
 {
-       return perf_evlist__mmap_ex(evlist, pages, 0, false, 0, 
PERF_AFFINITY_SYS);
+       return perf_evlist__mmap_ex(evlist, pages, 0, false, 0, 
PERF_AFFINITY_SYS, 1);
 }
 
 int perf_evlist__create_maps(struct perf_evlist *evlist, struct target *target)
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index 744906dd4887..edf18811e39f 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -165,7 +165,8 @@ unsigned long perf_event_mlock_kb_in_pages(void);
 
 int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
                         unsigned int auxtrace_pages,
-                        bool auxtrace_overwrite, int nr_cblocks, int affinity);
+                        bool auxtrace_overwrite, int nr_cblocks,
+                        int affinity, int flush);
 int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages);
 void perf_evlist__munmap(struct perf_evlist *evlist);
 
diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index cdc7740fc181..ef3d79b2c90b 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -440,6 +440,8 @@ int perf_mmap__mmap(struct perf_mmap *map, struct 
mmap_params *mp, int fd, int c
 
        perf_mmap__setup_affinity_mask(map, mp);
 
+       map->flush = mp->flush;
+
        if (auxtrace_mmap__mmap(&map->auxtrace_mmap,
                                &mp->auxtrace_mp, map->base, fd))
                return -1;
@@ -492,7 +494,7 @@ static int __perf_mmap__read_init(struct perf_mmap *md)
        md->start = md->overwrite ? head : old;
        md->end = md->overwrite ? old : head;
 
-       if (md->start == md->end)
+       if ((md->end - md->start) < md->flush)
                return -EAGAIN;
 
        size = md->end - md->start;
diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h
index e566c19b242b..b82f8c2d55c4 100644
--- a/tools/perf/util/mmap.h
+++ b/tools/perf/util/mmap.h
@@ -39,6 +39,7 @@ struct perf_mmap {
        } aio;
 #endif
        cpu_set_t       affinity_mask;
+       u64             flush;
 };
 
 /*
@@ -70,7 +71,7 @@ enum bkw_mmap_state {
 };
 
 struct mmap_params {
-       int                         prot, mask, nr_cblocks, affinity;
+       int                         prot, mask, nr_cblocks, affinity, flush;
        struct auxtrace_mmap_params auxtrace_mp;
 };
 
-- 
2.20.1

Reply via email to