Trace file offset is calculated and updated linearly prior to enqueuing an aio write at record__pushfn().
record__aio_sync() blocks till completion of the started AIO operation and then proceeds. record__mmap_read_sync() implements a barrier for all incomplete aio write requests.

Signed-off-by: Alexey Budankov <[email protected]>
---
Changes in v8:
- split AIO completion check into separate record__aio_complete()

Changes in v6:
- handled errno == EAGAIN case from aio_write();

Changes in v5:
- data loss metrics decreased from 25% to 2x in trialed configuration;
- avoided nanosleep() prior to calling aio_suspend();
- switched to per-cpu multi record__aio_sync() aio
- record__mmap_read_sync() now does a global barrier just before switching the trace file or stopping collection;
- resolved livelock on perf record -e intel_pt// -- dd if=/dev/zero of=/dev/null count=100000

Changes in v4:
- converted void *bf to struct perf_mmap *md in signatures
- written comment in perf_mmap__push() just before perf_mmap__get();
- written comment in record__mmap_read_sync() on possible restarting of the aio_write() operation and releasing the perf_mmap object after all;
- added perf_mmap__put() for the cases of failed aio_write();

Changes in v3:
- written comments about the nanosleep(0.5ms) call prior to aio_suspend() to cope with intrusiveness of its implementation in glibc;
- written comments about the rationale behind copying profiling data into the mmap->data buffer;
---
 tools/perf/builtin-record.c | 128 +++++++++++++++++++++++++++++++++++++++++++
 tools/perf/util/mmap.c      |  54 ++++++++++++++-----
 tools/perf/util/mmap.h      |   2 +-
 3 files changed, 169 insertions(+), 15 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 22ebeb92ac51..d4857572cf33 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -121,6 +121,93 @@ static int record__write(struct record *rec, void *bf, size_t size)
 	return 0;
 }
 
+static int record__aio_write(struct aiocb *cblock, int trace_fd,
+		void *buf, size_t size, off_t off)
+{
+	int rc;
+
+	cblock->aio_fildes = trace_fd;
+	cblock->aio_buf    = buf;
+	cblock->aio_nbytes = size;
+	cblock->aio_offset = off;
+	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
+
+	do {
+		rc = aio_write(cblock);
+		if (rc == 0) {
+			break;
+		} else if (errno != EAGAIN) {
+			cblock->aio_fildes = -1;
+			pr_err("failed to queue perf data, error: %m\n");
+			break;
+		}
+	} while (1);
+
+	return rc;
+}
+
+static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
+{
+	void *rem_buf;
+	off_t rem_off;
+	size_t rem_size;
+	int rc, aio_errno;
+	ssize_t aio_ret, written;
+
+	aio_errno = aio_error(cblock);
+	if (aio_errno == EINPROGRESS)
+		return 0;
+
+	written = aio_ret = aio_return(cblock);
+	if (aio_ret < 0) {
+		if (!(aio_errno == EINTR))
+			pr_err("failed to write perf data, error: %m\n");
+		written = 0;
+	}
+
+	rem_size = cblock->aio_nbytes - written;
+
+	if (rem_size == 0) {
+		cblock->aio_fildes = -1;
+		/*
+		 * md->refcount is incremented in perf_mmap__push() for
+		 * every enqueued aio write request so decrement it because
+		 * the request is now complete.
+		 */
+		perf_mmap__put(md);
+		rc = 1;
+	} else {
+		/*
+		 * aio write request may require restart with the
+		 * remainder if the kernel didn't write the whole
+		 * chunk at once.
+		 */
+		rem_off = cblock->aio_offset + written;
+		rem_buf = (void *)(cblock->aio_buf + written);
+		record__aio_write(cblock, cblock->aio_fildes,
+				rem_buf, rem_size, rem_off);
+		rc = 0;
+	}
+
+	return rc;
+}
+
+static void record__aio_sync(struct perf_mmap *md)
+{
+	struct aiocb *cblock = &md->cblock;
+	struct timespec timeout = { 0, 1000 * 1000 * 1 }; // 1ms
+
+	do {
+		if (cblock->aio_fildes == -1 || record__aio_complete(md, cblock))
+			return;
+
+		while (aio_suspend((const struct aiocb **)&cblock, 1, &timeout)) {
+			if (!(errno == EAGAIN || errno == EINTR))
+				pr_err("failed to sync perf data, error: %m\n");
+		}
+	} while (1);
+}
+
 static int process_synthesized_event(struct perf_tool *tool,
 				     union perf_event *event,
 				     struct perf_sample *sample __maybe_unused,
@@ -130,12 +217,27 @@ static int process_synthesized_event(struct perf_tool *tool,
 	return record__write(rec, event, event->header.size);
 }
 
-static int record__pushfn(void *to, void *bf, size_t size)
+static int record__pushfn(void *to, struct aiocb *cblock, void *data, size_t size)
 {
+	off_t off;
 	struct record *rec = to;
+	int ret, trace_fd = rec->session->data->file.fd;
 
 	rec->samples++;
-	return record__write(rec, bf, size);
+
+	off = lseek(trace_fd, 0, SEEK_CUR);
+	lseek(trace_fd, off + size, SEEK_SET);
+	ret = record__aio_write(cblock, trace_fd, data, size, off);
+	if (!ret) {
+		rec->bytes_written += size;
+
+		if (switch_output_size(rec))
+			trigger_hit(&switch_output_trigger);
+	} else {
+		lseek(trace_fd, off, SEEK_SET);
+	}
+
+	return ret;
 }
 
 static volatile int done;
@@ -510,6 +612,19 @@ static struct perf_event_header finished_round_event = {
 	.type = PERF_RECORD_FINISHED_ROUND,
 };
 
+static void record__mmap_read_sync(struct record *rec)
+{
+	int i;
+	struct perf_evlist *evlist = rec->evlist;
+	struct perf_mmap *maps = evlist->mmap;
+
+	for (i = 0; i < evlist->nr_mmaps; i++) {
+		struct perf_mmap *map = &maps[i];
+		if (map->base)
+			record__aio_sync(map);
+	}
+}
+
 static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evlist,
 				    bool overwrite)
 {
@@ -532,6 +647,11 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evlist,
 		struct auxtrace_mmap *mm = &maps[i].auxtrace_mmap;
 		if (maps[i].base) {
+			/*
+			 * Call record__aio_sync() to wait till the map->data buffer
+			 * becomes available after the previous aio write request.
+			 */
+			record__aio_sync(&maps[i]);
 			if (perf_mmap__push(&maps[i], rec, record__pushfn) != 0) {
 				rc = -1;
 				goto out;
 			}
@@ -1054,6 +1174,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 		perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
 
 		if (record__mmap_read_all(rec) < 0) {
+			record__mmap_read_sync(rec);
 			trigger_error(&auxtrace_snapshot_trigger);
 			trigger_error(&switch_output_trigger);
 			err = -1;
@@ -1065,6 +1186,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
 				record__read_auxtrace_snapshot(rec);
 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
+				record__mmap_read_sync(rec);
 				pr_err("AUX area tracing snapshot failed\n");
 				err = -1;
 				goto out_child;
@@ -1083,6 +1205,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 			 */
 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
 				continue;
+			record__mmap_read_sync(rec);
 			trigger_ready(&switch_output_trigger);
 
 			/*
@@ -1136,6 +1259,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
 			disabled = true;
 		}
 	}
+	record__mmap_read_sync(rec);
 
 	trigger_off(&auxtrace_snapshot_trigger);
 	trigger_off(&switch_output_trigger);
diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index e53038d76445..71c5628df3db 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -305,11 +305,11 @@ int perf_mmap__read_init(struct perf_mmap *map)
 }
 
 int perf_mmap__push(struct perf_mmap *md, void *to,
-		    int push(void *to, void *buf, size_t size))
+		    int push(void *to, struct aiocb *cblock, void *data, size_t size))
 {
 	u64 head = perf_mmap__read_head(md);
 	unsigned char *data = md->base + page_size;
-	unsigned long size;
+	unsigned long size, size0 = 0;
 	void *buf;
 	int rc = 0;
 
@@ -317,31 +317,61 @@ int perf_mmap__push(struct perf_mmap *md, void *to,
 	if (rc < 0)
 		return (rc == -EAGAIN) ? 0 : -1;
 
+	/*
+	 * md->base data is copied into the md->data buffer to
+	 * release space in the kernel buffer as fast as possible,
+	 * thru perf_mmap__consume() below.
+	 *
+	 * That lets the kernel proceed with storing more
+	 * profiling data into the kernel buffer earlier than other
+	 * per-cpu kernel buffers are handled.
+	 *
+	 * Copying can be done in two steps in case the chunk of
+	 * profiling data crosses the upper bound of the kernel buffer.
+	 * In this case we first move part of the data from md->start
+	 * till the upper bound and then the remainder from the
+	 * beginning of the kernel buffer till the end of
+	 * the data chunk.
+	 */
+
 	size = md->end - md->start;
 
 	if ((md->start & md->mask) + size != (md->end & md->mask)) {
 		buf = &data[md->start & md->mask];
 		size = md->mask + 1 - (md->start & md->mask);
 		md->start += size;
-
-		if (push(to, buf, size) < 0) {
-			rc = -1;
-			goto out;
-		}
+		memcpy(md->data, buf, size);
+		size0 = size;
 	}
 
 	buf = &data[md->start & md->mask];
 	size = md->end - md->start;
 	md->start += size;
+	memcpy(md->data + size0, buf, size);
 
-	if (push(to, buf, size) < 0) {
-		rc = -1;
-		goto out;
-	}
+	/*
+	 * Increment md->refcount to guard the md->data buffer
+	 * from premature deallocation because the md object can be
+	 * released earlier than the aio write request started
+	 * on md->data completes.
+	 *
+	 * perf_mmap__put() is done at record__aio_complete()
+	 * after the started request completes.
+	 */
+	perf_mmap__get(md);
 
 	md->prev = head;
 	perf_mmap__consume(md);
-out:
+
+	rc = push(to, &md->cblock, md->data, size0 + size);
+	if (rc) {
+		/*
+		 * Decrement md->refcount back if the aio write
+		 * operation failed to start.
+		 */
+		perf_mmap__put(md);
+	}
+
 	return rc;
 }
diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h
index 1974e621e36b..a9795a5fe200 100644
--- a/tools/perf/util/mmap.h
+++ b/tools/perf/util/mmap.h
@@ -95,7 +95,7 @@ union perf_event *perf_mmap__read_forward(struct perf_mmap *map);
 union perf_event *perf_mmap__read_event(struct perf_mmap *map);
 
 int perf_mmap__push(struct perf_mmap *md, void *to,
-		    int push(void *to, void *buf, size_t size));
+		    int push(void *to, struct aiocb *cblock, void *data, size_t size));
 
 size_t perf_mmap__mmap_len(struct perf_mmap *map);
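Not part of the patch, but for reviewers less familiar with POSIX AIO, here is a minimal standalone sketch of the enqueue pattern record__pushfn() follows: take the current file position as the chunk offset, advance the position by the chunk size right away so offsets stay linear, and only then queue the asynchronous write. The file name out.data and the helper reserve_and_queue() are illustrative, not perf code; link with -lrt on older glibc.

/* Sketch only: reserve file offsets linearly, then queue AIO writes.
 * Each chunk's offset is taken from the current file position and the
 * position is advanced immediately, so the next chunk gets its own slot
 * even before this write completes. */
#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int reserve_and_queue(struct aiocb *cb, int fd, const void *buf, size_t size)
{
	/* Reserve [off, off + size) by moving the file position forward now;
	 * the AIO write itself targets the reserved offset. */
	off_t off = lseek(fd, 0, SEEK_CUR);

	if (off == (off_t)-1 || lseek(fd, off + size, SEEK_SET) == (off_t)-1)
		return -1;

	memset(cb, 0, sizeof(*cb));
	cb->aio_fildes = fd;
	cb->aio_buf = (void *)buf;
	cb->aio_nbytes = size;
	cb->aio_offset = off;
	cb->aio_sigevent.sigev_notify = SIGEV_NONE;

	if (aio_write(cb) < 0) {
		/* Roll the reservation back if the write could not be queued,
		 * as record__pushfn() does on failure. */
		lseek(fd, off, SEEK_SET);
		return -1;
	}
	return 0;
}

int main(void)
{
	static const char a[] = "chunk-a ", b[] = "chunk-b\n";
	struct aiocb cb_a, cb_b;
	const struct aiocb *list[2] = { &cb_a, &cb_b };
	int fd = open("out.data", O_CREAT | O_TRUNC | O_WRONLY, 0644);

	if (fd < 0)
		return 1;

	/* Two back-to-back requests; each gets a distinct, linear offset. */
	if (reserve_and_queue(&cb_a, fd, a, sizeof(a) - 1) ||
	    reserve_and_queue(&cb_b, fd, b, sizeof(b) - 1))
		return 1;

	/* Crude stand-in for the record__aio_sync()/record__mmap_read_sync()
	 * barrier: wait until both writes have completed. */
	while (aio_error(&cb_a) == EINPROGRESS || aio_error(&cb_b) == EINPROGRESS)
		aio_suspend(list, 2, NULL);

	printf("wrote %zd + %zd bytes\n", aio_return(&cb_a), aio_return(&cb_b));
	close(fd);
	return 0;
}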

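The completion side can be sketched the same way, again outside the patch: poll aio_error(), pick up the byte count with aio_return(), and re-queue the remainder when the kernel wrote less than requested, which is what record__aio_complete() plus the record__aio_sync() loop do above. The helper name complete_or_restart() and the file name out.data are illustrative.

/* Sketch only: completion check with remainder restart for one AIO write. */
#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* Returns 1 when the whole request has been written, 0 if still pending,
 * -1 if a restart could not be queued. */
static int complete_or_restart(struct aiocb *cb)
{
	int err = aio_error(cb);
	ssize_t written;

	if (err == EINPROGRESS)
		return 0;

	written = aio_return(cb);
	if (written < 0)
		written = 0;	/* treat a failed attempt as zero bytes written */

	if ((size_t)written == cb->aio_nbytes)
		return 1;	/* request fully written */

	/* Short write: restart with the remainder at the adjusted offset. */
	cb->aio_buf = (volatile char *)cb->aio_buf + written;
	cb->aio_nbytes -= written;
	cb->aio_offset += written;
	if (aio_write(cb) < 0 && errno != EAGAIN)
		return -1;
	return 0;
}

int main(void)
{
	static const char msg[] = "hello, aio\n";
	struct aiocb cb;
	const struct aiocb *list[1] = { &cb };
	int fd = open("out.data", O_CREAT | O_TRUNC | O_WRONLY, 0644);
	int rc;

	if (fd < 0)
		return 1;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = (void *)msg;
	cb.aio_nbytes = sizeof(msg) - 1;
	cb.aio_offset = 0;
	cb.aio_sigevent.sigev_notify = SIGEV_NONE;

	if (aio_write(&cb) < 0)
		return 1;

	/* Barrier: keep suspending until the request (and any restarts) is done. */
	while ((rc = complete_or_restart(&cb)) == 0)
		aio_suspend(list, 1, NULL);

	close(fd);
	return rc == 1 ? 0 : 1;
}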

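Finally, the two-step copy that perf_mmap__push() performs when a data chunk wraps past the upper bound of the kernel ring buffer reduces to the following standalone sketch; ring_copy_out() and the test values are illustrative, not perf code.

/* Sketch only: copy a [start, end) chunk out of a power-of-two ring buffer
 * into a flat staging buffer (the md->data analogue), in up to two steps. */
#include <assert.h>
#include <stdint.h>
#include <string.h>

/* ring: buffer of size mask + 1 (power of two); start/end: free-running
 * positions with end - start <= mask + 1. Returns bytes copied into out. */
static size_t ring_copy_out(const unsigned char *ring, uint64_t mask,
			    uint64_t start, uint64_t end, unsigned char *out)
{
	size_t size = end - start;
	size_t size0 = 0;

	/* First step: if the chunk wraps, copy from start up to the top. */
	if ((start & mask) + size != (end & mask)) {
		size0 = mask + 1 - (start & mask);
		memcpy(out, &ring[start & mask], size0);
		start += size0;
	}

	/* Second step: the remainder, or the whole chunk if there was no wrap. */
	memcpy(out + size0, &ring[start & mask], end - start);
	return size0 + (end - start);
}

int main(void)
{
	unsigned char ring[8], out[8];
	uint64_t i;

	for (i = 0; i < sizeof(ring); i++)
		ring[i] = (unsigned char)i;

	/* Chunk [6, 11) wraps: it covers ring slots 6, 7, 0, 1, 2. */
	size_t n = ring_copy_out(ring, sizeof(ring) - 1, 6, 11, out);

	assert(n == 5);
	assert(out[0] == 6 && out[1] == 7 && out[2] == 0 &&
	       out[3] == 1 && out[4] == 2);
	return 0;
}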