[PATCH v8 2/3]: perf record: enable asynchronous trace writing

2018-09-07 Thread Alexey Budankov


Trace file offset is calculated and updated linearly prior to
enqueuing an aio write at record__pushfn().
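
record__pushfn() itself is not visible in the quoted hunk; the idea,
as a rough sketch with approximate names, is that every request
reserves its own byte range in the trace file before being queued,
so completions may land in any order without overwriting each other:

	/* sketch, not verbatim: reserve [off, off + size) for this
	 * request and advance the shared file position for the next */
	off_t off = lseek(trace_fd, 0, SEEK_CUR);
	lseek(trace_fd, off + size, SEEK_SET);
	record__aio_write(&md->cblock, trace_fd, md->data, size, off);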

record__aio_sync() blocks until the started AIO operation completes
and then proceeds.
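
Since there is one aiocb per mmap (md->cblock below), a writer has
to drain the previous request on that mmap before staging the next
chunk; roughly (hypothetical caller, names approximate):

	record__aio_sync(md);            /* wait out the in-flight write */
	memcpy(md->data, buf, size);     /* stage the next chunk */
	record__aio_write(&md->cblock, trace_fd, md->data, size, off);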

record__mmap_read_sync() implements a barrier for all incomplete
aio write requests.
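
record__mmap_read_sync() is also not visible in the quoted hunk; in
essence it loops over the mapped buffers and waits on each in-flight
request, something like (sketch, field names assumed):

	static void record__mmap_read_sync(struct record *rec)
	{
		int i;
		struct perf_evlist *evlist = rec->evlist;
		struct perf_mmap *maps = evlist->mmap;

		for (i = 0; i < evlist->nr_mmaps; i++) {
			struct perf_mmap *map = &maps[i];

			if (map->base)
				record__aio_sync(map);
		}
	}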

Signed-off-by: Alexey Budankov 
---
 Changes in v8:
 - split AIO completion check into a separate record__aio_complete();
 Changes in v6:
 - handled errno == EAGAIN case from aio_write();
 Changes in v5:
 - data loss metrics decreased from 25% to 2x in the trialed configuration;
 - avoided nanosleep() prior to calling aio_suspend();
 - switched to per-cpu multi record__aio_sync() aio;
 - record_mmap_read_sync() now does global barrier just before 
   switching trace file or collection stop;
 - resolved livelock on perf record -e intel_pt// -- dd if=/dev/zero
   of=/dev/null count=10;
 Changes in v4:
 - converted void *bf to struct perf_mmap *md in signatures;
 - written comment in perf_mmap__push() just before perf_mmap__get();
 - written comment in record__mmap_read_sync() on possible restarting
   of aio_write() operation and releasing the perf_mmap object afterwards;
 - added perf_mmap__put() for the cases of failed aio_write();
 Changes in v3:
 - written comments about the nanosleep(0.5ms) call prior to aio_suspend()
   to cope with the intrusiveness of its implementation in glibc;
 - written comments about the rationale behind copying profiling data
   into the mmap->data buffer;
---
 tools/perf/builtin-record.c | 128 +++-
 tools/perf/util/mmap.c  |  54 ++-
 tools/perf/util/mmap.h  |   2 +-
 3 files changed, 169 insertions(+), 15 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 22ebeb92ac51..d4857572cf33 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -121,6 +121,93 @@ static int record__write(struct record *rec, void *bf, size_t size)
 	return 0;
 }
 
+static int record__aio_write(struct aiocb *cblock, int trace_fd,
+			     void *buf, size_t size, off_t off)
+{
+	int rc;
+
+	cblock->aio_fildes = trace_fd;
+	cblock->aio_buf    = buf;
+	cblock->aio_nbytes = size;
+	cblock->aio_offset = off;
+	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
+
+	do {
+		rc = aio_write(cblock);
+		if (rc == 0) {
+			break;
+		} else if (errno != EAGAIN) {
+			cblock->aio_fildes = -1;
+			pr_err("failed to queue perf data, error: %m\n");
+			break;
+		}
+	} while (1);
+
+	return rc;
+}
+
+static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
+{
+	void *rem_buf;
+	off_t rem_off;
+	size_t rem_size;
+	int rc, aio_errno;
+	ssize_t aio_ret, written;
+
+	aio_errno = aio_error(cblock);
+	if (aio_errno == EINPROGRESS)
+		return 0;
+
+	written = aio_ret = aio_return(cblock);
+	if (aio_ret < 0) {
+		if (aio_errno != EINTR)
+			pr_err("failed to write perf data, error: %m\n");
+		written = 0;
+	}
+
+	rem_size = cblock->aio_nbytes - written;
+
+	if (rem_size == 0) {
+		cblock->aio_fildes = -1;
+		/*
+		 * md->refcount is incremented in perf_mmap__push() for
+		 * every enqueued aio write request so decrement it because
+		 * the request is now complete.
+		 */
+		perf_mmap__put(md);
+		rc = 1;
+	} else {
+		/*
+		 * aio write request may require restart with the
+		 * remainder if the kernel didn't write the whole
+		 * chunk at once.
+		 */
+		rem_off = cblock->aio_offset + written;
+		rem_buf = (void *)(cblock->aio_buf + written);
+		record__aio_write(cblock, cblock->aio_fildes,
+				  rem_buf, rem_size, rem_off);
+		rc = 0;
+	}
+
+	return rc;
+}
+
+static void record__aio_sync(struct perf_mmap *md)
+{
+	struct aiocb *cblock = &md->cblock;
+	struct timespec timeout = { 0, 1000 * 1000 * 1 }; // 1ms
+
+	do {
+		if (cblock->aio_fildes == -1 ||
+		    record__aio_complete(md, cblock))
+			return;
+
+		while (aio_suspend((const struct aiocb **)&cblock, 1, &timeout)) {
+			if (!(errno == EAGAIN || errno == EINTR))
+				pr_err("failed to sync perf data, error: %m\n");
+		}
+	} while (1);
+}
+
 static int process_synthesized_event(struct perf_tool *tool,
 union perf_event *event,
 struct perf_sample *sample __maybe_unused,
@@ -130,12 +217,27 @@ static int process_synthesized_event(struct 
