[PATCH v8 2/3]: perf record: enable asynchronous trace writing
Trace file offset is calculated and updated linearly prior to enqueuing aio write at record__pushfn(). record__aio_sync() blocks till completion of started AIO operation and then proceeds. record__mmap_read_sync() implements a barrier for all incomplete aio write requests. Signed-off-by: Alexey Budankov --- Changes in v8: - split AIO completion check into separate record__aio_complete() Changes in v6: - handled errno == EAGAIN case from aio_write(); Changes in v5: - data loss metrics decreased from 25% to 2x in trialed configuration; - avoided nanosleep() prior to calling aio_suspend(); - switched to per cpu multi record__aio_sync() aio - record_mmap_read_sync() now does global barrier just before switching trace file or collection stop; - resolved livelock on perf record -e intel_pt// -- dd if=/dev/zero of=/dev/null count=10 Changes in v4: - converted void *bf to struct perf_mmap *md in signatures - written comment in perf_mmap__push() just before perf_mmap__get(); - written comment in record__mmap_read_sync() on possible restarting of aio_write() operation and releasing perf_mmap object after all; - added perf_mmap__put() for the cases of failed aio_write(); Changes in v3: - written comments about nanosleep(0.5ms) call prior to aio_suspend() to cope with intrusiveness of its implementation in glibc; - written comments about rationale behind copying profiling data into mmap->data buffer; --- tools/perf/builtin-record.c | 128 +++- tools/perf/util/mmap.c | 54 ++- tools/perf/util/mmap.h | 2 +- 3 files changed, 169 insertions(+), 15 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 22ebeb92ac51..d4857572cf33 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -121,6 +121,93 @@ static int record__write(struct record *rec, void *bf, size_t size) return 0; } +static int record__aio_write(struct aiocb *cblock, int trace_fd, + void *buf, size_t size, off_t off) +{ + int rc; + + cblock->aio_fildes = trace_fd; + 
cblock->aio_buf = buf; + cblock->aio_nbytes = size; + cblock->aio_offset = off; + cblock->aio_sigevent.sigev_notify = SIGEV_NONE; + + do { + rc = aio_write(cblock); + if (rc == 0) { + break; + } else if (errno != EAGAIN) { + cblock->aio_fildes = -1; + pr_err("failed to queue perf data, error: %m\n"); + break; + } + } while (1); + + return rc; +} + +static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock) +{ + void *rem_buf; + off_t rem_off; + size_t rem_size; + int rc, aio_errno; + ssize_t aio_ret, written; + + aio_errno = aio_error(cblock); + if (aio_errno == EINPROGRESS) + return 0; + + written = aio_ret = aio_return(cblock); + if (aio_ret < 0) { + if (!(aio_errno == EINTR)) + pr_err("failed to write perf data, error: %m\n"); + written = 0; + } + + rem_size = cblock->aio_nbytes - written; + + if (rem_size == 0) { + cblock->aio_fildes = -1; + /* +* md->refcount is incremented in perf_mmap__push() for +* every enqueued aio write request so decrement it because +* the request is now complete. +*/ + perf_mmap__put(md); + rc = 1; + } else { + /* +* aio write request may require restart with the +* remainder if the kernel didn't write whole +* chunk at once. 
+*/ + rem_off = cblock->aio_offset + written; + rem_buf = (void *)(cblock->aio_buf + written); + record__aio_write(cblock, cblock->aio_fildes, + rem_buf, rem_size, rem_off); + rc = 0; + } + + return rc; +} + +static void record__aio_sync(struct perf_mmap *md) +{ + struct aiocb *cblock = &md->cblock; + struct timespec timeout = { 0, 1000 * 1000 * 1 }; // 1ms + + do { + if (cblock->aio_fildes == -1 || record__aio_complete(md, cblock)) + return; + + while (aio_suspend((const struct aiocb **)&cblock, 1, &timeout)) { + if (!(errno == EAGAIN || errno == EINTR)) + pr_err("failed to sync perf data, error: %m\n"); + } + } while (1); +} + static int process_synthesized_event(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, @@ -130,12 +217,27 @@ static int process_synthesized_event(struct
[PATCH v8 2/3]: perf record: enable asynchronous trace writing
Trace file offset is calculated and updated linearly prior to enqueuing aio write at record__pushfn(). record__aio_sync() blocks till completion of started AIO operation and then proceeds. record__mmap_read_sync() implements a barrier for all incomplete aio write requests. Signed-off-by: Alexey Budankov --- Changes in v8: - split AIO completion check into separate record__aio_complete() Changes in v6: - handled errno == EAGAIN case from aio_write(); Changes in v5: - data loss metrics decreased from 25% to 2x in trialed configuration; - avoided nanosleep() prior to calling aio_suspend(); - switched to per cpu multi record__aio_sync() aio - record_mmap_read_sync() now does global barrier just before switching trace file or collection stop; - resolved livelock on perf record -e intel_pt// -- dd if=/dev/zero of=/dev/null count=10 Changes in v4: - converted void *bf to struct perf_mmap *md in signatures - written comment in perf_mmap__push() just before perf_mmap__get(); - written comment in record__mmap_read_sync() on possible restarting of aio_write() operation and releasing perf_mmap object after all; - added perf_mmap__put() for the cases of failed aio_write(); Changes in v3: - written comments about nanosleep(0.5ms) call prior to aio_suspend() to cope with intrusiveness of its implementation in glibc; - written comments about rationale behind copying profiling data into mmap->data buffer; --- tools/perf/builtin-record.c | 128 +++- tools/perf/util/mmap.c | 54 ++- tools/perf/util/mmap.h | 2 +- 3 files changed, 169 insertions(+), 15 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 22ebeb92ac51..d4857572cf33 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -121,6 +121,93 @@ static int record__write(struct record *rec, void *bf, size_t size) return 0; } +static int record__aio_write(struct aiocb *cblock, int trace_fd, + void *buf, size_t size, off_t off) +{ + int rc; + + cblock->aio_fildes = trace_fd; + 
cblock->aio_buf = buf; + cblock->aio_nbytes = size; + cblock->aio_offset = off; + cblock->aio_sigevent.sigev_notify = SIGEV_NONE; + + do { + rc = aio_write(cblock); + if (rc == 0) { + break; + } else if (errno != EAGAIN) { + cblock->aio_fildes = -1; + pr_err("failed to queue perf data, error: %m\n"); + break; + } + } while (1); + + return rc; +} + +static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock) +{ + void *rem_buf; + off_t rem_off; + size_t rem_size; + int rc, aio_errno; + ssize_t aio_ret, written; + + aio_errno = aio_error(cblock); + if (aio_errno == EINPROGRESS) + return 0; + + written = aio_ret = aio_return(cblock); + if (aio_ret < 0) { + if (!(aio_errno == EINTR)) + pr_err("failed to write perf data, error: %m\n"); + written = 0; + } + + rem_size = cblock->aio_nbytes - written; + + if (rem_size == 0) { + cblock->aio_fildes = -1; + /* +* md->refcount is incremented in perf_mmap__push() for +* every enqueued aio write request so decrement it because +* the request is now complete. +*/ + perf_mmap__put(md); + rc = 1; + } else { + /* +* aio write request may require restart with the +* remainder if the kernel didn't write whole +* chunk at once. 
+*/ + rem_off = cblock->aio_offset + written; + rem_buf = (void *)(cblock->aio_buf + written); + record__aio_write(cblock, cblock->aio_fildes, + rem_buf, rem_size, rem_off); + rc = 0; + } + + return rc; +} + +static void record__aio_sync(struct perf_mmap *md) +{ + struct aiocb *cblock = &md->cblock; + struct timespec timeout = { 0, 1000 * 1000 * 1 }; // 1ms + + do { + if (cblock->aio_fildes == -1 || record__aio_complete(md, cblock)) + return; + + while (aio_suspend((const struct aiocb **)&cblock, 1, &timeout)) { + if (!(errno == EAGAIN || errno == EINTR)) + pr_err("failed to sync perf data, error: %m\n"); + } + } while (1); +} + static int process_synthesized_event(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, @@ -130,12 +217,27 @@ static int process_synthesized_event(struct