[PATCH v2] libstdc++: basic_filebuf: don't flush more often than necessary.
`basic_filebuf::xsputn` would bypass the buffer when passed a chunk of size 1024 and above, seemingly as an optimisation. This can have a significant performance impact if the overhead of a `write` syscall is non-negligible, e.g. on a slow disk, on network filesystems, or simply during IO contention, because instead of flushing every `BUFSIZ` (by default), we can flush every 1024 chars. The impact is even greater with custom larger buffers, e.g. for network filesystems, because the code could issue `write` for example 1000X more often than necessary with respect to the buffer size. It also introduces a significant discontinuity in performance when writing chunks of size 1024 and above. See this reproducer, which writes a fixed number of chunks to a file opened with `O_SYNC` - to replicate a high-latency `write` - for varying chunk sizes: ``` $ cat test_fstream_flush.cpp int main(int argc, char* argv[]) { assert(argc == 3); const auto* path = argv[1]; const auto chunk_size = std::stoul(argv[2]); const auto fd = open(path, O_CREAT | O_TRUNC | O_WRONLY | O_SYNC | O_CLOEXEC, 0666); assert(fd >= 0); auto filebuf = __gnu_cxx::stdio_filebuf<char>(fd, std::ios_base::out); auto stream = std::ostream(&filebuf); const auto chunk = std::vector<char>(chunk_size); for (auto i = 0; i < 1'000; ++i) { stream.write(chunk.data(), chunk.size()); } return 0; } ``` ``` $ g++ -o /tmp/test_fstream_flush test_fstream_flush.cpp -std=c++17 $ for i in $(seq 1021 1025); do echo -e "\n$i"; time /tmp/test_fstream_flush /tmp/foo $i; done 1021 real 0m0.997s user 0m0.000s sys 0m0.038s 1022 real 0m0.939s user 0m0.005s sys 0m0.032s 1023 real 0m0.954s user 0m0.005s sys 0m0.034s 1024 real 0m7.102s user 0m0.040s sys 0m0.192s 1025 real 0m7.204s user 0m0.025s sys 0m0.209s ``` See the huge drop in performance at the 1024-boundary. 
An `strace` confirms that from size 1024 we effectively defeat buffering: 1023-sized writes ``` $ strace -P /tmp/foo -e openat,write,writev /tmp/test_fstream_flush /tmp/foo 1023 2>&1 | head -n5 openat(AT_FDCWD, "/tmp/foo", O_WRONLY|O_CREAT|O_TRUNC|O_SYNC|O_CLOEXEC, 0666) = 3 writev(3, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=8184}, {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=1023}], 2) = 9207 writev(3, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=8184}, {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=1023}], 2) = 9207 writev(3, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=8184}, {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=1023}], 2) = 9207 writev(3, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=8184}, {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=1023}], 2) = 9207 ``` vs 1024-sized writes ``` $ strace -P /tmp/foo -e openat,write,writev /tmp/test_fstream_flush /tmp/foo 1024 2>&1 | head -n5 openat(AT_FDCWD, "/tmp/foo", O_WRONLY|O_CREAT|O_TRUNC|O_SYNC|O_CLOEXEC, 0666) = 3 writev(3, [{iov_base=NULL, iov_len=0}, {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=1024}], 2) = 1024 writev(3, [{iov_base="", iov_len=0}, {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=1024}], 2) = 1024 writev(3, [{iov_base="", iov_len=0}, {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=1024}], 2) = 1024 writev(3, [{iov_base="", iov_len=0}, {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=1024}], 2) = 1024 ``` Instead, it makes sense to only bypass the buffer if the amount of data to 
be written is larger than the buffer capacity. Closes https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63746 Signed-off-by: Charles-Francois Natali --- libstdc++-v3/include/bits/fstream.tcc | 9 ++--- .../27_io/basic_filebuf/sputn/char/63746.cc | 38 +++ 2 files changed, 41 insertions(+), 6 deletions(-) create mode 100644 libstdc++-v3/testsuite/27_io/basic_filebuf/sputn/char/63746.cc diff --git a/libstdc++-v3/include/bits/fstream.tcc b/libstdc++-v3/include/bits/fstream.tcc index 7ccc887b8..2e9369628 100644 --- a/libstdc++-v3/include/bits/fstream.tcc +++ b/libstdc++-v3/include/bits/fstream.tcc @@ -757,23 +757,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { streamsize __ret = 0; // Optimization in the always_noconv() case, to be generalized in the - // future: when __n is sufficiently large we write directly instead of - // using the buffer. + // future: when __n is larger than the available capacity we write + // directly instead of using the buffer. const bool __testout =
[PATCH] libstdc++: basic_filebuf: don't flush more often than necessary.
`basic_filebuf::xsputn` would bypass the buffer when passed a chunk of size 1024 and above, seemingly as an optimisation. This can have a significant performance impact if the overhead of a `write` syscall is non-negligible, e.g. on a slow disk, on network filesystems, or simply during IO contention, because instead of flushing every `BUFSIZ` (by default), we can flush every 1024 chars. The impact is even greater with custom larger buffers, e.g. for network filesystems, because the code could issue `write` for example 1000X more often than necessary with respect to the buffer size. It also introduces a significant discontinuity in performance when writing chunks of size 1024 and above. See this reproducer, which writes a fixed number of chunks to a file opened with `O_SYNC` - to replicate a high-latency `write` - for varying chunk sizes: ``` $ cat test_fstream_flush.cpp int main(int argc, char* argv[]) { assert(argc == 3); const auto* path = argv[1]; const auto chunk_size = std::stoul(argv[2]); const auto fd = open(path, O_CREAT | O_TRUNC | O_WRONLY | O_SYNC | O_CLOEXEC, 0666); assert(fd >= 0); auto filebuf = __gnu_cxx::stdio_filebuf<char>(fd, std::ios_base::out); auto stream = std::ostream(&filebuf); const auto chunk = std::vector<char>(chunk_size); for (auto i = 0; i < 1'000; ++i) { stream.write(chunk.data(), chunk.size()); } return 0; } ``` ``` $ g++ -o /tmp/test_fstream_flush test_fstream_flush.cpp -std=c++17 $ for i in $(seq 1021 1025); do echo -e "\n$i"; time /tmp/test_fstream_flush /tmp/foo $i; done 1021 real 0m0.997s user 0m0.000s sys 0m0.038s 1022 real 0m0.939s user 0m0.005s sys 0m0.032s 1023 real 0m0.954s user 0m0.005s sys 0m0.034s 1024 real 0m7.102s user 0m0.040s sys 0m0.192s 1025 real 0m7.204s user 0m0.025s sys 0m0.209s ``` See the huge drop in performance at the 1024-boundary. 
An `strace` confirms that from size 1024 we effectively defeat buffering: 1023-sized writes ``` $ strace -P /tmp/foo -e openat,write,writev /tmp/test_fstream_flush /tmp/foo 1023 2>&1 | head -n5 openat(AT_FDCWD, "/tmp/foo", O_WRONLY|O_CREAT|O_TRUNC|O_SYNC|O_CLOEXEC, 0666) = 3 writev(3, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=8184}, {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=1023}], 2) = 9207 writev(3, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=8184}, {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=1023}], 2) = 9207 writev(3, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=8184}, {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=1023}], 2) = 9207 writev(3, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=8184}, {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=1023}], 2) = 9207 ``` vs 1024-sized writes ``` $ strace -P /tmp/foo -e openat,write,writev /tmp/test_fstream_flush /tmp/foo 1024 2>&1 | head -n5 openat(AT_FDCWD, "/tmp/foo", O_WRONLY|O_CREAT|O_TRUNC|O_SYNC|O_CLOEXEC, 0666) = 3 writev(3, [{iov_base=NULL, iov_len=0}, {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=1024}], 2) = 1024 writev(3, [{iov_base="", iov_len=0}, {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=1024}], 2) = 1024 writev(3, [{iov_base="", iov_len=0}, {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=1024}], 2) = 1024 writev(3, [{iov_base="", iov_len=0}, {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=1024}], 2) = 1024 ``` Instead, it makes sense to only bypass the buffer if the amount of data to 
be written is larger than the buffer capacity. Closes https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63746 --- libstdc++-v3/include/bits/fstream.tcc | 9 +-- .../27_io/basic_filebuf/sputn/char/63746.cc | 55 +++ 2 files changed, 58 insertions(+), 6 deletions(-) create mode 100644 libstdc++-v3/testsuite/27_io/basic_filebuf/sputn/char/63746.cc diff --git a/libstdc++-v3/include/bits/fstream.tcc b/libstdc++-v3/include/bits/fstream.tcc index 7ccc887b8..2e9369628 100644 --- a/libstdc++-v3/include/bits/fstream.tcc +++ b/libstdc++-v3/include/bits/fstream.tcc @@ -757,23 +757,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION { streamsize __ret = 0; // Optimization in the always_noconv() case, to be generalized in the - // future: when __n is sufficiently large we write directly instead of - // using the buffer. + // future: when __n is larger than the available capacity we write + // directly instead of using the buffer. const bool __testout = (_M_mode & ios_base::out