[PATCH v2] libstdc++: basic_filebuf: don't flush more often than necessary.

2022-10-06 Thread Charles-Francois Natali via Gcc-patches
`basic_filebuf::xsputn` would bypass the buffer when passed a chunk of
size 1024 and above, seemingly as an optimisation.

This can have a significant performance impact if the overhead of a
`write` syscall is non-negligible, e.g. on a slow disk, on network
filesystems, or simply during IO contention because instead of flushing
every `BUFSIZ` (by default), we can flush every 1024 char.
The impact is even greater with custom larger buffers, e.g. for network
filesystems, because the code could issue `write` for example 1000X more
often than necessary with respect to the buffer size.
It also introduces a significant discontinuity in performance when
writing chunks of size 1024 and above.

See this reproducer which writes down a fixed number of chunks to a file
open with `O_SYNC` - to replicate high-latency `write` - for varying
size of chunks:

```
$ cat test_fstream_flush.cpp

// Reconstructed reproducer: the mailing-list archive stripped every
// `#include <...>` line and all `<...>` template arguments (eaten as HTML
// tags). The forms below are the only ones that compile with the
// `g++ -std=c++17` command shown in the email.
#include <cassert>
#include <fcntl.h>   // open, O_* flags (POSIX)
#include <ostream>
#include <string>
#include <vector>
#include <ext/stdio_filebuf.h>  // GNU extension: filebuf over an open fd

int
main(int argc, char* argv[])
{
  // Usage: test_fstream_flush <path> <chunk_size>
  assert(argc == 3);

  const auto* path = argv[1];
  const auto chunk_size = std::stoul(argv[2]);

  // O_SYNC makes every write hit stable storage, simulating a
  // high-latency `write` so flush frequency dominates runtime.
  const auto fd =
    open(path, O_CREAT | O_TRUNC | O_WRONLY | O_SYNC | O_CLOEXEC, 0666);
  assert(fd >= 0);

  // stdio_filebuf has no default for its CharT parameter, so the
  // original must have spelled it out; attach the stream to it.
  auto filebuf = __gnu_cxx::stdio_filebuf<char>(fd, std::ios_base::out);
  auto stream = std::ostream(&filebuf);

  // Zero-initialized payload of the size under test.
  const auto chunk = std::vector<char>(chunk_size);

  // Fixed chunk count: total syscalls then depend only on chunk_size
  // and the filebuf's flushing policy.
  for (auto i = 0; i < 1'000; ++i) {
    stream.write(chunk.data(), chunk.size());
  }

  return 0;
}
```

```
$ g++ -o /tmp/test_fstream_flush test_fstream_flush.cpp -std=c++17
$ for i in $(seq 1021 1025); do echo -e "\n$i"; time /tmp/test_fstream_flush 
/tmp/foo $i; done

1021

real	0m0.997s
user	0m0.000s
sys	0m0.038s

1022

real	0m0.939s
user	0m0.005s
sys	0m0.032s

1023

real	0m0.954s
user	0m0.005s
sys	0m0.034s

1024

real	0m7.102s
user	0m0.040s
sys	0m0.192s

1025

real	0m7.204s
user	0m0.025s
sys	0m0.209s
```

See the huge drop in performance at the 1024-boundary.

An `strace` confirms that from size 1024 we effectively defeat
buffering:
1023-sized writes
```
$ strace -P /tmp/foo -e openat,write,writev /tmp/test_fstream_flush /tmp/foo 
1023 2>&1 | head -n5
openat(AT_FDCWD, "/tmp/foo", O_WRONLY|O_CREAT|O_TRUNC|O_SYNC|O_CLOEXEC, 0666) = 
3
writev(3, 
[{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=8184}, 
{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=1023}], 2) = 9207
writev(3, 
[{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=8184}, 
{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=1023}], 2) = 9207
writev(3, 
[{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=8184}, 
{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=1023}], 2) = 9207
writev(3, 
[{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=8184}, 
{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=1023}], 2) = 9207
```

vs 1024-sized writes
```
$ strace -P /tmp/foo -e openat,write,writev /tmp/test_fstream_flush /tmp/foo 
1024 2>&1 | head -n5
openat(AT_FDCWD, "/tmp/foo", O_WRONLY|O_CREAT|O_TRUNC|O_SYNC|O_CLOEXEC, 0666) = 
3
writev(3, [{iov_base=NULL, iov_len=0}, 
{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=1024}], 2) = 1024
writev(3, [{iov_base="", iov_len=0}, 
{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=1024}], 2) = 1024
writev(3, [{iov_base="", iov_len=0}, 
{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=1024}], 2) = 1024
writev(3, [{iov_base="", iov_len=0}, 
{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=1024}], 2) = 1024
```

Instead, it makes sense to only bypass the buffer if the amount of data
to be written is larger than the buffer capacity.

Closes https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63746

Signed-off-by: Charles-Francois Natali 
---
 libstdc++-v3/include/bits/fstream.tcc |  9 ++---
 .../27_io/basic_filebuf/sputn/char/63746.cc   | 38 +++
 2 files changed, 41 insertions(+), 6 deletions(-)
 create mode 100644 
libstdc++-v3/testsuite/27_io/basic_filebuf/sputn/char/63746.cc

diff --git a/libstdc++-v3/include/bits/fstream.tcc 
b/libstdc++-v3/include/bits/fstream.tcc
index 7ccc887b8..2e9369628 100644
--- a/libstdc++-v3/include/bits/fstream.tcc
+++ b/libstdc++-v3/include/bits/fstream.tcc
@@ -757,23 +757,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 {
   streamsize __ret = 0;
   // Optimization in the always_noconv() case, to be generalized in the
-  // future: when __n is sufficiently large we write directly instead of
-  // using the buffer.
+  // future: when __n is larger than the available capacity we write
+  // directly instead of using the buffer.
   const bool __testout = 

[PATCH] libstdc++: basic_filebuf: don't flush more often than necessary.

2022-09-05 Thread Charles-Francois Natali via Gcc-patches
`basic_filebuf::xsputn` would bypass the buffer when passed a chunk of
size 1024 and above, seemingly as an optimisation.

This can have a significant performance impact if the overhead of a
`write` syscall is non-negligible, e.g. on a slow disk, on network
filesystems, or simply during IO contention because instead of flushing
every `BUFSIZ` (by default), we can flush every 1024 char.
The impact is even greater with custom larger buffers, e.g. for network
filesystems, because the code could issue `write` for example 1000X more
often than necessary with respect to the buffer size.
It also introduces a significant discontinuity in performance when
writing chunks of size 1024 and above.

See this reproducer which writes down a fixed number of chunks to a file
open with `O_SYNC` - to replicate high-latency `write` - for varying
size of chunks:

```
$ cat test_fstream_flush.cpp

// Reconstructed reproducer: the mailing-list archive stripped every
// `#include <...>` line and all `<...>` template arguments (eaten as HTML
// tags). The forms below are the only ones that compile with the
// `g++ -std=c++17` command shown in the email.
#include <cassert>
#include <fcntl.h>   // open, O_* flags (POSIX)
#include <ostream>
#include <string>
#include <vector>
#include <ext/stdio_filebuf.h>  // GNU extension: filebuf over an open fd

int
main(int argc, char* argv[])
{
  // Usage: test_fstream_flush <path> <chunk_size>
  assert(argc == 3);

  const auto* path = argv[1];
  const auto chunk_size = std::stoul(argv[2]);

  // O_SYNC makes every write hit stable storage, simulating a
  // high-latency `write` so flush frequency dominates runtime.
  const auto fd =
    open(path, O_CREAT | O_TRUNC | O_WRONLY | O_SYNC | O_CLOEXEC, 0666);
  assert(fd >= 0);

  // stdio_filebuf has no default for its CharT parameter, so the
  // original must have spelled it out; attach the stream to it.
  auto filebuf = __gnu_cxx::stdio_filebuf<char>(fd, std::ios_base::out);
  auto stream = std::ostream(&filebuf);

  // Zero-initialized payload of the size under test.
  const auto chunk = std::vector<char>(chunk_size);

  // Fixed chunk count: total syscalls then depend only on chunk_size
  // and the filebuf's flushing policy.
  for (auto i = 0; i < 1'000; ++i) {
    stream.write(chunk.data(), chunk.size());
  }

  return 0;
}
```

```
$ g++ -o /tmp/test_fstream_flush test_fstream_flush.cpp -std=c++17
$ for i in $(seq 1021 1025); do echo -e "\n$i"; time /tmp/test_fstream_flush 
/tmp/foo $i; done

1021

real	0m0.997s
user	0m0.000s
sys	0m0.038s

1022

real	0m0.939s
user	0m0.005s
sys	0m0.032s

1023

real	0m0.954s
user	0m0.005s
sys	0m0.034s

1024

real	0m7.102s
user	0m0.040s
sys	0m0.192s

1025

real	0m7.204s
user	0m0.025s
sys	0m0.209s
```

See the huge drop in performance at the 1024-boundary.

An `strace` confirms that from size 1024 we effectively defeat
buffering:
1023-sized writes
```
$ strace -P /tmp/foo -e openat,write,writev /tmp/test_fstream_flush /tmp/foo 
1023 2>&1 | head -n5
openat(AT_FDCWD, "/tmp/foo", O_WRONLY|O_CREAT|O_TRUNC|O_SYNC|O_CLOEXEC, 0666) = 
3
writev(3, 
[{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=8184}, 
{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=1023}], 2) = 9207
writev(3, 
[{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=8184}, 
{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=1023}], 2) = 9207
writev(3, 
[{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=8184}, 
{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=1023}], 2) = 9207
writev(3, 
[{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=8184}, 
{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=1023}], 2) = 9207
```

vs 1024-sized writes
```
$ strace -P /tmp/foo -e openat,write,writev /tmp/test_fstream_flush /tmp/foo 
1024 2>&1 | head -n5
openat(AT_FDCWD, "/tmp/foo", O_WRONLY|O_CREAT|O_TRUNC|O_SYNC|O_CLOEXEC, 0666) = 
3
writev(3, [{iov_base=NULL, iov_len=0}, 
{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=1024}], 2) = 1024
writev(3, [{iov_base="", iov_len=0}, 
{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=1024}], 2) = 1024
writev(3, [{iov_base="", iov_len=0}, 
{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=1024}], 2) = 1024
writev(3, [{iov_base="", iov_len=0}, 
{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"...,
 iov_len=1024}], 2) = 1024
```

Instead, it makes sense to only bypass the buffer if the amount of data
to be written is larger than the buffer capacity.

Closes https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63746
---
 libstdc++-v3/include/bits/fstream.tcc |  9 +--
 .../27_io/basic_filebuf/sputn/char/63746.cc   | 55 +++
 2 files changed, 58 insertions(+), 6 deletions(-)
 create mode 100644 
libstdc++-v3/testsuite/27_io/basic_filebuf/sputn/char/63746.cc

diff --git a/libstdc++-v3/include/bits/fstream.tcc 
b/libstdc++-v3/include/bits/fstream.tcc
index 7ccc887b8..2e9369628 100644
--- a/libstdc++-v3/include/bits/fstream.tcc
+++ b/libstdc++-v3/include/bits/fstream.tcc
@@ -757,23 +757,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 {
   streamsize __ret = 0;
   // Optimization in the always_noconv() case, to be generalized in the
-  // future: when __n is sufficiently large we write directly instead of
-  // using the buffer.
+  // future: when __n is larger than the available capacity we write
+  // directly instead of using the buffer.
   const bool __testout = (_M_mode & ios_base::out