A good reference for the concepts used here is:
https://mazzo.li/posts/fast-pipes.html
We don't consider huge pages or busy loops here,
but use vmsplice(), and splice() to get significant speedups:
i7-5600U-laptop $ taskset 1 yes | taskset 2 pv > /dev/null
... [4.98GiB/s]
i7-5600U-laptop $ taskset 1 src/yes | taskset 2 pv > /dev/null
... [34.1GiB/s]
IBM,9043-MRX $ taskset 1 yes | taskset 2 pv > /dev/null
... [11.6GiB/s]
IBM,9043-MRX $ taskset 1 src/yes | taskset 2 pv > /dev/null
... [175GiB/s]
Also throughput to file (on BTRFS) was seen to increase significantly.
With a Fedora 43 laptop improving from 690MiB/s to 1.1GiB/s.
* bootstrap.conf: Ensure sys/uio.h is present.
This was an existing transitive dependency.
* m4/jm-macros.m4: Define HAVE_SPLICE appropriately.
We assume vmsplice() is available if splice() is as they
were introduced at the same time to Linux and glibc.
* src/yes.c (repeat_pattern): A new function to efficiently
duplicate a pattern in a buffer with memcpy calls that double in size.
This also makes the setup for the existing write() path more efficient.
(pipe_splice_size): A new function to increase the kernel pipe buffer
if possible, and use an appropriately sized buffer based on that (25%).
(splice_write): A new function to call vmplice() when outputting
to a pipe, and also splice() if outputting to a non-pipe.
* tests/misc/yes.sh: Verify the non-pipe output case,
(main): Adjust to always calling write on the minimal buffer first,
then trying vmsplice(), then falling back to write from bigger buffer.
and the vmsplice() fallback to write() case.
* NEWS: Mention the improvement.
---
NEWS | 3 +
bootstrap.conf | 1 +
m4/jm-macros.m4 | 1 +
src/yes.c | 161 +++++++++++++++++++++++++++++++++++++++++++---
tests/misc/yes.sh | 13 ++++
5 files changed, 171 insertions(+), 8 deletions(-)
diff --git a/NEWS b/NEWS
index cec03a581..c69a55d61 100644
--- a/NEWS
+++ b/NEWS
@@ -50,6 +50,9 @@ GNU coreutils NEWS -*-
outline -*-
'wc -l' now operates up to three times faster on hosts that support Neon
instructions.
+ 'yes' now uses zero-copy I/O on Linux to significantly increase throughput.
+ E.g., increases from 12GiB/s to 175GiB/s were seen on some systems.
+
** Build-related
./configure --enable-single-binary=hardlinks is now supported on systems
diff --git a/bootstrap.conf b/bootstrap.conf
index 331382841..5f15a82f9 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -287,6 +287,7 @@ gnulib_modules="
sys_resource-h
sys_stat-h
sys_types-h
+ sys_uio-h
sys_wait-h
targetdir
tempname
diff --git a/m4/jm-macros.m4 b/m4/jm-macros.m4
index 53be2e342..fbf082703 100644
--- a/m4/jm-macros.m4
+++ b/m4/jm-macros.m4
@@ -70,6 +70,7 @@ AC_DEFUN([coreutils_MACROS],
setgroups
sethostname
siginterrupt
+ splice
sync
sysinfo
tcgetpgrp
diff --git a/src/yes.c b/src/yes.c
index 66b6243db..7f8022de9 100644
--- a/src/yes.c
+++ b/src/yes.c
@@ -19,10 +19,13 @@
#include <config.h>
#include <stdio.h>
#include <sys/types.h>
+#include <sys/uio.h>
#include "system.h"
+#include "alignalloc.h"
#include "full-write.h"
+#include "isapipe.h"
#include "long-options.h"
/* The official name of this program (e.g., no 'g' prefix). */
@@ -54,6 +57,144 @@ Repeatedly output a line with all specified STRING(s), or
'y'.\n\
exit (status);
}
+/* Fill DEST[0..BUFSIZE-1] with repeated copies of SRC[0..SRCSIZE-1],
+ doubling the copy size each iteration. DEST may equal SRC. */
+
+static void
+repeat_pattern (char *dest, char const *src, size_t srcsize, size_t bufsize)
+{
+ if (dest != src)
+ memcpy (dest, src, srcsize);
+ for (size_t filled = srcsize; filled < bufsize; )
+ {
+ size_t chunk = MIN (filled, bufsize - filled);
+ memcpy (dest + filled, dest, chunk);
+ filled += chunk;
+ }
+}
+
+#if HAVE_SPLICE
+
+/* Empirically determined pipe size for best throughput.
+ Needs to be <= /proc/sys/fs/pipe-max-size */
+enum { SPLICE_PIPE_SIZE = 512 * 1024 };
+
+/* Enlarge a pipe towards SPLICE_PIPE_SIZE and return the actual
+ capacity as a quarter of the pipe size (the empirical sweet spot
+ for vmsplice throughput), rounded down to a multiple of COPYSIZE.
+ Return 0 if the result would be smaller than COPYSIZE. */
+
+static size_t
+pipe_splice_size (int fd, size_t copysize)
+{
+ int pipe_cap = 0;
+# if defined F_SETPIPE_SZ && defined F_GETPIPE_SZ
+ if ((pipe_cap = fcntl (fd, F_SETPIPE_SZ, SPLICE_PIPE_SIZE)) < 0)
+ pipe_cap = fcntl (fd, F_GETPIPE_SZ);
+# endif
+ if (pipe_cap <= 0)
+ pipe_cap = 64 * 1024;
+
+ size_t buf_cap = pipe_cap / 4;
+ return buf_cap / copysize * copysize;
+}
+
+#endif
+
+/* Repeatedly write the COPYSIZE-byte pattern in BUF to standard output
+ using vmsplice/splice zero-copy I/O. Since the data never varies,
+ SPLICE_F_GIFT tells the kernel the pages will not be modified.
+
+ Return TRUE if splice I/O was used (caller should check errno and
+ report any error). Return FALSE if splice could not be used. */
+
+static bool
+splice_write (char const *buf, size_t copysize)
+{
+ bool output_started = false;
+#if HAVE_SPLICE
+ size_t page_size = getpagesize ();
+
+ bool stdout_is_pipe = isapipe (STDOUT_FILENO) > 0;
+
+ /* Determine buffer size: enlarge the target pipe,
+ then use 1/4 of actual capacity as the transfer size. */
+ int pipefd[2] = { -1, -1 };
+ size_t splice_bufsize;
+ char *splice_buf = NULL;
+
+ if (stdout_is_pipe)
+ splice_bufsize = pipe_splice_size (STDOUT_FILENO, copysize);
+ else
+ {
+ if (pipe2 (pipefd, 0) < 0)
+ return false;
+ splice_bufsize = pipe_splice_size (pipefd[0], copysize);
+ }
+
+ if (splice_bufsize == 0)
+ goto done;
+
+ /* Allocate page-aligned buffer for vmsplice. */
+ if (! (splice_buf = alignalloc (page_size, splice_bufsize)))
+ goto done;
+
+ repeat_pattern (splice_buf, buf, copysize, splice_bufsize);
+
+ /* For the pipe case, vmsplice directly to stdout.
+ For the non-pipe case, vmsplice into the intermediate pipe
+ and then splice from it to stdout. */
+ int vmsplice_fd = stdout_is_pipe ? STDOUT_FILENO : pipefd[1];
+
+ for (;;)
+ {
+ struct iovec iov = { .iov_base = splice_buf,
+ .iov_len = splice_bufsize };
+
+ while (iov.iov_len > 0)
+ {
+ /* Use SPLICE_F_{GIFT,MOVE} to allow the kernel to take references
+ to the pages. I.e., we're indicating we won't make changes.
+ SPLICE_F_GIFT is only appropriate for full pages. */
+ unsigned int flags = iov.iov_len % page_size ? 0 : SPLICE_F_GIFT;
+ ssize_t n = vmsplice (vmsplice_fd, &iov, 1, flags);
+ if (n <= 0)
+ goto done;
+ if (stdout_is_pipe)
+ output_started = true;
+ iov.iov_base = (char *) iov.iov_base + n;
+ iov.iov_len -= n;
+ }
+
+ /* For non-pipe stdout, drain intermediate pipe to stdout. */
+ if (! stdout_is_pipe)
+ {
+ size_t remaining = splice_bufsize;
+ while (remaining > 0)
+ {
+ ssize_t s = splice (pipefd[0], NULL, STDOUT_FILENO, NULL,
+ remaining, SPLICE_F_MOVE);
+ if (s <= 0)
+ goto done;
+ output_started = true;
+ remaining -= s;
+ }
+ }
+ }
+
+done:
+ if (pipefd[0] >= 0)
+ {
+ int saved_errno = errno;
+ close (pipefd[0]);
+ close (pipefd[1]);
+ errno = saved_errno;
+ }
+ alignfree (splice_buf);
+#endif
+ return output_started;
+}
+
int
main (int argc, char **argv)
{
@@ -117,18 +258,22 @@ main (int argc, char **argv)
while (++operandp < operand_lim);
buf[bufused - 1] = '\n';
- /* If a larger buffer was allocated, fill it by repeating the buffer
- contents. */
size_t copysize = bufused;
- for (size_t copies = bufalloc / copysize; --copies; )
+
+ /* Repeatedly output the buffer until there is a write error; then fail.
+ Do a minimal write first to check output with minimal set up cost.
+ If successful then set up for efficient repetition. */
+ if (full_write (STDOUT_FILENO, buf, copysize) == copysize
+ && splice_write (buf, copysize) == 0)
{
- memcpy (buf + bufused, buf, copysize);
- bufused += copysize;
+ /* If a larger buffer was allocated, fill it by repeated copies. */
+ bufused = bufalloc / copysize * copysize;
+ if (bufused > copysize)
+ repeat_pattern (buf, buf, copysize, bufused);
+ while (full_write (STDOUT_FILENO, buf, bufused) == bufused)
+ continue;
}
- /* Repeatedly output the buffer until there is a write error; then fail. */
- while (full_write (STDOUT_FILENO, buf, bufused) == bufused)
- continue;
error (0, errno, _("standard output"));
main_exit (EXIT_FAILURE);
}
diff --git a/tests/misc/yes.sh b/tests/misc/yes.sh
index ba340c9fa..640b6ad5c 100755
--- a/tests/misc/yes.sh
+++ b/tests/misc/yes.sh
@@ -19,6 +19,7 @@
. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
print_ver_ yes
getlimits_
+uses_strace_
# Check basic operation
test "$(yes | head -n1)" = 'y' || fail=1
@@ -56,4 +57,16 @@ if test -w /dev/full && test -c /dev/full; then
done
fi
+# Check the non pipe output case, since that is different with splice
+if timeout 10 true; then
+ timeout .1 yes >/dev/null
+ test $? = 124 || fail=1
+fi
+
+# Ensure we fallback to write() if there is an issue with vmsplice
+no_vmsplice() { strace -e inject=vmsplice:error=ENOSYS "$@"; }
+if no_vmsplice true; then
+ test "$(no_vmsplice yes | head -n2 | paste -s -d '')" = 'yy' || fail=1
+fi
+
Exit $fail
--
2.53.0