A good reference for the concepts used here is:
https://mazzo.li/posts/fast-pipes.html
We don't consider huge pages or busy loops here,
but use vmsplice(), and splice() to get significant speedups:

  i7-5600U-laptop $ taskset 1 yes | taskset 2 pv > /dev/null
  ... [4.98GiB/s]
  i7-5600U-laptop $ taskset 1 src/yes | taskset 2 pv > /dev/null
  ... [34.1GiB/s]

  IBM,9043-MRX $ taskset 1 yes | taskset 2 pv > /dev/null
  ... [11.6GiB/s]
  IBM,9043-MRX $ taskset 1 src/yes | taskset 2 pv > /dev/null
  ... [175GiB/s]

Also throughput to file (on BTRFS) was seen to increase significantly.
With a Fedora 43 laptop improving from 690MiB/s to 1.1GiB/s.

* bootstrap.conf: Ensure sys/uio.h is present.
This was an existing transitive dependency.
* m4/jm-macros.m4: Define HAVE_SPLICE appropriately.
We assume vmsplice() is available if splice() is as they
were introduced at the same time to Linux and glibc.
* src/yes.c (repeat_pattern): A new function to efficiently
duplicate a pattern in a buffer with memcpy calls that double in size.
This also makes the setup for the existing write() path more efficient.
(pipe_splice_size): A new function to increase the kernel pipe buffer
if possible, and use an appropriately sized buffer based on that (25%).
(splice_write): A new function to call vmplice() when outputting
to a pipe, and also splice() if outputting to a non-pipe.
* tests/misc/yes.sh: Verify the non-pipe output case,
(main): Adjust to always calling write on the minimal buffer first,
then trying vmsplice(), then falling back to write from bigger buffer.
and the vmsplice() fallback to write() case.
* NEWS: Mention the improvement.
---
 NEWS              |   3 +
 bootstrap.conf    |   1 +
 m4/jm-macros.m4   |   1 +
 src/yes.c         | 161 +++++++++++++++++++++++++++++++++++++++++++---
 tests/misc/yes.sh |  13 ++++
 5 files changed, 171 insertions(+), 8 deletions(-)

diff --git a/NEWS b/NEWS
index cec03a581..c69a55d61 100644
--- a/NEWS
+++ b/NEWS
@@ -50,6 +50,9 @@ GNU coreutils NEWS                                    -*- 
outline -*-
   'wc -l' now operates up to three times faster on hosts that support Neon
   instructions.
 
+  'yes' now uses zero-copy I/O on Linux to significantly increase throughput.
+  E.g., increases from 12GiB/s to 175GiB/s were seen on some systems.
+
 ** Build-related
 
   ./configure --enable-single-binary=hardlinks is now supported on systems
diff --git a/bootstrap.conf b/bootstrap.conf
index 331382841..5f15a82f9 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -287,6 +287,7 @@ gnulib_modules="
   sys_resource-h
   sys_stat-h
   sys_types-h
+  sys_uio-h
   sys_wait-h
   targetdir
   tempname
diff --git a/m4/jm-macros.m4 b/m4/jm-macros.m4
index 53be2e342..fbf082703 100644
--- a/m4/jm-macros.m4
+++ b/m4/jm-macros.m4
@@ -70,6 +70,7 @@ AC_DEFUN([coreutils_MACROS],
     setgroups
     sethostname
     siginterrupt
+    splice
     sync
     sysinfo
     tcgetpgrp
diff --git a/src/yes.c b/src/yes.c
index 66b6243db..7f8022de9 100644
--- a/src/yes.c
+++ b/src/yes.c
@@ -19,10 +19,13 @@
 #include <config.h>
 #include <stdio.h>
 #include <sys/types.h>
+#include <sys/uio.h>
 
 #include "system.h"
 
+#include "alignalloc.h"
 #include "full-write.h"
+#include "isapipe.h"
 #include "long-options.h"
 
 /* The official name of this program (e.g., no 'g' prefix).  */
@@ -54,6 +57,144 @@ Repeatedly output a line with all specified STRING(s), or 
'y'.\n\
   exit (status);
 }
 
+/* Fill DEST[0..BUFSIZE-1] with repeated copies of SRC[0..SRCSIZE-1],
+   doubling the copy size each iteration.  DEST may equal SRC.  */
+
+static void
+repeat_pattern (char *dest, char const *src, size_t srcsize, size_t bufsize)
+{
+  if (dest != src)
+    memcpy (dest, src, srcsize);
+  for (size_t filled = srcsize; filled < bufsize; )
+    {
+      size_t chunk = MIN (filled, bufsize - filled);
+      memcpy (dest + filled, dest, chunk);
+      filled += chunk;
+    }
+}
+
+#if HAVE_SPLICE
+
+/* Empirically determined pipe size for best throughput.
+   Needs to be <= /proc/sys/fs/pipe-max-size  */
+enum { SPLICE_PIPE_SIZE = 512 * 1024 };
+
+/* Enlarge a pipe towards SPLICE_PIPE_SIZE and return the actual
+   capacity as a quarter of the pipe size (the empirical sweet spot
+   for vmsplice throughput), rounded down to a multiple of COPYSIZE.
+   Return 0 if the result would be smaller than COPYSIZE.  */
+
+static size_t
+pipe_splice_size (int fd, size_t copysize)
+{
+  int pipe_cap = 0;
+# if defined F_SETPIPE_SZ && defined F_GETPIPE_SZ
+  if ((pipe_cap = fcntl (fd, F_SETPIPE_SZ, SPLICE_PIPE_SIZE)) < 0)
+    pipe_cap = fcntl (fd, F_GETPIPE_SZ);
+# endif
+  if (pipe_cap <= 0)
+    pipe_cap = 64 * 1024;
+
+  size_t buf_cap = pipe_cap / 4;
+  return buf_cap / copysize * copysize;
+}
+
+#endif
+
+/* Repeatedly write the COPYSIZE-byte pattern in BUF to standard output
+   using vmsplice/splice zero-copy I/O.  Since the data never varies,
+   SPLICE_F_GIFT tells the kernel the pages will not be modified.
+
+   Return TRUE if splice I/O was used (caller should check errno and
+   report any error).  Return FALSE if splice could not be used.  */
+
+static bool
+splice_write (char const *buf, size_t copysize)
+{
+  bool output_started = false;
+#if HAVE_SPLICE
+  size_t page_size = getpagesize ();
+
+  bool stdout_is_pipe = isapipe (STDOUT_FILENO) > 0;
+
+  /* Determine buffer size: enlarge the target pipe,
+     then use 1/4 of actual capacity as the transfer size.  */
+  int pipefd[2] = { -1, -1 };
+  size_t splice_bufsize;
+  char *splice_buf = NULL;
+
+  if (stdout_is_pipe)
+    splice_bufsize = pipe_splice_size (STDOUT_FILENO, copysize);
+  else
+    {
+      if (pipe2 (pipefd, 0) < 0)
+        return false;
+      splice_bufsize = pipe_splice_size (pipefd[0], copysize);
+    }
+
+  if (splice_bufsize == 0)
+    goto done;
+
+  /* Allocate page-aligned buffer for vmsplice.  */
+  if (! (splice_buf = alignalloc (page_size, splice_bufsize)))
+    goto done;
+
+  repeat_pattern (splice_buf, buf, copysize, splice_bufsize);
+
+  /* For the pipe case, vmsplice directly to stdout.
+     For the non-pipe case, vmsplice into the intermediate pipe
+     and then splice from it to stdout.  */
+  int vmsplice_fd = stdout_is_pipe ? STDOUT_FILENO : pipefd[1];
+
+  for (;;)
+    {
+      struct iovec iov = { .iov_base = splice_buf,
+                           .iov_len = splice_bufsize };
+
+      while (iov.iov_len > 0)
+        {
+         /* Use SPLICE_F_{GIFT,MOVE} to allow the kernel to take references
+            to the pages.  I.e., we're indicating we won't make changes.
+            SPLICE_F_GIFT is only appropriate for full pages.  */
+          unsigned int flags = iov.iov_len % page_size ? 0 : SPLICE_F_GIFT;
+          ssize_t n = vmsplice (vmsplice_fd, &iov, 1, flags);
+          if (n <= 0)
+            goto done;
+          if (stdout_is_pipe)
+            output_started = true;
+          iov.iov_base = (char *) iov.iov_base + n;
+          iov.iov_len -= n;
+        }
+
+      /* For non-pipe stdout, drain intermediate pipe to stdout.  */
+      if (! stdout_is_pipe)
+        {
+          size_t remaining = splice_bufsize;
+          while (remaining > 0)
+            {
+              ssize_t s = splice (pipefd[0], NULL, STDOUT_FILENO, NULL,
+                                  remaining, SPLICE_F_MOVE);
+              if (s <= 0)
+                goto done;
+              output_started = true;
+              remaining -= s;
+            }
+        }
+    }
+
+done:
+  if (pipefd[0] >= 0)
+    {
+      int saved_errno = errno;
+      close (pipefd[0]);
+      close (pipefd[1]);
+      errno = saved_errno;
+    }
+  alignfree (splice_buf);
+#endif
+  return output_started;
+}
+
 int
 main (int argc, char **argv)
 {
@@ -117,18 +258,22 @@ main (int argc, char **argv)
   while (++operandp < operand_lim);
   buf[bufused - 1] = '\n';
 
-  /* If a larger buffer was allocated, fill it by repeating the buffer
-     contents.  */
   size_t copysize = bufused;
-  for (size_t copies = bufalloc / copysize; --copies; )
+
+  /* Repeatedly output the buffer until there is a write error; then fail.
+     Do a minimal write first to check output with minimal set up cost.
+     If successful then set up for efficient repetition.  */
+  if (full_write (STDOUT_FILENO, buf, copysize) == copysize
+      && splice_write (buf, copysize) == 0)
     {
-      memcpy (buf + bufused, buf, copysize);
-      bufused += copysize;
+      /* If a larger buffer was allocated, fill it by repeated copies.  */
+      bufused = bufalloc / copysize * copysize;
+      if (bufused > copysize)
+        repeat_pattern (buf, buf, copysize, bufused);
+      while (full_write (STDOUT_FILENO, buf, bufused) == bufused)
+        continue;
     }
 
-  /* Repeatedly output the buffer until there is a write error; then fail.  */
-  while (full_write (STDOUT_FILENO, buf, bufused) == bufused)
-    continue;
   error (0, errno, _("standard output"));
   main_exit (EXIT_FAILURE);
 }
diff --git a/tests/misc/yes.sh b/tests/misc/yes.sh
index ba340c9fa..640b6ad5c 100755
--- a/tests/misc/yes.sh
+++ b/tests/misc/yes.sh
@@ -19,6 +19,7 @@
 . "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
 print_ver_ yes
 getlimits_
+uses_strace_
 
 # Check basic operation
 test "$(yes | head -n1)" = 'y' || fail=1
@@ -56,4 +57,16 @@ if test -w /dev/full && test -c /dev/full; then
   done
 fi
 
+# Check the non pipe output case, since that is different with splice
+if timeout 10 true; then
+  timeout .1 yes >/dev/null
+  test $? = 124 || fail=1
+fi
+
+# Ensure we fallback to write() if there is an issue with vmsplice
+no_vmsplice() { strace -e inject=vmsplice:error=ENOSYS "$@"; }
+if no_vmsplice true; then
+  test "$(no_vmsplice yes | head -n2 | paste -s -d '')" = 'yy' || fail=1
+fi
+
 Exit $fail
-- 
2.53.0


Reply via email to