Pádraig Brady <[email protected]> writes:

> Note we don't exit upon splice_cat failure,
> which does lead to a change in behavior.
> So one change to my previous patch is
> I'd add an EXIT_FAILURE param to the error(..."standard error") call.
>
> That still doesn't help when it's ambiguous though.
> This shows we fall back to the write() path for subsequent files:
>
>   $ cat /dev/zero | (trap '' PIPE; src/cat - - -) | head -c1 >/dev/null
>   cat: splice error: Broken pipe
>   cat: write error: Broken pipe
>
> Not huge issues, but it would be nice to avoid.
> Specifically if output is gone, we'd like to avoid subsequent files.
> Consider how this now hangs for example:
>
>   $ yes | (trap '' PIPE; src/cat - /dev/tty) | head -c1 >/dev/null
>
> A bit contrived I know.

Nice catches, thanks.

> Note this is only in the case where we don't create an intermediate pipe.
> When we do have the intermediate pipe we can distinguish read errors
> from write errors, so perhaps we should always do that.
> Actually testing that here, gives faster operation anyway
> (I guess due to the sizing of the pipes):
>
> $ timeout 2 src/yes | pv -r | src/cat >/dev/null
> Creating intermediate pipe
> [36.9GiB/s]
>
> $ timeout 2 src/yes | pv -r | src/cat >/dev/null
> [22.6GiB/s]

Interesting. I wrongly assumed it would be faster without an
intermediate pipe. I think I am for always creating an intermediate
pipe. It also simplifies the code quite a bit.

Most people will never use 'ulimit -n 4 cat' or hit system wide file
descriptor limits. If they do, I don't think read and write are slow
enough for them to care (or even notice).

Attached a v4 patch.

Collin

>From 63a23e962d16241710b960a37a9947e6f6dbed4c Mon Sep 17 00:00:00 2001
Message-ID: <63a23e962d16241710b960a37a9947e6f6dbed4c.1775269558.git.collin.fu...@gmail.com>
From: Collin Funk <[email protected]>
Date: Sun, 29 Mar 2026 16:13:01 -0700
Subject: [PATCH v4] cat: use splice if operating on pipes or if
 copy_file_range fails

On an AMD Ryzen 7 3700X system:

    $ timeout 10 taskset 1 ./src/cat-prev /dev/zero \
        | taskset 2 pv -r > /dev/null
    [1.67GiB/s]
    $ timeout 10 taskset 1 ./src/cat /dev/zero \
        | taskset 2 pv -r > /dev/null
    [9.03GiB/s]

On a Power10 system:

    $ taskset 1 ./src/yes | timeout 10 taskset 2 ./src/cat-prev \
        | taskset 3 pv -r > /dev/null
    [12.9GiB/s]
    $ taskset 1 ./src/yes | timeout 10 taskset 2 ./src/cat \
            | taskset 3 pv -r > /dev/null
    [81.8GiB/s]

* NEWS: Mention the improvement.
* src/cat.c: Include isapipe.h, splice.h, and unistd--.h.
(splice_cat): New function.
(main): Use it.
* src/local.mk (noinst_HEADERS): Add src/splice.h.
* src/splice.h: New file, based on definitions from src/yes.c.
* src/yes.c: Include splice.h.
(pipe_splice_size): Use increase_pipe_size from src/splice.h.
(SPLICE_PIPE_SIZE): Remove definition, moved to src/splice.h.
* tests/cat/splice.sh: New file, based on some tests in
tests/misc/yes.sh.
* tests/local.mk (all_tests): Add the new test.
---
 NEWS                |   4 ++
 src/cat.c           | 119 ++++++++++++++++++++++++++++++++++++++++++--
 src/local.mk        |   1 +
 src/splice.h        |  41 +++++++++++++++
 src/yes.c           |  15 +-----
 tests/cat/splice.sh |  66 ++++++++++++++++++++++++
 tests/local.mk      |   1 +
 7 files changed, 231 insertions(+), 16 deletions(-)
 create mode 100644 src/splice.h
 create mode 100755 tests/cat/splice.sh

diff --git a/NEWS b/NEWS
index 2fabd07b7..e46ab72c2 100644
--- a/NEWS
+++ b/NEWS
@@ -39,6 +39,10 @@ GNU coreutils NEWS                                    -*- outline -*-
 
 ** Improvements
 
+  'cat' now uses zero-copy I/O on Linux when the input or output are pipes,
+  to significantly increase throughput.
+  E.g., throughput improved 6x from 12.9GiB/s to 81.8GiB/s on a Power10 system.
+
   'df --local' recognises more file system types as remote.
   Specifically: autofs, ncpfs, smb, smb2, gfs, gfs2, userlandfs.
 
diff --git a/src/cat.c b/src/cat.c
index f9c92005c..ed7e7ab3f 100644
--- a/src/cat.c
+++ b/src/cat.c
@@ -37,6 +37,9 @@
 #include "ioblksize.h"
 #include "fadvise.h"
 #include "full-write.h"
+#include "isapipe.h"
+#include "splice.h"
+#include "unistd--.h"
 #include "xbinary-io.h"
 
 /* The official name of this program (e.g., no 'g' prefix).  */
@@ -545,6 +548,107 @@ copy_cat (void)
       }
 }
 
+/* Copy data from input to output using splice if possible.
+   Return 1 if successful, 0 if ordinary read+write should be tried,
+   -1 if a serious problem has been diagnosed.  */
+
+static int
+splice_cat (void)
+{
+  bool some_copied = false;
+  bool in_ok = true;
+  bool out_ok = true;
+
+#if HAVE_SPLICE
+
+  static int stdout_is_pipe = -1;
+  static idx_t stdout_pipe_size = 0;
+  if (stdout_is_pipe == -1)
+    {
+      stdout_is_pipe = 0 < isapipe (STDOUT_FILENO);
+      if (stdout_is_pipe)
+        stdout_pipe_size = increase_pipe_size (STDOUT_FILENO);
+    }
+
+  bool input_is_pipe = 0 < isapipe (input_desc);
+
+  idx_t pipe_size = stdout_pipe_size;
+  if (input_is_pipe)
+    pipe_size = MAX (pipe_size, increase_pipe_size (input_desc));
+
+  int pipefd[2] = { -1, -1 };
+
+  /* Always create an intermediate pipe, even if both input and
+     output are pipes, so that read and write errors can be
+     distinguished.  */
+  if (pipe (pipefd) < 0)
+    return false;
+  pipe_size = MAX (pipe_size, increase_pipe_size (pipefd[1]));
+
+  while (true)
+    {
+      ssize_t bytes_read = splice (input_desc, NULL, pipefd[1], NULL,
+                                   pipe_size, 0);
+      /* If we successfully splice'd input previously, assume that any
+         subsequent error is fatal.  If not, then fall back to read
+         and write.  */
+      in_ok = 0 <= bytes_read || ! some_copied;
+      if (bytes_read <= 0)
+        goto done;
+      /* We need to drain the intermediate pipe to standard output.  */
+      while (0 < bytes_read)
+        {
+          ssize_t bytes_written = splice (pipefd[0], NULL, STDOUT_FILENO, NULL,
+                                          pipe_size, 0);
+          /* If we successfully splice'd output, assume any subsequent
+             error is fatal.  If not, then drain the intermediate pipe and
+             continue using read and write.  */
+          if (bytes_written < 0)
+            {
+              if (some_copied)
+                out_ok = false;
+              else
+                {
+                  char buf[BUFSIZ];
+                  while (0 < bytes_read)
+                    {
+                      ssize_t count = MIN (bytes_read, sizeof buf);
+                      ssize_t n_read = read (pipefd[0], buf, count);
+                      /* Failure not associated with in or out.  */
+                      in_ok = out_ok = 0 <= n_read;
+                      if (n_read <= 0)
+                        goto done;
+                      if (full_write (STDOUT_FILENO, buf, n_read) != n_read)
+                        write_error ();
+                      bytes_read -= n_read;
+                    }
+                }
+            }
+          if (bytes_written <= 0)
+            goto done;
+          some_copied = true;
+          bytes_read -= bytes_written;
+        }
+    }
+
+ done:
+  if (! in_ok && ! out_ok)
+    error (0, errno, "%s", _("splice error"));
+  else if (! in_ok)
+    error (0, errno, "%s", quotef (infile));
+  else if (! out_ok)
+    write_error ();
+  if (0 <= pipefd[0])
+    {
+      int saved_errno = errno;
+      close (pipefd[0]);
+      close (pipefd[1]);
+      errno = saved_errno;
+    }
+#endif
+
+  return (in_ok && out_ok) ? some_copied : -1;
+}
 
 int
 main (int argc, char **argv)
@@ -760,9 +864,18 @@ main (int argc, char **argv)
             }
           else
             {
-              insize = MAX (insize, outsize);
-              inbuf = xalignalloc (page_size, insize);
-              ok &= simple_cat (inbuf, insize);
+              int splice_cat_status = splice_cat ();
+              if (splice_cat_status != 0)
+                {
+                  inbuf = NULL;
+                  ok &= 0 < splice_cat_status;
+                }
+              else
+                {
+                  insize = MAX (insize, outsize);
+                  inbuf = xalignalloc (page_size, insize);
+                  ok &= simple_cat (inbuf, insize);
+                }
             }
         }
       else
diff --git a/src/local.mk b/src/local.mk
index bf88f7d0e..9d9c9814b 100644
--- a/src/local.mk
+++ b/src/local.mk
@@ -61,6 +61,7 @@ noinst_HEADERS =		\
   src/remove.h			\
   src/set-fields.h		\
   src/show-date.h		\
+  src/splice.h			\
   src/statx.h			\
   src/system.h			\
   src/temp-stream.h		\
diff --git a/src/splice.h b/src/splice.h
new file mode 100644
index 000000000..1fb55054d
--- /dev/null
+++ b/src/splice.h
@@ -0,0 +1,41 @@
+/* Common definitions for splice and vmsplice.
+   Copyright (C) 2026 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#ifndef SPLICE_H
+# define SPLICE_H 1
+
+# if HAVE_SPLICE
+
+/* Empirically determined pipe size for best throughput.
+   Needs to be <= /proc/sys/fs/pipe-max-size  */
+enum { SPLICE_PIPE_SIZE = 512 * 1024 };
+
+static inline idx_t
+increase_pipe_size (int fd)
+{
+  int pipe_cap = 0;
+#  if defined F_SETPIPE_SZ && defined F_GETPIPE_SZ
+  if ((pipe_cap = fcntl (fd, F_SETPIPE_SZ, SPLICE_PIPE_SIZE)) < 0)
+    pipe_cap = fcntl (fd, F_GETPIPE_SZ);
+#  endif
+  if (pipe_cap <= 0)
+    pipe_cap = 64 * 1024;
+  return pipe_cap;
+}
+
+# endif
+
+#endif
diff --git a/src/yes.c b/src/yes.c
index 1a1d74ce5..d111b125e 100644
--- a/src/yes.c
+++ b/src/yes.c
@@ -27,6 +27,7 @@
 #include "full-write.h"
 #include "isapipe.h"
 #include "long-options.h"
+#include "splice.h"
 #include "unistd--.h"
 
 /* The official name of this program (e.g., no 'g' prefix).  */
@@ -76,10 +77,6 @@ repeat_pattern (char *dest, char const *src, idx_t srcsize, idx_t bufsize)
 
 #if HAVE_SPLICE
 
-/* Empirically determined pipe size for best throughput.
-   Needs to be <= /proc/sys/fs/pipe-max-size  */
-enum { SPLICE_PIPE_SIZE = 512 * 1024 };
-
 /* Enlarge a pipe towards SPLICE_PIPE_SIZE and return the actual
    capacity as a quarter of the pipe size (the empirical sweet spot
    for vmsplice throughput), rounded down to a multiple of COPYSIZE.
@@ -88,15 +85,7 @@ enum { SPLICE_PIPE_SIZE = 512 * 1024 };
 static idx_t
 pipe_splice_size (int fd, idx_t copysize)
 {
-  int pipe_cap = 0;
-# if defined F_SETPIPE_SZ && defined F_GETPIPE_SZ
-  if ((pipe_cap = fcntl (fd, F_SETPIPE_SZ, SPLICE_PIPE_SIZE)) < 0)
-    pipe_cap = fcntl (fd, F_GETPIPE_SZ);
-# endif
-  if (pipe_cap <= 0)
-    pipe_cap = 64 * 1024;
-
-  size_t buf_cap = pipe_cap / 4;
+  size_t buf_cap = increase_pipe_size (fd) / 4;
   return buf_cap / copysize * copysize;
 }
 
diff --git a/tests/cat/splice.sh b/tests/cat/splice.sh
new file mode 100755
index 000000000..513a33181
--- /dev/null
+++ b/tests/cat/splice.sh
@@ -0,0 +1,66 @@
+#!/bin/sh
+# Test some cases where 'cat' uses the splice system call.
+
+# Copyright (C) 2026 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ cat
+getlimits_
+uses_strace_
+
+# Check the non pipe output case, since that is different with splice
+if timeout 10 true; then
+  timeout .1 cat /dev/zero >/dev/null
+  test $? = 124 || fail=1
+fi
+
+# Test that splice errors are diagnosed.
+# Odd numbers are for input, even for output
+if strace -o /dev/null -e inject=splice:error=EIO:when=3 true; then
+  for when in 3 4; do
+    test "$when" = 4 && efile='write error' || efile='/dev/zero'
+    printf 'cat: %s: %s\n' "$efile" "$EIO" > exp || framework_failure_
+    returns_ 1 timeout 10 strace -o /dev/null \
+      -e inject=splice:error=EIO:when=$when \
+      cat /dev/zero >/dev/null 2>err || fail=1
+    compare exp err || fail=1
+  done
+fi
+
+# Ensure we fall back to write() if there is an issue with (async) zero-copy
+zc_syscalls='io_uring_setup io_uring_enter io_uring_register memfd_create
+             sendfile splice tee vmsplice'
+syscalls=$(
+  for s in $zc_syscalls; do
+    strace -qe "$s" true >/dev/null 2>&1 && echo "$s"
+  done | paste -s -d,)
+
+no_zero_copy() {
+  strace -f -o /dev/null -e inject=${syscalls}:error=ENOSYS "$@"
+}
+if no_zero_copy true; then
+  test "$(no_zero_copy cat /dev/zero | head -c 2 | tr '\0' 'y')" = 'yy' \
+    || fail=1
+fi
+# Ensure we fall back to write() if there is an issue with pipe2()
+# For example if we don't have enough file descriptors available.
+no_pipe() { strace -f -o /dev/null -e inject=pipe,pipe2:error=EMFILE "$@"; }
+if no_pipe true; then
+  no_pipe timeout .1 cat /dev/zero >/dev/null
+  test $? = 124 || fail=1
+fi
+
+Exit $fail
diff --git a/tests/local.mk b/tests/local.mk
index 590978297..2e889e207 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -308,6 +308,7 @@ all_tests =					\
   tests/cat/cat-proc.sh				\
   tests/cat/cat-buf.sh				\
   tests/cat/cat-self.sh				\
+  tests/cat/splice.sh				\
   tests/misc/basename.pl			\
   tests/basenc/base64.pl			\
   tests/basenc/basenc.pl			\
-- 
2.53.0

Reply via email to