Pádraig Brady <[email protected]> writes: > Note we don't exit upon splice_cat failure, > which does lead to a change in behavior. > So one change to my previous patch is > I'd add an EXIT_FAILURE param to the error(..."standard error") call. > > That still doesn't help when it's ambiguous though. > This shows we fallback to the write() path for subsequent files: > > $ cat /dev/zero | (trap '' PIPE; src/cat - - -) | head -c1 >/dev/null cat: > splice error: Broken pipe > cat: write error: Broken pipe > > Not huge issues, but it would be nice to avoid. > Specifically if output is gone, we'd like to avoid subsequent files. > Consider how this now hangs for example: > > $ yes | (trap '' PIPE; src/cat - /dev/tty) | head -c1 >/dev/null > > A bit contrived I know.
Nice catches, thanks. > Note this is only in the case where we don't create an intermediate pipe. > When we do have the intermediate pipe we can distinguish read errors > from write errors, so perhaps we should always do that. > Actually testing that here, gives faster operation anyway > (I guess due to the sizing of the pipes): > > $ timeout 2 src/yes | pv -r | src/cat >/dev/null > Creating intermediate pipe > [36.9GiB/s] > > $ timeout 2 src/yes | pv -r | src/cat >/dev/null > [22.6GiB/s] Interesting. I wrongly assumed it would be faster without an intermediate pipe. I think I am for always creating an intermediate pipe. It also simplifies the code quite a bit. Most people will never use 'ulimit -n 4 cat' or hit system-wide file descriptor limits. If they do, I don't think read and write are slow enough for them to care (or even notice). Attached a v4 patch. Collin
>From 63a23e962d16241710b960a37a9947e6f6dbed4c Mon Sep 17 00:00:00 2001 Message-ID: <63a23e962d16241710b960a37a9947e6f6dbed4c.1775269558.git.collin.fu...@gmail.com> From: Collin Funk <[email protected]> Date: Sun, 29 Mar 2026 16:13:01 -0700 Subject: [PATCH v4] cat: use splice if operating on pipes or if copy_file_range fails On an AMD Ryzen 7 3700X system: $ timeout 10 taskset 1 ./src/cat-prev /dev/zero \ | taskset 2 pv -r > /dev/null [1.67GiB/s] $ timeout 10 taskset 1 ./src/cat /dev/zero \ | taskset 2 pv -r > /dev/null [9.03GiB/s] On a Power10 system: $ taskset 1 ./src/yes | timeout 10 taskset 2 ./src/cat-prev \ | taskset 3 pv -r > /dev/null [12.9GiB/s] $ taskset 1 ./src/yes | timeout 10 taskset 2 ./src/cat \ | taskset 3 pv -r > /dev/null [81.8GiB/s] * NEWS: Mention the improvement. * src/cat.c: Include isapipe.h, splice.h, and unistd--.h. (splice_cat): New function. (main): Use it. * src/local.mk (noinst_HEADERS): Add src/splice.h. * src/splice.h: New file, based on definitions from src/yes.c. * src/yes.c: Include splice.h. (pipe_splice_size): Use increase_pipe_size from src/splice.h. (SPLICE_PIPE_SIZE): Remove definition, moved to src/splice.h. * tests/cat/splice.sh: New file, based on some tests in tests/misc/yes.sh. * tests/local.mk (all_tests): Add the new test. --- NEWS | 4 ++ src/cat.c | 119 ++++++++++++++++++++++++++++++++++++++++++-- src/local.mk | 1 + src/splice.h | 41 +++++++++++++++ src/yes.c | 15 +----- tests/cat/splice.sh | 66 ++++++++++++++++++++++++ tests/local.mk | 1 + 7 files changed, 231 insertions(+), 16 deletions(-) create mode 100644 src/splice.h create mode 100755 tests/cat/splice.sh diff --git a/NEWS b/NEWS index 2fabd07b7..e46ab72c2 100644 --- a/NEWS +++ b/NEWS @@ -39,6 +39,10 @@ GNU coreutils NEWS -*- outline -*- ** Improvements + 'cat' now uses zero-copy I/O on Linux when the input or output are pipes, + to significantly increase throughput. + E.g., throughput improved 6x from 12.9GiB/s to 81.8GiB/s on a Power10 system. 
+ 'df --local' recognises more file system types as remote. Specifically: autofs, ncpfs, smb, smb2, gfs, gfs2, userlandfs. diff --git a/src/cat.c b/src/cat.c index f9c92005c..ed7e7ab3f 100644 --- a/src/cat.c +++ b/src/cat.c @@ -37,6 +37,9 @@ #include "ioblksize.h" #include "fadvise.h" #include "full-write.h" +#include "isapipe.h" +#include "splice.h" +#include "unistd--.h" #include "xbinary-io.h" /* The official name of this program (e.g., no 'g' prefix). */ @@ -545,6 +548,107 @@ copy_cat (void) } } +/* Copy data from input to output using splice if possible. + Return 1 if successful, 0 if ordinary read+write should be tried, + -1 if a serious problem has been diagnosed. */ + +static int +splice_cat (void) +{ + bool some_copied = false; + bool in_ok = true; + bool out_ok = true; + +#if HAVE_SPLICE + + static int stdout_is_pipe = -1; + static idx_t stdout_pipe_size = 0; + if (stdout_is_pipe == -1) + { + stdout_is_pipe = 0 < isapipe (STDOUT_FILENO); + if (stdout_is_pipe) + stdout_pipe_size = increase_pipe_size (STDOUT_FILENO); + } + + bool input_is_pipe = 0 < isapipe (input_desc); + + idx_t pipe_size = stdout_pipe_size; + if (input_is_pipe) + pipe_size = MAX (pipe_size, increase_pipe_size (input_desc)); + + int pipefd[2] = { -1, -1 }; + + /* Create an intermediate pipe. + Even if both input and output are pipes, + so that read and write errors can be distinguished. */ + if (pipe (pipefd) < 0) + return false; + pipe_size = MAX (pipe_size, increase_pipe_size (pipefd[1])); + + while (true) + { + ssize_t bytes_read = splice (input_desc, NULL, pipefd[1], NULL, + pipe_size, 0); + /* If we successfully splice'd input previously, assume that any + subsequent error is fatal. If not, then fall back to read + and write. */ + in_ok = 0 <= bytes_read || ! some_copied; + if (bytes_read <= 0) + goto done; + /* We need to drain the intermediate pipe to standard output. 
*/ + while (0 < bytes_read) + { + ssize_t bytes_written = splice (pipefd[0], NULL, STDOUT_FILENO, NULL, + pipe_size, 0); + /* If we successfully splice'd output, assume any subsequent + error is fatal. If not, than drain the intermediate pipe and + continue using read and write. */ + if (bytes_written < 0) + { + if (some_copied) + out_ok = false; + else + { + char buf[BUFSIZ]; + while (0 < bytes_read) + { + ssize_t count = MIN (bytes_read, sizeof buf); + ssize_t n_read = read (pipefd[0], buf, count); + /* Failure not associated with in or out. */ + in_ok = out_ok = 0 <= n_read; + if (n_read <= 0) + goto done; + if (full_write (STDOUT_FILENO, buf, n_read) != n_read) + write_error (); + bytes_read -= n_read; + } + } + } + if (bytes_written <= 0) + goto done; + some_copied = true; + bytes_read -= bytes_written; + } + } + + done: + if (! in_ok && ! out_ok) + error (0, errno, "%s", _("splice error")); + else if (! in_ok) + error (0, errno, "%s", quotef (infile)); + else if (! out_ok) + write_error (); + if (0 <= pipefd[0]) + { + int saved_errno = errno; + close (pipefd[0]); + close (pipefd[1]); + errno = saved_errno; + } +#endif + + return (in_ok && out_ok) ? 
some_copied : -1; +} int main (int argc, char **argv) @@ -760,9 +864,18 @@ main (int argc, char **argv) } else { - insize = MAX (insize, outsize); - inbuf = xalignalloc (page_size, insize); - ok &= simple_cat (inbuf, insize); + int splice_cat_status = splice_cat (); + if (splice_cat_status != 0) + { + inbuf = NULL; + ok &= 0 < splice_cat_status; + } + else + { + insize = MAX (insize, outsize); + inbuf = xalignalloc (page_size, insize); + ok &= simple_cat (inbuf, insize); + } } } else diff --git a/src/local.mk b/src/local.mk index bf88f7d0e..9d9c9814b 100644 --- a/src/local.mk +++ b/src/local.mk @@ -61,6 +61,7 @@ noinst_HEADERS = \ src/remove.h \ src/set-fields.h \ src/show-date.h \ + src/splice.h \ src/statx.h \ src/system.h \ src/temp-stream.h \ diff --git a/src/splice.h b/src/splice.h new file mode 100644 index 000000000..1fb55054d --- /dev/null +++ b/src/splice.h @@ -0,0 +1,41 @@ +/* Common definitions for splice and vmsplice. + Copyright (C) 2026 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +#ifndef SPLICE_H +# define SPLICE_H 1 + +# if HAVE_SPLICE + +/* Empirically determined pipe size for best throughput. 
+ Needs to be <= /proc/sys/fs/pipe-max-size */ +enum { SPLICE_PIPE_SIZE = 512 * 1024 }; + +static inline idx_t +increase_pipe_size (int fd) +{ + int pipe_cap = 0; +# if defined F_SETPIPE_SZ && defined F_GETPIPE_SZ + if ((pipe_cap = fcntl (fd, F_SETPIPE_SZ, SPLICE_PIPE_SIZE)) < 0) + pipe_cap = fcntl (fd, F_GETPIPE_SZ); +# endif + if (pipe_cap <= 0) + pipe_cap = 64 * 1024; + return pipe_cap; +} + +# endif + +#endif diff --git a/src/yes.c b/src/yes.c index 1a1d74ce5..d111b125e 100644 --- a/src/yes.c +++ b/src/yes.c @@ -27,6 +27,7 @@ #include "full-write.h" #include "isapipe.h" #include "long-options.h" +#include "splice.h" #include "unistd--.h" /* The official name of this program (e.g., no 'g' prefix). */ @@ -76,10 +77,6 @@ repeat_pattern (char *dest, char const *src, idx_t srcsize, idx_t bufsize) #if HAVE_SPLICE -/* Empirically determined pipe size for best throughput. - Needs to be <= /proc/sys/fs/pipe-max-size */ -enum { SPLICE_PIPE_SIZE = 512 * 1024 }; - /* Enlarge a pipe towards SPLICE_PIPE_SIZE and return the actual capacity as a quarter of the pipe size (the empirical sweet spot for vmsplice throughput), rounded down to a multiple of COPYSIZE. @@ -88,15 +85,7 @@ enum { SPLICE_PIPE_SIZE = 512 * 1024 }; static idx_t pipe_splice_size (int fd, idx_t copysize) { - int pipe_cap = 0; -# if defined F_SETPIPE_SZ && defined F_GETPIPE_SZ - if ((pipe_cap = fcntl (fd, F_SETPIPE_SZ, SPLICE_PIPE_SIZE)) < 0) - pipe_cap = fcntl (fd, F_GETPIPE_SZ); -# endif - if (pipe_cap <= 0) - pipe_cap = 64 * 1024; - - size_t buf_cap = pipe_cap / 4; + size_t buf_cap = increase_pipe_size (fd) / 4; return buf_cap / copysize * copysize; } diff --git a/tests/cat/splice.sh b/tests/cat/splice.sh new file mode 100755 index 000000000..513a33181 --- /dev/null +++ b/tests/cat/splice.sh @@ -0,0 +1,66 @@ +#!/bin/sh +# Test some cases where 'cat' uses the splice system call. + +# Copyright (C) 2026 Free Software Foundation, Inc. 
+ +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ cat +getlimits_ +uses_strace_ + +# Check the non pipe output case, since that is different with splice +if timeout 10 true; then + timeout .1 cat /dev/zero >/dev/null + test $? = 124 || fail=1 +fi + +# Test that splice errors are diagnosed. +# Odd numbers are for input, even for output +if strace -o /dev/null -e inject=splice:error=EIO:when=3 true; then + for when in 3 4; do + test "$when" = 4 && efile='write error' || efile='/dev/zero' + printf 'cat: %s: %s\n' "$efile" "$EIO" > exp || framework_failure_ + returns_ 1 timeout 10 strace -o /dev/null \ + -e inject=splice:error=EIO:when=$when \ + cat /dev/zero >/dev/null 2>err || fail=1 + compare exp err || fail=1 + done +fi + +# Ensure we fallback to write() if there is an issue with (async) zero-copy +zc_syscalls='io_uring_setup io_uring_enter io_uring_register memfd_create + sendfile splice tee vmsplice' +syscalls=$( + for s in $zc_syscalls; do + strace -qe "$s" true >/dev/null 2>&1 && echo "$s" + done | paste -s -d,) + +no_zero_copy() { + strace -f -o /dev/null -e inject=${syscalls}:error=ENOSYS "$@" +} +if no_zero_copy true; then + test "$(no_zero_copy cat /dev/zero | head -c 2 | tr '\0' 'y')" = 'yy' \ + || fail=1 +fi +# Ensure we fallback to write() if there is an issue with pipe2() +# For example if we don't have 
enough file descriptors available. +no_pipe() { strace -f -o /dev/null -e inject=pipe,pipe2:error=EMFILE "$@"; } +if no_pipe true; then + no_pipe timeout .1 cat /dev/zero >/dev/null + test $? = 124 || fail=1 +fi + +Exit $fail diff --git a/tests/local.mk b/tests/local.mk index 590978297..2e889e207 100644 --- a/tests/local.mk +++ b/tests/local.mk @@ -308,6 +308,7 @@ all_tests = \ tests/cat/cat-proc.sh \ tests/cat/cat-buf.sh \ tests/cat/cat-self.sh \ + tests/cat/splice.sh \ tests/misc/basename.pl \ tests/basenc/base64.pl \ tests/basenc/basenc.pl \ -- 2.53.0
