On 19/11/2022 08:14, Korn Andras wrote:
Hi,

on zfs, newfstatat() can return an st_blksize that is approximately equal to 
the file size in bytes (if the file fit into a single zfs record).

The block size for filesystems can also be quite large (currently, up to 16M).

The code at https://github.com/coreutils/coreutils/blob/master/src/copy.c#L1343 will try 
to compute the least common multiple of the input and output block sizes, then allocate a 
buffer of that size using mmap(). With such unusual block sizes, the least common 
multiple can be very large, causing the mmap() to return ENOMEM and cp to abort with 
"memory exhausted".

Example:

openat(AT_FDCWD, "tmp", O_RDONLY|O_PATH|O_DIRECTORY) = 3</t/tmp>
newfstatat(AT_FDCWD, "usr/src/zfs-2.1.6/configure", {st_dev=makedev(0, 0x30), 
st_ino=29267, st_mode=S_IFREG|0755, st_nlink=1, st_uid=0, st_gid=0, st_blksize=2647552, 
st_blocks=5177, st_size=2647046, st_atime=1668786119 /* 
2022-11-18T16:41:59.760703544+0100 */, st_atime_nsec=760703544, st_mtime=1667705386 /* 
2022-11-06T04:29:46+0100 */, st_mtime_nsec=0, st_ctime=1668785062 /* 
2022-11-18T16:24:22.866737598+0100 */, st_ctime_nsec=866737598}, AT_SYMLINK_NOFOLLOW) = 0
newfstatat(3</t/tmp>, "configure", {st_dev=makedev(0, 0x32), st_ino=3838, 
st_mode=S_IFREG|0700, st_nlink=1, st_uid=0, st_gid=0, st_blksize=4194304, st_blocks=1, 
st_size=0, st_atime=1668788205 /* 2022-11-18T17:16:45.416328293+0100 */, 
st_atime_nsec=416328293, st_mtime=1668788255 /* 2022-11-18T17:17:35.809758585+0100 */, 
st_mtime_nsec=809758585, st_ctime=1668788255 /* 2022-11-18T17:17:35.809758585+0100 */, 
st_ctime_nsec=809758585}, 0) = 0
openat(AT_FDCWD, "usr/src/zfs-2.1.6/configure", O_RDONLY|O_NOFOLLOW) = 
4</t/usr/src/zfs-2.1.6/configure>
newfstatat(4</t/usr/src/zfs-2.1.6/configure>, "", {st_dev=makedev(0, 0x30), 
st_ino=29267, st_mode=S_IFREG|0755, st_nlink=1, st_uid=0, st_gid=0, st_blksize=2647552, 
st_blocks=5177, st_size=2647046, st_atime=1668786119 /* 2022-11-18T16:41:59.760703544+0100 */, 
st_atime_nsec=760703544, st_mtime=1667705386 /* 2022-11-06T04:29:46+0100 */, st_mtime_nsec=0, 
st_ctime=1668785062 /* 2022-11-18T16:24:22.866737598+0100 */, st_ctime_nsec=866737598}, 
AT_EMPTY_PATH) = 0
openat(3</t/tmp>, "configure", O_WRONLY|O_TRUNC) = 5</t/tmp/configure>
ioctl(5</t/tmp/configure>, BTRFS_IOC_CLONE or FICLONE, 4) = -1 EXDEV (Invalid 
cross-device link)
newfstatat(5</t/tmp/configure>, "", {st_dev=makedev(0, 0x32), st_ino=3838, 
st_mode=S_IFREG|0700, st_nlink=1, st_uid=0, st_gid=0, st_blksize=4194304, st_blocks=1, 
st_size=0, st_atime=1668788205 /* 2022-11-18T17:16:45.416328293+0100 */, 
st_atime_nsec=416328293, st_mtime=1668788266 /* 2022-11-18T17:17:46.326056957+0100 */, 
st_mtime_nsec=326056957, st_ctime=1668788266 /* 2022-11-18T17:17:46.326056957+0100 */, 
st_ctime_nsec=326056957}, AT_EMPTY_PATH) = 0
fadvise64(4</t/usr/src/zfs-2.1.6/configure>, 0, 0, POSIX_FADV_SEQUENTIAL) = 0
copy_file_range(4</t/usr/src/zfs-2.1.6/configure>, NULL, 5</t/tmp/configure>, 
NULL, 9223372035781033984, 0) = -1 EXDEV (Invalid cross-device link)
mmap(NULL, 21688754176, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) 
= -1 ENOMEM (Cannot allocate memory)
brk(0x55a7fa4b0000)                     = 0x55a2ed8ab000
mmap(NULL, 21689794560, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) 
= -1 ENOMEM (Cannot allocate memory)
openat(AT_FDCWD, "/usr/share/locale/en_US.UTF-8/LC_MESSAGES/coreutils.mo", 
O_RDONLY) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/share/locale/en_US.utf8/LC_MESSAGES/coreutils.mo", 
O_RDONLY) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/share/locale/en_US/LC_MESSAGES/coreutils.mo", O_RDONLY) 
= -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/share/locale/en.UTF-8/LC_MESSAGES/coreutils.mo", 
O_RDONLY) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/share/locale/en.utf8/LC_MESSAGES/coreutils.mo", 
O_RDONLY) = -1 ENOENT (No such file or directory)
openat(AT_FDCWD, "/usr/share/locale/en/LC_MESSAGES/coreutils.mo", O_RDONLY) = 
-1 ENOENT (No such file or directory)
write(2</dev/pts/3<char 136:3>>, "cp: ", 4) = 4
write(2</dev/pts/3<char 136:3>>, "memory exhausted", 16) = 16
write(2</dev/pts/3<char 136:3>>, "\n", 1) = 1
lseek(0</dev/pts/3<char 136:3>>, 0, SEEK_CUR) = -1 ESPIPE (Illegal seek)
close(0</dev/pts/3<char 136:3>>)        = 0
close(1</dev/pts/3<char 136:3>>)        = 0
close(2</dev/pts/3<char 136:3>>)        = 0
exit_group(1)                           = ?

I think cp(1) shouldn't insist on using the lcm if it's very large; 
additionally, it should never try to allocate a buffer that is (much) larger 
than the source file.

Perhaps you could also try to use successively smaller buffers if allocating a 
large one fails?

Thank you for the detailed report.
This buffer_lcm code has been present for a long time,
but I agree in the presence of disparate st_blksize values
it can result in inappropriate buffer sizes.

In your case:
  src st_blksize = 2647552 (0x286600) (st_size = 2647046)
  dst st_blksize = 4194304 (0x400000) (st_size = 0)
Which implies a lowest common multiple of 21688745984

Once the st_blksize is based on st_size we can get very large values like this.
In the zfs case it seems to set st_blksize to st_size rounded to next multiple 
of 512.

The proposed patch attached removes the use of buffer_lcm()
and just picks the largest st_blksize, which would be 4MiB in your case.
It also limits the max buffer size to 32MiB in the edge case
where st_blksize returns a larger value that this.

cheers,
Pádraig
From 48b03eb888a1178f8b31b7ab17351ccaee2d533c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <p...@draigbrady.com>
Date: Sat, 19 Nov 2022 17:09:17 +0000
Subject: [PATCH] copy: constrain buffer size to a reasonable value

Remove use of buffer_lcm() to determine a buffer_size to use,
and instead just pick the largest io_blksize()
for the src and dst file, which in turn is constrained
to a reasonable value of 32MiB currently.

* bootstrap.conf: Remove buffer-lcm module.
* gl/lib/buffer-lcm.c: Likewise.
* gl/lib/buffer-lcm.h: Likewise.
* gl/modules/buffer-lcm: Likewise.
* src/copy.c: Pick a buffer value as per the above description.
* src/ioblksize.h: Constrain io_blksize() to 32MiB,
* NEWS: Mention the fix.
to avoid any subsequent memory or type overflow issues.
Fixes https://bugs.gnu.org/59382
---
 bootstrap.conf        |  1 -
 gl/lib/buffer-lcm.c   | 59 -------------------------------------------
 gl/lib/buffer-lcm.h   |  2 --
 gl/modules/buffer-lcm | 23 -----------------
 src/copy.c            | 18 +++++--------
 src/ioblksize.h       | 10 +++++---
 6 files changed, 14 insertions(+), 99 deletions(-)
 delete mode 100644 gl/lib/buffer-lcm.c
 delete mode 100644 gl/lib/buffer-lcm.h
 delete mode 100644 gl/modules/buffer-lcm

diff --git a/bootstrap.conf b/bootstrap.conf
index 8e257a254..f37ac4e47 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -42,7 +42,6 @@ gnulib_modules="
   base32
   base64
   btowc
-  buffer-lcm
   byteswap
   c-strcase
   cl-strtod
diff --git a/gl/lib/buffer-lcm.c b/gl/lib/buffer-lcm.c
deleted file mode 100644
index 392dcb753..000000000
--- a/gl/lib/buffer-lcm.c
+++ /dev/null
@@ -1,59 +0,0 @@
-/* buffer-lcm.c - compute a good buffer size for dealing with two files
-
-   Copyright (C) 2002-2022 Free Software Foundation, Inc.
-
-   This program is free software: you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation, either version 3 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
-
-/* Written by Paul Eggert.  */
-
-#include <config.h>
-#include "buffer-lcm.h"
-
-/* Return a buffer size suitable for doing I/O with files whose block
-   sizes are A and B.  However, never return a value greater than
-   LCM_MAX.  */
-
-size_t
-buffer_lcm (size_t a, size_t b, size_t lcm_max)
-{
-  size_t size;
-
-  /* Use reasonable values if buffer sizes are zero.  */
-  if (!a)
-    size = b ? b : 8 * 1024;
-  else
-    {
-      if (b)
-        {
-          /* Return lcm (A, B) if it is in range; otherwise, fall back
-             on A.  */
-
-          size_t lcm, m, n, q, r;
-
-          /* N = gcd (A, B).  */
-          for (m = a, n = b;  (r = m % n) != 0;  m = n, n = r)
-            continue;
-
-          /* LCM = lcm (A, B), if in range.  */
-          q = a / n;
-          lcm = q * b;
-          if (lcm <= lcm_max && lcm / b == q)
-            return lcm;
-        }
-
-      size = a;
-    }
-
-  return size <= lcm_max ? size : lcm_max;
-}
diff --git a/gl/lib/buffer-lcm.h b/gl/lib/buffer-lcm.h
deleted file mode 100644
index 454c65be8..000000000
--- a/gl/lib/buffer-lcm.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#include <stddef.h>
-size_t buffer_lcm (size_t, size_t, size_t) _GL_ATTRIBUTE_CONST;
diff --git a/gl/modules/buffer-lcm b/gl/modules/buffer-lcm
deleted file mode 100644
index 1d86f715e..000000000
--- a/gl/modules/buffer-lcm
+++ /dev/null
@@ -1,23 +0,0 @@
-Description:
-Compute a good buffer size for dealing with two files.
-
-Files:
-lib/buffer-lcm.c
-lib/buffer-lcm.h
-
-Depends-on:
-stddef
-
-configure.ac:
-
-Makefile.am:
-lib_SOURCES += buffer-lcm.c buffer-lcm.h
-
-Include:
-"buffer-lcm.h"
-
-License:
-GPL
-
-Maintainer:
-Paul Eggert
diff --git a/src/copy.c b/src/copy.c
index e465271ef..5ccac99d5 100644
--- a/src/copy.c
+++ b/src/copy.c
@@ -34,7 +34,6 @@
 #include "acl.h"
 #include "alignalloc.h"
 #include "backupfile.h"
-#include "buffer-lcm.h"
 #include "canonicalize.h"
 #include "copy.h"
 #include "cp-hash.h"
@@ -1342,22 +1341,19 @@ copy_reg (char const *src_name, char const *dst_name,
         {
           /* Compute the least common multiple of the input and output
              buffer sizes, adjusting for outlandish values.  */
-          size_t blcm_max = MIN (SIZE_MAX, SSIZE_MAX);
-          size_t blcm = buffer_lcm (io_blksize (src_open_sb), buf_size,
-                                    blcm_max);
+          size_t bmax = MAX (io_blksize (src_open_sb), buf_size);
 
           /* Do not bother with a buffer larger than the input file, plus one
              byte to make sure the file has not grown while reading it.  */
           if (S_ISREG (src_open_sb.st_mode) && src_open_sb.st_size < buf_size)
             buf_size = src_open_sb.st_size + 1;
 
-          /* However, stick with a block size that is a positive multiple of
-             blcm, overriding the above adjustments.  Watch out for
-             overflow.  */
-          buf_size += blcm - 1;
-          buf_size -= buf_size % blcm;
-          if (buf_size == 0 || blcm_max < buf_size)
-            buf_size = blcm;
+          /* However, stick with a buffer size that is a positive multiple of
+             bmax, overriding the above adjustments.  Watch out for overflow. */
+          buf_size += bmax - 1;
+          buf_size -= buf_size % bmax;
+          if (buf_size == 0 || IO_BUFSIZE_MAX < buf_size)
+            buf_size = bmax;
         }
 
       off_t n_read;
diff --git a/src/ioblksize.h b/src/ioblksize.h
index 8bd18ba05..b8e9ebf32 100644
--- a/src/ioblksize.h
+++ b/src/ioblksize.h
@@ -17,6 +17,7 @@
 /* Include this file _after_ system headers if possible.  */
 
 /* sys/stat.h and minmax.h will already have been included by system.h. */
+#include <assert.h>
 #include "idx.h"
 #include "stat-size.h"
 
@@ -72,11 +73,14 @@
    and default to io_blksize() if not.
  */
 enum { IO_BUFSIZE = 128 * 1024 };
+
+/* Set a max constraint to avoid excessive mem usage or type overflow.  */
+enum { IO_BUFSIZE_MAX = 128 * IO_BUFSIZE };
+static_assert (IO_BUFSIZE_MAX <= MIN (IDX_MAX, SIZE_MAX) / 2 + 1);
+
 static inline idx_t
 io_blksize (struct stat sb)
 {
-  /* Don’t go above the largest power of two that fits in idx_t and size_t,
-     as that is asking for trouble.  */
-  return MIN (MIN (IDX_MAX, SIZE_MAX) / 2 + 1,
+  return MIN (IO_BUFSIZE_MAX,
               MAX (IO_BUFSIZE, ST_BLKSIZE (sb)));
 }
-- 
2.26.2

Reply via email to