This implements these interconnected options: --allocated --destination-is-zero (alias: --target-is-zero) --no-extents --- TODO | 6 +- copy/Makefile.am | 7 + copy/copy-sparse-allocated.sh | 92 ++++++++++++++ copy/copy-sparse-no-extents.sh | 92 ++++++++++++++ copy/copy-sparse.sh | 97 ++++++++++++++ copy/file-ops.c | 181 ++++++++++++++++++++++++++ copy/main.c | 107 ++++++++++++++-- copy/multi-thread-copying.c | 226 ++++++++++++++++++++++++--------- copy/nbd-ops.c | 176 +++++++++++++++++++++++++ copy/nbdcopy.h | 50 ++++++++ copy/nbdcopy.pod | 32 ++++- copy/pipe-ops.c | 30 ++++- 12 files changed, 1015 insertions(+), 81 deletions(-)
diff --git a/TODO b/TODO index 9e5c821..8c0402e 100644 --- a/TODO +++ b/TODO @@ -30,9 +30,9 @@ Performance: Chart it over various buffer sizes and threads, as that Examine other fuzzers: https://gitlab.com/akihe/radamsa nbdcopy: - - Properly handle extents/sparseness in input and output. - - Write zeroes efficiently. - - Detect zeroes (optionally) and turn into sparseness. + - --synchronous mode does not yet support extents. + - Detect zeroes (optionally) and turn into sparseness + (like qemu-img convert -S). - Progress bar: allow it to be written to a file descriptor and/or written in a machine-consumable format. - Minimum/preferred/maximum block size. diff --git a/copy/Makefile.am b/copy/Makefile.am index 8f2d168..cfbc386 100644 --- a/copy/Makefile.am +++ b/copy/Makefile.am @@ -26,6 +26,9 @@ EXTRA_DIST = \ copy-nbd-to-small-block-error.sh \ copy-nbd-to-small-nbd-error.sh \ copy-nbd-to-stdout.sh \ + copy-sparse.sh \ + copy-sparse-allocated.sh \ + copy-sparse-no-extents.sh \ copy-stdin-to-nbd.sh \ nbdcopy.pod \ $(NULL) @@ -46,6 +49,7 @@ nbdcopy_SOURCES = \ $(NULL) nbdcopy_CPPFLAGS = \ -I$(top_srcdir)/include \ + -I$(top_srcdir)/common/include \ -I$(top_srcdir)/common/utils \ $(NULL) nbdcopy_CFLAGS = \ @@ -88,6 +92,9 @@ TESTS += \ copy-nbd-to-small-nbd-error.sh \ copy-stdin-to-nbd.sh \ copy-nbd-to-stdout.sh \ + copy-sparse.sh \ + copy-sparse-allocated.sh \ + copy-sparse-no-extents.sh \ $(ROOT_TESTS) \ $(NULL) diff --git a/copy/copy-sparse-allocated.sh b/copy/copy-sparse-allocated.sh new file mode 100755 index 0000000..203c3b9 --- /dev/null +++ b/copy/copy-sparse-allocated.sh @@ -0,0 +1,92 @@ +#!/usr/bin/env bash +# nbd client library in userspace +# Copyright (C) 2020 Red Hat Inc. +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2 of the License, or (at your option) any later version. 
+# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +# Adapted from copy-sparse.sh. +# +# This test depends on the nbdkit default sparse block size (32K). + +. ../tests/functions.sh + +set -e +set -x + +requires nbdkit --version +requires nbdkit --exit-with-parent --version +requires nbdkit data --version +requires nbdkit eval --version + +out=copy-sparse-allocated.out +cleanup_fn rm -f $out + +$VG nbdcopy --allocated -- \ + [ nbdkit --exit-with-parent data data=' + 1 + @1073741823 1 + @4294967295 1 + @4294967296 1 + ' ] \ + [ nbdkit --exit-with-parent eval \ + get_size=' echo 7E ' \ + pwrite=" echo \$@ >> $out " \ + trim=" echo \$@ >> $out " \ + zero=" echo \$@ >> $out " ] + +sort -o $out $out + +echo Output: +cat $out + +if [ "$(cat $out)" != "pwrite 1 4294967296 +pwrite 32768 0 +pwrite 32768 1073709056 +pwrite 32768 4294934528 +zero 134184960 32768 +zero 134184960 4160749568 +zero 134184960 939524096 +zero 134217728 1073741824 +zero 134217728 1207959552 +zero 134217728 134217728 +zero 134217728 1342177280 +zero 134217728 1476395008 +zero 134217728 1610612736 +zero 134217728 1744830464 +zero 134217728 1879048192 +zero 134217728 2013265920 +zero 134217728 2147483648 +zero 134217728 2281701376 +zero 134217728 2415919104 +zero 134217728 2550136832 +zero 134217728 268435456 +zero 134217728 2684354560 +zero 134217728 2818572288 +zero 134217728 2952790016 +zero 134217728 3087007744 +zero 134217728 3221225472 +zero 134217728 3355443200 +zero 134217728 3489660928 +zero 134217728 3623878656 +zero 134217728 3758096384 +zero 134217728 3892314112 +zero 
134217728 402653184 +zero 134217728 4026531840 +zero 134217728 536870912 +zero 134217728 671088640 +zero 134217728 805306368" ]; then + echo "$0: output does not match expected" + exit 1 +fi diff --git a/copy/copy-sparse-no-extents.sh b/copy/copy-sparse-no-extents.sh new file mode 100755 index 0000000..e976d55 --- /dev/null +++ b/copy/copy-sparse-no-extents.sh @@ -0,0 +1,92 @@ +#!/usr/bin/env bash +# nbd client library in userspace +# Copyright (C) 2020 Red Hat Inc. +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +# Adapted from copy-sparse.sh +# +# This test depends on the nbdkit default sparse block size (32K). + +. ../tests/functions.sh + +set -e +set -x + +# Skip this test under valgrind, it takes too long. 
+if [ "x$LIBNBD_VALGRIND" = "x1" ]; then + echo "$0: test skipped under valgrind" + exit 77 +fi + +requires nbdkit --version +requires nbdkit --exit-with-parent --version +requires nbdkit data --version +requires nbdkit eval --version + +out=copy-sparse-no-extents.out +cleanup_fn rm -f $out + +$VG nbdcopy --no-extents -- \ + [ nbdkit --exit-with-parent data data=' + 1 + @1073741823 1 + ' ] \ + [ nbdkit --exit-with-parent eval \ + get_size=' echo 7E ' \ + pwrite=" echo \$@ >> $out " \ + trim=" echo \$@ >> $out " \ + zero=" echo \$@ >> $out " ] + +sort -n -o $out $out + +echo Output: +cat $out + +if [ "$(cat $out)" != "pwrite 33554432 0 +pwrite 33554432 100663296 +pwrite 33554432 1006632960 +pwrite 33554432 1040187392 +pwrite 33554432 134217728 +pwrite 33554432 167772160 +pwrite 33554432 201326592 +pwrite 33554432 234881024 +pwrite 33554432 268435456 +pwrite 33554432 301989888 +pwrite 33554432 33554432 +pwrite 33554432 335544320 +pwrite 33554432 369098752 +pwrite 33554432 402653184 +pwrite 33554432 436207616 +pwrite 33554432 469762048 +pwrite 33554432 503316480 +pwrite 33554432 536870912 +pwrite 33554432 570425344 +pwrite 33554432 603979776 +pwrite 33554432 637534208 +pwrite 33554432 67108864 +pwrite 33554432 671088640 +pwrite 33554432 704643072 +pwrite 33554432 738197504 +pwrite 33554432 771751936 +pwrite 33554432 805306368 +pwrite 33554432 838860800 +pwrite 33554432 872415232 +pwrite 33554432 905969664 +pwrite 33554432 939524096 +pwrite 33554432 973078528" ]; then + echo "$0: output does not match expected" + exit 1 +fi diff --git a/copy/copy-sparse.sh b/copy/copy-sparse.sh new file mode 100755 index 0000000..2fc4d9a --- /dev/null +++ b/copy/copy-sparse.sh @@ -0,0 +1,97 @@ +#!/usr/bin/env bash +# nbd client library in userspace +# Copyright (C) 2020 Red Hat Inc. 
+# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +# This test depends on the nbdkit default sparse block size (32K). + +. ../tests/functions.sh + +set -e +set -x + +requires nbdkit --version +requires nbdkit --exit-with-parent --version +requires nbdkit data --version +requires nbdkit eval --version + +out=copy-sparse.out +cleanup_fn rm -f $out + +# Copy from a sparse data disk to an nbdkit-eval-plugin instance which +# is logging everything. This allows us to see exactly what nbdcopy +# is writing, to ensure it is writing and trimming the target as +# expected. +$VG nbdcopy -- \ + [ nbdkit --exit-with-parent data data=' + 1 + @1073741823 1 + @4294967295 1 + @4294967296 1 + ' ] \ + [ nbdkit --exit-with-parent eval \ + get_size=' echo 7E ' \ + pwrite=" echo \$@ >> $out " \ + trim=" echo \$@ >> $out " \ + zero=" echo \$@ >> $out " ] + +# Order of the output could vary because requests are sent in +# parallel. +sort -n -o $out $out + +echo Output: +cat $out + +# Check the output matches expected. 
+if [ "$(cat $out)" != "pwrite 1 4294967296 +pwrite 32768 0 +pwrite 32768 1073709056 +pwrite 32768 4294934528 +trim 134184960 32768 +trim 134184960 4160749568 +trim 134184960 939524096 +trim 134217728 1073741824 +trim 134217728 1207959552 +trim 134217728 134217728 +trim 134217728 1342177280 +trim 134217728 1476395008 +trim 134217728 1610612736 +trim 134217728 1744830464 +trim 134217728 1879048192 +trim 134217728 2013265920 +trim 134217728 2147483648 +trim 134217728 2281701376 +trim 134217728 2415919104 +trim 134217728 2550136832 +trim 134217728 268435456 +trim 134217728 2684354560 +trim 134217728 2818572288 +trim 134217728 2952790016 +trim 134217728 3087007744 +trim 134217728 3221225472 +trim 134217728 3355443200 +trim 134217728 3489660928 +trim 134217728 3623878656 +trim 134217728 3758096384 +trim 134217728 3892314112 +trim 134217728 402653184 +trim 134217728 4026531840 +trim 134217728 536870912 +trim 134217728 671088640 +trim 134217728 805306368" ]; then + echo "$0: output does not match expected" + exit 1 +fi diff --git a/copy/file-ops.c b/copy/file-ops.c index 9e94b30..cd19e81 100644 --- a/copy/file-ops.c +++ b/copy/file-ops.c @@ -24,7 +24,16 @@ #include <fcntl.h> #include <unistd.h> #include <errno.h> +#include <sys/ioctl.h> +#include <sys/types.h> +#include <pthread.h> + +#if defined (__linux__) +#include <linux/fs.h> /* For BLKZEROOUT */ +#endif + +#include "isaligned.h" #include "nbdcopy.h" static size_t @@ -74,6 +83,64 @@ file_synch_write (struct rw *rw, } } +static bool +file_synch_trim (struct rw *rw, uint64_t offset, uint64_t count) +{ + assert (rw->t == LOCAL); + +#ifdef FALLOC_FL_PUNCH_HOLE + int fd = rw->u.local.fd; + int r; + + r = fallocate (fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + offset, count); + if (r == -1) { + perror ("fallocate: FALLOC_FL_PUNCH_HOLE"); + exit (EXIT_FAILURE); + } + return true; +#else /* !FALLOC_FL_PUNCH_HOLE */ + return false; +#endif +} + +static bool +file_synch_zero (struct rw *rw, uint64_t offset, uint64_t 
count)
+{
+  assert (rw->t == LOCAL);
+
+  if (S_ISREG (rw->u.local.stat.st_mode)) {
+#ifdef FALLOC_FL_ZERO_RANGE
+    int fd = rw->u.local.fd;
+    int r;
+
+    r = fallocate (fd, FALLOC_FL_ZERO_RANGE, offset, count);
+    if (r == -1) {
+      perror ("fallocate: FALLOC_FL_ZERO_RANGE");
+      exit (EXIT_FAILURE);
+    }
+    return true;
+#endif
+  }
+  else if (S_ISBLK (rw->u.local.stat.st_mode) &&
+           IS_ALIGNED (offset | count, rw->u.local.sector_size)) {
+#ifdef BLKZEROOUT
+    int fd = rw->u.local.fd;
+    int r;
+    uint64_t range[2] = {offset, count};
+
+    r = ioctl (fd, BLKZEROOUT, &range);
+    if (r == -1) {
+      perror ("ioctl: BLKZEROOUT");
+      exit (EXIT_FAILURE);
+    }
+    return true;
+#endif
+  }
+
+  return false;
+}
+
 static void
 file_asynch_read (struct rw *rw,
                   struct buffer *buffer,
@@ -104,9 +171,123 @@ file_asynch_write (struct rw *rw,
   }
 }
 
+static bool
+file_asynch_trim (struct rw *rw, struct buffer *buffer,
+                  nbd_completion_callback cb)
+{
+  assert (rw->t == LOCAL);
+
+  if (!file_synch_trim (rw, buffer->offset, buffer->len))
+    return false;
+  errno = 0;
+  if (cb.callback (cb.user_data, &errno) == -1) {
+    perror (rw->name);
+    exit (EXIT_FAILURE);
+  }
+  return true;
+}
+
+static bool
+file_asynch_zero (struct rw *rw, struct buffer *buffer,
+                  nbd_completion_callback cb)
+{
+  assert (rw->t == LOCAL);
+
+  if (!file_synch_zero (rw, buffer->offset, buffer->len))
+    return false;
+  errno = 0;
+  if (cb.callback (cb.user_data, &errno) == -1) {
+    perror (rw->name);
+    exit (EXIT_FAILURE);
+  }
+  return true;
+}
+
+static void
+file_get_extents (struct rw *rw, uintptr_t index,
+                  uint64_t offset, uint64_t count,
+                  extent_list *ret)
+{
+  assert (rw->t == LOCAL);
+
+  ret->size = 0;
+
+#ifdef SEEK_HOLE
+  static pthread_mutex_t lseek_lock = PTHREAD_MUTEX_INITIALIZER;
+
+  if (rw->u.local.seek_hole_supported) {
+    uint64_t end = offset + count;
+    int fd = rw->u.local.fd;
+    off_t pos;
+    struct extent e;
+
+    pthread_mutex_lock (&lseek_lock);
+
+    /* This loop is taken pretty much verbatim from nbdkit-file-plugin. */
+    do {
+      pos = lseek (fd, offset, SEEK_DATA);
+      if (pos == -1) {
+        if (errno == ENXIO)
+          pos = end;
+        else {
+          perror ("lseek: SEEK_DATA");
+          exit (EXIT_FAILURE);
+        }
+      }
+
+      /* Hole from offset to pos-1, clamped to the requested range. */
+      if (pos > offset) {
+        e.offset = offset;
+        e.length = (pos < end ? pos : end) - offset;
+        e.hole = true;
+        if (extent_list_append (ret, e) == -1) {
+          perror ("realloc");
+          exit (EXIT_FAILURE);
+        }
+      }
+
+      offset = pos;
+      if (offset >= end)
+        break;
+
+      pos = lseek (fd, offset, SEEK_HOLE);
+      if (pos == -1) {
+        perror ("lseek: SEEK_HOLE");
+        exit (EXIT_FAILURE);
+      }
+
+      /* Data from offset to pos-1, clamped to the requested range. */
+      if (pos > offset) {
+        e.offset = offset;
+        e.length = (pos < end ? pos : end) - offset;
+        e.hole = false;
+        if (extent_list_append (ret, e) == -1) {
+          perror ("realloc");
+          exit (EXIT_FAILURE);
+        }
+      }
+
+      offset = pos;
+    } while (offset < end);
+
+    pthread_mutex_unlock (&lseek_lock);
+    return;
+  }
+#endif
+
+  /* Otherwise return the default extent covering the whole range. */
+  default_get_extents (rw, index, offset, count, ret);
+}
+
+
 struct rw_ops file_ops = {
   .synch_read = file_synch_read,
   .synch_write = file_synch_write,
+  .synch_trim = file_synch_trim,
+  .synch_zero = file_synch_zero,
   .asynch_read = file_asynch_read,
   .asynch_write = file_asynch_write,
+  .asynch_trim = file_asynch_trim,
+  .asynch_zero = file_asynch_zero,
+  .get_extents = file_get_extents,
 };
diff --git a/copy/main.c b/copy/main.c
index 0b0589e..8187944 100644
--- a/copy/main.c
+++ b/copy/main.c
@@ -27,9 +27,11 @@
 #include <limits.h>
 #include <fcntl.h>
 #include <unistd.h>
+#include <errno.h>
+#include <assert.h>
 #include <sys/types.h>
 #include <sys/stat.h>
-#include <assert.h>
+#include <sys/ioctl.h>
 
 #include <pthread.h>
 
@@ -37,7 +39,10 @@
 
 #include "nbdcopy.h"
 
+bool allocated;                 /* --allocated flag */
 unsigned connections = 4;       /* --connections */
+bool destination_is_zero;       /* --destination-is-zero flag */
+bool extents = true;            /* !
--no-extents flag */ bool flush; /* --flush flag */ unsigned max_requests = 64; /* --requests */ bool progress; /* -p flag */ @@ -46,13 +51,14 @@ unsigned threads; /* --threads */ struct rw src, dst; /* The source and destination. */ static bool is_nbd_uri (const char *s); +static bool seek_hole_supported (int fd); static int open_local (const char *prog, const char *filename, bool writing, struct rw *rw); static void open_nbd_uri (const char *prog, - const char *uri, struct rw *rw); + const char *uri, bool writing, struct rw *rw); static void open_nbd_subprocess (const char *prog, const char **argv, size_t argc, - struct rw *rw); + bool writing, struct rw *rw); static void __attribute__((noreturn)) usage (FILE *fp, int exitcode) @@ -85,19 +91,26 @@ main (int argc, char *argv[]) HELP_OPTION = CHAR_MAX + 1, LONG_OPTIONS, SHORT_OPTIONS, + ALLOCATED_OPTION, + DESTINATION_IS_ZERO_OPTION, FLUSH_OPTION, + NO_EXTENTS_OPTION, SYNCHRONOUS_OPTION, }; const char *short_options = "C:pR:T:V"; const struct option long_options[] = { { "help", no_argument, NULL, HELP_OPTION }, { "long-options", no_argument, NULL, LONG_OPTIONS }, + { "allocated", no_argument, NULL, ALLOCATED_OPTION }, { "connections", required_argument, NULL, 'C' }, + { "destination-is-zero",no_argument, NULL, DESTINATION_IS_ZERO_OPTION }, { "flush", no_argument, NULL, FLUSH_OPTION }, + { "no-extents", no_argument, NULL, NO_EXTENTS_OPTION }, { "progress", no_argument, NULL, 'p' }, { "requests", required_argument, NULL, 'R' }, { "short-options", no_argument, NULL, SHORT_OPTIONS }, { "synchronous", no_argument, NULL, SYNCHRONOUS_OPTION }, + { "target-is-zero", no_argument, NULL, DESTINATION_IS_ZERO_OPTION }, { "threads", required_argument, NULL, 'T' }, { "version", no_argument, NULL, 'V' }, { NULL } @@ -129,10 +142,22 @@ main (int argc, char *argv[]) } exit (EXIT_SUCCESS); + case ALLOCATED_OPTION: + allocated = true; + break; + + case DESTINATION_IS_ZERO_OPTION: + destination_is_zero = true; + break; + case 
FLUSH_OPTION: flush = true; break; + case NO_EXTENTS_OPTION: + extents = false; + break; + case SYNCHRONOUS_OPTION: synchronous = true; break; @@ -191,7 +216,8 @@ main (int argc, char *argv[]) src.t = NBD; src.name = argv[optind+1]; open_nbd_subprocess (argv[0], - (const char **) &argv[optind+1], i-optind-1, &src); + (const char **) &argv[optind+1], i-optind-1, + false, &src); optind = i+1; } else { /* Source is not [...]. */ @@ -201,7 +227,7 @@ main (int argc, char *argv[]) if (src.t == LOCAL) src.u.local.fd = open_local (argv[0], src.name, false, &src); else - open_nbd_uri (argv[0], src.name, &src); + open_nbd_uri (argv[0], src.name, false, &src); } if (optind >= argc) @@ -218,7 +244,8 @@ main (int argc, char *argv[]) dst.t = NBD; dst.name = argv[optind+1]; open_nbd_subprocess (argv[0], - (const char **) &argv[optind+1], i-optind-1, &dst); + (const char **) &argv[optind+1], i-optind-1, + true, &dst); optind = i+1; } else { /* Destination is not [...] */ @@ -228,7 +255,7 @@ main (int argc, char *argv[]) if (dst.t == LOCAL) dst.u.local.fd = open_local (argv[0], dst.name, true /* writing */, &dst); else { - open_nbd_uri (argv[0], dst.name, &dst); + open_nbd_uri (argv[0], dst.name, true, &dst); /* Obviously this is not going to work if the server is * advertising read-only, so fail early with a nice error message. 
@@ -318,6 +345,7 @@ main (int argc, char *argv[])
         perror ("truncate");
         exit (EXIT_FAILURE);
       }
+      destination_is_zero = true;
     }
     else if (dst.t == NBD) {
       dst.size = nbd_get_size (dst.u.nbd.ptr[0]);
@@ -345,16 +373,23 @@ main (int argc, char *argv[])
 
     if (src.t == NBD) {
       for (i = 1; i < connections; ++i)
-        open_nbd_uri (argv[0], src.name, &src);
+        open_nbd_uri (argv[0], src.name, false, &src);
       assert (src.u.nbd.size == connections);
     }
     if (dst.t == NBD) {
       for (i = 1; i < connections; ++i)
-        open_nbd_uri (argv[0], dst.name, &dst);
+        open_nbd_uri (argv[0], dst.name, true, &dst);
       assert (dst.u.nbd.size == connections);
     }
   }
 
+  /* If the source is NBD and we couldn't negotiate meta
+   * base:allocation then turn off extents.
+   */
+  if (src.t == NBD &&
+      !nbd_can_meta_context (src.u.nbd.ptr[0], "base:allocation"))
+    extents = false;
+
   /* Start copying. */
   if (synchronous)
     synch_copying ();
@@ -483,11 +518,18 @@ open_local (const char *prog,
       perror ("lseek");
       exit (EXIT_FAILURE);
     }
+    rw->u.local.seek_hole_supported = seek_hole_supported (fd);
+    rw->u.local.sector_size = 4096;
+#ifdef BLKSSZGET
+    if (ioctl (fd, BLKSSZGET, &rw->u.local.sector_size))
+      fprintf (stderr, "warning: cannot get sector size: %s: %m\n", rw->name);
+#endif
   }
   else if (S_ISREG (rw->u.local.stat.st_mode)) {
     /* Regular file. */
     rw->ops = &file_ops;
     rw->size = rw->u.local.stat.st_size;
+    rw->u.local.seek_hole_supported = seek_hole_supported (fd);
   }
   else {
     /* Probably stdin/stdout, a pipe or a socket.  Set size == -1
@@ -496,14 +538,26 @@ open_local (const char *prog,
     synchronous = true;
     rw->ops = &pipe_ops;
     rw->size = -1;
+    rw->u.local.seek_hole_supported = false;
   }
 
   return fd;
 }
 
+static bool
+seek_hole_supported (int fd)
+{
+#ifndef SEEK_HOLE
+  return false;
+#else
+  off_t r = lseek (fd, 0, SEEK_HOLE);
+  return r >= 0;
+#endif
+}
+
 static void
 open_nbd_uri (const char *prog,
-              const char *uri, struct rw *rw)
+              const char *uri, bool writing, struct rw *rw)
 {
   struct nbd_handle *nbd;
 
@@ -514,6 +568,11 @@ open_nbd_uri (const char *prog,
     exit (EXIT_FAILURE);
   }
   nbd_set_uri_allow_local_file (nbd, true); /* Allow ?tls-psk-file. */
+  if (extents && !writing &&
+      nbd_add_meta_context (nbd, "base:allocation") == -1) {
+    fprintf (stderr, "%s: %s\n", prog, nbd_get_error ());
+    exit (EXIT_FAILURE);
+  }
 
   if (handles_append (&rw->u.nbd, nbd) == -1) {
     perror ("realloc");
@@ -531,7 +590,7 @@ DEFINE_VECTOR_TYPE (const_string_vector, const char *);
 static void
 open_nbd_subprocess (const char *prog,
                      const char **argv, size_t argc,
-                     struct rw *rw)
+                     bool writing, struct rw *rw)
 {
   struct nbd_handle *nbd;
   const_string_vector copy = empty_vector;
@@ -543,6 +602,11 @@ open_nbd_subprocess (const char *prog,
     fprintf (stderr, "%s: %s\n", prog, nbd_get_error ());
     exit (EXIT_FAILURE);
   }
+  if (extents && !writing &&
+      nbd_add_meta_context (nbd, "base:allocation") == -1) {
+    fprintf (stderr, "%s: %s\n", prog, nbd_get_error ());
+    exit (EXIT_FAILURE);
+  }
 
   if (handles_append (&rw->u.nbd, nbd) == -1) {
   memory_error:
@@ -565,3 +629,24 @@ open_nbd_subprocess (const char *prog,
 
   free (copy.ptr);
 }
+
+/* Default implementation of rw->ops->get_extents for backends which
+ * don't/can't support extents.  Also used for the --no-extents case.
+ */ +void +default_get_extents (struct rw *rw, uintptr_t index, + uint64_t offset, uint64_t count, + extent_list *ret) +{ + struct extent e; + + ret->size = 0; + + e.offset = offset; + e.length = count; + e.hole = false; + if (extent_list_append (ret, e) == -1) { + perror ("realloc"); + exit (EXIT_FAILURE); + } +} diff --git a/copy/multi-thread-copying.c b/copy/multi-thread-copying.c index 3805daf..8081bb1 100644 --- a/copy/multi-thread-copying.c +++ b/copy/multi-thread-copying.c @@ -27,6 +27,7 @@ #include <poll.h> #include <errno.h> #include <assert.h> +#include <sys/stat.h> #include <pthread.h> @@ -122,12 +123,14 @@ multi_thread_copying (void) free (workers); } +static void wait_for_request_slots (uintptr_t index); static unsigned in_flight (struct nbd_handle *src_nbd, struct nbd_handle *dst_nbd); static void poll_both_ends (struct nbd_handle *src_nbd, struct nbd_handle *dst_nbd); static int finished_read (void *vp, int *error); -static int finished_write (void *vp, int *error); +static int free_buffer (void *vp, int *error); +static void fill_dst_range_with_zeroes (struct buffer *buffer); /* There are 'threads' worker threads, each copying work ranges from * src to dst until there are no more work ranges. @@ -138,13 +141,7 @@ worker_thread (void *indexp) uintptr_t index = (uintptr_t) indexp; uint64_t offset, count; struct nbd_handle *src_nbd, *dst_nbd; - bool done = false; - - if (! get_next_offset (&offset, &count)) - /* No work to do, return immediately. Can happen for files which - * are smaller than THREAD_WORK_SIZE where multi-conn is enabled. - */ - return NULL; + extent_list exts = empty_vector; /* In the case where src or dst is NBD, use * {src|dst}.u.nbd.ptr[index] so that each thread is connected to @@ -161,54 +158,77 @@ worker_thread (void *indexp) else dst_nbd = NULL; - while (!done) { - struct buffer *buffer; - char *data; - size_t len; - - if (count == 0) { - /* Get another work range. */ - done = ! 
get_next_offset (&offset, &count); - if (done) break; - assert (0 < count && count <= THREAD_WORK_SIZE); - } - - /* If the number of requests in flight exceeds the limit, poll - * waiting for at least one request to finish. This enforces the - * user --requests option. - */ - while (in_flight (src_nbd, dst_nbd) >= max_requests) - poll_both_ends (src_nbd, dst_nbd); - - /* Create a new buffer. This will be freed in a callback handler. */ - len = count; - if (len > MAX_REQUEST_SIZE) - len = MAX_REQUEST_SIZE; - data = malloc (len); - if (data == NULL) { - perror ("malloc"); - exit (EXIT_FAILURE); - } - buffer = malloc (sizeof *buffer); - if (buffer == NULL) { - perror ("malloc"); - exit (EXIT_FAILURE); - } - buffer->offset = offset; - buffer->len = len; - buffer->data = data; - buffer->free_data = free; - buffer->index = index; - - /* Begin the asynch read operation. */ - src.ops->asynch_read (&src, buffer, - (nbd_completion_callback) { - .callback = finished_read, - .user_data = buffer, - }); - - offset += len; - count -= len; + while (get_next_offset (&offset, &count)) { + size_t i; + + assert (0 < count && count <= THREAD_WORK_SIZE); + if (extents) + src.ops->get_extents (&src, index, offset, count, &exts); + else + default_get_extents (&src, index, offset, count, &exts); + + for (i = 0; i < exts.size; ++i) { + struct buffer *buffer; + char *data; + size_t len; + + if (exts.ptr[i].hole) { + /* The source is a hole so we can proceed directly to + * skipping, trimming or writing zeroes at the destination. + */ + buffer = calloc (1, sizeof *buffer); + if (buffer == NULL) { + perror ("malloc"); + exit (EXIT_FAILURE); + } + buffer->offset = exts.ptr[i].offset; + buffer->len = exts.ptr[i].length; + buffer->index = index; + fill_dst_range_with_zeroes (buffer); + } + + else /* data */ { + /* As the extent might be larger than permitted for a single + * command, we may have to split this into multiple read + * requests. 
+ */ + while (exts.ptr[i].length > 0) { + len = exts.ptr[i].length; + if (len > MAX_REQUEST_SIZE) + len = MAX_REQUEST_SIZE; + data = malloc (len); + if (data == NULL) { + perror ("malloc"); + exit (EXIT_FAILURE); + } + buffer = calloc (1, sizeof *buffer); + if (buffer == NULL) { + perror ("malloc"); + exit (EXIT_FAILURE); + } + buffer->offset = exts.ptr[i].offset; + buffer->len = len; + buffer->data = data; + buffer->free_data = free; + buffer->index = index; + + wait_for_request_slots (index); + + /* Begin the asynch read operation. */ + src.ops->asynch_read (&src, buffer, + (nbd_completion_callback) { + .callback = finished_read, + .user_data = buffer, + }); + + exts.ptr[i].offset += len; + exts.ptr[i].length -= len; + } + } + + offset += count; + count = 0; + } /* for extents */ } /* Wait for in flight NBD requests to finish. */ @@ -218,14 +238,37 @@ worker_thread (void *indexp) if (progress) progress_bar (1, 1); + free (exts.ptr); return NULL; } +/* If the number of requests in flight exceeds the limit, poll + * waiting for at least one request to finish. This enforces + * the user --requests option. + */ +static void +wait_for_request_slots (uintptr_t index) +{ + struct nbd_handle *src_nbd, *dst_nbd; + + if (src.t == NBD) + src_nbd = src.u.nbd.ptr[index]; + else + src_nbd = NULL; + if (dst.t == NBD) + dst_nbd = dst.u.nbd.ptr[index]; + else + dst_nbd = NULL; + + while (in_flight (src_nbd, dst_nbd) >= max_requests) + poll_both_ends (src_nbd, dst_nbd); +} + /* Count the number of NBD commands in flight. Since the commands are * auto-retired in the callbacks we don't need to count "done" * commands. 
*/
-static inline unsigned
+static unsigned
 in_flight (struct nbd_handle *src_nbd, struct nbd_handle *dst_nbd)
 {
   return
@@ -335,18 +378,79 @@ finished_read (void *vp, int *error)
 
   dst.ops->asynch_write (&dst, buffer,
                          (nbd_completion_callback) {
-                           .callback = finished_write,
+                           .callback = free_buffer,
                            .user_data = buffer,
                          });
 
   return 1; /* auto-retires the command */
 }
 
-/* Callback called when dst has finished one write command.  We can
- * now free the buffer.
+/* Fill a range in dst with zeroes.  This is called from the copying
+ * loop when we see a hole in the source.  Depending on the command
+ * line flags this could mean:
+ *
+ * --destination-is-zero:
+ *                 do nothing
+ *
+ * --allocated:    we must write zeroes either using an efficient
+ *                 zeroing command or writing a buffer of zeroes
+ *
+ * (neither flag)  try trimming if supported, else write zeroes
+ *                 as above
+ *
+ * This takes over ownership of the buffer and frees it eventually.
  */
+static void
+fill_dst_range_with_zeroes (struct buffer *buffer)
+{
+  char *data;
+
+  if (destination_is_zero)
+    goto free_and_return;
+
+  if (!allocated) {
+    /* Try trimming. */
+    wait_for_request_slots (buffer->index);
+    if (dst.ops->asynch_trim (&dst, buffer,
+                              (nbd_completion_callback) {
+                                .callback = free_buffer,
+                                .user_data = buffer,
+                              }))
+      return;
+  }
+
+  /* Try efficient zeroing. */
+  wait_for_request_slots (buffer->index);
+  if (dst.ops->asynch_zero (&dst, buffer,
+                            (nbd_completion_callback) {
+                              .callback = free_buffer,
+                              .user_data = buffer,
+                            }))
+    return;
+
+  /* Fall back to loop writing zeroes.  This is going to be slow
+   * anyway, so do it synchronously. XXX
+   */
+  data = calloc (1, BUFSIZ);
+  if (!data) {
+    perror ("calloc");
+    exit (EXIT_FAILURE);
+  }
+  while (buffer->len > 0) {
+    size_t len = buffer->len > BUFSIZ ? BUFSIZ : buffer->len;
+
+    dst.ops->synch_write (&dst, data, len, buffer->offset);
+    buffer->len -= len;
+    buffer->offset += len;
+  }
+  free (data);
+
+ free_and_return:
+  free_buffer (buffer, &(int) { 0 }); /* 0 = success, never stale errno */
+}
+
 static int
-finished_write (void *vp, int *error)
+free_buffer (void *vp, int *error)
 {
   struct buffer *buffer = vp;
 
diff --git a/copy/nbd-ops.c b/copy/nbd-ops.c
index 3ae01ad..6a8ac95 100644
--- a/copy/nbd-ops.c
+++ b/copy/nbd-ops.c
@@ -57,6 +57,37 @@ nbd_synch_write (struct rw *rw,
   }
 }
 
+static bool
+nbd_synch_trim (struct rw *rw, uint64_t offset, uint64_t count)
+{
+  assert (rw->t == NBD);
+
+  if (nbd_can_trim (rw->u.nbd.ptr[0]) == 0)
+    return false;
+
+  if (nbd_trim (rw->u.nbd.ptr[0], count, offset, 0) == -1) {
+    fprintf (stderr, "%s: %s\n", rw->name, nbd_get_error ());
+    exit (EXIT_FAILURE);
+  }
+  return true;
+}
+
+static bool
+nbd_synch_zero (struct rw *rw, uint64_t offset, uint64_t count)
+{
+  assert (rw->t == NBD);
+
+  if (nbd_can_zero (rw->u.nbd.ptr[0]) == 0)
+    return false;
+
+  if (nbd_zero (rw->u.nbd.ptr[0],
+                count, offset, LIBNBD_CMD_FLAG_NO_HOLE) == -1) {
+    fprintf (stderr, "%s: %s\n", rw->name, nbd_get_error ());
+    exit (EXIT_FAILURE);
+  }
+  return true;
+}
+
 static void
 nbd_asynch_read (struct rw *rw,
                  struct buffer *buffer,
@@ -87,9 +118,154 @@ nbd_asynch_write (struct rw *rw,
   }
 }
 
+static bool
+nbd_asynch_trim (struct rw *rw, struct buffer *buffer,
+                 nbd_completion_callback cb)
+{
+  assert (rw->t == NBD);
+
+  if (nbd_can_trim (rw->u.nbd.ptr[0]) == 0)
+    return false;
+
+  if (nbd_aio_trim (rw->u.nbd.ptr[buffer->index],
+                    buffer->len, buffer->offset,
+                    cb, 0) == -1) {
+    fprintf (stderr, "%s: %s\n", rw->name, nbd_get_error ());
+    exit (EXIT_FAILURE);
+  }
+  return true;
+}
+
+static bool
+nbd_asynch_zero (struct rw *rw, struct buffer *buffer,
+                 nbd_completion_callback cb)
+{
+  assert (rw->t == NBD);
+
+  if (nbd_can_zero (rw->u.nbd.ptr[0]) == 0)
+    return false;
+
+  if (nbd_aio_zero (rw->u.nbd.ptr[buffer->index],
+                    buffer->len, buffer->offset,
+                    cb,
LIBNBD_CMD_FLAG_NO_HOLE) == -1) { + fprintf (stderr, "%s: %s\n", rw->name, nbd_get_error ()); + exit (EXIT_FAILURE); + } + return true; +} + +static int +add_extent (void *vp, const char *metacontext, + uint64_t offset, uint32_t *entries, size_t nr_entries, + int *error) +{ + extent_list *ret = vp; + size_t i; + + if (strcmp (metacontext, "base:allocation") != 0) + return 0; + + for (i = 0; i < nr_entries; i += 2) { + struct extent e; + + e.offset = offset; + e.length = entries[i]; + /* Note we deliberately don't care about the ZERO flag. */ + e.hole = (entries[i+1] & LIBNBD_STATE_HOLE) != 0; + if (extent_list_append (ret, e) == -1) { + perror ("realloc"); + exit (EXIT_FAILURE); + } + + offset += entries[i]; + } + + return 0; +} + +/* This is done synchronously, but that's fine because commands from + * the previous work range in flight continue to run, it's difficult + * to (sanely) start new work until we have the full list of extents, + * and in almost every case the remote NBD server can answer our + * request for extents in a single round trip. + */ +static void +nbd_get_extents (struct rw *rw, uintptr_t index, + uint64_t offset, uint64_t count, + extent_list *ret) +{ + extent_list exts = empty_vector; + struct nbd_handle *nbd; + + assert (rw->t == NBD); + nbd = rw->u.nbd.ptr[index]; + + ret->size = 0; + + while (count > 0) { + size_t i; + + exts.size = 0; + if (nbd_block_status (nbd, count, offset, + (nbd_extent_callback) { + .user_data = &exts, + .callback = add_extent + }, 0) == -1) { + /* XXX We could call default_get_extents, but unclear if it's + * the right thing to do if the server is returning errors. + */ + fprintf (stderr, "%s: %s\n", rw->name, nbd_get_error ()); + exit (EXIT_FAILURE); + } + + /* The server should always make progress. 
*/ + if (exts.size == 0) { + fprintf (stderr, "%s: NBD server is broken: it is not returning extent information.\nTry nbdcopy --no-extents as a workaround.\n", + rw->name); + exit (EXIT_FAILURE); + } + + /* Copy the extents returned into the final list (ret). Extents + * from the server may begin earlier and/or end later than the + * requested range, so skip those outside it and clip the rest. + */ + for (i = 0; i < exts.size; ++i) { + uint64_t d; + + if (exts.ptr[i].offset + exts.ptr[i].length <= offset || exts.ptr[i].offset >= offset + count) + continue; + if (exts.ptr[i].offset < offset) { + d = offset - exts.ptr[i].offset; + exts.ptr[i].offset += d; + exts.ptr[i].length -= d; + assert (exts.ptr[i].offset == offset); + } + if (exts.ptr[i].offset + exts.ptr[i].length > offset + count) { + d = exts.ptr[i].offset + exts.ptr[i].length - offset - count; + exts.ptr[i].length -= d; + assert (exts.ptr[i].offset + exts.ptr[i].length == offset + count); + } + if (extent_list_append (ret, exts.ptr[i]) == -1) { + perror ("realloc"); + exit (EXIT_FAILURE); + } + + offset += exts.ptr[i].length; + count -= exts.ptr[i].length; + } + } + + free (exts.ptr); +} + struct rw_ops nbd_ops = { .synch_read = nbd_synch_read, .synch_write = nbd_synch_write, + .synch_trim = nbd_synch_trim, + .synch_zero = nbd_synch_zero, .asynch_read = nbd_asynch_read, .asynch_write = nbd_asynch_write, + .asynch_trim = nbd_asynch_trim, + .asynch_zero = nbd_asynch_zero, + .get_extents = nbd_get_extents, }; diff --git a/copy/nbdcopy.h b/copy/nbdcopy.h index 9e4fc19..d74abad 100644 --- a/copy/nbdcopy.h +++ b/copy/nbdcopy.h @@ -47,6 +47,8 @@ struct rw { struct { /* For LOCAL. */ int fd; struct stat stat; + bool seek_hole_supported; + int sector_size; } local; handles nbd; /* For NBD, one handle per connection. */ } u; @@ -63,6 +65,14 @@ struct buffer { uintptr_t index; /* Thread number. */ }; +/* List of extents for rw->ops->get_extents. 
*/ +struct extent { + uint64_t offset; + uint64_t length; + bool hole; +}; +DEFINE_VECTOR_TYPE(extent_list, struct extent); + /* The operations struct hides some of the differences between local * file, NBD and pipes from the copying code. * @@ -80,6 +90,16 @@ struct rw_ops { void (*synch_write) (struct rw *rw, const void *data, size_t len, uint64_t offset); + /* Synchronously trim count bytes at offset. If not possible, + * returns false. + */ + bool (*synch_trim) (struct rw *rw, uint64_t offset, uint64_t count); + + /* Synchronously zero count bytes at offset. If not possible, + * returns false. + */ + bool (*synch_zero) (struct rw *rw, uint64_t offset, uint64_t count); + /* Asynchronous I/O operations. These start the operation and call * 'cb' on completion. * @@ -95,12 +115,42 @@ struct rw_ops { void (*asynch_write) (struct rw *rw, struct buffer *buffer, nbd_completion_callback cb); + + /* Asynchronously trim. buffer->data is not used. If not possible, + * returns false. + */ + bool (*asynch_trim) (struct rw *rw, struct buffer *buffer, + nbd_completion_callback cb); + + /* Asynchronously zero. buffer->data is not used. If not possible, + * returns false. + */ + bool (*asynch_zero) (struct rw *rw, struct buffer *buffer, + nbd_completion_callback cb); + + /* Read base:allocation extents metadata for a region of the source. + * For local files the same information is read from the kernel. + * + * Note that qemu-img fetches extents for the entire disk up front, + * and we want to avoid doing that because it had very negative + * behaviour for certain sources (i.e. VDDK). 
+ */ + void (*get_extents) (struct rw *rw, uintptr_t index, + uint64_t offset, uint64_t count, + extent_list *ret); }; extern struct rw_ops file_ops; extern struct rw_ops nbd_ops; extern struct rw_ops pipe_ops; +extern void default_get_extents (struct rw *rw, uintptr_t index, + uint64_t offset, uint64_t count, + extent_list *ret); + +extern bool allocated; extern unsigned connections; +extern bool destination_is_zero; +extern bool extents; extern bool flush; extern unsigned max_requests; extern bool progress; diff --git a/copy/nbdcopy.pod b/copy/nbdcopy.pod index f654f65..5ff7434 100644 --- a/copy/nbdcopy.pod +++ b/copy/nbdcopy.pod @@ -4,7 +4,9 @@ nbdcopy - copy to and from an NBD server =head1 SYNOPSIS - nbdcopy [-C N|--connections=N] [--flush] [-p|--progress] + nbdcopy [--allocated] [-C N|--connections=N] + [--destination-is-zero|--target-is-zero] + [--flush] [--no-extents] [-p|--progress] [-R N|--requests=N] [--synchronous] [-T N|--threads=N] SOURCE DESTINATION @@ -74,6 +76,15 @@ formats use C<qemu-img convert>, see L<qemu-img(1)>. Display brief command line help and exit. +=item B<--allocated> + +Normally nbdcopy tries to create a sparse output (with holes), if the +destination supports that. It does this in two ways: either using +extent information from the source to copy holes (see +I<--no-extents>), or by detecting runs of zeroes (see I<-S>). If you +use I<--allocated> then nbdcopy creates a fully allocated, non-sparse +output on the destination. + =item B<-C> N =item B<--connections=>N @@ -82,11 +93,30 @@ Set the maximum number of NBD connections ("multi-conn"). By default nbdcopy will try to use multi-conn with up to 4 connections if the NBD server supports it. +=item B<--destination-is-zero> + +=item B<--target-is-zero> + +Assume the destination is already zeroed. This allows nbdcopy to skip +copying blocks of zeroes from the source to the destination. This is +not safe unless the destination device is already zeroed. 
+(I<--target-is-zero> is provided for compatibility with +L<qemu-img(1)>.) + =item B<--flush> Flush writes to ensure that everything is written to persistent storage before nbdcopy exits. +=item B<--no-extents> + +Normally nbdcopy uses extent metadata to skip over parts of the source +disk which contain holes. If you use this flag, nbdcopy ignores +extent information and reads everything, which is usually slower. You +might use this flag in two situations: the source NBD server has +incorrect metadata information; or the source has very slow extent +querying so it's faster to simply read all of the data. + =item B<-p> =item B<--progress> diff --git a/copy/pipe-ops.c b/copy/pipe-ops.c index e10a31e..0788aae 100644 --- a/copy/pipe-ops.c +++ b/copy/pipe-ops.c @@ -61,6 +61,12 @@ pipe_synch_write (struct rw *rw, } } +static bool +pipe_synch_trim_zero (struct rw *rw, uint64_t offset, uint64_t count) +{ + return false; /* not supported by pipes */ +} + static void pipe_asynch_read (struct rw *rw, struct buffer *buffer, @@ -77,16 +83,30 @@ pipe_asynch_write (struct rw *rw, abort (); /* See comment below. */ } +static bool +pipe_asynch_trim_zero (struct rw *rw, struct buffer *buffer, + nbd_completion_callback cb) +{ + return false; /* not supported by pipes */ +} + struct rw_ops pipe_ops = { .synch_read = pipe_synch_read, .synch_write = pipe_synch_write, + .synch_trim = pipe_synch_trim_zero, + .synch_zero = pipe_synch_trim_zero, - /* Asynch pipe operations are not defined. These should never be - * called because pipes/streams/sockets force --synchronous. - * Because calling a NULL pointer screws up the stack trace when - * we're not using frame pointers, these are defined to functions - * that call abort(). + /* Asynch pipe read/write operations are not defined. These should + * never be called because pipes/streams/sockets force synchronous + * mode. 
Because calling a NULL pointer screws up the stack trace + * when we're not using frame pointers, these are defined to + * functions that call abort(). */ .asynch_read = pipe_asynch_read, .asynch_write = pipe_asynch_write, + + .asynch_trim = pipe_asynch_trim_zero, + .asynch_zero = pipe_asynch_trim_zero, + + .get_extents = default_get_extents, }; -- 2.29.0.rc2 _______________________________________________ Libguestfs mailing list Libguestfs@redhat.com https://www.redhat.com/mailman/listinfo/libguestfs