24.01.2019 18:31, Kevin Wolf wrote: > Am 24.01.2019 um 15:36 hat Vladimir Sementsov-Ogievskiy geschrieben: >> 23.01.2019 19:33, Kevin Wolf wrote: >>> Am 23.01.2019 um 12:53 hat Vladimir Sementsov-Ogievskiy geschrieben: >>>> 22.01.2019 21:57, Kevin Wolf wrote: >>>>> Am 11.01.2019 um 12:40 hat Vladimir Sementsov-Ogievskiy geschrieben: >>>>>> 11.01.2019 13:41, Kevin Wolf wrote: >>>>>>> Am 10.01.2019 um 14:20 hat Vladimir Sementsov-Ogievskiy geschrieben: >>>>>>>> drv_co_block_status digs bs->file for additional, more accurate search >>>>>>>> for hole inside region, reported as DATA by bs since 5daa74a6ebc. >>>>>>>> >>>>>>>> This accuracy is not free: assume we have qcow2 disk. Actually, qcow2 >>>>>>>> knows, where are holes and where is data. But every block_status >>>>>>>> request calls lseek additionally. Assume a big disk, full of >>>>>>>> data, in any iterative copying block job (or img convert) we'll call >>>>>>>> lseek(HOLE) on every iteration, and each of these lseeks will have to >>>>>>>> iterate through all metadata up to the end of file. It's obviously >>>>>>>> ineffective behavior. And for many scenarios we don't need this lseek >>>>>>>> at all. >>>>>>>> >>>>>>>> So, let's "5daa74a6ebc" by default, leaving an option to return >>>>>>>> previous behavior, which is needed for scenarios with preallocated >>>>>>>> images. >>>>>>>> >>>>>>>> Add iotest illustrating new option semantics. >>>>>>>> >>>>>>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsement...@virtuozzo.com> >>>>>>> >>>>>>> I still think that an option isn't a good solution and we should try use >>>>>>> some heuristics instead. >>>>>> >>>>>> Do you think that heuristics would be better than fair cache for lseek >>>>>> results? >>>>> >>>>> I just played a bit with this (qemu-img convert only), and how much >>>>> caching lseek() results helps depends completely on the image. As it >>>>> happened, my test image was the worst case where caching didn't buy us >>>>> much. 
Obviously, I can just as easily construct an image where it makes >>>>> a huge difference. I think that most real-world images should be able to >>>>> take good advantage of it, though, and it doesn't hurt, so maybe that's >>>>> a first thing that we can do in any case. It might not be the complete >>>>> solution, though. >>>>> >>>>> Let me explain my test images: The case where all of this actually >>>>> matters for qemu-img convert is fragmented qcow2 images. If your image >>>>> isn't fragmented, we don't do lseek() a lot anyway because a single >>>>> bdrv_block_status() call already gives you the information for the whole >>>>> image. So I constructed a fragmented image, by writing to it backwards: >>>>> >>>>> ./qemu-img create -f qcow2 /tmp/test.qcow2 1G >>>>> for i in $(seq 16384 -1 0); do >>>>> echo "write $((i * 65536)) 64k" >>>>> done | ./qemu-io /tmp/test.qcow2 >>>>> >>>>> It's not really surprising that caching the lseek() results doesn't help >>>>> much there as we're moving backwards and lseek() only returns results >>>>> about the things after the current position, not before the current >>>>> position. So this is probably the worst case. >>>>> >>>>> So I constructed a second image, which is fragmented, too, but starts at >>>>> the beginning of the image file: >>>>> >>>>> ./qemu-img create -f qcow2 /tmp/test_forward.qcow2 1G >>>>> for i in $(seq 0 2 16384); do >>>>> echo "write $((i * 65536)) 64k" >>>>> done | ./qemu-io /tmp/test_forward.qcow2 >>>>> for i in $(seq 1 2 16384); do >>>>> echo "write $((i * 65536)) 64k" >>>>> done | ./qemu-io /tmp/test_forward.qcow2 >>>>> >>>>> Here caching makes a huge difference: >>>>> >>>>> time ./qemu-img convert -p -n $IMG null-co:// >>>>> >>>>> uncached cached >>>>> test.qcow2 ~145s ~70s >>>>> test_forward.qcow2 ~110s ~0.2s >>>> >>>> Unsure about your results, at least 0.2s means, that we benefit from >>>> cached read, not lseek. >>> >>> Yes, all reads are from the kernel page cache, this is on tmpfs. 
>>> >>> I chose tmpfs for two reasons: I wanted to get expensive I/O out of the >>> way so that the lseek() performance is even visible; and tmpfs was >>> reported to perform especially bad for SEEK_DATA/HOLE (which my results >>> confirm). So yes, this setup really makes the lseek() calls stand out >>> much more than in the common case (which makes sense when you want to >>> fix the overhead introduced by them). >> >> Ok, missed this. On the other hand tmpfs is not a real production case.. > > Yes, I fully agree. But it was a simple case where I knew there is a > problem. > > I also have a bug report on XFS with an image that is very fragmented on > the file system level. But I don't know how to produce such a file to > run benchmarks on it. >
I've experimented with very fragmented images, but didn't find lseek problems, maybe because I don't have a big enough hdd to test. Here is a program I used to produce a fragmented file. The idea is to fallocate all the space, and then reallocate it. Usage is as simple as ./frag /data/test 500G Attached code may be ugly, I didn't prepare it for publishing( -- Best regards, Vladimir
/*
 * frag: create a heavily fragmented file.
 *
 * Strategy: fallocate() as much space as possible *past* the end of the
 * target region (a "balloon" that eats all free space), then allocate the
 * real blocks one by one from the front; whenever the filesystem is full,
 * shrink the balloon by one block (ftruncate) and retry.  This forces the
 * filesystem to hand out scattered extents.  Finally truncate the file to
 * the requested size, releasing the balloon.
 *
 * Usage: ./frag <path> <size>    (size accepts B/K/M/G/T/P/E suffixes)
 */
#define _GNU_SOURCE
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <error.h>
#include <libgen.h>
#include <ctype.h>
#include <errno.h>
#include <stdlib.h>
#include <math.h>
#include <sys/statvfs.h>

#define qemu_toupper(c) toupper((unsigned char)(c))

/*
 * Map a size suffix character to its multiplier for the given @unit
 * (1024 for binary suffixes).  Returns -1 for an unknown suffix.
 */
static int64_t suffix_mul(char suffix, int64_t unit)
{
    switch (qemu_toupper(suffix)) {
    case 'B':
        return 1;
    case 'K':
        return unit;
    case 'M':
        return unit * unit;
    case 'G':
        return unit * unit * unit;
    case 'T':
        return unit * unit * unit * unit;
    case 'P':
        return unit * unit * unit * unit * unit;
    case 'E':
        return unit * unit * unit * unit * unit * unit;
    }
    return -1;
}

/*
 * Convert string to bytes, allowing either B/b for bytes, K/k for KB,
 * M/m for MB, G/g for GB or T/t for TB.  End pointer will be returned
 * in *end, if not NULL.  Return -ERANGE on overflow, return -EINVAL on
 * other error.  (Copied from QEMU's util/cutils.c.)
 */
static int do_strtosz(const char *nptr, char **end,
                      const char default_suffix, int64_t unit,
                      uint64_t *result)
{
    int retval;
    char *endptr;
    unsigned char c;
    int mul_required = 0;
    double val, mul, integral, fraction;

    errno = 0;
    val = strtod(nptr, &endptr);
    if (isnan(val) || endptr == nptr || errno != 0) {
        retval = -EINVAL;
        goto out;
    }
    fraction = modf(val, &integral);
    if (fraction != 0) {
        /* A fractional value only makes sense with a suffix, e.g. "1.5G". */
        mul_required = 1;
    }
    c = *endptr;
    mul = suffix_mul(c, unit);
    if (mul >= 0) {
        endptr++;
    } else {
        mul = suffix_mul(default_suffix, unit);
        assert(mul >= 0);
    }
    if (mul == 1 && mul_required) {
        retval = -EINVAL;
        goto out;
    }
    /*
     * Values >= 0xfffffffffffffc00 overflow uint64_t after their trip
     * through double (53 bits of precision).
     */
    if ((val * mul >= 0xfffffffffffffc00) || val < 0) {
        retval = -ERANGE;
        goto out;
    }
    *result = val * mul;
    retval = 0;

out:
    if (end) {
        *end = endptr;
    } else if (*endptr) {
        retval = -EINVAL;
    }
    return retval;
}

/* Parse a human-readable size with binary (1024-based) suffixes. */
static int qemu_strtosz(const char *nptr, char **end, uint64_t *result)
{
    return do_strtosz(nptr, end, 'B', 1024, result);
}

/*
 * Parse @s as a size; returns the value, or a negative errno
 * (-EINVAL / -ERANGE) on bad input or values above INT64_MAX.
 */
static int64_t cvtnum(const char *s)
{
    int err;
    uint64_t value;

    err = qemu_strtosz(s, NULL, &value);
    if (err < 0) {
        return err;
    }
    if (value > INT64_MAX) {
        return -ERANGE;
    }
    return value;
}

int main(int argc, char *argv[])
{
    int ret = -1;
    int fd = -1;
    struct statvfs stat;
    const char *filename;
    int64_t len, disk_len, off;
    char *dir = NULL;
    const int64_t block = 64 * 1024;
    int64_t try, flen;
    char *buf = NULL;

    /* Validate arguments before touching argv[1]/argv[2]. */
    if (argc != 3) {
        fprintf(stderr, "Usage: %s <filename> <size>\n", argv[0]);
        return 1;
    }
    filename = argv[1];
    len = cvtnum(argv[2]);
    if (len < 0) {
        fprintf(stderr, "Invalid size: %s\n", argv[2]);
        return 1;
    }

    /* Zeroed data buffer for the block writes (was an uninitialized VLA
     * of pointers in the original). */
    buf = calloc(1, block);
    if (!buf) {
        perror("Can not allocate buffer");
        return 1;
    }

    dir = strdup(filename);
    if (!dir) {
        perror("Can not copy filename");
        free(buf);
        return 1;
    }
    dirname(dir); /* dirname() may modify its argument; we own the copy */

    ret = statvfs(dir, &stat);
    if (ret < 0) {
        perror("Can not get fs stats");
        goto fail;
    }
    disk_len = (uint64_t)stat.f_bsize * stat.f_bavail;
    if (disk_len < len) {
        fprintf(stderr, "Not enough space on disk\n");
        ret = -1; /* was 0 here in the original: exited with success status */
        goto fail;
    }

    /* O_CREAT requires a mode argument; omitting it is undefined. */
    fd = open(filename, O_CREAT | O_RDWR, 0644);
    if (fd < 0) {
        perror("Can not open file for RW");
        ret = -1;
        goto fail;
    }

    sync();

    printf("fallocating max space. You can monitor progress by "
           "'watch -d \"df -h %s\"'\n", dir);

    /*
     * Grow the balloon: repeatedly allocate half of the remaining free
     * space (rounded down to a block), starting at offset @len, until
     * less than 10 blocks would be left or fallocate() fails.
     */
    flen = len;
    try = (disk_len - 10 * block) / 2 / block * block;
    while (try >= block) {
        ret = statvfs(dir, &stat);
        if (ret < 0) {
            perror("Can not get fs stats");
            goto fail;
        }
        disk_len = (uint64_t)stat.f_bsize * stat.f_bavail;
        try = disk_len / 2 / block * block;
        if (try < 10 * block) {
            break;
        }
        ret = fallocate(fd, 0, flen, try);
        if (ret >= 0) {
            flen += try;
        } else {
            break;
        }
    }

    ret = statvfs(dir, &stat);
    if (ret < 0) {
        perror("Can not get fs stats");
        goto fail;
    }
    disk_len = (uint64_t)stat.f_bsize * stat.f_bavail;
    printf("done (remaining space %" PRId64 ").\n\n", disk_len);

    /*
     * Allocate the final blocks front-to-back.  When the disk is full,
     * shrink the balloon one block at a time until the allocation fits,
     * then write real (zero) data so the extent is materialized.
     */
    printf("reallocating final blocks:\n");
    for (off = 0; off < len; off += block) {
        while (fallocate(fd, 0, off, block) < 0) {
            ret = ftruncate(fd, flen - block);
            if (ret < 0) {
                error(0, errno, "Failed to punch hole for block %" PRId64,
                      off / block);
                goto fail;
            }
            flen -= block;
        }
        if (lseek(fd, off, SEEK_SET) < 0 ||
            write(fd, buf, block) != block) {
            perror("Failed to write block");
            ret = -1;
            goto fail;
        }
        printf("done bytes: %" PRId64 "/%" PRId64 " \r", off, len);
    }
    printf("done. \n\n");

    printf("truncating file to final size\n");
    ret = ftruncate(fd, len);
    if (ret < 0) {
        perror("Failed to truncate to final size");
        goto fail;
    }

fail:
    free(dir);
    free(buf);
    if (fd >= 0) {
        close(fd);
        if (ret < 0) {
            /* Don't leave a half-built file behind on failure. */
            unlink(filename);
        }
    }
    return -ret;
}