[PATCH 3/4] btrfs-progs: change seen_fsid to hold fd and DIR*
Change seen_fsid to hold fd and DIR* in order to keep access to each fs. This will be used for 'subvol delete --commit-after'. Signed-off-by: Tomohiro Misono--- cmds-filesystem.c | 4 ++-- utils.c | 6 +- utils.h | 5 - 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/cmds-filesystem.c b/cmds-filesystem.c index c7dae40..4bbff43 100644 --- a/cmds-filesystem.c +++ b/cmds-filesystem.c @@ -277,7 +277,7 @@ static void print_one_uuid(struct btrfs_fs_devices *fs_devices, u64 devs_found = 0; u64 total; - if (add_seen_fsid(fs_devices->fsid, seen_fsid_hash)) + if (add_seen_fsid(fs_devices->fsid, seen_fsid_hash, -1, NULL)) return; uuid_unparse(fs_devices->fsid, uuidbuf); @@ -324,7 +324,7 @@ static int print_one_fs(struct btrfs_ioctl_fs_info_args *fs_info, struct btrfs_ioctl_dev_info_args *tmp_dev_info; int ret; - ret = add_seen_fsid(fs_info->fsid, seen_fsid_hash); + ret = add_seen_fsid(fs_info->fsid, seen_fsid_hash, -1, NULL); if (ret == -EEXIST) return 0; else if (ret) diff --git a/utils.c b/utils.c index f91d41e..bdfbfe0 100644 --- a/utils.c +++ b/utils.c @@ -1804,7 +1804,8 @@ int is_seen_fsid(u8 *fsid, struct seen_fsid *seen_fsid_hash[]) return 0; } -int add_seen_fsid(u8 *fsid, struct seen_fsid *seen_fsid_hash[]) +int add_seen_fsid(u8 *fsid, struct seen_fsid *seen_fsid_hash[], + int fd, DIR *dirstream) { u8 hash = fsid[0]; int slot = hash % SEEN_FSID_HASH_SIZE; @@ -1832,6 +1833,8 @@ insert: alloc->next = NULL; memcpy(alloc->fsid, fsid, BTRFS_FSID_SIZE); + alloc->fd = fd; + alloc->dirstream = dirstream; if (seen) seen->next = alloc; @@ -1851,6 +1854,7 @@ void free_seen_fsid(struct seen_fsid *seen_fsid_hash[]) seen = seen_fsid_hash[slot]; while (seen) { next = seen->next; + close_file_or_dir(seen->fd, seen->dirstream); free(seen); seen = next; } diff --git a/utils.h b/utils.h index da34e6c..bac7688 100644 --- a/utils.h +++ b/utils.h @@ -107,9 +107,12 @@ int get_fsid(const char *path, u8 *fsid, int silent); struct seen_fsid { u8 fsid[BTRFS_FSID_SIZE]; struct seen_fsid *next; + int fd; + DIR *dirstream; }; int is_seen_fsid(u8 *fsid, struct seen_fsid *seen_fsid_hash[]); -int add_seen_fsid(u8 *fsid, struct seen_fsid *seen_fsid_hash[]); +int add_seen_fsid(u8 *fsid, struct seen_fsid *seen_fsid_hash[], + int fd, DIR *dirstream); void free_seen_fsid(struct seen_fsid *seen_fsid_hash[]); int get_label(const char *btrfs_dev, char *label); -- 2.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/4] btrfs-progs: subvol: fix subvol del --commit-after
Fix subvol del --commit-after to work properly: - SYNC ioctl will be issued even when last delete is failed - SYNC ioctl will be issued on each file system only once in the end To achieve this, get_fsid() and add_seen_fsid() is called after each delete to keep only one fd for each fs. In the end, seen_fsid_hash will be traversed and SYNC is issued on each fs. Signed-off-by: Tomohiro Misono--- cmds-subvolume.c | 77 1 file changed, 56 insertions(+), 21 deletions(-) diff --git a/cmds-subvolume.c b/cmds-subvolume.c index 666f6e0..bcbd737 100644 --- a/cmds-subvolume.c +++ b/cmds-subvolume.c @@ -263,6 +263,10 @@ static int cmd_subvol_delete(int argc, char **argv) DIR *dirstream = NULL; int verbose = 0; int commit_mode = 0; + u8 fsid[BTRFS_FSID_SIZE]; + char uuidbuf[BTRFS_UUID_UNPARSED_SIZE]; + struct seen_fsid *seen_fsid_hash[SEEN_FSID_HASH_SIZE] = {NULL,}; + enum { COMMIT_AFTER = 1, COMMIT_EACH = 2 }; while (1) { int c; @@ -279,10 +283,10 @@ static int cmd_subvol_delete(int argc, char **argv) switch(c) { case 'c': - commit_mode = 1; + commit_mode = COMMIT_AFTER; break; case 'C': - commit_mode = 2; + commit_mode = COMMIT_EACH; break; case 'v': verbose++; @@ -298,7 +302,7 @@ static int cmd_subvol_delete(int argc, char **argv) if (verbose > 0) { printf("Transaction commit: %s\n", !commit_mode ? "none (default)" : - commit_mode == 1 ? "at the end" : "after each"); + commit_mode == COMMIT_AFTER ? "at the end" : "after each"); } cnt = optind; @@ -338,50 +342,81 @@ again: } printf("Delete subvolume (%s): '%s/%s'\n", - commit_mode == 2 || (commit_mode == 1 && cnt + 1 == argc) - ? "commit" : "no-commit", dname, vname); + commit_mode == COMMIT_EACH ? "commit" : "no-commit", dname, vname); memset(, 0, sizeof(args)); strncpy_null(args.name, vname); res = ioctl(fd, BTRFS_IOC_SNAP_DESTROY, ); - if(res < 0 ){ + if (res < 0) { error("cannot delete '%s/%s': %s", dname, vname, strerror(errno)); ret = 1; goto out; } - if (commit_mode == 1) { + if (commit_mode == COMMIT_EACH) { res = wait_for_commit(fd); if (res < 0) { - error("unable to wait for commit after '%s': %s", + error("unable to wait for commit after deleting '%s': %s", path, strerror(errno)); ret = 1; } + } else if (commit_mode == COMMIT_AFTER) { + res = get_fsid(dname, fsid, 0); + if (res < 0) { + error("unable to get fsid for '%s': %s", path, strerror(-res)); + error("delete suceeded but commit may not be done in the end"); + ret = 1; + goto out; + } + + if (add_seen_fsid(fsid, seen_fsid_hash, fd, dirstream) == 0) { + if (verbose > 0) { + uuid_unparse(fsid, uuidbuf); + printf(" new fs is found for '%s', fsid: %s\n", path, uuidbuf); + } + // this is the first time a subvolume on this filesystem is deleted + // keep fd in order to issue SYNC ioctl in the end + goto keep_fd; + } } out: + close_file_or_dir(fd, dirstream); +keep_fd: + fd = -1; + dirstream = NULL; free(dupdname); free(dupvname); dupdname = NULL; dupvname = NULL; cnt++; - if (cnt < argc) { - close_file_or_dir(fd, dirstream); - /* avoid double free */ - fd = -1; - dirstream = NULL; + if (cnt < argc) goto again; - } - if (commit_mode == 2 && fd != -1) { - res = wait_for_commit(fd); - if (res < 0) { - error("unable to do final sync after deletion: %s", - strerror(errno)); - ret = 1; + if (commit_mode == COMMIT_AFTER) { + // traverse seen_fsid_hash and issue SYNC ioctl on each filesystem + int slot; + struct seen_fsid *seen; + + for (slot = 0; slot < SEEN_FSID_HASH_SIZE; slot++) { + seen =
[PATCH 2/4] btrfs-progs: move seen_fsid to util.c
Make is_seen_fsid()/add_seen_fsid()/free_seen_fsid() to common functions. This will be used for 'subvol delete --commit-after'. Signed-off-by: Tomohiro Misono--- cmds-filesystem.c | 88 --- utils.c | 70 +++ utils.h | 11 +++ 3 files changed, 86 insertions(+), 83 deletions(-) diff --git a/cmds-filesystem.c b/cmds-filesystem.c index 018857c..c7dae40 100644 --- a/cmds-filesystem.c +++ b/cmds-filesystem.c @@ -30,7 +30,6 @@ #include "kerncompat.h" #include "ctree.h" -#include "ioctl.h" #include "utils.h" #include "volumes.h" #include "commands.h" @@ -43,85 +42,8 @@ * for btrfs fi show, we maintain a hash of fsids we've already printed. * This way we don't print dups if a given FS is mounted more than once. */ -#define SEEN_FSID_HASH_SIZE 256 - -struct seen_fsid { - u8 fsid[BTRFS_FSID_SIZE]; - struct seen_fsid *next; -}; - static struct seen_fsid *seen_fsid_hash[SEEN_FSID_HASH_SIZE] = {NULL,}; -static int is_seen_fsid(u8 *fsid) -{ - u8 hash = fsid[0]; - int slot = hash % SEEN_FSID_HASH_SIZE; - struct seen_fsid *seen = seen_fsid_hash[slot]; - - while (seen) { - if (memcmp(seen->fsid, fsid, BTRFS_FSID_SIZE) == 0) - return 1; - - seen = seen->next; - } - - return 0; -} - -static int add_seen_fsid(u8 *fsid) -{ - u8 hash = fsid[0]; - int slot = hash % SEEN_FSID_HASH_SIZE; - struct seen_fsid *seen = seen_fsid_hash[slot]; - struct seen_fsid *alloc; - - if (!seen) - goto insert; - - while (1) { - if (memcmp(seen->fsid, fsid, BTRFS_FSID_SIZE) == 0) - return -EEXIST; - - if (!seen->next) - break; - - seen = seen->next; - } - -insert: - - alloc = malloc(sizeof(*alloc)); - if (!alloc) - return -ENOMEM; - - alloc->next = NULL; - memcpy(alloc->fsid, fsid, BTRFS_FSID_SIZE); - - if (seen) - seen->next = alloc; - else - seen_fsid_hash[slot] = alloc; - - return 0; -} - -static void free_seen_fsid(void) -{ - int slot; - struct seen_fsid *seen; - struct seen_fsid *next; - - for (slot = 0; slot < SEEN_FSID_HASH_SIZE; slot++) { - seen = seen_fsid_hash[slot]; - while (seen) { - next = seen->next; - free(seen); - seen = next; - } - seen_fsid_hash[slot] = NULL; - } -} - static const char * const filesystem_cmd_group_usage[] = { "btrfs filesystem [] []", NULL @@ -355,7 +277,7 @@ static void print_one_uuid(struct btrfs_fs_devices *fs_devices, u64 devs_found = 0; u64 total; - if (add_seen_fsid(fs_devices->fsid)) + if (add_seen_fsid(fs_devices->fsid, seen_fsid_hash)) return; uuid_unparse(fs_devices->fsid, uuidbuf); @@ -402,7 +324,7 @@ static int print_one_fs(struct btrfs_ioctl_fs_info_args *fs_info, struct btrfs_ioctl_dev_info_args *tmp_dev_info; int ret; - ret = add_seen_fsid(fs_info->fsid); + ret = add_seen_fsid(fs_info->fsid, seen_fsid_hash); if (ret == -EEXIST) return 0; else if (ret) @@ -474,7 +396,7 @@ static int btrfs_scan_kernel(void *search, unsigned unit_mode) goto out; /* skip all fs already shown as mounted fs */ - if (is_seen_fsid(fs_info_arg.fsid)) + if (is_seen_fsid(fs_info_arg.fsid, seen_fsid_hash)) continue; ret = get_label_mounted(mnt->mnt_dir, label); @@ -676,7 +598,7 @@ static int search_umounted_fs_uuids(struct list_head *all_uuids, } /* skip all fs already shown as mounted fs */ - if (is_seen_fsid(cur_fs->fsid)) + if (is_seen_fsid(cur_fs->fsid, seen_fsid_hash)) continue; fs_copy = calloc(1, sizeof(*fs_copy)); @@ -908,7 +830,7 @@ devs_only: free_fs_devices(fs_devices); } out: - free_seen_fsid(); + free_seen_fsid(seen_fsid_hash); return ret; } diff --git a/utils.c b/utils.c index 4a5dc60..f91d41e 100644 --- a/utils.c +++ b/utils.c @@ -1788,6 +1788,76 @@ out: return ret; } +int is_seen_fsid(u8 *fsid, struct seen_fsid *seen_fsid_hash[]) +{ + u8 hash = fsid[0]; + int slot = hash % SEEN_FSID_HASH_SIZE; + struct seen_fsid *seen = seen_fsid_hash[slot]; + + while (seen) { + if (memcmp(seen->fsid, fsid, BTRFS_FSID_SIZE) == 0) + return 1; + + seen = seen->next; + } + + return 0; +} + +int
[PATCH 1/4] btrfs-progs: move get_fsid() to util.c
Make get_fsid() to a common function. This will be used for 'subvol delete --commit-after'. Signed-off-by: Tomohiro Misono--- cmds-property.c | 30 -- utils.c | 31 +++ utils.h | 1 + 3 files changed, 32 insertions(+), 30 deletions(-) diff --git a/cmds-property.c b/cmds-property.c index 9ae1246..03bafa0 100644 --- a/cmds-property.c +++ b/cmds-property.c @@ -48,36 +48,6 @@ static int parse_prop(const char *arg, const struct prop_handler *props, return -1; } -static int get_fsid(const char *path, u8 *fsid, int silent) -{ - int ret; - int fd; - struct btrfs_ioctl_fs_info_args args; - - fd = open(path, O_RDONLY); - if (fd < 0) { - ret = -errno; - if (!silent) - error("failed to open %s: %s", path, - strerror(-ret)); - goto out; - } - - ret = ioctl(fd, BTRFS_IOC_FS_INFO, ); - if (ret < 0) { - ret = -errno; - goto out; - } - - memcpy(fsid, args.fsid, BTRFS_FSID_SIZE); - ret = 0; - -out: - if (fd != -1) - close(fd); - return ret; -} - static int check_btrfs_object(const char *object) { int ret; diff --git a/utils.c b/utils.c index 7a2710f..4a5dc60 100644 --- a/utils.c +++ b/utils.c @@ -1758,6 +1758,37 @@ out: return ret; } +int get_fsid(const char *path, u8 *fsid, int silent) +{ + int ret; + int fd; + struct btrfs_ioctl_fs_info_args args; + + fd = open(path, O_RDONLY); + if (fd < 0) { + ret = -errno; + if (!silent) + error("failed to open %s: %s", path, + strerror(-ret)); + goto out; + } + + ret = ioctl(fd, BTRFS_IOC_FS_INFO, ); + if (ret < 0) { + ret = -errno; + goto out; + } + + memcpy(fsid, args.fsid, BTRFS_FSID_SIZE); + ret = 0; + +out: + if (fd != -1) + close(fd); + return ret; +} + + static int group_profile_devs_min(u64 flag) { switch (flag & BTRFS_BLOCK_GROUP_PROFILE_MASK) { diff --git a/utils.h b/utils.h index d28a05a..b3aabe1 100644 --- a/utils.h +++ b/utils.h @@ -100,6 +100,7 @@ int open_file_or_dir3(const char *fname, DIR **dirstream, int open_flags); void close_file_or_dir(int fd, DIR *dirstream); int get_fs_info(const char *path, struct btrfs_ioctl_fs_info_args *fi_args, struct btrfs_ioctl_dev_info_args **di_ret); +int get_fsid(const char *path, u8 *fsid, int silent); int get_label(const char *btrfs_dev, char *label); int set_label(const char *btrfs_dev, const char *label); -- 2.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/4] btrfs-progs: subvol: fix del --commit-after
Fix subvol del --commit-after to work properly: - SYNC ioctl will be issued even when last delete is failed - SYNC ioctl will be issued on each file system only once in the end To achieve this, each deleted subvol's (parent's) fsid is checked each time. If the fsid is seen for the first time, its fd will be kept in order to issue SYNC ioctl in the end. There already exists get_fsid() in cmds-property.c and seen_fsid which keeps fsid in hush function in cmds-filesystem.c. First three patches make them to common functions and last one is the main part. Tomohiro Misono (4): btrfs-progs: move get_fsid() to util.c btrfs-progs: move seen_fsid to util.c btrfs-progs: change seen_fsid to hold fd and DIR* btrfs-progs: subvol: fix subvol del --commit-after cmds-filesystem.c | 88 +++-- cmds-property.c | 30 cmds-subvolume.c | 77 --- utils.c | 105 ++ utils.h | 15 5 files changed, 181 insertions(+), 134 deletions(-) -- 2.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: question: behavior of original check
On 2017年09月26日 11:01, Su Yue wrote: Hi During writing patches about check, I found something confusing about btrfs-progs original check. Setup: Version: Btrfs progs v4.13.1 $ make TEST=006\* test-fsck [TEST] fsck-tests.sh [TEST/fsck] 006-bad-root-items $ cat tests/fsck-tests-results.txt | tail -n 25 ### .../btrfs-progs/btrfs check test.img checking extents ref mismatch on [15474688 905216] extent item 1, found 4 Backref 15474688 parent 30883840 owner 0 offset 0 num_refs 0 not found in extent tree Incorrect local backref count on 15474688 parent 30883840 owner 0 offset 0 found 1 wanted 0 back 0x561cdc0b2770 Backref 15474688 parent 31326208 owner 0 offset 0 num_refs 0 not found in extent tree Incorrect local backref count on 15474688 parent 31326208 owner 0 offset 0 found 1 wanted 0 back 0x561cdc0b5390 Backref 15474688 parent 31817728 owner 0 offset 0 num_refs 0 not found in extent tree Incorrect local backref count on 15474688 parent 31817728 owner 0 offset 0 found 1 wanted 0 back 0x561cdc0b5330 backpointer mismatch on [15474688 905216] checking free space cache checking fs roots checking csums checking root refs Checking filesystem on test.img UUID: 3857c23d-4219-4600-a636-ac7707dc4ff3 cache and super generation don't match, space cache will be invalidated found 6291456 bytes used, *no error* found total csum bytes: 660 total tree bytes: 786432 total fs tree bytes: 688128 total extent tree bytes: 16384 btree space waste bytes: 459860 file data blocks allocated: 35536896 referenced 25890816 It seems that errors in extent-tree are not important so btrfs-progs just returns 0. It's a bug that ignored the error found in extent tree. It's exposed quite a long time ago IIRC. Is this behavior normal(on purpose)? Definitely not. I have written a patch to fix it but then test-fsck/006 will fail due to nonzero value progs returned. The problem is that btrfs check doesn't repair the bug. You would better also fix the repair code. Thanks, Qu Thanks, Su -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: question: behavior of original check
On 2017年09月26日 11:01, Su Yue wrote: Hi During writing patches about check, I found something confusing about btrfs-progs original check. Setup: Version: Btrfs progs v4.13.1 $ make TEST=006\* test-fsck [TEST] fsck-tests.sh [TEST/fsck] 006-bad-root-items $ cat tests/fsck-tests-results.txt | tail -n 25 ### .../btrfs-progs/btrfs check test.img checking extents ref mismatch on [15474688 905216] extent item 1, found 4 Backref 15474688 parent 30883840 owner 0 offset 0 num_refs 0 not found in extent tree Incorrect local backref count on 15474688 parent 30883840 owner 0 offset 0 found 1 wanted 0 back 0x561cdc0b2770 Backref 15474688 parent 31326208 owner 0 offset 0 num_refs 0 not found in extent tree Incorrect local backref count on 15474688 parent 31326208 owner 0 offset 0 found 1 wanted 0 back 0x561cdc0b5390 Backref 15474688 parent 31817728 owner 0 offset 0 num_refs 0 not found in extent tree Incorrect local backref count on 15474688 parent 31817728 owner 0 offset 0 found 1 wanted 0 back 0x561cdc0b5330 backpointer mismatch on [15474688 905216] checking free space cache checking fs roots checking csums checking root refs Checking filesystem on test.img UUID: 3857c23d-4219-4600-a636-ac7707dc4ff3 cache and super generation don't match, space cache will be invalidated found 6291456 bytes used, *no error* found total csum bytes: 660 total tree bytes: 786432 total fs tree bytes: 688128 total extent tree bytes: 16384 btree space waste bytes: 459860 file data blocks allocated: 35536896 referenced 25890816 It seems that errors in extent-tree are not important so btrfs-progs just returns 0. It's a bug that ignored the error found in extent tree. It's exposed quite a long time ago IIRC. Is this behavior normal(on purpose)? Definitely not. I have written a patch to fix it but then test-fsck/006 will fail due to nonzero value progs returned. The problem is that btrfs check doesn't repair the bug. You would better also fix the repair code. Thanks, Qu Thanks, Su -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
question: behavior of original check
Hi During writing patches about check, I found something confusing about btrfs-progs original check. Setup: Version: Btrfs progs v4.13.1 $ make TEST=006\* test-fsck [TEST] fsck-tests.sh [TEST/fsck] 006-bad-root-items $ cat tests/fsck-tests-results.txt | tail -n 25 ### .../btrfs-progs/btrfs check test.img checking extents ref mismatch on [15474688 905216] extent item 1, found 4 Backref 15474688 parent 30883840 owner 0 offset 0 num_refs 0 not found in extent tree Incorrect local backref count on 15474688 parent 30883840 owner 0 offset 0 found 1 wanted 0 back 0x561cdc0b2770 Backref 15474688 parent 31326208 owner 0 offset 0 num_refs 0 not found in extent tree Incorrect local backref count on 15474688 parent 31326208 owner 0 offset 0 found 1 wanted 0 back 0x561cdc0b5390 Backref 15474688 parent 31817728 owner 0 offset 0 num_refs 0 not found in extent tree Incorrect local backref count on 15474688 parent 31817728 owner 0 offset 0 found 1 wanted 0 back 0x561cdc0b5330 backpointer mismatch on [15474688 905216] checking free space cache checking fs roots checking csums checking root refs Checking filesystem on test.img UUID: 3857c23d-4219-4600-a636-ac7707dc4ff3 cache and super generation don't match, space cache will be invalidated found 6291456 bytes used, *no error* found total csum bytes: 660 total tree bytes: 786432 total fs tree bytes: 688128 total extent tree bytes: 16384 btree space waste bytes: 459860 file data blocks allocated: 35536896 referenced 25890816 It seems that errors in extent-tree are not important so btrfs-progs just returns 0. Is this behavior normal(on purpose)? I have written a patch to fix it but then test-fsck/006 will fail due to nonzero value progs returned. Thanks, Su -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs: rework outstanding_extents
On Mon, Sep 25, 2017 at 04:00:09PM +0200, David Sterba wrote: >On Thu, Aug 24, 2017 at 05:31:15PM +0800, Lu Fengqi wrote: >> On Thu, Aug 17, 2017 at 01:01:29PM -0400, jo...@toxicpanda.com wrote: >> >From: Josef Bacik>> > >> >Right now we do a lot of weird hoops around outstanding_extents in order >> >to keep the extent count consistent. This is because we logically >> >transfer the outstanding_extent count from the initial reservation >> >through the set_delalloc_bits. This makes it pretty difficult to get a >> >handle on how and when we need to mess with outstanding_extents. >> > >> >Fix this by revamping the rules of how we deal with outstanding_extents. >> >Now instead everybody that is holding on to a delalloc extent is >> >required to increase the outstanding extents count for itself. This >> >means we'll have something like this >> > >> >btrfs_dealloc_reserve_metadata - outstanding_extents = 1 >> > btrfs_set_delalloc - outstanding_extents = 2 >> >btrfs_release_delalloc_extents - outstanding_extents = 1 >> > >> >for an initial file write. Now take the append write where we extend an >> >existing delalloc range but still under the maximum extent size >> > >> >btrfs_delalloc_reserve_metadata - outstanding_extents = 2 >> > btrfs_set_delalloc >> >btrfs_set_bit_hook - outstanding_extents = 3 >> >btrfs_merge_bit_hook- outstanding_extents = 2 >> >btrfs_release_delalloc_extents - outstanding_extnets = 1 >> > >> >In order to make the ordered extent transition we of course must now >> >make ordered extents carry their own outstanding_extent reservation, so >> >for cow_file_range we end up with >> > >> >btrfs_add_ordered_extent- outstanding_extents = 2 >> >clear_extent_bit- outstanding_extents = 1 >> >btrfs_remove_ordered_extent - outstanding_extents = 0 >> > >> >This makes all manipulations of outstanding_extents much more explicit. >> >Every successful call to btrfs_reserve_delalloc_metadata _must_ now be >> >combined with btrfs_release_delalloc_extents, even in the error case, as >> >that is the only function that actually modifies the >> >outstanding_extents counter. >> >> s/btrfs_reserve_delalloc_metadata/btrfs_delalloc_reserve_metadata >> s/btrfs_release_delalloc_extents/btrfs_delalloc_release_extents >> >> Acked-by: Lu Fengqi > >BTW, > >Documentation/process/5.Posting.rst > >216 - Acked-by: indicates an agreement by another developer (often a >217maintainer of the relevant code) that the patch is appropriate for >218inclusion into the kernel. > >and is commonly used by maintainers when someone else modifies the >respective codebase by tree-wide, API or similar. > >If you agree with the way the patch is solving a particular problem and >think it's important enough to be stated, you can write that in plain >text. > > Sorry for any inconvenience caused. I misunderstood about the meaning of Acked-by. I just want to express that it looks good to me. -- Thanks, Lu -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 3/4] btrfs: Add sanity check for EXTENT_DATA when reading out leaf
On 2017年09月26日 08:28, Qu Wenruo wrote: On 2017年09月25日 23:45, David Sterba wrote: On Wed, Aug 23, 2017 at 04:57:58PM +0900, Qu Wenruo wrote: Add extra checker for item with EXTENT_DATA type. This checks the following thing: 0) Key offset All key offset must be aligned to sectorsize. Inline extent must have 0 for key offset. 1) Item size Plain text inline file extent size must match item size. 'plain text' seems to be a bit misleading, I don't think we've ever referred to uncompressed extent as such, although it makes some sense. I think 'uncompressed' would work too. I'll use 'uncompressed' instead. (compressed inline file extent has no info about its on-disk size) Regular/preallocated file extent size must be a fixed value. 2) Every member of regular file extent item Including alignment for bytenr and offset, possible value for compression/encryption/type. 3) Type/compression/encode must be one of the valid values. This should be the most comprehensive and restrict check in the context of btrfs_item for EXTENT_DATA. Signed-off-by: Qu Wenruo--- fs/btrfs/disk-io.c | 108 include/uapi/linux/btrfs_tree.h | 1 + 2 files changed, 109 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e034d08bd036..b92296c6a698 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -549,6 +549,103 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, btrfs_header_level(eb) == 0 ? "leaf" : "node", \ reason, btrfs_header_bytenr(eb), root->objectid, slot) +static int check_extent_data_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_file_extent_item *fi; + u32 sectorsize = root->fs_info->sectorsize; + u32 item_size = btrfs_item_size_nr(leaf, slot); + + if (!IS_ALIGNED(key->offset, sectorsize)) { + CORRUPT("unaligned key offset for file extent", The CORRUPT macro does not print any details beyond what it gets from the parameters, so here we'd like to know which extent it is and what's the size. The sectorsize can be found elsewhere so it does not need to be printed. Did you mean, the corrupt can be enhanced just like btrfs_printk() to have custom format and args list? Thanks, Qu Did you mean despite bytenr of the tree block and root objectid, we should output more info about the key? By "size" did you mean the disk_num_bytes of the extent? But since the offset is already invalid, I don't think we can trust any data from that structure, so what we can output is only about the key. I can enhance the output to output the key, but I'm afraid I can't output anything more than that. + leaf, root, slot); + return -EUCLEAN; + } + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) >= BTRFS_FILE_EXTENT_LAST_TYPE) { + CORRUPT("invalid file extent type", leaf, root, slot); "invalid file extent type %d" and actually, we could add some item-specific helpers to report the corruption so if it's for an extent, print the generic extent information, plus an additional information what exactly was wrong. For additional info I'm completely fine. But I'm not sure if it's good to output more info about the item. The biggest problem is, when any member of the item can't pass validation checker, we can't trust the whole item. So the item-specific info may not contain meaningful info. + return -EUCLEAN; + } + + /* + * Support for new compression/encrption must introduce incompat flag, + * and must be caught in open_ctree(). + */ + if (btrfs_file_extent_compression(leaf, fi) >= BTRFS_COMPRESS_LAST) { + CORRUPT("invalid file extent compression", leaf, root, slot); + return -EUCLEAN; + } + if (btrfs_file_extent_encryption(leaf, fi)) { + CORRUPT("invalid file extent encryption", leaf, root, slot); + return -EUCLEAN; + } + if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { + /* Inline extent must have 0 as key offset */ + if (key->offset) { + CORRUPT("inline extent has non-zero key offset", + leaf, root, slot); + return -EUCLEAN; + } + + /* Compressed inline extent has no on-disk size, skip it */ + if (btrfs_file_extent_compression(leaf, fi) != + BTRFS_COMPRESS_NONE) + return 0; + + /* Plaintext inline extent size must match item size */ + if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + + btrfs_file_extent_ram_bytes(leaf, fi)) { + CORRUPT("plaintext inline extent has invalid size", + leaf, root, slot); + return -EUCLEAN; + } + return 0; + } + + + /*
Re: [PATCH v2 3/4] btrfs: Add sanity check for EXTENT_DATA when reading out leaf
On 2017年09月25日 23:45, David Sterba wrote: On Wed, Aug 23, 2017 at 04:57:58PM +0900, Qu Wenruo wrote: Add extra checker for item with EXTENT_DATA type. This checks the following thing: 0) Key offset All key offset must be aligned to sectorsize. Inline extent must have 0 for key offset. 1) Item size Plain text inline file extent size must match item size. 'plain text' seems to be a bit misleading, I don't think we've ever referred to uncompressed extent as such, although it makes some sense. I think 'uncompressed' would work too. I'll use 'uncompressed' instead. (compressed inline file extent has no info about its on-disk size) Regular/preallocated file extent size must be a fixed value. 2) Every member of regular file extent item Including alignment for bytenr and offset, possible value for compression/encryption/type. 3) Type/compression/encode must be one of the valid values. This should be the most comprehensive and restrict check in the context of btrfs_item for EXTENT_DATA. Signed-off-by: Qu Wenruo--- fs/btrfs/disk-io.c | 108 include/uapi/linux/btrfs_tree.h | 1 + 2 files changed, 109 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e034d08bd036..b92296c6a698 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -549,6 +549,103 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, btrfs_header_level(eb) == 0 ? "leaf" : "node", \ reason, btrfs_header_bytenr(eb), root->objectid, slot) +static int check_extent_data_item(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_file_extent_item *fi; + u32 sectorsize = root->fs_info->sectorsize; + u32 item_size = btrfs_item_size_nr(leaf, slot); + + if (!IS_ALIGNED(key->offset, sectorsize)) { + CORRUPT("unaligned key offset for file extent", The CORRUPT macro does not print any details beyond what it gets from the parameters, so here we'd like to know which extent it is and what's the size. The sectorsize can be found elsewhere so it does not need to be printed. Did you mean despite bytenr of the tree block and root objectid, we should output more info about the key? By "size" did you mean the disk_num_bytes of the extent? But since the offset is already invalid, I don't think we can trust any data from that structure, so what we can output is only about the key. I can enhance the output to output the key, but I'm afraid I can't output anything more than that. + leaf, root, slot); + return -EUCLEAN; + } + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, fi) >= BTRFS_FILE_EXTENT_LAST_TYPE) { + CORRUPT("invalid file extent type", leaf, root, slot); "invalid file extent type %d" and actually, we could add some item-specific helpers to report the corruption so if it's for an extent, print the generic extent information, plus an additional information what exactly was wrong. For additional info I'm completely fine. But I'm not sure if it's good to output more info about the item. The biggest problem is, when any member of the item can't pass validation checker, we can't trust the whole item. So the item-specific info may not contain meaningful info. + return -EUCLEAN; + } + + /* +* Support for new compression/encrption must introduce incompat flag, +* and must be caught in open_ctree(). +*/ + if (btrfs_file_extent_compression(leaf, fi) >= BTRFS_COMPRESS_LAST) { + CORRUPT("invalid file extent compression", leaf, root, slot); + return -EUCLEAN; + } + if (btrfs_file_extent_encryption(leaf, fi)) { + CORRUPT("invalid file extent encryption", leaf, root, slot); + return -EUCLEAN; + } + if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { + /* Inline extent must have 0 as key offset */ + if (key->offset) { + CORRUPT("inline extent has non-zero key offset", + leaf, root, slot); + return -EUCLEAN; + } + + /* Compressed inline extent has no on-disk size, skip it */ + if (btrfs_file_extent_compression(leaf, fi) != + BTRFS_COMPRESS_NONE) + return 0; + + /* Plaintext inline extent size must match item size */ + if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + + btrfs_file_extent_ram_bytes(leaf, fi)) { + CORRUPT("plaintext inline extent has invalid size", +
[PATCH] New ioctl BTRFS_IOC_GET_CHUNK_INFO
From: Goffredo BaroncelliAdd a new ioctl to get info about chunk without requiring the root privileges. This allow to a non root user to know how the space of the filesystem is allocated. Signed-off-by: Goffredo Baroncelli --- fs/btrfs/ioctl.c | 215 + include/uapi/linux/btrfs.h | 38 2 files changed, 253 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fa1b78cf25f6..d9ba9ed52693 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2220,6 +2220,219 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file, return ret; } +/* + * Return: + * 0 -> copied all data, possible further data + * 1 -> copied all data, no further data + * -EAGAIN -> not enough space, restart it + * -EFAULT -> the user passed an invalid address/size pair + */ +static noinline int copy_chunk_info(struct btrfs_path *path, + char __user *ubuf, + size_t buf_size, + u64 *used_buf, + int *num_found, + u64 *offset) +{ + struct extent_buffer *leaf; + unsigned long item_off; + unsigned long item_len; + int nritems; + int i; + int slot; + int ret = 0; + struct btrfs_key key; + + leaf = path->nodes[0]; + slot = path->slots[0]; + nritems = btrfs_header_nritems(leaf); + + for (i = slot; i < nritems; i++) { + u64 destsize; + struct btrfs_chunk_info ci; + struct btrfs_chunk chunk; + int j, chunk_size; + + item_off = btrfs_item_ptr_offset(leaf, i); + item_len = btrfs_item_size_nr(leaf, i); + + btrfs_item_key_to_cpu(leaf, , i); + /* +* we are not interested in other items type +*/ + if (key.type != BTRFS_CHUNK_ITEM_KEY) { + ret = 1; + goto out; + } + + /* +* In any case, the next search must start from here +*/ + *offset = key.offset; + read_extent_buffer(leaf, , item_off, sizeof(chunk)); + + /* +* chunk.num_stripes-1 is correct, because btrfs_chunk includes +* already a stripe +*/ + destsize = sizeof(struct btrfs_chunk_info) + + (chunk.num_stripes - 1) * sizeof(struct btrfs_stripe); + + BUG_ON(destsize > item_len); + + if (buf_size < destsize + *used_buf) { + if (*num_found) + /* try onother time */ + ret = -EAGAIN; + else + /* in any case the buffer is too small */ + ret = -EOVERFLOW; + goto out; + } + + /* copy chunk */ + chunk_size = offsetof(struct btrfs_chunk_info, stripes); + memset(, 0, chunk_size); + ci.length = btrfs_stack_chunk_length(); + ci.stripe_len = btrfs_stack_chunk_stripe_len(); + ci.type = btrfs_stack_chunk_type(); + ci.num_stripes = btrfs_stack_chunk_num_stripes(); + ci.sub_stripes = btrfs_stack_chunk_sub_stripes(); + ci.offset = key.offset; + + if (copy_to_user(ubuf + *used_buf, , chunk_size)) { + ret = -EFAULT; + goto out; + } + *used_buf += chunk_size; + + /* copy stripes */ + for (j = 0 ; j < chunk.num_stripes ; j++) { + struct btrfs_stripe chunk_stripe; + struct btrfs_chunk_info_stripe csi; + + /* +* j-1 is correct, because btrfs_chunk includes already +* a stripe +*/ + read_extent_buffer(leaf, _stripe, + item_off + sizeof(struct btrfs_chunk) + + sizeof(struct btrfs_stripe) * + (j - 1), sizeof(chunk_stripe)); + + memset(, 0, sizeof(csi)); + + csi.devid = btrfs_stack_stripe_devid(_stripe); + csi.offset = btrfs_stack_stripe_offset(_stripe); + memcpy(csi.dev_uuid, chunk_stripe.dev_uuid, + sizeof(chunk_stripe.dev_uuid)); + if (copy_to_user(ubuf + *used_buf, , sizeof(csi))) { + ret = -EFAULT; +
[RFC] btrfs: new ioctl BTRFS_IOC_GET_CHUNK_INFO
The aim of this patch is to replace the BTRFS_IOC_TREE_SEARCH ioctl when it is used to obtain information about the chunks/block groups. The problems in using the BTRFS_IOC_TREE_SEARCH is that it access the very low data structure of BTRFS. This means: 1) this would be complicated a possible change of the disk format 2) it requires the root privileges The new BTRFS_IOC_GET_CHUNK_INFO is an attempt to address these issue. This ioctl can be called from a not root user: I think that the data exposed are not sensibile data. In a separate patches set, there is an update to the btrfs utility to allow it to use this new ioctl int the load_chunk_info() function. So now it is possible to do "btrfs fi us" without root privileges. This is an RFC, because I want to be sure that 1) it is right my understanding that the chunk info are not a "sensibile data" 2) the api is a good api. In particular I am not sure that returning -EAGAIN is the right thing to do when further data is available but the buffer is not enough. Comments are welcome BR G.Baroncelli -- gpg @keyserver.linux.it: Goffredo Baroncelli Key fingerprint BBF5 1610 0B64 DAC6 5F7D 17B2 0EDA 9B37 8B82 E0B5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/2] Use the new ioctl BTRFS_IOC_GET_CHUNK_INFO for load_chunk_info()
From: Goffredo BaroncelliUse the new ioctl BTRFS_IOC_GET_CHUNK_INFO in load_chunk_info() instead of the BTRFS_IOC_TREE_SEARCH. The old method is still present as fallback for compatibility reason. The goal is to avoid BTRFS_IOC_GET_CHUNK_INFO because it requires root privileges. Signed-off-by: Goffredo Baroncelli --- cmds-fi-usage.c | 108 ++-- ioctl.h | 60 +++ 2 files changed, 165 insertions(+), 3 deletions(-) diff --git a/cmds-fi-usage.c b/cmds-fi-usage.c index 6c846c15..6a101e28 100644 --- a/cmds-fi-usage.c +++ b/cmds-fi-usage.c @@ -36,7 +36,7 @@ /* * Add the chunk info to the chunk_info list */ -static int add_info_to_list(struct chunk_info **info_ptr, +static int legacy_add_info_to_list(struct chunk_info **info_ptr, int *info_count, struct btrfs_chunk *chunk) { @@ -127,7 +127,8 @@ static int cmp_chunk_info(const void *a, const void *b) ((struct chunk_info *)b)->type); } -static int load_chunk_info(int fd, struct chunk_info **info_ptr, int *info_count) +static int legacy_load_chunk_info(int fd, struct chunk_info **info_ptr, + int *info_count) { int ret; struct btrfs_ioctl_search_args args; @@ -180,7 +181,7 @@ static int load_chunk_info(int fd, struct chunk_info **info_ptr, int *info_count off += sizeof(*sh); item = (struct btrfs_chunk *)(args.buf + off); - ret = add_info_to_list(info_ptr, info_count, item); + ret = legacy_add_info_to_list(info_ptr, info_count, item); if (ret) { *info_ptr = NULL; return 1; @@ -213,6 +214,107 @@ static int load_chunk_info(int fd, struct chunk_info **info_ptr, int *info_count return 0; } +/* + * Add the chunk info to the chunk_info list + */ +static int add_info_to_list(struct chunk_info **info_ptr, + int *info_count, + struct btrfs_chunk_info *chunk) +{ + + u64 type = chunk->type; + u64 size = chunk->length; + int num_stripes = chunk->num_stripes; + int j; + + for (j = 0 ; j < num_stripes ; j++) { + int i; + struct chunk_info *p = NULL; + u64devid; + + devid = chunk->stripes[j].devid; + + for (i = 0 ; i < *info_count ; i++) + if ((*info_ptr)[i].type == type && + (*info_ptr)[i].devid == devid && + (*info_ptr)[i].num_stripes == num_stripes) { + p = (*info_ptr) + i; + break; + } + + if (!p) { + int tmp = sizeof(struct btrfs_chunk) * + (*info_count + 1); + struct chunk_info *res = realloc(*info_ptr, tmp); + + if (!res) { + free(*info_ptr); + error("not enough memory"); + return -ENOMEM; + } + + *info_ptr = res; + p = res + *info_count; + (*info_count)++; + + p->devid = devid; + p->type = type; + p->size = 0; + p->num_stripes = num_stripes; + } + + p->size += size; + + } + + return 0; + +} + +static int load_chunk_info(int fd, struct chunk_info **info_ptr, + int *info_count) +{ + + char buf[4096]; + struct btrfs_ioctl_chunk_info *bici = + (struct btrfs_ioctl_chunk_info *)buf; + int cont; + + bici->buf_size = sizeof(buf); + bici->offset = (u64)0; + + do { + int i; + struct btrfs_chunk_info *ci; + int ret; + + cont = false; + ret = ioctl(fd, BTRFS_IOC_GET_CHUNK_INFO, bici); + if (ret < 0) { + int e = errno; + + if (e == ENOTTY) + return legacy_load_chunk_info(fd, info_ptr, + info_count); + else if (e == EAGAIN) + cont = true; + else + return -e; + } + + ci = btrfs_first_chunk_info(bici); + for (i = 0 ; i < bici->items_count ; i++) { + add_info_to_list(info_ptr, info_count, ci); + ci = btrfs_next_chunk_info(ci); +
[RFC] btrfs-progs: use the new ioctl BTRFS_IOC_GET_CHUNK_INFO
Hi all, this patches set uses the new ioctl BTRFS_IOC_GET_CHUNK_INFO to get information about the chunk (see my other emails). The first patch change the function get_partition_size() to avoid ioctl which would require root privileges (BLKGETSIZE64). Instead it obtain the information via the sysfs filesystem. The second patch use the BTRFS_IOC_GET_CHUNK_INFO instead of TREE_SEARCH_IOCTL. The old code is still in place as fallback when the kernel doesn't have the new ioctl. These patches allow to use "btrfs fi usage" without root privileges. Comments are welcome BR G.Baroncelli -- gpg @keyserver.linux.it: Goffredo Baroncelli Key fingerprint BBF5 1610 0B64 DAC6 5F7D 17B2 0EDA 9B37 8B82 E0B5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/2] get_partition_size() doens't require root privileges
From: Goffredo BaroncelliAllow the get_partition_size() to be called without root privileges. Signed-off-by: Goffredo baroncelli --- utils.c | 27 +++ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/utils.c b/utils.c index 7a2710fe..ee3ed544 100644 --- a/utils.c +++ b/utils.c @@ -2080,18 +2080,29 @@ u64 disk_size(const char *path) u64 get_partition_size(const char *dev) { - u64 result; - int fd = open(dev, O_RDONLY); + struct stat statbuf; + int r; + int fd; + char buf[100]; - if (fd < 0) - return 0; - if (ioctl(fd, BLKGETSIZE64, ) < 0) { - close(fd); + r = stat(dev, ); + if (r != 0) return 0; - } + + snprintf(buf, sizeof(buf), + "/sys/dev/block/%d:%d/size", major(statbuf.st_rdev), +minor(statbuf.st_rdev)); + + fd = open(buf, O_RDONLY); + BUG_ON(fd < 0); + + r = read(fd, buf, sizeof(buf)-1); close(fd); - return result; + BUG_ON(r < 0); + buf[r] = 0; + + return atoll(buf) * 512; } /* -- 2.14.1 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Btrfs Issues
Hi Satoru, I'm so sorry for the late reply! I rewrote the test programs to invoke syscalls in the conventional way and added a few comments. Please find attached the revised report. As to your questions on "mmap", you are totally right that the first mmap in the original 1/min.cpp is irrelevant. The original program was automatically generated from a fuzzer I used and I hope the revised ones to be much more understandable. Thanks, Amy On Sun, Sep 17, 2017 at 11:10 PM, Satoru Takeuchiwrote: > At Wed, 13 Sep 2017 11:53:35 -0400, > Ruoxin Jiang wrote: >> >> [1 ] >> Hello, >> >> We are researchers from Columbia University, New York. As part of our >> current research we have found some semantic discrepancies between >> btrfs and other popular filesystems. >> >> We have attached two cases. The first one involves an invalid O_DIRECT >> write() that fails back to buffered write instead of failing with >> error EINVAL. In directory 2, we discovered that btrfs calculates >> write_bytes in __btrfs_buffered_write differently from that in >> generic_perform_writes in fs/mmap.c. This can cause inconsistent >> behavior between btrfs and other filesystems when program invokes the >> same writev/write() syscall. >> >> In each directory, you will find a Readme.md describing the issue and >> pointing to the code that may cause the problem. For your convenience, >> we also included test programs (min.cpp) and instructions in Readme to >> help reproduce the issues. >> >> We would appreciate very much if you could confirm the two cases at >> your conveniences. > > I took a look at your test programs, btrfs_issues/{1,2}/min.cpp. It looks > very hard to read since you call syscalls in odd ways and all flags are > hardcoded as literal hexadecimal numbers. Could rewrite these program > to improve readability? > > In addition, I have two questions about btrfs_issues/1/min.cpp. > > 1. Why you set 'filename' as the 1st argument of mmap()? > 2. What's the purpose of mmap() call? I guess mmap() is not related to issue > 1. > > Thanks, > Satoru > >> >> Thanks, >> Amy >> [2 btrfs_issues.tar.gz ] > -- > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- Ruoxin(Amy) Jiang Columbia University, M.S. Computer Science 347-277-5455 btrfs_issues_revised.tar.gz Description: GNU Zip compressed data
Re: [PATCH][v3] Btrfs: add a extent ref verify tool
On Fri, Sep 01, 2017 at 03:09:30AM -0400, jo...@toxicpanda.com wrote: > From: Josef Bacik> > We were having corruption issues that were tied back to problems with the > extent > tree. In order to track them down I built this tool to try and find the > culprit, which was pretty successful. If you compile with this tool on it > will > live verify every ref update that the fs makes and make sure it is consistent > and valid. I've run this through with xfstests and haven't gotten any false > positives. Thanks, > > Signed-off-by: Josef Bacik > --- > v2->v3: > - fix use after free in an error case Can you please split the patch into more parts? This is just too big. * the parameter changes, one per patch * add ref-veriy.* * new mount option and enabling the ref-verify Apart from the specific comments written inline, here's list of thing that I saw repeatd in several places: printk instead of the btrfs_* error helpers - the bare printk will not tell youw which filesystem is affected so it's not helpful when there are several btrfs filesytems active please don't split long strings please don't use %Lu or %Ld format string, %llu GFP_NOFS -- it's used on the open_ctree path so GFP_KERNEL is the right and safe flag misc small coding style issues I'm half way through reviewing it from the functional side, so far it looks good. > fs/btrfs/Kconfig | 10 + > fs/btrfs/Makefile |1 + > fs/btrfs/ctree.c |2 +- > fs/btrfs/ctree.h | 14 +- > fs/btrfs/disk-io.c | 15 + > fs/btrfs/extent-tree.c | 44 ++- > fs/btrfs/file.c| 10 +- > fs/btrfs/inode.c |9 +- > fs/btrfs/ioctl.c |2 +- > fs/btrfs/ref-verify.c | 1019 > > fs/btrfs/ref-verify.h | 51 +++ > fs/btrfs/relocation.c | 14 +- > fs/btrfs/super.c | 17 + > fs/btrfs/tree-log.c|2 +- > 14 files changed, 1178 insertions(+), 32 deletions(-) > create mode 100644 fs/btrfs/ref-verify.c > create mode 100644 fs/btrfs/ref-verify.h > > diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig > index 80e9c18..77d7f74 100644 > --- a/fs/btrfs/Kconfig > +++ b/fs/btrfs/Kconfig > @@ -89,3 +89,13 @@ config BTRFS_ASSERT > any of the assertions trip. This is meant for btrfs developers only. > > If unsure, say N. > + > +config BTRFS_FS_REF_VERIFY > + bool "Btrfs with the ref verify tool compiled in" > + depends on BTRFS_FS must be N by default > + help > + Enable run-time extent reference verification instrumentation. This > + is meant to be used by btrfs developers for tracking down extent > + reference problems or verifying they didn't break something. > + > + If unsure, say N. > diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile > index 128ce17..3172751 100644 > --- a/fs/btrfs/Makefile > +++ b/fs/btrfs/Makefile > @@ -13,6 +13,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o > root-tree.o dir-item.o \ > > btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o > btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o > +btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o > > btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ > tests/extent-buffer-tests.o tests/btrfs-tests.o \ > diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c > index 6d49db7..a4812ce 100644 > --- a/fs/btrfs/ctree.c > +++ b/fs/btrfs/ctree.c > @@ -192,7 +192,7 @@ struct extent_buffer *btrfs_lock_root_node(struct > btrfs_root *root) > * tree until you end up with a lock on the root. A locked buffer > * is returned, with a reference held. > */ > -static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root > *root) > +struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) > { > struct extent_buffer *eb; > > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h > index d49b045..4fa3ddd 100644 > --- a/fs/btrfs/ctree.h > +++ b/fs/btrfs/ctree.h > @@ -1098,6 +1098,12 @@ struct btrfs_fs_info { > u32 nodesize; > u32 sectorsize; > u32 stripesize; > + > +#ifdef CONFIG_BTRFS_FS_REF_VERIFY > + spinlock_t ref_verify_lock; > + struct rb_root block_tree; > + bool ref_verify_enabled; the on/off bit can be added to the fs_info::flags instead > +#endif > }; > > static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) > @@ -1336,6 +1342,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct > btrfs_fs_info *info) > #define BTRFS_MOUNT_FRAGMENT_METADATA(1 << 25) > #define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) > #define BTRFS_MOUNT_NOLOGREPLAY (1 << 27) > +#define BTRFS_MOUNT_REF_VERIFY (1 << 28) > > #define BTRFS_DEFAULT_COMMIT_INTERVAL(30) > #define BTRFS_DEFAULT_MAX_INLINE (2048) > @@ -2627,7 +2634,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle > *trans, > struct extent_buffer
Wrong device?
Hi, I have 15 x 6 TB disks in md-raid (Because btrfs's raid6 was marked as not-for-real-use when I first time installed this machine) Now I have got both problem twice. At this last time I have 233 subvolums, and millions of files (all together) Then filesystem went to read only with this dmesg: [Sat Sep 23 07:25:28 2017] [ cut here ] [Sat Sep 23 07:25:28 2017] WARNING: CPU: 5 PID: 5431 at /build/linux-hwe-edge-CrMNv8/linux-hwe-edge-4.10.0/fs/btrfs/extent-tree.c:6947 __btrfs_free_extent.isra.61+0x2cb/0xeb0 [btrfs] [Sat Sep 23 07:25:28 2017] BTRFS: Transaction aborted (error -28) [Sat Sep 23 07:25:28 2017] Modules linked in: 8021q garp mrp stp llc bonding ip6table_filter ip6_tables ipt_REJECT nf_reject_ipv4 nf_log_ipv4 nf_log_common xt_LOG xt_limit xt_tcpudp xt_multiport iptable_filter ip_tables x_tables intel_rapl sb_edac edac_core x86_pkg_temp_thermal intel_powerclamp coretemp ipmi_ssif kvm_intel kvm irqbypass intel_cstate intel_rapl_perf joydev input_leds mei_me mei lpc_ich shpchp ioatdma ipmi_si ipmi_devintf ipmi_msghandler acpi_pad acpi_power_meter mac_hid ib_iser rdma_cm iw_cm ib_cm ib_core configfs iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi autofs4 btrfs raid10 raid0 multipath linear raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 ses enclosure ixgbe ast crct10dif_pclmul i2c_algo_bit crc32_pclmul ttm drm_kms_helper ghash_clmulni_intel [Sat Sep 23 07:25:28 2017] dca syscopyarea pcbc sysfillrect aesni_intel aes_x86_64 hid_generic sysimgblt crypto_simd mpt3sas fb_sys_fops ptp r8169 raid_class ahci glue_helper usbhid pps_core mxm_wmi cryptd drm hid libahci mdio scsi_transport_sas mii fjes wmi [Sat Sep 23 07:25:28 2017] CPU: 5 PID: 5431 Comm: btrfs-transacti Tainted: G W 4.10.0-26-generic #30~16.04.1-Ubuntu [Sat Sep 23 07:25:28 2017] Hardware name: Supermicro X10DRi/X10DRI-T, BIOS 2.1 09/13/2016 [Sat Sep 23 07:25:28 2017] Call Trace: [Sat Sep 23 07:25:28 2017] dump_stack+0x63/0x90 [Sat Sep 23 07:25:28 2017] __warn+0xcb/0xf0 [Sat Sep 23 07:25:28 2017] warn_slowpath_fmt+0x5f/0x80 [Sat Sep 23 07:25:28 2017] __btrfs_free_extent.isra.61+0x2cb/0xeb0 [btrfs] [Sat Sep 23 07:25:28 2017] ? btrfs_merge_delayed_refs+0x64/0x640 [btrfs] [Sat Sep 23 07:25:28 2017] ? btrfs_search_slot+0x981/0x9c0 [btrfs] [Sat Sep 23 07:25:28 2017] __btrfs_run_delayed_refs+0xb44/0x13a0 [btrfs] [Sat Sep 23 07:25:28 2017] ? btree_set_page_dirty+0xe/0x10 [btrfs] [Sat Sep 23 07:25:28 2017] ? free_extent_buffer+0x4b/0xa0 [btrfs] [Sat Sep 23 07:25:28 2017] ? igrab+0x1e/0x60 [Sat Sep 23 07:25:28 2017] btrfs_run_delayed_refs+0x7f/0x2a0 [btrfs] [Sat Sep 23 07:25:28 2017] btrfs_write_dirty_block_groups+0x169/0x3c0 [btrfs] [Sat Sep 23 07:25:28 2017] commit_cowonly_roots+0x221/0x2c0 [btrfs] [Sat Sep 23 07:25:28 2017] btrfs_commit_transaction+0x542/0x950 [btrfs] [Sat Sep 23 07:25:28 2017] transaction_kthread+0x1a6/0x1c0 [btrfs] [Sat Sep 23 07:25:28 2017] kthread+0x109/0x140 [Sat Sep 23 07:25:28 2017] ? btrfs_cleanup_transaction+0x540/0x540 [btrfs] [Sat Sep 23 07:25:28 2017] ? kthread_create_on_node+0x60/0x60 [Sat Sep 23 07:25:28 2017] ret_from_fork+0x2c/0x40 [Sat Sep 23 07:25:28 2017] ---[ end trace 76fd3c22e75c2b85 ]--- [Sat Sep 23 07:25:28 2017] BTRFS: error (device sdb) in __btrfs_free_extent:6947: errno=-28 No space left [Sat Sep 23 07:25:28 2017] BTRFS: error (device sdb) in btrfs_drop_snapshot:9193: errno=-28 No space left [Sat Sep 23 07:25:28 2017] BTRFS info (device sdb): forced readonly After a long googling (about more complex situations) I suddenly noticed "device sdb" WTF??? Filesystem is mounted from /dev/md3 (sdb is part of that mdraid) so btrfs should not even know anything about that /dev/sdb. So maybe the error is more simple that I thought. It tries to allocate more metadata from physical drive (not from /dev/md3 as it was supposed to do), so that "no space left on device" could be the REAL error... But why it is doing so? Does it help if I add some extra virtualization layer, like LVM? # grep data /etc/fstab /dev/md3 /data btrfs compress=zlib,space_cache,noatime 0 0 # btrfs fi df /data Data, single: total=4.77TiB, used=4.77TiB System, DUP: total=32.00MiB, used=592.00KiB Metadata, DUP: total=92.00GiB, used=90.87GiB GlobalReserve, single: total=512.00MiB, used=172.50MiB # btrfs fi show /data Label: none uuid: 84a31ba6-d22d-4653-b071-5525dbdfe561 Total devices 1 FS bytes used 4.85TiB devid2 size 70.95TiB used 4.95TiB path /dev/md3 # cat /proc/mdstat Personalities : [raid1] [raid6] [raid5] [raid4] [linear] [multipath] [raid0] [raid10] md3 : active raid6 sdl[17](S) sdb[16] sdq[15] sdp[14] sdm[11] sdc[1] sdo[13] sde[3] sdj[8] sdn[12] sdh[6] sdi[7] sdk[9] sdf[4] sdd[2] sdg[5] 76185088512 blocks super 1.2 level 6, 512k chunk, algorithm 2 [15/15] [UUU] bitmap: 0/44 pages [0KB], 65536KB chunk # btrfs --version btrfs-progs v4.4
Btrfs progs release 4.13.1
Hi, btrfs-progs version 4.13.1 have been released. The ZSTD support has been added, the kernel code will land in 4.14. Now configure will detect if libzstd is available and will use it. In the future this will become a mandatory dependency. Please check your distro for libzstd and ask them to add it eventually. As a fallback, it is and will be possible to disable zstd detection (and btrfs restore will obviously lack the support). Changes: * image: speed up generating the sanitized names, do not generate unprintable chars * completion: add missing commands, better mount point detection * restore: add zstd support; libzstd detected automatically, will be requested by default in the future, or can be configured out * other: * misc fixes found by sparse * doc enhancements, ioctl manual page started * updated and new tests * build fixes Tarballs: https://www.kernel.org/pub/linux/kernel/people/kdave/btrfs-progs/ Git: git://git.kernel.org/pub/scm/linux/kernel/git/kdave/btrfs-progs.git Shortlog: David Sterba (18): btrfs-progs: README: update links, enhance contribution section btrfs-progs: tests: fix run_mustfail in cli-tests/007-check-force btrfs-progs: build: sync Makefile to Android.mk btrfs-progs: convert: use correct string printing for errcode_t btrfs-progs: tests: remove temporary loopdev files btrfs-progs: tests: remove temporary images in mkfs/005 and mkfs/006 btrfs-progs: build: whitespace adjustment of [LD] fssum btrfs-progs: tests: make sure _is_file_or_command does not get confused btrfs-progs: docs: start ioctl documentation manual page btrfs-progs: fix debugging macro checks btrfs-progs: print-tree: use proper helper for reading offset btrfs-progs: free-space-cache: fix endianity when reading from disk_key btrfs-progs: updated and add missing function attributes to the definition btrfs-progs: build: add missing defines for the C=1 build btrfs-progs: build: use -std=gnu89 for sparse btrfs-progs: build: add more sparse warning checks btrfs-progs: tests: check there are no unprintable characters in btrfs-image -ss output btrfs-progs: update CHANGES for v4.13.1 Misono, Tomohiro (2): btrfs-progs: update btrfs-completion btrfs-progs: cleanup whitespaces of btrfs-completion Naohiro Aota (2): btrfs-progs: build: generate all dependency files btrfs-progs: build: omit unnecessary -MD flag Nicholas D Steeves (2): btrfs-progs: tests: Add required IETF Trust copyright to SHA implementation btrfs-progs: tests: Remove misleading BCP 78 boilerplate from SHA implementation Nick Terrell (2): btrfs-progs: Add zstd support btrfs-progs: tests: add testing image for zstd for btrfs-restore Piotr Pawlow (6): btrfs-progs: image: fix non-printable characters in generated file names btrfs-progs: image: move core find_collision code to a separate function btrfs-progs: image: add reverse CRC32C table btrfs-progs: image: add a function to calculate CRC32C collisions btrfs-progs: image: add a function to check if generated filename suffix is valid btrfs-progs: image: use CRC32C reversing instead of brute force to find collisions Qu Wenruo (2): btrfs-progs: Refactor find_next_chunk to get rid of parameter root and objectid btrfs-progs: Fix one-byte overlap bug in free_block_group_cache Uli Heller (1): btrfs-progs: Removed missing header file, fixes compilation error yingyil (1): btrfs-progs: mkfs: refactor create_data_reloc_tree -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC 0/3]: settable compression level for zstd
On Fri, Sep 15, 2017 at 05:34:57PM +0200, Adam Borowski wrote: > Hi! > Here's a patch set that allows changing the compression level for zstd, > currently at mount time only. I've played with it for a month, so despite > being a quick hack, it's reasonably well tested. Tested on 4.13 + > btrfs-for-4.14 only, though -- I've booted 11th-day-of-merge-window only an > hour ago on one machine, no explosions yet. > > As a quick hack, it doesn't conserve memory as it should: all workspace > allocations assume level 15 and waste space otherwise. > > Because of an (easily changeable) quirk of compression level encoding, the > max is set at 15, but I guess higher levels are pointless for 128KB blocks. > Nick and co can tell us more -- for me zstd is mostly a black box so it's > you who knows anything about tuning it. > > There are three patches: > * [David Sterba] btrfs: allow to set compression level for zlib > Unmodified version of the patch from Jul 24, I'm re-sending it for > convenience. > * btrfs: allow setting zlib compression level via :9 > Some bikeshedding: it looks like Chris Mason also favours zlib:9 over > zlib9 as the former is more readable. If you disagree... well, it's up > to you to decide anyway. This patch accepts both syntaxes. > * btrfs: allow setting zstd level I'll add the patches to for-next as it does not affect default behaviour and the changes are contained in compression. Further updates are fine, though I think we'll have to resolve the workspace waste before the patches can be merged to master. I tend agree to add the separator and slightly prefer ':' over '-'. In order to utilize the zstd at higher levels, we can enhance the compressed chunk size and then we could use another ':' to separate that from the rest. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 3/4] btrfs: Add sanity check for EXTENT_DATA when reading out leaf
On Wed, Aug 23, 2017 at 04:57:58PM +0900, Qu Wenruo wrote: > Add extra checker for item with EXTENT_DATA type. > This checks the following thing: > 0) Key offset >All key offset must be aligned to sectorsize. >Inline extent must have 0 for key offset. > > 1) Item size >Plain text inline file extent size must match item size. 'plain text' seems to be a bit misleading, I don't think we've ever referred to uncompressed extent as such, although it makes some sense. I think 'uncompressed' would work too. >(compressed inline file extent has no info about its on-disk size) >Regular/preallocated file extent size must be a fixed value. > > 2) Every member of regular file extent item >Including alignment for bytenr and offset, possible value for >compression/encryption/type. > > 3) Type/compression/encode must be one of the valid values. > > This should be the most comprehensive and restrict check in the context > of btrfs_item for EXTENT_DATA. > > Signed-off-by: Qu Wenruo> --- > fs/btrfs/disk-io.c | 108 > > include/uapi/linux/btrfs_tree.h | 1 + > 2 files changed, 109 insertions(+) > > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c > index e034d08bd036..b92296c6a698 100644 > --- a/fs/btrfs/disk-io.c > +++ b/fs/btrfs/disk-io.c > @@ -549,6 +549,103 @@ static int check_tree_block_fsid(struct btrfs_fs_info > *fs_info, > btrfs_header_level(eb) == 0 ? "leaf" : "node", \ > reason, btrfs_header_bytenr(eb), root->objectid, slot) > > +static int check_extent_data_item(struct btrfs_root *root, > + struct extent_buffer *leaf, > + struct btrfs_key *key, int slot) > +{ > + struct btrfs_file_extent_item *fi; > + u32 sectorsize = root->fs_info->sectorsize; > + u32 item_size = btrfs_item_size_nr(leaf, slot); > + > + if (!IS_ALIGNED(key->offset, sectorsize)) { > + CORRUPT("unaligned key offset for file extent", The CORRUPT macro does not print any details beyond what it gets from the parameters, so here we'd like to know which extent it is and what's the size. The sectorsize can be found elsewhere so it does not need to be printed. > + leaf, root, slot); > + return -EUCLEAN; > + } > + > + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); > + > + if (btrfs_file_extent_type(leaf, fi) >= BTRFS_FILE_EXTENT_LAST_TYPE) { > + CORRUPT("invalid file extent type", leaf, root, slot); "invalid file extent type %d" and actually, we could add some item-specific helpers to report the corruption so if it's for an extent, print the generic extent information, plus an additional information what exactly was wrong. > + return -EUCLEAN; > + } > + > + /* > + * Support for new compression/encrption must introduce incompat flag, > + * and must be caught in open_ctree(). > + */ > + if (btrfs_file_extent_compression(leaf, fi) >= BTRFS_COMPRESS_LAST) { > + CORRUPT("invalid file extent compression", leaf, root, slot); > + return -EUCLEAN; > + } > + if (btrfs_file_extent_encryption(leaf, fi)) { > + CORRUPT("invalid file extent encryption", leaf, root, slot); > + return -EUCLEAN; > + } > + if (btrfs_file_extent_type(leaf, fi) == BTRFS_FILE_EXTENT_INLINE) { > + /* Inline extent must have 0 as key offset */ > + if (key->offset) { > + CORRUPT("inline extent has non-zero key offset", > + leaf, root, slot); > + return -EUCLEAN; > + } > + > + /* Compressed inline extent has no on-disk size, skip it */ > + if (btrfs_file_extent_compression(leaf, fi) != > + BTRFS_COMPRESS_NONE) > + return 0; > + > + /* Plaintext inline extent size must match item size */ > + if (item_size != BTRFS_FILE_EXTENT_INLINE_DATA_START + > + btrfs_file_extent_ram_bytes(leaf, fi)) { > + CORRUPT("plaintext inline extent has invalid size", > + leaf, root, slot); > + return -EUCLEAN; > + } > + return 0; > + } > + > + > + /* regular or preallocated extent has fixed item size */ > + if (item_size != sizeof(*fi)) { > + CORRUPT( > + "regluar or preallocated extent data item size is invalid", > + leaf, root, slot); > + return -EUCLEAN; > + } > + if (!IS_ALIGNED(btrfs_file_extent_ram_bytes(leaf, fi), sectorsize) || Is this condition right? I think I've seen random numbers of ram_bytes in the output of dump-tree so I wonder if this applies only to some extents where the condition holds. > +
btrfs send -p
Hi guys, It seems that btrfs v4.12.1 allows: (1) btrfs send -p but disallows (2) btrfs send -p Code-wise it assumes that is always found at optind == 1. I was about to patch this but I'm not sure which way we'd like to go with this: Actually only allow (1) and block (2) properly or allow both. In any case, this seems to me like a pretty serious regression and we'd like to get this settled soon. :) Thanks! Christian -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 0/3] Introduce comprehensive sanity check framework and
On Wed, Aug 23, 2017 at 04:57:55PM +0900, Qu Wenruo wrote: > The patchset introduce a new framework to do more comprehensive (if not > the most) sanity check when reading out a leaf. > > The new sanity checker will include: > > 1) Key order >Existing code > > 2) Item boundary >Existing code with enhanced checker to ensure item pointer doesn't >overlap with item itself. > > 3) Key type based sanity checker >Only EXTENT_DATA and EXTENT_CSUM checker is implemented yet. >As each checker should go through review and tests, or it can easily >make a valid btrfs failed to be mounted. >So only two checkers are implemented as an example. > >Existing checker like INODE_REF checker can be moved to this >framework easily, and we can centralize all existing checkers, make >the rest of codes more clean. > > Performance wise, it's just iterating a leaf. > And it will only get triggered when reading a leaf, cached leaf will > not go through such checker. > So it won't be a performance breaker. The series looks great, I really like it. Some of the error messages could be more verbose about what exactly went wrong, eg. what is the value that's not expected. I'll reply to the patches with examples what I'd expect. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] btrfs-progs: subvolume: outputs message only when operation succeeds
On 25/09/17 17:33, Qu Wenruo wrote: (Any in this case, anyone in the maillist can help review messages) If this is a question, I can help with assigning levels to messages. Although I think many levels are only required for complex daemons or network tools, while btrfs utils mostly perform atomic operations which either succeed or fail. But it's of course hard to be sure without seeing all actual messages, probably there's some use for 4 levels. -- With Best Regards, Marat Khalili -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] btrfs: cleanup btrfs_init_new_device()
On Mon, Sep 25, 2017 at 08:31:03PM +0800, Anand Jain wrote: > Moves btrfs_abort_transaction() to the error goto and renames > error_trans to error_sysfs. This is a preparatory patch to > remove the BUG_ON() in btrfs_init_new_device(). Please keep all transaction abort exactly at the place where they happened and do not merge them to one. This pattern should be used everwhere and is important when debugging because we can pinpoint the line in the code from the syslog message and do not have to guess which way it got to the merged call. This makes the binary large, looks like a code duplication in the code but for a good reason. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/3] btrfs: undo writable when sprouting fails
On Mon, Sep 25, 2017 at 08:31:02PM +0800, Anand Jain wrote: > Signed-off-by: Anand JainThis does not look like change trivial enough to just leave out the description. Please explain in more detail. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][v2] btrfs: change how we decide to commit transactions during flushing
On Fri, Aug 25, 2017 at 11:32:21AM +0300, Nikolay Borisov wrote: > > > On 24.08.2017 17:43, Nikolay Borisov wrote: > > > > > > On 22.08.2017 23:00, jo...@toxicpanda.com wrote: > >> From: Josef Bacik> >> > >> Nikolay reported that generic/273 was failing currently with ENOSPC. > >> Turns out this is because we get to the point where the outstanding > >> reservations are greater than the pinned space on the fs. This is a > >> mistake, previously we used the current reservation amount in > >> may_commit_transaction, not the entire outstanding reservation amount. > >> Fix this to find the minimum byte size needed to make progress in > >> flushing, and pass that into may_commit_transaction. From there we can > >> make a smarter decision on whether to commit the transaction or not. > >> This fixes the failure in generic/273. > >> > >> Reported-by: Nikolay Borisov > >> Signed-off-by: Josef Bacik > > > > Reviewed-and-tested-by: Nikolay Borisov > > > For this commit we might also add: > > Fixes: 957780eb2788 ("Btrfs: introduce ticketed enospc infrastructure") > Cc: sta...@vger.kernel.org # 4.8 JFI, tags added plus some additional explanation from N. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] btrfs-progs: subvolume: outputs message only when operation succeeds
On 2017年09月25日 22:19, Hugo Mills wrote: On Mon, Sep 25, 2017 at 04:04:03PM +0800, Qu Wenruo wrote: On 2017年09月25日 15:52, Hugo Mills wrote: On Mon, Sep 25, 2017 at 03:46:15PM +0800, Qu Wenruo wrote: On 2017年09月25日 15:42, Marat Khalili wrote: On 25/09/17 10:30, Nikolay Borisov wrote: On 19.09.2017 10:41, Misono, Tomohiro wrote: "btrfs subvolume create/delete" outputs the message of "Create/Delete subvolume ..." even when an operation fails. Since it is confusing, let's outputs the message only when an operation succeeds. Please change the verb to past tense, more strongly signaling success - i.e. "Created subvolume" What about recalling some UNIX standards and returning to NOT outputting any message when operation succeeds? My scripts are full of grep -v calls after each btrfs command, and this sucks (and I don't think I'm alone in this situation). Isn't the correct way to catch the return value instead of grepping the output? It is, but if, for example, you're using the command in a cron script which is expected to work, you don't want it producing output because then you get a mail every time the script runs. So you have to grep -v on the "success" output to make the successful script silent. What about redirecting stdout to /dev/null and redirecting stderr to mail if return value is not 0? As for expected-to-work case, the stdout doesn't has much meaning and return value should be good enough to judge the result. If it's some command not returning value properly, would you please report it as a bug so we can fix it. It's not the return value that's problematic (although those used to be a real mess). It's the fact that a successful run of the command produces noise on stdout, which most commands don't. Yes, a lot of tried-and-true tools don't output anything for successful run, but also a lot of other tools do output something by default, especially for complex tools like LVM. btrfs sub create and btrfs sub delete, though, aren't complex. They're about as complex as mkdir and rmdir, from a user point of view. What's more, and like mkdir/rmdir, the effects of those commands show up in the filesystem at the path given, so manual verification could be as simple as "ls -d !$" or "ls !$/..". It's really, really not necessary to have this command unconditionally print "yes, I created a directory for you" to stdout. Yes, there's ways to deal with it in shell scripts, but wouldn't life be so much better if you didn't have to? Like you don't have to filter out success reports from mkdir. Maybe we can introduce a global --quite option to silent some output. Or drop the spurious output unless it's asked for with --verbose. Because then it makes it so much easier to compose tools together into bigger and more complex things, which is, after all, one of the fundamental things that UNIX does right. I think we need at least 4 levels of message: debug, info, warn, error. And only show debug and higher for --verbose. Default to show info, and warn/error always get show (stderr). (And during the new framework, we can add support for translation) And we also need to reassign all existing message to new message level. It may be time consuming but we just need to do it since there is real user demand here. (Any in this case, anyone in the maillist can help review messages) Thanks, Qu Hugo. Thanks, Qu Hugo. Thanks, Qu If you change the message a lot of scripts will have to be changed, at least make it worth it. -- With Best Regards, Marat Khaliili -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] btrfs-progs: subvolume: outputs message only when operation succeeds
On Mon, Sep 25, 2017 at 04:04:03PM +0800, Qu Wenruo wrote: > > > On 2017年09月25日 15:52, Hugo Mills wrote: > >On Mon, Sep 25, 2017 at 03:46:15PM +0800, Qu Wenruo wrote: > >> > >> > >>On 2017年09月25日 15:42, Marat Khalili wrote: > >>>On 25/09/17 10:30, Nikolay Borisov wrote: > On 19.09.2017 10:41, Misono, Tomohiro wrote: > >"btrfs subvolume create/delete" outputs the message of "Create/Delete > >subvolume ..." even when an operation fails. > >Since it is confusing, let's outputs the message only when an > >operation succeeds. > Please change the verb to past tense, more strongly signaling success - > i.e. "Created subvolume" > >>>What about recalling some UNIX standards and returning to NOT > >>>outputting any message when operation succeeds? My scripts are > >>>full of grep -v calls after each btrfs command, and this sucks > >>>(and I don't think I'm alone in this situation). > >> > >>Isn't the correct way to catch the return value instead of grepping > >>the output? > > > >It is, but if, for example, you're using the command in a cron > >script which is expected to work, you don't want it producing output > >because then you get a mail every time the script runs. So you have to > >grep -v on the "success" output to make the successful script silent. > > What about redirecting stdout to /dev/null and redirecting stderr to > mail if return value is not 0? > As for expected-to-work case, the stdout doesn't has much meaning > and return value should be good enough to judge the result. > > > > >>If it's some command not returning value properly, would you please > >>report it as a bug so we can fix it. > > > >It's not the return value that's problematic (although those used > >to be a real mess). It's the fact that a successful run of the command > >produces noise on stdout, which most commands don't. > > Yes, a lot of tried-and-true tools don't output anything for > successful run, but also a lot of other tools do output something by > default, especially for complex tools like LVM. btrfs sub create and btrfs sub delete, though, aren't complex. They're about as complex as mkdir and rmdir, from a user point of view. What's more, and like mkdir/rmdir, the effects of those commands show up in the filesystem at the path given, so manual verification could be as simple as "ls -d !$" or "ls !$/..". It's really, really not necessary to have this command unconditionally print "yes, I created a directory for you" to stdout. Yes, there's ways to deal with it in shell scripts, but wouldn't life be so much better if you didn't have to? Like you don't have to filter out success reports from mkdir. > Maybe we can introduce a global --quite option to silent some output. Or drop the spurious output unless it's asked for with --verbose. Because then it makes it so much easier to compose tools together into bigger and more complex things, which is, after all, one of the fundamental things that UNIX does right. Hugo. > Thanks, > Qu > > > >Hugo. > >>Thanks, > >>Qu > >> > >>>If you change the message a lot of scripts will have to be > >>>changed, at least make it worth it. > >>> > >>> -- > >>> > >>>With Best Regards, > >>>Marat Khaliili > >>> > > -- Hugo Mills | Great films about cricket: The Fantastic Four hugo@... carfax.org.uk | http://carfax.org.uk/ | PGP: E2AB1DE4 | signature.asc Description: Digital signature
Re: [PATCH] btrfs: Remove memory barrier from block_group_cache_done
On Wed, Aug 30, 2017 at 07:35:42PM +0300, Nikolay Borisov wrote: > This memory barrier was introduced in 817d52f8dba2 ("Btrfs: async block group > caching"), but even at that time it's usage was broken since it didn't pair > with anything. There was one situation where the cached member was set to > BTRFS_CACHE_FINISHED in a spinlock region which *might* have acted as a > pairing > barrier. Since there is no clear semantics how it's supposed to work better > to just remove it. > > Signed-off-by: Nikolay BorisovAs discussed offline, this needs a better description that removing the barrier will not break it. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][v2] Btrfs: rework outstanding_extents
On Mon, Aug 21, 2017 at 10:11:20AM -0400, Josef Bacik wrote: > Ignore this, I git format-patch'ed the wrong patch. So if you're going to send another version, please split the tracepoints to another patch and update the format strings to the preferred format https://btrfs.wiki.kernel.org/index.php/Development_notes#Tracepoints -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs: rework outstanding_extents
On Thu, Aug 24, 2017 at 05:31:15PM +0800, Lu Fengqi wrote: > On Thu, Aug 17, 2017 at 01:01:29PM -0400, jo...@toxicpanda.com wrote: > >From: Josef Bacik> > > >Right now we do a lot of weird hoops around outstanding_extents in order > >to keep the extent count consistent. This is because we logically > >transfer the outstanding_extent count from the initial reservation > >through the set_delalloc_bits. This makes it pretty difficult to get a > >handle on how and when we need to mess with outstanding_extents. > > > >Fix this by revamping the rules of how we deal with outstanding_extents. > >Now instead everybody that is holding on to a delalloc extent is > >required to increase the outstanding extents count for itself. This > >means we'll have something like this > > > >btrfs_dealloc_reserve_metadata - outstanding_extents = 1 > > btrfs_set_delalloc - outstanding_extents = 2 > >btrfs_release_delalloc_extents - outstanding_extents = 1 > > > >for an initial file write. Now take the append write where we extend an > >existing delalloc range but still under the maximum extent size > > > >btrfs_delalloc_reserve_metadata - outstanding_extents = 2 > > btrfs_set_delalloc > >btrfs_set_bit_hook - outstanding_extents = 3 > >btrfs_merge_bit_hook - outstanding_extents = 2 > >btrfs_release_delalloc_extents - outstanding_extnets = 1 > > > >In order to make the ordered extent transition we of course must now > >make ordered extents carry their own outstanding_extent reservation, so > >for cow_file_range we end up with > > > >btrfs_add_ordered_extent - outstanding_extents = 2 > >clear_extent_bit - outstanding_extents = 1 > >btrfs_remove_ordered_extent - outstanding_extents = 0 > > > >This makes all manipulations of outstanding_extents much more explicit. > >Every successful call to btrfs_reserve_delalloc_metadata _must_ now be > >combined with btrfs_release_delalloc_extents, even in the error case, as > >that is the only function that actually modifies the > >outstanding_extents counter. > > s/btrfs_reserve_delalloc_metadata/btrfs_delalloc_reserve_metadata > s/btrfs_release_delalloc_extents/btrfs_delalloc_release_extents > > Acked-by: Lu Fengqi BTW, Documentation/process/5.Posting.rst 216 - Acked-by: indicates an agreement by another developer (often a 217maintainer of the relevant code) that the patch is appropriate for 218inclusion into the kernel. and is commonly used by maintainers when someone else modifies the respective codebase by tree-wide, API or similar. If you agree with the way the patch is solving a particular problem and think it's important enough to be stated, you can write that in plain text. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 08/10] btrfs: remove unused parameter in cow_file_range
On Mon, Sep 25, 2017 at 03:18:31PM +0300, Nikolay Borisov wrote: > On 21.08.2017 12:43, Nikolay Borisov wrote: > > The delalloc_end parameter was only passed to extent_clear_unlock_delalloc, > > but > > since its usage was removed form that function let's also remove it from the > > caller's parameter list. > > > > Signed-off-by: Nikolay Borisov> > ping, since most of the series has been merged but this one has been > left out I assume it could have been missed. https://marc.info/?l=linux-btrfs=150481210920815 Dependent change, so not merged. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 08/10] btrfs: remove unused parameter in cow_file_range
On 2017年09月25日 20:18, Nikolay Borisov wrote: On 21.08.2017 12:43, Nikolay Borisov wrote: The delalloc_end parameter was only passed to extent_clear_unlock_delalloc, but since its usage was removed form that function let's also remove it from the caller's parameter list. Signed-off-by: Nikolay Borisovping, since most of the series has been merged but this one has been left out I assume it could have been missed. Commit 897a41b1167955bd543bb252fd3f06f5844f2177 btrfs: extend btrfs_set_extent_delalloc and its friends to support in-band dedupe and subpage size patchset That line is added to support subpage sized sectorsize. (BTW, inband dedupe is not really using it IIRC) It's my fault since at that time I believed subpage size patchset would be merged before inband dedupe, but the truth is neither is going to be merged soon. I think it's better to revert that commit until we really have a schedule for sub page size patchset. (BTW, according to the update internal of patchset "Allow I/O on blocks whose size is less than page size", it may be updated in recent month if it gets updated) Thanks, Qu --- fs/btrfs/inode.c | 20 +--- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 98b56aca0a6f..156be9face5b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -105,9 +105,9 @@ static int btrfs_truncate(struct inode *inode); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct inode *inode, struct page *locked_page, - u64 start, u64 end, u64 delalloc_end, - int *page_started, unsigned long *nr_written, - int unlock, struct btrfs_dedupe_hash *hash); + u64 start, u64 end, int *page_started, + unsigned long *nr_written, int unlock, + struct btrfs_dedupe_hash *hash); static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, @@ -739,8 +739,6 @@ static noinline void submit_compressed_extents(struct inode *inode, async_extent->start, async_extent->start + async_extent->ram_size - 1, -async_extent->start + -async_extent->ram_size - 1, _started, _written, 0, NULL); @@ -930,9 +928,9 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start, */ static noinline int cow_file_range(struct inode *inode, struct page *locked_page, - u64 start, u64 end, u64 delalloc_end, - int *page_started, unsigned long *nr_written, - int unlock, struct btrfs_dedupe_hash *hash) + u64 start, u64 end, int *page_started, + unsigned long *nr_written, int unlock, + struct btrfs_dedupe_hash *hash) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -1431,7 +1429,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, if (cow_start != (u64)-1) { ret = cow_file_range(inode, locked_page, cow_start, found_key.offset - 1, -end, page_started, nr_written, 1, +page_started, nr_written, 1, NULL); if (ret) { if (!nolock && nocow) @@ -1517,7 +1515,7 @@ static noinline int run_delalloc_nocow(struct inode *inode, } if (cow_start != (u64)-1) { - ret = cow_file_range(inode, locked_page, cow_start, end, end, + ret = cow_file_range(inode, locked_page, cow_start, end, page_started, nr_written, 1, NULL); if (ret) goto error; @@ -1574,7 +1572,7 @@ static int run_delalloc_range(void *private_data, struct page *locked_page, ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); } else if (!inode_need_compress(inode, start, end)) { - ret =
Re: [PATCH 2/3] btrfs: cleanup btrfs_init_new_device()
On 2017年09月25日 20:31, Anand Jain wrote: Moves btrfs_abort_transaction() to the error goto and renames error_trans to error_sysfs. This is a preparatory patch to remove the BUG_ON() in btrfs_init_new_device(). Signed-off-by: Anand Jain--- fs/btrfs/volumes.c | 23 +-- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9d64700cc9b6..5c34eea9d12d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2443,26 +2443,20 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path mutex_lock(_info->chunk_mutex); ret = init_first_rw_device(trans, fs_info); mutex_unlock(_info->chunk_mutex); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto error_trans; - } + if (ret) + goto error_sysfs; } ret = btrfs_add_device(trans, fs_info, device); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto error_trans; - } + if (ret) + goto error_sysfs; if (seeding_dev) { char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; ret = btrfs_finish_sprout(trans, fs_info); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto error_trans; - } + if (ret) + goto error_sysfs; /* Sprouting would change fsid of the mounted root, * so rename the fsid on the sysfs @@ -2500,12 +2494,13 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path update_dev_time(device_path); return ret; -error_trans: +error_sysfs: + btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); if (seeding_dev) sb->s_flags |= MS_RDONLY; + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); Abort then end? Looks redundant at least. rcu_string_free(device->name); - btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); Any particular reason to move this line ahead? I didn't see it has something to do with transaction, so just curious about the purpose. Thanks, Qu kfree(device); error: blkdev_put(bdev, FMODE_EXCL); -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] btrfs: cleanup btrfs_init_new_device()
On 25.09.2017 16:07, Nikolay Borisov wrote: > > > On 25.09.2017 15:31, Anand Jain wrote: >> Moves btrfs_abort_transaction() to the error goto and renames >> error_trans to error_sysfs. This is a preparatory patch to >> remove the BUG_ON() in btrfs_init_new_device(). >> >> Signed-off-by: Anand Jain>> --- >> fs/btrfs/volumes.c | 23 +-- >> 1 file changed, 9 insertions(+), 14 deletions(-) >> >> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c >> index 9d64700cc9b6..5c34eea9d12d 100644 >> --- a/fs/btrfs/volumes.c >> +++ b/fs/btrfs/volumes.c >> @@ -2443,26 +2443,20 @@ int btrfs_init_new_device(struct btrfs_fs_info >> *fs_info, const char *device_path >> mutex_lock(_info->chunk_mutex); >> ret = init_first_rw_device(trans, fs_info); >> mutex_unlock(_info->chunk_mutex); >> -if (ret) { >> -btrfs_abort_transaction(trans, ret); >> -goto error_trans; >> -} >> +if (ret) >> +goto error_sysfs; >> } >> >> ret = btrfs_add_device(trans, fs_info, device); >> -if (ret) { >> -btrfs_abort_transaction(trans, ret); >> -goto error_trans; >> -} >> +if (ret) >> +goto error_sysfs; >> >> if (seeding_dev) { >> char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; >> >> ret = btrfs_finish_sprout(trans, fs_info); >> -if (ret) { >> -btrfs_abort_transaction(trans, ret); >> -goto error_trans; >> -} >> +if (ret) >> +goto error_sysfs; >> >> /* Sprouting would change fsid of the mounted root, >> * so rename the fsid on the sysfs >> @@ -2500,12 +2494,13 @@ int btrfs_init_new_device(struct btrfs_fs_info >> *fs_info, const char *device_path >> update_dev_time(device_path); >> return ret; >> >> -error_trans: >> +error_sysfs: >> +btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); > > Any reason why you move btrfs_sysfs_rm_device_link? > >> if (seeding_dev) >> sb->s_flags |= MS_RDONL> + btrfs_abort_transaction(trans, >> ret); >> btrfs_end_transaction(trans); >> rcu_string_free(device->name); >> -btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); >> kfree(device);> error: >> blkdev_put(bdev, FMODE_EXCL); > > Also which branch is your code based on since in Linus' master the error > handling (before your patch) looks a bit different. E.g. the MS_RDONL > stuff look wrong. > > error_trans: > > btrfs_end_transaction(trans); > > rcu_string_free(device->name); > > btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); > > kfree(device); > > error: > > blkdev_put(bdev, FMODE_EXCL); > > if (seeding_dev) { > > mutex_unlock(_mutex); > > up_write(>s_umount); > > } > > return ret; Ok, turned out I hadn't received 1/3 so this answers this question. The sysfs rm stands though > >> -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] btrfs: cleanup btrfs_init_new_device()
On 25.09.2017 15:31, Anand Jain wrote: > Moves btrfs_abort_transaction() to the error goto and renames > error_trans to error_sysfs. This is a preparatory patch to > remove the BUG_ON() in btrfs_init_new_device(). > > Signed-off-by: Anand Jain> --- > fs/btrfs/volumes.c | 23 +-- > 1 file changed, 9 insertions(+), 14 deletions(-) > > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c > index 9d64700cc9b6..5c34eea9d12d 100644 > --- a/fs/btrfs/volumes.c > +++ b/fs/btrfs/volumes.c > @@ -2443,26 +2443,20 @@ int btrfs_init_new_device(struct btrfs_fs_info > *fs_info, const char *device_path > mutex_lock(_info->chunk_mutex); > ret = init_first_rw_device(trans, fs_info); > mutex_unlock(_info->chunk_mutex); > - if (ret) { > - btrfs_abort_transaction(trans, ret); > - goto error_trans; > - } > + if (ret) > + goto error_sysfs; > } > > ret = btrfs_add_device(trans, fs_info, device); > - if (ret) { > - btrfs_abort_transaction(trans, ret); > - goto error_trans; > - } > + if (ret) > + goto error_sysfs; > > if (seeding_dev) { > char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; > > ret = btrfs_finish_sprout(trans, fs_info); > - if (ret) { > - btrfs_abort_transaction(trans, ret); > - goto error_trans; > - } > + if (ret) > + goto error_sysfs; > > /* Sprouting would change fsid of the mounted root, >* so rename the fsid on the sysfs > @@ -2500,12 +2494,13 @@ int btrfs_init_new_device(struct btrfs_fs_info > *fs_info, const char *device_path > update_dev_time(device_path); > return ret; > > -error_trans: > +error_sysfs: > + btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); Any reason why you move btrfs_sysfs_rm_device_link? > if (seeding_dev) > sb->s_flags |= MS_RDONL> + btrfs_abort_transaction(trans, > ret); > btrfs_end_transaction(trans); > rcu_string_free(device->name); > - btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); > kfree(device);> error: > blkdev_put(bdev, FMODE_EXCL); Also which branch is your code based on since in Linus' master the error handling (before your patch) looks a bit different. E.g. the MS_RDONL stuff look wrong. error_trans: btrfs_end_transaction(trans); rcu_string_free(device->name); btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); if (seeding_dev) { mutex_unlock(_mutex); up_write(>s_umount); } return ret; > -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/3] btrfs: undo writable when sprouting fails
On 2017年09月25日 20:31, Anand Jain wrote: Signed-off-by: Anand JainThe error_trans tag needs to revert the RDONLY flag without doubt. But just a small question. Just lines before error_trans tag, we're committing transaction and then relocating system chunks. In which we don't use goto error branch but returning directly. Don't we need to to revert the RDONLY flag in that error handler? Thanks, Qu --- fs/btrfs/volumes.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0e8f16c305df..9d64700cc9b6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2501,6 +2501,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path return ret; error_trans: + if (seeding_dev) + sb->s_flags |= MS_RDONLY; btrfs_end_transaction(trans); rcu_string_free(device->name); btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Wiki updates
Hi, I've updated the Status page and synced the manual pages from btrfs-progs git. The status of various features is split to one more column, Performance. There's a section for each 'mostly OK' status below the table and linked from the note. This will hopefully be more understandable than just a single color table cell. Please note that the page is locked, so either drop me a mail or use the talk page to suggest edits. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/3] btrfs: cleanup btrfs_init_new_device()
Moves btrfs_abort_transaction() to the error goto and renames error_trans to error_sysfs. This is a preparatory patch to remove the BUG_ON() in btrfs_init_new_device(). Signed-off-by: Anand Jain--- fs/btrfs/volumes.c | 23 +-- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9d64700cc9b6..5c34eea9d12d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2443,26 +2443,20 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path mutex_lock(_info->chunk_mutex); ret = init_first_rw_device(trans, fs_info); mutex_unlock(_info->chunk_mutex); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto error_trans; - } + if (ret) + goto error_sysfs; } ret = btrfs_add_device(trans, fs_info, device); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto error_trans; - } + if (ret) + goto error_sysfs; if (seeding_dev) { char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; ret = btrfs_finish_sprout(trans, fs_info); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto error_trans; - } + if (ret) + goto error_sysfs; /* Sprouting would change fsid of the mounted root, * so rename the fsid on the sysfs @@ -2500,12 +2494,13 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path update_dev_time(device_path); return ret; -error_trans: +error_sysfs: + btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); if (seeding_dev) sb->s_flags |= MS_RDONLY; + btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); rcu_string_free(device->name); - btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); -- 2.13.1 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/3] btrfs: fix BUG_ON in btrfs_init_new_device()
Here we will simply return the error to the caller. And handle the fail condition by calling the abort transaction through the error path. Signed-off-by: Anand Jain--- fs/btrfs/volumes.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5c34eea9d12d..93eaf314fe72 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2399,7 +2399,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (seeding_dev) { sb->s_flags &= ~MS_RDONLY; ret = btrfs_prepare_sprout(fs_info); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + goto error_trans; } device->fs_devices = fs_info->fs_devices; @@ -2496,6 +2497,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path error_sysfs: btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); +error_trans: if (seeding_dev) sb->s_flags |= MS_RDONLY; btrfs_abort_transaction(trans, ret); -- 2.13.1 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/3] fix bug in btrfs_init_new_device()
1/3 fixes a bug which failed to reset writable when sprouting failed 2/3 is cleanup and preparatory to fix 3/3 3/3 fixes BUG_ON in btrfs_init_new_device() Anand Jain (3): btrfs: undo writable when sprouting fails btrfs: cleanup btrfs_init_new_device() btrfs: fix BUG_ON in btrfs_init_new_device() fs/btrfs/volumes.c | 27 +-- 1 file changed, 13 insertions(+), 14 deletions(-) -- 2.13.1 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/3] btrfs: undo writable when sprouting fails
Signed-off-by: Anand Jain--- fs/btrfs/volumes.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0e8f16c305df..9d64700cc9b6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2501,6 +2501,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path return ret; error_trans: + if (seeding_dev) + sb->s_flags |= MS_RDONLY; btrfs_end_transaction(trans); rcu_string_free(device->name); btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); -- 2.13.1 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 08/10] btrfs: remove unused parameter in cow_file_range
On 21.08.2017 12:43, Nikolay Borisov wrote: > The delalloc_end parameter was only passed to extent_clear_unlock_delalloc, > but > since its usage was removed form that function let's also remove it from the > caller's parameter list. > > Signed-off-by: Nikolay Borisovping, since most of the series has been merged but this one has been left out I assume it could have been missed. > --- > fs/btrfs/inode.c | 20 +--- > 1 file changed, 9 insertions(+), 11 deletions(-) > > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c > index 98b56aca0a6f..156be9face5b 100644 > --- a/fs/btrfs/inode.c > +++ b/fs/btrfs/inode.c > @@ -105,9 +105,9 @@ static int btrfs_truncate(struct inode *inode); > static int btrfs_finish_ordered_io(struct btrfs_ordered_extent > *ordered_extent); > static noinline int cow_file_range(struct inode *inode, > struct page *locked_page, > -u64 start, u64 end, u64 delalloc_end, > -int *page_started, unsigned long *nr_written, > -int unlock, struct btrfs_dedupe_hash *hash); > +u64 start, u64 end, int *page_started, > +unsigned long *nr_written, int unlock, > +struct btrfs_dedupe_hash *hash); > static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 > len, > u64 orig_start, u64 block_start, > u64 block_len, u64 orig_block_len, > @@ -739,8 +739,6 @@ static noinline void submit_compressed_extents(struct > inode *inode, >async_extent->start, >async_extent->start + >async_extent->ram_size - 1, > - async_extent->start + > - async_extent->ram_size - 1, >_started, _written, 0, >NULL); > > @@ -930,9 +928,9 @@ static u64 get_extent_allocation_hint(struct inode > *inode, u64 start, > */ > static noinline int cow_file_range(struct inode *inode, > struct page *locked_page, > -u64 start, u64 end, u64 delalloc_end, > -int *page_started, unsigned long *nr_written, > -int unlock, struct btrfs_dedupe_hash *hash) > +u64 start, u64 end, int *page_started, > +unsigned long *nr_written, int unlock, > +struct btrfs_dedupe_hash *hash) > { > struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); > struct btrfs_root *root = BTRFS_I(inode)->root; > @@ -1431,7 +1429,7 @@ static noinline int run_delalloc_nocow(struct inode > *inode, > if (cow_start != (u64)-1) { > ret = cow_file_range(inode, locked_page, >cow_start, found_key.offset - 1, > - end, page_started, nr_written, 1, > + page_started, nr_written, 1, >NULL); > if (ret) { > if (!nolock && nocow) > @@ -1517,7 +1515,7 @@ static noinline int run_delalloc_nocow(struct inode > *inode, > } > > if (cow_start != (u64)-1) { > - ret = cow_file_range(inode, locked_page, cow_start, end, end, > + ret = cow_file_range(inode, locked_page, cow_start, end, >page_started, nr_written, 1, NULL); > if (ret) > goto error; > @@ -1574,7 +1572,7 @@ static int run_delalloc_range(void *private_data, > struct page *locked_page, > ret = run_delalloc_nocow(inode, locked_page, start, end, >page_started, 0, nr_written); > } else if (!inode_need_compress(inode, start, end)) { > - ret = cow_file_range(inode, locked_page, start, end, end, > + ret = cow_file_range(inode, locked_page, start, end, > page_started, nr_written, 1, NULL); > } else { > set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, > -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs: send, apply asynchronous page cache readahead to enhance page read
On 15.09.2017 11:47, peterh wrote: > From: Kuanling Huang> > By analyzing the perf on btrfs send, we found it take large > amount of cpu time on page_cache_sync_readahead. This effort > can be reduced after switching to asynchronous one. Overall > performance gain on HDD and SSD were 9 and 15 percent if > simply send a large file. > > Signed-off-by: Kuanling Huang Reviewed-by: Nikolay Borisov > --- > fs/btrfs/send.c | 21 - > 1 file changed, 16 insertions(+), 5 deletions(-) > > diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c > index 63a6152..7a5eb66 100644 > --- a/fs/btrfs/send.c > +++ b/fs/btrfs/send.c > @@ -4475,16 +4475,27 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, > u64 offset, u32 len) > /* initial readahead */ > memset(>ra, 0, sizeof(struct file_ra_state)); > file_ra_state_init(>ra, inode->i_mapping); > - btrfs_force_ra(inode->i_mapping, >ra, NULL, index, > -last_index - index + 1); > > while (index <= last_index) { > unsigned cur_len = min_t(unsigned, len, >PAGE_CACHE_SIZE - pg_offset); > - page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); > + page = find_lock_page(inode->i_mapping, index); > if (!page) { > - ret = -ENOMEM; > - break; > + page_cache_sync_readahead(inode->i_mapping, > + >ra, NULL, index, > + last_index + 1 - index); > + > + page = find_or_create_page(inode->i_mapping, index, > GFP_KERNEL); > + if (!page) { > + ret = -ENOMEM; > + break; > + } > + } > + > + if (PageReadahead(page)) { > + page_cache_async_readahead(inode->i_mapping, > + >ra, NULL, page, index, > + last_index + 1 - index); > } > > if (!PageUptodate(page)) { > -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 07/14] btrfs-progs: Doc/mkfs: Add extra condition for rootdir option
On 2017-09-22 11:07, Qu Wenruo wrote: On 2017年09月22日 21:33, Austin S. Hemmelgarn wrote: On 2017-09-22 08:32, Qu Wenruo wrote: On 2017年09月22日 19:38, Austin S. Hemmelgarn wrote: On 2017-09-22 06:39, Qu Wenruo wrote: As I already stated in an other thread, if you want to shrink, do it in another command line tool. Do one thing and do it simple. (Although Btrfs itself is already out of the UNIX way) Unless I'm reading the code wrong, the shrinking isn't happening in a second pass, so this _is_ doing one thing, and it appears to be doing it as simply as possible (although arguably not correctly because of the 1MB reserved area being used). If you're referring to my V1 implementation of shrink, that's doing *one* thing. But the original shrinking code? Nope, or we won't have the custom chunk allocator at all. What I really mean is, if one wants to shrink, at least don't couple the shrink code into "mkfs.btrfs". Do shrink in its own tool/subcommand, not in a really unrelated tool. There are two cases for shrinking a filesystem: 1. You're resizing it to move to a smaller disk (or speed up copying to another disk). 2. You're generating a filesystem image that needs to be as small as possible. I would argue there is no meaning of creating *smallest* image. (Of course it exists). There is an exact meaning given the on-disk layout. It's an image whose size is equal to the sum of: 1. 1MB (for the reserved space at the beginning). 2. However many superblocks it should have given the size. 3. The total amount of file data and extended attribute data to be included, rounding up for block size 4. The exact amount of metadata space needed to represent the tree from 3, also rounding up for block size. 5. The exact amount of system chunk space needed to handle 3 and 4, plus enough room to allocate at least one more chunk of each type (to ultimately allow for resizing the filesystem if desired). 6. Exactly enough reserved metadata space to resize the FS. We could put tons of code to implement, and more (or less) test cases to verify it. But the demand doesn't validate the effort. And how much effort has been put into ripping this out completely together with the other fixes? How much more would it have been to just move it to another option and fix the reserved area usage? All my points are clear for this patchset: I know I removed one function, and my reason is: 1) No or little usage And it's anti intuition. So split it to a separate tool (mkimage maybe?), and fix mkfs to behave sensibly. I absolutely agree on the fact that it's non-intuitive. It should either be it's own option (with a dependency on -r being passed of course), or a separate tool if you're so worried about mkfs being too complex. As to usage, given the current data, there is no proof that I'm the only one using it, but there is also no proof that anybody other than me is using it, which means that you can't reasonably base an argument on actual usage of this option, since you can't prove anything about usage. All you know is that you have one person who uses it, and one who was confused by it (but appears to want to use it in a different way). It's a niche use case though, and when dealing with something like this, there is a threshold of usage below which you won't see much in the way of discussion of the option on the list, since only a reasonably small percentage of BTRFS users are actually subscribed. 2) Dead code (not tested nor well documented) It _IS NOT_ dead code. It is absolutely reachable from code external to itself. It's potentially unused code, but that is not the same thing as dead code. That aside, I can fix up the documentation, and I've actually tested it reasonably thoroughly (I use it every month or so when I update stuff I have using seed devices, and it also gets used by my testing infrastructure when generating images pre-loaded with files for tests to save time). I'll agree it hasn't been rigorously tested, but it does appear to work as (not) advertised, even when used in odd ways. 3) Possible workaround There are three options for workarounds, and both of them are sub-par to this even aside from the reduced simplicity it offers to userspace: 1. Resize after mkfs. This is impractical both because there is no offline resize (having to mount the FS RW prior to use as a seed device means that you don't have a guaranteed reproducible image, which is a pretty common request for container usage there days), and it will end up with wasted space (the smallest possible filesystem created through a resize is consistently larger (by more than 1MB) than what the -r option to mkfs generates). 2. Use a binary search to determine the smallest size to a reasonable margin. This is impractical simply because it takes too long, and again can't reliably get the smallest possible image. 3. Attempt to compute the smallest possible image without using a binary
Re: [PATCH v3 07/14] btrfs-progs: Doc/mkfs: Add extra condition for rootdir option
On 2017-09-24 10:08, Goffredo Baroncelli wrote: On 09/24/2017 12:10 PM, Anand Jain wrote: All my points are clear for this patchset: I know I removed one function, and my reason is: 1) No or little usage And it's anti intuition. 2) Dead code (not tested nor well documented) 3) Possible workaround I can add several extra reasons as I stated before, but number of reasons won't validate anything anyway. End user convenience wins over the developer's technical difficulties. Sorry, but I have to agree with Qu; there is no a big demand (other than Austin) for this functionality; even you stated that "...at some point it may..."; until now the UI is quite unfriendly (we should use a big enough device, and then cut it by hand on the basis of the output of mkfs.btrfs)... I will comment that I agree that it should not be the default. It's not intuitive for most people, and as you say it's a niche feature. Just removing it completely though with the above argument sounds very much like trying to meet quotas for coding. I fear that this is another feature which increase the complexity of btrfs (and tools) with little or none usage On average? It only increases the complexity of mkfs once there's a fix for the (theoretically trivial to fix) issue with it ignoring the fact that the first 1MB of space is supposed to be untouched by BTRFS. The work of Qu is a nice cleanup; I hope that this will be the direction which BTRFS will takes: removing of "strange/unused" features improving the reliability of the others. The two are not inherently interdependent, and that argument doesn't exactly hold up to scrutiny considering that mkfs is already perfectly reliable even with this feature, and it does not compromise the reliability of other features (again, once you fix the usage of the reserved area at the beginning of the image). -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: btrfs scrub crashes OS
On 2017年09月25日 17:50, Lukas Pirl wrote: Dear all, I experience reproducible OS crashes when scrubbing a btrfs file system. Apart from that, the file system mounts rw and is usable without any problems (including modifying snapshots and all that). When the system crashes (i.e., freezes), there are no errors printed to the system logs or via `dmesg` (had a display connected). Even no dmesg output using tty or netconsole? That's strange. Normally it should be kernel BUG_ON() to cause such problem. And if the system is still responsible (either from TTY or ssh), is there anything strange like tons of IO or CPU usage? Recovery is only possible via power-cycling the machine. The host experienced a lot of crashes and ATA errors due to hardware failures in the past. To the best of my knowledge, the hardware is stable now. `btrfs device stats` outputs zeros for all counters. `btrfsck --readonly --mode lowmem` outputs a bunch of referencer count mismatch … and ERROR: data extent[… …] backref lost see https://pastebin.com/seC4fReP for the full log. There is a known bug for lowmem mode to report such false alert. Btrfs-progs v4.13 should have fixed it. As long as v4.13 btrfs check reports no error, its metadata should be good. System info: btrfs RAID 1 (~1.5 years old), 7 SATA HDDs Oh, RAID1, so normal "btrfs check --check-data-csum" can't really check all data checksum. (It will pass 2nd mirror if 1st one matches the csum) You could try the out-of-tree offline scrub to do a full scrub of your fs unmounted, so it won't crash your system (if nothing wrong happened) https://github.com/gujx2017/btrfs-progs/tree/offline_scrub MIXED_BACKREF, BIG_METADATA, EXTENDED_IREF, SKINNY_METADATA, NO_HOLES Only NO_HOLES is not ordinary, but shouldn't cause a problem. Without kernel backtrace, it's tricky to locate the problem. So I would recommend to use netconsole (IIRC more reliable, as I use it on my test VM to capture the dying message) or TTY output to verify there is no kernel message/backtrace. Thanks, Qu no quotas in use see also https://pastebin.com/4me6zDsN for more details btrfs-progs v4.12 GNU/Linux 4.12.0-0.bpo.1-amd64 #1 SMP Debian 4.12.6-1~bpo9+1 x86_64 The question, obviously, is how can I make this fs "scrubable" again? Are the errors found by btrfsck safe to repair using btrfsck or some other tool? Thank you so much in advance, Lukas -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
btrfs scrub crashes OS
Dear all, I experience reproducible OS crashes when scrubbing a btrfs file system. Apart from that, the file system mounts rw and is usable without any problems (including modifying snapshots and all that). When the system crashes (i.e., freezes), there are no errors printed to the system logs or via `dmesg` (had a display connected). Recovery is only possible via power-cycling the machine. The host experienced a lot of crashes and ATA errors due to hardware failures in the past. To the best of my knowledge, the hardware is stable now. `btrfs device stats` outputs zeros for all counters. `btrfsck --readonly --mode lowmem` outputs a bunch of referencer count mismatch … and ERROR: data extent[… …] backref lost see https://pastebin.com/seC4fReP for the full log. System info: btrfs RAID 1 (~1.5 years old), 7 SATA HDDs MIXED_BACKREF, BIG_METADATA, EXTENDED_IREF, SKINNY_METADATA, NO_HOLES no quotas in use see also https://pastebin.com/4me6zDsN for more details btrfs-progs v4.12 GNU/Linux 4.12.0-0.bpo.1-amd64 #1 SMP Debian 4.12.6-1~bpo9+1 x86_64 The question, obviously, is how can I make this fs "scrubable" again? Are the errors found by btrfsck safe to repair using btrfsck or some other tool? Thank you so much in advance, Lukas -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] btrfs-progs: check: check invalid extent_inline_ref type in lowmem
On 09/24/2017 10:17 PM, Nikolay Borisov wrote: On 19.09.2017 13:00, Qu Wenruo wrote: On 2017年09月19日 17:48, Su Yue wrote: On 09/19/2017 04:48 PM, Qu Wenruo wrote: On 2017年09月19日 16:32, Su Yue wrote: Lowmem check does not skip invalid type in extent_inline_ref then calls btrfs_extent_inline_ref_size(type) which causes crash. Example: $ ./btrfs-corrupt-block -e -l 20971520 /tmp/data_small corrupting extent record: key 20971520 169 0 $ btrfs check --mode=lowmem /tmp/data_small Checking filesystem on /tmp/data_small UUID: ee205d69-8724-4aa2-a4f5-bc8558a62169 checking extents ERROR: extent[20971520 16384] backref type mismatch, missing bit: 2 ERROR: extent[20971520 16384] backref generation mismatch, wanted: 7, have: 0 ERROR: extent[20971520 16384] is referred by other roots than 3 ctree.h:1754: btrfs_extent_inline_ref_size: BUG_ON `1` triggered, value 1 btrfs(+0x543db)[0x55fabc2ab3db] btrfs(+0x587f7)[0x55fabc2af7f7] btrfs(+0x5fa44)[0x55fabc2b6a44] btrfs(cmd_check+0x194a)[0x55fabc2bd717] btrfs(main+0x88)[0x55fabc2682e0] /usr/lib/libc.so.6(__libc_start_main+0xea)[0x7f021c3824ca] btrfs(_start+0x2a)[0x55fabc267e7a] [1] 5188 abort (core dumped) btrfs check --mode=lowmem /tmp/data_small Fix it by checking type before obtaining inline_ref size. Signed-off-by: Su YueCode itself looks good. And test case please. Thanks for review. I'll add test case in next version. Reviewed-by: Qu Wenruo However, such whac-a-mole fix will finally be a nightmare to maintain. What about integrating all of such validation checkers into one place? So fsck part will only need to check their cross reference without bothering such corruption. I was confused how to fix the bug(new checker or little changes in this patch). The reason why I fix it in this way is that most callers do check type before calling btrfs_extent_inline_ref_size(). Since you prefer the former, I'm going to do it. Current version looks good enough as a fix.> Just saying we'd better using an integrated solution later. I agree with you that we should have an integrated solution but frankly I'd rather see it sooner rather than later, because history has shown that if something is not done when it's first talked about it usually gets bogged down by other work. With this in mind, Su, might I ask you that you repost the patch but with the centralised idea of handling the validation lifted from Qu's kernel side patches. OK. Thanks for your suggestion:). Most of work has been finished. But I found the original mode can not handle the new test case correctly. I'm Working on it. Thanks, Su Thanks, Qu Thanks Su Just like the kernel patch I submitted some times ago. https://www.spinics.net/lists/linux-btrfs/msg68498.html Thanks, Qu --- cmds-check.c | 9 + 1 file changed, 9 insertions(+) diff --git a/cmds-check.c b/cmds-check.c index 4e0b0fe4..74c10c75 100644 --- a/cmds-check.c +++ b/cmds-check.c @@ -11565,6 +11565,10 @@ static int check_tree_block_ref(struct btrfs_root *root, offset, level + 1, owner, NULL); } + } else { + error("extent[%llu %u %llu] has unknown ref type: %d", + key.objectid, key.type, key.offset, type); + break; } if (found_ref) @@ -11831,6 +11835,11 @@ static int check_extent_data_item(struct btrfs_root *root, found_dbackref = !check_tree_block_ref(root, NULL, btrfs_extent_inline_ref_offset(leaf, iref), 0, owner, NULL); + } else { + error("extent[%llu %u %llu] has unknown ref type: %d", + disk_bytenr, BTRFS_EXTENT_DATA_KEY, + disk_num_bytes, type); + break; } if (found_dbackref) -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] btrfs-progs: subvolume: outputs message only when operation succeeds
On 25/09/17 11:08, Qu Wenruo wrote: What about redirecting stdout to /dev/null and redirecting stderr to mail if return value is not 0? s/if return value is not 0/if return value is 0/. The main point is, if btrfs returns 0, then nothing to worry about. (Unless there is a bug, even in that case keep an eye on stderr should be enough to catch that) Redirection to /dev/null will work. However, 1) It will always looks suspicious. grep -v with expected message is at least clear about its intent and consequences. 2) Although shorter than grep -v, it will still take space in shell scripts and force one to remember btrfs commands one has to add it after. This is already inconvenient enough to want a fix. -- With Best Regards, Marat Khalili -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] btrfs-progs: subvolume: outputs message only when operation succeeds
On 25/09/17 10:52, Hugo Mills wrote: Isn't the correct way to catch the return value instead of grepping the output? It is, but if, for example, you're using the command in a cron script which is expected to work, you don't want it producing output because then you get a mail every time the script runs. So you have to grep -v on the "success" output to make the successful script silent. If it's some command not returning value properly, would you please report it as a bug so we can fix it. It's not the return value that's problematic (although those used to be a real mess). It's the fact that a successful run of the command produces noise on stdout, which most commands don't. Yes, exactly: cron, mail -E and just long scripts where btrfs operations are small steps here and there. (On top of this, actually catching the return value from the right command before `| grep -v` with errexit and pipefail on is so difficult that I usually end up rewriting whole mess in Python. Which would be nice result in itself if it didn't take a whole day in place of one minute for bash line.) -- With Best Regards, Marat Khalili -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Btrfs performance with small blocksize on SSD
Am Mon, 25 Sep 2017 07:04:14 + schrieb "Fuhrmann, Carsten": > Well the correct translation for "Laufzeit" is runtime and not > latency. But thank you for that hint, I'll change it to "gesamt > Laufzeit" to make it more clear. How about better translating it to English in the first place as you're trying to reach an international community? Also, it would be nice to put the exact test you did as a command line or configuration file, so it can be replayed on other systems, and - most important - by the developers, to easily uncover what is causing the behavior... > Best regards > > Carsten > > -Ursprüngliche Nachricht- > Von: Andrei Borzenkov [mailto:arvidj...@gmail.com] > Gesendet: Sonntag, 24. September 2017 18:43 > An: Fuhrmann, Carsten ; Qu Wenruo > ; linux-btrfs@vger.kernel.org Betreff: Re: > AW: Btrfs performance with small blocksize on SSD > > 24.09.2017 16:53, Fuhrmann, Carsten пишет: > > Hello, > > > > 1) > > I used direct write (no page cache) but I didn't disable the Disk > > cache of the HDD/SSD itself. In all tests I wrote 1GB and looked > > for the runtime of that write process. > > So "latency" on your diagram means total time to write 1GiB file? > That is highly unusual meaning for "latency" which normally means > time to perform single IO. If so, you should better rename Y-axis to > something like "total run time". > N_r__yb_Xv_^_)__{.n_+{_n___)w*jg______j/___z_2___&_)___a_____G___h__j:+v___w -- Regards, Kai Replies to list-only preferred. -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] btrfs-progs: subvolume: outputs message only when operation succeeds
On 2017年09月25日 16:04, Qu Wenruo wrote: On 2017年09月25日 15:52, Hugo Mills wrote: On Mon, Sep 25, 2017 at 03:46:15PM +0800, Qu Wenruo wrote: On 2017年09月25日 15:42, Marat Khalili wrote: On 25/09/17 10:30, Nikolay Borisov wrote: On 19.09.2017 10:41, Misono, Tomohiro wrote: "btrfs subvolume create/delete" outputs the message of "Create/Delete subvolume ..." even when an operation fails. Since it is confusing, let's outputs the message only when an operation succeeds. Please change the verb to past tense, more strongly signaling success - i.e. "Created subvolume" What about recalling some UNIX standards and returning to NOT outputting any message when operation succeeds? My scripts are full of grep -v calls after each btrfs command, and this sucks (and I don't think I'm alone in this situation). Isn't the correct way to catch the return value instead of grepping the output? It is, but if, for example, you're using the command in a cron script which is expected to work, you don't want it producing output because then you get a mail every time the script runs. So you have to grep -v on the "success" output to make the successful script silent. What about redirecting stdout to /dev/null and redirecting stderr to mail if return value is not 0? s/if return value is not 0/if return value is 0/. The main point is, if btrfs returns 0, then nothing to worry about. (Unless there is a bug, even in that case keep an eye on stderr should be enough to catch that) Thanks, Qu As for expected-to-work case, the stdout doesn't has much meaning and return value should be good enough to judge the result. If it's some command not returning value properly, would you please report it as a bug so we can fix it. It's not the return value that's problematic (although those used to be a real mess). It's the fact that a successful run of the command produces noise on stdout, which most commands don't. Yes, a lot of tried-and-true tools don't output anything for successful run, but also a lot of other tools do output something by default, especially for complex tools like LVM. Maybe we can introduce a global --quite option to silent some output. Thanks, Qu Hugo. Thanks, Qu If you change the message a lot of scripts will have to be changed, at least make it worth it. -- With Best Regards, Marat Khaliili -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] btrfs-progs: subvolume: outputs message only when operation succeeds
On 2017年09月25日 15:52, Hugo Mills wrote: On Mon, Sep 25, 2017 at 03:46:15PM +0800, Qu Wenruo wrote: On 2017年09月25日 15:42, Marat Khalili wrote: On 25/09/17 10:30, Nikolay Borisov wrote: On 19.09.2017 10:41, Misono, Tomohiro wrote: "btrfs subvolume create/delete" outputs the message of "Create/Delete subvolume ..." even when an operation fails. Since it is confusing, let's outputs the message only when an operation succeeds. Please change the verb to past tense, more strongly signaling success - i.e. "Created subvolume" What about recalling some UNIX standards and returning to NOT outputting any message when operation succeeds? My scripts are full of grep -v calls after each btrfs command, and this sucks (and I don't think I'm alone in this situation). Isn't the correct way to catch the return value instead of grepping the output? It is, but if, for example, you're using the command in a cron script which is expected to work, you don't want it producing output because then you get a mail every time the script runs. So you have to grep -v on the "success" output to make the successful script silent. What about redirecting stdout to /dev/null and redirecting stderr to mail if return value is not 0? As for expected-to-work case, the stdout doesn't has much meaning and return value should be good enough to judge the result. If it's some command not returning value properly, would you please report it as a bug so we can fix it. It's not the return value that's problematic (although those used to be a real mess). It's the fact that a successful run of the command produces noise on stdout, which most commands don't. Yes, a lot of tried-and-true tools don't output anything for successful run, but also a lot of other tools do output something by default, especially for complex tools like LVM. Maybe we can introduce a global --quite option to silent some output. Thanks, Qu Hugo. Thanks, Qu If you change the message a lot of scripts will have to be changed, at least make it worth it. -- With Best Regards, Marat Khaliili -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] btrfs-progs: subvolume: outputs message only when operation succeeds
On Mon, Sep 25, 2017 at 03:46:15PM +0800, Qu Wenruo wrote: > > > On 2017年09月25日 15:42, Marat Khalili wrote: > >On 25/09/17 10:30, Nikolay Borisov wrote: > >>On 19.09.2017 10:41, Misono, Tomohiro wrote: > >>>"btrfs subvolume create/delete" outputs the message of "Create/Delete > >>>subvolume ..." even when an operation fails. > >>>Since it is confusing, let's outputs the message only when an > >>>operation succeeds. > >>Please change the verb to past tense, more strongly signaling success - > >>i.e. "Created subvolume" > >What about recalling some UNIX standards and returning to NOT > >outputting any message when operation succeeds? My scripts are > >full of grep -v calls after each btrfs command, and this sucks > >(and I don't think I'm alone in this situation). > > Isn't the correct way to catch the return value instead of grepping > the output? It is, but if, for example, you're using the command in a cron script which is expected to work, you don't want it producing output because then you get a mail every time the script runs. So you have to grep -v on the "success" output to make the successful script silent. > If it's some command not returning value properly, would you please > report it as a bug so we can fix it. It's not the return value that's problematic (although those used to be a real mess). It's the fact that a successful run of the command produces noise on stdout, which most commands don't. Hugo. > Thanks, > Qu > > >If you change the message a lot of scripts will have to be > >changed, at least make it worth it. > > > > -- > > > >With Best Regards, > >Marat Khaliili > > -- Hugo Mills | If you see something, say nothing and drink to hugo@... carfax.org.uk | forget http://carfax.org.uk/ | PGP: E2AB1DE4 | Welcome to Night Vale signature.asc Description: Digital signature
Re: [PATCH] btrfs-progs: subvolume: outputs message only when operation succeeds
On 2017年09月25日 15:42, Marat Khalili wrote: On 25/09/17 10:30, Nikolay Borisov wrote: On 19.09.2017 10:41, Misono, Tomohiro wrote: "btrfs subvolume create/delete" outputs the message of "Create/Delete subvolume ..." even when an operation fails. Since it is confusing, let's outputs the message only when an operation succeeds. Please change the verb to past tense, more strongly signaling success - i.e. "Created subvolume" What about recalling some UNIX standards and returning to NOT outputting any message when operation succeeds? My scripts are full of grep -v calls after each btrfs command, and this sucks (and I don't think I'm alone in this situation). Isn't the correct way to catch the return value instead of grepping the output? If it's some command not returning value properly, would you please report it as a bug so we can fix it. Thanks, Qu If you change the message a lot of scripts will have to be changed, at least make it worth it. -- With Best Regards, Marat Khaliili -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] btrfs-progs: subvolume: outputs message only when operation succeeds
On Mon, Sep 25, 2017 at 10:42:06AM +0300, Marat Khalili wrote: > On 25/09/17 10:30, Nikolay Borisov wrote: > >On 19.09.2017 10:41, Misono, Tomohiro wrote: > >>"btrfs subvolume create/delete" outputs the message of "Create/Delete > >>subvolume ..." even when an operation fails. > >>Since it is confusing, let's outputs the message only when an operation > >>succeeds. > >Please change the verb to past tense, more strongly signaling success - > >i.e. "Created subvolume" > What about recalling some UNIX standards and returning to NOT > outputting any message when operation succeeds? My scripts are full > of grep -v calls after each btrfs command, and this sucks (and I > don't think I'm alone in this situation). If you change the message > a lot of scripts will have to be changed, at least make it worth it. Seconded. Make sure the return code reflects the result, and drop the printed message (or keep it if there's a --verbose flag, maybe). Hugo. -- Hugo Mills | If you see something, say nothing and drink to hugo@... carfax.org.uk | forget http://carfax.org.uk/ | PGP: E2AB1DE4 | Welcome to Night Vale signature.asc Description: Digital signature
Re: [PATCH] btrfs-progs: subvolume: outputs message only when operation succeeds
On 25/09/17 10:30, Nikolay Borisov wrote: On 19.09.2017 10:41, Misono, Tomohiro wrote: "btrfs subvolume create/delete" outputs the message of "Create/Delete subvolume ..." even when an operation fails. Since it is confusing, let's outputs the message only when an operation succeeds. Please change the verb to past tense, more strongly signaling success - i.e. "Created subvolume" What about recalling some UNIX standards and returning to NOT outputting any message when operation succeeds? My scripts are full of grep -v calls after each btrfs command, and this sucks (and I don't think I'm alone in this situation). If you change the message a lot of scripts will have to be changed, at least make it worth it. -- With Best Regards, Marat Khaliili -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] btrfs-progs: subvolume: outputs message only when operation succeeds
On 19.09.2017 10:41, Misono, Tomohiro wrote: > "btrfs subvolume create/delete" outputs the message of "Create/Delete > subvolume ..." even when an operation fails. > Since it is confusing, let's outputs the message only when an operation > succeeds. > > Signed-off-by: Tomohiro Misono> --- > cmds-subvolume.c | 8 > 1 file changed, 4 insertions(+), 4 deletions(-) > > diff --git a/cmds-subvolume.c b/cmds-subvolume.c > index 666f6e0..6d4b0fe 100644 > --- a/cmds-subvolume.c > +++ b/cmds-subvolume.c > @@ -189,7 +189,6 @@ static int cmd_subvol_create(int argc, char **argv) > if (fddst < 0) > goto out; > > - printf("Create subvolume '%s/%s'\n", dstdir, newname); > if (inherit) { > struct btrfs_ioctl_vol_args_v2 args; > > @@ -213,6 +212,7 @@ static int cmd_subvol_create(int argc, char **argv) > error("cannot create subvolume: %s", strerror(errno)); > goto out; > } > + printf("Create subvolume '%s/%s'\n", dstdir, newname); Please change the verb to past tense, more strongly signaling success - i.e. "Created subvolume" > > retval = 0; /* success */ > out: > @@ -337,9 +337,6 @@ again: > goto out; > } > > - printf("Delete subvolume (%s): '%s/%s'\n", > - commit_mode == 2 || (commit_mode == 1 && cnt + 1 == argc) > - ? "commit" : "no-commit", dname, vname); > memset(, 0, sizeof(args)); > strncpy_null(args.name, vname); > res = ioctl(fd, BTRFS_IOC_SNAP_DESTROY, ); > @@ -349,6 +346,9 @@ again: > ret = 1; > goto out; > } > + printf("Delete subvolume (%s): '%s/%s'\n", > + commit_mode == 2 || (commit_mode == 1 && cnt + 1 == argc) > + ? "commit" : "no-commit", dname, vname); Ditto: "Deleted subvolume" > > if (commit_mode == 1) { > res = wait_for_commit(fd); > -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/4] btrfs: remove unused setup_root_args()
Since setup_root_args() is not used anymore, just remove it. Signed-off-by: Tomohiro Misono--- fs/btrfs/super.c | 35 --- 1 file changed, 35 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7edd74d..f589c5b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1367,41 +1367,6 @@ static inline int is_subvolume_inode(struct inode *inode) return 0; } -/* - * This will add subvolid=0 to the argument string while removing any subvol= - * and subvolid= arguments to make sure we get the top-level root for path - * walking to the subvol we want. - */ -static char *setup_root_args(char *args) -{ - char *buf, *dst, *sep; - - if (!args) - return kstrdup("subvolid=0", GFP_NOFS); - - /* The worst case is that we add ",subvolid=0" to the end. */ - buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, GFP_NOFS); - if (!buf) - return NULL; - - while (1) { - sep = strchrnul(args, ','); - if (!strstarts(args, "subvol=") && - !strstarts(args, "subvolid=")) { - memcpy(dst, args, sep - args); - dst += sep - args; - *dst++ = ','; - } - if (*sep) - args = sep + 1; - else - break; - } - strcpy(dst, "subvolid=0"); - - return buf; -} - static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, int flags, const char *device_name, char *data, struct vfsmount *mnt) -- 2.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/4] btrfs: split parse_early_options() in two
Now parse_early_options() is used by both btrfs_mount() and mount_root(). However, the former only needs subvol related part and the latter needs the others. Therefore extract the subvol related parts from parse_early_options() and move it to new parse function (parse_subvol_options()). Signed-off-by: Tomohiro Misono--- fs/btrfs/super.c | 85 +++- 1 file changed, 60 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 1c34ca6..7edd74d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -448,7 +448,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_subvolrootid: case Opt_device: /* -* These are parsed by btrfs_parse_early_options +* These are parsed by btrfs_parse_subvol_options +* and btrfs_parse_early_options * and can be happily ignored here. */ break; @@ -855,11 +856,63 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, * only when we need to allocate a new super block. */ static int btrfs_parse_early_options(const char *options, fmode_t flags, - void *holder, char **subvol_name, u64 *subvol_objectid, - struct btrfs_fs_devices **fs_devices) + void *holder, struct btrfs_fs_devices **fs_devices) { substring_t args[MAX_OPT_ARGS]; char *device_name, *opts, *orig, *p; + int error = 0; + + if (!options) + return 0; + + /* +* strsep changes the string, duplicate it because btrfs_parse_options +* gets called later +*/ + opts = kstrdup(options, GFP_KERNEL); + if (!opts) + return -ENOMEM; + orig = opts; + + while ((p = strsep(, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_device: + device_name = match_strdup([0]); + if (!device_name) { + error = -ENOMEM; + goto out; + } + error = btrfs_scan_one_device(device_name, + flags, holder, fs_devices); + kfree(device_name); + if (error) + goto out; + break; + default: + break; + } + } + +out: + kfree(orig); + return error; +} + +/* + * Parse mount options that are related to subvolume id + * + * The value is later passed to mount_subvol() + */ +static int btrfs_parse_subvol_options(const char *options, fmode_t flags, + void *holder, char **subvol_name, u64 *subvol_objectid) +{ + substring_t args[MAX_OPT_ARGS]; + char *opts, *orig, *p; char *num = NULL; int error = 0; @@ -867,8 +920,8 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, return 0; /* -* strsep changes the string, duplicate it because parse_options -* gets called twice +* strsep changes the string, duplicate it because +* btrfs_parse_early_options gets called later */ opts = kstrdup(options, GFP_KERNEL); if (!opts) @@ -907,18 +960,6 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags, case Opt_subvolrootid: pr_warn("BTRFS: 'subvolrootid' mount option is deprecated and has no effect\n"); break; - case Opt_device: - device_name = match_strdup([0]); - if (!device_name) { - error = -ENOMEM; - goto out; - } - error = btrfs_scan_one_device(device_name, - flags, holder, fs_devices); - kfree(device_name); - if (error) - goto out; - break; default: break; } @@ -1492,18 +1533,14 @@ static struct dentry *mount_root(struct file_system_type *fs_type, int flags, struct btrfs_fs_info *fs_info = NULL; struct security_mnt_opts new_sec_opts; fmode_t mode = FMODE_READ; - char *subvol_name = NULL; - u64 subvol_objectid = 0; int error = 0; if (!(flags & MS_RDONLY)) mode |= FMODE_WRITE; error = btrfs_parse_early_options(data,
[PATCH 2/4] btrfs: cleanup btrfs_mount() using mount_root()
Cleanups btrfs_mount() by using mount_root(). This avoids getting btrfs_mount() called twice in mount path. Old btrfs_mount() will do: 0. VFS layer calls vfs_kern_mount() with registered file_system_type (for btrfs, btrfs_fs_type). btrfs_mount() is called on the way. 1. btrfs_parse_early_options() parses "subvolid=" mount option and set the value to subvol_objectid. Otherwise, subvol_objectid has the initial value of 0 2. check subvol_objectid is 5 or not. This time id is not 5, and btrfs_mount() returns by calling mount_subvol() 3. In mount_subvol(), original mount options are modified to contain "subvolid=0" in setup_root_args(). Then, vfs_kern_mount() is called with btrfs_fs_type and new options 4. btrfs_mount() is called again 5. btrfs_parse_early_options() parses "subvolid=0" and set 5 (instead of 0) to subvol_objectid 6. check subvol_objectid is 5 or not. This time id is 5 and mount_subvol() is not called. btrfs_mount() finishes mounting a root 7. (in mount_subvol()) with using a return vale of vfs_kern_mount(), it calls mount_subtree() 8 return subvolume's dentry Reusing the same file_system_type (and btrfs_mount()) for vfs_kern_mount() is the cause of complication. Instead, new btrfs_mount() will do: 1. parse subvol id related options for later use in mount_subvol() 2. mount device's root by calling vfs_kern_mount() with btrfs_root_fs_type, which is not registered to VFS by register_filesystem(). As a result, mount_root() is called 3. return by calling mount_subvol() The code of 2. is moved from the first part of mount_subvol(). Signed-off-by: Tomohiro Misono--- fs/btrfs/super.c | 186 +-- 1 file changed, 58 insertions(+), 128 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index fe43606..1c34ca6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1363,48 +1363,11 @@ static char *setup_root_args(char *args) static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, int flags, const char *device_name, - char *data) + char *data, struct vfsmount *mnt) { struct dentry *root; - struct vfsmount *mnt = NULL; - char *newargs; int ret; - newargs = setup_root_args(data); - if (!newargs) { - root = ERR_PTR(-ENOMEM); - goto out; - } - - mnt = vfs_kern_mount(_fs_type, flags, device_name, newargs); - if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) { - if (flags & MS_RDONLY) { - mnt = vfs_kern_mount(_fs_type, flags & ~MS_RDONLY, -device_name, newargs); - } else { - mnt = vfs_kern_mount(_fs_type, flags | MS_RDONLY, -device_name, newargs); - if (IS_ERR(mnt)) { - root = ERR_CAST(mnt); - mnt = NULL; - goto out; - } - - down_write(>mnt_sb->s_umount); - ret = btrfs_remount(mnt->mnt_sb, , NULL); - up_write(>mnt_sb->s_umount); - if (ret < 0) { - root = ERR_PTR(ret); - goto out; - } - } - } - if (IS_ERR(mnt)) { - root = ERR_CAST(mnt); - mnt = NULL; - goto out; - } - if (!subvol_name) { if (!subvol_objectid) { ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb), @@ -1460,7 +1423,6 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, out: mntput(mnt); - kfree(newargs); kfree(subvol_name); return root; } @@ -1515,6 +1477,12 @@ static int setup_security_options(struct btrfs_fs_info *fs_info, return ret; } +/* + * Find a superblock for the given device / mount point. + * + * Note: This is based on mount_bdev from fs/super.c with a few additions + * for multiple device setup. Make sure to keep it in sync. + */ static struct dentry *mount_root(struct file_system_type *fs_type, int flags, const char *device_name, void *data) { @@ -1621,20 +1589,34 @@ static struct dentry *mount_root(struct file_system_type *fs_type, int flags, security_free_mnt_opts(_sec_opts); return ERR_PTR(error); } + /* - * Find a superblock for the given device / mount point. + * Mount function which is called by VFS layer. * - * Note: This is based on get_sb_bdev from fs/super.c with a few additions - * for multiple device setup. Make sure to keep it in sync. + * In order to allow mounting a
[PATCH 1/4] btrfs: add mount_root() and new file_system_type
Add mount_root() and new file_system_type for preparation of cleanup of btrfs_mount(). Code path is not changed yet. mount_root() is almost the same as current btrfs_mount(), but doesn't have subvolume related part. Signed-off-by: Tomohiro Misono--- fs/btrfs/super.c | 116 +++ 1 file changed, 116 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 12540b6..fe43606 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -66,6 +66,7 @@ static const struct super_operations btrfs_super_ops; static struct file_system_type btrfs_fs_type; +static struct file_system_type btrfs_root_fs_type; static int btrfs_remount(struct super_block *sb, int *flags, char *data); @@ -1514,6 +1515,112 @@ static int setup_security_options(struct btrfs_fs_info *fs_info, return ret; } +static struct dentry *mount_root(struct file_system_type *fs_type, int flags, + const char *device_name, void *data) +{ + struct block_device *bdev = NULL; + struct super_block *s; + struct btrfs_fs_devices *fs_devices = NULL; + struct btrfs_fs_info *fs_info = NULL; + struct security_mnt_opts new_sec_opts; + fmode_t mode = FMODE_READ; + char *subvol_name = NULL; + u64 subvol_objectid = 0; + int error = 0; + + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + error = btrfs_parse_early_options(data, mode, fs_type, + _name, _objectid, + _devices); + if (error) { + kfree(subvol_name); + return ERR_PTR(error); + } + + security_init_mnt_opts(_sec_opts); + if (data) { + error = parse_security_options(data, _sec_opts); + if (error) + return ERR_PTR(error); + } + + error = btrfs_scan_one_device(device_name, mode, fs_type, _devices); + if (error) + goto error_sec_opts; + + /* +* Setup a dummy root and fs_info for test/set super. This is because +* we don't actually fill this stuff out until open_ctree, but we need +* it for searching for existing supers, so this lets us do that and +* then open_ctree will properly initialize everything later. +*/ + fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); + if (!fs_info) { + error = -ENOMEM; + goto error_sec_opts; + } + + fs_info->fs_devices = fs_devices; + + fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + security_init_mnt_opts(_info->security_opts); + if (!fs_info->super_copy || !fs_info->super_for_commit) { + error = -ENOMEM; + goto error_fs_info; + } + + error = btrfs_open_devices(fs_devices, mode, fs_type); + if (error) + goto error_fs_info; + + if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { + error = -EACCES; + goto error_close_devices; + } + + bdev = fs_devices->latest_bdev; + s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | MS_NOSEC, +fs_info); + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto error_close_devices; + } + + if (s->s_root) { + btrfs_close_devices(fs_devices); + free_fs_info(fs_info); + if ((flags ^ s->s_flags) & MS_RDONLY) + error = -EBUSY; + } else { + snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); + btrfs_sb(s)->bdev_holder = fs_type; + error = btrfs_fill_super(s, fs_devices, data); + } + if (error) { + deactivate_locked_super(s); + goto error_sec_opts; + } + + fs_info = btrfs_sb(s); + error = setup_security_options(fs_info, s, _sec_opts); + if (error) { + deactivate_locked_super(s); + goto error_sec_opts; + } + + return dget(s->s_root); + +error_close_devices: + btrfs_close_devices(fs_devices); +error_fs_info: + free_fs_info(fs_info); +error_sec_opts: + security_free_mnt_opts(_sec_opts); + return ERR_PTR(error); +} /* * Find a superblock for the given device / mount point. * @@ -2133,6 +2240,15 @@ static struct file_system_type btrfs_fs_type = { .kill_sb= btrfs_kill_super, .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA, }; + +static struct file_system_type btrfs_root_fs_type = { + .owner = THIS_MODULE, + .name = "btrfs", + .mount = mount_root, + .kill_sb= btrfs_kill_super, + .fs_flags = FS_REQUIRES_DEV |
[PATCH v3 0/4] btrfs: cleanup mount path
Summary: Cleanup mount path by avoiding calling btrfs_mount() twice. No functional change. See below for longer explanation. Changelog: v3: Reorganized patches again into four and added comments to the source. Each patch can be applied and compiled while maintaining functionality. The first one is the preparation and the second one is the main part. The last two are small cleanups. v2: Split the patch into three parts. Tomohiro Misono (4): btrfs: add mount_root() and new file_system_type btrfs: cleanup btrfs_mount() using mount_root() btrfs: split parse_early_options() in two btrfs: remove unused setup_root_args() fs/btrfs/super.c | 252 --- 1 file changed, 149 insertions(+), 103 deletions(-) Background Explanation: btrfs uses mount_subtree() to mount a subvolume directly. This function needs a vfsmount* of device's root (/), which is a return value of vfs_kern_mount() (therefore root has to be mounted internally anyway). Current approach of getting root's vfsmount* in mount time is a bit tricky: 0. VFS layer calls vfs_kern_mount() with registered file_system_type (for btrfs, btrfs_fs_type). btrfs_mount() is called on the way. 1. btrfs_parse_early_options() parses "subvolid=" mount option and set the value to subvol_objectid. Otherwise, subvol_objectid has the initial value of 0 2. check subvol_objectid is 5 or not. This time id is not 5, and btrfs_mount() returns by calling mount_subvol() 3. In mount_subvol(), original mount options are modified to contain "subvolid=0" in setup_root_args(). Then, vfs_kern_mount() is called with btrfs_fs_type and new options 4. btrfs_mount() is called again 5. btrfs_parse_early_options() parses "subvolid=0" and set 5 (instead of 0) to subvol_objectid 6. check subvol_objectid is 5 or not. This time id is 5 and mount_subvol() is not called. btrfs_mount() finishes mounting a root 7. (in mount_subvol()) with using a return vale of vfs_kern_mount(), it calls mount_subtree() 8 return subvolume's dentry As illustrated above, calling btrfs_mount() twice complicates the problem. We can use another file_system_type (which has different callback function to mount root) for arguments of our vfs_kern_mount() call to avoid this. In this approach: 1. parse subvol id related options for later use in mount_subvol() 2. mount device's root by calling vfs_kern_mount() with different file_system_type 3. return by calling mount_subvol() I think this approach is the same as nfsv4, which is the only other filesystem using mount_subtree() currently, and easy to understand. Most of the change is done by just reorganizing the original code of btrfs_mount()/mount_subvol() into btrfs_mount()/mount_subvol()/mount_root() btrfs_parse_early_options() is split into two parts to avoid "device=" option will be handled twice (though it cause no harm). setup_root_args() is deleted as not needed anymore. -- 2.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
AW: AW: Btrfs performance with small blocksize on SSD
Well the correct translation for "Laufzeit" is runtime and not latency. But thank you for that hint, I'll change it to "gesamt Laufzeit" to make it more clear. Best regards Carsten -Ursprüngliche Nachricht- Von: Andrei Borzenkov [mailto:arvidj...@gmail.com] Gesendet: Sonntag, 24. September 2017 18:43 An: Fuhrmann, Carsten; Qu Wenruo ; linux-btrfs@vger.kernel.org Betreff: Re: AW: Btrfs performance with small blocksize on SSD 24.09.2017 16:53, Fuhrmann, Carsten пишет: > Hello, > > 1) > I used direct write (no page cache) but I didn't disable the Disk cache of > the HDD/SSD itself. In all tests I wrote 1GB and looked for the runtime of > that write process. So "latency" on your diagram means total time to write 1GiB file? That is highly unusual meaning for "latency" which normally means time to perform single IO. If so, you should better rename Y-axis to something like "total run time". N�r��yb�X��ǧv�^�){.n�+{�n�߲)w*jg����ݢj/���z�ޖ��2�ޙ&�)ߡ�a�����G���h��j:+v���w��٥