[PATCH 57/76] fs/orangefs: Use inode_sb() helper instead of inode->i_sb

2018-05-08 Thread Mark Fasheh
Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 fs/orangefs/file.c|  2 +-
 fs/orangefs/namei.c   | 12 
 fs/orangefs/orangefs-kernel.h |  8 
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 0d228cd087e6..123a0c5b4a22 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -722,7 +722,7 @@ static int orangefs_lock(struct file *filp, int cmd, struct 
file_lock *fl)
 {
int rc = -EINVAL;
 
-   if (ORANGEFS_SB(file_inode(filp)->i_sb)->flags & 
ORANGEFS_OPT_LOCAL_LOCK) {
+   if (ORANGEFS_SB(inode_sb(file_inode(filp)))->flags & 
ORANGEFS_OPT_LOCAL_LOCK) {
if (cmd == F_GETLK) {
rc = 0;
posix_test_lock(filp, fl);
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 6e3134e6d98a..79b76f8f0ba8 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -60,7 +60,8 @@ static int orangefs_create(struct inode *dir,
ref = new_op->downcall.resp.create.refn;
op_release(new_op);
 
-   inode = orangefs_new_inode(dir->i_sb, dir, S_IFREG | mode, 0, &ref);
+   inode = orangefs_new_inode(inode_sb(dir), dir, S_IFREG | mode, 0,
+  &ref);
if (IS_ERR(inode)) {
gossip_err("%s: Failed to allocate inode for file :%pd:\n",
   __func__,
@@ -192,7 +193,8 @@ static struct dentry *orangefs_lookup(struct inode *dir, 
struct dentry *dentry,
 
orangefs_set_timeout(dentry);
 
-   inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
+   inode = orangefs_iget(inode_sb(dir),
+ &new_op->downcall.resp.lookup.refn);
if (IS_ERR(inode)) {
gossip_debug(GOSSIP_NAME_DEBUG,
"error %ld from iget\n", PTR_ERR(inode));
@@ -320,7 +322,8 @@ static int orangefs_symlink(struct inode *dir,
ref = new_op->downcall.resp.sym.refn;
op_release(new_op);
 
-   inode = orangefs_new_inode(dir->i_sb, dir, S_IFLNK | mode, 0, &ref);
+   inode = orangefs_new_inode(inode_sb(dir), dir, S_IFLNK | mode, 0,
+  &ref);
if (IS_ERR(inode)) {
gossip_err
("*** Failed to allocate orangefs symlink inode\n");
@@ -391,7 +394,8 @@ static int orangefs_mkdir(struct inode *dir, struct dentry 
*dentry, umode_t mode
ref = new_op->downcall.resp.mkdir.refn;
op_release(new_op);
 
-   inode = orangefs_new_inode(dir->i_sb, dir, S_IFDIR | mode, 0, &ref);
+   inode = orangefs_new_inode(inode_sb(dir), dir, S_IFDIR | mode, 0,
+  &ref);
if (IS_ERR(inode)) {
gossip_err("*** Failed to allocate orangefs dir inode\n");
ret = PTR_ERR(inode);
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index eebbaece85ef..c006a3f6dedd 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -325,11 +325,11 @@ static inline int is_root_handle(struct inode *inode)
gossip_debug(GOSSIP_DCACHE_DEBUG,
 "%s: root handle: %pU, this handle: %pU:\n",
 __func__,
-&ORANGEFS_SB(inode->i_sb)->root_khandle,
+&ORANGEFS_SB(inode_sb(inode))->root_khandle,
 get_khandle_from_ino(inode));
 
-   if (ORANGEFS_khandle_cmp(&(ORANGEFS_SB(inode->i_sb)->root_khandle),
-get_khandle_from_ino(inode)))
+   if (ORANGEFS_khandle_cmp(&(ORANGEFS_SB(inode_sb(inode))->root_khandle),
+get_khandle_from_ino(inode)))
return 0;
else
return 1;
@@ -513,7 +513,7 @@ int service_operation(struct orangefs_kernel_op_s *op,
  int flags);
 
 #define get_interruptible_flag(inode) \
-   ((ORANGEFS_SB(inode->i_sb)->flags & ORANGEFS_OPT_INTR) ? \
+   ((ORANGEFS_SB(inode_sb(inode))->flags & ORANGEFS_OPT_INTR) ? \
ORANGEFS_OP_INTERRUPTIBLE : 0)
 
 #define fill_default_sys_attrs(sys_attr, type, mode)   \
-- 
2.15.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 59/76] fs/proc: Use inode_sb() helper instead of inode->i_sb

2018-05-08 Thread Mark Fasheh
Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 fs/proc/array.c   |  2 +-
 fs/proc/base.c| 22 --
 fs/proc/fd.c  |  4 ++--
 fs/proc/generic.c |  2 +-
 fs/proc/namespaces.c  |  2 +-
 fs/proc/nommu.c   |  2 +-
 fs/proc/proc_sysctl.c |  4 ++--
 fs/proc/self.c|  2 +-
 fs/proc/task_mmu.c|  2 +-
 fs/proc/task_nommu.c  |  2 +-
 fs/proc/thread_self.c |  2 +-
 11 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 598803576e4c..e7a668a89caf 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -687,7 +687,7 @@ static int children_seq_show(struct seq_file *seq, void *v)
struct inode *inode = seq->private;
pid_t pid;
 
-   pid = pid_nr_ns(v, inode->i_sb->s_fs_info);
+   pid = pid_nr_ns(v, inode_sb(inode)->s_fs_info);
seq_printf(seq, "%d ", pid);
 
return 0;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9298324325ed..a39fdd56f5d9 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -698,7 +698,7 @@ static bool has_pid_permissions(struct pid_namespace *pid,
 
 static int proc_pid_permission(struct inode *inode, int mask)
 {
-   struct pid_namespace *pid = inode->i_sb->s_fs_info;
+   struct pid_namespace *pid = inode_sb(inode)->s_fs_info;
struct task_struct *task;
bool has_perms;
 
@@ -738,7 +738,7 @@ static int proc_single_show(struct seq_file *m, void *v)
struct task_struct *task;
int ret;
 
-   ns = inode->i_sb->s_fs_info;
+   ns = inode_sb(inode)->s_fs_info;
pid = proc_pid(inode);
task = get_pid_task(pid, PIDTYPE_PID);
if (!task)
@@ -1410,7 +1410,7 @@ static const struct file_operations 
proc_fail_nth_operations = {
 static int sched_show(struct seq_file *m, void *v)
 {
struct inode *inode = m->private;
-   struct pid_namespace *ns = inode->i_sb->s_fs_info;
+   struct pid_namespace *ns = inode_sb(inode)->s_fs_info;
struct task_struct *p;
 
p = get_proc_task(inode);
@@ -2065,7 +2065,7 @@ proc_map_files_instantiate(struct inode *dir, struct 
dentry *dentry,
struct proc_inode *ei;
struct inode *inode;
 
-   inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK |
+   inode = proc_pid_make_inode(inode_sb(dir), task, S_IFLNK |
((mode & FMODE_READ ) ? S_IRUSR : 0) |
((mode & FMODE_WRITE) ? S_IWUSR : 0));
if (!inode)
@@ -2327,7 +2327,7 @@ static int proc_timers_open(struct inode *inode, struct 
file *file)
return -ENOMEM;
 
tp->pid = proc_pid(inode);
-   tp->ns = inode->i_sb->s_fs_info;
+   tp->ns = inode_sb(inode)->s_fs_info;
return 0;
 }
 
@@ -2432,7 +2432,7 @@ static int proc_pident_instantiate(struct inode *dir,
struct inode *inode;
struct proc_inode *ei;
 
-   inode = proc_pid_make_inode(dir->i_sb, task, p->mode);
+   inode = proc_pid_make_inode(inode_sb(dir), task, p->mode);
if (!inode)
goto out;
 
@@ -3137,7 +3137,8 @@ static int proc_pid_instantiate(struct inode *dir,
 {
struct inode *inode;
 
-   inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | 
S_IXUGO);
+   inode = proc_pid_make_inode(inode_sb(dir), task,
+   S_IFDIR | S_IRUGO | S_IXUGO);
if (!inode)
goto out;
 
@@ -3232,7 +3233,7 @@ static struct tgid_iter next_tgid(struct pid_namespace 
*ns, struct tgid_iter ite
 int proc_pid_readdir(struct file *file, struct dir_context *ctx)
 {
struct tgid_iter iter;
-   struct pid_namespace *ns = file_inode(file)->i_sb->s_fs_info;
+   struct pid_namespace *ns = inode_sb(file_inode(file))->s_fs_info;
loff_t pos = ctx->pos;
 
if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
@@ -3435,7 +3436,8 @@ static int proc_task_instantiate(struct inode *dir,
struct dentry *dentry, struct task_struct *task, const void *ptr)
 {
struct inode *inode;
-   inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | 
S_IXUGO);
+   inode = proc_pid_make_inode(inode_sb(dir), task,
+   S_IFDIR | S_IRUGO | S_IXUGO);
 
if (!inode)
goto out;
@@ -3584,7 +3586,7 @@ static int proc_task_readdir(struct file *file, struct 
dir_context *ctx)
/* f_version caches the tgid value that the last readdir call couldn't
 * return. lseek aka telldir automagically resets f_version to 0.
 */
-   ns = inode->i_sb->s_fs_info;
+   ns = inode_sb(inode)->s_fs_info;
tid = (int)file->f_version;
file->f_version = 0;
for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 6b80cd1e419a..818f

[PATCH 60/76] fs/qnx4: Use inode_sb() helper instead of inode->i_sb

2018-05-08 Thread Mark Fasheh
Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 fs/qnx4/dir.c   | 2 +-
 fs/qnx4/inode.c | 4 ++--
 fs/qnx4/namei.c | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index a6ee23aadd28..c0e764ce79dd 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -31,7 +31,7 @@ static int qnx4_readdir(struct file *file, struct dir_context 
*ctx)
 
while (ctx->pos < inode->i_size) {
blknum = qnx4_block_map(inode, ctx->pos >> 
QNX4_BLOCK_SIZE_BITS);
-   bh = sb_bread(inode->i_sb, blknum);
+   bh = sb_bread(inode_sb(inode), blknum);
if (bh == NULL) {
printk(KERN_ERR "qnx4_readdir: bread failed (%ld)\n", 
blknum);
return 0;
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 3d46fe302fcb..2b5e5b18e084 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -60,7 +60,7 @@ static int qnx4_get_block( struct inode *inode, sector_t 
iblock, struct buffer_h
phys = qnx4_block_map( inode, iblock );
if ( phys ) {
// logical block is before EOF
-   map_bh(bh, inode->i_sb, phys);
+   map_bh(bh, inode_sb(inode), phys);
}
return 0;
 }
@@ -94,7 +94,7 @@ unsigned long qnx4_block_map( struct inode *inode, long 
iblock )
while ( --nxtnt > 0 ) {
if ( ix == 0 ) {
// read next xtnt block.
-   bh = sb_bread(inode->i_sb, i_xblk - 1);
+   bh = sb_bread(inode_sb(inode), i_xblk - 1);
if ( !bh ) {
QNX4DEBUG((KERN_ERR "qnx4: I/O error 
reading xtnt block [%ld])\n", i_xblk - 1));
return -EIO;
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index eca27878079d..3a84a8f6c6a7 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -67,7 +67,7 @@ static struct buffer_head *qnx4_find_entry(int len, struct 
inode *dir,
if (!bh) {
block = qnx4_block_map(dir, blkofs);
if (block)
-   bh = sb_bread(dir->i_sb, block);
+   bh = sb_bread(inode_sb(dir), block);
if (!bh) {
blkofs++;
continue;
@@ -113,7 +113,7 @@ struct dentry * qnx4_lookup(struct inode *dir, struct 
dentry *dentry, unsigned i
}
brelse(bh);
 
-   foundinode = qnx4_iget(dir->i_sb, ino);
+   foundinode = qnx4_iget(inode_sb(dir), ino);
if (IS_ERR(foundinode)) {
QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n",
   PTR_ERR(foundinode)));
-- 
2.15.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 61/76] fs/qnx6: Use inode_sb() helper instead of inode->i_sb

2018-05-08 Thread Mark Fasheh
Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 fs/qnx6/dir.c   | 8 
 fs/qnx6/inode.c | 4 ++--
 fs/qnx6/namei.c | 2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c
index c1cfb8a19e9d..655d0eb9d82a 100644
--- a/fs/qnx6/dir.c
+++ b/fs/qnx6/dir.c
@@ -65,7 +65,7 @@ static int qnx6_dir_longfilename(struct inode *inode,
unsigned de_inode)
 {
struct qnx6_long_filename *lf;
-   struct super_block *s = inode->i_sb;
+   struct super_block *s = inode_sb(inode);
struct qnx6_sb_info *sbi = QNX6_SB(s);
struct page *page;
int lf_size;
@@ -112,7 +112,7 @@ static int qnx6_dir_longfilename(struct inode *inode,
 static int qnx6_readdir(struct file *file, struct dir_context *ctx)
 {
struct inode *inode = file_inode(file);
-   struct super_block *s = inode->i_sb;
+   struct super_block *s = inode_sb(inode);
struct qnx6_sb_info *sbi = QNX6_SB(s);
loff_t pos = ctx->pos & ~(QNX6_DIR_ENTRY_SIZE - 1);
unsigned long npages = dir_pages(inode);
@@ -175,7 +175,7 @@ static int qnx6_readdir(struct file *file, struct 
dir_context *ctx)
 static unsigned qnx6_long_match(int len, const char *name,
struct qnx6_long_dir_entry *de, struct inode *dir)
 {
-   struct super_block *s = dir->i_sb;
+   struct super_block *s = inode_sb(dir);
struct qnx6_sb_info *sbi = QNX6_SB(s);
struct page *page;
int thislen;
@@ -213,7 +213,7 @@ static unsigned qnx6_match(struct super_block *s, int len, 
const char *name,
 unsigned qnx6_find_entry(int len, struct inode *dir, const char *name,
 struct page **res_page)
 {
-   struct super_block *s = dir->i_sb;
+   struct super_block *s = inode_sb(dir);
struct qnx6_inode_info *ei = QNX6_I(dir);
struct page *page = NULL;
unsigned long start, n;
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 4aeb26bcb4d0..4be77b89f11d 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -79,7 +79,7 @@ static int qnx6_get_block(struct inode *inode, sector_t 
iblock,
phys = qnx6_block_map(inode, iblock);
if (phys) {
/* logical block is before EOF */
-   map_bh(bh, inode->i_sb, phys);
+   map_bh(bh, inode_sb(inode), phys);
}
return 0;
 }
@@ -110,7 +110,7 @@ static int qnx6_readpages(struct file *file, struct 
address_space *mapping,
  */
 static unsigned qnx6_block_map(struct inode *inode, unsigned no)
 {
-   struct super_block *s = inode->i_sb;
+   struct super_block *s = inode_sb(inode);
struct qnx6_sb_info *sbi = QNX6_SB(s);
struct qnx6_inode_info *ei = QNX6_I(inode);
unsigned block = 0;
diff --git a/fs/qnx6/namei.c b/fs/qnx6/namei.c
index 72c2770830be..0b1a626c20d8 100644
--- a/fs/qnx6/namei.c
+++ b/fs/qnx6/namei.c
@@ -27,7 +27,7 @@ struct dentry *qnx6_lookup(struct inode *dir, struct dentry 
*dentry,
 
ino = qnx6_find_entry(len, dir, name, &page);
if (ino) {
-   foundinode = qnx6_iget(dir->i_sb, ino);
+   foundinode = qnx6_iget(inode_sb(dir), ino);
qnx6_put_page(page);
if (IS_ERR(foundinode)) {
pr_debug("lookup->iget ->  error %ld\n",
-- 
2.15.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 62/76] fs/quota: Use inode_sb() helper instead of inode->i_sb

2018-05-08 Thread Mark Fasheh
Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 fs/quota/dquot.c | 30 +++---
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 020c597ef9b6..ba6d549323cb 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -920,7 +920,7 @@ EXPORT_SYMBOL(dqget);
 
 static inline struct dquot **i_dquot(struct inode *inode)
 {
-   return inode->i_sb->s_op->get_dquots(inode);
+   return inode_sb(inode)->s_op->get_dquots(inode);
 }
 
 static int dqinit_needed(struct inode *inode, int type)
@@ -1406,7 +1406,7 @@ static int info_bdq_free(struct dquot *dquot, qsize_t 
space)
 
 static int dquot_active(const struct inode *inode)
 {
-   struct super_block *sb = inode->i_sb;
+   struct super_block *sb = inode_sb(inode);
 
if (IS_NOQUOTA(inode))
return 0;
@@ -1423,7 +1423,7 @@ static int __dquot_initialize(struct inode *inode, int 
type)
 {
int cnt, init_needed = 0;
struct dquot **dquots, *got[MAXQUOTAS] = {};
-   struct super_block *sb = inode->i_sb;
+   struct super_block *sb = inode_sb(inode);
qsize_t rsv;
int ret = 0;
 
@@ -1462,7 +1462,7 @@ static int __dquot_initialize(struct inode *inode, int 
type)
qid = make_kqid_gid(inode->i_gid);
break;
case PRJQUOTA:
-   rc = inode->i_sb->dq_op->get_projid(inode, &projid);
+   rc = inode_sb(inode)->dq_op->get_projid(inode, &projid);
if (rc)
continue;
qid = make_kqid_projid(projid);
@@ -1540,7 +1540,7 @@ bool dquot_initialize_needed(struct inode *inode)
 
dquots = i_dquot(inode);
for (i = 0; i < MAXQUOTAS; i++)
-   if (!dquots[i] && sb_has_quota_active(inode->i_sb, i))
+   if (!dquots[i] && sb_has_quota_active(inode_sb(inode), i))
return true;
return false;
 }
@@ -1603,13 +1603,13 @@ static qsize_t *inode_reserved_space(struct inode * 
inode)
 {
/* Filesystem must explicitly define it's own method in order to use
 * quota reservation interface */
-   BUG_ON(!inode->i_sb->dq_op->get_reserved_space);
-   return inode->i_sb->dq_op->get_reserved_space(inode);
+   BUG_ON(!inode_sb(inode)->dq_op->get_reserved_space);
+   return inode_sb(inode)->dq_op->get_reserved_space(inode);
 }
 
 static qsize_t __inode_get_rsv_space(struct inode *inode)
 {
-   if (!inode->i_sb->dq_op->get_reserved_space)
+   if (!inode_sb(inode)->dq_op->get_reserved_space)
return 0;
return *inode_reserved_space(inode);
 }
@@ -1618,7 +1618,7 @@ static qsize_t inode_get_rsv_space(struct inode *inode)
 {
qsize_t ret;
 
-   if (!inode->i_sb->dq_op->get_reserved_space)
+   if (!inode_sb(inode)->dq_op->get_reserved_space)
return 0;
spin_lock(&inode->i_lock);
ret = __inode_get_rsv_space(inode);
@@ -1955,8 +1955,8 @@ int __dquot_transfer(struct inode *inode, struct dquot 
**transfer_to)
if (IS_NOQUOTA(inode))
return 0;
 
-   if (inode->i_sb->dq_op->get_inode_usage) {
-   ret = inode->i_sb->dq_op->get_inode_usage(inode, &inode_usage);
+   if (inode_sb(inode)->dq_op->get_inode_usage) {
+   ret = inode_sb(inode)->dq_op->get_inode_usage(inode, 
&inode_usage);
if (ret)
return ret;
}
@@ -1988,7 +1988,7 @@ int __dquot_transfer(struct inode *inode, struct dquot 
**transfer_to)
if (!transfer_to[cnt])
continue;
/* Avoid races with quotaoff() */
-   if (!sb_has_quota_active(inode->i_sb, cnt))
+   if (!sb_has_quota_active(inode_sb(inode), cnt))
continue;
is_valid[cnt] = 1;
transfer_from[cnt] = i_dquot(inode)[cnt];
@@ -2070,7 +2070,7 @@ int dquot_transfer(struct inode *inode, struct iattr 
*iattr)
 {
struct dquot *transfer_to[MAXQUOTAS] = {};
struct dquot *dquot;
-   struct super_block *sb = inode->i_sb;
+   struct super_block *sb = inode_sb(inode);
int ret;
 
if (!dquot_active(inode))
@@ -2302,7 +2302,7 @@ static int vfs_load_quota_inode(struct inode *inode, int 
type, int format_id,
unsigned int flags)
 {
struct quota_format_type *fmt = find_quota_format(format_id);
-   struct super_block *sb = inode->i_sb;
+   struct super_block *sb = inode_sb(inode);
struct quota_info *dqopt = sb_dqopt(sb);
int error;
 
@@ -2464,7 +2464,7 @@ EXPORT_SYMBOL(dquot_quota_on);
 int dquot_enable(struct inode *inode, int type, int format_id,
 unsigned int flags)
 {
-   

[PATCH 64/76] fs/read: Use inode_sb() helper instead of inode->i_sb

2018-05-08 Thread Mark Fasheh
Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 fs/read_write.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/read_write.c b/fs/read_write.c
index f8547b82dfb3..cf9900707558 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -146,7 +146,7 @@ loff_t generic_file_llseek(struct file *file, loff_t 
offset, int whence)
struct inode *inode = file->f_mapping->host;
 
return generic_file_llseek_size(file, offset, whence,
-   inode->i_sb->s_maxbytes,
+   inode_sb(inode)->s_maxbytes,
i_size_read(inode));
 }
 EXPORT_SYMBOL(generic_file_llseek);
@@ -1389,7 +1389,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t 
*ppos,
goto fput_out;
 
if (!max)
-   max = min(in_inode->i_sb->s_maxbytes, 
out_inode->i_sb->s_maxbytes);
+   max = min(inode_sb(in_inode)->s_maxbytes,
+ inode_sb(out_inode)->s_maxbytes);
 
if (unlikely(pos + count > max)) {
retval = -EOVERFLOW;
@@ -1549,7 +1550,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t 
pos_in,
return -EBADF;
 
/* this could be relaxed once a method supports cross-fs copies */
-   if (inode_in->i_sb != inode_out->i_sb)
+   if (inode_sb(inode_in) != inode_sb(inode_out))
return -EXDEV;
 
if (len == 0)
@@ -1694,7 +1695,7 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, 
loff_t pos_in,
   struct inode *inode_out, loff_t pos_out,
   u64 *len, bool is_dedupe)
 {
-   loff_t bs = inode_out->i_sb->s_blocksize;
+   loff_t bs = inode_sb(inode_out)->s_blocksize;
loff_t blen;
loff_t isize;
bool same_inode = (inode_in == inode_out);
@@ -1808,7 +1809,7 @@ int vfs_clone_file_range(struct file *file_in, loff_t 
pos_in,
 * the same mount. Practically, they only need to be on the same file
 * system.
 */
-   if (inode_in->i_sb != inode_out->i_sb)
+   if (inode_sb(inode_in) != inode_sb(inode_out))
return -EXDEV;
 
if (!(file_in->f_mode & FMODE_READ) ||
-- 
2.15.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 67/76] fs/squashfs: Use inode_sb() helper instead of inode->i_sb

2018-05-08 Thread Mark Fasheh
Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 fs/squashfs/dir.c | 26 ++
 fs/squashfs/export.c  |  2 +-
 fs/squashfs/file.c| 34 ++
 fs/squashfs/file_cache.c  |  5 +++--
 fs/squashfs/file_direct.c |  9 +
 fs/squashfs/inode.c   |  2 +-
 fs/squashfs/namei.c   | 28 
 fs/squashfs/symlink.c |  2 +-
 fs/squashfs/xattr.c   |  4 ++--
 9 files changed, 61 insertions(+), 51 deletions(-)

diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index a5845f94a2a1..c184017e4e70 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -110,7 +110,7 @@ static int get_dir_index_using_offset(struct super_block 
*sb,
 static int squashfs_readdir(struct file *file, struct dir_context *ctx)
 {
struct inode *inode = file_inode(file);
-   struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+   struct squashfs_sb_info *msblk = inode_sb(inode)->s_fs_info;
u64 block = squashfs_i(inode)->start + msblk->directory_table;
int offset = squashfs_i(inode)->offset, length, err;
unsigned int inode_number, dir_count, size, type;
@@ -154,18 +154,18 @@ static int squashfs_readdir(struct file *file, struct 
dir_context *ctx)
ctx->pos += size;
}
 
-   length = get_dir_index_using_offset(inode->i_sb, &block, &offset,
-   squashfs_i(inode)->dir_idx_start,
-   squashfs_i(inode)->dir_idx_offset,
-   squashfs_i(inode)->dir_idx_cnt,
-   ctx->pos);
+   length = get_dir_index_using_offset(inode_sb(inode), &block, &offset,
+   squashfs_i(inode)->dir_idx_start,
+   squashfs_i(inode)->dir_idx_offset,
+   squashfs_i(inode)->dir_idx_cnt,
+   ctx->pos);
 
while (length < i_size_read(inode)) {
/*
 * Read directory header
 */
-   err = squashfs_read_metadata(inode->i_sb, &dirh, &block,
-   &offset, sizeof(dirh));
+   err = squashfs_read_metadata(inode_sb(inode), &dirh, &block,
+&offset, sizeof(dirh));
if (err < 0)
goto failed_read;
 
@@ -180,8 +180,9 @@ static int squashfs_readdir(struct file *file, struct 
dir_context *ctx)
/*
 * Read directory entry.
 */
-   err = squashfs_read_metadata(inode->i_sb, dire, &block,
-   &offset, sizeof(*dire));
+   err = squashfs_read_metadata(inode_sb(inode), dire,
+&block,
+&offset, sizeof(*dire));
if (err < 0)
goto failed_read;
 
@@ -191,8 +192,9 @@ static int squashfs_readdir(struct file *file, struct 
dir_context *ctx)
if (size > SQUASHFS_NAME_LEN)
goto failed_read;
 
-   err = squashfs_read_metadata(inode->i_sb, dire->name,
-   &block, &offset, size);
+   err = squashfs_read_metadata(inode_sb(inode),
+dire->name,
+&block, &offset, size);
if (err < 0)
goto failed_read;
 
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 8073b6532cf0..ce7615f66d5c 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -113,7 +113,7 @@ static struct dentry *squashfs_get_parent(struct dentry 
*child)
struct inode *inode = d_inode(child);
unsigned int parent_ino = squashfs_i(inode)->parent;
 
-   return squashfs_export_iget(inode->i_sb, parent_ino);
+   return squashfs_export_iget(inode_sb(inode), parent_ino);
 }
 
 
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 13d80947bf9e..afad108e0d36 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -61,7 +61,7 @@ static struct meta_index *locate_meta_index(struct inode 
*inode, int offset,
int index)
 {
struct meta_index *meta = NULL;
-   struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+   struct squashfs_sb_info *msblk = inode_sb(inode)->s_fs_info;
int i;
 
mutex_lock(>meta_index_mutex);
@@ -99,7 +99,7 @@ static struct meta_index *locate_meta_index(struct inode 
*inode, int offset,
 static struct meta_index *empty_meta_index(struct inode *inode, int offset,
int skip)
 {
-   struct squ

[PATCH 63/76] fs/ramfs: Use inode_sb() helper instead of inode->i_sb

2018-05-08 Thread Mark Fasheh
Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 fs/ramfs/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 11201b2d06b9..57b78ae51ed1 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -101,7 +101,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
 static int
 ramfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
 {
-   struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev);
+   struct inode * inode = ramfs_get_inode(inode_sb(dir), dir, mode, dev);
int error = -ENOSPC;
 
if (inode) {
@@ -131,7 +131,7 @@ static int ramfs_symlink(struct inode * dir, struct dentry 
*dentry, const char *
struct inode *inode;
int error = -ENOSPC;
 
-   inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
+   inode = ramfs_get_inode(inode_sb(dir), dir, S_IFLNK|S_IRWXUGO, 0);
if (inode) {
int l = strlen(symname)+1;
error = page_symlink(inode, symname, l);
-- 
2.15.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 68/76] fs/sysv: Use inode_sb() helper instead of inode->i_sb

2018-05-08 Thread Mark Fasheh
Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 fs/sysv/dir.c| 12 ++--
 fs/sysv/ialloc.c |  8 
 fs/sysv/inode.c  |  6 +++---
 fs/sysv/itree.c  | 29 +++--
 fs/sysv/namei.c  |  4 ++--
 5 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 88e38cd8f5c9..84a11fda6a28 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -65,7 +65,7 @@ static int sysv_readdir(struct file *file, struct dir_context 
*ctx)
 {
unsigned long pos = ctx->pos;
struct inode *inode = file_inode(file);
-   struct super_block *sb = inode->i_sb;
+   struct super_block *sb = inode_sb(inode);
unsigned long npages = dir_pages(inode);
unsigned offset;
unsigned long n;
@@ -214,7 +214,7 @@ int sysv_add_link(struct dentry *dentry, struct inode 
*inode)
goto out_unlock;
memcpy (de->name, name, namelen);
memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2);
-   de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
+   de->inode = cpu_to_fs16(SYSV_SB(inode_sb(inode)), inode->i_ino);
err = dir_commit_chunk(page, pos, SYSV_DIRSIZE);
dir->i_mtime = dir->i_ctime = current_time(dir);
mark_inode_dirty(dir);
@@ -265,10 +265,10 @@ int sysv_make_empty(struct inode *inode, struct inode 
*dir)
memset(base, 0, PAGE_SIZE);
 
de = (struct sysv_dir_entry *) base;
-   de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
+   de->inode = cpu_to_fs16(SYSV_SB(inode_sb(inode)), inode->i_ino);
strcpy(de->name,".");
de++;
-   de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), dir->i_ino);
+   de->inode = cpu_to_fs16(SYSV_SB(inode_sb(inode)), dir->i_ino);
strcpy(de->name,"..");
 
kunmap(page);
@@ -283,7 +283,7 @@ int sysv_make_empty(struct inode *inode, struct inode *dir)
  */
 int sysv_empty_dir(struct inode * inode)
 {
-   struct super_block *sb = inode->i_sb;
+   struct super_block *sb = inode_sb(inode);
struct page *page = NULL;
unsigned long i, npages = dir_pages(inode);
 
@@ -335,7 +335,7 @@ void sysv_set_link(struct sysv_dir_entry *de, struct page 
*page,
lock_page(page);
err = sysv_prepare_chunk(page, pos, SYSV_DIRSIZE);
BUG_ON(err);
-   de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino);
+   de->inode = cpu_to_fs16(SYSV_SB(inode_sb(inode)), inode->i_ino);
err = dir_commit_chunk(page, pos, SYSV_DIRSIZE);
dir_put_page(page);
dir->i_mtime = dir->i_ctime = current_time(dir);
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 6c9801986af6..2515367bf047 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -100,14 +100,14 @@ static int refill_free_cache(struct super_block *sb)
 
 void sysv_free_inode(struct inode * inode)
 {
-   struct super_block *sb = inode->i_sb;
+   struct super_block *sb = inode_sb(inode);
struct sysv_sb_info *sbi = SYSV_SB(sb);
unsigned int ino;
struct buffer_head * bh;
struct sysv_inode * raw_inode;
unsigned count;
 
-   sb = inode->i_sb;
+   sb = inode_sb(inode);
ino = inode->i_ino;
if (ino <= SYSV_ROOT_INO || ino > sbi->s_ninodes) {
printk("sysv_free_inode: inode 0,1,2 or nonexistent inode\n");
@@ -116,7 +116,7 @@ void sysv_free_inode(struct inode * inode)
raw_inode = sysv_raw_inode(sb, ino, &bh);
if (!raw_inode) {
printk("sysv_free_inode: unable to read inode block on device "
-  "%s\n", inode->i_sb->s_id);
+  "%s\n", inode_sb(inode)->s_id);
return;
}
mutex_lock(>s_lock);
@@ -135,7 +135,7 @@ void sysv_free_inode(struct inode * inode)
 
 struct inode * sysv_new_inode(const struct inode * dir, umode_t mode)
 {
-   struct super_block *sb = dir->i_sb;
+   struct super_block *sb = inode_sb(dir);
struct sysv_sb_info *sbi = SYSV_SB(sb);
struct inode *inode;
sysv_ino_t ino;
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index bec9f79adb25..9d04a4a2c248 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -192,7 +192,7 @@ struct inode *sysv_iget(struct super_block *sb, unsigned 
int ino)
raw_inode = sysv_raw_inode(sb, ino, &bh);
if (!raw_inode) {
printk("Major problem: unable to read inode from dev %s\n",
-  inode->i_sb->s_id);
+  inode_sb(inode)->s_id);
goto bad_inode;
}
/* SystemV FS: kludge permissions if ino==SYSV_ROOT_INO ?? */
@@ -230,7 +230,7 @@ struct inode *sysv_iget(struct super_block *sb, unsigned 
int ino)
 
 static int __sysv_write_inode(s

[PATCH 66/76] fs/romfs: Use inode_sb() helper instead of inode->i_sb

2018-05-08 Thread Mark Fasheh
Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 fs/romfs/mmap-nommu.c |  4 ++--
 fs/romfs/super.c  | 24 +---
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index 1118a0dc6b45..0dbf9be30283 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -26,7 +26,7 @@ static unsigned long romfs_get_unmapped_area(struct file 
*file,
 unsigned long flags)
 {
struct inode *inode = file->f_mapping->host;
-   struct mtd_info *mtd = inode->i_sb->s_mtd;
+   struct mtd_info *mtd = inode_sb(inode)->s_mtd;
unsigned long isize, offset, maxpages, lpages;
int ret;
 
@@ -72,7 +72,7 @@ static int romfs_mmap(struct file *file, struct 
vm_area_struct *vma)
 
 static unsigned romfs_mmap_capabilities(struct file *file)
 {
-   struct mtd_info *mtd = file_inode(file)->i_sb->s_mtd;
+   struct mtd_info *mtd = inode_sb(file_inode(file))->s_mtd;
 
if (!mtd)
return NOMMU_MAP_COPY;
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 8f06fd1f3d69..eb0b7d3775bb 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -122,7 +122,7 @@ static int romfs_readpage(struct file *file, struct page 
*page)
 
pos = ROMFS_I(inode)->i_dataoffset + offset;
 
-   ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize);
+   ret = romfs_dev_read(inode_sb(inode), pos, buf, fillsize);
if (ret < 0) {
SetPageError(page);
fillsize = 0;
@@ -157,12 +157,12 @@ static int romfs_readdir(struct file *file, struct 
dir_context *ctx)
char fsname[ROMFS_MAXFN];   /* XXX dynamic? */
int ret;
 
-   maxoff = romfs_maxsize(i->i_sb);
+   maxoff = romfs_maxsize(inode_sb(i));
 
offset = ctx->pos;
if (!offset) {
offset = i->i_ino & ROMFH_MASK;
-   ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
+   ret = romfs_dev_read(inode_sb(i), offset, &ri, ROMFH_SIZE);
if (ret < 0)
goto out;
offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
@@ -178,16 +178,17 @@ static int romfs_readdir(struct file *file, struct 
dir_context *ctx)
ctx->pos = offset;
 
/* Fetch inode info */
-   ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
+   ret = romfs_dev_read(inode_sb(i), offset, &ri, ROMFH_SIZE);
if (ret < 0)
goto out;
 
-   j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE,
+   j = romfs_dev_strnlen(inode_sb(i), offset + ROMFH_SIZE,
  sizeof(fsname) - 1);
if (j < 0)
goto out;
 
-   ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j);
+   ret = romfs_dev_read(inode_sb(i), offset + ROMFH_SIZE, fsname,
+j);
if (ret < 0)
goto out;
fsname[j] = '\0';
@@ -219,13 +220,13 @@ static struct dentry *romfs_lookup(struct inode *dir, 
struct dentry *dentry,
int len, ret;
 
offset = dir->i_ino & ROMFH_MASK;
-   ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE);
+   ret = romfs_dev_read(inode_sb(dir), offset, &ri, ROMFH_SIZE);
if (ret < 0)
goto error;
 
/* search all the file entries in the list starting from the one
 * pointed to by the directory's special data */
-   maxoff = romfs_maxsize(dir->i_sb);
+   maxoff = romfs_maxsize(inode_sb(dir));
offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
 
name = dentry->d_name.name;
@@ -235,12 +236,13 @@ static struct dentry *romfs_lookup(struct inode *dir, 
struct dentry *dentry,
if (!offset || offset >= maxoff)
goto out0;
 
-   ret = romfs_dev_read(dir->i_sb, offset, , sizeof(ri));
+   ret = romfs_dev_read(inode_sb(dir), offset, , sizeof(ri));
if (ret < 0)
goto error;
 
/* try to match the first 16 bytes of name */
-   ret = romfs_dev_strcmp(dir->i_sb, offset + ROMFH_SIZE, name,
+   ret = romfs_dev_strcmp(inode_sb(dir), offset + ROMFH_SIZE,
+  name,
   len);
if (ret < 0)
goto error;
@@ -255,7 +257,7 @@ static struct dentry *romfs_lookup(struct inode *dir, 
struct dentry *dentry,
if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
 
-   inode = romfs_iget(dir->i_sb, offset);
+ 

[PATCH 69/76] fs/ubifs: Use inode_sb() helper instead of inode->i_sb

2018-05-08 Thread Mark Fasheh
Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 fs/ubifs/crypto.c |  4 ++--
 fs/ubifs/dir.c| 30 +++---
 fs/ubifs/file.c   | 42 +-
 fs/ubifs/ioctl.c  |  4 ++--
 fs/ubifs/super.c  |  4 ++--
 fs/ubifs/xattr.c  | 10 +-
 6 files changed, 47 insertions(+), 47 deletions(-)

diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 616a688f5d8f..3c8122065ed5 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -35,7 +35,7 @@ static unsigned int ubifs_crypt_max_namelen(struct inode 
*inode)
 int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn,
  unsigned int in_len, unsigned int *out_len, int block)
 {
-   struct ubifs_info *c = inode->i_sb->s_fs_info;
+   struct ubifs_info *c = inode_sb(inode)->s_fs_info;
void *p = >data;
struct page *ret;
unsigned int pad_len = round_up(in_len, UBIFS_CIPHER_BLOCK_SIZE);
@@ -61,7 +61,7 @@ int ubifs_encrypt(const struct inode *inode, struct 
ubifs_data_node *dn,
 int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn,
  unsigned int *out_len, int block)
 {
-   struct ubifs_info *c = inode->i_sb->s_fs_info;
+   struct ubifs_info *c = inode_sb(inode)->s_fs_info;
int err;
unsigned int clen = le16_to_cpu(dn->compr_size);
unsigned int dlen = *out_len;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 9d7fb88e172e..6d168f5cc8ff 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -215,7 +215,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, 
struct dentry *dentry,
union ubifs_key key;
struct inode *inode = NULL;
struct ubifs_dent_node *dent;
-   struct ubifs_info *c = dir->i_sb->s_fs_info;
+   struct ubifs_info *c = inode_sb(dir)->s_fs_info;
struct fscrypt_name nm;
 
dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino);
@@ -262,7 +262,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, 
struct dentry *dentry,
goto out_dent;
}
 
-   inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum));
+   inode = ubifs_iget(inode_sb(dir), le64_to_cpu(dent->inum));
if (IS_ERR(inode)) {
/*
 * This should not happen. Probably the file-system needs
@@ -307,7 +307,7 @@ static int ubifs_create(struct inode *dir, struct dentry 
*dentry, umode_t mode,
bool excl)
 {
struct inode *inode;
-   struct ubifs_info *c = dir->i_sb->s_fs_info;
+   struct ubifs_info *c = inode_sb(dir)->s_fs_info;
struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
.dirtied_ino = 1 };
struct ubifs_inode *dir_ui = ubifs_inode(dir);
@@ -376,7 +376,7 @@ static int do_tmpfile(struct inode *dir, struct dentry 
*dentry,
  umode_t mode, struct inode **whiteout)
 {
struct inode *inode;
-   struct ubifs_info *c = dir->i_sb->s_fs_info;
+   struct ubifs_info *c = inode_sb(dir)->s_fs_info;
struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1};
struct ubifs_budget_req ino_req = { .dirtied_ino = 1 };
struct ubifs_inode *ui, *dir_ui = ubifs_inode(dir);
@@ -525,7 +525,7 @@ static int ubifs_readdir(struct file *file, struct 
dir_context *ctx)
union ubifs_key key;
struct ubifs_dent_node *dent;
struct inode *dir = file_inode(file);
-   struct ubifs_info *c = dir->i_sb->s_fs_info;
+   struct ubifs_info *c = inode_sb(dir)->s_fs_info;
bool encrypted = ubifs_crypt_is_encrypted(dir);
 
dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos);
@@ -712,7 +712,7 @@ static void unlock_2_inodes(struct inode *inode1, struct 
inode *inode2)
 static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
  struct dentry *dentry)
 {
-   struct ubifs_info *c = dir->i_sb->s_fs_info;
+   struct ubifs_info *c = inode_sb(dir)->s_fs_info;
struct inode *inode = d_inode(old_dentry);
struct ubifs_inode *ui = ubifs_inode(inode);
struct ubifs_inode *dir_ui = ubifs_inode(dir);
@@ -786,7 +786,7 @@ static int ubifs_link(struct dentry *old_dentry, struct 
inode *dir,
 
 static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
 {
-   struct ubifs_info *c = dir->i_sb->s_fs_info;
+   struct ubifs_info *c = inode_sb(dir)->s_fs_info;
struct inode *inode = d_inode(dentry);
struct ubifs_inode *dir_ui = ubifs_inode(dir);
int err, sz_change, budgeted = 1;
@@ -873,7 +873,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry 
*dentry)
  */
 int ubifs_check_dir_empty(struct inode *dir)
 {
-   struct ubifs_info *c = dir->i_sb->s_fs_info;
+   struct ubifs_info *c = inode_sb(dir)->s_fs_info;

[PATCH 70/76] fs/udf: Use inode_sb() helper instead of inode->i_sb

2018-05-08 Thread Mark Fasheh
Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 fs/udf/dir.c   |   2 +-
 fs/udf/directory.c |  30 
 fs/udf/file.c  |   6 +-
 fs/udf/ialloc.c|  24 +++---
 fs/udf/inode.c | 209 +++--
 fs/udf/misc.c  |   4 +-
 fs/udf/namei.c |  76 ++-
 fs/udf/partition.c |   2 +-
 fs/udf/super.c |   2 +-
 fs/udf/symlink.c   |   7 +-
 fs/udf/truncate.c  |  26 +++
 11 files changed, 199 insertions(+), 189 deletions(-)

diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index c19dba45aa20..ef5b632da782 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -57,7 +57,7 @@ static int udf_readdir(struct file *file, struct dir_context 
*ctx)
sector_t offset;
int i, num, ret = 0;
struct extent_position epos = { NULL, 0, {0, 0} };
-   struct super_block *sb = dir->i_sb;
+   struct super_block *sb = inode_sb(dir);
 
if (ctx->pos == 0) {
if (!dir_emit_dot(file, ctx))
diff --git a/fs/udf/directory.c b/fs/udf/directory.c
index 0a98a2369738..d5d490eaba6c 100644
--- a/fs/udf/directory.c
+++ b/fs/udf/directory.c
@@ -38,7 +38,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, 
loff_t *nf_pos,
   (iinfo->i_efe ?
sizeof(struct extendedFileEntry) :
sizeof(struct fileEntry)),
-  dir->i_sb->s_blocksize,
+  inode_sb(dir)->s_blocksize,
   &(fibh->eoffset));
if (!fi)
return NULL;
@@ -51,15 +51,15 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, 
loff_t *nf_pos,
return fi;
}
 
-   if (fibh->eoffset == dir->i_sb->s_blocksize) {
+   if (fibh->eoffset == inode_sb(dir)->s_blocksize) {
uint32_t lextoffset = epos->offset;
-   unsigned char blocksize_bits = dir->i_sb->s_blocksize_bits;
+   unsigned char blocksize_bits = inode_sb(dir)->s_blocksize_bits;
 
if (udf_next_aext(dir, epos, eloc, elen, 1) !=
(EXT_RECORDED_ALLOCATED >> 30))
return NULL;
 
-   block = udf_get_lb_pblock(dir->i_sb, eloc, *offset);
+   block = udf_get_lb_pblock(inode_sb(dir), eloc, *offset);
 
(*offset)++;
 
@@ -69,7 +69,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, 
loff_t *nf_pos,
epos->offset = lextoffset;
 
brelse(fibh->sbh);
-   fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block);
+   fibh->sbh = fibh->ebh = udf_tread(inode_sb(dir), block);
if (!fibh->sbh)
return NULL;
fibh->soffset = fibh->eoffset = 0;
@@ -79,9 +79,9 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, 
loff_t *nf_pos,
if (i + *offset > (*elen >> blocksize_bits))
i = (*elen >> blocksize_bits)-*offset;
for (num = 0; i > 0; i--) {
-   block = udf_get_lb_pblock(dir->i_sb, eloc,
+   block = udf_get_lb_pblock(inode_sb(dir), eloc,
  *offset + i);
-   tmp = udf_tgetblk(dir->i_sb, block);
+   tmp = udf_tgetblk(inode_sb(dir), block);
if (tmp && !buffer_uptodate(tmp) &&
!buffer_locked(tmp))
bha[num++] = tmp;
@@ -99,7 +99,7 @@ struct fileIdentDesc *udf_fileident_read(struct inode *dir, 
loff_t *nf_pos,
fibh->sbh = fibh->ebh;
}
 
-   fi = udf_get_fileident(fibh->sbh->b_data, dir->i_sb->s_blocksize,
+   fi = udf_get_fileident(fibh->sbh->b_data, inode_sb(dir)->s_blocksize,
   &(fibh->eoffset));
 
if (!fi)
@@ -107,29 +107,29 @@ struct fileIdentDesc *udf_fileident_read(struct inode 
*dir, loff_t *nf_pos,
 
*nf_pos += fibh->eoffset - fibh->soffset;
 
-   if (fibh->eoffset <= dir->i_sb->s_blocksize) {
+   if (fibh->eoffset <= inode_sb(dir)->s_blocksize) {
memcpy((uint8_t *)cfi, (uint8_t *)fi,
   sizeof(struct fileIdentDesc));
-   } else if (fibh->eoffset > dir->i_sb->s_blocksize) {
+   } else if (fibh->eoffset > inode_sb(dir)->s_blocksize) {
uint32_t lextoffset = epos->offset;
 
if (udf_next_aext(dir, epos, eloc, elen, 1) !=

[PATCH 71/76] fs/ufs: Use inode_sb() helper instead of inode->i_sb

2018-05-08 Thread Mark Fasheh
Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 fs/ufs/balloc.c | 23 ---
 fs/ufs/dir.c| 34 +-
 fs/ufs/ialloc.c |  4 ++--
 fs/ufs/inode.c  | 37 +++--
 fs/ufs/namei.c  |  6 +++---
 5 files changed, 53 insertions(+), 51 deletions(-)

diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index e727ee07dbe4..b6053d9c0e64 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -45,7 +45,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, 
unsigned count)
unsigned cgno, bit, end_bit, bbase, blkmap, i;
u64 blkno;

-   sb = inode->i_sb;
+   sb = inode_sb(inode);
uspi = UFS_SB(sb)->s_uspi;

UFSD("ENTER, fragment %llu, count %u\n",
@@ -141,7 +141,7 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, 
unsigned count)
unsigned overflow, cgno, bit, end_bit, i;
u64 blkno;

-   sb = inode->i_sb;
+   sb = inode_sb(inode);
uspi = UFS_SB(sb)->s_uspi;
 
UFSD("ENTER, fragment %llu, count %u\n",
@@ -268,7 +268,7 @@ static void ufs_change_blocknr(struct inode *inode, 
sector_t beg,
if (!page)/* it was truncated */
continue;
if (IS_ERR(page)) {/* or EIO */
-   ufs_error(inode->i_sb, __func__,
+   ufs_error(inode_sb(inode), __func__,
  "read of page %llu failed\n",
  (unsigned long long)index);
continue;
@@ -294,12 +294,13 @@ static void ufs_change_blocknr(struct inode *inode, 
sector_t beg,
pos = (i - beg) + j;
 
if (!buffer_mapped(bh))
-   map_bh(bh, inode->i_sb, oldb + pos);
+   map_bh(bh, inode_sb(inode),
+  oldb + pos);
if (!buffer_uptodate(bh)) {
ll_rw_block(REQ_OP_READ, 0, 1, );
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
-   ufs_error(inode->i_sb, __func__,
+   ufs_error(inode_sb(inode), __func__,
  "read of block failed\n");
break;
}
@@ -329,9 +330,9 @@ static void ufs_clear_frags(struct inode *inode, sector_t 
beg, unsigned int n,
sector_t end = beg + n;
 
for (; beg < end; ++beg) {
-   bh = sb_getblk(inode->i_sb, beg);
+   bh = sb_getblk(inode_sb(inode), beg);
lock_buffer(bh);
-   memset(bh->b_data, 0, inode->i_sb->s_blocksize);
+   memset(bh->b_data, 0, inode_sb(inode)->s_blocksize);
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
unlock_buffer(bh);
@@ -355,7 +356,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 
fragment,
 inode->i_ino, (unsigned long long)fragment,
 (unsigned long long)goal, count);

-   sb = inode->i_sb;
+   sb = inode_sb(inode);
uspi = UFS_SB(sb)->s_uspi;
usb1 = ubh_get_usb_first(uspi);
*err = -ENOSPC;
@@ -517,7 +518,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 
fragment,
UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n",
 (unsigned long long)fragment, oldcount, newcount);

-   sb = inode->i_sb;
+   sb = inode_sb(inode);
uspi = UFS_SB(sb)->s_uspi;
count = newcount - oldcount;

@@ -597,7 +598,7 @@ static u64 ufs_alloc_fragments(struct inode *inode, 
unsigned cgno,
UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n",
 inode->i_ino, cgno, (unsigned long long)goal, count);
 
-   sb = inode->i_sb;
+   sb = inode_sb(inode);
uspi = UFS_SB(sb)->s_uspi;
oldcg = cgno;

@@ -708,7 +709,7 @@ static u64 ufs_alloccg_block(struct inode *inode,
 
UFSD("ENTER, goal %llu\n", (unsigned long long)goal);
 
-   sb = inode->i_sb;
+   sb = inode_sb(inode);
uspi = UFS_SB(sb)->s_uspi;
ucg = ubh_get_ucg(UCPI_UBH(ucpi));
 
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index b721d0bda5e5..ebe0ddc8b708 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -75,7 +75,7 @@ ino_t ufs_inode_by_name(struct inode *dir, const struct qstr 
*qstr)

de = ufs_find_entry(dir, qstr, );
if (de) {
-   res = fs32_to_cpu(dir->i_sb, de->d_ino);
+   res = fs32_to_cpu(inode_sb(dir), d

[PATCH 73/76] vfs: Move s_dev to struct fs_view

2018-05-08 Thread Mark Fasheh
There are many places where a dev_t:ino_t pair is passed to userspace
to uniquely describe an inode.  Some file systems, like btrfs, have
multiple inode namespaces internally and use a separate dev_t to make the
distinction between them.

The kernel typically uses sb->s_dev for the inode device. Most filesystems
are fine with that, but btrfs needs to use the device belonging to the root
on which the inode resides. By moving s_dev into the fs_view we allow btrfs
and any similar filesystems to set this field on a per-inode basis.

This patch adds a dev_t field, v_dev, to struct fs_view and removes s_dev
from struct super_block. I also include a helper, inode_view(), to make
referencing inode->i_view fields less clunky.

Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 include/linux/fs.h | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5d4bb19b2a43..c93486505084 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1351,6 +1351,7 @@ struct sb_writers {
  */
 struct fs_view {
struct super_block  *v_sb;
+   dev_t   v_dev;  /* search index; _not_ kdev_t */
 };
 
 static inline struct super_block *inode_sb(const struct inode *inode)
@@ -1358,9 +1359,13 @@ static inline struct super_block *inode_sb(const struct 
inode *inode)
return inode->i_view->v_sb;
 }
 
+static inline struct fs_view *inode_view(const struct inode *inode)
+{
+   return inode->i_view;
+}
+
 struct super_block {
struct list_heads_list; /* Keep this first */
-   dev_t   s_dev;  /* search index; _not_ kdev_t */
unsigned char   s_blocksize_bits;
unsigned long   s_blocksize;
loff_t  s_maxbytes; /* Max file size */
-- 
2.15.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 72/76] fs/xfs: Use inode_sb() helper instead of inode->i_sb

2018-05-08 Thread Mark Fasheh
Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 fs/xfs/xfs_acl.c|  2 +-
 fs/xfs/xfs_aops.c   |  4 ++--
 fs/xfs/xfs_export.c |  4 ++--
 fs/xfs/xfs_file.c   | 10 -
 fs/xfs/xfs_ioctl.c  |  8 +++
 fs/xfs/xfs_iops.c   |  6 ++---
 fs/xfs/xfs_pnfs.c   |  2 +-
 fs/xfs/xfs_trace.h  | 64 ++---
 8 files changed, 50 insertions(+), 50 deletions(-)

diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 3354140de07e..42b00b01ea1a 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -255,7 +255,7 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int 
type)
goto set_acl;
 
error = -E2BIG;
-   if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb)))
+   if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode_sb(inode
return error;
 
if (type == ACL_TYPE_ACCESS) {
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 9c6a830da0ee..951ca9c4ed9e 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -219,7 +219,7 @@ xfs_setfilesize_trans_alloc(
 * We may pass freeze protection with a transaction.  So tell lockdep
 * we released it.
 */
-   __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
+   __sb_writers_release(inode_sb(ioend->io_inode), SB_FREEZE_FS);
/*
 * We hand off the transaction to the completion thread now, so
 * clear the flag here.
@@ -288,7 +288,7 @@ xfs_setfilesize_ioend(
 * Similarly for freeze protection.
 */
current_set_flags_nested(>t_pflags, PF_MEMALLOC_NOFS);
-   __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
+   __sb_writers_acquired(inode_sb(VFS_I(ip)), SB_FREEZE_FS);
 
/* we abort the update if there was an IO error */
if (error) {
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index fe1bfee35898..a78f6eb9987b 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -78,8 +78,8 @@ xfs_fs_encode_fh(
 * large enough filesystem may contain them, thus the slightly
 * confusing looking conditional below.
 */
-   if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
-   (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
+   if (!(XFS_M(inode_sb(inode))->m_flags & XFS_MOUNT_SMALL_INUMS) ||
+   (XFS_M(inode_sb(inode))->m_flags & XFS_MOUNT_32BITINODES))
fileid_type |= XFS_FILEID_TYPE_64FLAG;
 
/*
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 9ea08326f876..7b805a8a031e 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -821,7 +821,7 @@ xfs_file_fallocate(
}
 
/* check the new inode size does not wrap through zero */
-   if (new_size > inode->i_sb->s_maxbytes) {
+   if (new_size > inode_sb(inode)->s_maxbytes) {
error = -EFBIG;
goto out_unlock;
}
@@ -926,7 +926,7 @@ xfs_file_open(
 {
if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
return -EFBIG;
-   if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
+   if (XFS_FORCED_SHUTDOWN(XFS_M(inode_sb(inode
return -EIO;
file->f_mode |= FMODE_NOWAIT;
return 0;
@@ -1014,7 +1014,7 @@ xfs_file_llseek(
 
if (offset < 0)
return offset;
-   return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+   return vfs_setpos(file, offset, inode_sb(inode)->s_maxbytes);
 }
 
 /*
@@ -1040,7 +1040,7 @@ __xfs_filemap_fault(
trace_xfs_filemap_fault(ip, pe_size, write_fault);
 
if (write_fault) {
-   sb_start_pagefault(inode->i_sb);
+   sb_start_pagefault(inode_sb(inode));
file_update_time(vmf->vma->vm_file);
}
 
@@ -1060,7 +1060,7 @@ __xfs_filemap_fault(
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
if (write_fault)
-   sb_end_pagefault(inode->i_sb);
+   sb_end_pagefault(inode_sb(inode));
return ret;
 }
 
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 89fb1eb80aae..8e492a123815 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -98,7 +98,7 @@ xfs_find_handle(
 * and only for regular files, directories or symbolic links.
 */
error = -EINVAL;
-   if (inode->i_sb->s_magic != XFS_SB_MAGIC)
+   if (inode_sb(inode)->s_magic != XFS_SB_MAGIC)
goto out_put;
 
error = -EBADF;
@@ -688,9 +688,9 @@ xfs_ioc_space(
}
 
if (bf->l_start < 0 ||
-   bf->l_start > inode->i_sb->s_maxbytes ||
+   bf->l_start > inode_sb(inode)->s_maxbytes ||
bf->l_start + bf->l_len < 0 ||
-   bf->l_s

[PATCH 74/76] fs: Use fs_view device from struct inode.

2018-05-08 Thread Mark Fasheh
Replace calls of inode_sb(inode)->s_dev with inode_view(inode)->v_dev.

Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 arch/arc/kernel/troubleshoot.c |   2 +-
 drivers/staging/lustre/lustre/llite/dir.c  |   2 +-
 drivers/staging/lustre/lustre/llite/file.c |   2 +-
 fs/eventpoll.c |   2 +-
 fs/f2fs/trace.c|   8 +-
 fs/fuse/dir.c  |   2 +-
 fs/locks.c |  12 +--
 fs/nfs/nfs4trace.h |  36 -
 fs/nfs/nfstrace.h  |  42 +-
 fs/nfsd/vfs.c  |   6 +-
 fs/notify/fdinfo.c |   4 +-
 fs/proc/nommu.c|   2 +-
 fs/proc/task_mmu.c |   2 +-
 fs/proc/task_nommu.c   |   2 +-
 fs/stat.c  |   2 +-
 fs/xfs/xfs_iops.c  |   2 +-
 fs/xfs/xfs_trace.h |  64 
 include/trace/events/ext4.h| 118 ++---
 include/trace/events/f2fs.h|  52 ++---
 include/trace/events/filelock.h|   8 +-
 include/trace/events/filemap.h |   6 +-
 include/trace/events/fs_dax.h  |  14 ++--
 include/trace/events/jbd2.h|   2 +-
 include/trace/events/writeback.h   |   2 +-
 kernel/audit.c |   2 +-
 kernel/audit_fsnotify.c|   2 +-
 kernel/audit_watch.c   |   4 +-
 kernel/auditsc.c   |   4 +-
 kernel/bpf/offload.c   |   4 +-
 kernel/events/core.c   |   4 +-
 mm/memory-failure.c|   2 +-
 security/tomoyo/condition.c|   2 +-
 32 files changed, 209 insertions(+), 209 deletions(-)

diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c
index 18eb4984d555..b3aa20be118e 100644
--- a/arch/arc/kernel/troubleshoot.c
+++ b/arch/arc/kernel/troubleshoot.c
@@ -104,7 +104,7 @@ static void show_faulting_vma(unsigned long address, char 
*buf)
if (file) {
nm = file_path(file, buf, PAGE_SIZE - 1);
inode = file_inode(vma->vm_file);
-   dev = inode_sb(inode)->s_dev;
+   dev = inode_view(inode)->v_dev;
ino = inode->i_ino;
}
pr_info("@off 0x%lx in [%s]\n"
diff --git a/drivers/staging/lustre/lustre/llite/dir.c 
b/drivers/staging/lustre/lustre/llite/dir.c
index 809e493b61da..cb50d04327d4 100644
--- a/drivers/staging/lustre/lustre/llite/dir.c
+++ b/drivers/staging/lustre/lustre/llite/dir.c
@@ -1364,7 +1364,7 @@ static long ll_dir_ioctl(struct file *file, unsigned int 
cmd, unsigned long arg)
struct lov_user_mds_data __user *lmdp;
lstat_t st = { 0 };
 
-   st.st_dev = inode_sb(inode)->s_dev;
+   st.st_dev = inode_view(inode)->v_dev;
st.st_mode= body->mbo_mode;
st.st_nlink   = body->mbo_nlink;
st.st_uid = body->mbo_uid;
diff --git a/drivers/staging/lustre/lustre/llite/file.c 
b/drivers/staging/lustre/lustre/llite/file.c
index 64df47bd1118..749a74e49e61 100644
--- a/drivers/staging/lustre/lustre/llite/file.c
+++ b/drivers/staging/lustre/lustre/llite/file.c
@@ -2967,7 +2967,7 @@ int ll_getattr(const struct path *path, struct kstat 
*stat,
 
OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
 
-   stat->dev = inode_sb(inode)->s_dev;
+   stat->dev = inode_view(inode)->v_dev;
if (ll_need_32bit_api(sbi))
stat->ino = cl_fid_build_ino(>lli_fid, 1);
else
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a7e3dbc83bbc..39487ae0eabd 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -955,7 +955,7 @@ static void ep_show_fdinfo(struct seq_file *m, struct file 
*f)
   epi->ffd.fd, epi->event.events,
   (long long)epi->event.data,
   (long long)epi->ffd.file->f_pos,
-  inode->i_ino, inode_sb(inode)->s_dev);
+  inode->i_ino, inode_view(inode)->v_dev);
if (seq_has_overflowed(m))
break;
}
diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
index 235a3bca1f5f..601cf0cb723e 100644
--- a/fs/f2fs/trace.c
+++ b/fs/f2fs/trace.c
@@ -74,8 +74,8 @@ void f2fs_trace_pid(struct page *page)
f2fs_radix_tree_insert(, pid, current);
 
trace_printk("%3x:%3x %4x %-16s\n",
-   MAJOR(inode_sb(inode)->s_dev),
- 

[PATCH 76/76] btrfs: Use fs_view in roots, point inodes to it

2018-05-08 Thread Mark Fasheh
Ensure that our per-subvolume anonymous dev_t gets published to userspace,
instead of the global device in struct super_block. We do this by embedding a
struct fs_view in btrfs_root. The anonymous device number is placed in
view->v_dev. Each inode is then pointed to the view embedded in its owning
root.

Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 fs/btrfs/ctree.h |  7 ++-
 fs/btrfs/disk-io.c   | 14 --
 fs/btrfs/disk-io.h   |  2 +-
 fs/btrfs/inode.c |  5 +++--
 fs/btrfs/root-tree.c |  2 +-
 5 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a3cca35642e2..cffd3aa51e93 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1216,11 +1216,6 @@ struct btrfs_root {
 * protected by inode_lock
 */
struct radix_tree_root delayed_nodes_tree;
-   /*
-* right now this just gets used so that a root has its own devid
-* for stat.  It may be used for more later
-*/
-   dev_t anon_dev;
 
spinlock_t root_item_lock;
refcount_t refs;
@@ -1262,6 +1257,8 @@ struct btrfs_root {
 
/* For qgroup metadata space reserve */
atomic64_t qgroup_meta_rsv;
+
+   struct fs_view view; /* fill in and link to inodes for vfs usage */
 };
 
 struct btrfs_file_private {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 334234da997c..c50af14b5856 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1191,7 +1191,8 @@ static void __setup_root(struct btrfs_root *root, struct 
btrfs_fs_info *fs_info,
else
root->defrag_trans_start = 0;
root->root_key.objectid = objectid;
-   root->anon_dev = 0;
+   memset(>view, 0, sizeof(struct fs_view));
+   root->view.v_sb = fs_info->sb;
 
spin_lock_init(>root_item_lock);
 }
@@ -1463,7 +1464,7 @@ struct btrfs_root *btrfs_read_fs_root(struct btrfs_root 
*tree_root,
return root;
 }
 
-int btrfs_init_fs_root(struct btrfs_root *root)
+int btrfs_init_fs_root(struct btrfs_root *root, struct super_block *sb)
 {
int ret;
struct btrfs_subvolume_writers *writers;
@@ -1487,7 +1488,8 @@ int btrfs_init_fs_root(struct btrfs_root *root)
spin_lock_init(>ino_cache_lock);
init_waitqueue_head(>ino_cache_wait);
 
-   ret = get_anon_bdev(>anon_dev);
+   root->view.v_sb = sb;
+   ret = get_anon_bdev(>view.v_dev);
if (ret)
goto fail;
 
@@ -1587,7 +1589,7 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info 
*fs_info,
goto fail;
}
 
-   ret = btrfs_init_fs_root(root);
+   ret = btrfs_init_fs_root(root, fs_info->sb);
if (ret)
goto fail;
 
@@ -3603,8 +3605,8 @@ static void free_fs_root(struct btrfs_root *root)
WARN_ON(!RB_EMPTY_ROOT(>inode_tree));
btrfs_free_block_rsv(root->fs_info, root->orphan_block_rsv);
root->orphan_block_rsv = NULL;
-   if (root->anon_dev)
-   free_anon_bdev(root->anon_dev);
+   if (root->view.v_dev)
+   free_anon_bdev(root->view.v_dev);
if (root->subv_writers)
btrfs_free_subvolume_writers(root->subv_writers);
free_extent_buffer(root->node);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 301151a50ac1..af60e7d76449 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -72,7 +72,7 @@ int btrfs_read_dev_one_super(struct block_device *bdev, int 
copy_num,
 int btrfs_commit_super(struct btrfs_fs_info *fs_info);
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
  struct btrfs_key *location);
-int btrfs_init_fs_root(struct btrfs_root *root);
+int btrfs_init_fs_root(struct btrfs_root *root, struct super_block *sb);
 struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
u64 root_id);
 int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1d4a28a4763a..17e93fb6d871 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5705,6 +5705,7 @@ static int btrfs_init_locked_inode(struct inode *inode, 
void *p)
memcpy(_I(inode)->location, args->location,
   sizeof(*args->location));
BTRFS_I(inode)->root = args->root;
+   inode->i_view = >root->view;
return 0;
 }
 
@@ -6346,6 +6347,7 @@ static struct inode *btrfs_new_inode(struct 
btrfs_trans_handle *trans,
BTRFS_I(inode)->root = root;
BTRFS_I(inode)->generation = trans->transid;
inode->i_generation = BTRFS_I(inode)->generation;
+   inode->i_view = >view;
 
/*
 * We could have gotten an inode number from somebody who was fsynced
@@ -9528,8 +9530,7 @@ static int btrfs_getattr(const struct path *path, struct 
kstat *stat,
  

[PATCH 75/76] fs: Use fs view device from struct super_block

2018-05-08 Thread Mark Fasheh
We have some places which access s_dev directly from struct super_block.

Convert those to get v_dev from the default super block view.

Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 drivers/mtd/mtdsuper.c  |  2 +-
 drivers/staging/lustre/lustre/llite/llite_lib.c |  6 +-
 fs/autofs4/autofs_i.h   |  2 +-
 fs/autofs4/dev-ioctl.c  | 10 +--
 fs/ceph/super.c |  2 +-
 fs/cramfs/inode.c   |  4 +-
 fs/exofs/super.c|  2 +-
 fs/fuse/inode.c |  2 +-
 fs/gfs2/ops_fstype.c|  2 +-
 fs/gfs2/quota.c |  5 +-
 fs/gfs2/sys.c   |  3 +-
 fs/gfs2/trace_gfs2.h| 30 -
 fs/nfs/nfs4trace.h  |  2 +-
 fs/nfs/super.c  |  6 +-
 fs/nfsd/nfs3xdr.c   |  2 +-
 fs/nfsd/nfsfh.c |  6 +-
 fs/nilfs2/super.c   |  2 +-
 fs/ocfs2/journal.c  | 13 ++--
 fs/ocfs2/ocfs2_trace.h  |  4 +-
 fs/ocfs2/super.c|  4 +-
 fs/overlayfs/inode.c|  4 +-
 fs/overlayfs/readdir.c  |  4 +-
 fs/proc_namespace.c |  2 +-
 fs/quota/dquot.c|  2 +-
 fs/reiserfs/journal.c   |  6 +-
 fs/romfs/super.c|  6 +-
 fs/super.c  |  8 +--
 fs/xfs/scrub/trace.h| 26 
 fs/xfs/xfs_trace.h  | 84 -
 fs/xfs/xfs_trans_dquot.c|  2 +-
 include/trace/events/ext4.h | 36 +--
 include/trace/events/f2fs.h | 18 +++---
 include/trace/events/writeback.h|  2 +-
 init/do_mounts.c|  2 +-
 kernel/audit_watch.c|  2 +-
 net/unix/diag.c |  2 +-
 security/tomoyo/realpath.c  |  4 +-
 37 files changed, 162 insertions(+), 157 deletions(-)

diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c
index d58a61c09304..f5346de9b29d 100644
--- a/drivers/mtd/mtdsuper.c
+++ b/drivers/mtd/mtdsuper.c
@@ -51,7 +51,7 @@ static int get_sb_mtd_set(struct super_block *sb, void *_mtd)
struct mtd_info *mtd = _mtd;
 
sb->s_mtd = mtd;
-   sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, mtd->index);
+   sb->s_view.v_dev = MKDEV(MTD_BLOCK_MAJOR, mtd->index);
sb->s_bdi = bdi_get(mtd_bdi);
 
return 0;
diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c 
b/drivers/staging/lustre/lustre/llite/llite_lib.c
index 6f6df27635d4..ffa6e7d92080 100644
--- a/drivers/staging/lustre/lustre/llite/llite_lib.c
+++ b/drivers/staging/lustre/lustre/llite/llite_lib.c
@@ -524,7 +524,7 @@ static int client_common_fill_super(struct super_block *sb, 
char *md, char *dt)
goto out_lock_cn_cb;
}
 
-   sbi->ll_sdev_orig = sb->s_dev;
+   sbi->ll_sdev_orig = sb->s_view.v_dev;
 
/* We set sb->s_dev equal on all lustre clients in order to support
 * NFS export clustering.  NFSD requires that the FSID be the same
@@ -535,7 +535,7 @@ static int client_common_fill_super(struct super_block *sb, 
char *md, char *dt)
 */
uuid = obd_get_uuid(sbi->ll_md_exp);
if (uuid) {
-   sb->s_dev = get_uuid2int(uuid->uuid, strlen(uuid->uuid));
+   sb->s_view.v_dev = get_uuid2int(uuid->uuid, strlen(uuid->uuid));
get_uuid2fsid(uuid->uuid, strlen(uuid->uuid), >ll_fsid);
}
 
@@ -670,7 +670,7 @@ void ll_kill_super(struct super_block *sb)
 * in put_super not affected real removing devices
 */
if (sbi) {
-   sb->s_dev = sbi->ll_sdev_orig;
+   sb->s_view.v_dev = sbi->ll_sdev_orig;
sbi->ll_umounting = 1;
 
/* wait running statahead threads to quit */
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 4737615f0eaa..31fcf15108eb 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -225,7 +225,7 @@ void autofs4_catatonic_mode(struct autofs_sb_info *);
 
 static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
 {
-   return new_encode_dev(sbi->sb->s_dev);
+   return new_encode_dev(sbi->sb->s_view.v_dev);
 }
 
 static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi)
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 6b28b01e5022..6d1f1bc5db06 100644
--- a/fs/autofs4/dev-

Re: Announcing btrfs-dedupe

2016-11-08 Thread Mark Fasheh
On Mon, Nov 7, 2016 at 6:17 PM, Darrick J. Wong  wrote:
> On Mon, Nov 07, 2016 at 09:54:09PM +0100, Adam Borowski wrote:
>> Mark has already included XFS in documentation of duperemove, all that looks
>> amiss is btrfs-extent-same having an obsolete name.  But then, I never did
>> any non-superficial tests on XFS, beyond "seems to work".

I'd actually be ok dropping btrfs-extent-same completely at this point
but I'm concerned that it would leave some users behind.


> /me wonders if ocfs2 will ever catch up to the reflink/dedupe party. ;)

Hey, Ocfs2 started the reflink party! But yeah it's fallen behind
since then with respect to cow and dedupe. More importantly though I'd
like to see some extra extent tracking in there like XFS did with the
reflink b+tree.
   --Mark

-- 
"When the going gets weird, the weird turn pro."
Hunter S. Thompson
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Announcing btrfs-dedupe

2016-11-08 Thread Mark Fasheh
On Mon, Nov 7, 2016 at 6:40 PM, Christoph Anton Mitterer
 wrote:
> On Mon, 2016-11-07 at 15:02 +0100, David Sterba wrote:
>> I think adding a whole-file dedup mode to duperemove would be better
>> (from user's POV) than writing a whole new tool
>
> What would IMO be really good from a user's POV was, if one of the
> tools, deemed to be the "best", would be added to the btrfs-progs and
> simply become "the official" one.

Yeah, there are two problems, one being that the extent-same ioctl (and
duperemove) is cross-filesystem now. The other one James touches
on, which is that there's a non-trivial amount of complexity in
duperemove, so shoving it into btrfs-progs just means we're going to have
parallel development streams solving some different problems.

That's not to say that every dedupe tool has to be complex - we have
xfs_io to run the ioctl and I don't think it'd be a bad idea if
btrfs-progs had a simple interface to it too.
   --Mark



-- 
"When the going gets weird, the weird turn pro."
Hunter S. Thompson
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Announcing btrfs-dedupe

2016-11-07 Thread Mark Fasheh
Hi James,

Re the following text on your project page:

"IMPORTANT CAVEAT — I have read that there are race and/or error
conditions which can cause filesystem corruption in the kernel
implementation of the deduplication ioctl."

Can you expound on that? I'm not aware of any bugs right now but if
there is any it'd absolutely be worth having that info on the btrfs
list.

Thanks,
--Mark


On Sun, Nov 6, 2016 at 7:30 AM, James Pharaoh
 wrote:
> Hi all,
>
> I'm pleased to announce my btrfs deduplication utility, written in Rust.
> This operates on whole files, is fast, and I believe complements the
> existing utilities (duperemove, bedup), which exist currently.
>
> Please visit the homepage for more information:
>
> http://btrfs-dedupe.com
>
> James Pharaoh
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Announcing btrfs-dedupe

2016-11-07 Thread Mark Fasheh
Hi David and James,

On Mon, Nov 7, 2016 at 6:02 AM, David Sterba  wrote:
> On Sun, Nov 06, 2016 at 02:30:52PM +0100, James Pharaoh wrote:
>> I'm pleased to announce my btrfs deduplication utility, written in Rust.
>> This operates on whole files, is fast, and I believe complements the
>> existing utilities (duperemove, bedup), which exist currently.
>
> Mark can correct me if I'm wrong, but AFAIK, duperemove can consume
> output of fdupes, which does the whole file scanning for duplicates. And
> I think adding a whole-file dedup mode to duperemove would be better
> (from user's POV) than writing a whole new tool, eg. because of existing
> availability of duperemove in the distros.

Yeah you are correct - fdupes -r /foo | duperemove --fdupes  will get
you the same effect.

There's been a request for us to do all of that internally so that the
whole file dedupe works with the mtime checking code. This is entirely
doable. I would probably either add a field to the files table or add
a new table to hold whole-file hashes. We can then squeeze down our
existing block hashes into one big one or just rehash the whole file.


> Also looking to your roadmap, some of the items are implemented in
> duperemove: database of existing csums, cross filesystem boundary,
> mtime-based speedups).

Yeah, rescanning based on mtime was a huge speedup for Duperemove as
was keeping checksums in a db. We do all this today, also on XFS with
the dedupe ioctl (I believe this should be out with Linux-4.9).

Btw, there's lots of little details and bug fixes which I feel add up
to a relatively complete (though far from perfect!) tool. For example,
the dedupe code can handle multiple kernel versions including old
kernels which couldn't dedupe on non aligned block boundaries. Every
major step in duperemove is threaded at this point too which has also
been an enormous performance increase (which new features benefit
from).

Thanks,
--Mark

-- 
"When the going gets weird, the weird turn pro."
Hunter S. Thompson
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v13 00/15] Btrfs In-band De-duplication

2016-09-09 Thread Mark Fasheh
On Thu, Sep 08, 2016 at 03:12:49PM +0800, Qu Wenruo wrote:
> This patchset can be fetched from github:
> https://github.com/adam900710/linux.git wang_dedupe_20160907
> 
> This version is just another small update, rebased to David's
> for-next-20160906 branch.
> 
> This updates only includes one small fix, which is exposed by recent
> commits which checks space_info->bytes_may_use at umount time.
> Caused by that we only free quota reserved space at hash hit, but doesn't
> free space_info->bytes_may_use.
> 
> Other rebase changes are all related to recent infrastructure change,
> like io_tree and quota flags change.
> 
> We ran xfstests with dedupe enabled.

Is there an xfstests patch for this I can look at? We want to be able to run
and reproduce the same tests as you.

Also where are the disk portion patches or did I miss them somehow?
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] vfs: allow FILE_EXTENT_SAME (dedupe_file_range) on a file opened ro

2016-07-18 Thread Mark Fasheh
On Mon, Jul 18, 2016 at 12:13:38AM +0200, Adam Borowski wrote:
> Instead of checking the mode of the file descriptor, let's check whether it
> could have been opened rw.  This allows fixing intermittent exec failures
> when deduping a live system: anyone trying to exec a file currently being
> deduped gets ETXTBSY.
> 
> Issuing this ioctl on a ro file was already allowed for root/cap.
> 
> Tested on btrfs and not-yet-merged xfs, as only them implement this ioctl.
> 
> Signed-off-by: Adam Borowski <kilob...@angband.pl>

Reviewed-by: Mark Fasheh <mfas...@suse.de>
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 0/4] Btrfs in-band de-duplication test cases

2016-06-30 Thread Mark Fasheh
Hi Qu, you might want to CC fste...@vger.kernel.org on these patches.

Thanks,
--Mark

On Thu, Jun 30, 2016 at 05:24:35PM +0800, Qu Wenruo wrote:
> Btrfs in-band de-duplication test case for in-memory backend.
> 
> v5:
>   Due to kernel ioctl change, add FORCE flag for "dedupe enable" ioctl call.
> v4:
>   Due to kernel patchset re-organization, remove on-disk backend test cases
> v3:
>   Add new test cases for on-disk backend with metadata balance
> v2:
>   Add new test cases for on-disk backend with full balance
> 
> 
> Qu Wenruo (4):
>   fstests: rename _require_btrfs to _require_btrfs_subcommand
>   fstests: btrfs: Add basic test for btrfs in-band de-duplication
>   fstests: btrfs: Add testcase for btrfs dedupe and metadata balance
> race test
>   fstests: btrfs: Test inband dedupe with data balance.
> 
>  common/defrag   |  13 ++
>  common/rc   |   2 +-
>  tests/btrfs/004 |   2 +-
>  tests/btrfs/048 |   2 +-
>  tests/btrfs/059 |   2 +-
>  tests/btrfs/200 | 116 
> 
>  tests/btrfs/200.out |  22 ++
>  tests/btrfs/201 | 112 ++
>  tests/btrfs/201.out |   2 +
>  tests/btrfs/203 | 110 +
>  tests/btrfs/203.out |   3 ++
>  tests/btrfs/group   |   3 ++
>  12 files changed, 385 insertions(+), 4 deletions(-)
>  create mode 100755 tests/btrfs/200
>  create mode 100644 tests/btrfs/200.out
>  create mode 100755 tests/btrfs/201
>  create mode 100644 tests/btrfs/201.out
>  create mode 100755 tests/btrfs/203
>  create mode 100644 tests/btrfs/203.out
> 
> -- 
> 2.9.0
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] btrfs-progs: btrfsck: verify qgroups above level 0

2016-06-15 Thread Mark Fasheh
At the moment we only check subvolume quota groups (level 0). With this
patch we can check groups above 0, thus verifying the entire qgroup
hierarchy on a file system.  The accounting portion of this patch is modeled
after the kernel - we are essentially reimplementing the 'quota rescan' case
here. Most other sections of this code went unchanged, in particular the
root counting works independently of the accounting.

Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 qgroup-verify.c | 305 ++--
 1 file changed, 251 insertions(+), 54 deletions(-)

diff --git a/qgroup-verify.c b/qgroup-verify.c
index 7b78504..4d2ea66 100644
--- a/qgroup-verify.c
+++ b/qgroup-verify.c
@@ -35,7 +35,8 @@
 /*#define QGROUP_VERIFY_DEBUG*/
 static unsigned long tot_extents_scanned = 0;
 
-static void add_bytes(u64 root_objectid, u64 num_bytes, int exclusive);
+struct qgroup_count;
+static struct qgroup_count *find_count(u64 qgroupid);
 
 struct qgroup_info {
u64 referenced;
@@ -54,6 +55,14 @@ struct qgroup_count {
struct qgroup_info info;
 
struct rb_node rb_node;
+
+   struct list_head groups;  /* parents when we are a child group */
+   struct list_head members; /* children when we are a parent
+group (not currently used but
+maintained to mirror kernel
+handling of qgroups) */
+
+   u64 cur_refcnt;
 };
 
 static struct counts_tree {
@@ -66,6 +75,39 @@ static struct counts_tree {
 static struct rb_root by_bytenr = RB_ROOT;
 
 /*
+ * glue structure to represent the relations between qgroups. Mirrored
+ * from kernel.
+ */
+struct btrfs_qgroup_list {
+   struct list_head next_group;
+   struct list_head next_member;
+   struct qgroup_count *group; /* Parent group */
+   struct qgroup_count *member;
+};
+
+/* allow us to reset ref counts during accounting without zeroing each group */
+static u64 qgroup_seq = 1ULL;
+
+static inline void update_cur_refcnt(struct qgroup_count *c)
+{
+   if (c->cur_refcnt < qgroup_seq)
+   c->cur_refcnt = qgroup_seq;
+   c->cur_refcnt += 1;
+}
+
+static inline u64 group_get_cur_refcnt(struct qgroup_count *c)
+{
+   if (c->cur_refcnt < qgroup_seq)
+   return 0;
+   return c->cur_refcnt - qgroup_seq;
+}
+
+static void inc_qgroup_seq(int root_count)
+{
+   qgroup_seq += root_count + 1;
+}
+
+/*
  * List of interior tree blocks. We walk this list after loading the
  * extent tree to resolve implied refs. For each interior node we'll
  * place a shared ref in the ref tree against each child object. This
@@ -296,9 +338,10 @@ static void find_parent_roots(struct ulist *roots, u64 
parent)
}
 
do {
-   if (ref->root)
-   ulist_add(roots, ref->root, 0, 0);
-   else
+   if (ref->root) {
+   if (is_fstree(ref->root))
+   ulist_add(roots, ref->root, 0, 0);
+   } else
find_parent_roots(roots, ref->parent);
 
node = rb_next(node);
@@ -307,6 +350,116 @@ static void find_parent_roots(struct ulist *roots, u64 
parent)
} while (node && ref->bytenr == parent);
 }
 
+static int account_one_extent(struct ulist *roots, u64 bytenr, u64 num_bytes)
+{
+   int ret;
+   u64 id, nr_roots, nr_refs;
+   struct qgroup_count *count;
+   struct ulist *counts = ulist_alloc(0);
+   struct ulist *tmp = ulist_alloc(0);
+   struct ulist_iterator uiter;
+   struct ulist_iterator tmp_uiter;
+   struct ulist_node *unode;
+   struct ulist_node *tmp_unode;
+   struct btrfs_qgroup_list *glist;
+
+   if (!counts || !tmp) {
+   ulist_free(counts);
+   ulist_free(tmp);
+   return ENOMEM;
+   }
+
+   ULIST_ITER_INIT();
+   while ((unode = ulist_next(roots, ))) {
+   BUG_ON(unode->val == 0ULL);
+
+   /*
+* For each root, find their corresponding tracking group and
+* add it to our qgroups list.
+*/
+   count = find_count(unode->val);
+   if (!count)
+   continue;
+
+   BUG_ON(!is_fstree(unode->val));
+   ret = ulist_add(counts, count->qgroupid, ptr_to_u64(count), 0);
+   if (ret < 0)
+   goto out;
+
+   /*
+* Now we look for parents (and parents of
+* those...). Use a tmp ulist here to avoid re-walking
+* (and re-incrementing) our already added items on every
+* loop iteration.
+*/
+   ulist_reinit(tmp);
+   ret = ulist_add(tmp, count->qgroupid, ptr_to_u64

[PATCH 0/2] btrfs-progs: qgroup verification update

2016-06-15 Thread Mark Fasheh
Hi David,

The following two patches update the qgroup verification code in btrfsck to
understand entire qgroup hierarchies.  The first patch is a simple bugfix
for some leaked objects and can be taken separately if you like.  The 2nd
patch implements the actual verification update.

If you prefer to pull via git there is a branch at:

https://github.com/markfasheh/btrfs-progs-patches qgroup-hierarchy

Review and comments are welcome.

Thanks,
--Mark
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3] btrfs: fix check_shared for fiemap ioctl

2016-06-08 Thread Mark Fasheh
On Wed, Jun 08, 2016 at 01:13:03PM +0800, Lu Fengqi wrote:
> Only in the case of different root_id or different object_id did check_shared
> identify the extent as shared. However, if an extent was referred to by
> different offsets of the same file, it should also be identified as shared.
> In addition, check_shared's loop scale is at least n^3, so if an extent
> has too many references, it can even cause a soft hang-up.
> 
> First, add all delayed_ref to the ref_tree and calculate the unique_refs,
> if the unique_refs is greater than one, return BACKREF_FOUND_SHARED.
> Then individually add the on-disk reference (inline/keyed) to the ref_tree
> and calculate the unique_refs of the ref_tree to check if the unique_refs
> is greater than one. Because SHARED is returned once there are two
> references, the time complexity is close to constant.
> 
> Reported-by: Tsutomu Itoh <t-i...@jp.fujitsu.com>
> Signed-off-by: Lu Fengqi <lufq.f...@cn.fujitsu.com>
> ---
> The caller is fiemap that called from an ioctl. Because it isn't on a
> writeout path, so we temporarily use GFP_KERNEL in ref_root_alloc() and
> ref_tree_add(). If we use ref_tree replace the existing backref structure
> later, we can consider whether to use GFP_NOFS again.

NACK.

You don't need to be on a writeout path to deadlock, you simply need to be
holding locks that the writeout path takes when you allocate. If the
allocator does writeout to free memory then you deadlock. Fiemap is locking 
down extents which may also get locked down when you allocate within those  
locks. See my e-mail here for details,

http://www.spinics.net/lists/linux-btrfs/msg55789.html
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v10 09/21] btrfs: dedupe: Inband in-memory only de-duplication implement

2016-06-07 Thread Mark Fasheh
On Tue, Jun 07, 2016 at 08:42:46AM +0800, Qu Wenruo wrote:
> 
> 
> At 06/07/2016 03:54 AM, Mark Fasheh wrote:
> >On Sat, Jun 04, 2016 at 06:26:39PM +0800, Qu Wenruo wrote:
> >>
> >>
> >>On 06/03/2016 10:27 PM, Josef Bacik wrote:
> >>>On 06/01/2016 09:12 PM, Qu Wenruo wrote:
> >>>>
> >>>>
> >>>>At 06/02/2016 06:08 AM, Mark Fasheh wrote:
> >>>>>On Fri, Apr 01, 2016 at 02:35:00PM +0800, Qu Wenruo wrote:
> >>>>>>Core implement for inband de-duplication.
> >>>>>>It reuse the async_cow_start() facility to do the calculate dedupe
> >>>>>>hash.
> >>>>>>And use dedupe hash to do inband de-duplication at extent level.
> >>>>>>
> >>>>>>The work flow is as below:
> >>>>>>1) Run delalloc range for an inode
> >>>>>>2) Calculate hash for the delalloc range at the unit of dedupe_bs
> >>>>>>3) For hash match(duplicated) case, just increase source extent ref
> >>>>>>  and insert file extent.
> >>>>>>  For hash mismatch case, go through the normal cow_file_range()
> >>>>>>  fallback, and add hash into dedupe_tree.
> >>>>>>  Compress for hash miss case is not supported yet.
> >>>>>>
> >>>>>>Current implement restore all dedupe hash in memory rb-tree, with LRU
> >>>>>>behavior to control the limit.
> >>>>>>
> >>>>>>Signed-off-by: Wang Xiaoguang <wangxg.f...@cn.fujitsu.com>
> >>>>>>Signed-off-by: Qu Wenruo <quwen...@cn.fujitsu.com>
> >>>>>>---
> >>>>>>fs/btrfs/extent-tree.c |  18 
> >>>>>>fs/btrfs/inode.c   | 235
> >>>>>>++---
> >>>>>>fs/btrfs/relocation.c  |  16 
> >>>>>>3 files changed, 236 insertions(+), 33 deletions(-)
> >>>>>>
> >>>>>>diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> >>>>>>index 53e1297..dabd721 100644
> >>>>>>--- a/fs/btrfs/extent-tree.c
> >>>>>>+++ b/fs/btrfs/extent-tree.c
> >>>>>>@@ -37,6 +37,7 @@
> >>>>>>#include "math.h"
> >>>>>>#include "sysfs.h"
> >>>>>>#include "qgroup.h"
> >>>>>>+#include "dedupe.h"
> >>>>>>
> >>>>>>#undef SCRAMBLE_DELAYED_REFS
> >>>>>>
> >>>>>>@@ -2399,6 +2400,8 @@ static int run_one_delayed_ref(struct
> >>>>>>btrfs_trans_handle *trans,
> >>>>>>
> >>>>>>if (btrfs_delayed_ref_is_head(node)) {
> >>>>>>struct btrfs_delayed_ref_head *head;
> >>>>>>+struct btrfs_fs_info *fs_info = root->fs_info;
> >>>>>>+
> >>>>>>/*
> >>>>>> * we've hit the end of the chain and we were supposed
> >>>>>> * to insert this extent into the tree.  But, it got
> >>>>>>@@ -2413,6 +2416,15 @@ static int run_one_delayed_ref(struct
> >>>>>>btrfs_trans_handle *trans,
> >>>>>>btrfs_pin_extent(root, node->bytenr,
> >>>>>> node->num_bytes, 1);
> >>>>>>if (head->is_data) {
> >>>>>>+/*
> >>>>>>+ * If insert_reserved is given, it means
> >>>>>>+ * a new extent is reserved, then deleted
> >>>>>>+ * in one trans, and inc/dec get merged to 0.
> >>>>>>+ *
> >>>>>>+ * In this case, we need to remove its dedup
> >>>>>>+ * hash.
> >>>>>>+ */
> >>>>>>+btrfs_dedupe_del(trans, fs_info, node->bytenr);
> >>>>>>ret = btrfs_del_csums(trans, root,
> >>>>>>  node->bytenr,
> >>>>>>  node->num_bytes);
> >>>>>>@@ -6713,6 +6725,12 @@ static int __btrfs_free_extent(struct
> >>>>>>btrfs_trans_handle *trans,
&

Re: [PATCH v10 09/21] btrfs: dedupe: Inband in-memory only de-duplication implement

2016-06-06 Thread Mark Fasheh
On Sat, Jun 04, 2016 at 06:26:39PM +0800, Qu Wenruo wrote:
> 
> 
> On 06/03/2016 10:27 PM, Josef Bacik wrote:
> >On 06/01/2016 09:12 PM, Qu Wenruo wrote:
> >>
> >>
> >>At 06/02/2016 06:08 AM, Mark Fasheh wrote:
> >>>On Fri, Apr 01, 2016 at 02:35:00PM +0800, Qu Wenruo wrote:
> >>>>Core implement for inband de-duplication.
> >>>>It reuse the async_cow_start() facility to do the calculate dedupe
> >>>>hash.
> >>>>And use dedupe hash to do inband de-duplication at extent level.
> >>>>
> >>>>The work flow is as below:
> >>>>1) Run delalloc range for an inode
> >>>>2) Calculate hash for the delalloc range at the unit of dedupe_bs
> >>>>3) For hash match(duplicated) case, just increase source extent ref
> >>>>   and insert file extent.
> >>>>   For hash mismatch case, go through the normal cow_file_range()
> >>>>   fallback, and add hash into dedupe_tree.
> >>>>   Compress for hash miss case is not supported yet.
> >>>>
> >>>>Current implement restore all dedupe hash in memory rb-tree, with LRU
> >>>>behavior to control the limit.
> >>>>
> >>>>Signed-off-by: Wang Xiaoguang <wangxg.f...@cn.fujitsu.com>
> >>>>Signed-off-by: Qu Wenruo <quwen...@cn.fujitsu.com>
> >>>>---
> >>>> fs/btrfs/extent-tree.c |  18 
> >>>> fs/btrfs/inode.c   | 235
> >>>>++---
> >>>> fs/btrfs/relocation.c  |  16 
> >>>> 3 files changed, 236 insertions(+), 33 deletions(-)
> >>>>
> >>>>diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> >>>>index 53e1297..dabd721 100644
> >>>>--- a/fs/btrfs/extent-tree.c
> >>>>+++ b/fs/btrfs/extent-tree.c
> >>>>@@ -37,6 +37,7 @@
> >>>> #include "math.h"
> >>>> #include "sysfs.h"
> >>>> #include "qgroup.h"
> >>>>+#include "dedupe.h"
> >>>>
> >>>> #undef SCRAMBLE_DELAYED_REFS
> >>>>
> >>>>@@ -2399,6 +2400,8 @@ static int run_one_delayed_ref(struct
> >>>>btrfs_trans_handle *trans,
> >>>>
> >>>> if (btrfs_delayed_ref_is_head(node)) {
> >>>> struct btrfs_delayed_ref_head *head;
> >>>>+struct btrfs_fs_info *fs_info = root->fs_info;
> >>>>+
> >>>> /*
> >>>>  * we've hit the end of the chain and we were supposed
> >>>>  * to insert this extent into the tree.  But, it got
> >>>>@@ -2413,6 +2416,15 @@ static int run_one_delayed_ref(struct
> >>>>btrfs_trans_handle *trans,
> >>>> btrfs_pin_extent(root, node->bytenr,
> >>>>  node->num_bytes, 1);
> >>>> if (head->is_data) {
> >>>>+/*
> >>>>+ * If insert_reserved is given, it means
> >>>>+ * a new extent is reserved, then deleted
> >>>>+ * in one trans, and inc/dec get merged to 0.
> >>>>+ *
> >>>>+ * In this case, we need to remove its dedup
> >>>>+ * hash.
> >>>>+ */
> >>>>+btrfs_dedupe_del(trans, fs_info, node->bytenr);
> >>>> ret = btrfs_del_csums(trans, root,
> >>>>   node->bytenr,
> >>>>   node->num_bytes);
> >>>>@@ -6713,6 +6725,12 @@ static int __btrfs_free_extent(struct
> >>>>btrfs_trans_handle *trans,
> >>>> btrfs_release_path(path);
> >>>>
> >>>> if (is_data) {
> >>>>+ret = btrfs_dedupe_del(trans, info, bytenr);
> >>>>+if (ret < 0) {
> >>>>+btrfs_abort_transaction(trans, extent_root,
> >>>>+ret);
> >>>
> >>>I don't see why an error here should lead to a readonly fs.
> >>>--Mark
> >>>
> >>
> >>Because such deletion error can lead to corruption.
> >>
> >>For example, extent A is already in hash pool.
> >>And wh

Re: [PATCH v2] btrfs: fix check_shared for fiemap ioctl

2016-06-02 Thread Mark Fasheh
On Thu, Jun 02, 2016 at 02:17:40PM -0700, Mark Fasheh wrote:
> On Thu, Jun 02, 2016 at 04:56:06PM -0400, Jeff Mahoney wrote:
> > On 6/2/16 3:08 PM, Mark Fasheh wrote:
> > > On Thu, Jun 02, 2016 at 07:07:32PM +0200, David Sterba wrote:
> > >> On Wed, Jun 01, 2016 at 02:15:22PM -0700, Mark Fasheh wrote:
> > >>>> +/* dynamically allocate and initialize a ref_root */
> > >>>> +static struct ref_root *ref_root_alloc(void)
> > >>>> +{
> > >>>> +  struct ref_root *ref_tree;
> > >>>> +
> > >>>> +  ref_tree = kmalloc(sizeof(*ref_tree), GFP_KERNEL);
> > >>>
> > >>> I'm pretty sure we want GFP_NOFS here.
> > >>
> > >> Then please explain to me why/where the reasoning below is wrong:
> > > 
> > > The general reasoning of when to use GFP_NOFS below is fine, I don't
> > > disagree with that at all. If there is no way a recursion back into btrfs
> > > can happen at that allocation site then we can use GFP_KERNEL.
> > > 
> > > That said, have you closely audited this path? Does the allocation happen
> > > completely outside any locks that might be shared with the writeout path?
> > > What happens if we have to do writeout of the inode being fiemapped in 
> > > order
> > > to allocate space? If the answer to all my questions is "there is no way
> > > this can deadlock" then by all means, we should use GFP_KERNEL. Otherwise
> > > GFP_NOFS is a sensible guard against possible future deadlocks.
> > 
> > This is exactly the situation we discussed at LSF/MM this year.  The MM
> > folks are pushing back because the fs folks tend to use GFP_NOFS as a
> > talisman.  The audit needs to happen, otherwise that last sentence is
> > another talisman.
> 
> There's nothing here I disagree with. I'm not seeing a strong technical
> justification, which is what I want (being called from an ioctl means
> nothing in this case).

A small amount of searching shows me that extent_fiemap() does
lock_extent_bits() and writepage_delalloc() also calls lock_extent_bits()
(via find_lock_delalloc_range()).

I'm no expert on the extent locking but that seems pretty deadlocky to me.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] btrfs: fix check_shared for fiemap ioctl

2016-06-02 Thread Mark Fasheh
On Thu, Jun 02, 2016 at 04:56:06PM -0400, Jeff Mahoney wrote:
> On 6/2/16 3:08 PM, Mark Fasheh wrote:
> > On Thu, Jun 02, 2016 at 07:07:32PM +0200, David Sterba wrote:
> >> On Wed, Jun 01, 2016 at 02:15:22PM -0700, Mark Fasheh wrote:
> >>>> +/* dynamically allocate and initialize a ref_root */
> >>>> +static struct ref_root *ref_root_alloc(void)
> >>>> +{
> >>>> +struct ref_root *ref_tree;
> >>>> +
> >>>> +ref_tree = kmalloc(sizeof(*ref_tree), GFP_KERNEL);
> >>>
> >>> I'm pretty sure we want GFP_NOFS here.
> >>
> >> Then please explain to me why/where the reasoning below is wrong:
> > 
> > The general reasoning of when to use GFP_NOFS below is fine, I don't
> > disagree with that at all. If there is no way a recursion back into btrfs
> > can happen at that allocation site then we can use GFP_KERNEL.
> > 
> > That said, have you closely audited this path? Does the allocation happen
> > completely outside any locks that might be shared with the writeout path?
> > What happens if we have to do writeout of the inode being fiemapped in order
> > to allocate space? If the answer to all my questions is "there is no way
> > this can deadlock" then by all means, we should use GFP_KERNEL. Otherwise
> > GFP_NOFS is a sensible guard against possible future deadlocks.
> 
> This is exactly the situation we discussed at LSF/MM this year.  The MM
> folks are pushing back because the fs folks tend to use GFP_NOFS as a
> talisman.  The audit needs to happen, otherwise that last sentence is
> another talisman.

There's nothing here I disagree with. I'm not seeing a strong technical
justification, which is what I want (being called from an ioctl means
nothing in this case).
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] btrfs: fix check_shared for fiemap ioctl

2016-06-02 Thread Mark Fasheh
On Thu, Jun 02, 2016 at 01:46:27PM +0800, Lu Fengqi wrote:
> 
> At 06/02/2016 05:15 AM, Mark Fasheh wrote:
> >Thanks for trying to fix this problem, comments below.
> >
> >On Wed, Jun 01, 2016 at 01:48:05PM +0800, Lu Fengqi wrote:
> >>Only in the case of different root_id or different object_id did check_shared
> >>identify the extent as shared. However, if an extent was referred to by
> >>different offsets of the same file, it should also be identified as shared.
> >>In addition, check_shared's loop scale is at least n^3, so if an extent
> >>has too many references, it can even cause a soft hang-up.
> >>
> >>First, add all delayed_ref to the ref_tree and calculate the unique_refs,
> >>if the unique_refs is greater than one, return BACKREF_FOUND_SHARED.
> >>Then individually add the on-disk reference (inline/keyed) to the ref_tree
> >>and calculate the unique_refs of the ref_tree to check if the unique_refs
> >>is greater than one. Because SHARED is returned once there are two
> >>references, the time complexity is close to constant.
> >Constant time in the best case, but still n^3 in the worst case right? I'm
> >not complaining btw, I just want to be sure we're not over promising  :)
> Only in the case of a large number of delayed_refs will the worst-case time
> complexity be n^2*log n. Otherwise, it will be constant even if
> there are many on-disk references.

Ahh ok so it's driven more by the # of delayed refs. That makes sense,
thanks.


> >>@@ -34,6 +35,253 @@ struct extent_inode_elem {
> >>struct extent_inode_elem *next;
> >>  };
> >>
> >>+/*
> >>+ * ref_root is used as the root of the ref tree that hold a collection
> >>+ * of unique references.
> >>+ */
> >>+struct ref_root {
> >>+   struct rb_root rb_root;
> >>+
> >>+   /*
> >>+* the unique_refs represents the number of ref_nodes with a positive
> >>+* count stored in the tree. Even if a ref_node(the count is greater
> >>+* than one) is added, the unique_refs will only increase one.
> >>+*/
> >>+   unsigned int unique_refs;
> >>+};
> >>+
> >>+/* ref_node is used to store a unique reference to the ref tree. */
> >>+struct ref_node {
> >>+   struct rb_node rb_node;
> >>+
> >>+   /* for NORMAL_REF, otherwise all these fields should be set to 0 */
> >>+   u64 root_id;
> >>+   u64 object_id;
> >>+   u64 offset;
> >>+
> >>+   /* for SHARED_REF, otherwise parent field should be set to 0 */
> >>+   u64 parent;
> >>+
> >>+   /* ref to the ref_mod of btrfs_delayed_ref_node(delayed-ref.h) */
> >>+   int ref_mod;
> >>+};
> >Why are we mirroring so much of the backref structures here? It seems like
> >we're just throwing layers on top of layers. Can't we modify the backref
> >structures and code to handle whatever small amount of unique accounting you
> >must do?
> The original structure(struct __prelim_ref) store reference in list,
> and I have to perform many search operations that not suitable for
> list. However, if I modify the original structure, it would require
> a lot of rework. So I just want to fix fiemap with this patch. If
> necessary, we can use this structure to replace the original
> structure later.

Well there's room for an rb_node on that structure so we can solve the 'it
only uses a list' problem trivially. I definitely understand your reluctance
to modify the backref code, but to me that just sounds like we need someone
who is familiar with that code to review your work and provide advice when
needed.

Otherwise, I believe my point holds. If there's some technical reason why
this is a bad idea, that's a different story. So far though this just seems
like a situation where we need some extra review from the primary
developers. I cc'd Josef in the hopes he could shed some light for us.


> >>+/* dynamically allocate and initialize a ref_root */
> >>+static struct ref_root *ref_root_alloc(void)
> >>+{
> >>+   struct ref_root *ref_tree;
> >>+
> >>+   ref_tree = kmalloc(sizeof(*ref_tree), GFP_KERNEL);
> >I'm pretty sure we want GFP_NOFS here.
> Yes, perhaps you're right.
> >Because there's no need to narrow the allocation constraints. GFP_NOFS
> >is necessary when the caller is on a critical path that must not recurse
> >back to the filesystem through the allocation (ie. if the allocator
> >decides to free some memory and tries tro write dirty data). FIEMAP is
> >called from an ioctl.
> But David seems to have a different point of view with you, so I
> would like to ask for his advice again.

Sounds good, hopefully David and I can figure it out  :)

Thanks again Lu,
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] btrfs: fix check_shared for fiemap ioctl

2016-06-02 Thread Mark Fasheh
On Thu, Jun 02, 2016 at 07:07:32PM +0200, David Sterba wrote:
> On Wed, Jun 01, 2016 at 02:15:22PM -0700, Mark Fasheh wrote:
> > > +/* dynamically allocate and initialize a ref_root */
> > > +static struct ref_root *ref_root_alloc(void)
> > > +{
> > > + struct ref_root *ref_tree;
> > > +
> > > + ref_tree = kmalloc(sizeof(*ref_tree), GFP_KERNEL);
> > 
> > I'm pretty sure we want GFP_NOFS here.
> 
> Then please explain to me why/where the reasoning below is wrong:

The general reasoning of when to use GFP_NOFS below is fine, I don't
disagree with that at all. If there is no way a recursion back into btrfs
can happen at that allocation site then we can use GFP_KERNEL.

That said, have you closely audited this path? Does the allocation happen
completely outside any locks that might be shared with the writeout path?
What happens if we have to do writeout of the inode being fiemapped in order
to allocate space? If the answer to all my questions is "there is no way
this can deadlock" then by all means, we should use GFP_KERNEL. Otherwise
GFP_NOFS is a sensible guard against possible future deadlocks.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v10 10/21] btrfs: try more times to alloc metadata reserve space

2016-06-01 Thread Mark Fasheh
On Tue, May 17, 2016 at 03:20:16PM +0200, David Sterba wrote:
> On Fri, Apr 01, 2016 at 02:35:01PM +0800, Qu Wenruo wrote:
> > @@ -5815,6 +5817,23 @@ out_fail:
> > }
> > if (delalloc_lock)
> > mutex_unlock(_I(inode)->delalloc_mutex);
> > +   /*
> > +* The number of metadata bytes is calculated by the difference
> > +* between outstanding_extents and reserved_extents. Sometimes though
> > +* reserve_metadata_bytes() fails to reserve the wanted metadata bytes,
> > +* indeed it has already done some work to reclaim metadata space, hence
> > +* both outstanding_extents and reserved_extents would have changed and
> > +* the bytes we try to reserve would also has changed(may be smaller).
> > +* So here we try to reserve again. This is much useful for online
> > +* dedupe, which will easily eat almost all meta space.
> > +*
> > +* XXX: Indeed here 3 is arbitrarily choosed, it's a good workaround for
> > +* online dedupe, later we should find a better method to avoid dedupe
> > +* enospc issue.
> > +*/
> > +   if (unlikely(ret == -ENOSPC && loops++ < 3))
> > +   goto again;
> > +
> 
> This does not seem right and needs to be addressed properly before I
> consider adding the patchset to for-next. I don't have idea how to fix
> it.

Agreed, and this sort of issue is a reason why I strongly feel we don't want
to merge this series piecemeal until we know that after everything is
complete, we can end up with a fully baked in-band dedupe implementation.

Luckily Qu says he's on it so if he posts a workable fix here my whole point
can become moot. Until then though this is exactly the type of 'fix later'
coding we need to be avoiding.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v10 09/21] btrfs: dedupe: Inband in-memory only de-duplication implement

2016-06-01 Thread Mark Fasheh
On Fri, Apr 01, 2016 at 02:35:00PM +0800, Qu Wenruo wrote:
> Core implement for inband de-duplication.
> It reuse the async_cow_start() facility to do the calculate dedupe hash.
> And use dedupe hash to do inband de-duplication at extent level.
> 
> The work flow is as below:
> 1) Run delalloc range for an inode
> 2) Calculate hash for the delalloc range at the unit of dedupe_bs
> 3) For hash match(duplicated) case, just increase source extent ref
>and insert file extent.
>For hash mismatch case, go through the normal cow_file_range()
>fallback, and add hash into dedupe_tree.
>Compress for hash miss case is not supported yet.
> 
> Current implement restore all dedupe hash in memory rb-tree, with LRU
> behavior to control the limit.
> 
> Signed-off-by: Wang Xiaoguang <wangxg.f...@cn.fujitsu.com>
> Signed-off-by: Qu Wenruo <quwen...@cn.fujitsu.com>
> ---
>  fs/btrfs/extent-tree.c |  18 
>  fs/btrfs/inode.c   | 235 
> ++---
>  fs/btrfs/relocation.c  |  16 
>  3 files changed, 236 insertions(+), 33 deletions(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 53e1297..dabd721 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -37,6 +37,7 @@
>  #include "math.h"
>  #include "sysfs.h"
>  #include "qgroup.h"
> +#include "dedupe.h"
>  
>  #undef SCRAMBLE_DELAYED_REFS
>  
> @@ -2399,6 +2400,8 @@ static int run_one_delayed_ref(struct 
> btrfs_trans_handle *trans,
>  
>   if (btrfs_delayed_ref_is_head(node)) {
>   struct btrfs_delayed_ref_head *head;
> + struct btrfs_fs_info *fs_info = root->fs_info;
> +
>   /*
>* we've hit the end of the chain and we were supposed
>* to insert this extent into the tree.  But, it got
> @@ -2413,6 +2416,15 @@ static int run_one_delayed_ref(struct 
> btrfs_trans_handle *trans,
>   btrfs_pin_extent(root, node->bytenr,
>node->num_bytes, 1);
>   if (head->is_data) {
> + /*
> +  * If insert_reserved is given, it means
> +  * a new extent is revered, then deleted
> +  * in one tran, and inc/dec get merged to 0.
> +  *
> +  * In this case, we need to remove its dedup
> +  * hash.
> +  */
> + btrfs_dedupe_del(trans, fs_info, node->bytenr);
>   ret = btrfs_del_csums(trans, root,
> node->bytenr,
> node->num_bytes);
> @@ -6713,6 +6725,12 @@ static int __btrfs_free_extent(struct 
> btrfs_trans_handle *trans,
>   btrfs_release_path(path);
>  
>   if (is_data) {
> + ret = btrfs_dedupe_del(trans, info, bytenr);
> + if (ret < 0) {
> + btrfs_abort_transaction(trans, extent_root,
> + ret);

I don't see why an error here should lead to a readonly fs.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v10 08/21] btrfs: ordered-extent: Add support for dedupe

2016-06-01 Thread Mark Fasheh
On Fri, Apr 01, 2016 at 02:34:59PM +0800, Qu Wenruo wrote:
> From: Wang Xiaoguang <wangxg.f...@cn.fujitsu.com>
> 
> Add ordered-extent support for dedupe.
> 
> Note, current ordered-extent support only supports non-compressed source
> extent.
> Support for compressed source extent will be added later.
> 
> Signed-off-by: Qu Wenruo <quwen...@cn.fujitsu.com>
> Signed-off-by: Wang Xiaoguang <wangxg.f...@cn.fujitsu.com>
> ---
>  fs/btrfs/ordered-data.c | 44 
>  fs/btrfs/ordered-data.h | 13 +
>  2 files changed, 53 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
> index 0de7da5..ef24ad1 100644
> --- a/fs/btrfs/ordered-data.c
> +++ b/fs/btrfs/ordered-data.c
> @@ -26,6 +26,7 @@
>  #include "extent_io.h"
>  #include "disk-io.h"
>  #include "compression.h"
> +#include "dedupe.h"
>  
>  static struct kmem_cache *btrfs_ordered_extent_cache;
>  
> @@ -184,7 +185,8 @@ static inline struct rb_node *tree_search(struct 
> btrfs_ordered_inode_tree *tree,
>   */
>  static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
> u64 start, u64 len, u64 disk_len,
> -   int type, int dio, int compress_type)
> +   int type, int dio, int compress_type,
> +   struct btrfs_dedupe_hash *hash)
>  {
>   struct btrfs_root *root = BTRFS_I(inode)->root;
>   struct btrfs_ordered_inode_tree *tree;
> @@ -204,6 +206,31 @@ static int __btrfs_add_ordered_extent(struct inode 
> *inode, u64 file_offset,
>   entry->inode = igrab(inode);
>   entry->compress_type = compress_type;
>   entry->truncated_len = (u64)-1;
> + entry->hash = NULL;
> + /*
> +  * Hash hit must go through dedupe routine at all cost, even dedupe
> +  * is disabled. As its delayed ref is already increased.
> +  */

Initially, I had a hard time understanding this comment but I'm pretty sure
I know what you mean.

/*
 * A hash hit means we have already incremented the extents delayed ref.
 * We must handle this even if another process raced to turn off dedupe
 * otherwise we might leak a reference.
 */

might be better. Hope that helps.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 0/8] Inband dedupe for btrfs-progs

2016-06-01 Thread Mark Fasheh
On Fri, Apr 01, 2016 at 02:37:42PM +0800, Qu Wenruo wrote:
> No much change from previous version.
> 1) Rebased to latest devel branch
> 
> 2) Update ctree.h to follow kernel structure change
> 
> 3) Update print-tree to follow kernel structure change
> 
> Qu Wenruo (7):
>   btrfs-progs: Basic framework for dedupe command group
>   btrfs-progs: dedupe: Add enable command for dedupe command group
>   btrfs-progs: dedupe: Add disable support for inband dedupelication
>   btrfs-progs: dedupe: Add status subcommand
>   btrfs-progs: Add dedupe feature for mkfs and convert
>   btrfs-progs: Add show-super support for new DEDUPE flag
>   btrfs-progs: debug-tree: Add dedupe tree support
> 
> Wang Xiaoguang (1):
>   btrfs-progs: property: add a dedupe property
> 
>  Documentation/Makefile.in |   1 +
>  Documentation/btrfs-dedupe.asciidoc   | 150 
>  Documentation/btrfs-property.asciidoc |   2 +
>  Documentation/btrfs.asciidoc  |   4 +
>  Documentation/mkfs.btrfs.asciidoc |   9 +
>  Makefile.in   |   3 +-
>  btrfs-completion  |   6 +-
>  btrfs-convert.c   |  19 +-
>  btrfs.c   |   1 +
>  cmds-dedupe.c | 329 
> ++

Can we please have seperate and obvious namespaces for in-band dedupe and
out-of-band dedupe commands? I realize that there is no oob-dedupe
functionality in btrfs-progs today, but I would like to avoid confusing
users in the case that this code hits btrfs-progs.

Specifically by this, I mean I'd like to see anything except 'dedupe' as the
btrfs command, so a user who sees 'btrfs dedupe ...' does not confuse the
two forms we have.

I don't personally care what other name is used and of course it could have
'dedupe' in the name just not solely 'dedupe'. As a poor example, we could
call it 'btrfs inband-dedupe ...'.

Thanks,
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] btrfs: fix check_shared for fiemap ioctl

2016-06-01 Thread Mark Fasheh
On Wed, Jun 01, 2016 at 02:15:22PM -0700, Mark Fasheh wrote:
> > +static int ref_tree_add(struct ref_root *ref_tree, u64 root_id, u64 
> > object_id,
> > +   u64 offset, u64 parent, int count)
> > +{
> > +   struct ref_node *node = NULL;
> > +   struct rb_node **pos = NULL;
> > +   struct rb_node *pos_parent = NULL;
> > +   int origin_count;
> > +   int ret;
> > +
> > +   if (!count)
> > +   return 0;
> > +
> > +   node = __ref_tree_search(ref_tree, , _parent, root_id,
> > +object_id, offset, parent);
> > +   if (node == NULL) {
> > +   node = kmalloc(sizeof(*node), GFP_KERNEL);
> > +   if (!node)
> > +   return -ENOMEM;
> > +
> > +   node->root_id = root_id;
> > +   node->object_id = object_id;
> > +   node->offset = offset;
> > +   node->parent = parent;
> > +   node->ref_mod = count;
> > +
> > +   ret = ref_tree_insert(ref_tree, pos, pos_parent, node);
> > +   ASSERT(!ret);
> > +   if (ret) {
> > +   kfree(node);
> > +   return ret;
> > +   }
> 
> If you put the open coded comparisons into their own function, then it
> should be trivial to call that here and we can have a 'standard' looking
> rbtree insert instead of this custom version. See the rbtree.h header for an
> example.

oops, I meant Documentation/rbtree.txt instead of include/linux/rbtree.h.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] btrfs: fix check_shared for fiemap ioctl

2016-06-01 Thread Mark Fasheh
   continue;
> + }

These 4 comparisons need to be in their own cmp() function - we want to
avoid open coded comparisons with an rbtree search. This is a pretty
standard way to handle it.


> +
> + return cur;
> + }
> +
> + return NULL;
> +}
> +
> +/*
> + * insert a ref_node to the ref tree
> + * @pos used for specifiy the position to insert
> + * @pos_parent for specifiy pos's parent
> + *
> + * success, return 0;
> + * ref_node already exists, return -EEXIST;
> +*/
> +static int ref_tree_insert(struct ref_root *ref_tree, struct rb_node **pos,
> +struct rb_node *pos_parent, struct ref_node *ins)
> +{
> + struct rb_node **p = NULL;
> + struct rb_node *parent = NULL;
> + struct ref_node *cur = NULL;
> +
> + if (!pos) {
> + cur = __ref_tree_search(ref_tree, , , ins->root_id,
> + ins->object_id, ins->offset,
> + ins->parent);
> + if (cur)
> + return -EEXIST;
> + } else {
> + p = pos;
> + parent = pos_parent;
> + }
> +
> + rb_link_node(>rb_node, parent, p);
> + rb_insert_color(>rb_node, _tree->rb_root);
> +
> + return 0;
> +}
> +
> +/* erase and free ref_node, caller should update ref_root->unique_refs */
> +static void ref_tree_remove(struct ref_root *ref_tree, struct ref_node *node)
> +{
> + rb_erase(>rb_node, _tree->rb_root);
> + kfree(node);
> +}
> +
> +/*
> + * update ref_root->unique_refs
> + *
> + * call __ref_tree_search
> + *   1. if ref_node doesn't exist, ref_tree_insert this node, and update
> + *   ref_root->unique_refs:
> + *   if ref_node->ref_mod > 0, ref_root->unique_refs++;
> + *   if ref_node->ref_mod < 0, do noting;
> + *
> + *   2. if ref_node is found, then get origin ref_node->ref_mod, and update
> + *   ref_node->ref_mod.
> + *   if ref_node->ref_mod is equal to 0,then call ref_tree_remove
> + *
> + *   according to origin_mod and new_mod, update ref_root->items
> + *   ++--+-+
> + *   ||new_count <= 0|new_count > 0|
> + *   ++--+-+
> + *   |origin_count < 0|   0  |  1  |
> + *   ++--+-+
> + *   |origin_count > 0|  -1  |  0  |
> + *   ++--+-+
> + *
> + * In case of allocation failure, -ENOMEM is returned and the ref_tree stays
> + * unaltered.
> + * Success, return 0
> + */
> +static int ref_tree_add(struct ref_root *ref_tree, u64 root_id, u64 
> object_id,
> + u64 offset, u64 parent, int count)
> +{
> + struct ref_node *node = NULL;
> + struct rb_node **pos = NULL;
> + struct rb_node *pos_parent = NULL;
> + int origin_count;
> + int ret;
> +
> + if (!count)
> + return 0;
> +
> + node = __ref_tree_search(ref_tree, , _parent, root_id,
> +  object_id, offset, parent);
> + if (node == NULL) {
> + node = kmalloc(sizeof(*node), GFP_KERNEL);
> + if (!node)
> + return -ENOMEM;
> +
> + node->root_id = root_id;
> + node->object_id = object_id;
> + node->offset = offset;
> + node->parent = parent;
> + node->ref_mod = count;
> +
> + ret = ref_tree_insert(ref_tree, pos, pos_parent, node);
> + ASSERT(!ret);
> + if (ret) {
> + kfree(node);
> + return ret;
> + }

If you put the open coded comparisons into their own function, then it
should be trivial to call that here and we can have a 'standard' looking
rbtree insert instead of this custom version. See the rbtree.h header for an
example.

We really need to avoid open coded, 'custom' versions of standard Linux
kernel programming conventions. rbtrees are coded in a specific and
standard way across the entire kernel that everyone can quickly understand
and verify. There is no reason for an exception here.

Also, __ref_tree_search() and ref_tree_insert() are both re-doing the same
search so your custom rbtree() code is also slower than what we'd get if
you used the usual methods.


Please CC me on any revisions to this patch as I am very interested to see
btrfs fiemap performance get back in line with other filesystems.

Thanks,
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v10 04/21] btrfs: dedupe: Introduce function to remove hash from in-memory tree

2016-06-01 Thread Mark Fasheh
On Fri, Apr 01, 2016 at 02:34:55PM +0800, Qu Wenruo wrote:
> From: Wang Xiaoguang <wangxg.f...@cn.fujitsu.com>
> 
> Introduce static function inmem_del() to remove hash from in-memory
> dedupe tree.
> And implement btrfs_dedupe_del() and btrfs_dedup_destroy() interfaces.
> 
> Signed-off-by: Qu Wenruo <quwen...@cn.fujitsu.com>
> Signed-off-by: Wang Xiaoguang <wangxg.f...@cn.fujitsu.com>
> ---
>  fs/btrfs/dedupe.c | 105 
> ++
>  1 file changed, 105 insertions(+)
> 
> diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
> index 4e8455e..a229ded 100644
> --- a/fs/btrfs/dedupe.c
> +++ b/fs/btrfs/dedupe.c
> @@ -303,3 +303,108 @@ int btrfs_dedupe_add(struct btrfs_trans_handle *trans,
>   return inmem_add(dedupe_info, hash);
>   return -EINVAL;
>  }
> +
> +static struct inmem_hash *
> +inmem_search_bytenr(struct btrfs_dedupe_info *dedupe_info, u64 bytenr)
> +{
> + struct rb_node **p = _info->bytenr_root.rb_node;
> + struct rb_node *parent = NULL;
> + struct inmem_hash *entry = NULL;
> +
> + while (*p) {
> + parent = *p;
> + entry = rb_entry(parent, struct inmem_hash, bytenr_node);
> +
> + if (bytenr < entry->bytenr)
> + p = &(*p)->rb_left;
> + else if (bytenr > entry->bytenr)
> + p = &(*p)->rb_right;
> + else
> + return entry;
> + }
> +
> + return NULL;
> +}
> +
> +/* Delete a hash from in-memory dedupe tree */
> +static int inmem_del(struct btrfs_dedupe_info *dedupe_info, u64 bytenr)
> +{
> + struct inmem_hash *hash;
> +
> + mutex_lock(_info->lock);
> + hash = inmem_search_bytenr(dedupe_info, bytenr);
> + if (!hash) {
> + mutex_unlock(_info->lock);
> + return 0;
> + }
> +
> + __inmem_del(dedupe_info, hash);
> + mutex_unlock(_info->lock);
> + return 0;
> +}
> +
> +/* Remove a dedupe hash from dedupe tree */
> +int btrfs_dedupe_del(struct btrfs_trans_handle *trans,
> +  struct btrfs_fs_info *fs_info, u64 bytenr)
> +{
> + struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info;
> +
> + if (!fs_info->dedupe_enabled)
> + return 0;
> +
> + if (WARN_ON(dedupe_info == NULL))
> + return -EINVAL;
> +
> + if (dedupe_info->backend == BTRFS_DEDUPE_BACKEND_INMEMORY)
> + return inmem_del(dedupe_info, bytenr);
> + return -EINVAL;
> +}
> +
> +static void inmem_destroy(struct btrfs_dedupe_info *dedupe_info)
> +{
> + struct inmem_hash *entry, *tmp;
> +
> + mutex_lock(_info->lock);
> + list_for_each_entry_safe(entry, tmp, _info->lru_list, lru_list)
> + __inmem_del(dedupe_info, entry);
> + mutex_unlock(_info->lock);
> +}
> +
> +int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
> +{
> + struct btrfs_dedupe_info *dedupe_info;
> + int ret;
> +
> + /* Here we don't want to increase refs of dedupe_info */
> + fs_info->dedupe_enabled = 0;

Can this clear of fs_info->dedupe_enabled race with another thread in write?
I don't see any locking (but perhaps that comes in a later patch).
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v10 03/21] btrfs: dedupe: Introduce function to add hash into in-memory tree

2016-06-01 Thread Mark Fasheh
On Fri, Apr 01, 2016 at 02:34:54PM +0800, Qu Wenruo wrote:
> From: Wang Xiaoguang <wangxg.f...@cn.fujitsu.com>
> 
> Introduce static function inmem_add() to add hash into in-memory tree.
> And now we can implement the btrfs_dedupe_add() interface.
> 
> Signed-off-by: Qu Wenruo <quwen...@cn.fujitsu.com>
> Signed-off-by: Wang Xiaoguang <wangxg.f...@cn.fujitsu.com>
> ---
>  fs/btrfs/dedupe.c | 151 
> ++
>  1 file changed, 151 insertions(+)
> 
> diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
> index 2211588..4e8455e 100644
> --- a/fs/btrfs/dedupe.c
> +++ b/fs/btrfs/dedupe.c
> @@ -32,6 +32,14 @@ struct inmem_hash {
>   u8 hash[];
>  };
>  
> +static inline struct inmem_hash *inmem_alloc_hash(u16 type)
> +{
> + if (WARN_ON(type >= ARRAY_SIZE(btrfs_dedupe_sizes)))
> + return NULL;
> + return kzalloc(sizeof(struct inmem_hash) + btrfs_dedupe_sizes[type],
> + GFP_NOFS);
> +}
> +
>  static int init_dedupe_info(struct btrfs_dedupe_info **ret_info, u16 type,
>   u16 backend, u64 blocksize, u64 limit)
>  {
> @@ -152,3 +160,146 @@ enable:
>   fs_info->dedupe_enabled = 1;
>   return ret;
>  }
> +
> +static int inmem_insert_hash(struct rb_root *root,
> +  struct inmem_hash *hash, int hash_len)
> +{
> + struct rb_node **p = >rb_node;
> + struct rb_node *parent = NULL;
> + struct inmem_hash *entry = NULL;
> +
> + while (*p) {
> + parent = *p;
> + entry = rb_entry(parent, struct inmem_hash, hash_node);
> + if (memcmp(hash->hash, entry->hash, hash_len) < 0)
> + p = &(*p)->rb_left;
> + else if (memcmp(hash->hash, entry->hash, hash_len) > 0)
> + p = &(*p)->rb_right;
> + else
> + return 1;
> + }
> + rb_link_node(>hash_node, parent, p);
> + rb_insert_color(>hash_node, root);
> + return 0;
> +}
> +
> +static int inmem_insert_bytenr(struct rb_root *root,
> +struct inmem_hash *hash)
> +{
> + struct rb_node **p = >rb_node;
> + struct rb_node *parent = NULL;
> + struct inmem_hash *entry = NULL;
> +
> + while (*p) {
> + parent = *p;
> + entry = rb_entry(parent, struct inmem_hash, bytenr_node);
> + if (hash->bytenr < entry->bytenr)
> + p = &(*p)->rb_left;
> + else if (hash->bytenr > entry->bytenr)
> + p = &(*p)->rb_right;
> + else
> + return 1;
> + }
> + rb_link_node(>bytenr_node, parent, p);
> + rb_insert_color(>bytenr_node, root);
> + return 0;
> +}
> +
> +static void __inmem_del(struct btrfs_dedupe_info *dedupe_info,
> + struct inmem_hash *hash)
> +{
> + list_del(>lru_list);
> + rb_erase(>hash_node, _info->hash_root);
> + rb_erase(>bytenr_node, _info->bytenr_root);
> +
> + if (!WARN_ON(dedupe_info->current_nr == 0))
> + dedupe_info->current_nr--;
> +
> + kfree(hash);
> +}
> +
> +/*
> + * Insert a hash into in-memory dedupe tree
> + * Will remove exceeding last recent use hash.
> + *
> + * If the hash mathced with existing one, we won't insert it, to
> + * save memory
> + */
> +static int inmem_add(struct btrfs_dedupe_info *dedupe_info,
> +  struct btrfs_dedupe_hash *hash)
> +{
> + int ret = 0;
> + u16 type = dedupe_info->hash_type;
> + struct inmem_hash *ihash;
> +
> + ihash = inmem_alloc_hash(type);
> +
> + if (!ihash)
> + return -ENOMEM;
> +
> + /* Copy the data out */
> + ihash->bytenr = hash->bytenr;
> + ihash->num_bytes = hash->num_bytes;
> + memcpy(ihash->hash, hash->hash, btrfs_dedupe_sizes[type]);
> +
> + mutex_lock(_info->lock);

Can you describe somewhere in a comment why we need this mutex? It is
unclear just based on reading the code why we need a sleeping lock here.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Improve balance performance when qgroups are turned on

2016-05-31 Thread Mark Fasheh
On Wed, Jun 01, 2016 at 09:31:14AM +0800, Qu Wenruo wrote:
> Thanks for the test case.
> It would be better if you could submit a test case for it.

Yeah I'll handle this.


> Reproduced the problem. I'll track it down.
> Seems to be related with metadata.

Thanks, please CC me on any patches.
    --Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Improve balance performance when qgroups are turned on

2016-05-31 Thread Mark Fasheh
On Mon, May 30, 2016 at 03:48:14PM +0800, Qu Wenruo wrote:
> 
> 
> Mark Fasheh wrote on 2016/05/26 17:18 -0700:
> >The btrfs balance operation is significantly slower when qgroups are
> >enabled. To the best of my knowledge, a balance shouldn't have an effect on
> >qgroups counts (extents are not changing between subvolumes), so we don't
> >need to actually run the qgroup code when we balance.
> 
> This assumption is questionable.
> 
> When balancing, it's true we will set the chunk to ro, so new
> *allocation* won't happen in that chunk.
> 
> However we can still de-refer an extent during balance.
> 
> If that happens and we skipped the qgroup accounting, corruption happens.
> As the extent before and after balance won't go through qgroup, so
> it's de-reference won't be accounted.

Ok, thanks for the review. I was afraid that this was the case.


> While without your patch, the final qgroup is stable with 16KiB.

Qgroups in general are broken with respect to balance. The following script
reproduces an inconsistency every time I run it. You'll notice that qgroups
aren't even turned on until before we do the balance op. Like the snap
create bug, I believe you simply need a non-trivial amount of data on the fs
for testing.


#!/bin/bash -x

MNT="/btrfs"
DEV="/dev/vdb1"

mkfs.btrfs -f $DEV  
mount -t btrfs $DEV $MNT

mkdir $MNT/snaps
echo "populate $MNT with some data"
#cp -a /usr/share/fonts $MNT/
cp -a /usr/ $MNT/ &
for i in `seq -w 0 8`; do
S="$MNT/snaps/snap$i"
echo "create and populate $S"
btrfs su snap $MNT $S;
cp -a /boot $S;
done;

#let the cp from above finish
wait

btrfs fi sync $MNT

btrfs quota enable $MNT
btrfs quota rescan -w $MNT
btrfs qg show $MNT

umount $MNT

mount -t btrfs $DEV $MNT


time btrfs balance start --full-balance $MNT

umount $MNT

btrfsck $DEV



> The xfstest case will follow soon.

Ok, that will help the next time someone tries to fix this, thanks.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs: don't BUG_ON() in btrfs_orphan_add

2016-05-31 Thread Mark Fasheh
On Fri, May 27, 2016 at 01:03:04PM -0400, Josef Bacik wrote:
> This is just a screwup for developers, so change it to an ASSERT() so 
> developers
> notice when things go wrong and deal with the error appropriately if ASSERT()
> isn't enabled.  Thanks,
> 
> Signed-off-by: Josef Bacik <jba...@fb.com>

Reviewed-by: Mark Fasheh <mfas...@suse.de>

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] fstests: btrfs: add test for qgroup handle de-refer

2016-05-31 Thread Mark Fasheh
Thanks for this test.

On Tue, May 31, 2016 at 10:03:54AM +0800, Lu Fengqi wrote:
> +echo "Start balance" >>$seqres.full
> +_btrfs_stress_balance -d $SCRATCH_MNT >/dev/null 2>&1 &
> +balance_pid=$!
> +
> +# 30s is enough to trigger bug
> +sleep $((30*$TIME_FACTOR))
> +kill $fsstress_pid $balance_pid
> +wait
> +
> +# kill _btrfs_stress_balance can't end balance, so call btrfs balance cancel
> +# to cancel running or paused balance.
> +$BTRFS_UTIL_PROG balance cancel $SCRATCH_MNT &> /dev/null
> +
> +rm -rf $SCRATCH_MNT/*
> +_run_btrfs_util_prog filesystem sync $SCRATCH_MNT
> +units=`_btrfs_qgroup_units`
> +$BTRFS_UTIL_PROG qgroup show $units $SCRATCH_MNT | $SED_PROG -n '/[0-9]/p' | 
> \
> + $AWK_PROG '{print $2" "$3}'

Wouldn't it be better here to just have btrfsck check for inconsistencies?
If you look at tests/btrfs/122 this is what I mean:

+# generate a qgroup report and look for inconsistent groups
+$BTRFS_UTIL_PROG check --qgroup-report $SCRATCH_DEV 2>&1 | \
+   grep -q -E "Counts for qgroup.*are different"
+if [ $? -ne 0 ]; then
+   status=0
+fi

That way we're not keying on some specific value showing up but instead that
qgroup validation passes (which is really what we want to test).

Thanks,
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Improve balance performance when qgroups are turned on

2016-05-26 Thread Mark Fasheh
The btrfs balance operation is significantly slower when qgroups are
enabled. To the best of my knowledge, a balance shouldn't have an effect on
qgroups counts (extents are not changing between subvolumes), so we don't
need to actually run the qgroup code when we balance.

Since there's only one thread doing balance at a time, it's easy to record
that thread on the fs_info and check it inside qgroup_insert_dirty_extent().
If we're the balance thread, we drop the qgroup record instead of inserting
it.

Here are some sample numbers before and after this patch. The example fs
below is 22 gigabytes in size and was creating by copying /usr and /boot
from my test machine (a few times).

Balance with qgroups enabled, before patch:
# time btrfs balance start --full-balance /btrfs
Done, had to relocate 26 out of 26 chunks

real3m7.515s
user0m0.002s
sys 2m0.852s

Balance with qgroups enabled, after patch:
# time btrfs balance start --full-balance /btrfs
Done, had to relocate 26 out of 26 chunks

real2m2.806s
user0m0.000s
sys 0m54.174s

Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 fs/btrfs/ctree.h   | 1 +
 fs/btrfs/delayed-ref.c | 2 +-
 fs/btrfs/disk-io.c | 1 +
 fs/btrfs/extent-tree.c | 2 +-
 fs/btrfs/qgroup.c  | 6 +-
 fs/btrfs/qgroup.h  | 3 ++-
 fs/btrfs/volumes.c | 4 
 7 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index bfe4a33..994f19a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1748,6 +1748,7 @@ struct btrfs_fs_info {
atomic_t balance_cancel_req;
struct btrfs_balance_control *balance_ctl;
wait_queue_head_t balance_wait_q;
+   struct task_struct *balance_thread;
 
unsigned data_chunk_allocations;
unsigned metadata_ratio;
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 914ac13..81e9b92 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -606,7 +606,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
qrecord->num_bytes = num_bytes;
qrecord->old_roots = NULL;
 
-   qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs,
+   qexisting = btrfs_qgroup_insert_dirty_extent(fs_info, 
delayed_refs,
 qrecord);
if (qexisting)
kfree(qrecord);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4545e2e..0bbdf808 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2236,6 +2236,7 @@ static void btrfs_init_balance(struct btrfs_fs_info 
*fs_info)
atomic_set(_info->balance_cancel_req, 0);
fs_info->balance_ctl = NULL;
init_waitqueue_head(_info->balance_wait_q);
+   fs_info->balance_thread = NULL;
 }
 
 static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e2287c7..33c784c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8195,7 +8195,7 @@ static int record_one_subtree_extent(struct 
btrfs_trans_handle *trans,
 
delayed_refs = >transaction->delayed_refs;
spin_lock(_refs->lock);
-   if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord))
+   if (btrfs_qgroup_insert_dirty_extent(root->fs_info, delayed_refs, 
qrecord))
kfree(qrecord);
spin_unlock(_refs->lock);
 
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 6541d56..994ccb2 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1454,7 +1454,8 @@ int btrfs_qgroup_prepare_account_extents(struct 
btrfs_trans_handle *trans,
 }
 
 struct btrfs_qgroup_extent_record
-*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
+*btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
  struct btrfs_qgroup_extent_record *record)
 {
struct rb_node **p = _refs->dirty_extent_root.rb_node;
@@ -1462,6 +1463,9 @@ struct btrfs_qgroup_extent_record
struct btrfs_qgroup_extent_record *entry;
u64 bytenr = record->bytenr;
 
+   if (fs_info->balance_thread == current)
+   return record;
+
assert_spin_locked(_refs->lock);
trace_btrfs_qgroup_insert_dirty_extent(record);
 
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index ecb2c14..74e683d 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -64,7 +64,8 @@ struct btrfs_delayed_extent_op;
 int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
 struct btrfs_fs_info *fs_info);
 struct btrfs_qgroup_extent_record
-*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
+*btrfs_qgrou

Re: [PATCH] btrfs,vfs: allow FILE_EXTENT_SAME on a file opened ro

2016-05-26 Thread Mark Fasheh
On Fri, May 20, 2016 at 05:45:12AM +0200, Adam Borowski wrote:
> (Only btrfs currently implements dedupe_file_range.)
> 
> Instead of checking the mode of the file descriptor, let's check whether
> it could have been opened rw.  This allows fixing failures when deduping
> a live system: anyone trying to exec a file currently being deduped gets
> ETXTBSY.
> 
> Issuing this ioctl on a ro file was already allowed for root/cap.
> 
> Signed-off-by: Adam Borowski <kilob...@angband.pl>

Hi Adam, this patch seems reasonable to me but I have to admit to being
worried about 'unintended consequences'. I poked around the code in fs/ for
a bit and saw mostly checks against file open mode. It might be that dedupe
is a special case due to the potential for longer running operations, but
theoretically you'd see the same problem if trying to exec against a file
being cloned too, correct? If that's the case then I wonder how this issue
gets solved for other ioctls.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: About in-band dedupe for v4.7

2016-05-12 Thread Mark Fasheh
On Wed, May 11, 2016 at 07:36:59PM +0200, David Sterba wrote:
> On Tue, May 10, 2016 at 07:52:11PM -0700, Mark Fasheh wrote:
> > Taking your history with qgroups out of this btw, my opinion does not
> > change.
> > 
> > With respect to in-memory only dedupe, it is my honest opinion that such a
> > limited feature is not worth the extra maintenance work. In particular
> > there's about 800 lines of code in the userspace patches which I'm sure
> > you'd want merged, because how could we test this then?
> 
> I like the in-memory dedup backend. It's lightweight, only a heuristic,
> does not need any IO or persistent storage. OTOH I consider it a subpart
> of the in-band deduplication that does all the persistency etc. So I
> treat the ioctl interface from a broader aspect.

Those are all nice qualities, but what do they all get us?

For example, my 'large' duperemove test involves about 750 gigabytes of
general purpose data - quite literally /home off my workstation.

After the run I'm usually seeing between 65-75 gigabytes saved for a total
of only 10% duplicated data. I would expect this to be fairly 'average' -
/home on my machine has the usual stuff - documents, source code, media,
etc.

So if you were writing your whole fs out you could expect about the same
from inline dedupe - 10%-ish. Let's be generous and go with that number
though as a general 'this is how much dedupe we get'.

What the memory backend is doing then is providing a cache of sha256/block
calculations. This cache is very expensive to fill, and every written block
must go through it. On top of that, the cache does not persist between
mounts, and has items regularly removed from it when we run low on memory.
All of this will drive down the amount of duplicated data we can find.

So our best case savings is probably way below 10% - let's be _really_ nice
and say 5%.

Now ask yourself the question - would you accept a write cache which is
expensive to fill and would only have a hit rate of less than 5%?

Oh and there's 800 lines of userspace we'd merge to manage this cache too,
kernel ioctls which would have to be finalized, etc.


> A usecase I find interesting is to keep the in-memory dedup cache and
> then flush it to disk on demand, compared to automatically synced dedup
> (eg. at commit time).

What's the benefit here? We're still going to be hashing blocks on the way
in, and if we're not deduping them at write time then we'll just have to
remove the extents and dedupe them later.


> > A couple examples sore points in my review so far:
> > 
> > - Internally you're using a mutex (instead of a spinlock) to lock out 
> > queries
> >  to the in-memory hash, which I can see becoming a performance problem in 
> > the
> >  write path.
> > 
> > - Also, we're doing SHA256 in the write path which I expect will
> >  slow it down even more dramatically. Given that all the work done gets
> >  thrown out every time we fill the hash (or remount), I just don't see much
> >  benefit to the user with this.
> 
> I had some ideas to use faster hashes and do sha256 when it's going to
> be stored on disk, but there were some concerns. The objection against
> speed and performance hit at write time is valid. But we'll need to
> verify that in real performance tests, which haven't happend yet up to
> my knowledge.

This is the type of thing that IMHO absolutely must be provided with each
code drop of the feature. Dedupe is nice but _nobody_ will use it if it's
slow. I know this from experience. I personally feel that btrfs has had
enough of 'cute' and 'almost working' features. If we want inline dedupe we
should do it correctly and with the right metrics from the beginning.


This is slightly unrelated to our discussion but my other unsolicited
opinion: As a kernel developer and maintainer of a file system for well over
a decade I will say that balancing the number of out of tree patches is
necessary but we should never be accepting of large features just because
'they've been out for a long time'. Again I mention this because other parts
of the discussion felt like they were going in that direction.

Thanks,
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4] btrfs: qgroup: Fix qgroup accounting when creating snapshot

2016-05-11 Thread Mark Fasheh
On Wed, May 11, 2016 at 09:59:52AM -0700, Josef Bacik wrote:
> On 05/11/2016 09:57 AM, Mark Fasheh wrote:
> >Hi Josef,
> >
> >On Fri, Apr 22, 2016 at 02:12:11PM -0400, Josef Bacik wrote:
> >>On 04/15/2016 05:08 AM, Qu Wenruo wrote:
> >>>Current btrfs qgroup design implies a requirement that after calling
> >>>btrfs_qgroup_account_extents() there must be a commit root switch.
> >>>
> >>>Normally this is OK, as btrfs_qgroup_accounting_extents() is only called
> >>>inside btrfs_commit_transaction() just be commit_cowonly_roots().
> >>>
> >>>However there is a exception at create_pending_snapshot(), which will
> >>>call btrfs_qgroup_account_extents() but no any commit root switch.
> >>>
> >>>In case of creating a snapshot whose parent root is itself (create a
> >>>snapshot of fs tree), it will corrupt qgroup by the following trace:
> >>>(skipped unrelated data)
> >>>==
> >>>btrfs_qgroup_account_extent: bytenr = 29786112, num_bytes = 16384, 
> >>>nr_old_roots = 0, nr_new_roots = 1
> >>>qgroup_update_counters: qgid = 5, cur_old_count = 0, cur_new_count = 1, 
> >>>rfer = 0, excl = 0
> >>>qgroup_update_counters: qgid = 5, cur_old_count = 0, cur_new_count = 1, 
> >>>rfer = 16384, excl = 16384
> >>>btrfs_qgroup_account_extent: bytenr = 29786112, num_bytes = 16384, 
> >>>nr_old_roots = 0, nr_new_roots = 0
> >>>==
> >>>
> >>>The problem here is in first qgroup_account_extent(), the
> >>>nr_new_roots of the extent is 1, which means its reference got
> >>>increased, and qgroup increased its rfer and excl.
> >>>
> >>>But at second qgroup_account_extent(), its reference got decreased, but
> >>>between these two qgroup_account_extent(), there is no switch roots.
> >>>This leads to the same nr_old_roots, and this extent just got ignored by
> >>>qgroup, which means this extent is wrongly accounted.
> >>>
> >>>Fix it by call commit_cowonly_roots() after qgroup_account_extent() in
> >>>create_pending_snapshot(), with needed preparation.
> >>>
> >>>Reported-by: Mark Fasheh <mfas...@suse.de>
> >>>Signed-off-by: Qu Wenruo <quwen...@cn.fujitsu.com>
> >>>---
> >>>v2:
> >>>  Fix a soft lockup caused by missing switch_commit_root() call.
> >>>  Fix a warning caused by dirty-but-not-committed root.
> >>>v3:
> >>>  Fix a bug which will cause qgroup accounting for dropping snapshot
> >>>  wrong
> >>>v4:
> >>>  Fix a bug caused by non-cowed btree modification.
> >>>
> >>>To Filipe:
> >>>  I'm sorry I didn't wait for your reply on the dropped roots.
> >>>  I reverted back the version where we deleted dropped roots in
> >>>  switch_commit_roots().
> >>>
> >>>  As I think as long as we called btrfs_qgroup_prepare_account_extents()
> >>>  and btrfs_qgroup_account_extents(), it should have already accounted
> >>>  extents for dropped roots, and then we are OK to drop them.
> >>>
> >>>  It would be very nice if you could point out what I missed.
> >>>  Thanks
> >>>  Qu
> >>>---
> >>> fs/btrfs/transaction.c | 117 
> >>> +++--
> >>> 1 file changed, 93 insertions(+), 24 deletions(-)
> >>>
> >>>diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> >>>index 43885e5..92f8193 100644
> >>>--- a/fs/btrfs/transaction.c
> >>>+++ b/fs/btrfs/transaction.c
> >>>@@ -311,10 +311,11 @@ loop:
> >>>  * when the transaction commits
> >>>  */
> >>> static int record_root_in_trans(struct btrfs_trans_handle *trans,
> >>>- struct btrfs_root *root)
> >>>+ struct btrfs_root *root,
> >>>+ int force)
> >>> {
> >>>-  if (test_bit(BTRFS_ROOT_REF_COWS, >state) &&
> >>>-  root->last_trans < trans->transid) {
> >>>+  if ((test_bit(BTRFS_ROOT_REF_COWS, >state) &&
> >>>+  root->last_trans < trans->transid) || force) {
> >>>   WARN_ON(root == root->fs_info->extent_root);
> >>>   WARN_ON(root->commit_root != root->node);
> >>>
> >>&g

Re: [PATCH v4] btrfs: qgroup: Fix qgroup accounting when creating snapshot

2016-05-11 Thread Mark Fasheh
Hi Josef, 

On Fri, Apr 22, 2016 at 02:12:11PM -0400, Josef Bacik wrote:
> On 04/15/2016 05:08 AM, Qu Wenruo wrote:
> >Current btrfs qgroup design implies a requirement that after calling
> >btrfs_qgroup_account_extents() there must be a commit root switch.
> >
> >Normally this is OK, as btrfs_qgroup_accounting_extents() is only called
> >inside btrfs_commit_transaction() just be commit_cowonly_roots().
> >
> >However there is a exception at create_pending_snapshot(), which will
> >call btrfs_qgroup_account_extents() but no any commit root switch.
> >
> >In case of creating a snapshot whose parent root is itself (create a
> >snapshot of fs tree), it will corrupt qgroup by the following trace:
> >(skipped unrelated data)
> >==
> >btrfs_qgroup_account_extent: bytenr = 29786112, num_bytes = 16384, 
> >nr_old_roots = 0, nr_new_roots = 1
> >qgroup_update_counters: qgid = 5, cur_old_count = 0, cur_new_count = 1, rfer 
> >= 0, excl = 0
> >qgroup_update_counters: qgid = 5, cur_old_count = 0, cur_new_count = 1, rfer 
> >= 16384, excl = 16384
> >btrfs_qgroup_account_extent: bytenr = 29786112, num_bytes = 16384, 
> >nr_old_roots = 0, nr_new_roots = 0
> >==
> >
> >The problem here is in first qgroup_account_extent(), the
> >nr_new_roots of the extent is 1, which means its reference got
> >increased, and qgroup increased its rfer and excl.
> >
> >But at second qgroup_account_extent(), its reference got decreased, but
> >between these two qgroup_account_extent(), there is no switch roots.
> >This leads to the same nr_old_roots, and this extent just got ignored by
> >qgroup, which means this extent is wrongly accounted.
> >
> >Fix it by call commit_cowonly_roots() after qgroup_account_extent() in
> >create_pending_snapshot(), with needed preparation.
> >
> >Reported-by: Mark Fasheh <mfas...@suse.de>
> >Signed-off-by: Qu Wenruo <quwen...@cn.fujitsu.com>
> >---
> >v2:
> >   Fix a soft lockup caused by missing switch_commit_root() call.
> >   Fix a warning caused by dirty-but-not-committed root.
> >v3:
> >   Fix a bug which will cause qgroup accounting for dropping snapshot
> >   wrong
> >v4:
> >   Fix a bug caused by non-cowed btree modification.
> >
> >To Filipe:
> >   I'm sorry I didn't wait for your reply on the dropped roots.
> >   I reverted back the version where we deleted dropped roots in
> >   switch_commit_roots().
> >
> >   As I think as long as we called btrfs_qgroup_prepare_account_extents()
> >   and btrfs_qgroup_account_extents(), it should have already accounted
> >   extents for dropped roots, and then we are OK to drop them.
> >
> >   It would be very nice if you could point out what I missed.
> >   Thanks
> >   Qu
> >---
> >  fs/btrfs/transaction.c | 117 
> > +++--
> >  1 file changed, 93 insertions(+), 24 deletions(-)
> >
> >diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> >index 43885e5..92f8193 100644
> >--- a/fs/btrfs/transaction.c
> >+++ b/fs/btrfs/transaction.c
> >@@ -311,10 +311,11 @@ loop:
> >   * when the transaction commits
> >   */
> >  static int record_root_in_trans(struct btrfs_trans_handle *trans,
> >-   struct btrfs_root *root)
> >+   struct btrfs_root *root,
> >+   int force)
> >  {
> >-if (test_bit(BTRFS_ROOT_REF_COWS, >state) &&
> >-root->last_trans < trans->transid) {
> >+if ((test_bit(BTRFS_ROOT_REF_COWS, >state) &&
> >+root->last_trans < trans->transid) || force) {
> > WARN_ON(root == root->fs_info->extent_root);
> > WARN_ON(root->commit_root != root->node);
> >
> >@@ -331,7 +332,7 @@ static int record_root_in_trans(struct 
> >btrfs_trans_handle *trans,
> > smp_wmb();
> >
> > spin_lock(>fs_info->fs_roots_radix_lock);
> >-if (root->last_trans == trans->transid) {
> >+if (root->last_trans == trans->transid && !force) {
> > spin_unlock(>fs_info->fs_roots_radix_lock);
> > return 0;
> > }
> >@@ -402,7 +403,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle 
> >*trans,
> > return 0;
> >
> > mutex_lock(>fs_info->reloc_mutex);
> >-record_root_in_trans(trans, root);
> >+r

Re: About in-band dedupe for v4.7

2016-05-10 Thread Mark Fasheh
On Wed, May 11, 2016 at 09:40:51AM +0800, Qu Wenruo wrote:
> 
> 
> Chris Mason wrote on 2016/05/10 20:37 -0400:
> >On Tue, May 10, 2016 at 03:19:52PM +0800, Qu Wenruo wrote:
> >>Hi, Chris, Josef and David,
> >>
> >>As merge window for v4.7 is coming, it would be good to hear your ideas
> >>about the inband dedupe.
> >>
> >>We are addressing the ENOSPC problem which Josef pointed out, and we believe
> >>the final fix patch would come out at the beginning of the merge
> >>window.(Next week)
> >>
> >>
> >>If it's fine, would you please consider to merge the in-memory backend
> >>patchset for v4.7 as an experimental feature?
> >>
> >>
> >>Most of the patch won't be changed from v10 patchset, only ENOSPC fix will
> >>be updated, and ioctl patchset will introduce a new Kconfig option of "btrfs
> >>experimental features" for inband dedupe.
> >>(With explain about unstable ioctl/on-disk format for experimental features)
> >>
> >>
> >>If you are all OK to merge inband dedupe in-memory backend, I'll prepare the
> >>new v11 patchset for this merge.
> >
> >We have to balance the part where we really want the features to come
> >in, and we want to lower the load on you to continue porting them.  But,
> >I really do agree that we need strong test suites included with every
> >major feature like this.
> >
> >-chris
> >
> >
> That's fine.
> 
> We're running all generic and btrfs test case with dedupe enabled,
> by modifying xfstest to call "btrfs dedeup enable" just after mount,
> to ensure dedupe won't corrupt any existing test case.

As Satoru mentioned, this is something that everybody needs to be able to
run. I would also like to see some basic analysis done on write-heavy
workloads. I think it's fair to understand what sort of impact this will
have on the write path.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: About in-band dedupe for v4.7

2016-05-10 Thread Mark Fasheh
On Wed, May 11, 2016 at 09:03:24AM +0800, Qu Wenruo wrote:
> 
> 
> Mark Fasheh wrote on 2016/05/10 15:11 -0700:
> >On Tue, May 10, 2016 at 03:19:52PM +0800, Qu Wenruo wrote:
> >>Hi, Chris, Josef and David,
> >>
> >>As merge window for v4.7 is coming, it would be good to hear your
> >>ideas about the inband dedupe.
> >>
> >>We are addressing the ENOSPC problem which Josef pointed out, and we
> >>believe the final fix patch would come out at the beginning of the
> >>merge window.(Next week)
> >
> >How about the fiemap performance problem you referenced before? My guess is
> >that it happens because you don't coalesce writes into anything larger than
> >a page so you're stuck deduping at some silly size like 4k. This in turn
> >fragments the files so much that fiemap has a hard time walking backrefs.
> Nope. Default dedupe size is 128K, and minimal dedupe size is
> limited to 64K maximum to 8M.
> Yes, it's going to cause fragements, but just check the test case I
> submitted, it doesn't ever need dedupe to trigger the bug.
> Clone range will also trigger it.

Yes, as you might see I've been looking through the patches now.


> >I have to check the patches to be sure but perhaps you can tell me whether
> >my hunch is correct or not.
> >
> >
> >In fact, I actually asked privately for time to review your dedupe patches,
> >but I've been literally so busy cleaning up after the mess you left in your
> >last qgroups rewrite I haven't had time.
> >
> >You literally broke qgroups in almost every spot that matters. In some cases
> >(drop_snapshot) you tore out working code and left in a /* TODO */ comment
> >for someone else to complete.  Snapshot create was so trivially and
> >completely broken by your changes that weeks later, I'm still hunting a
> >solution which doesn't involve adding an extra _commit_ to our commit.  This
> >is a MASSIVE regression from where we were before.
> 
> If you think my rework is a mess, then before that, qgroup is just
> rubbish, it can't even handle reflink between subvolume(btrfs/091),
> not to mention the hell of leaked reserved space(btrfs/099).

That's fine, I agree that the qgroups code was rubbish, but you replaced it
with something that was known to be broken. Just look at the /* TODO */ in
your original patch series. We were making incremental improvements before
and you threw that all out. Can you fault me for wondering what unsolved
problems await us in your dedupe patches?


> You were just overlooking the old qgroup things, thinking old one
> working and use the corner spot to express your disappointment.

No, I'm not happy about you leaving qgroups more broken than when you
started. There's fixing a bug, and there's trading one bug for several
others. You have done the latter.


> >IMHO, you should not be trusted with large features or rewrites until you
> >can demonstrate:
> >
> > - A willingness to *completely* solve the problem you are trying to 'fix',
> >   not do half the job which someone else will have to complete for you.
> 
> OK, just let the qgroup break forever?

You don't get credit for a partial solution!

Stepping back here for a second, I'm happy that you tried to fix a bug. I'm
extremely disappointed in how you went about it - with a laser focus on
fixing that one bug at the expense of others. Fixing reflink between
subvolumes does us no good if you broke subvolume create and subvolume
deletion - the accounting is still wrong! The same goes for reservations.
Who cares if they don't work when the basic accounting is reporting
subvolumes with -121231247283946 bytes in them.


> Have you ever noticed the problem before the rework? What are you
> doing before that?
> Did you ever want to fix it before the rework?
> 
> Yes good corner spot and good whole accounting is perfect.
> 
> But it's super wired for you to think bad whole accounting with good
> corner case is even better than good whole accounting with bad
> corner case.
> 
> Any sane will fix the whole part thing first, and then the corner case.

Where are your corner case fixes then? This is what I'm talking about with
respect to half complete solutions!


> > - Actual testing. The snapshot bug I reference above exists purely because
> >   nobody created a snapshot inside of one and checked the qgroup numbers!
> 
> Just check the qgroup test cases.
> 
> Same word can also be said to yourself.
> 
> Why didn't you spot leaked reserve space and reflink problem and
> even the long failed test cases of qgroup test group?
> 
> It's always easier to complain others work than provide a good one.

Just so we're clear I wrote none of the original qgroup code, so those would
b

Re: [PATCH v10 02/21] btrfs: dedupe: Introduce function to initialize dedupe info

2016-05-10 Thread Mark Fasheh
On Fri, Apr 01, 2016 at 05:59:13PM +0800, kbuild test robot wrote:
> Hi Wang,
> 
> [auto build test ERROR on btrfs/next]
> [also build test ERROR on v4.6-rc1 next-20160401]
> [if your patch is applied to the wrong git tree, please drop us a note to 
> help improving the system]
> 
> url:
> https://github.com/0day-ci/linux/commits/Qu-Wenruo/Btrfs-dedupe-framework/20160401-143937
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs.git 
> next
> config: x86_64-rhel (attached as .config)
> reproduce:
> # save the attached .config to linux build tree
> make ARCH=x86_64 
> 
> Note: the linux-review/Qu-Wenruo/Btrfs-dedupe-framework/20160401-143937 HEAD 
> 0a445f5009c064ee1d3fc966e41bb75627594afe builds fine.
>   It only hurts bisectibility.
> 
> All errors (new ones prefixed by >>):
> 
> >> ERROR: "btrfs_dedupe_disable" [fs/btrfs/btrfs.ko] undefined!
> 
> ---
> 0-DAY kernel test infrastructureOpen Source Technology Center
> https://lists.01.org/pipermail/kbuild-all   Intel Corporation

Please correct this, we need to be able to bisect a kernel without random
patches breaking the build.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: About in-band dedupe for v4.7

2016-05-10 Thread Mark Fasheh
On Tue, May 10, 2016 at 03:19:52PM +0800, Qu Wenruo wrote:
> Hi, Chris, Josef and David,
> 
> As merge window for v4.7 is coming, it would be good to hear your
> ideas about the inband dedupe.
> 
> We are addressing the ENOSPC problem which Josef pointed out, and we
> believe the final fix patch would come out at the beginning of the
> merge window.(Next week)

How about the fiemap performance problem you referenced before? My guess is
that it happens because you don't coalesce writes into anything larger than
a page so you're stuck deduping at some silly size like 4k. This in turn
fragments the files so much that fiemap has a hard time walking backrefs.

I have to check the patches to be sure but perhaps you can tell me whether
my hunch is correct or not.


In fact, I actually asked privately for time to review your dedupe patches,
but I've been literally so busy cleaning up after the mess you left in your
last qgroups rewrite I haven't had time.

You literally broke qgroups in almost every spot that matters. In some cases
(drop_snapshot) you tore out working code and left in a /* TODO */ comment
for someone else to complete.  Snapshot create was so trivially and
completely broken by your changes that weeks later, I'm still hunting a
solution which doesn't involve adding an extra _commit_ to our commit.  This
is a MASSIVE regression from where we were before.

IMHO, you should not be trusted with large features or rewrites until you  
can demonstrate:

 - A willingness to *completely* solve the problem you are trying to 'fix',
   not do half the job which someone else will have to complete for you.

 - Actual testing. The snapshot bug I reference above exists purely because
   nobody created a snapshot inside of one and checked the qgroup numbers!  

Sorry to be so harsh.
   --Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Idea on compatibility for old distributions

2016-05-09 Thread Mark Fasheh
On Tue, May 10, 2016 at 09:16:04AM +0800, Qu Wenruo wrote:
> In the recent test for new btrfs-convert backward compatibility, I
> found that cmds-fi-du.c uses FIEMAP_EXTENT_SHARED bits, which is not
> present in kernel of old distributions like RHEL6 (Sorry, didn't
> test on openSUSE equivalent).
> 
> Unlike e2fsprogs, we can check its version with pkgconfig, any idea
> to avoid such compiling error?

#ifndef FIEMAP_EXTENT_SHARED
#define FIEMAP_EXTENT_SHARED   0x2000
#endif

This is what I do in duperemove. Many distributions didn't update their
kernel header packages for that bit so even though it's in the kernel
userspace programs using it won't compile.


> And further more, without kernel support for FIEMAP_EXTENT_SHARED,
> will fi-du work anymore?

RHEL6 says they're using 2.6.32 which is just before the introduction of
SHARED. It could be that they have it though considering there is an
immense number of backports done for distro kernels. I would honestly check
the source. What we're really looking to see is what version of btrfs they
have.

With the above patch we would compile and run everywhere. If someone runs an
obsolete version of btrfs then fi du won't report any shared extents. This
is unfortunate but so is running btrfs from 2.6.32.
    --Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs: fix qgroup accounting when snapshotting

2016-05-06 Thread Mark Fasheh
;>But your fix provides a very potential fix method.
> >>If we didn't do the DIR_ITEM insert in create_pending_snapshot, but do
> >>the insert after all qgroup_inherit() is done,
> >>the problem may have a better fix.
> >>
> >>Although I am still concerning about the DIR_ITEM insert.
> >>As we still need to account them, and since we must run qgroup
> >>accounting before qgroup_inherit(), I'm afraid we still need to do the
> >>commit hack though.
> >>
> >
> >Ugh I forgot about that.  It would be nice to use the tree mod log here,
> >but the rework makes that tricky.  Basically we'd need to delay any
> >modifications to the extent tree until after we do the inherit, so do
> >btrfs_get_tree_mod_seq() and store it in the pending, and then do the
> >inherit, and then put the seq and re-run the delayed refs and the qgroup
> >accounting.
> >
> >This is hard because this will keep us from running delayed refs, and we
> >do btrfs_run_delayed_refs(-1) a few times in between so we'd deadlock
> >because we would find delayed refs on the tree still.
> >
> >I'm not sure how to fix this without undoing what we have and going
> >back.  I'll think about it some more.  Thanks,
> >
> >Josef
> >
> >
> >
> I think your idea on moving qgroup_inherit() out is already good enough.
> 
> If we use the __commit_trans() method, we can already make things
> much cleaner.
> 
> We only need to do one qgroup accounting (including switching roots
> though) before create_pending_snapshots() (don't do DIR ITEM
> insert).
> 
> Finally, doing all DIR_ITEM insert, and remaining qgroup will be
> accounted by normal commit routine.
> 
> Already a great improvement compared to old commit_trans() every
> time we create one snapshot.
> 
> For tree_mod_seq() method, maybe we can reverted it, but I'm not
> sure if there will cause qgroup problem, as the old qgroup bugs are
> all related to backref walk on delayed_refs (while backref walk on
> extent tree is always OK).

Josef, can I please get some more attention on this topic? What Qu proposes
above seems like it will still keep the partial commit which you were very
much against. However, your patch falls over quite quickly in testing. On
my end I've tried a few things, excluding the partial commit for obvious
reasons. As soon as I think I have something that works though, it falls
over once I poke it with a larger test.

In particular, I've been trying to move around the points at which we are
taking our values for qgroup_inherit(). I can get rfer values _almost_
always correct by recording them off the source qgroup at the top of
create_snapshot() (including running qgroup accounting before that).  excl
though is always blown up.  I also tried manually counting changed blocks
during our directory insert in create_snapshot() but that fell apart pretty
quickly.

One problem is that the assumption in btrfs_qgroup_inherit() that excl for
the target group should always be 1 tree node in size is laughably incorrect
and winds up overwriting the sometimes-correct values.  My guess is that for
the old code this was just an initial value (and the rest was added later). 
This is obviously not the case after Qu's rewrite.  So we'll have to correct
that somehow in our fix.

I can't quite tell (yet) what happens on my larger tests to make the values
go bad, but my guess based on the conversation so far is that we can't
reliably count roots at this stage in the commit process. If we can't do
that, 100% correctly every time then the qgroup accounting has no hope of
working correctly.

Do you agree with this? What can we do to fix the root counting code? Your
description above seems pretty good but there's a deadlock there I have no
clue how to fix (perhaps you could help with that?).  Finally, do we have to
look more seriously at doing the partial commit you wanted to avoid in the
first place?
--Mark

PS: In order to make this all go faster I have included my xfstests patch
for this bug so nobody has to gate behind my testing.


From: Mark Fasheh <mfas...@suse.de>

[PATCH] btrfs: Test that qgroup counts are valid after snapshot creation

This has been broken since Linux v4.1. We may have worked out a solution on
the btrfs list but in the meantime sending a test to expose the issue seems
like a good idea.

Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 tests/btrfs/122 | 88 +
 tests/btrfs/122.out |  1 +
 tests/btrfs/group   |  1 +
 3 files changed, 90 insertions(+)
 create mode 100755 tests/btrfs/122
 create mode 100644 tests/btrfs/122.out

diff --git a/tests/btrfs/122 b/tests/btrfs/122
new file mode 100755
index 000..82252ab
--- /dev/null
+++ b/tests/btrfs/122
@@ -0,0 +1,88 @@
+#! /b

Re: [PATCH] Btrfs: fix qgroup accounting when snapshotting

2016-04-26 Thread Mark Fasheh
Hi Josef,

On Tue, Apr 26, 2016 at 10:24:45AM -0400, Josef Bacik wrote:
> The new qgroup stuff needs the quota accounting to be run before doing the
> inherit, unfortunately they need the commit root switch to happen at a 
> specific
> time for this to work properly.  Fix this by delaying the inherit until after 
> we
> do the qgroup accounting, and remove the inherit and accounting dance in
> create_pending_snapshot.  Thanks,

Thanks for the patch. Unfortunately, this doesn't pass the xfstest case I
wrote for this bug:

http://www.spinics.net/lists/linux-btrfs/msg54403.html


I will also attach the patch to the bottom of this e-mail to make life
easier for you :)

But basically I get a difference of 16k in the qgroups. My trivial test
checks out (just make a couple of snapshots) so my guess is that we're
missing some metadata accounting.


Counts for qgroup id: 5 are different
our:referenced 672481280 referenced compressed 672481280
disk:   referenced 672481280 referenced compressed 672481280
our:exclusive 49152 exclusive compressed 49152
disk:   exclusive 16384 exclusive compressed 16384
diff:   exclusive 32768 exclusive compressed 32768
Counts for qgroup id: 260 are different
our:referenced 672481280 referenced compressed 672481280
disk:   referenced 672481280 referenced compressed 672481280
our:exclusive 32768 exclusive compressed 32768
disk:   exclusive 16384 exclusive compressed 16384
diff:   exclusive 16384 exclusive compressed 16384
Counts for qgroup id: 261 are different
our:referenced 672481280 referenced compressed 672481280
disk:   referenced 672481280 referenced compressed 672481280
our:exclusive 32768 exclusive compressed 32768
disk:   exclusive 16384 exclusive compressed 16384
diff:   exclusive 16384 exclusive compressed 16384
    --Mark

--
Mark Fasheh


From: Mark Fasheh <mfas...@suse.de>

[PATCH] btrfs: Test that qgroup counts are valid after snapshot creation

This has been broken since Linux v4.1. We may have worked out a solution on
the btrfs list but in the meantime sending a test to expose the issue seems
like a good idea.

Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 tests/btrfs/122 | 88 +
 tests/btrfs/122.out |  1 +
 tests/btrfs/group   |  1 +
 3 files changed, 90 insertions(+)
 create mode 100755 tests/btrfs/122
 create mode 100644 tests/btrfs/122.out

diff --git a/tests/btrfs/122 b/tests/btrfs/122
new file mode 100755
index 000..82252ab
--- /dev/null
+++ b/tests/btrfs/122
@@ -0,0 +1,88 @@
+#! /bin/bash
+# FS QA Test No. btrfs/122
+#
+# Test that qgroup counts are valid after snapshot creation. This has
+# been broken in btrfs since Linux v4.1
+#
+#---
+# Copyright (C) 2016 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# remove previous $seqres.full before test
+rm -f $seqres.full
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch
+
+rm -f $seqres.full
+
+# Force a small leaf size to make it easier to blow out our root
+# subvolume tree
+_scratch_mkfs "--nodesize 16384"
+_scratch_mount
+_run_btrfs_util_prog quota enable $SCRATCH_MNT
+
+mkdir "$SCRATCH_MNT/snaps"
+
+# First make some simple snapshots - the bug was initially reproduced like this
+_run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT 
"$SCRATCH_MNT/snaps/empty1"
+_run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT 
"$SCRATCH_MNT/snaps/empty2"
+
+# This forces the fs tree out past level 0, adding at least one tree
+# block which must be properly accounted for when we make our next
+# snapshots.
+mkdir "$SCRATCH_MNT/data"
+for i in `seq 0 640`; do
+   

Re: [PATCH] btrfs: Test that qgroup counts are valid after snapshot creation

2016-04-22 Thread Mark Fasheh
On Fri, Apr 22, 2016 at 08:26:33AM +0800, Qu Wenruo wrote:
> 
> 
> Mark Fasheh wrote on 2016/04/21 16:53 -0700:
> >Thank you for the review, comments are below.
> >
> >On Wed, Apr 20, 2016 at 09:48:54AM +0900, Satoru Takeuchi wrote:
> >>On 2016/04/20 7:25, Mark Fasheh wrote:
> >>>+# Force a small leaf size to make it easier to blow out our root
> >>>+# subvolume tree
> >>>+_scratch_mkfs "--nodesize 16384"
> >>
> >>nodesize 16384 is the default value. Do you
> >>intend other value, for example 4096?
> >
> >"future proofing" I suppose - if we up the default, the for loop below may
> >not create a level 1 tree.
> >
> >If we force it smaller than 16K I believe that may mean we can't run this
> >test on some kernels with page size larger than the typical 4k.
> > --Mark
> >
> >
> >--
> >Mark Fasheh
> >
> >
> 
> Sorry for the late reply.
> 
> Unfortunately, for system with 64K page size, it will fail(mount and
> mkfs) if we use 16K nodesize.
> 
> IIRC, like some other btrfs qgroup test case, we use 64K nodesize as
> the safest nodesize.
> 
> And for level 1 tree create, the idea is to use inline file extents
> to rapidly create level 1 tree.
> 
> 16 4K files should create a level 1 tree.
> Although in this case, max_inline=4096 would be added to mount
> option though.

That all sounds good, thanks. The only thing about filling it completely
with inline extents though is that we should be exercising qgroups a little
harder. But maybe we can blow out the tree with inline extents and then add
some actual data extents after that.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4] btrfs: qgroup: Fix qgroup accounting when creating snapshot

2016-04-22 Thread Mark Fasheh
On Fri, Apr 22, 2016 at 02:23:59PM -0400, Josef Bacik wrote:
> On 04/22/2016 02:21 PM, Mark Fasheh wrote:
> >On Fri, Apr 22, 2016 at 02:12:11PM -0400, Josef Bacik wrote:
> >>On 04/15/2016 05:08 AM, Qu Wenruo wrote:
> >>>+  /*
> >>>+   * Force parent root to be updated, as we recorded it before so its
> >>>+   * last_trans == cur_transid.
> >>>+   * Or it won't be committed again onto disk after later
> >>>+   * insert_dir_item()
> >>>+   */
> >>>+  if (!ret)
> >>>+  record_root_in_trans(trans, parent, 1);
> >>>+  return ret;
> >>>+}
> >>
> >>NACK, holy shit we aren't adding a special transaction commit only
> >>for qgroup snapshots.  Figure out a different way.  Thanks,
> >
> >Yeah I saw that. To be fair, we run a whole lot of the transaction stuff
> >multiple times (at least from my reading) so I'm really unclear on what the
> >performance impact is.
> >
> >Do you have any suggestion though? We've been banging our heads against this
> >for a while now and as slow as this patch might be, it actually works where
> >nothing else has so far.
> 
> I'm less concerned about committing another transaction and more
> concerned about the fact that it is an special variant of the
> transaction commit.  If this goes wrong, or at some point in the
> future we fail to update it along with btrfs_transaction_commit we
> suddenly are corrupting metadata.  If we have to commit a
> transaction then call btrfs_commit_transaction(), don't open code a
> stripped down version, here be dragons.  Thanks,

Ok yeah that makes perfect sense - I thought you were telling me that this
would be a big performance problem.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4] btrfs: qgroup: Fix qgroup accounting when creating snapshot

2016-04-22 Thread Mark Fasheh
On Fri, Apr 22, 2016 at 02:12:11PM -0400, Josef Bacik wrote:
> On 04/15/2016 05:08 AM, Qu Wenruo wrote:
> >+/*
> >+ * Force parent root to be updated, as we recorded it before so its
> >+ * last_trans == cur_transid.
> >+ * Or it won't be committed again onto disk after later
> >+ * insert_dir_item()
> >+ */
> >+if (!ret)
> >+record_root_in_trans(trans, parent, 1);
> >+return ret;
> >+}
> 
> NACK, holy shit we aren't adding a special transaction commit only
> for qgroup snapshots.  Figure out a different way.  Thanks,

Yeah I saw that. To be fair, we run a whole lot of the transaction stuff
multiple times (at least from my reading) so I'm really unclear on what the
performance impact is.

Do you have any suggestion though? We've been banging our heads against this
for a while now and as slow as this patch might be, it actually works where
nothing else has so far.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: About fi du and reflink/dedupe

2016-04-22 Thread Mark Fasheh
On Fri, Apr 22, 2016 at 10:57:29AM +0800, Qu Wenruo wrote:
> Hi Mark,
> 
> Thanks for your contribution to btrfs-filesystem-du command.
> 
> However there seems to be some strange behavior related to
> reflinke(and further in-band dedupe).
> (And the root cause is lying quite deep into kernel backref resolving codes)
> 
> ["Exclusive" value not really exclsuive]
> When a file with 2 file extents, and the 2nd file extent points to
> the 1st one, the fi du gives wrong answer
> 
> The following command can create such file easily.
> 
> # mkfs.btrfs -f /dev/sdb5
> # mount /dev/sdb5 /mnt/test
> # xfs_io -f -c "pwrite 0 128K" /mnt/test/tmp
> # xfs_io -c "reflink /mnt/test/tmp 0 128K 128K" /mnt/test/tmp
> # btrfs fi du /mnt/test
>  Total   Exclusive  Set shared  Filename
>  256.00KiB   256.00KiB   -  /mnt/test//tmp
>  256.00KiB   256.00KiB   0.00B  /mnt/test/
> 
> Total seems to be OK, while I am confused of the exclusive value.
> 
> As the above method will only create one real data extent, which
> takes 128K, and if following the qgroup definition, its exclusive
> should be 128K other than 256K.

Ok that's a bug in how we're counting these. We already record extent start
offsets so it's easy enough to see when we have the same extent in a file
while we fiemap it. Thanks for reporting this I'll take a look at a fix.


> And what's more, if we modify btrfs_check_shared() to return SHARED
> flag for such case, we will get 0 exclusive value for it.
> Which is quite strang. (I assume the exclusive should be 128K)
> 
> [Slow btrfs_check_shared() performance]
> In above case, btrfs fi du returns very fast.
> But when the file is in-band deduped and size goes to 1G.
> btrfs_check_shared() will take a lot of time to return, as it will
> do backref walk through.
> 
> This would be a super huge problem for inband dedupe.
> 
> 
> [Possible solution]
> Would you please consider to judge shared extent in user space?
> And don't rely on the SHARED flag from fiemap.

_Absolutely Not_

We don't ask userspace to modify their applications if there's a performance
problem in fiemap, we fix the performance problem in fiemap. Off the top of
my head I can think of at least TWO other applications which rely on fiemap
heavily. You will have very little luck in asking them to modify their
applications.

If btrfs fiemap is broken, we fix that full stop.

More specifically, if in-band dedupe is causing fiemap to go out to lunch
'for a year', we need to address the core problem in in-band dedupe. If it's
a general problem in btrfs fiemap, then we need to track it down before users
start yelling at us.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2] btrfs: Test that qgroup counts are valid after snapshot creation

2016-04-21 Thread Mark Fasheh
This has been broken since Linux v4.1. We may have worked out a solution on
the btrfs list but in the meantime sending a test to expose the issue seems
like a good idea.

Changes from v1-v2:
 - cleanups
 - added 122.out

Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 tests/btrfs/122 | 88 +
 tests/btrfs/122.out |  1 +
 tests/btrfs/group   |  1 +
 3 files changed, 90 insertions(+)
 create mode 100755 tests/btrfs/122
 create mode 100644 tests/btrfs/122.out

diff --git a/tests/btrfs/122 b/tests/btrfs/122
new file mode 100755
index 000..82252ab
--- /dev/null
+++ b/tests/btrfs/122
@@ -0,0 +1,88 @@
+#! /bin/bash
+# FS QA Test No. btrfs/122
+#
+# Test that qgroup counts are valid after snapshot creation. This has
+# been broken in btrfs since Linux v4.1
+#
+#---
+# Copyright (C) 2016 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# remove previous $seqres.full before test
+rm -f $seqres.full
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch
+
+rm -f $seqres.full
+
+# Force a small leaf size to make it easier to blow out our root
+# subvolume tree
+_scratch_mkfs "--nodesize 16384"
+_scratch_mount
+_run_btrfs_util_prog quota enable $SCRATCH_MNT
+
+mkdir "$SCRATCH_MNT/snaps"
+
+# First make some simple snapshots - the bug was initially reproduced like this
+_run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT 
"$SCRATCH_MNT/snaps/empty1"
+_run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT 
"$SCRATCH_MNT/snaps/empty2"
+
+# This forces the fs tree out past level 0, adding at least one tree
+# block which must be properly accounted for when we make our next
+# snapshots.
+mkdir "$SCRATCH_MNT/data"
+for i in `seq 0 640`; do
+   $XFS_IO_PROG -f -c "pwrite 0 1M" "$SCRATCH_MNT/data/file$i" > /dev/null 
2>&1
+done
+
+# Snapshot twice.
+_run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT "$SCRATCH_MNT/snaps/snap1"
+_run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT "$SCRATCH_MNT/snaps/snap2"
+
+_scratch_unmount
+
+# generate a qgroup report and look for inconsistent groups
+$BTRFS_UTIL_PROG check --qgroup-report $SCRATCH_DEV 2>&1 | \
+   grep -q -E "Counts for qgroup.*are different"
+if [ $? -ne 0 ]; then
+   status=0
+fi
+
+exit
diff --git a/tests/btrfs/122.out b/tests/btrfs/122.out
new file mode 100644
index 000..2b1890e
--- /dev/null
+++ b/tests/btrfs/122.out
@@ -0,0 +1 @@
+QA output created by 122
diff --git a/tests/btrfs/group b/tests/btrfs/group
index 9403daa..f7e8cff 100644
--- a/tests/btrfs/group
+++ b/tests/btrfs/group
@@ -122,3 +122,4 @@
 119 auto quick snapshot metadata qgroup
 120 auto quick snapshot metadata
 121 auto quick snapshot qgroup
+122 auto quick snapshot qgroup
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: Test that qgroup counts are valid after snapshot creation

2016-04-21 Thread Mark Fasheh
Thank you for the review, comments are below.

On Wed, Apr 20, 2016 at 09:48:54AM +0900, Satoru Takeuchi wrote:
> On 2016/04/20 7:25, Mark Fasheh wrote:
> >+# Force a small leaf size to make it easier to blow out our root
> >+# subvolume tree
> >+_scratch_mkfs "--nodesize 16384"
> 
> nodesize 16384 is the default value. Do you
> intend other value, for example 4096?

"future proofing" I suppose - if we up the default, the for loop below may
not create a level 1 tree.

If we force it smaller than 16K I believe that may mean we can't run this
test on some kernels with page size larger than the typical 4k.
--Mark


--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs: Test that qgroup counts are valid after snapshot creation

2016-04-19 Thread Mark Fasheh
This has been broken since Linux v4.1. We may have worked out a solution on
the btrfs list but in the meantime sending a test to expose the issue seems
like a good idea.

Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 tests/btrfs/122   | 88 +++
 tests/btrfs/group |  1 +
 2 files changed, 89 insertions(+)
 create mode 100755 tests/btrfs/122

diff --git a/tests/btrfs/122 b/tests/btrfs/122
new file mode 100755
index 000..b7e9e4b
--- /dev/null
+++ b/tests/btrfs/122
@@ -0,0 +1,88 @@
+#! /bin/bash
+# FS QA Test No. btrfs/122
+#
+# Test that qgroup counts are valid after snapshot creation. This has
+# been broken in btrfs since Linux v4.1
+#
+#---
+# Copyright (C) 2016 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# remove previous $seqres.full before test
+rm -f $seqres.full
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch
+
+rm -f $seqres.full
+
+# Force a small leaf size to make it easier to blow out our root
+# subvolume tree
+_scratch_mkfs "--nodesize 16384"
+_scratch_mount
+_run_btrfs_util_prog quota enable $SCRATCH_MNT
+
+mkdir "$SCRATCH_MNT/snaps"
+
+# First make some simple snapshots - the bug was initially reproduced like this
+_run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT 
"$SCRATCH_MNT/snaps/empty1"
+_run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT 
"$SCRATCH_MNT/snaps/empty2"
+
+# This forces the fs tree out past level 0, adding at least one tree
+# block which must be properly accounted for when we make our next
+# snapshots.
+mkdir "$SCRATCH_MNT/data"
+for i in `seq 0 640`; do
+$XFS_IO_PROG -f -c "pwrite 0 1M" "$SCRATCH_MNT/data/file$i" > /dev/null 
2>&1
+done;
+
+# Snapshot twice.
+_run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT "$SCRATCH_MNT/snaps/snap1"
+_run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT "$SCRATCH_MNT/snaps/snap2"
+
+_scratch_unmount
+
+# generate a qgroup report and look for inconsistent groups
+$BTRFS_UTIL_PROG check --qgroup-report $SCRATCH_DEV 2>&1 | \
+   grep -q -E "Counts for qgroup.*are different"
+if [ $? -ne 0 ]; then
+   status=0
+fi
+
+exit
diff --git a/tests/btrfs/group b/tests/btrfs/group
index 9403daa..f7e8cff 100644
--- a/tests/btrfs/group
+++ b/tests/btrfs/group
@@ -122,3 +122,4 @@
 119 auto quick snapshot metadata qgroup
 120 auto quick snapshot metadata
 121 auto quick snapshot qgroup
+122 auto quick snapshot qgroup
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4] btrfs: qgroup: Fix qgroup accounting when creating snapshot

2016-04-19 Thread Mark Fasheh
On Fri, Apr 15, 2016 at 05:08:22PM +0800, Qu Wenruo wrote:
> Current btrfs qgroup design implies a requirement that after calling
> btrfs_qgroup_account_extents() there must be a commit root switch.
> 
> Normally this is OK, as btrfs_qgroup_accounting_extents() is only called
> inside btrfs_commit_transaction() just be commit_cowonly_roots().
> 
> However there is a exception at create_pending_snapshot(), which will
> call btrfs_qgroup_account_extents() but no any commit root switch.
> 
> In case of creating a snapshot whose parent root is itself (create a
> snapshot of fs tree), it will corrupt qgroup by the following trace:
> (skipped unrelated data)
> ==
> btrfs_qgroup_account_extent: bytenr = 29786112, num_bytes = 16384, 
> nr_old_roots = 0, nr_new_roots = 1
> qgroup_update_counters: qgid = 5, cur_old_count = 0, cur_new_count = 1, rfer 
> = 0, excl = 0
> qgroup_update_counters: qgid = 5, cur_old_count = 0, cur_new_count = 1, rfer 
> = 16384, excl = 16384
> btrfs_qgroup_account_extent: bytenr = 29786112, num_bytes = 16384, 
> nr_old_roots = 0, nr_new_roots = 0
> ==
> 
> The problem here is in first qgroup_account_extent(), the
> nr_new_roots of the extent is 1, which means its reference got
> increased, and qgroup increased its rfer and excl.
> 
> But at second qgroup_account_extent(), its reference got decreased, but
> between these two qgroup_account_extent(), there is no switch roots.
> This leads to the same nr_old_roots, and this extent just got ignored by
> qgroup, which means this extent is wrongly accounted.
> 
> Fix it by call commit_cowonly_roots() after qgroup_account_extent() in
> create_pending_snapshot(), with needed preparation.
> 
> Reported-by: Mark Fasheh <mfas...@suse.de>
> Signed-off-by: Qu Wenruo <quwen...@cn.fujitsu.com>

Ok, this version seems to be giving me the right numbers. I'll send a test
for it shortly. I'd still like to know if this patch introduces any
unintended side effects but otherwise, thanks Qu.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3] btrfs: qgroup: Fix qgroup accounting when creating snapshot

2016-04-15 Thread Mark Fasheh
On Fri, Apr 15, 2016 at 09:00:06AM +0800, Qu Wenruo wrote:
> 
> 
> Mark Fasheh wrote on 2016/04/14 14:42 -0700:
> >Hi Qu,
> >
> >On Thu, Apr 14, 2016 at 01:38:40PM +0800, Qu Wenruo wrote:
> >>Current btrfs qgroup design implies a requirement that after calling
> >>btrfs_qgroup_account_extents() there must be a commit root switch.
> >>
> >>Normally this is OK, as btrfs_qgroup_accounting_extents() is only called
> >>inside btrfs_commit_transaction() just be commit_cowonly_roots().
> >>
> >>However there is a exception at create_pending_snapshot(), which will
> >>call btrfs_qgroup_account_extents() but no any commit root switch.
> >>
> >>In case of creating a snapshot whose parent root is itself (create a
> >>snapshot of fs tree), it will corrupt qgroup by the following trace:
> >>(skipped unrelated data)
> >>==
> >>btrfs_qgroup_account_extent: bytenr = 29786112, num_bytes = 16384, 
> >>nr_old_roots = 0, nr_new_roots = 1
> >>qgroup_update_counters: qgid = 5, cur_old_count = 0, cur_new_count = 1, 
> >>rfer = 0, excl = 0
> >>qgroup_update_counters: qgid = 5, cur_old_count = 0, cur_new_count = 1, 
> >>rfer = 16384, excl = 16384
> >>btrfs_qgroup_account_extent: bytenr = 29786112, num_bytes = 16384, 
> >>nr_old_roots = 0, nr_new_roots = 0
> >>==
> >>
> >>The problem here is in first qgroup_account_extent(), the
> >>nr_new_roots of the extent is 1, which means its reference got
> >>increased, and qgroup increased its rfer and excl.
> >>
> >>But at second qgroup_account_extent(), its reference got decreased, but
> >>between these two qgroup_account_extent(), there is no switch roots.
> >>This leads to the same nr_old_roots, and this extent just got ignored by
> >>qgroup, which means this extent is wrongly accounted.
> >>
> >>Fix it by call commit_cowonly_roots() after qgroup_account_extent() in
> >>create_pending_snapshot(), with needed preparation.
> >>
> >>Reported-by: Mark Fasheh <mfas...@suse.de>
> >
> >Can you please CC me on this patch when you send it out? FYI it's customary
> >to CC anyone listed here as well as significant reviewers of your patch
> >(such as Filipe).
> >
> >
> >>Signed-off-by: Qu Wenruo <quwen...@cn.fujitsu.com>
> >>---
> >>v2:
> >>   Fix a soft lockup caused by missing switch_commit_root() call.
> >>   Fix a warning caused by dirty-but-not-committed root.
> >
> >This version doesn't introduce any lockups that I encountered, thanks!
> >
> >
> >>v3:
> >>   Fix a difference behavior that btrfs qgroup will start accounting
> >>   dropped roots if we are creating snapshots.
> >>   Other than always account them in next transaction.
> >
> >This still corrupts the qgroup numbers if you do anything significant to the
> >source subvolume. For example, this script shows a 16K difference. My guess
> >is that we're missing accounting of some metadata somewhere?
> >
> >
> >#!/bin/bash
> >
> >DEV=/dev/vdb1
> >MNT=/btrfs
> >
> >mkfs.btrfs -f $DEV
> >mount -t btrfs $DEV $MNT
> >btrfs quota enable $MNT
> >mkdir "$MNT/snaps"
> >mkdir "$MNT/data"
> >echo "populate $MNT with some data"
> >for i in `seq -w 0 640`; do
> > dd if=/dev/zero of="$MNT/data/file$i" bs=1M count=1 >&/dev/null
> >done;
> >for i in `seq -w 0 1`; do
> > S="$MNT/snaps/snap$i"
> > echo "create snapshot $S"
> > btrfs su snap $MNT $S;
> >done;
> >btrfs qg show $MNT
> >
> >umount $MNT
> >btrfsck $DEV
> >
> >
> >Sample output:
> >
> >btrfs-progs v4.4+20160122
> >See http://btrfs.wiki.kernel.org for more information.
> >
> >Label:  (null)
> >UUID:   a0b648b1-7a23-4213-9bc3-db02b8520efe
> >Node size:  16384
> >Sector size:4096
> >Filesystem size:16.00GiB
> >Block group profiles:
> >   Data: single8.00MiB
> >   Metadata: DUP   1.01GiB
> >   System:   DUP  12.00MiB
> >SSD detected:   no
> >Incompat features:  extref, skinny-metadata
> >Number of devices:  1
> >Devices:
> >IDSIZE  PATH
> > 116.00GiB  /dev/vdb1
> >
> >populate /btrfs with some data
> >create snapshot /btrfs/snaps/snap0
> >Create 

[PATCH V2] btrfs: test snapshot create with invalid parent qgroup

2016-04-14 Thread Mark Fasheh
Test that an invalid parent qgroup does not cause snapshot create to   
force the FS readonly. 

In btrfs, create_pending_snapshot() will go readonly on _any_ error return
from 
btrfs_qgroup_inherit(). If qgroups are enabled, a user can crash their fs by
just making a snapshot and asking it to inherit from an invalid qgroup. 

This patch does exactly that test. If the FS goes readonly that will be 
reported and we will know that a regression was introduced. 

The btrfs fix this patch relates to can be found at the following url:  
http://thread.gmane.org/gmane.comp.file-systems.btrfs/54755 

Thanks, 
--Mark

Signed-off-by: Mark Fasheh <mfas...@suse.de>
Reviewed-by: Filipe Manana <fdman...@suse.com>

>From V1-V2:
- update test name
- don't manually unmount
- add this test to the snapshot group too

---
 tests/btrfs/121 | 68 +
 tests/btrfs/121.out |  2 ++
 tests/btrfs/group   |  1 +
 3 files changed, 71 insertions(+)
 create mode 100755 tests/btrfs/121
 create mode 100644 tests/btrfs/121.out

diff --git a/tests/btrfs/121 b/tests/btrfs/121
new file mode 100755
index 000..011c5a8
--- /dev/null
+++ b/tests/btrfs/121
@@ -0,0 +1,68 @@
+#! /bin/bash
+# FS QA Test No. btrfs/121
+#
+# Test that an invalid parent qgroup does not cause snapshot create to
+# force the FS readonly.
+#
+# This issue is fixed by the following btrfs patch:
+#  [PATCH] btrfs: handle non-fatal errors in btrfs_qgroup_inherit()
+#  http://thread.gmane.org/gmane.comp.file-systems.btrfs/54755
+#
+#---
+# Copyright (C) 2016 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# remove previous $seqres.full before test
+rm -f $seqres.full
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch
+
+rm -f $seqres.full
+
+_scratch_mkfs
+_scratch_mount
+_run_btrfs_util_prog quota enable $SCRATCH_MNT
+# The qgroup '1/10' does not exist and should be silently ignored
+_run_btrfs_util_prog subvolume snapshot -i 1/10 $SCRATCH_MNT $SCRATCH_MNT/snap1
+
+echo "Silence is golden"
+
+status=0
+exit
diff --git a/tests/btrfs/121.out b/tests/btrfs/121.out
new file mode 100644
index 000..b71250d
--- /dev/null
+++ b/tests/btrfs/121.out
@@ -0,0 +1,2 @@
+QA output created by 121
+Silence is golden
diff --git a/tests/btrfs/group b/tests/btrfs/group
index 13aa1e5..ef6c260 100644
--- a/tests/btrfs/group
+++ b/tests/btrfs/group
@@ -121,3 +121,4 @@
 118 auto quick snapshot metadata
 119 auto quick snapshot metadata qgroup
 120 auto quick snapshot metadata
+121 auto quick snapshot qgroup
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3] btrfs: qgroup: Fix qgroup accounting when creating snapshot

2016-04-14 Thread Mark Fasheh
Hi Qu,

On Thu, Apr 14, 2016 at 01:38:40PM +0800, Qu Wenruo wrote:
> Current btrfs qgroup design implies a requirement that after calling
> btrfs_qgroup_account_extents() there must be a commit root switch.
> 
> Normally this is OK, as btrfs_qgroup_accounting_extents() is only called
> inside btrfs_commit_transaction() just be commit_cowonly_roots().
> 
> However there is a exception at create_pending_snapshot(), which will
> call btrfs_qgroup_account_extents() but no any commit root switch.
> 
> In case of creating a snapshot whose parent root is itself (create a
> snapshot of fs tree), it will corrupt qgroup by the following trace:
> (skipped unrelated data)
> ==
> btrfs_qgroup_account_extent: bytenr = 29786112, num_bytes = 16384, 
> nr_old_roots = 0, nr_new_roots = 1
> qgroup_update_counters: qgid = 5, cur_old_count = 0, cur_new_count = 1, rfer 
> = 0, excl = 0
> qgroup_update_counters: qgid = 5, cur_old_count = 0, cur_new_count = 1, rfer 
> = 16384, excl = 16384
> btrfs_qgroup_account_extent: bytenr = 29786112, num_bytes = 16384, 
> nr_old_roots = 0, nr_new_roots = 0
> ==
> 
> The problem here is in first qgroup_account_extent(), the
> nr_new_roots of the extent is 1, which means its reference got
> increased, and qgroup increased its rfer and excl.
> 
> But at second qgroup_account_extent(), its reference got decreased, but
> between these two qgroup_account_extent(), there is no switch roots.
> This leads to the same nr_old_roots, and this extent just got ignored by
> qgroup, which means this extent is wrongly accounted.
> 
> Fix it by call commit_cowonly_roots() after qgroup_account_extent() in
> create_pending_snapshot(), with needed preparation.
> 
> Reported-by: Mark Fasheh <mfas...@suse.de>

Can you please CC me on this patch when you send it out? FYI it's customary
to CC anyone listed here as well as significant reviewers of your patch
(such as Filipe).


> Signed-off-by: Qu Wenruo <quwen...@cn.fujitsu.com>
> ---
> v2:
>   Fix a soft lockup caused by missing switch_commit_root() call.
>   Fix a warning caused by dirty-but-not-committed root.

This version doesn't introduce any lockups that I encountered, thanks!


> v3:
>   Fix a difference behavior that btrfs qgroup will start accounting
>   dropped roots if we are creating snapshots.
>   Other than always account them in next transaction.

This still corrupts the qgroup numbers if you do anything significant to the
source subvolume. For example, this script shows a 16K difference. My guess
is that we're missing accounting of some metadata somewhere?


#!/bin/bash

DEV=/dev/vdb1
MNT=/btrfs

mkfs.btrfs -f $DEV
mount -t btrfs $DEV $MNT
btrfs quota enable $MNT
mkdir "$MNT/snaps"
mkdir "$MNT/data"
echo "populate $MNT with some data"
for i in `seq -w 0 640`; do
dd if=/dev/zero of="$MNT/data/file$i" bs=1M count=1 >&/dev/null
done;
for i in `seq -w 0 1`; do
S="$MNT/snaps/snap$i"
echo "create snapshot $S"
btrfs su snap $MNT $S;
done;
btrfs qg show $MNT

umount $MNT
btrfsck $DEV


Sample output:

btrfs-progs v4.4+20160122
See http://btrfs.wiki.kernel.org for more information.

Label:  (null)
UUID:   a0b648b1-7a23-4213-9bc3-db02b8520efe
Node size:  16384
Sector size:4096
Filesystem size:16.00GiB
Block group profiles:
  Data: single8.00MiB
  Metadata: DUP   1.01GiB
  System:   DUP  12.00MiB
SSD detected:   no
Incompat features:  extref, skinny-metadata
Number of devices:  1
Devices:
   IDSIZE  PATH
116.00GiB  /dev/vdb1

populate /btrfs with some data
create snapshot /btrfs/snaps/snap0
Create a snapshot of '/btrfs' in '/btrfs/snaps/snap0'
create snapshot /btrfs/snaps/snap1
Create a snapshot of '/btrfs' in '/btrfs/snaps/snap1'
qgroupid rfer excl 
   
0/5 641.34MiB 16.00KiB 
0/258   641.34MiB 16.00KiB 
0/259   641.34MiB 16.00KiB 
Checking filesystem on /dev/vdb1
UUID: a0b648b1-7a23-4213-9bc3-db02b8520efe
checking extents
checking free space cache
checking fs roots
checking csums
checking root refs
checking quota groups
Counts for qgroup id: 5 are different
our:referenced 672497664 referenced compressed 672497664
disk:   referenced 672497664 referenced compressed 672497664
our:exclusive 49152 exclusive compressed 49152
disk:   exclusive 16384 exclusive compressed 16384
diff:   exclusive 32768 exclusive compressed 32768
found 673562626 bytes used err is 0
total csum bytes: 656384
total tree bytes: 1425408
total fs tree bytes: 442368
total extent tree bytes: 98304
btree space waste bytes: 385361
file data blocks allocated: 672661504
 referenced 672661504
e

Re: WARN_ON in record_root_in_trans() when deleting freshly renamed subvolume

2016-04-11 Thread Mark Fasheh
On Mon, Apr 11, 2016 at 09:05:47AM +0800, Qu Wenruo wrote:
> 
> 
> Mark Fasheh wrote on 2016/04/08 12:18 -0700:
> >On Fri, Apr 08, 2016 at 03:10:35PM +0200, Holger Hoffstätte wrote:
> >>[cc: Mark and Qu]
> >>
> >>On 04/08/16 13:51, Holger Hoffstätte wrote:
> >>>On 04/08/16 13:14, Filipe Manana wrote:
> >>>>Using Chris' for-linus-4.6 branch, which is 4.5-rc6 + all 4.6 btrfs
> >>>>patches, it didn't reproduce here:
> >>>
> >>>Great, that's good to know (sort of :). Thanks also to Liu Bo.
> >>>
> >>>>Are you sure that you are not using some patches not in 4.6?
> >>
> >>We have a bingo!
> >>
> >>Reverting "qgroup: Fix qgroup accounting when creating snapshot"
> >>from last Wednesday immediately fixes the problem.
> >
> >Not surprising, I had some issues testing it out too. I'm pretty sure this
> >patch is corrupting memory, I just haven't found where yet though my
> >educated guess is that the transaction is being reused improperly.
> > --Mark
> >
> >--
> >Mark Fasheh
> >
> >
> Still digging the bug Mark has reported about the patch.
> 
> Good to have another report, as I can't always reproduce the soft
> lockup from Mark.
> 
> It seems that the WARN_ON will bring another clue to fix it.
> 
> BTW, the memory corruption assumption seems to be quite helpful.
> I didn't consider in that way, but it seems to be the only reason
> causing dead spinlock while no other thread spinning and no lockdep
> warning.

It seems to be the call to commit_cowonly_roots() in your patch which sets
everything off. If I remove that call I can run all day without a crash.

Btw, I'm not convinced this fixes the qgroup numbers anyway - we are still
inconsistent even if I don't get a crash.

Have you tested that the actual numbers on your end are coming out ok?
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: WARN_ON in record_root_in_trans() when deleting freshly renamed subvolume

2016-04-08 Thread Mark Fasheh
On Fri, Apr 08, 2016 at 03:10:35PM +0200, Holger Hoffstätte wrote:
> [cc: Mark and Qu]
> 
> On 04/08/16 13:51, Holger Hoffstätte wrote:
> > On 04/08/16 13:14, Filipe Manana wrote:
> >> Using Chris' for-linus-4.6 branch, which is 4.5-rc6 + all 4.6 btrfs
> >> patches, it didn't reproduce here:
> > 
> > Great, that's good to know (sort of :). Thanks also to Liu Bo.
> > 
> >> Are you sure that you are not using some patches not in 4.6?
> 
> We have a bingo!
> 
> Reverting "qgroup: Fix qgroup accounting when creating snapshot"
> from last Wednesday immediately fixes the problem.

Not surprising, I had some issues testing it out too. I'm pretty sure this
patch is corrupting memory, I just haven't found where yet though my
educated guess is that the transaction is being reused improperly.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: qgroup: Fix qgroup accounting when creating snapshot

2016-04-07 Thread Mark Fasheh
On Thu, Apr 07, 2016 at 04:21:53PM +0800, Qu Wenruo wrote:
> I ran into one soft lockup with my patch only. So I assume it's not
> caused by your inherit patch though.
> But I didn't reproduce it once more. Not sure why.
> 
> What's the reproduce rate in your environment?

It happens every time for me. Just wait about 30 seconds or so (my guess is
to let a transaction commit kick in). Also I can force the issue to show up
if I unmount.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: qgroup: Fix qgroup accounting when creating snapshot

2016-04-06 Thread Mark Fasheh
btrfs quota enable /btrfs
mkdir /btrfs/snaps
echo "populate /btrfs/ with some data"
cp -a /usr/share /btrfs/  
btrfs qgroup create 1/0 /btrfs
for i in `seq -w 0 14`; do
S="/btrfs/snaps/snap$i"
echo "create and populate $S"
btrfs su snap -i 1/0 /btrfs/ $S;
cp -a /boot $S;
done;
for i in `seq -w 3 11 `; do
S="/btrfs/snaps/snap$i"
echo "remove snapshot $S"
btrfs su de $S
done;


This is on Linux 4.5 with my inherit fix and your patch applied. The script
I pasted above ran with no problems until I added your patch to my kernel so
my guess is it's not related to the btrfs_qgroup_inherit() patch.
Nonetheless, here's a link to it in case you want a 2nd look:

http://thread.gmane.org/gmane.comp.file-systems.btrfs/54755

Thanks,
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: test snapshot create with invalid parent qgroup

2016-04-06 Thread Mark Fasheh
On Wed, Apr 06, 2016 at 10:40:02PM +0100, Filipe Manana wrote:
> On Wed, Apr 6, 2016 at 10:30 PM, Mark Fasheh <mfas...@suse.de> wrote:
> > Test that an invalid parent qgroup does not cause snapshot create to
> > force the FS readonly.
> >
> > In btrfs, create_pending_snapshot() will go readonly on _any_ error return 
> > from
> > btrfs_qgroup_inherit(). If qgroups are enabled, a user can crash their fs by
> > just making a snapshot and asking it to inherit from an invalid qgroup.
> >
> > This patch does exactly that test. If the FS goes readonly that will be
> > reported and we will know that a regression was introduced.
> >
> > The btrfs fix this patch relates to can be found at the following url:
> > http://thread.gmane.org/gmane.comp.file-systems.btrfs/54755
> >
> > Thanks,
> > --Mark
> >
> > Signed-off-by: Mark Fasheh <mfas...@suse.de>
> 
> Reviewed-by: Filipe Manana <fdman...@suse.com>
> 
> Looks good, just a few minor notes below.

Thanks for the review Filipe. How's this look?
--Mark

--
Mark Fasheh


From: Mark Fasheh <mfas...@suse.de>

[PATCH] btrfs: test snapshot create with invalid parent qgroup

Signed-off-by: Mark Fasheh <mfas...@suse.de>
Reviewed-by: Filipe Manana <fdman...@suse.com>
---
 tests/btrfs/121 | 68 +
 tests/btrfs/121.out |  2 ++
 tests/btrfs/group   |  1 +
 3 files changed, 71 insertions(+)
 create mode 100755 tests/btrfs/121
 create mode 100644 tests/btrfs/121.out

diff --git a/tests/btrfs/121 b/tests/btrfs/121
new file mode 100755
index 000..011c5a8
--- /dev/null
+++ b/tests/btrfs/121
@@ -0,0 +1,68 @@
+#! /bin/bash
+# FS QA Test No. btrfs/121
+#
+# Test that an invalid parent qgroup does not cause snapshot create to
+# force the FS readonly.
+#
+# This issue is fixed by the following btrfs patch:
+#  [PATCH] btrfs: handle non-fatal errors in btrfs_qgroup_inherit()
+#  http://thread.gmane.org/gmane.comp.file-systems.btrfs/54755
+#
+#---
+# Copyright (C) 2016 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# remove previous $seqres.full before test
+rm -f $seqres.full
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch
+
+rm -f $seqres.full
+
+_scratch_mkfs
+_scratch_mount
+_run_btrfs_util_prog quota enable $SCRATCH_MNT
+# The qgroup '1/10' does not exist and should be silently ignored
+_run_btrfs_util_prog subvolume snapshot -i 1/10 $SCRATCH_MNT $SCRATCH_MNT/snap1
+
+echo "Silence is golden"
+
+status=0
+exit
diff --git a/tests/btrfs/121.out b/tests/btrfs/121.out
new file mode 100644
index 000..b71250d
--- /dev/null
+++ b/tests/btrfs/121.out
@@ -0,0 +1,2 @@
+QA output created by 121
+Silence is golden
diff --git a/tests/btrfs/group b/tests/btrfs/group
index 13aa1e5..ef6c260 100644
--- a/tests/btrfs/group
+++ b/tests/btrfs/group
@@ -121,3 +121,4 @@
 118 auto quick snapshot metadata
 119 auto quick snapshot metadata qgroup
 120 auto quick snapshot metadata
+121 auto quick snapshot qgroup
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: test snapshot create with invalid parent qgroup

2016-04-06 Thread Mark Fasheh
On Wed, Apr 06, 2016 at 02:30:34PM -0700, Mark Fasheh wrote:
> Test that an invalid parent qgroup does not cause snapshot create to
> force the FS readonly.
> 
> In btrfs, create_pending_snapshot() will go readonly on _any_ error return 
> from
> btrfs_qgroup_inherit(). If qgroups are enabled, a user can crash their fs by
> just making a snapshot and asking it to inherit from an invalid qgroup.
> 
> This patch does exactly that test. If the FS goes readonly that will be
> reported and we will know that a regression was introduced.
> 
> The btrfs fix this patch relates to can be found at the following url:
> http://thread.gmane.org/gmane.comp.file-systems.btrfs/54755
> 
> Thanks,
>   --Mark
> 
> Signed-off-by: Mark Fasheh <mfas...@suse.de>

Oops, wrong patch was attached. This is a version that actually works :)
--Mark

--
Mark Fasheh


[PATCH] btrfs: test snapshot create with invalid parent qgroup

Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 tests/btrfs/119 | 70 +
 tests/btrfs/119.out |  2 ++
 tests/btrfs/group   |  1 +
 3 files changed, 73 insertions(+)
 create mode 100755 tests/btrfs/119
 create mode 100644 tests/btrfs/119.out

diff --git a/tests/btrfs/119 b/tests/btrfs/119
new file mode 100755
index 000..a48d0ed
--- /dev/null
+++ b/tests/btrfs/119
@@ -0,0 +1,70 @@
+#! /bin/bash
+# FS QA Test No. btrfs/119
+#
+# Test that an invalid parent qgroup does not cause snapshot create to
+# force the FS readonly.
+#
+# This issue is fixed by the following btrfs patch:
+#  [PATCH] btrfs: handle non-fatal errors in btrfs_qgroup_inherit()
+#  http://thread.gmane.org/gmane.comp.file-systems.btrfs/54755
+#
+#---
+# Copyright (C) 2016 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# remove previous $seqres.full before test
+rm -f $seqres.full
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch
+
+rm -f $seqres.full
+
+_scratch_mkfs
+_scratch_mount
+_run_btrfs_util_prog quota enable $SCRATCH_MNT
+# The qgroup '1/10' does not exist and should be silently ignored
+_run_btrfs_util_prog subvolume snapshot -i 1/10 $SCRATCH_MNT $SCRATCH_MNT/snap1
+
+_scratch_unmount
+
+echo "Silence is golden"
+
+status=0
+exit
diff --git a/tests/btrfs/119.out b/tests/btrfs/119.out
new file mode 100644
index 000..e7b242e
--- /dev/null
+++ b/tests/btrfs/119.out
@@ -0,0 +1,2 @@
+QA output created by 119
+Silence is golden
diff --git a/tests/btrfs/group b/tests/btrfs/group
index a2fa412..19f4910 100644
--- a/tests/btrfs/group
+++ b/tests/btrfs/group
@@ -119,3 +119,4 @@
 116 auto quick metadata
 117 auto quick send clone
 118 auto quick snapshot metadata
+119 auto quick qgroup
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs: test snapshot create with invalid parent qgroup

2016-04-06 Thread Mark Fasheh
Test that an invalid parent qgroup does not cause snapshot create to
force the FS readonly.

In btrfs, create_pending_snapshot() will go readonly on _any_ error return from
btrfs_qgroup_inherit(). If qgroups are enabled, a user can crash their fs by
just making a snapshot and asking it to inherit from an invalid qgroup.

This patch does exactly that test. If the FS goes readonly that will be
reported and we will know that a regression was introduced.

The btrfs fix this patch relates to can be found at the following url:
http://thread.gmane.org/gmane.comp.file-systems.btrfs/54755

Thanks,
--Mark

Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 tests/btrfs/119 | 70 +
 tests/btrfs/119.out |  2 ++
 tests/btrfs/group   |  1 +
 3 files changed, 73 insertions(+)
 create mode 100755 tests/btrfs/119
 create mode 100644 tests/btrfs/119.out

diff --git a/tests/btrfs/119 b/tests/btrfs/119
new file mode 100755
index 000..1692160
--- /dev/null
+++ b/tests/btrfs/119
@@ -0,0 +1,70 @@
+#! /bin/bash
+# FS QA Test No. btrfs/120
+#
+# Test that an invalid parent qgroup does not cause snapshot create to
+# force the FS readonly.
+#
+# This issue is fixed by the following btrfs patch:
+#  [PATCH] btrfs: handle non-fatal errors in btrfs_qgroup_inherit()
+#  http://thread.gmane.org/gmane.comp.file-systems.btrfs/54755
+#
+#---
+# Copyright (C) 2016 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# remove previous $seqres.full before test
+rm -f $seqres.full
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch
+
+rm -f $seqres.full
+
+_scratch_mkfs
+_scratch_mount
+_run_btrfs_util_prog quota enable $SCRATCH_MNT
+# The qgroup '1/10' does not exist and should be silently ignored
+_run_btrfs_util_prog subvolume snapshot -i 1/10 $SCRATCH_MNT $SCRATCH_MNT/snap1
+
+_scratch_unmount
+
+echo "Silence is golden"
+
+status=0
+exit
diff --git a/tests/btrfs/119.out b/tests/btrfs/119.out
new file mode 100644
index 000..8e2ee9e
--- /dev/null
+++ b/tests/btrfs/119.out
@@ -0,0 +1,2 @@
+QA output created by 120
+Silence is golden
diff --git a/tests/btrfs/group b/tests/btrfs/group
index a2fa412..19f4910 100644
--- a/tests/btrfs/group
+++ b/tests/btrfs/group
@@ -119,3 +119,4 @@
 116 auto quick metadata
 117 auto quick send clone
 118 auto quick snapshot metadata
+119 auto quick qgroup
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: handle non-fatal errors in btrfs_qgroup_inherit()

2016-04-06 Thread Mark Fasheh
On Wed, Apr 06, 2016 at 10:22:57AM +0100, Filipe Manana wrote:
> Mark, did you forgot to submit a patch with the test case for fstests,
> or is there any other reason why you didn't do it?

No, I was just waiting to see how my fix did in review. I'll be shooting
that test over later today.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Qgroups wrong after snapshot create

2016-04-05 Thread Mark Fasheh
On Tue, Apr 05, 2016 at 03:16:54PM -0700, Mark Fasheh wrote:
> On Tue, Apr 05, 2016 at 09:27:01AM +0800, Qu Wenruo wrote:
> > Mark Fasheh wrote on 2016/04/04 16:06 -0700:
> > >Hi,
> > >
> > >Making a snapshot gets us the wrong qgroup numbers. This is very easy to
> > >reproduce. From a fresh btrfs filesystem, simply enable qgroups and create 
> > >a
> > >snapshot. In this example we have mounted a newly created fresh filesystem
> > >and mounted it at /btrfs:
> > >
> > ># btrfs quota enable /btrfs
> > ># btrfs sub sna /btrfs/ /btrfs/snap1
> > ># btrfs qg show /btrfs
> > >
> > >qgroupid rfer excl
> > >  
> > >0/5  32.00KiB 32.00KiB
> > >0/257    16.00KiB 16.00KiB
> > >
> > 
> > Also reproduced it.
> > 
> > My first idea is, old snapshot qgroup hack is involved.
> > 
> > Unlike btrfs_inc/dec_extent_ref(), snapshotting just use a dirty
> > hack to handle it:
> > Copy rfer from source subvolume, and directly set excl to nodesize.
> > 
> > If such work is before adding snapshot inode into src subvolume, it
> > may be the reason causing the bug.
> 
> Ok, thanks very much for looking into this Qu.
> 
> 
> > >In the example above, the default subvolume (0/5) should read 16KiB
> > >referenced and 16KiB exclusive.
> > >
> > >A rescan fixes things, so we know the rescan process is doing the math
> > >right:
> > >
> > ># btrfs quota rescan /btrfs
> > ># btrfs qgroup show /btrfs
> > >qgroupid rfer excl
> > >  
> > >0/5  16.00KiB 16.00KiB
> > >0/257    16.00KiB 16.00KiB
> > >
> > 
> > So the base of qgroup code is not affected, or we may need another
> > painful rework.
> 
> Yeah as far as I can tell the core algorithm is fine. We're just running the
> extents incorrectly somehow.

Btw, I should add - my biggest fear was an algorithm change which would have
made older versions of btrfsck incompatible. It seems though we can still
use it for checking qgroups.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Qgroups wrong after snapshot create

2016-04-05 Thread Mark Fasheh
On Tue, Apr 05, 2016 at 09:27:01AM +0800, Qu Wenruo wrote:
> Mark Fasheh wrote on 2016/04/04 16:06 -0700:
> >Hi,
> >
> >Making a snapshot gets us the wrong qgroup numbers. This is very easy to
> >reproduce. From a fresh btrfs filesystem, simply enable qgroups and create a
> >snapshot. In this example we have mounted a newly created fresh filesystem
> >and mounted it at /btrfs:
> >
> ># btrfs quota enable /btrfs
> ># btrfs sub sna /btrfs/ /btrfs/snap1
> ># btrfs qg show /btrfs
> >
> >qgroupid rfer excl
> >  
> >0/5  32.00KiB 32.00KiB
> >0/257    16.00KiB 16.00KiB
> >
> 
> Also reproduced it.
> 
> My first idea is, old snapshot qgroup hack is involved.
> 
> Unlike btrfs_inc/dec_extent_ref(), snapshotting just use a dirty
> hack to handle it:
> Copy rfer from source subvolume, and directly set excl to nodesize.
> 
> If such work is before adding snapshot inode into src subvolume, it
> may be the reason causing the bug.

Ok, thanks very much for looking into this Qu.


> >In the example above, the default subvolume (0/5) should read 16KiB
> >referenced and 16KiB exclusive.
> >
> >A rescan fixes things, so we know the rescan process is doing the math
> >right:
> >
> ># btrfs quota rescan /btrfs
> ># btrfs qgroup show /btrfs
> >qgroupid rfer excl
> >  
> >0/5  16.00KiB 16.00KiB
> >0/257    16.00KiB 16.00KiB
> >
> 
> So the base of qgroup code is not affected, or we may need another
> painful rework.

Yeah as far as I can tell the core algorithm is fine. We're just running the
extents incorrectly somehow.


> >#  _-=> irqs-off
> ># / _=> need-resched
> >#| / _---=> hardirq/softirq
> >#|| / _--=> preempt-depth
> >#||| / delay
> >#   TASK-PID   CPU#  TIMESTAMP  FUNCTION
> >#  | |   |      | |
> >btrfs-10233 [001]  260298.823339: 
> > btrfs_qgroup_account_extent: bytenr = 29360128, num_bytes = 16384, 
> > nr_old_roots = 1, nr_new_roots = 0
> >btrfs-10233 [001]  260298.823342: qgroup_update_counters: 
> > qgid = 5, cur_old_count = 1, cur_new_count = 0, rfer = 16384, excl = 16384
> >btrfs-10233 [001]  260298.823342: qgroup_update_counters: 
> > qgid = 5, cur_old_count = 1, cur_new_count = 0, rfer = 0, excl = 0
> >btrfs-10233 [001]  260298.823343: 
> > btrfs_qgroup_account_extent: bytenr = 29720576, num_bytes = 16384, 
> > nr_old_roots = 0, nr_new_roots = 0
> >btrfs-10233 [001]  260298.823345: 
> > btrfs_qgroup_account_extent: bytenr = 29736960, num_bytes = 16384, 
> > nr_old_roots = 0, nr_new_roots = 0
> >btrfs-10233 [001]  260298.823347: 
> > btrfs_qgroup_account_extent: bytenr = 29786112, num_bytes = 16384, 
> > nr_old_roots = 0, nr_new_roots = 1
> 
> Now, for extent 29786112, its nr_new_roots is 1.
> 
> >btrfs-10233 [001]  260298.823347: qgroup_update_counters: 
> > qgid = 5, cur_old_count = 0, cur_new_count = 1, rfer = 0, excl = 0
> >btrfs-10233 [001]  260298.823348: qgroup_update_counters: 
> > qgid = 5, cur_old_count = 0, cur_new_count = 1, rfer = 16384, excl = 16384
> >btrfs-10233 [001]  260298.823421: 
> > btrfs_qgroup_account_extent: bytenr = 29786112, num_bytes = 16384, 
> > nr_old_roots = 0, nr_new_roots = 0
> 
> Now the problem is here, nr_old_roots should be 1, not 0.
> Just as previous trace shows, we increased extent ref on that
> extent, but now it dropped back to 0.
> 
> Since its old_root == new_root == 0, qgroup code doesn't do anything on it.
> If its nr_old_roots is 1, qgroup will drop it's excl/rfer to 0, and
> then accounting may goes back to normal.

Ok, so we're fine with the numbers going to zero so long as it gets back to
where it should be. That also explains the 'strange' behavior I saw.


> >btrfs-10233 [001]  260298.823422: 
> > btrfs_qgroup_account_extent: bytenr = 29835264, num_bytes = 16384, 
> > nr_old_roots = 0, nr_new_roots = 0
> >btrfs-10233 [001]  260298.823425: 
> > btrfs_qgroup_account_extent: bytenr = 29851648, num_bytes = 16384, 
> > nr_old_roots = 0, nr_new_roots = 1
> >btrfs-10233 [001]  260298.823426: qgroup_update_counters: 
> > qgid = 5, cur_old_count = 0, cur_new

Qgroups wrong after snapshot create

2016-04-04 Thread Mark Fasheh
inting each qgroup twice in qgroup_adjust_counters (once
before, once after). So then we can see that extent 29851648 (len 16k)
is the extent being counted against qgroup 5 which makes the count invalid.

From a btrfs-debug-tree I get the following records referencing that extent:

From the root tree:
item 3 key (FS_TREE ROOT_ITEM 0) itemoff 14949 itemsize 439
root data bytenr 29851648 level 0 dirid 256 refs 1 gen 10 
lastsnap 10
uuid ----
ctransid 10 otransid 0 stransid 0 rtransid 0

From the extent tree:
item 9 key (29851648 METADATA_ITEM 0) itemoff 15960 itemsize 33
extent refs 1 gen 10 flags TREE_BLOCK
tree block skinny level 0
tree block backref root 5

And here is the block itself:

fs tree key (FS_TREE ROOT_ITEM 0) 
leaf 29851648 items 4 free space 15941 generation 10 owner 5
fs uuid f7e55c97-b0b3-44e5-bab1-1fd55d54409b
chunk uuid b78fe016-e35f-4f57-8211-796cbc9be3a4
item 0 key (256 INODE_ITEM 0) itemoff 16123 itemsize 160
inode generation 3 transid 10 size 10 nbytes 16384
block group 0 mode 40755 links 1 uid 0 gid 0
rdev 0 flags 0x0
item 1 key (256 INODE_REF 256) itemoff 16111 itemsize 12
inode ref index 0 namelen 2 name: ..
item 2 key (256 DIR_ITEM 3390559794) itemoff 16076 itemsize 35
location key (257 ROOT_ITEM -1) type DIR
namelen 5 datalen 0 name: snap2
item 3 key (256 DIR_INDEX 2) itemoff 16041 itemsize 35
location key (257 ROOT_ITEM -1) type DIR
namelen 5 datalen 0 name: snap2


So unless I'm mistaken, it seems like we're counting the original snapshot
root against itself when creating a snapshot.

I found this looking for what I believe to be a _different_ corruption in
qgroups. In the meantime while I track that one down though I was hoping
that someone might be able to shed some light on this particular issue.

Qu, do you have any ideas how we might fix this?

Thanks,
--Mark

PS: I have attached the output of btrfs-debug-tree for the FS used in this
example.

--
Mark Fasheh
root tree
leaf 29884416 items 17 free space 11820 generation 11 owner 1
fs uuid f7e55c97-b0b3-44e5-bab1-1fd55d54409b
chunk uuid b78fe016-e35f-4f57-8211-796cbc9be3a4
item 0 key (EXTENT_TREE ROOT_ITEM 0) itemoff 15844 itemsize 439
root data bytenr 29900800 level 0 dirid 0 refs 1 gen 11 
lastsnap 0
uuid ----
item 1 key (DEV_TREE ROOT_ITEM 0) itemoff 15405 itemsize 439
root data bytenr 29507584 level 0 dirid 0 refs 1 gen 6 lastsnap 0
uuid ----
item 2 key (FS_TREE INODE_REF 6) itemoff 15388 itemsize 17
inode ref index 0 namelen 7 name: default
item 3 key (FS_TREE ROOT_ITEM 0) itemoff 14949 itemsize 439
root data bytenr 29851648 level 0 dirid 256 refs 1 gen 10 
lastsnap 10
uuid ----
ctransid 10 otransid 0 stransid 0 rtransid 0
item 4 key (FS_TREE ROOT_REF 257) itemoff 14926 itemsize 23
root ref key dirid 256 sequence 2 name snap2
item 5 key (ROOT_TREE_DIR INODE_ITEM 0) itemoff 14766 itemsize 160
inode generation 3 transid 0 size 0 nbytes 16384
block group 0 mode 40755 links 1 uid 0 gid 0
rdev 0 flags 0x0
item 6 key (ROOT_TREE_DIR INODE_REF 6) itemoff 14754 itemsize 12
inode ref index 0 namelen 2 name: ..
item 7 key (ROOT_TREE_DIR DIR_ITEM 2378154706) itemoff 14717 itemsize 37
location key (FS_TREE ROOT_ITEM -1) type DIR
namelen 7 datalen 0 name: default
item 8 key (CSUM_TREE ROOT_ITEM 0) itemoff 14278 itemsize 439
root data bytenr 29933568 level 0 dirid 0 refs 1 gen 11 
lastsnap 0
uuid ----
item 9 key (QUOTA_TREE ROOT_ITEM 0) itemoff 13839 itemsize 439
root data bytenr 29917184 level 0 dirid 0 refs 1 gen 11 
lastsnap 0
uuid d66e47c6-9943-ae4e-9adb-6d97065f6358
item 10 key (UUID_TREE ROOT_ITEM 0) itemoff 13400 itemsize 439
root data bytenr 29802496 level 0 dirid 0 refs 1 gen 10 
lastsnap 0
uuid 4bded89b-be0f-ba46-becf-15604fcc58fc
item 11 key (256 INODE_ITEM 0) itemoff 13240 itemsize 160
inode generation 11 transid 11 size 262144 nbytes 1572864
block group 0 mode 100600 links 1 uid 0 gid 0
rdev 0 flags 0x1b
item 12 key (256 EXTENT_DATA 0) itemoff 13187 itemsize 53
extent data disk byte 12845056 nr 262144
extent data offset 0 nr 262144 ram 262144
extent co

[PATCH] btrfs: handle non-fatal errors in btrfs_qgroup_inherit()

2016-03-30 Thread Mark Fasheh
create_pending_snapshot() will go readonly on _any_ error return from
btrfs_qgroup_inherit(). If qgroups are enabled, a user can crash their fs by
just making a snapshot and asking it to inherit from an invalid qgroup. For
example:

$ btrfs sub snap -i 1/10 /btrfs/ /btrfs/foo

Will cause a transaction abort.

Fix this by only throwing errors in btrfs_qgroup_inherit() when we know
going readonly is acceptable.

The following xfstests test case reproduces this bug:

  seq=`basename $0`
  seqres=$RESULT_DIR/$seq
  echo "QA output created by $seq"

  here=`pwd`
  tmp=/tmp/$$
  status=1  # failure is the default!
  trap "_cleanup; exit \$status" 0 1 2 3 15

  _cleanup()
  {
cd /
rm -f $tmp.*
  }

  # get standard environment, filters and checks
  . ./common/rc
  . ./common/filter

  # remove previous $seqres.full before test
  rm -f $seqres.full

  # real QA test starts here
  _supported_fs btrfs
  _supported_os Linux
  _require_scratch

  rm -f $seqres.full

  _scratch_mkfs
  _scratch_mount
  _run_btrfs_util_prog quota enable $SCRATCH_MNT
  # The qgroup '1/10' does not exist and should be silently ignored
  _run_btrfs_util_prog subvolume snapshot -i 1/10 $SCRATCH_MNT 
$SCRATCH_MNT/snap1

  _scratch_unmount

  echo "Silence is golden"

  status=0
  exit

Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 fs/btrfs/qgroup.c | 54 --
 1 file changed, 32 insertions(+), 22 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 994dab0..9e11955 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1851,8 +1851,10 @@ out:
 }
 
 /*
- * copy the acounting information between qgroups. This is necessary when a
- * snapshot or a subvolume is created
+ * Copy the acounting information between qgroups. This is necessary
+ * when a snapshot or a subvolume is created. Throwing an error will
+ * cause a transaction abort so we take extra care here to only error
+ * when a readonly fs is a reasonable outcome.
  */
 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
@@ -1882,15 +1884,15 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle 
*trans,
   2 * inherit->num_excl_copies;
for (i = 0; i < nums; ++i) {
srcgroup = find_qgroup_rb(fs_info, *i_qgroups);
-   if (!srcgroup) {
-   ret = -EINVAL;
-   goto out;
-   }
 
-   if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) {
-   ret = -EINVAL;
-   goto out;
-   }
+   /*
+* Zero out invalid groups so we can ignore
+* them later.
+*/
+   if (!srcgroup ||
+   ((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
+   *i_qgroups = 0ULL;
+
++i_qgroups;
}
}
@@ -1925,17 +1927,19 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle 
*trans,
 */
if (inherit) {
i_qgroups = (u64 *)(inherit + 1);
-   for (i = 0; i < inherit->num_qgroups; ++i) {
+   for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
+   if (*i_qgroups == 0)
+   continue;
ret = add_qgroup_relation_item(trans, quota_root,
   objectid, *i_qgroups);
-   if (ret)
+   if (ret && ret != -EEXIST)
goto out;
ret = add_qgroup_relation_item(trans, quota_root,
   *i_qgroups, objectid);
-   if (ret)
+   if (ret && ret != -EEXIST)
goto out;
-   ++i_qgroups;
}
+   ret = 0;
}
 
 
@@ -1996,17 +2000,22 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle 
*trans,
 
i_qgroups = (u64 *)(inherit + 1);
for (i = 0; i < inherit->num_qgroups; ++i) {
-   ret = add_relation_rb(quota_root->fs_info, objectid,
- *i_qgroups);
-   if (ret)
-   goto unlock;
+   if (*i_qgroups) {
+   ret = add_relation_rb(quota_root->fs_info, objectid,
+ *i_qgroups);
+   if (ret)
+   goto unlock;
+   }
++i_qgroups;
}
 
-   

[RESEND][PATCH] btrfs: Add qgroup tracing

2016-03-29 Thread Mark Fasheh
This patch adds tracepoints to the qgroup code on both the reporting side
(insert_dirty_extents) and the accounting side. Taken together it allows us
to see what qgroup operations have happened, and what their result was.

Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 fs/btrfs/qgroup.c|  9 +
 include/trace/events/btrfs.h | 89 +++-
 2 files changed, 97 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 5279fda..994dab0 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1463,6 +1463,7 @@ struct btrfs_qgroup_extent_record
u64 bytenr = record->bytenr;
 
assert_spin_locked(_refs->lock);
+   trace_btrfs_qgroup_insert_dirty_extent(record);
 
while (*p) {
parent_node = *p;
@@ -1594,6 +1595,9 @@ static int qgroup_update_counters(struct btrfs_fs_info 
*fs_info,
cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
 
+   trace_qgroup_update_counters(qg->qgroupid, cur_old_count,
+cur_new_count);
+
/* Rfer update part */
if (cur_old_count == 0 && cur_new_count > 0) {
qg->rfer += num_bytes;
@@ -1683,6 +1687,9 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle 
*trans,
goto out_free;
BUG_ON(!fs_info->quota_root);
 
+   trace_btrfs_qgroup_account_extent(bytenr, num_bytes, nr_old_roots,
+ nr_new_roots);
+
qgroups = ulist_alloc(GFP_NOFS);
if (!qgroups) {
ret = -ENOMEM;
@@ -1752,6 +1759,8 @@ int btrfs_qgroup_account_extents(struct 
btrfs_trans_handle *trans,
record = rb_entry(node, struct btrfs_qgroup_extent_record,
  node);
 
+   trace_btrfs_qgroup_account_extents(record);
+
if (!ret) {
/*
 * Use (u64)-1 as time_seq to do special search, which
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index d866f21..467a4d2 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -23,7 +23,7 @@ struct map_lookup;
 struct extent_buffer;
 struct btrfs_work;
 struct __btrfs_workqueue;
-struct btrfs_qgroup_operation;
+struct btrfs_qgroup_extent_record;
 
 #define show_ref_type(type)\
__print_symbolic(type,  \
@@ -1231,6 +1231,93 @@ DEFINE_EVENT(btrfs__qgroup_delayed_ref, 
btrfs_qgroup_free_delayed_ref,
 
TP_ARGS(ref_root, reserved)
 );
+
+DECLARE_EVENT_CLASS(btrfs_qgroup_extent,
+   TP_PROTO(struct btrfs_qgroup_extent_record *rec),
+
+   TP_ARGS(rec),
+
+   TP_STRUCT__entry(
+   __field(u64,  bytenr)
+   __field(u64,  num_bytes )
+   ),
+
+   TP_fast_assign(
+   __entry->bytenr = rec->bytenr,
+   __entry->num_bytes  = rec->num_bytes;
+   ),
+
+   TP_printk("bytenr = %llu, num_bytes = %llu",
+ (unsigned long long)__entry->bytenr,
+ (unsigned long long)__entry->num_bytes)
+);
+
+DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_account_extents,
+
+   TP_PROTO(struct btrfs_qgroup_extent_record *rec),
+
+   TP_ARGS(rec)
+);
+
+DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_insert_dirty_extent,
+
+   TP_PROTO(struct btrfs_qgroup_extent_record *rec),
+
+   TP_ARGS(rec)
+);
+
+TRACE_EVENT(btrfs_qgroup_account_extent,
+
+   TP_PROTO(u64 bytenr, u64 num_bytes, u64 nr_old_roots, u64 nr_new_roots),
+
+   TP_ARGS(bytenr, num_bytes, nr_old_roots, nr_new_roots),
+
+   TP_STRUCT__entry(
+   __field(u64,  bytenr)
+   __field(u64,  num_bytes )
+   __field(u64,  nr_old_roots  )
+   __field(u64,  nr_new_roots  )
+   ),
+
+   TP_fast_assign(
+   __entry->bytenr = bytenr;
+   __entry->num_bytes  = num_bytes;
+   __entry->nr_old_roots   = nr_old_roots;
+   __entry->nr_new_roots   = nr_new_roots;
+   ),
+
+   TP_printk("bytenr = %llu, num_bytes = %llu, nr_old_roots = %llu, "
+ "nr_new_roots = %llu",
+ __entry->bytenr,
+ __entry->num_bytes,
+ __entry->nr_old_roots,
+ __entry->nr_new_roots)
+);
+
+TRACE_EVENT(qgroup_update_counters,
+
+   TP_PROTO(u64 qgid, u64 cur_old_count, u64 cur_new_count),
+
+   TP_ARGS(qgid, cur_old_count, cur_new_count),
+
+   TP_STRUCT__entr

Re: [PATCH] btrfs-progs: add 'du' command

2016-02-02 Thread Mark Fasheh
On Tue, Feb 02, 2016 at 12:43:45AM +0100, David Sterba wrote:
> Hi,
> 
> On Wed, Jan 20, 2016 at 01:49:24PM -0800, Mark Fasheh wrote:
> > A git tree of the patches can be found here:
> > 
> > https://github.com/markfasheh/btrfs-progs-patches/tree/du
> 
> what changed since the previous posting?

Nothing major, I rebased it and cleaned up the patches to fit the general
style of btrfs-progs to make things easier for you to merge.


> Otherwise code looks ok, I saw use of uint64_t, the units help can be
> extended to the full units set, and maybe other minor things that would
> be good to fix. I'll do a more fine grained review when applying the
> patches.

Awesome thanks for that, let me know if there's anything I can do to help
the process along.
--Mark


--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Regression in: [PATCH 4/4] btrfs: qgroup: account shared subtree during snapshot delete

2015-11-05 Thread Mark Fasheh
On Wed, Nov 04, 2015 at 09:01:36AM +0800, Qu Wenruo wrote:
> 
> 
> Mark Fasheh wrote on 2015/11/03 11:26 -0800:
> >On Mon, Nov 02, 2015 at 09:34:24AM +0800, Qu Wenruo wrote:
> >>
> >>
> >>Stefan Priebe wrote on 2015/11/01 21:49 +0100:
> >>>Hi,
> >>>
> >>>this one: http://www.spinics.net/lists/linux-btrfs/msg47377.html
> >>>
> >>>adds a regression to my test systems with very large disks (30tb and 50tb).
> >>>
> >>>btrfs balance is super slow afterwards while heavily making use of cp
> >>>--reflink=always on big files (200gb - 500gb).
> >>>
> >>>Sorry didn't know how to correctly reply to that "old" message.
> >>>
> >>>Greets,
> >>>Stefan
> >>>--
> >>>To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> >>>the body of a message to majord...@vger.kernel.org
> >>>More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >>
> >>Thanks for the testing.
> >>
> >>Are you using qgroup or just doing normal balance with qgroup disabled?
> >>
> >>For the latter case, that's should be optimized to skip the dirty
> >>extent insert in qgroup disabled case.
> >>
> >>For qgroup enabled case, I'm afraid that's the design.
> >>As relocation will drop a subtree to relocate, and to ensure qgroup
> >>consistent, we must walk down all the tree blocks and mark them
> >>dirty for later qgroup accounting.
> >
> >Qu, we're always going to have to walk the tree when deleting it, this is
> >part of removing a subvolume. We've walked shared subtrees in this code for
> >numerous kernel releases without incident before it was removed in 4.2.
> >
> >Do you have any actual evidence that this is a major performance regression?
> > From our previous conversations you seemed convinced of this, without even
> >having a working subtree walk to test. I remember the hand wringing
> >about an individual commit being too heavy with the qgroup code (even though
> >I pointed out that tree walk is a restartable transaction).
> >
> >It seems that you are confused still about how we handle removing a volume
> >wrt qgroups.
> >
> >If you have questions or concerns I would be happy to explain them but
> >IMHO your statements there are opinion and not based in fact.
> 
> Yes, I don't deny it.
> But it's quite hard to prove it, as we need such a huge storage like Stefan.
> What I have is only several hundred GB test storage.
> Even accounting all my home NAS, I only have 2T, far from the
> storage Stefan has.
> 
> And what Stefan report should already give some hint about the
> performance issue.
> 
> In your word "it won't be doing anything (ok some kmalloc/free of a
> very tiny object)", it's already slowing down balance, since balance
> also use btrfs_drop_subtree().

When I wrote that I was under the impression that the qgroup code was doing
it's own sanity checking (it used to) and since Stefan had them disabled
they couldn't be causing the problem. I read your e-mail explaining that the
qgroup api was now intertwined with delayed ref locking after this one.

The same exact code ran in either case before and after your patches, so my
guess is that the issue is actually inside the qgroup code that shouldn't
have been run. I wonder if we even just filled up his memory but never
cleaned the objects. The only other thing I can think of is if
account_leaf_items() got run in a really tight loop for some reason.

Kmalloc in the way we are using it is not usually a performance issue,
especially if we've been reading off disk in the same process. Ask yourself
this - your own patch series does the same kmalloc for every qgroup
operation. Did you notice a complete and massive performance slowdown like
the one Stefan reported?

I will say that we never had this problem reported before, and
account_leaf_items() is always run in all kernels, even without qgroups
enabled. That will change with my new patch though.

What we can say for sure is that drop_snapshot in the qgroup case will read
more disk and obviously that will have a negative impact depending on what
the tree looks like. So IMHO we ought to be focusing on reducing the amount
of I/O involved.


> But we can't just ignore such "possible" performance issue just
> because old code did the same thing.(Although not the same now,
> we're marking all subtree blocks dirty other than shared one).

Well, I can't disagree with that - the only reason we are talking right now
is because you intentionally ignored the qgroup code in drop_snapshot(). So
let's start with this - no more 'fixing' code by tearing it out and replacing
it with /* TODO: somebody else re-implement this */   ;)
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/3] btrfs: qgroup: account shared subtree during snapshot delete

2015-11-05 Thread Mark Fasheh
Commit 0ed4792 ('btrfs: qgroup: Switch to new extent-oriented qgroup
mechanism.') removed our qgroup accounting during
btrfs_drop_snapshot(). Predictably, this results in qgroup numbers
going bad shortly after a snapshot is removed.

Fix this by adding a dirty extent record when we encounter extents during
our shared subtree walk. This effectively restores the functionality we had
with the original shared subtree walking code in 1152651 (btrfs: qgroup:
account shared subtrees during snapshot delete).

The idea with the original patch (and this one) is that shared subtrees can
get skipped during drop_snapshot. The shared subtree walk then allows us a
chance to visit those extents and add them to the qgroup work for later
processing. This ultimately makes the accounting for drop snapshot work.

The new qgroup code nicely handles all the other extents during the tree
walk via the ref dec/inc functions so we don't have to add actions beyond
what we had originally.

Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 fs/btrfs/extent-tree.c | 47 ---
 fs/btrfs/qgroup.c  |  2 ++
 2 files changed, 42 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 601d7d4..410b46d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7850,21 +7850,47 @@ reada:
 }
 
 /*
- * TODO: Modify related function to add related node/leaf to dirty_extent_root,
- * for later qgroup accounting.
- *
- * Current, this function does nothing.
+ * These may not be seen by the usual inc/dec ref code so we have to
+ * add them here.
  */
+static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
+struct btrfs_root *root, u64 bytenr,
+u64 num_bytes)
+{
+   struct btrfs_qgroup_extent_record *qrecord;
+   struct btrfs_delayed_ref_root *delayed_refs;
+
+   qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS);
+   if (!qrecord)
+   return -ENOMEM;
+
+   qrecord->bytenr = bytenr;
+   qrecord->num_bytes = num_bytes;
+   qrecord->old_roots = NULL;
+
+   delayed_refs = >transaction->delayed_refs;
+   spin_lock(_refs->lock);
+   if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord))
+   kfree(qrecord);
+   spin_unlock(_refs->lock);
+
+   return 0;
+}
+
 static int account_leaf_items(struct btrfs_trans_handle *trans,
  struct btrfs_root *root,
  struct extent_buffer *eb)
 {
int nr = btrfs_header_nritems(eb);
-   int i, extent_type;
+   int i, extent_type, ret;
struct btrfs_key key;
struct btrfs_file_extent_item *fi;
u64 bytenr, num_bytes;
 
+   /* We can be called directly from walk_up_proc() */
+   if (!root->fs_info->quota_enabled)
+   return 0;
+
for (i = 0; i < nr; i++) {
btrfs_item_key_to_cpu(eb, , i);
 
@@ -7883,6 +7909,10 @@ static int account_leaf_items(struct btrfs_trans_handle 
*trans,
continue;
 
num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
+
+   ret = record_one_subtree_extent(trans, root, bytenr, num_bytes);
+   if (ret)
+   return ret;
}
return 0;
 }
@@ -7951,8 +7981,6 @@ static int adjust_slots_upwards(struct btrfs_root *root,
 
 /*
  * root_eb is the subtree root and is locked before this function is called.
- * TODO: Modify this function to mark all (including complete shared node)
- * to dirty_extent_root to allow it get accounted in qgroup.
  */
 static int account_shared_subtree(struct btrfs_trans_handle *trans,
  struct btrfs_root *root,
@@ -8030,6 +8058,11 @@ walk_down:
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+
+   ret = record_one_subtree_extent(trans, root, 
child_bytenr,
+   root->nodesize);
+   if (ret)
+   goto out;
}
 
if (level == 0) {
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index b068209..ce1cdcf 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1461,6 +1461,8 @@ struct btrfs_qgroup_extent_record
struct btrfs_qgroup_extent_record *entry;
u64 bytenr = record->bytenr;
 
+   assert_spin_locked(_refs->lock);
+
trace_btrfs_qgroup_insert_dirty_extent(record);
 
while (*p) {
-- 
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/3] btrfs: Add qgroup tracing

2015-11-05 Thread Mark Fasheh
This patch adds tracepoints to the qgroup code on both the reporting side
(insert_dirty_extents) and the accounting side. Taken together it allows us
to see what qgroup operations have happened, and what their result was.

Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 fs/btrfs/qgroup.c| 10 +
 include/trace/events/btrfs.h | 88 +++-
 2 files changed, 97 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index d904ee1..b068209 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1461,6 +1461,8 @@ struct btrfs_qgroup_extent_record
struct btrfs_qgroup_extent_record *entry;
u64 bytenr = record->bytenr;
 
+   trace_btrfs_qgroup_insert_dirty_extent(record);
+
while (*p) {
parent_node = *p;
entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
@@ -1591,6 +1593,9 @@ static int qgroup_update_counters(struct btrfs_fs_info 
*fs_info,
cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
 
+   trace_qgroup_update_counters(qg->qgroupid, cur_old_count,
+cur_new_count);
+
/* Rfer update part */
if (cur_old_count == 0 && cur_new_count > 0) {
qg->rfer += num_bytes;
@@ -1684,6 +1689,9 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle 
*trans,
goto out_free;
BUG_ON(!fs_info->quota_root);
 
+   trace_btrfs_qgroup_account_extent(bytenr, num_bytes, nr_old_roots,
+ nr_new_roots);
+
qgroups = ulist_alloc(GFP_NOFS);
if (!qgroups) {
ret = -ENOMEM;
@@ -1753,6 +1761,8 @@ int btrfs_qgroup_account_extents(struct 
btrfs_trans_handle *trans,
record = rb_entry(node, struct btrfs_qgroup_extent_record,
  node);
 
+   trace_btrfs_qgroup_account_extents(record);
+
if (!ret) {
/*
 * Use (u64)-1 as time_seq to do special search, which
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 0b73af9..9d7b545 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -23,7 +23,7 @@ struct map_lookup;
 struct extent_buffer;
 struct btrfs_work;
 struct __btrfs_workqueue;
-struct btrfs_qgroup_operation;
+struct btrfs_qgroup_extent_record;
 
 #define show_ref_type(type)\
__print_symbolic(type,  \
@@ -1117,6 +1117,92 @@ DEFINE_EVENT(btrfs__workqueue_done, 
btrfs_workqueue_destroy,
TP_ARGS(wq)
 );
 
+DECLARE_EVENT_CLASS(btrfs_qgroup_extent,
+   TP_PROTO(struct btrfs_qgroup_extent_record *rec),
+
+   TP_ARGS(rec),
+
+   TP_STRUCT__entry(
+   __field(u64,  bytenr)
+   __field(u64,  num_bytes )
+   ),
+
+   TP_fast_assign(
+   __entry->bytenr = rec->bytenr,
+   __entry->num_bytes  = rec->num_bytes;
+   ),
+
+   TP_printk("bytenr = %llu, num_bytes = %llu",
+ (unsigned long long)__entry->bytenr,
+ (unsigned long long)__entry->num_bytes)
+);
+
+DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_account_extents,
+
+   TP_PROTO(struct btrfs_qgroup_extent_record *rec),
+
+   TP_ARGS(rec)
+);
+
+DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_insert_dirty_extent,
+
+   TP_PROTO(struct btrfs_qgroup_extent_record *rec),
+
+   TP_ARGS(rec)
+);
+
+TRACE_EVENT(btrfs_qgroup_account_extent,
+
+   TP_PROTO(u64 bytenr, u64 num_bytes, u64 nr_old_roots, u64 nr_new_roots),
+
+   TP_ARGS(bytenr, num_bytes, nr_old_roots, nr_new_roots),
+
+   TP_STRUCT__entry(
+   __field(u64,  bytenr)
+   __field(u64,  num_bytes )
+   __field(u64,  nr_old_roots  )
+   __field(u64,  nr_new_roots  )
+   ),
+
+   TP_fast_assign(
+   __entry->bytenr = bytenr;
+   __entry->num_bytes  = num_bytes;
+   __entry->nr_old_roots   = nr_old_roots;
+   __entry->nr_new_roots   = nr_new_roots;
+   ),
+
+   TP_printk("bytenr = %llu, num_bytes = %llu, nr_old_roots = %llu, "
+ "nr_new_roots = %llu",
+ __entry->bytenr,
+ __entry->num_bytes,
+ __entry->nr_old_roots,
+ __entry->nr_new_roots)
+);
+
+TRACE_EVENT(qgroup_update_counters,
+
+   TP_PROTO(u64 qgid, u64 cur_old_count, u64 cur_new_count),
+
+   TP_ARGS(qgid, 

[PATCH 0/3] btrfs: update qgroups in drop snapshot, V2

2015-11-05 Thread Mark Fasheh
Hi,

The following 3 patches fix a regression introduced in Linux
4.2 where btrfs_drop_snapshot() wasn't updating qgroups, resulting in
them going bad.

The original e-mail pointing this out is below:

http://www.spinics.net/lists/linux-btrfs/msg46093.html

The first patch is from Josef and fix bugs in our counting of
roots (which is critical for qgroups to work correctly). It was previously
sent to the list:

http://www.spinics.net/lists/linux-btrfs/msg47035.html

Truth be told, most of the time fixing this was spent figuring out that and
another issue (which has also been fixed).  Once I realized I was seeing a
bug and we fixed it correctly, my drop snapshot patch got dramatically
smaller.

I also re-added some of the tracing in qgroup.c that we recently
lost. It is again possible to debug qgroup operations on a live
system, allowing us to find issues like the two above by narrowing
down our operations and manually walking through them via
cat sys/debug/tracing.

The entire patch series can be tested in xfstests test btrfs/104.

Thanks,
--Mark

Changes from V1, thanks to Qu for his comments:
  - lock around call to btrfs_qgroup_insert_dirty_extent()
  - check whether qgroups are enabled in account_leaf_items()
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] Btrfs: use btrfs_get_fs_root in resolve_indirect_ref

2015-11-05 Thread Mark Fasheh
From: Josef Bacik <jba...@fb.com>

The backref code will look up the fs_root we're trying to resolve our indirect
refs for, unfortunately we use btrfs_read_fs_root_no_name, which returns -ENOENT
if the ref is 0.  This isn't helpful for the qgroup stuff with snapshot delete
as it won't be able to search down the snapshot we are deleting, which will
cause us to miss roots.  So use btrfs_get_fs_root and send false for check_ref
so we can always get the root we're looking for.  Thanks,

Signed-off-by: Josef Bacik <jba...@fb.com>
Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 fs/btrfs/backref.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 9a2ec79..0e9da72 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -355,7 +355,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info 
*fs_info,
 
index = srcu_read_lock(_info->subvol_srcu);
 
-   root = btrfs_read_fs_root_no_name(fs_info, _key);
+   root = btrfs_get_fs_root(fs_info, _key, false);
if (IS_ERR(root)) {
srcu_read_unlock(_info->subvol_srcu, index);
ret = PTR_ERR(root);
-- 
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Regression in: [PATCH 4/4] btrfs: qgroup: account shared subtree during snapshot delete

2015-11-05 Thread Mark Fasheh
On Fri, Nov 06, 2015 at 09:02:13AM +0800, Qu Wenruo wrote:
> >The same exact code ran in either case before and after your patches, so my
> >guess is that the issue is actually inside the qgroup code that shouldn't
> >have been run. I wonder if we even just filled up his memory but never
> >cleaned the objects. The only other thing I can think of is if
> >account_leaf_items() got run in a really tight loop for some reason.
> >
> >Kmalloc in the way we are using it is not usually a performance issue,
> >especially if we've been reading off disk in the same process. Ask yourself
> >this - your own patch series does the same kmalloc for every qgroup
> >operation. Did you notice a complete and massive performance slowdown like
> >the one Stefan reported?
> 
> You're right, such memory allocation may impact performance but not
> so noticeable, compared to other operations which may kick disk IO,
> like btrfs_find_all_roots().
> 
> But at least, enabling qgroup will impact performance.
> 
> Yeah, this time I has test data now.
> In a environment with 100 different snapshot, sysbench shows an
> overall performance drop about 5%, and in some case, up to 7%, with
> qgroup enabled.
> 
> Not sure about the kmalloc impact, maybe less than 1% or maybe 2~3%,
> but at least it's worthy trying to use kmem cache.

Ok cool, what'd you do to generate the snapshots? I can try a similar test
on one of my machines and see what I get. I'm not surprised that the
overhead is noticable, and I agree it's easy enough to try things like
replacing the allocation once we have a test going.

Thanks,
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Regression in: [PATCH 4/4] btrfs: qgroup: account shared subtree during snapshot delete

2015-11-03 Thread Mark Fasheh
On Mon, Nov 02, 2015 at 09:34:24AM +0800, Qu Wenruo wrote:
> 
> 
> Stefan Priebe wrote on 2015/11/01 21:49 +0100:
> >Hi,
> >
> >this one: http://www.spinics.net/lists/linux-btrfs/msg47377.html
> >
> >adds a regression to my test systems with very large disks (30tb and 50tb).
> >
> >btrfs balance is super slow afterwards while heavily making use of cp
> >--reflink=always on big files (200gb - 500gb).
> >
> >Sorry didn't know how to correctly reply to that "old" message.
> >
> >Greets,
> >Stefan
> >--
> >To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> >the body of a message to majord...@vger.kernel.org
> >More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> Thanks for the testing.
> 
> Are you using qgroup or just doing normal balance with qgroup disabled?
> 
> For the latter case, that's should be optimized to skip the dirty
> extent insert in qgroup disabled case.
> 
> For qgroup enabled case, I'm afraid that's the design.
> As relocation will drop a subtree to relocate, and to ensure qgroup
> consistent, we must walk down all the tree blocks and mark them
> dirty for later qgroup accounting.

Qu, we're always going to have to walk the tree when deleting it, this is
part of removing a subvolume. We've walked shared subtrees in this code for
numerous kernel releases without incident before it was removed in 4.2.

Do you have any actual evidence that this is a major performance regression?
From our previous conversations you seemed convinced of this, without even
having a working subtree walk to test. I remember the hand wringing
about an individual commit being too heavy with the qgroup code (even though
I pointed out that tree walk is a restartable transaction).

It seems that you are confused still about how we handle removing a volume
wrt qgroups.

If you have questions or concerns I would be happy to explain them but
IMHO your statements there are opinion and not based in fact.

Yes btw, we might have to do more work for the uncommon case of a
qgroup being referenced by higher level groups but that is clearly not
happening here (and honestly it's not a common case at all).
--Mark


--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Regression in: [PATCH 4/4] btrfs: qgroup: account shared subtree during snapshot delete

2015-11-03 Thread Mark Fasheh
On Mon, Nov 02, 2015 at 06:46:06AM +0100, Stefan Priebe wrote:
> Am 02.11.2015 um 02:34 schrieb Qu Wenruo:
> >
> >
> >Stefan Priebe wrote on 2015/11/01 21:49 +0100:
> >>Hi,
> >>
> >>this one: http://www.spinics.net/lists/linux-btrfs/msg47377.html
> >>
> >>adds a regression to my test systems with very large disks (30tb and
> >>50tb).
> >>
> >>btrfs balance is super slow afterwards while heavily making use of cp
> >>--reflink=always on big files (200gb - 500gb).
> >>
> >>Sorry didn't know how to correctly reply to that "old" message.
> >>
> >>Greets,
> >>Stefan
> >>--
> >>To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> >>the body of a message to majord...@vger.kernel.org
> >>More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >
> >Thanks for the testing.
> >
> >Are you using qgroup or just doing normal balance with qgroup disabled?
> 
> just doing normal balance with qgroup disabled.

Then that patch is very unlikely to be your actual problem as it won't be
doing anything (ok some kmalloc/free of a very tiny object) since qgroups
are disabled.

Also, btrfs had working subtree accounting in that code for the last N
releases (doing the same exact thing) and it only changed for the one
release that Qu's rework was in (which lazily tore it out).
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Regression in: [PATCH 4/4] btrfs: qgroup: account shared subtree during snapshot delete

2015-11-03 Thread Mark Fasheh
On Tue, Nov 03, 2015 at 08:42:33PM +0100, Stefan Priebe wrote:
> Sorry don't know much about the btrfs internals.
> 
> I just can reproduce this. Switching to a kernel with this patch and
> without. With it takes ages - without it's super fast. I prooved
> this several times by just rebooting to the other kernel.

That's fine, disregard my previous e-mail - I just saw the mail Qu sent me.
There's a problem in the code that the patch calls which is causing your
performance issues. I'll CC you when I put out a fix.

Thanks,
    --Mark


--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 4/4] btrfs: qgroup: account shared subtree during snapshot delete

2015-11-03 Thread Mark Fasheh
On Mon, Nov 02, 2015 at 09:59:01AM +0800, Qu Wenruo wrote:
> 
> 
> Mark Fasheh wrote on 2015/09/22 13:15 -0700:
> >Commit 0ed4792 ('btrfs: qgroup: Switch to new extent-oriented qgroup
> >mechanism.') removed our qgroup accounting during
> >btrfs_drop_snapshot(). Predictably, this results in qgroup numbers
> >going bad shortly after a snapshot is removed.
> >
> >Fix this by adding a dirty extent record when we encounter extents during
> >our shared subtree walk. This effectively restores the functionality we had
> >with the original shared subtree walking code in 1152651 (btrfs: qgroup:
> >account shared subtrees during snapshot delete).
> >
> >The idea with the original patch (and this one) is that shared subtrees can
> >get skipped during drop_snapshot. The shared subtree walk then allows us a
> >chance to visit those extents and add them to the qgroup work for later
> >processing. This ultimately makes the accounting for drop snapshot work.
> >
> >The new qgroup code nicely handles all the other extents during the tree
> >walk via the ref dec/inc functions so we don't have to add actions beyond
> >what we had originally.
> >
> >Signed-off-by: Mark Fasheh <mfas...@suse.de>
> 
> Hi Mark,
> 
> Despite the performance regression reported from Stefan Priebe,
> there is another problem, I'll comment inlined below.
> 
> >---
> >  fs/btrfs/extent-tree.c | 41 ++---
> >  1 file changed, 34 insertions(+), 7 deletions(-)
> >
> >diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> >index 3a70e6c..89be620 100644
> >--- a/fs/btrfs/extent-tree.c
> >+++ b/fs/btrfs/extent-tree.c
> >@@ -7757,17 +7757,37 @@ reada:
> >  }
> >
> >  /*
> >- * TODO: Modify related function to add related node/leaf to 
> >dirty_extent_root,
> >- * for later qgroup accounting.
> >- *
> >- * Current, this function does nothing.
> >+ * These may not be seen by the usual inc/dec ref code so we have to
> >+ * add them here.
> >   */
> >+static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
> >+ struct btrfs_root *root, u64 bytenr,
> >+ u64 num_bytes)
> >+{
> >+struct btrfs_qgroup_extent_record *qrecord;
> >+struct btrfs_delayed_ref_root *delayed_refs;
> >+
> >+qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS);
> >+if (!qrecord)
> >+return -ENOMEM;
> >+
> >+qrecord->bytenr = bytenr;
> >+qrecord->num_bytes = num_bytes;
> >+qrecord->old_roots = NULL;
> >+
> >+delayed_refs = &trans->transaction->delayed_refs;
> >+if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord))
> >+kfree(qrecord);
> 
> 1) Unprotected dirty_extent_root.
> 
> Unfortunately, btrfs_qgroup_insert_dirty_extent() is not protected
> by any lock/mutex.
> 
> And I'm sorry not to add comment about that.
> 
> In fact, btrfs_qgroup_insert_dirty_extent should always be used with
> delayed_refs->lock hold.
> Just like add_delayed_ref_head(), where every caller of
> add_delayed_ref_head() holds delayed_refs->lock.
> 
> So here you will need to hold delayed_refs->lock.

Ok, thanks for pointing this out. To your knowledge is there any reason why
the followup patch shouldn't just wrap the call to
btrfs_qgroup_insert_dirty_extent() in the correct lock?



> 2) Performance regression.(Reported by Stefan Priebe)
> 
> The performance regression is not caused by your codes, at least not
> completely.
> 
> It's my fault not adding enough comment for insert_dirty_extent()
> function. (just like 1, I must say I'm a bad reviewer until there is
> bug report)
> 
> As I was only expecting it called inside add_delayed_ref_head(),
> and caller of add_delayed_ref_head() has judged whether qgroup is
> enabled before calling add_delayed_ref_head().
> 
> So for qgroup disabled case, insert_dirty_extent() won't ever be called.
> 
> 
> 
> As a result, if you want to call btrfs_qgroup_insert_dirty_extent()
> out of add_delayed_ref_head(), you will need to handle the
> delayed_refs->lock and judge whether qgroup is enabled.

Ok, so callers of btrfs_qgroup_insert_dirty_extent() also have to check
whether qgroups are enabled.


> BTW, if it's OK for you, you can also further improve the
> performance of qgroup by using kmem_cache for struct
> btrfs_qgroup_extent_record.
> 
> I assume the kmalloc() may be one performance hot spot considering
> the amount it called in qgroup enabled case.

We're reading disk in that case, I hardly think the small kmalloc() matters.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: fix use after free iterating extrefs

2015-10-13 Thread Mark Fasheh
On Tue, Oct 13, 2015 at 02:06:48PM -0400, Chris Mason wrote:
> The code for btrfs inode-resolve has never worked properly for
> files with enough hard links to trigger extrefs.  It was trying to
> get the leaf out of a path after freeing the path:
> 
>   btrfs_release_path(path);
>   leaf = path->nodes[0];
>   item_size = btrfs_item_size_nr(leaf, slot);
> 
> The fix here is to use the extent buffer we cloned just a little higher
> up to avoid deadlocks caused by using the leaf in the path.
> 
> Signed-off-by: Chris Mason <c...@fb.com>
> cc: sta...@vger.kernel.org # v3.7+
> cc: Mark Fasheh <mfas...@suse.de>
Reviewed-by: Mark Fasheh <mfas...@suse.de>

Thanks for the CC Chris.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH] btrfs: add ioctl to monitor subvolume dropping

2015-10-06 Thread Mark Fasheh
On Tue, Oct 06, 2015 at 10:25:52AM +0200, David Sterba wrote:
> On Thu, Oct 01, 2015 at 02:30:47PM -0700, Mark Fasheh wrote:
> > At the moment, userspace has no way of knowing when a snapshot is finally
> > removed. This has become a problem when writing tests for btrfs,
> > 
> > http://article.gmane.org/gmane.comp.file-systems.fstests/1239/
> 
> In the meantime the command 'btrfs subvolume sync /path id' has been
> implemented which does what you need, without the new ioctl.

Ahh ok I didn't see that I'll take a look, thanks.


> 
> Also, you can query root_item::drop_progress directly through the
> SEARCH_TREE ioctl as well.

Yeah I realized this after I sent the patch but it seems like we don't need
either approach now if I can just wait with 'subvolume sync'
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] fstests: btrfs: add test for quota groups and drop snapshot

2015-10-01 Thread Mark Fasheh
On Tue, Sep 29, 2015 at 09:28:58AM +1000, Dave Chinner wrote:
> On Wed, Sep 23, 2015 at 02:05:16PM -0700, Mark Fasheh wrote:
> > Since the last time I sent this test, drop snapshot was broken again with
> > respect to qgroups. What practical step could I take to get a test for that
> > in here which I can beat the btrfs developers over the head with the next
> > time someone handwaves this problem away ;)
> 
> I'll merge tests that break a filesystem as a reminder to developers
> that there is a problem that needs fixing. We do that from time to
> time for XFS issues that are either really hard to fix or not urgent
> but require significant amounts of work to correct...

That sounds like a good policy, thank you.


> > From: Mark Fasheh <mfas...@suse.de>
> > 
> > [PATCH] btrfs: add test for quota groups and drop snapshot
> 
> > +# NOTE: The ability to vary tree height for this test is very useful
> > +# for debugging problems with drop_snapshot(). As a result we retain
> > +# that parameter even though the test below always does level 2 trees.
> > +_explode_fs_tree () {
> > +   local level=$1;
> > +   local loc="$2";
> > +   local n;
> > +
> > +   if [ -z $loc ]; then
> > +   echo "specify location for fileset"
> > +   exit 1;
> > +   fi
> > +
> > +   case $level in
> > +   1)# this always reproduces level 1 trees
> > +   n=10;
> > +   ;;
> > +   2)# this always reproduces level 2 trees
> > +   n=1500
> 
> Still some minor whitespace issues, but I can fix that on commit
> as everything else looks fine.

Erf, my bad I tried copying the style in some of the common/ dir but
obviously failed :(  Thanks for the review and help Dave, it is greatly
appreciated.
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC][PATCH] btrfs: add ioctl to monitor subvolume dropping

2015-10-01 Thread Mark Fasheh
Dropping a subvolume in btrfs is a delayed operation which can persist
across mounts (or crashes) - progress for the subvolume drop is recorded in
a key on the root object.

At the moment, userspace has no way of knowing when a snapshot is finally
removed. This has become a problem when writing tests for btrfs,

http://article.gmane.org/gmane.comp.file-systems.fstests/1239/

The following patch tries to fix this by putting orphaned subvolumes on a
per-fs rbtree. We provide an ioctl which userspace can use to query the
state of a subvolume. Internally, we'll search the rbtree and if a match is
found, the drop progress from our disk key is returned. If a match is not
found, ENOENT is returned and userspace can safely assume that the root has
been dropped (or was never orphaned to begin with).

Obviously this wants a patch to btrfsprogs, which I am currently working on.
In the meantime the ioctl can be tested with the following userspace
program (I had to indent it by 1 space so git wouldn't swallow up the
preprocessor directives).

 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <stdint.h>
 #include <inttypes.h>
 #include <sys/types.h>
 #include <sys/ioctl.h>
 #include <linux/btrfs.h>

 /* Print command-line usage for this tool to stdout. */
 static void usage(const char *prog)
 {
	fprintf(stdout, "Usage: %s /path/to/btrfs rootid\n", prog);
 }

 /*
  * Query the drop status of a btrfs subvolume via the
  * BTRFS_IOC_GET_DROP_STATUS ioctl and print the result.
  *
  * argv[1]: path to a mounted btrfs filesystem
  * argv[2]: numeric root (subvolume) id to query
  *
  * Returns 0 on success — including the "subvolume not found or already
  * dropped" case — or a positive errno value on failure.
  */
 int main(int argc, char **argv)
 {
	int ret, fd;
	char *filename;
	uint64_t rootid, tmp;

	if (argc != 3) {
		usage(argv[0]);
		return 1;
	}
	filename = argv[1];
	rootid = atoll(argv[2]);

	ret = open(filename, O_RDONLY);
	if (ret < 0) {
		ret = errno;
		fprintf(stderr, "Could not open file %s: (%d) %s\n",
			filename, ret, strerror(ret));
		return ret;
	}
	fd = ret;

	/*
	 * The ioctl takes the rootid in *tmp and, on success, overwrites
	 * it with the current drop-progress objectid.
	 */
	tmp = rootid;
	ret = ioctl(fd, BTRFS_IOC_GET_DROP_STATUS, &tmp);
	if (ret < 0) {
		/*
		 * ioctl() returns -1 with errno set; save errno before any
		 * other call can clobber it. ENOENT is not fatal here — it
		 * means the root was dropped (or never orphaned).
		 */
		ret = errno;
		if (ret != ENOENT) {
			fprintf(stderr, "ioctl returned error: (%d) %s\n",
				ret, strerror(ret));
			return ret;
		}
	}

	close(fd);

	if (ret == ENOENT)
		printf("Subvolume not found or already dropped\n");
	else
		printf("Subvolume %"PRIu64" drop is at object: %"PRIu64"\n",
		       rootid, tmp);

	return 0;
 }

Signed-off-by: Mark Fasheh <mfas...@suse.de>
---
 fs/btrfs/ctree.h   |  8 ++
 fs/btrfs/disk-io.c |  4 +++
 fs/btrfs/extent-tree.c | 71 ++
 fs/btrfs/ioctl.c   | 24 
 fs/btrfs/root-tree.c   |  1 +
 include/uapi/linux/btrfs.h |  1 +
 6 files changed, 109 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 938efe3..45cd49e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1782,6 +1782,8 @@ struct btrfs_fs_info {
 * and will be latter freed. Protected by fs_info->chunk_mutex.
 */
struct list_head pinned_chunks;
+
+   struct rb_root  dropping_roots;
 };
 
 struct btrfs_subvolume_writers {
@@ -1943,6 +1945,8 @@ struct btrfs_root {
int send_in_progress;
struct btrfs_subvolume_writers *subv_writers;
atomic_t will_be_snapshoted;
+
+   struct rb_node  drop_status;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -3647,6 +3651,10 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
struct extent_buffer *parent);
+void btrfs_add_drop_status(struct btrfs_root *root);
+void btrfs_remove_drop_status(struct btrfs_root *root);
+int btrfs_get_drop_status(struct btrfs_fs_info *fs_info, u64 rootid,
+ u64 *status);
 static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
 {
/*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 295795a..78dd6da 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1283,6 +1283,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, 
u32 stripesize,
root->anon_dev = 0;
 
spin_lock_init(&root->root_item_lock);
+   RB_CLEAR_NODE(>drop_status);
 }
 
 static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
@@ -2638,6 +2639,8 @@ int open_ctree(struct super_block *sb,
 
INIT_LIST_HEAD(_info->pinned_chunks);
 
+   fs_info->dropping_roots = RB_ROOT;
+
ret = btrfs_alloc_stripe_hash_table(fs_info);
if (ret) {
err = ret;
@@ -3639,6 +3642,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info 
*fs_info,
 
 static void free_fs_root(struct btrfs_root *root)
 {
+   btrfs_remove_drop_status(root);
iput(root->ino_cache_inode);
WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
btrfs_free_block_rsv(root, root->orphan_block_rsv);
diff --g

Re: [PATCH 0/4] btrfs: update qgroups in drop snapshot

2015-09-23 Thread Mark Fasheh
On Wed, Sep 23, 2015 at 11:58:57AM +0800, Qu Wenruo wrote:
> Hi Mark,
> 
> I'd like to test the patchset, but it seems to be a little out of
> date, and failed to apply to integration-4.3.
> 
> Would you please rebase it to integration-4.3?

Hey Qu I think you just need to drop the patch titled:

Btrfs: keep dropped roots in cache until transaction commit

since it is already in integration-4.3. Everything else seems to apply on my
end.
    --Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/4] btrfs: update qgroups in drop snapshot

2015-09-23 Thread Mark Fasheh
SCRATCH_MNT/snap2
> >+_explode_fs_tree 1 $SCRATCH_MNT/snap2/files-snap2
> >+
> >+# Enable qgroups now that we have our filesystem prepared. This
> >+# will kick off a scan which we will have to wait for.
> >+_run_btrfs_util_prog quota enable $SCRATCH_MNT
> >+_run_btrfs_util_prog quota rescan -w $SCRATCH_MNT
> >+
> >+# Remount to clear cache, force everything to disk
> >+_scratch_unmount
> >+_scratch_mount
> 
> Is there anything special that needs to use umount/mount other than sync?

A couple times now it's been to my advantage to force btrfs to reread the
file trees. It might not be strictly necessary any more.


> >+# Finally, delete snap1 to trigger btrfs_drop_snapshot(). This
> >+# snapshot is most interesting to delete because it will cause some
> >+# nodes to go exclusively owned for snap2, while some will stay shared
> >+# with the default subvolume. That exercises a maximum of the drop
> >+# snapshot/qgroup interactions.
> >+#
> >+# snap2's implied ref to the 128K extent in files/ can be lost by
> >+# the root finding code in qgroup accounting due to snap1 no longer
> >+# providing a path to it. This was fixed by the first two patches
> >+# referenced above.
> >+_run_btrfs_util_prog subvolume delete $SCRATCH_MNT/snap1
> >+
> >+# There is no way from userspace to force btrfs_drop_snapshot to run
> >+# at a given time (even via mount/unmount). We must wait for it to
> >+# start and complete. This is the shortest time on my tests systems I
> >+# have found which always allows drop_snapshot to run to completion.
> >+sleep 45
> 
> Does "btrfs subv delete -c" help here?

Unfortunately not :( We need to wait for drop_snapshot() to get run. That
flag (from memory) just waits for the initial orphaning transaction to
finish.


> >+
> >+_scratch_unmount
> >+
> >+# generate a qgroup report and look for inconsistent groups
> >+#  - don't use _run_btrfs_util_prog here as it captures the output and
> >+#we need to grep it.
> >+$BTRFS_UTIL_PROG check --qgroup-report $SCRATCH_DEV 2>&1 | grep -E -q 
> >"Counts for qgroup.*are different"
> >+if [ $? -ne 0 ]; then
> >+status=0
> >+fi
> Quite a nice idea to use btrfsck to check qgroup validation.
> 
> But I don't see the reason not to use _run_btrfS_util_progs, as I
> don't think it's needed to grep.
> 
> If there is a bug in return value of btrfsck, then I'm OK with it as
> a workaround.
> 
> But if btrfsck --qgroup-report will return non-zero when it finds a
> qgroup mismatch, I think is better to just call
> _run_btrfs_util_prog, as it has judgment for return value check.

btrfsck --qgroup-report returns zero unless there was an issue generating
the report so the grep there is the only way to catch this consistently.

Thanks again,
--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] fstests: btrfs: add test for quota groups and drop snapshot

2015-09-23 Thread Mark Fasheh
Hey Dave, thanks for the review. A refreshed patch for you to look at is
attached.

On Wed, Sep 23, 2015 at 12:47:08PM +1000, Dave Chinner wrote:
> On Tue, Sep 22, 2015 at 03:16:49PM -0700, Mark Fasheh wrote:
> > +tmp=/tmp/$$
> > +status=1   # failure is the default!
> > +trap "_cleanup; exit \$status" 0 1 2 3 15
> > +
> > +_cleanup()
> > +{
> > +   rm -fr $tmp
> > +}
> 
> Missing a "cd /" (see the "new" template).

Ok, added that and I updated from a fresh template as well.


> > +# Create an fs tree of a given height at a target location. This is
> > +# done by agressively creating inline extents to expand the number of
> > +# nodes required. We also add an traditional extent so that
> > +# drop_snapshot is forced to walk at least one extent that is not
> > +# stored in metadata.
> > +#
> > +# NOTE: The ability to vary tree height for this test is very useful
> > +# for debugging problems with drop_snapshot(). As a result we retain
> > +# that parameter even though the test below always does level 2 trees.
> > +_explode_fs_tree () {
> > +local level=$1;
> > +local loc="$2";
> > +local bs=4095;
> > +local cnt=1;
> > +local n;
> 
> 8 space tabs, please.

Ok hopefully I got that right this time around.


> Please use xfs_io, not dd. It's also only ever writing a single
> block of 4095 bytes, so you can drop the bs/cnt variables and just
> use:
> 
>   $XFS_IO_PROG -f -c "pwrite 0 4095" $loc/file$i > /dev/null 2>&1
> 
> > +done
> > +
> > +bs=131072
> > +cnt=1
> > +dd status=none if=/dev/zero of=$loc/extentfile bs=$bs count=$cnt
> 
> Variables for a single use? :P

Heh, I got a bit sloppy there sorry - it's from when I was experimenting
with different numbers to create various tree levels. I turned all the 'dd'
calls into $XFS_IO_PROG.


>   $XFS_IO_PROG -f -c "pwrite 0 128k $loc/extentfile > /dev/null 2>&1
> 
> > +# Force the default leaf size as the calculations for making our btree
> > +# heights are based on that.
> > +run_check _scratch_mkfs "--nodesize 16384"
> 
> Please, no new users of run_check.

Ok I took that out, I'm confused though when you say 'no new users of
run_check', does that include the usage of _run_btrfs_util_prog() in this
test?


> > +# Finally, delete snap1 to trigger btrfs_drop_snapshot(). This
> > +# snapshot is most interesting to delete because it will cause some
> > +# nodes to go exclusively owned for snap2, while some will stay shared
> > +# with the default subvolume. That exercises a maximum of the drop
> > +# snapshot/qgroup interactions.
> > +#
> > +# snap2's implied ref to the 128K extent in files/ can be lost by
> > +# the root finding code in qgroup accounting due to snap1 no longer
> > +# providing a path to it. This was fixed by the first two patches
> > +# referenced above.
> > +_run_btrfs_util_prog subvolume delete $SCRATCH_MNT/snap1
> > +
> > +# There is no way from userspace to force btrfs_drop_snapshot to run
> > +# at a given time (even via mount/unmount). We must wait for it to
> > +# start and complete. This is the shortest time on my tests systems I
> > +# have found which always allows drop_snapshot to run to completion.
> > +sleep 45
> 
> Which means it will not be long enough for someone else. We've had
> this discussion before - btrfs needs a way to query if a background
> operation is in progress or not

At least the situation improved since last time - I don't need a 'sleep'
around the qgroup calls any more ;)

On a more serious note I agree that this is a problem, but we're going to
have to add an ioctl for this as the usual one does not provide any room for
additional behavior. Also, drop_snapshot can happen across mounts which
might make things a bit difficult (or maybe not, if the process waiting
holds an open fd to the fs).

Since the last time I sent this test, drop snapshot was broken again with
respect to qgroups. What practical step could I take to get a test for that
in here which I can beat the btrfs developers over the head with the next
time someone handwaves this problem away ;)
--Mark

--
Mark Fasheh


From: Mark Fasheh <mfas...@suse.de>

[PATCH] btrfs: add test for quota groups and drop snapshot

Test btrfs quota group consistency operations during snapshot
delete. Btrfs has had long standing issues with drop snapshot
failing to properly account for quota groups. This test crafts
several snapshot trees with shared and exclusive elements. One of
the trees is removed and then quota group consistency is checked.

This issue is fixed by the foll

<    1   2   3   4   5   >