[Cluster-devel] [PATCH v3 2/3] gfs2: lookup local statfs inodes prior to journal recovery
We need to lookup the master statfs inode and the local statfs inodes earlier in the mount process (in init_journal) so journal recovery can use them when it attempts to recover the statfs info. We lookup all the local statfs inodes and store them in a linked list to allow a node to recover statfs info for other nodes in the cluster. Signed-off-by: Abhi Das --- fs/gfs2/incore.h | 8 +++ fs/gfs2/ops_fstype.c | 133 +++ fs/gfs2/super.c | 31 +- fs/gfs2/super.h | 3 + 4 files changed, 139 insertions(+), 36 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index e34183e02a9e..0220f2b1d5bf 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -697,6 +697,13 @@ struct gfs2_pcpu_lkstats { struct gfs2_lkstats lkstats[10]; }; +/* List of local (per node) statfs inodes */ +struct lcl_statfs_inode { + struct list_head si_list; + struct inode *si_sc_inode; + unsigned int si_jid; /* journal id this statfs inode corresponds to */ +}; + struct gfs2_sbd { struct super_block *sd_vfs; struct gfs2_pcpu_lkstats __percpu *sd_lkstats; @@ -748,6 +755,7 @@ struct gfs2_sbd { struct inode *sd_jindex; struct inode *sd_statfs_inode; struct inode *sd_sc_inode; + struct list_head sd_sc_inodes_list; struct inode *sd_qc_inode; struct inode *sd_rindex; struct inode *sd_quota_inode; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 03c33fc03c05..dc304f60b39a 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -110,6 +110,8 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) spin_lock_init(&sdp->sd_trunc_lock); spin_lock_init(&sdp->sd_bitmap_lock); + INIT_LIST_HEAD(&sdp->sd_sc_inodes_list); + mapping = &sdp->sd_aspace; address_space_init_once(mapping); @@ -608,6 +610,90 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) return error; } +/** + * init_statfs - lookup and initialize master and local (per node) + * statfs inodes. 
This should be called after the jindex + * is initialized in init_journal() and before + * gfs2_journal_recovery() is called because we need to + * be able to write to these inodes during recovery. + * @sdp: The GFS2 superblock + * + * Returns: errno + */ +static int init_statfs(struct gfs2_sbd *sdp) +{ + int error = 0; + struct inode *master = d_inode(sdp->sd_master_dir); + struct inode *pn = NULL; + char buf[30]; + struct gfs2_jdesc *jd; + struct gfs2_inode *ip; + + sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); + if (IS_ERR(sdp->sd_statfs_inode)) { + error = PTR_ERR(sdp->sd_statfs_inode); + fs_err(sdp, "can't read in statfs inode: %d\n", error); + goto fail; + } + + pn = gfs2_lookup_simple(master, "per_node"); + if (IS_ERR(pn)) { + error = PTR_ERR(pn); + fs_err(sdp, "can't find per_node directory: %d\n", error); + goto put_statfs; + } + + /* For each jid, lookup the corresponding local statfs inode in the +* per_node metafs directory and save it in the sdp->sd_sc_inodes_list. */ + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + struct lcl_statfs_inode *lsi = + kmalloc(sizeof(struct lcl_statfs_inode), GFP_NOFS); + if (!lsi) { + error = -ENOMEM; + goto free_lcl; + } + sprintf(buf, "statfs_change%u", jd->jd_jid); + lsi->si_sc_inode = gfs2_lookup_simple(pn, buf); + if (IS_ERR(lsi->si_sc_inode)) { + error = PTR_ERR(lsi->si_sc_inode); + fs_err(sdp, "can't find local \"sc\" file#%u: %d\n", + jd->jd_jid, error); + goto free_lcl; + } + lsi->si_jid = jd->jd_jid; + if (jd->jd_jid == sdp->sd_jdesc->jd_jid) + sdp->sd_sc_inode = lsi->si_sc_inode; + + list_add_tail(&lsi->si_list, &sdp->sd_sc_inodes_list); + } + + iput(pn); + ip = GFS2_I(sdp->sd_sc_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, + &sdp->sd_sc_gh); + if (error) { + fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); + goto free_lcl; + } + return 0; + +free_lcl: + free_lcl_statfs_inodes(sdp); + iput(pn); +put_statfs: + iput(sd
[Cluster-devel] [PATCH v3 3/3] gfs2: Recover statfs info in journal head
Apply the outstanding statfs changes in the journal head to the master statfs file. Zero out the local statfs file for good measure. Previously, statfs updates would be read in from the local statfs inode and synced to the master statfs inode during recovery. We now use the statfs updates in the journal head to update the master statfs inode instead of reading in from the local statfs inode. To preserve backward compatibility with kernels that can't do this, we still need to keep the local statfs inode up to date by writing changes to it. At some point in the future, we can do away with the local statfs inodes altogether and keep the statfs changes solely in the journal. Signed-off-by: Abhi Das --- fs/gfs2/lops.c | 2 +- fs/gfs2/lops.h | 1 + fs/gfs2/recovery.c | 104 + 3 files changed, 106 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index ed1da4323967..ed69298dd824 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -823,7 +823,7 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start, * */ -static void gfs2_meta_sync(struct gfs2_glock *gl) +void gfs2_meta_sync(struct gfs2_glock *gl) { struct address_space *mapping = gfs2_glock2aspace(gl); struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 9c5e4e491e03..4a3d8aecdf82 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -27,6 +27,7 @@ extern void gfs2_log_submit_bio(struct bio **biop, int opf); extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, bool keep_cache); +extern void gfs2_meta_sync(struct gfs2_glock *gl); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index a8bb17e355b8..caaa35bd6349 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -296,6 +296,109 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, 
sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message); } +/** + * update_statfs_inode - Update the master statfs inode or zero out the local + * statfs inode for a given journal. + * @jd: The journal + * @head: If NULL, @inode is the local statfs inode and we need to zero it out. + *Otherwise, it @head contains the statfs change info that needs to be + *synced to the master statfs inode (pointed to by @inode). + * @inode: statfs inode to update. + */ +static int update_statfs_inode(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, + struct inode *inode) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_inode *ip; + struct buffer_head *bh; + struct gfs2_statfs_change_host sc; + int error = 0; + + BUG_ON(!inode); + ip = GFS2_I(inode); + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + goto out; + + spin_lock(&sdp->sd_statfs_spin); + + if (head) { /* Update the master statfs inode */ + gfs2_statfs_change_in(&sc, bh->b_data + sizeof(struct gfs2_dinode)); + sc.sc_total += head->lh_local_total; + sc.sc_free += head->lh_local_free; + sc.sc_dinodes += head->lh_local_dinodes; + gfs2_statfs_change_out(&sc, bh->b_data + sizeof(struct gfs2_dinode)); + + fs_info(sdp, "jid=%u: Updated master statfs Total:%lld, " + "Free:%lld, Dinodes:%lld after change " + "[%+lld,%+lld,%+lld]\n", jd->jd_jid, sc.sc_total, + sc.sc_free, sc.sc_dinodes, head->lh_local_total, + head->lh_local_free, head->lh_local_dinodes); + } else { /* Zeroing out the local statfs inode */ + memset(bh->b_data + sizeof(struct gfs2_dinode), 0, + sizeof(struct gfs2_statfs_change)); + /* If it's our own journal, reset any in-memory changes too */ + if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) { + memset(&sdp->sd_statfs_local, 0, + sizeof(struct gfs2_statfs_change_host)); + } + } + spin_unlock(&sdp->sd_statfs_spin); + + mark_buffer_dirty(bh); + brelse(bh); + gfs2_meta_sync(ip->i_gl); + +out: + return error; +} + +/** + * recover_local_statfs - Update the master and local 
statfs changes for this + *journal. + * + * Previously, statfs updates would be read in from the local statfs inode and + * synced to the master statfs inode during recovery. + * + * We now use the statfs updates in the
[Cluster-devel] [PATCH v3 0/3] gfs2: local statfs improvements
This patchset allows gfs2 to sync statfs info from the journal to the master statfs file during a log flush or during recovery. We still write to the local statfs file so that older kernels without this patchset can still recover the statfs info of nodes running newer kernels that include it. This version addresses the bug Andreas found with xfstests:generic/034. It turns out that we weren't looking up the statfs inodes early enough in the mount process. I also added some comments in the code. Abhi Das (3): gfs2: Add fields for statfs info in struct gfs2_log_header_host gfs2: lookup local statfs inodes prior to journal recovery gfs2: Recover statfs info in journal head fs/gfs2/incore.h | 12 fs/gfs2/lops.c | 2 +- fs/gfs2/lops.h | 1 + fs/gfs2/ops_fstype.c | 133 +++ fs/gfs2/recovery.c | 108 +++ fs/gfs2/super.c | 33 ++- fs/gfs2/super.h | 5 ++ 7 files changed, 256 insertions(+), 38 deletions(-) -- 2.20.1
[Cluster-devel] [PATCH v3 1/3] gfs2: Add fields for statfs info in struct gfs2_log_header_host
And read these in __get_log_header() from the log header. Also make gfs2_statfs_change_out() non-static so it can be used outside of super.c Signed-off-by: Abhi Das --- fs/gfs2/incore.h | 4 fs/gfs2/recovery.c | 4 fs/gfs2/super.c| 2 +- fs/gfs2/super.h| 2 ++ 4 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index c3ca9b8382ec..e34183e02a9e 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -41,6 +41,10 @@ struct gfs2_log_header_host { u32 lh_flags; /* GFS2_LOG_HEAD_... */ u32 lh_tail;/* Block number of log tail */ u32 lh_blkno; + + s64 lh_local_total; + s64 lh_local_free; + s64 lh_local_dinodes; }; /* diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 390ea79d682c..a8bb17e355b8 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -144,6 +144,10 @@ int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh, head->lh_tail = be32_to_cpu(lh->lh_tail); head->lh_blkno = be32_to_cpu(lh->lh_blkno); + head->lh_local_total = be64_to_cpu(lh->lh_local_total); + head->lh_local_free = be64_to_cpu(lh->lh_local_free); + head->lh_local_dinodes = be64_to_cpu(lh->lh_local_dinodes); + return 0; } /** diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 8e250ec42e91..e17961ea994d 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -230,7 +230,7 @@ void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) sc->sc_dinodes = be64_to_cpu(str->sc_dinodes); } -static void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf) +void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf) { struct gfs2_statfs_change *str = buf; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 51900554ed81..ed4f5cb29074 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -37,6 +37,8 @@ extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, s64 dinodes); extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf); 
+extern void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, + void *buf); extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, struct buffer_head *l_bh); extern int gfs2_statfs_sync(struct super_block *sb, int type); -- 2.20.1
[Cluster-devel] [PATCH v2 1/3] gfs2: Add fields for statfs info in struct gfs2_log_header_host
And read these in __get_log_header() from the log header. Also make gfs2_statfs_change_out() non-static so it can be used outside of super.c Signed-off-by: Abhi Das --- fs/gfs2/incore.h | 4 fs/gfs2/recovery.c | 4 fs/gfs2/super.c| 2 +- fs/gfs2/super.h| 2 ++ 4 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index ca2ec02436ec..9fc12206a3ad 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -41,6 +41,10 @@ struct gfs2_log_header_host { u32 lh_flags; /* GFS2_LOG_HEAD_... */ u32 lh_tail;/* Block number of log tail */ u32 lh_blkno; + + s64 lh_local_total; + s64 lh_local_free; + s64 lh_local_dinodes; }; /* diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 390ea79d682c..a8bb17e355b8 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -144,6 +144,10 @@ int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh, head->lh_tail = be32_to_cpu(lh->lh_tail); head->lh_blkno = be32_to_cpu(lh->lh_blkno); + head->lh_local_total = be64_to_cpu(lh->lh_local_total); + head->lh_local_free = be64_to_cpu(lh->lh_local_free); + head->lh_local_dinodes = be64_to_cpu(lh->lh_local_dinodes); + return 0; } /** diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 3d9daac44e1c..20554db4ccab 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -230,7 +230,7 @@ void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) sc->sc_dinodes = be64_to_cpu(str->sc_dinodes); } -static void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf) +void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf) { struct gfs2_statfs_change *str = buf; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 51900554ed81..ed4f5cb29074 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -37,6 +37,8 @@ extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, s64 dinodes); extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf); 
+extern void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, + void *buf); extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, struct buffer_head *l_bh); extern int gfs2_statfs_sync(struct super_block *sb, int type); -- 2.20.1
[Cluster-devel] [PATCH v2 2/3] gfs2: lookup local statfs inodes at mount time
We require these inodes during journal recovery when we attempt to recover the statfs file. We are not able to lookup inodes at that time due to locks being blocked so we pre-lookup these inodes and save them in a linked list. Signed-off-by: Abhi Das --- fs/gfs2/incore.h | 7 +++ fs/gfs2/ops_fstype.c | 32 fs/gfs2/super.c | 28 +++- fs/gfs2/super.h | 3 +++ 4 files changed, 61 insertions(+), 9 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 9fc12206a3ad..313e35c14860 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -705,6 +705,12 @@ struct gfs2_pcpu_lkstats { struct gfs2_lkstats lkstats[10]; }; +struct lcl_statfs_inode { + struct list_head si_list; + struct inode *si_sc_inode; + unsigned int si_jid; +}; + struct gfs2_sbd { struct super_block *sd_vfs; struct gfs2_pcpu_lkstats __percpu *sd_lkstats; @@ -755,6 +761,7 @@ struct gfs2_sbd { struct inode *sd_jindex; struct inode *sd_statfs_inode; struct inode *sd_sc_inode; + struct list_head sd_sc_inodes_list; struct inode *sd_qc_inode; struct inode *sd_rindex; struct inode *sd_quota_inode; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 6d18d2c91add..042f3de79789 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -110,6 +110,8 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) spin_lock_init(&sdp->sd_trunc_lock); spin_lock_init(&sdp->sd_bitmap_lock); + INIT_LIST_HEAD(&sdp->sd_sc_inodes_list); + mapping = &sdp->sd_aspace; address_space_init_once(mapping); @@ -814,6 +816,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) char buf[30]; int error = 0; struct gfs2_inode *ip; + struct gfs2_jdesc *jd; struct inode *master = d_inode(sdp->sd_master_dir); if (sdp->sd_args.ar_spectator) @@ -829,12 +832,26 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) return error; } - sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid); - sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf); - if (IS_ERR(sdp->sd_sc_inode)) { - error = PTR_ERR(sdp->sd_sc_inode); - 
fs_err(sdp, "can't find local \"sc\" file: %d\n", error); - goto fail; + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + struct lcl_statfs_inode *lsi = + kmalloc(sizeof(struct lcl_statfs_inode), GFP_NOFS); + if (!lsi) { + error = -ENOMEM; + goto fail_ut_i; + } + sprintf(buf, "statfs_change%u", jd->jd_jid); + lsi->si_sc_inode = gfs2_lookup_simple(pn, buf); + if (IS_ERR(lsi->si_sc_inode)) { + error = PTR_ERR(lsi->si_sc_inode); + fs_err(sdp, "can't find local \"sc\" file #%u: %d\n", + jd->jd_jid, error); + goto fail_ut_i; + } + lsi->si_jid = jd->jd_jid; + if (jd->jd_jid == sdp->sd_jdesc->jd_jid) + sdp->sd_sc_inode = lsi->si_sc_inode; + + list_add_tail(&lsi->si_list, &sdp->sd_sc_inodes_list); } sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid); @@ -873,8 +890,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) fail_qc_i: iput(sdp->sd_qc_inode); fail_ut_i: - iput(sdp->sd_sc_inode); -fail: + free_lcl_statfs_inodes(sdp); iput(pn); return error; } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 20554db4ccab..ac5ad16e5c96 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -729,7 +729,7 @@ static void gfs2_put_super(struct super_block *sb) gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); gfs2_glock_dq_uninit(&sdp->sd_sc_gh); gfs2_glock_dq_uninit(&sdp->sd_qc_gh); - iput(sdp->sd_sc_inode); + free_lcl_statfs_inodes(sdp); iput(sdp->sd_qc_inode); } @@ -1560,6 +1560,32 @@ static void gfs2_free_inode(struct inode *inode) kmem_cache_free(gfs2_inode_cachep, GFS2_I(inode)); } +extern void free_lcl_statfs_inodes(struct gfs2_sbd *sdp) +{ + struct lcl_statfs_inode *lsi, *safe; + + list_for_each_entry_safe(lsi, safe, &sdp->sd_sc_inodes_list, si_list) { + if (lsi->si_jid == sdp->sd_jdesc->jd_jid) + sdp->sd_sc_inode = NULL; + if (lsi->si_sc_inode) + iput(lsi->si_sc_inode)
[Cluster-devel] [PATCH v2 0/3] gfs2: local statfs improvements
This patchset allows gfs2 to sync statfs info from the journal to the master statfs file during a log flush or during recovery. We still write to the local statfs file so that older kernels without this patchset can still recover the statfs info of nodes running newer kernels that include it. Abhi Das (3): gfs2: Add fields for statfs info in struct gfs2_log_header_host gfs2: lookup local statfs inodes at mount time gfs2: Recover statfs info in journal head fs/gfs2/incore.h | 11 +++ fs/gfs2/lops.c | 2 +- fs/gfs2/lops.h | 1 + fs/gfs2/ops_fstype.c | 32 ++- fs/gfs2/recovery.c | 75 fs/gfs2/super.c | 30 -- fs/gfs2/super.h | 5 +++ 7 files changed, 145 insertions(+), 11 deletions(-) -- 2.20.1
[Cluster-devel] [PATCH v2 3/3] gfs2: Recover statfs info in journal head
Apply the outstanding statfs changes in the journal head to the master statfs file. Zero out the local statfs file for good measure. Signed-off-by: Abhi Das --- fs/gfs2/lops.c | 2 +- fs/gfs2/lops.h | 1 + fs/gfs2/recovery.c | 71 ++ 3 files changed, 73 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index ed1da4323967..ed69298dd824 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -823,7 +823,7 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start, * */ -static void gfs2_meta_sync(struct gfs2_glock *gl) +void gfs2_meta_sync(struct gfs2_glock *gl) { struct address_space *mapping = gfs2_glock2aspace(gl); struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 9c5e4e491e03..4a3d8aecdf82 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -27,6 +27,7 @@ extern void gfs2_log_submit_bio(struct bio **biop, int opf); extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, bool keep_cache); +extern void gfs2_meta_sync(struct gfs2_glock *gl); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index a8bb17e355b8..21661c5e497e 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -296,6 +296,76 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message); } +static int update_statfs_inode(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, + struct inode *inode) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_inode *ip; + struct buffer_head *bh; + struct gfs2_statfs_change_host sc; + int error = 0; + + BUG_ON(!inode); + ip = GFS2_I(inode); + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + goto out; + + spin_lock(&sdp->sd_statfs_spin); + + if (head) { /* Updating the master statfs inode */ + gfs2_statfs_change_in(&sc, 
bh->b_data + sizeof(struct gfs2_dinode)); + sc.sc_total += head->lh_local_total; + sc.sc_free += head->lh_local_free; + sc.sc_dinodes += head->lh_local_dinodes; + gfs2_statfs_change_out(&sc, bh->b_data + sizeof(struct gfs2_dinode)); + + fs_info(sdp, "jid=%u: Updated master statfs Total:%lld, " + "Free:%lld, Dinodes:%lld after change " + "[%+lld,%+lld,%+lld]\n", jd->jd_jid, sc.sc_total, + sc.sc_free, sc.sc_dinodes, head->lh_local_total, + head->lh_local_free, head->lh_local_dinodes); + } else { /* Zeroing out one of the local statfs inodes */ + memset(bh->b_data + sizeof(struct gfs2_dinode), 0, + sizeof(struct gfs2_statfs_change)); + if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) { + memset(&sdp->sd_statfs_local, 0, + sizeof(struct gfs2_statfs_change_host)); + } + } + spin_unlock(&sdp->sd_statfs_spin); + + mark_buffer_dirty(bh); + brelse(bh); + gfs2_meta_sync(ip->i_gl); + +out: + return error; +} + +static void recover_local_statfs(struct gfs2_jdesc *jd, +struct gfs2_log_header_host *head) +{ + int error; + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (!head->lh_local_total && !head->lh_local_free + && !head->lh_local_dinodes) /* No change */ + goto zero_local; + + error = update_statfs_inode(jd, head, sdp->sd_statfs_inode); + if (error) + goto out; + +zero_local: + error = update_statfs_inode(jd, NULL, + find_lcl_statfs_inode(sdp, jd->jd_jid)); +out: + return; +} + void gfs2_recover_func(struct work_struct *work) { struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); @@ -415,6 +485,7 @@ void gfs2_recover_func(struct work_struct *work) goto fail_gunlock_thaw; } + recover_local_statfs(jd, &head); clean_journal(jd, &head); up_read(&sdp->sd_log_flush_lock); -- 2.20.1
[Cluster-devel] [PATCH 1/3] gfs2: Don't write updates to local statfs file
We store the local statfs info in the journal header now so there's no need to write to the local statfs file anymore. Signed-off-by: Abhi Das --- fs/gfs2/lops.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index cb2a11b458c6..53d2dbf6605e 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -104,7 +104,15 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, BUG_ON(!buffer_pinned(bh)); lock_buffer(bh); - mark_buffer_dirty(bh); + /* +* We want to eliminate the local statfs file eventually. +* But, for now, we're simply not going to update it by +* never marking its buffers dirty +*/ + if (!(bd->bd_gl->gl_name.ln_type == LM_TYPE_INODE && + bd->bd_gl->gl_object == GFS2_I(sdp->sd_sc_inode))) + mark_buffer_dirty(bh); + clear_buffer_pinned(bh); if (buffer_is_rgrp(bd)) -- 2.20.1
[Cluster-devel] [PATCH 2/3] gfs2: Add fields for statfs info in struct gfs2_log_header_host
And read these in __get_log_header() from the log header. Also make gfs2_statfs_change_out() non-static so it can be used outside of super.c Signed-off-by: Abhi Das --- fs/gfs2/incore.h | 4 fs/gfs2/recovery.c | 4 fs/gfs2/super.c| 2 +- fs/gfs2/super.h| 2 ++ 4 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index ca2ec02436ec..9fc12206a3ad 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -41,6 +41,10 @@ struct gfs2_log_header_host { u32 lh_flags; /* GFS2_LOG_HEAD_... */ u32 lh_tail;/* Block number of log tail */ u32 lh_blkno; + + s64 lh_local_total; + s64 lh_local_free; + s64 lh_local_dinodes; }; /* diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 390ea79d682c..a8bb17e355b8 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -144,6 +144,10 @@ int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh, head->lh_tail = be32_to_cpu(lh->lh_tail); head->lh_blkno = be32_to_cpu(lh->lh_blkno); + head->lh_local_total = be64_to_cpu(lh->lh_local_total); + head->lh_local_free = be64_to_cpu(lh->lh_local_free); + head->lh_local_dinodes = be64_to_cpu(lh->lh_local_dinodes); + return 0; } /** diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 9f4d9e7be839..4c51d30d65c5 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -224,7 +224,7 @@ void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) sc->sc_dinodes = be64_to_cpu(str->sc_dinodes); } -static void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf) +void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf) { struct gfs2_statfs_change *str = buf; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 51900554ed81..ed4f5cb29074 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -37,6 +37,8 @@ extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, s64 dinodes); extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf); 
+extern void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, + void *buf); extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, struct buffer_head *l_bh); extern int gfs2_statfs_sync(struct super_block *sb, int type); -- 2.20.1
[Cluster-devel] [PATCH 0/3] local statfs improvements
With this patchset, we don't write to the local statfs file anymore. The local statfs data is written into the journal and synced to the master statfs file during a log flush or during recovery. Abhi Das (3): gfs2: Don't write updates to local statfs file gfs2: Add fields for statfs info in struct gfs2_log_header_host gfs2: Recover statfs info in journal head fs/gfs2/incore.h | 4 ++ fs/gfs2/lops.c | 12 - fs/gfs2/lops.h | 1 + fs/gfs2/recovery.c | 125 + fs/gfs2/super.c| 2 +- fs/gfs2/super.h| 2 + 6 files changed, 143 insertions(+), 3 deletions(-) -- 2.20.1
[Cluster-devel] [PATCH 3/3] gfs2: Recover statfs info in journal head
Apply the outstanding statfs changes in the journal head to the master statfs file. Zero out the local statfs file for good measure. Signed-off-by: Abhi Das --- fs/gfs2/lops.c | 2 +- fs/gfs2/lops.h | 1 + fs/gfs2/recovery.c | 121 + 3 files changed, 123 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 53d2dbf6605e..061747b959c8 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -831,7 +831,7 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start, * */ -static void gfs2_meta_sync(struct gfs2_glock *gl) +void gfs2_meta_sync(struct gfs2_glock *gl) { struct address_space *mapping = gfs2_glock2aspace(gl); struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 9c5e4e491e03..4a3d8aecdf82 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -27,6 +27,7 @@ extern void gfs2_log_submit_bio(struct bio **biop, int opf); extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, bool keep_cache); +extern void gfs2_meta_sync(struct gfs2_glock *gl); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index a8bb17e355b8..428a0aad49c6 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -296,6 +296,126 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message); } +static int lookup_statfs_inodes(struct gfs2_jdesc *jd, struct inode **master, + struct inode **local) +{ + int error = 0; + char buf[30]; + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct inode *md = d_inode(sdp->sd_master_dir), *pn; + + *master = gfs2_lookup_simple(md, "statfs"); + if (IS_ERR(*master)) { + error = PTR_ERR(*master); + fs_err(sdp, "Can't read in statfs inode: %d\n", error); + goto out; + } + pn = gfs2_lookup_simple(md, "per_node"); + if (IS_ERR(pn)) { + error = PTR_ERR(pn); + 
fs_err(sdp, "Can't find per_node directory: %d\n", error); + goto put_m_ip; + } + sprintf(buf, "statfs_change%u", jd->jd_jid); + *local = gfs2_lookup_simple(pn, buf); + if (IS_ERR(*local)) { + error = PTR_ERR(*local); + fs_err(sdp, "Can't find local \"sc\" file for jid:%u: %d\n", + jd->jd_jid, error); + } + iput(pn); + if (!error) + return error; +put_m_ip: + iput(*master); +out: + return error; +} + +static int update_statfs_inode(struct gfs2_jdesc *jd, struct gfs2_inode *ip, + struct gfs2_log_header_host *head) +{ + /* +* If head is NULL, ip points to a local statfs file. +* zero out the statfs data in the inode pointed to by ip. +*/ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_statfs_change_host sc; + struct gfs2_holder gh; + struct buffer_head *bh; + int error = 0; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, &gh); + if (error) + goto out; + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + goto out_unlock; + + spin_lock(&sdp->sd_statfs_spin); + if (head) { + gfs2_statfs_change_in(&sc, bh->b_data + sizeof(struct gfs2_dinode)); + sc.sc_total += head->lh_local_total; + sc.sc_free += head->lh_local_free; + sc.sc_dinodes += head->lh_local_dinodes; + gfs2_statfs_change_out(&sc, bh->b_data + sizeof(struct gfs2_dinode)); + fs_info(sdp, "jid=%u: Updated master statfs Total:%lld, " + "Free:%lld, Dinodes:%lld after change " + "[%+lld,%+lld,%+lld]\n", jd->jd_jid, sc.sc_total, + sc.sc_free, sc.sc_dinodes, head->lh_local_total, + head->lh_local_free, head->lh_local_dinodes); + } else { + memset(bh->b_data + sizeof(struct gfs2_dinode), 0, + sizeof(struct gfs2_statfs_change)); + if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) { /* This node's journal */ + sdp->sd_statfs_local.sc_total = 0; + sdp->sd_statfs_local.sc_free = 0; + sdp->sd_statfs_local.sc_dinodes = 0; + } + } + spin_unlock(&sdp->sd_statfs_spin); +
[Cluster-devel] [GFS2 PATCH v2] gfs2: fix gfs2_find_jhead that returns uninitialized jhead with seq 0
Hi, This version changes the description as per discussion with Andreas. Cheers! --Abhi When the first log header in a journal happens to have a sequence number of 0, a bug in gfs2_find_jhead() causes it to prematurely exit, and return an uninitialized jhead with seq 0. This can cause failures in the caller. For instance, a mount fails in one test case. The correct behavior is for it to continue searching through the journal to find the correct journal head with the highest sequence number. Signed-off-by: Abhi Das --- fs/gfs2/lops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index d9431724b788..c090d5ad3f22 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -422,7 +422,7 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) { if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) { - if (lh.lh_sequence > head->lh_sequence) + if (lh.lh_sequence >= head->lh_sequence) *head = lh; else { ret = true; -- 2.20.1
[Cluster-devel] [GFS2 PATCH] gfs2: fix gfs2_find_jhead that returns jhead with seq 0
When the first log header in a journal happens to have a sequence number of 0, a bug in gfs2_find_jhead() causes it to return that header as the jhead, instead of seeking forward and looking further into the journal for a jhead with a higher sequence number. Signed-off-by: Abhi Das --- fs/gfs2/lops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index d9431724b788..c090d5ad3f22 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -422,7 +422,7 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) { if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) { - if (lh.lh_sequence > head->lh_sequence) + if (lh.lh_sequence >= head->lh_sequence) *head = lh; else { ret = true; -- 2.20.1
[Cluster-devel] [PATCH] gfs2: read journal in large chunks to locate the head
Use bio(s) to read in the journal sequentially in large chunks and locate the head of the journal. Signed-off-by: Abhi Das Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glops.c | 3 +- fs/gfs2/log.c| 4 +- fs/gfs2/lops.c | 212 +-- fs/gfs2/lops.h | 4 +- fs/gfs2/ops_fstype.c | 3 +- fs/gfs2/recovery.c | 125 + fs/gfs2/recovery.h | 2 - fs/gfs2/super.c | 5 +- 8 files changed, 219 insertions(+), 139 deletions(-) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 78510ab91835..24ada3ccc525 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -28,6 +28,7 @@ #include "util.h" #include "trans.h" #include "dir.h" +#include "lops.h" struct workqueue_struct *gfs2_freeze_wq; @@ -531,7 +532,7 @@ static int freeze_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); - error = gfs2_find_jhead(sdp->sd_jdesc, &head); + error = gfs2_find_jhead(sdp->sd_jdesc, &head, false); if (error) gfs2_consist(sdp); if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index a7febb4bd400..a2e1df488df0 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -744,7 +744,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, lh->lh_crc = cpu_to_be32(crc); gfs2_log_write(sdp, page, sb->s_blocksize, 0, dblock); - gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, op_flags); + gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE | op_flags); log_flush_wait(sdp); } @@ -821,7 +821,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) gfs2_ordered_write(sdp); lops_before_commit(sdp, tr); - gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, 0); + gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE); if (sdp->sd_log_head != sdp->sd_log_flush_head) { log_flush_wait(sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 6af6a3cea967..ce048a9e058d 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -17,7 +17,9 @@ #include #include 
#include +#include +#include "bmap.h" #include "dir.h" #include "gfs2.h" #include "incore.h" @@ -194,7 +196,6 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, /** * gfs2_end_log_write - end of i/o to the log * @bio: The bio - * @error: Status of i/o request * * Each bio_vec contains either data from the pagecache or data * relating to the log itself. Here we iterate over the bio_vec @@ -232,20 +233,19 @@ static void gfs2_end_log_write(struct bio *bio) /** * gfs2_log_submit_bio - Submit any pending log bio * @biop: Address of the bio pointer - * @op: REQ_OP - * @op_flags: req_flag_bits + * @opf: REQ_OP | op_flags * * Submit any pending part-built or full bio to the block device. If * there is no pending bio, then this is a no-op. */ -void gfs2_log_submit_bio(struct bio **biop, int op, int op_flags) +void gfs2_log_submit_bio(struct bio **biop, int opf) { struct bio *bio = *biop; if (bio) { struct gfs2_sbd *sdp = bio->bi_private; atomic_inc(&sdp->sd_log_in_flight); - bio_set_op_attrs(bio, op, op_flags); + bio->bi_opf = opf; submit_bio(bio); *biop = NULL; } @@ -306,7 +306,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno, nblk >>= sdp->sd_fsb2bb_shift; if (blkno == nblk && !flush) return bio; - gfs2_log_submit_bio(biop, op, 0); + gfs2_log_submit_bio(biop, op); } *biop = gfs2_log_alloc_bio(sdp, blkno, end_io); @@ -377,6 +377,206 @@ void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) gfs2_log_bmap(sdp)); } +/** + * gfs2_end_log_read - end I/O callback for reads from the log + * @bio: The bio + * + * Simply unlock the pages in the bio. The main thread will wait on them and + * process them in order as necessary. 
+ */ + +static void gfs2_end_log_read(struct bio *bio) +{ + struct page *page; + struct bio_vec *bvec; + int i; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, i, iter_all) { + page = bvec->bv_page; + if (bio->bi_status) { + int err = blk_status_to_errno(bio->bi_status); + + SetPageError(page); + mapping_set_error(page->mapping, err); + }
[Cluster-devel] [PATCH v2] gfs2: fix race between gfs2_freeze_func and unmount
As part of the freeze operation, gfs2_freeze_func() is left blocking on a request to hold the sd_freeze_gl in SH. This glock is held in EX by the gfs2_freeze() code. A subsequent call to gfs2_unfreeze() releases the EXclusively held sd_freeze_gl, which allows gfs2_freeze_func() to acquire it in SH and resume its operation. gfs2_unfreeze(), however, doesn't wait for gfs2_freeze_func() to complete. If a umount is issued right after unfreeze, it could result in an inconsistent filesystem because some journal data (statfs update) isn't written out. Refer to commit 24972557b12c for a more detailed explanation of how freeze/unfreeze work. This patch causes gfs2_unfreeze() to wait for gfs2_freeze_func() to complete before returning to the user. Signed-off-by: Abhi Das --- fs/gfs2/incore.h | 1 + fs/gfs2/super.c | 8 +--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 78c8e761b321..b15755068593 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -621,6 +621,7 @@ enum { SDF_SKIP_DLM_UNLOCK = 8, SDF_FORCE_AIL_FLUSH = 9, SDF_AIL1_IO_ERROR = 10, + SDF_FS_FROZEN = 11, }; enum gfs2_freeze_state { diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index a6a325b2a78b..ceec631efa49 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -973,8 +973,7 @@ void gfs2_freeze_func(struct work_struct *work) if (error) { printk(KERN_INFO "GFS2: couldn't get freeze lock : %d\n", error); gfs2_assert_withdraw(sdp, 0); - } - else { + } else { atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN); error = thaw_super(sb); if (error) { @@ -987,6 +986,8 @@ void gfs2_freeze_func(struct work_struct *work) gfs2_glock_dq_uninit(&freeze_gh); } deactivate_super(sb); + clear_bit_unlock(SDF_FS_FROZEN, &sdp->sd_flags); + wake_up_bit(&sdp->sd_flags, SDF_FS_FROZEN); return; } @@ -1029,6 +1030,7 @@ static int gfs2_freeze(struct super_block *sb) msleep(1000); } error = 0; + set_bit(SDF_FS_FROZEN, &sdp->sd_flags); out: mutex_unlock(&sdp->sd_freeze_mutex); return 
error; @@ -1053,7 +1055,7 @@ static int gfs2_unfreeze(struct super_block *sb) gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); mutex_unlock(&sdp->sd_freeze_mutex); - return 0; + return wait_on_bit(&sdp->sd_flags, SDF_FS_FROZEN, TASK_INTERRUPTIBLE); } /** -- 2.20.1
[Cluster-devel] [PATCH] gfs2: fix race between gfs2_freeze_func and unmount
gfs2_unfreeze() doesn't wait for gfs2_freeze_func() to complete. If a umount is issued right after unfreeze, it could result in an inconsistent filesystem because some journal data (statfs update) wasn't written out. This patch causes gfs2_unfreeze() to wait for gfs2_freeze_func() to complete before returning to the user. Signed-off-by: Abhi Das --- fs/gfs2/incore.h | 1 + fs/gfs2/super.c | 8 +--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 78c8e761b321..b15755068593 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -621,6 +621,7 @@ enum { SDF_SKIP_DLM_UNLOCK = 8, SDF_FORCE_AIL_FLUSH = 9, SDF_AIL1_IO_ERROR = 10, + SDF_FS_FROZEN = 11, }; enum gfs2_freeze_state { diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index a6a325b2a78b..a81d7a5afe39 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -973,8 +973,7 @@ void gfs2_freeze_func(struct work_struct *work) if (error) { printk(KERN_INFO "GFS2: couldn't get freeze lock : %d\n", error); gfs2_assert_withdraw(sdp, 0); - } - else { + } else { atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN); error = thaw_super(sb); if (error) { @@ -987,6 +986,8 @@ void gfs2_freeze_func(struct work_struct *work) gfs2_glock_dq_uninit(&freeze_gh); } deactivate_super(sb); + clear_bit(SDF_FS_FROZEN, &sdp->sd_flags); + wake_up_bit(&sdp->sd_flags, SDF_FS_FROZEN); return; } @@ -1029,6 +1030,7 @@ static int gfs2_freeze(struct super_block *sb) msleep(1000); } error = 0; + set_bit(SDF_FS_FROZEN, &sdp->sd_flags); out: mutex_unlock(&sdp->sd_freeze_mutex); return error; @@ -1053,7 +1055,7 @@ static int gfs2_unfreeze(struct super_block *sb) gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); mutex_unlock(&sdp->sd_freeze_mutex); - return 0; + return wait_on_bit(&sdp->sd_flags, SDF_FS_FROZEN, TASK_INTERRUPTIBLE); } /** -- 2.20.1
[Cluster-devel] [GFS2 PATCH] Revert "gfs2: read journal in large chunks to locate the head"
This reverts commit 2a5f14f279f59143139bcd1606903f2f80a34241. This patch causes xfstests generic/311 to fail. Reverting this for now until we have a proper fix. Signed-off-by: Abhi Das --- fs/gfs2/glops.c | 1 - fs/gfs2/log.c| 4 +- fs/gfs2/lops.c | 190 ++- fs/gfs2/lops.h | 4 +- fs/gfs2/ops_fstype.c | 1 - fs/gfs2/recovery.c | 123 fs/gfs2/recovery.h | 2 + fs/gfs2/super.c | 1 - 8 files changed, 134 insertions(+), 192 deletions(-) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index f15b4c57c4bd..78510ab91835 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -28,7 +28,6 @@ #include "util.h" #include "trans.h" #include "dir.h" -#include "lops.h" struct workqueue_struct *gfs2_freeze_wq; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 5bfaf381921a..b8830fda51e8 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -733,7 +733,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, lh->lh_crc = cpu_to_be32(crc); gfs2_log_write(sdp, page, sb->s_blocksize, 0, addr); - gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE | op_flags); + gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, op_flags); log_flush_wait(sdp); } @@ -810,7 +810,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) gfs2_ordered_write(sdp); lops_before_commit(sdp, tr); - gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE); + gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, 0); if (sdp->sd_log_head != sdp->sd_log_flush_head) { log_flush_wait(sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 94dcab655bc0..2295042bc625 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -17,9 +17,7 @@ #include #include #include -#include -#include "bmap.h" #include "dir.h" #include "gfs2.h" #include "incore.h" @@ -195,6 +193,7 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, /** * gfs2_end_log_write - end of i/o to the log * @bio: The bio + * @error: Status of i/o request * * Each bio_vec contains either data from the pagecache or data * 
relating to the log itself. Here we iterate over the bio_vec @@ -231,19 +230,20 @@ static void gfs2_end_log_write(struct bio *bio) /** * gfs2_log_submit_bio - Submit any pending log bio * @biop: Address of the bio pointer - * @opf: REQ_OP | op_flags + * @op: REQ_OP + * @op_flags: req_flag_bits * * Submit any pending part-built or full bio to the block device. If * there is no pending bio, then this is a no-op. */ -void gfs2_log_submit_bio(struct bio **biop, int opf) +void gfs2_log_submit_bio(struct bio **biop, int op, int op_flags) { struct bio *bio = *biop; if (bio) { struct gfs2_sbd *sdp = bio->bi_private; atomic_inc(&sdp->sd_log_in_flight); - bio->bi_opf = opf; + bio_set_op_attrs(bio, op, op_flags); submit_bio(bio); *biop = NULL; } @@ -304,7 +304,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno, nblk >>= sdp->sd_fsb2bb_shift; if (blkno == nblk && !flush) return bio; - gfs2_log_submit_bio(biop, op); + gfs2_log_submit_bio(biop, op, 0); } *biop = gfs2_log_alloc_bio(sdp, blkno, end_io); @@ -375,184 +375,6 @@ void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) gfs2_log_bmap(sdp)); } -/** - * gfs2_end_log_read - end I/O callback for reads from the log - * @bio: The bio - * - * Simply unlock the pages in the bio. The main thread will wait on them and - * process them in order as necessary. - */ - -static void gfs2_end_log_read(struct bio *bio) -{ - struct page *page; - struct bio_vec *bvec; - int i; - - bio_for_each_segment_all(bvec, bio, i) { - page = bvec->bv_page; - if (bio->bi_status) { - int err = blk_status_to_errno(bio->bi_status); - - SetPageError(page); - mapping_set_error(page->mapping, err); - } - unlock_page(page); - } - - bio_put(bio); -} - -/** - * gfs2_jhead_pg_srch - Look for the journal head in a given page. - * @jd: The journal descriptor - * @page: The page to look in - * - * Returns: 1 if found, 0 otherwise. 
- */ - -static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, - struct gfs2_log_header_host *head, - struct page *page) -{ - struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); - struct gfs2_log
[Cluster-devel] [GFS2 v2 PATCH 4/4] gfs2: read journal in large chunks to locate the head
Use bio(s) to read in the journal sequentially in large chunks and locate the head of the journal. This version addresses the issues Christoph pointed out w.r.t error handling and using deprecated API. Signed-off-by: Abhi Das Signed-off-by: Andreas Gruenbacher Cc: Christoph Hellwig --- fs/gfs2/glops.c | 1 + fs/gfs2/log.c| 4 +- fs/gfs2/lops.c | 190 +-- fs/gfs2/lops.h | 4 +- fs/gfs2/ops_fstype.c | 1 + fs/gfs2/recovery.c | 123 - fs/gfs2/recovery.h | 2 - fs/gfs2/super.c | 1 + 8 files changed, 192 insertions(+), 134 deletions(-) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index c63bee9..f79ef95 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -28,6 +28,7 @@ #include "util.h" #include "trans.h" #include "dir.h" +#include "lops.h" struct workqueue_struct *gfs2_freeze_wq; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 93a94df..c68a829 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -734,7 +734,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, lh->lh_crc = cpu_to_be32(crc); gfs2_log_write(sdp, page, sb->s_blocksize, 0, addr); - gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, op_flags); + gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE | op_flags); log_flush_wait(sdp); } @@ -811,7 +811,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) gfs2_ordered_write(sdp); lops_before_commit(sdp, tr); - gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, 0); + gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE); if (sdp->sd_log_head != sdp->sd_log_flush_head) { log_flush_wait(sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 2295042..94dcab6 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -17,7 +17,9 @@ #include #include #include +#include +#include "bmap.h" #include "dir.h" #include "gfs2.h" #include "incore.h" @@ -193,7 +195,6 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, /** * gfs2_end_log_write - end of i/o to the log * @bio: The bio - * @error: Status of i/o 
request * * Each bio_vec contains either data from the pagecache or data * relating to the log itself. Here we iterate over the bio_vec @@ -230,20 +231,19 @@ static void gfs2_end_log_write(struct bio *bio) /** * gfs2_log_submit_bio - Submit any pending log bio * @biop: Address of the bio pointer - * @op: REQ_OP - * @op_flags: req_flag_bits + * @opf: REQ_OP | op_flags * * Submit any pending part-built or full bio to the block device. If * there is no pending bio, then this is a no-op. */ -void gfs2_log_submit_bio(struct bio **biop, int op, int op_flags) +void gfs2_log_submit_bio(struct bio **biop, int opf) { struct bio *bio = *biop; if (bio) { struct gfs2_sbd *sdp = bio->bi_private; atomic_inc(&sdp->sd_log_in_flight); - bio_set_op_attrs(bio, op, op_flags); + bio->bi_opf = opf; submit_bio(bio); *biop = NULL; } @@ -304,7 +304,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno, nblk >>= sdp->sd_fsb2bb_shift; if (blkno == nblk && !flush) return bio; - gfs2_log_submit_bio(biop, op, 0); + gfs2_log_submit_bio(biop, op); } *biop = gfs2_log_alloc_bio(sdp, blkno, end_io); @@ -375,6 +375,184 @@ void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) gfs2_log_bmap(sdp)); } +/** + * gfs2_end_log_read - end I/O callback for reads from the log + * @bio: The bio + * + * Simply unlock the pages in the bio. The main thread will wait on them and + * process them in order as necessary. + */ + +static void gfs2_end_log_read(struct bio *bio) +{ + struct page *page; + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + page = bvec->bv_page; + if (bio->bi_status) { + int err = blk_status_to_errno(bio->bi_status); + + SetPageError(page); + mapping_set_error(page->mapping, err); + } + unlock_page(page); + } + + bio_put(bio); +} + +/** + * gfs2_jhead_pg_srch - Look for the journal head in a given page. + * @jd: The journal descriptor + * @page: The page to look in + * + * Returns: 1 if found, 0 otherwise. 
+ */ + +static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, + struct page
[Cluster-devel] [GFS2 PATCH 4/4] gfs2: read journal in large chunks to locate the head
Use bio(s) to read in the journal sequentially in large chunks and locate the head of the journal. Signed-off-by: Abhi Das Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glops.c | 1 + fs/gfs2/lops.c | 167 ++- fs/gfs2/lops.h | 2 + fs/gfs2/ops_fstype.c | 1 + fs/gfs2/recovery.c | 123 - fs/gfs2/recovery.h | 2 - fs/gfs2/super.c | 1 + 7 files changed, 171 insertions(+), 126 deletions(-) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index c63bee9..f79ef95 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -28,6 +28,7 @@ #include "util.h" #include "trans.h" #include "dir.h" +#include "lops.h" struct workqueue_struct *gfs2_freeze_wq; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 2295042..568b6cc 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -18,6 +18,7 @@ #include #include +#include "bmap.h" #include "dir.h" #include "gfs2.h" #include "incore.h" @@ -193,7 +194,6 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct bio_vec *bvec, /** * gfs2_end_log_write - end of i/o to the log * @bio: The bio - * @error: Status of i/o request * * Each bio_vec contains either data from the pagecache or data * relating to the log itself. Here we iterate over the bio_vec @@ -375,6 +375,171 @@ void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) gfs2_log_bmap(sdp)); } +/** + * gfs2_end_log_read - end I/O callback for reads from the log + * @bio: The bio + * + * Simply unlock the pages in the bio. The main thread will wait on them and + * process them in order as necessary. + */ + +static void gfs2_end_log_read(struct bio *bio) +{ + struct gfs2_sbd *sdp = bio->bi_private; + struct page *page; + struct bio_vec *bvec; + int i; + + if (bio->bi_status) + fs_err(sdp, "Error %d reading from journal\n", bio->bi_status); + + bio_for_each_segment_all(bvec, bio, i) + unlock_page(bvec->bv_page); + + bio_put(bio); +} + +/** + * gfs2_jhead_pg_srch - Look for the journal head in a given page. 
+ * @jd: The journal descriptor + * @page: The page to look in + * + * Returns: 1 if found, 0 otherwise. + */ + +static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, + struct page *page) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_log_header_host uninitialized_var(lh); + void *kaddr = kmap_atomic(page); + unsigned int offset; + bool ret = false; + + for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) { + if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) { + if (lh.lh_sequence > head->lh_sequence) + *head = lh; + else { + ret = true; + break; + } + } + } + kunmap_atomic(kaddr); + return ret; +} + +/** + * gfs2_jhead_process_page - Search/cleanup a page + * @jd: The journal descriptor + * @index: Index of the page to look into + * @done: If set, perform only cleanup, else search and set if found. + * + * Find the page with 'index' in the journal's mapping. Search the page for + * the journal head if requested (cleanup == false). Release refs on the + * page so the page cache can reclaim it (put_page() twice). We grabbed a + * reference on this page two times, first when we did a find_or_create_page() + * to obtain the page to add it to the bio and second when we do a + * find_get_page() here to get the page to wait on while I/O on it is being + * completed. + * This function is also used to free up a page we might've grabbed but not + * used. Maybe we added it to a bio, but not submitted it for I/O. Or we + * submitted the I/O, but we already found the jhead so we only need to drop + * our references to the page. 
+ */ + +static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index, + struct gfs2_log_header_host *head, + bool *done) +{ + struct page *page; + + page = find_get_page(jd->jd_inode->i_mapping, index); + wait_on_page_locked(page); + + if (!*done) + *done = gfs2_jhead_pg_srch(jd, head, page); + + put_page(page); /* Once for find_get_page */ + put_page(page); /* Once more for find_or_create_page */ +} + +/** + * gfs2_find_jhead - find the head of a log + * @jd: The journal descriptor + * @head: The log descriptor for the head of the log is returned here + * + * Do a search of a journal by reading it in large chunk
[Cluster-devel] [GFS2 PATCH 3/4] gfs2: add a helper function to get_log_header that can be used elsewhere
Move and re-order the error checks and hash/crc computations into another function __get_log_header() so it can be used in scenarios where buffer_heads are not being used for the log header. Signed-off-by: Abhi Das --- fs/gfs2/recovery.c | 53 - fs/gfs2/recovery.h | 2 ++ 2 files changed, 34 insertions(+), 21 deletions(-) diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index b0717a0..2dac430 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -120,6 +120,35 @@ void gfs2_revoke_clean(struct gfs2_jdesc *jd) } } +int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh, +unsigned int blkno, struct gfs2_log_header_host *head) +{ + u32 hash, crc; + + if (lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) || + lh->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH) || + (blkno && be32_to_cpu(lh->lh_blkno) != blkno)) + return 1; + + hash = crc32(~0, lh, LH_V1_SIZE - 4); + hash = ~crc32_le_shift(hash, 4); /* assume lh_hash is zero */ + + if (be32_to_cpu(lh->lh_hash) != hash) + return 1; + + crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4, +sdp->sd_sb.sb_bsize - LH_V1_SIZE - 4); + + if ((lh->lh_crc != 0 && be32_to_cpu(lh->lh_crc) != crc)) + return 1; + + head->lh_sequence = be64_to_cpu(lh->lh_sequence); + head->lh_flags = be32_to_cpu(lh->lh_flags); + head->lh_tail = be32_to_cpu(lh->lh_tail); + head->lh_blkno = be32_to_cpu(lh->lh_blkno); + + return 0; +} /** * get_log_header - read the log header for a given segment * @jd: the journal @@ -137,36 +166,18 @@ void gfs2_revoke_clean(struct gfs2_jdesc *jd) static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk, struct gfs2_log_header_host *head) { - struct gfs2_log_header *lh; + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct buffer_head *bh; - u32 hash, crc; int error; error = gfs2_replay_read_block(jd, blk, &bh); if (error) return error; - lh = (void *)bh->b_data; - - hash = crc32(~0, lh, LH_V1_SIZE - 4); - hash = ~crc32_le_shift(hash, 4); /* assume lh_hash is zero */ - - crc = 
crc32c(~0, (void *)lh + LH_V1_SIZE + 4, -bh->b_size - LH_V1_SIZE - 4); - - error = lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) || - lh->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH) || - be32_to_cpu(lh->lh_blkno) != blk || - be32_to_cpu(lh->lh_hash) != hash || - (lh->lh_crc != 0 && be32_to_cpu(lh->lh_crc) != crc); + error = __get_log_header(sdp, (const struct gfs2_log_header *)bh->b_data, +blk, head); brelse(bh); - if (!error) { - head->lh_sequence = be64_to_cpu(lh->lh_sequence); - head->lh_flags = be32_to_cpu(lh->lh_flags); - head->lh_tail = be32_to_cpu(lh->lh_tail); - head->lh_blkno = be32_to_cpu(lh->lh_blkno); - } return error; } diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index 11fdfab..943a67c 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -31,6 +31,8 @@ extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head); extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait); extern void gfs2_recover_func(struct work_struct *work); +extern int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh, + unsigned int blkno, struct gfs2_log_header_host *head); #endif /* __RECOVERY_DOT_H__ */ -- 2.4.11
[Cluster-devel] [GFS2 PATCH 2/4] gfs2: changes to gfs2_log_XXX_bio
Change gfs2_log_XXX_bio family of functions so they can be used with different bios, not just sdp->sd_log_bio. This patch also contains some clean up suggested by Andreas. Signed-off-by: Abhi Das Signed-off-by: Andreas Gruenbacher --- fs/gfs2/log.c | 4 ++-- fs/gfs2/lops.c | 73 +++--- fs/gfs2/lops.h | 2 +- 3 files changed, 42 insertions(+), 37 deletions(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 96706a2..93a94df 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -734,7 +734,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, lh->lh_crc = cpu_to_be32(crc); gfs2_log_write(sdp, page, sb->s_blocksize, 0, addr); - gfs2_log_flush_bio(sdp, REQ_OP_WRITE, op_flags); + gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, op_flags); log_flush_wait(sdp); } @@ -811,7 +811,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) gfs2_ordered_write(sdp); lops_before_commit(sdp, tr); - gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0); + gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE, 0); if (sdp->sd_log_head != sdp->sd_log_flush_head) { log_flush_wait(sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 4c7069b..2295042 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -228,8 +228,8 @@ static void gfs2_end_log_write(struct bio *bio) } /** - * gfs2_log_flush_bio - Submit any pending log bio - * @sdp: The superblock + * gfs2_log_submit_bio - Submit any pending log bio + * @biop: Address of the bio pointer * @op: REQ_OP * @op_flags: req_flag_bits * @@ -237,74 +237,78 @@ static void gfs2_end_log_write(struct bio *bio) * there is no pending bio, then this is a no-op. 
*/ -void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int op, int op_flags) +void gfs2_log_submit_bio(struct bio **biop, int op, int op_flags) { - if (sdp->sd_log_bio) { + struct bio *bio = *biop; + if (bio) { + struct gfs2_sbd *sdp = bio->bi_private; atomic_inc(&sdp->sd_log_in_flight); - bio_set_op_attrs(sdp->sd_log_bio, op, op_flags); - submit_bio(sdp->sd_log_bio); - sdp->sd_log_bio = NULL; + bio_set_op_attrs(bio, op, op_flags); + submit_bio(bio); + *biop = NULL; } } /** - * gfs2_log_alloc_bio - Allocate a new bio for log writing - * @sdp: The superblock - * @blkno: The next device block number we want to write to + * gfs2_log_alloc_bio - Allocate a bio + * @sdp: The super block + * @blkno: The device block number we want to write to + * @end_io: The bi_end_io callback * - * This should never be called when there is a cached bio in the - * super block. When it returns, there will be a cached bio in the - * super block which will have as many bio_vecs as the device is - * happy to handle. + * Allocate a new bio, initialize it with the given parameters and return it. 
* - * Returns: Newly allocated bio + * Returns: The newly allocated bio */ -static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) +static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno, + bio_end_io_t *end_io) { struct super_block *sb = sdp->sd_vfs; - struct bio *bio; + struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); - BUG_ON(sdp->sd_log_bio); - - bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9); bio_set_dev(bio, sb->s_bdev); - bio->bi_end_io = gfs2_end_log_write; + bio->bi_end_io = end_io; bio->bi_private = sdp; - sdp->sd_log_bio = bio; - return bio; } /** * gfs2_log_get_bio - Get cached log bio, or allocate a new one - * @sdp: The superblock + * @sdp: The super block * @blkno: The device block number we want to write to + * @bio: The bio to get or allocate + * @op: REQ_OP + * @end_io: The bi_end_io callback + * @flush: Always flush the current bio and allocate a new one? * * If there is a cached bio, then if the next block number is sequential * with the previous one, return it, otherwise flush the bio to the - * device. If there is not a cached bio, or we just flushed it, then + * device. If there is no cached bio, or we just flushed it, then * allocate a new one. * * Returns: The bio to use for log writes */ -static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno) +static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno, + struct bio **biop, int op, + bio_end_io_t *end_io, bool flush) { - struct bio *bio = sdp->sd_log_bio; - u64 nblk; + struct bio *bio = *biop; if (bio) { +
[Cluster-devel] [GFS2 PATCH 1/4] gfs2: add more timing info to journal recovery process
Tells you how many milliseconds map_journal_extents and find_jhead take. Signed-off-by: Abhi Das --- fs/gfs2/bmap.c | 8 ++-- fs/gfs2/recovery.c | 2 ++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 5f3ea07..aaf3682 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -2248,7 +2249,9 @@ int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) unsigned int shift = sdp->sd_sb.sb_bsize_shift; u64 size; int rc; + ktime_t start, end; + start = ktime_get(); lblock_stop = i_size_read(jd->jd_inode) >> shift; size = (lblock_stop - lblock) << shift; jd->nr_extents = 0; @@ -2268,8 +2271,9 @@ int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) lblock += (bh.b_size >> ip->i_inode.i_blkbits); } while(size > 0); - fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid, - jd->nr_extents); + end = ktime_get(); + fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid, + jd->nr_extents, ktime_ms_delta(end, start)); return 0; fail: diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 0f501f9..b0717a0 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -460,6 +460,8 @@ void gfs2_recover_func(struct work_struct *work) if (error) goto fail_gunlock_ji; t_jhd = ktime_get(); + fs_info(sdp, "jid=%u: Journal head lookup took %lldms\n", jd->jd_jid, + ktime_ms_delta(t_jhd, t_jlck)); if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n", -- 2.4.11
[Cluster-devel] [GFS2 PATCH 0/4] jhead lookup using bios
This is my latest version of this patchset based on inputs from Andreas and Steve. We readahead the journal sequentially in large chunks using bios. Pagecache pages for the journal inode's mapping are used for the I/O. There's also some cleanup of the bio functions with this patchset. xfstests ran to completion with this. Abhi Das (4): gfs2: add more timing info to journal recovery process gfs2: changes to gfs2_log_XXX_bio gfs2: add a helper function to get_log_header that can be used elsewhere gfs2: read journal in large chunks to locate the head fs/gfs2/bmap.c | 8 +- fs/gfs2/glops.c | 1 + fs/gfs2/log.c| 4 +- fs/gfs2/lops.c | 240 +++ fs/gfs2/lops.h | 4 +- fs/gfs2/ops_fstype.c | 1 + fs/gfs2/recovery.c | 178 -- fs/gfs2/recovery.h | 4 +- fs/gfs2/super.c | 1 + 9 files changed, 255 insertions(+), 186 deletions(-) -- 2.4.11
[Cluster-devel] [GFS2 RFC PATCH 3/3] gfs2: introduce bio_pool to readahead journal to find jhead
This patch adds a new data structure called bio_pool. This is basically a dynamically allocated array of struct bio* and associated variables to manage this data structure. The array is used in a circular fashion until the entire array has bios that are in flight. i.e. they need to be waited on and consumed upon completion, in order to make room for more. To locate the journal head, we read the journal sequentially from the beginning, creating bios and submitting them as necessary. We wait for these inflight bios in the order we submit them even though the block layer may complete them out of order. This strict ordering allows us to determine the journal head without having to do extra reads. A tunable allows us to configure the size of the bio_pool. Signed-off-by: Abhi Das --- fs/gfs2/incore.h | 3 + fs/gfs2/lops.c | 359 +++ fs/gfs2/lops.h | 1 + fs/gfs2/ops_fstype.c | 2 + fs/gfs2/recovery.c | 116 ++--- fs/gfs2/sys.c| 27 ++-- 6 files changed, 391 insertions(+), 117 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index b96d39c..424687f 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -542,6 +542,8 @@ struct gfs2_jdesc { int jd_recover_error; /* Replay stuff */ + struct gfs2_log_header_host jd_jhead; + struct mutex jd_jh_mutex; unsigned int jd_found_blocks; unsigned int jd_found_revokes; unsigned int jd_replayed_blocks; @@ -610,6 +612,7 @@ struct gfs2_tune { unsigned int gt_complain_secs; unsigned int gt_statfs_quantum; unsigned int gt_statfs_slow; + unsigned int gt_bio_pool_size; /* No of bios to use for the bio_pool */ }; enum { diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index f2567f9..69fc058 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -18,6 +18,7 @@ #include #include +#include "bmap.h" #include "dir.h" #include "gfs2.h" #include "incore.h" @@ -370,6 +371,364 @@ void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) gfs2_log_bmap(sdp)); } +/* + * The bio_pool structure is an array of bios of length 'size'. 
+ * 'cur' is the index of the next bio to be submitted for I/O. + * 'wait' is the index of bio we need to wait on for I/O completion. + * 'inflight' is the number of bios submitted, but not yet completed. + */ +struct bio_pool { + struct bio **bios; + unsigned int size; + unsigned int cur; + unsigned int wait; + unsigned int inflight; +}; +typedef int (search_bio_t) (struct gfs2_jdesc *jd, const void *ptr); + +/** + * bio_pool_submit_bio - Submit the current bio in the pool + * + * @pool: The bio pool + * + * Submit the current bio (pool->bios[pool->cur]) and update internal pool + * management variables. If pool->inflight == pool->size, we've maxed out all + * the bios in our pool and the caller needs to wait on some bios, process and + * free them so new ones can be added. + * + * Returns: 1 if we maxed out our bios, 0, otherwise + */ + +static int bio_pool_submit_bio(struct bio_pool *pool) +{ + int ret = 0; + BUG_ON(!pool || !pool->bios || !pool->bios[pool->cur]); + + bio_set_op_attrs(pool->bios[pool->cur], REQ_OP_READ, 0); + submit_bio(pool->bios[pool->cur]); + pool->cur = pool->cur == pool->size - 1 ? 0 : pool->cur + 1; + pool->inflight++; + if (pool->inflight == pool->size) + ret = 1; + return ret; +} + +/** + * bio_pool_get_cur - Do what's necessary to get a valid bio for the caller. + * + * @pool: The bio pool + * @sdp: The gfs2 superblock + * @blkno: The block number we wish to add to a bio + * @end_io: The end_io completion callback + * + * If there's no currently active bio, we allocate one for the blkno and return. + * + * If there's an active bio at pool->bios[pool->cur], we check if the requested + * block maybe to tacked onto it. If yes, we do nothing and return. + * + * If the block can't be added (non-contiguous), we submit the current bio. + * pool->cur, pool->inflight will change and we fall through to allocate a new + * bio and return. 
In this case, it is possible that submitting the current bio + * has maxed out our readahead (bio_pool_submit_bio() returns 1). We pass this + * error code back to the caller. + * + * Returns: 1 if bio_pool_submit_bio() maxed readahead, else 0. + */ + +static int bio_pool_get_cur(struct bio_pool *pool, struct gfs2_sbd *sdp, + u64 blkno, bio_end_io_t end_io, void *private) +{ + struct super_block *sb = sdp->sd_vfs; + struct bio *bio; + int ret = 0; + + BUG_ON(!pool || !pool->bios); + + if (pool->bios[pool->cur]) { + u64 nblk; +
[Cluster-devel] [GFS2 RFC PATCH 0/3] Locating jhead using a pool of bios
Following Andreas' suggestions, this patchset takes a somewhat different approach from the previous attempts to find the journal head. It uses a pool of bios to maintain a readahead queue of sorts that allows us to process the completed bios in sequential order to locate the jhead. I've done a little bit of testing and it seems to be holding up so far. I plan to do more testing. I haven't done a performance analysis vs the old method yet, so I don't know how well this performs. There might be some optimizations we can do w.r.t. repeated allocations and such. Abhi Das (3): gfs2: add more timing info to the journal recovery process gfs2: add a helper function to get_log_header that can be used elsewhere gfs2: introduce bio_pool to readahead journal to find jhead fs/gfs2/bmap.c | 8 +- fs/gfs2/incore.h | 3 + fs/gfs2/lops.c | 359 +++ fs/gfs2/lops.h | 1 + fs/gfs2/ops_fstype.c | 2 + fs/gfs2/recovery.c | 171 ++-- fs/gfs2/recovery.h | 2 + fs/gfs2/sys.c| 27 ++-- 8 files changed, 433 insertions(+), 140 deletions(-) -- 2.4.11
[Cluster-devel] [GFS2 RFC PATCH 1/3] gfs2: add more timing info to the journal recovery process
Time the gfs2_map_journal_extents() function and the journal head lookup, and report how long each takes in milliseconds. Signed-off-by: Abhi Das --- fs/gfs2/bmap.c | 8 ++-- fs/gfs2/recovery.c | 2 ++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 03128ed..dddb5a4 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -2248,7 +2249,9 @@ int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) unsigned int shift = sdp->sd_sb.sb_bsize_shift; u64 size; int rc; + ktime_t start, end; + start = ktime_get(); lblock_stop = i_size_read(jd->jd_inode) >> shift; size = (lblock_stop - lblock) << shift; jd->nr_extents = 0; @@ -2268,8 +2271,9 @@ int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) lblock += (bh.b_size >> ip->i_inode.i_blkbits); } while(size > 0); - fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid, - jd->nr_extents); + end = ktime_get(); + fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid, + jd->nr_extents, ktime_ms_delta(end, start)); return 0; fail: diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 0f501f9..b0717a0 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -460,6 +460,8 @@ void gfs2_recover_func(struct work_struct *work) if (error) goto fail_gunlock_ji; t_jhd = ktime_get(); + fs_info(sdp, "jid=%u: Journal head lookup took %lldms\n", jd->jd_jid, + ktime_ms_delta(t_jhd, t_jlck)); if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n", -- 2.4.11
[Cluster-devel] [GFS2 RFC PATCH 2/3] gfs2: add a helper function to get_log_header that can be used elsewhere
Move and re-order the error checks and hash/crc computations into another function __get_log_header() so it can be used in scenarios where buffer_heads are not being used for the log header. Signed-off-by: Abhi Das --- fs/gfs2/recovery.c | 53 - fs/gfs2/recovery.h | 2 ++ 2 files changed, 34 insertions(+), 21 deletions(-) diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index b0717a0..2dac430 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -120,6 +120,35 @@ void gfs2_revoke_clean(struct gfs2_jdesc *jd) } } +int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh, +unsigned int blkno, struct gfs2_log_header_host *head) +{ + u32 hash, crc; + + if (lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) || + lh->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH) || + (blkno && be32_to_cpu(lh->lh_blkno) != blkno)) + return 1; + + hash = crc32(~0, lh, LH_V1_SIZE - 4); + hash = ~crc32_le_shift(hash, 4); /* assume lh_hash is zero */ + + if (be32_to_cpu(lh->lh_hash) != hash) + return 1; + + crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4, +sdp->sd_sb.sb_bsize - LH_V1_SIZE - 4); + + if ((lh->lh_crc != 0 && be32_to_cpu(lh->lh_crc) != crc)) + return 1; + + head->lh_sequence = be64_to_cpu(lh->lh_sequence); + head->lh_flags = be32_to_cpu(lh->lh_flags); + head->lh_tail = be32_to_cpu(lh->lh_tail); + head->lh_blkno = be32_to_cpu(lh->lh_blkno); + + return 0; +} /** * get_log_header - read the log header for a given segment * @jd: the journal @@ -137,36 +166,18 @@ void gfs2_revoke_clean(struct gfs2_jdesc *jd) static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk, struct gfs2_log_header_host *head) { - struct gfs2_log_header *lh; + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct buffer_head *bh; - u32 hash, crc; int error; error = gfs2_replay_read_block(jd, blk, &bh); if (error) return error; - lh = (void *)bh->b_data; - - hash = crc32(~0, lh, LH_V1_SIZE - 4); - hash = ~crc32_le_shift(hash, 4); /* assume lh_hash is zero */ - - crc = 
crc32c(~0, (void *)lh + LH_V1_SIZE + 4, -bh->b_size - LH_V1_SIZE - 4); - - error = lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) || - lh->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH) || - be32_to_cpu(lh->lh_blkno) != blk || - be32_to_cpu(lh->lh_hash) != hash || - (lh->lh_crc != 0 && be32_to_cpu(lh->lh_crc) != crc); + error = __get_log_header(sdp, (const struct gfs2_log_header *)bh->b_data, +blk, head); brelse(bh); - if (!error) { - head->lh_sequence = be64_to_cpu(lh->lh_sequence); - head->lh_flags = be32_to_cpu(lh->lh_flags); - head->lh_tail = be32_to_cpu(lh->lh_tail); - head->lh_blkno = be32_to_cpu(lh->lh_blkno); - } return error; } diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index 11fdfab..943a67c 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -31,6 +31,8 @@ extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head); extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait); extern void gfs2_recover_func(struct work_struct *work); +extern int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh, + unsigned int blkno, struct gfs2_log_header_host *head); #endif /* __RECOVERY_DOT_H__ */ -- 2.4.11
[Cluster-devel] [GFS2 v2 PATCH 0/4] Speed up journal head lookup
This is a revised version of the patchset I'd posted a few days ago. It contains fixes and some cleanup suggested by Andreas and Bob. It is slightly different in parts from the rhel7 patchset I'd posted originally, owing to some bits already being present and the hash/crc computation code being different due to the updated log header structure. Cheers! --Abhi Abhi Das (4): gfs2: add timing info to map_journal_extents gfs2: changes to gfs2_log_XXX_bio gfs2: add a helper function to get_log_header that can be used elsewhere gfs2: read journal in large chunks to locate the head fs/gfs2/bmap.c | 8 ++- fs/gfs2/incore.h | 8 ++- fs/gfs2/log.c| 4 +- fs/gfs2/lops.c | 180 +-- fs/gfs2/lops.h | 3 +- fs/gfs2/ops_fstype.c | 1 + fs/gfs2/recovery.c | 168 --- fs/gfs2/recovery.h | 2 + 8 files changed, 194 insertions(+), 180 deletions(-) -- 2.4.11
[Cluster-devel] [GFS2 v2 PATCH 2/4] gfs2: changes to gfs2_log_XXX_bio
Change gfs2_log_XXX_bio family of functions so they can be used with read operations also. This patch also contains some clean up and coalescing of the above functions suggested by Andreas. Signed-off-by: Abhi Das Signed-off-by: Andreas Gruenbacher --- fs/gfs2/log.c | 4 +-- fs/gfs2/lops.c | 86 ++ fs/gfs2/lops.h | 2 +- 3 files changed, 41 insertions(+), 51 deletions(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index ee20ea42..b80fb30 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -731,7 +731,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, lh->lh_crc = cpu_to_be32(crc); gfs2_log_write(sdp, page, sb->s_blocksize, 0, addr); - gfs2_log_flush_bio(sdp, REQ_OP_WRITE, op_flags); + gfs2_log_flush_bio(&sdp->sd_log_bio, REQ_OP_WRITE, op_flags); log_flush_wait(sdp); } @@ -808,7 +808,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) gfs2_ordered_write(sdp); lops_before_commit(sdp, tr); - gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0); + gfs2_log_flush_bio(&sdp->sd_log_bio, REQ_OP_WRITE, 0); if (sdp->sd_log_head != sdp->sd_log_flush_head) { log_flush_wait(sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index f2567f9..f5f31a6 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -229,7 +229,7 @@ static void gfs2_end_log_write(struct bio *bio) /** * gfs2_log_flush_bio - Submit any pending log bio - * @sdp: The superblock + * @biop: Address of the bio pointer * @op: REQ_OP * @op_flags: req_flag_bits * @@ -237,74 +237,61 @@ static void gfs2_end_log_write(struct bio *bio) * there is no pending bio, then this is a no-op. 
*/ -void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int op, int op_flags) +void gfs2_log_flush_bio(struct bio **biop, int op, int op_flags) { - if (sdp->sd_log_bio) { + struct bio *bio = *biop; + if (bio) { + struct gfs2_sbd *sdp = bio->bi_private; atomic_inc(&sdp->sd_log_in_flight); - bio_set_op_attrs(sdp->sd_log_bio, op, op_flags); - submit_bio(sdp->sd_log_bio); - sdp->sd_log_bio = NULL; + bio_set_op_attrs(bio, op, op_flags); + submit_bio(bio); + *biop = NULL; } } /** - * gfs2_log_alloc_bio - Allocate a new bio for log writing - * @sdp: The superblock - * @blkno: The next device block number we want to write to - * - * This should never be called when there is a cached bio in the - * super block. When it returns, there will be a cached bio in the - * super block which will have as many bio_vecs as the device is - * happy to handle. - * - * Returns: Newly allocated bio - */ - -static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) -{ - struct super_block *sb = sdp->sd_vfs; - struct bio *bio; - - BUG_ON(sdp->sd_log_bio); - - bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); - bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9); - bio_set_dev(bio, sb->s_bdev); - bio->bi_end_io = gfs2_end_log_write; - bio->bi_private = sdp; - - sdp->sd_log_bio = bio; - - return bio; -} - -/** * gfs2_log_get_bio - Get cached log bio, or allocate a new one - * @sdp: The superblock + * @sdp: The super block * @blkno: The device block number we want to write to + * @bio: The bio to get or allocate + * @op: REQ_OP + * @end_io: The bi_end_io callback + * @private: The bi_private value + * @flush: Always flush the current bio and allocate a new one? * * If there is a cached bio, then if the next block number is sequential * with the previous one, return it, otherwise flush the bio to the - * device. If there is not a cached bio, or we just flushed it, then + * device. If there is no cached bio, or we just flushed it, then * allocate a new one. 
* * Returns: The bio to use for log writes */ -static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno) +static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno, + struct bio **biop, int op, + bio_end_io_t *end_io, void *private, + bool flush) { - struct bio *bio = sdp->sd_log_bio; - u64 nblk; + struct super_block *sb = sdp->sd_vfs; + struct bio *bio = *biop; if (bio) { + u64 nblk; + nblk = bio_end_sector(bio); nblk >>= sdp->sd_fsb2bb_shift; - if (blkno == nblk) + if (blkno == nblk && !flush) return bio; - gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0); + gfs2_log_flush_bio(biop, op, 0); } - return gfs2_log_alloc_bio(sdp, blkno);
[Cluster-devel] [GFS2 v2 PATCH 3/4] gfs2: add a helper function to get_log_header that can be used elsewhere
Move and re-order the error checks and hash/crc computations into another function __get_log_header() so it can be used in scenarios where buffer_heads are not being used for the log header. Signed-off-by: Abhi Das --- fs/gfs2/recovery.c | 53 - fs/gfs2/recovery.h | 2 ++ 2 files changed, 34 insertions(+), 21 deletions(-) diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 0f501f9..1b95294 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -120,6 +120,35 @@ void gfs2_revoke_clean(struct gfs2_jdesc *jd) } } +int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh, +unsigned int blkno, struct gfs2_log_header_host *head) +{ + u32 hash, crc; + + if (lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) || + lh->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH) || + (blkno && be32_to_cpu(lh->lh_blkno) != blkno)) + return 1; + + hash = crc32(~0, lh, LH_V1_SIZE - 4); + hash = ~crc32_le_shift(hash, 4); /* assume lh_hash is zero */ + + if (be32_to_cpu(lh->lh_hash) != hash) + return 1; + + crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4, +sdp->sd_sb.sb_bsize - LH_V1_SIZE - 4); + + if ((lh->lh_crc != 0 && be32_to_cpu(lh->lh_crc) != crc)) + return 1; + + head->lh_sequence = be64_to_cpu(lh->lh_sequence); + head->lh_flags = be32_to_cpu(lh->lh_flags); + head->lh_tail = be32_to_cpu(lh->lh_tail); + head->lh_blkno = be32_to_cpu(lh->lh_blkno); + + return 0; +} /** * get_log_header - read the log header for a given segment * @jd: the journal @@ -137,36 +166,18 @@ void gfs2_revoke_clean(struct gfs2_jdesc *jd) static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk, struct gfs2_log_header_host *head) { - struct gfs2_log_header *lh; + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct buffer_head *bh; - u32 hash, crc; int error; error = gfs2_replay_read_block(jd, blk, &bh); if (error) return error; - lh = (void *)bh->b_data; - - hash = crc32(~0, lh, LH_V1_SIZE - 4); - hash = ~crc32_le_shift(hash, 4); /* assume lh_hash is zero */ - - crc = 
crc32c(~0, (void *)lh + LH_V1_SIZE + 4, -bh->b_size - LH_V1_SIZE - 4); - - error = lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) || - lh->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH) || - be32_to_cpu(lh->lh_blkno) != blk || - be32_to_cpu(lh->lh_hash) != hash || - (lh->lh_crc != 0 && be32_to_cpu(lh->lh_crc) != crc); + error = __get_log_header(sdp, (const struct gfs2_log_header *)bh->b_data, +blk, head); brelse(bh); - if (!error) { - head->lh_sequence = be64_to_cpu(lh->lh_sequence); - head->lh_flags = be32_to_cpu(lh->lh_flags); - head->lh_tail = be32_to_cpu(lh->lh_tail); - head->lh_blkno = be32_to_cpu(lh->lh_blkno); - } return error; } diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index 11fdfab..943a67c 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -31,6 +31,8 @@ extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head); extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait); extern void gfs2_recover_func(struct work_struct *work); +extern int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh, + unsigned int blkno, struct gfs2_log_header_host *head); #endif /* __RECOVERY_DOT_H__ */ -- 2.4.11
[Cluster-devel] [GFS2 v2 PATCH 1/4] gfs2: add timing info to map_journal_extents
Tells you how many milliseconds map_journal_extents takes. Signed-off-by: Abhi Das --- fs/gfs2/bmap.c | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 03128ed..dddb5a4 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -2248,7 +2249,9 @@ int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) unsigned int shift = sdp->sd_sb.sb_bsize_shift; u64 size; int rc; + ktime_t start, end; + start = ktime_get(); lblock_stop = i_size_read(jd->jd_inode) >> shift; size = (lblock_stop - lblock) << shift; jd->nr_extents = 0; @@ -2268,8 +2271,9 @@ int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) lblock += (bh.b_size >> ip->i_inode.i_blkbits); } while(size > 0); - fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid, - jd->nr_extents); + end = ktime_get(); + fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid, + jd->nr_extents, ktime_ms_delta(end, start)); return 0; fail: -- 2.4.11
[Cluster-devel] [GFS2 v2 PATCH 4/4] gfs2: read journal in large chunks to locate the head
Use bio(s) to read in the journal sequentially in large chunks and locate the head of the journal. This is faster in most cases when compared to the existing bisect method which operates one block at a time. Signed-off-by: Abhi Das --- fs/gfs2/incore.h | 8 +++- fs/gfs2/lops.c | 96 +- fs/gfs2/lops.h | 1 + fs/gfs2/ops_fstype.c | 1 + fs/gfs2/recovery.c | 115 +-- 5 files changed, 114 insertions(+), 107 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index b96d39c..b24c105 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -529,6 +529,11 @@ struct gfs2_journal_extent { u64 blocks; }; +enum { + JDF_RECOVERY = 1, + JDF_JHEAD= 2, +}; + struct gfs2_jdesc { struct list_head jd_list; struct list_head extent_list; @@ -536,12 +541,13 @@ struct gfs2_jdesc { struct work_struct jd_work; struct inode *jd_inode; unsigned long jd_flags; -#define JDF_RECOVERY 1 unsigned int jd_jid; unsigned int jd_blocks; int jd_recover_error; /* Replay stuff */ + struct gfs2_log_header_host jd_jhead; + struct bio *jd_rd_bio; /* bio used for reading this journal */ unsigned int jd_found_blocks; unsigned int jd_found_revokes; unsigned int jd_replayed_blocks; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index f5f31a6..24d5dba 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -18,6 +18,7 @@ #include #include +#include "bmap.h" #include "dir.h" #include "gfs2.h" #include "incore.h" @@ -227,6 +228,50 @@ static void gfs2_end_log_write(struct bio *bio) wake_up(&sdp->sd_log_flush_wait); } +static void gfs2_end_log_read(struct bio *bio) +{ + struct gfs2_jdesc *jd = bio->bi_private; + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct page *page; + struct bio_vec *bvec; + int i, last; + + if (bio->bi_status) { + fs_err(sdp, "Error %d reading from journal, jid=%u\n", + bio->bi_status, jd->jd_jid); + } + + bio_for_each_segment_all(bvec, bio, i) { + struct gfs2_log_header_host uninitialized_var(lh); + void *ptr; + + page = bvec->bv_page; + ptr = page_address(page); + last = 
page_private(page); + + if (!test_bit(JDF_JHEAD, &jd->jd_flags)) { + mempool_free(page, gfs2_page_pool); + continue; + } + + if (!__get_log_header(sdp, ptr, 0, &lh)) { + if (lh.lh_sequence > jd->jd_jhead.lh_sequence) + jd->jd_jhead = lh; + else + goto found; + } + + if (last) { + found: + clear_bit(JDF_JHEAD, &jd->jd_flags); + wake_up_bit(&jd->jd_flags, JDF_JHEAD); + } + mempool_free(page, gfs2_page_pool); + } + + bio_put(bio); +} + /** * gfs2_log_flush_bio - Submit any pending log bio * @biop: Address of the bio pointer @@ -241,8 +286,10 @@ void gfs2_log_flush_bio(struct bio **biop, int op, int op_flags) { struct bio *bio = *biop; if (bio) { - struct gfs2_sbd *sdp = bio->bi_private; - atomic_inc(&sdp->sd_log_in_flight); + if (op != REQ_OP_READ) { + struct gfs2_sbd *sdp = bio->bi_private; + atomic_inc(&sdp->sd_log_in_flight); + } bio_set_op_attrs(bio, op, op_flags); submit_bio(bio); *biop = NULL; @@ -360,6 +407,51 @@ void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) gfs2_log_bmap(sdp)); } +static void gfs2_log_read_extent(struct gfs2_jdesc *jd, u64 dblock, + unsigned int blocks, int last) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct super_block *sb = sdp->sd_vfs; + struct page *page; + int i, ret; + struct bio *bio; + + for (i = 0; i < blocks; i++) { + page = mempool_alloc(gfs2_page_pool, GFP_NOIO); + /* flag the last page of the journal we plan to read in */ + page_private(page) = (last && i == (blocks - 1)); + + bio = gfs2_log_get_bio(sdp, dblock + i, &jd->jd_rd_bio, + REQ_OP_READ, gfs2_end_log_read, + jd, false); + ret = bio_add_page(bio, page, sb->s_blocksize, 0); + if (ret == 0) { +
[Cluster-devel] [GFS2 PATCH 3/4] gfs2: add a helper function to get_log_header that can be used elsewhere
Move and re-order the error checks and hash/crc computations into another function __get_log_header() so it can be used in scenarios where buffer_heads are not being used for the log header. Signed-off-by: Abhi Das --- fs/gfs2/recovery.c | 53 - fs/gfs2/recovery.h | 2 ++ 2 files changed, 34 insertions(+), 21 deletions(-) diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 0f501f9..1b95294 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -120,6 +120,35 @@ void gfs2_revoke_clean(struct gfs2_jdesc *jd) } } +int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh, +unsigned int blkno, struct gfs2_log_header_host *head) +{ + u32 hash, crc; + + if (lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) || + lh->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH) || + (blkno && be32_to_cpu(lh->lh_blkno) != blkno)) + return 1; + + hash = crc32(~0, lh, LH_V1_SIZE - 4); + hash = ~crc32_le_shift(hash, 4); /* assume lh_hash is zero */ + + if (be32_to_cpu(lh->lh_hash) != hash) + return 1; + + crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4, +sdp->sd_sb.sb_bsize - LH_V1_SIZE - 4); + + if ((lh->lh_crc != 0 && be32_to_cpu(lh->lh_crc) != crc)) + return 1; + + head->lh_sequence = be64_to_cpu(lh->lh_sequence); + head->lh_flags = be32_to_cpu(lh->lh_flags); + head->lh_tail = be32_to_cpu(lh->lh_tail); + head->lh_blkno = be32_to_cpu(lh->lh_blkno); + + return 0; +} /** * get_log_header - read the log header for a given segment * @jd: the journal @@ -137,36 +166,18 @@ void gfs2_revoke_clean(struct gfs2_jdesc *jd) static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk, struct gfs2_log_header_host *head) { - struct gfs2_log_header *lh; + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct buffer_head *bh; - u32 hash, crc; int error; error = gfs2_replay_read_block(jd, blk, &bh); if (error) return error; - lh = (void *)bh->b_data; - - hash = crc32(~0, lh, LH_V1_SIZE - 4); - hash = ~crc32_le_shift(hash, 4); /* assume lh_hash is zero */ - - crc = 
crc32c(~0, (void *)lh + LH_V1_SIZE + 4, -bh->b_size - LH_V1_SIZE - 4); - - error = lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) || - lh->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH) || - be32_to_cpu(lh->lh_blkno) != blk || - be32_to_cpu(lh->lh_hash) != hash || - (lh->lh_crc != 0 && be32_to_cpu(lh->lh_crc) != crc); + error = __get_log_header(sdp, (const struct gfs2_log_header *)bh->b_data, +blk, head); brelse(bh); - if (!error) { - head->lh_sequence = be64_to_cpu(lh->lh_sequence); - head->lh_flags = be32_to_cpu(lh->lh_flags); - head->lh_tail = be32_to_cpu(lh->lh_tail); - head->lh_blkno = be32_to_cpu(lh->lh_blkno); - } return error; } diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index 11fdfab..943a67c 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -31,6 +31,8 @@ extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head); extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait); extern void gfs2_recover_func(struct work_struct *work); +extern int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh, + unsigned int blkno, struct gfs2_log_header_host *head); #endif /* __RECOVERY_DOT_H__ */ -- 2.4.11
[Cluster-devel] [GFS2 PATCH 2/4] gfs2: changes to gfs2_log_XXX_bio
Change gfs2_log_flush_bio to accept a pointer to the struct bio* to be flushed. Change gfs2_log_alloc_bio and gfs2_log_get_bio to take a struct gfs2_jdesc* instead of gfs2_sbd. Signed-off-by: Abhi Das --- fs/gfs2/log.c | 4 ++-- fs/gfs2/lops.c | 34 +++--- fs/gfs2/lops.h | 2 +- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index ee20ea42..b80fb30 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -731,7 +731,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, lh->lh_crc = cpu_to_be32(crc); gfs2_log_write(sdp, page, sb->s_blocksize, 0, addr); - gfs2_log_flush_bio(sdp, REQ_OP_WRITE, op_flags); + gfs2_log_flush_bio(&sdp->sd_log_bio, REQ_OP_WRITE, op_flags); log_flush_wait(sdp); } @@ -808,7 +808,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) gfs2_ordered_write(sdp); lops_before_commit(sdp, tr); - gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0); + gfs2_log_flush_bio(&sdp->sd_log_bio, REQ_OP_WRITE, 0); if (sdp->sd_log_head != sdp->sd_log_flush_head) { log_flush_wait(sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index f2567f9..4cc19af 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -229,7 +229,7 @@ static void gfs2_end_log_write(struct bio *bio) /** * gfs2_log_flush_bio - Submit any pending log bio - * @sdp: The superblock + * @biop: Address of the bio pointer * @op: REQ_OP * @op_flags: req_flag_bits * @@ -237,19 +237,21 @@ static void gfs2_end_log_write(struct bio *bio) * there is no pending bio, then this is a no-op. 
*/ -void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int op, int op_flags) +void gfs2_log_flush_bio(struct bio **biop, int op, int op_flags) { - if (sdp->sd_log_bio) { + struct bio *bio = *biop; + if (bio) { + struct gfs2_sbd *sdp = bio->bi_private; atomic_inc(&sdp->sd_log_in_flight); - bio_set_op_attrs(sdp->sd_log_bio, op, op_flags); - submit_bio(sdp->sd_log_bio); - sdp->sd_log_bio = NULL; + bio_set_op_attrs(bio, op, op_flags); + submit_bio(bio); + *biop = NULL; } } /** * gfs2_log_alloc_bio - Allocate a new bio for log writing - * @sdp: The superblock + * @jd: The journal descriptor * @blkno: The next device block number we want to write to * * This should never be called when there is a cached bio in the @@ -260,8 +262,9 @@ void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int op, int op_flags) * Returns: Newly allocated bio */ -static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) +static struct bio *gfs2_log_alloc_bio(struct gfs2_jdesc *jd, u64 blkno) { + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct super_block *sb = sdp->sd_vfs; struct bio *bio; @@ -280,7 +283,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) /** * gfs2_log_get_bio - Get cached log bio, or allocate a new one - * @sdp: The superblock + * @jd: The journal descriptor * @blkno: The device block number we want to write to * * If there is a cached bio, then if the next block number is sequential @@ -291,8 +294,9 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) * Returns: The bio to use for log writes */ -static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno) +static struct bio *gfs2_log_get_bio(struct gfs2_jdesc *jd, u64 blkno) { + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct bio *bio = sdp->sd_log_bio; u64 nblk; @@ -301,10 +305,10 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno) nblk >>= sdp->sd_fsb2bb_shift; if (blkno == nblk) return bio; - gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0); + 
gfs2_log_flush_bio(&sdp->sd_log_bio, REQ_OP_WRITE, 0); } - return gfs2_log_alloc_bio(sdp, blkno); + return gfs2_log_alloc_bio(sdp->sd_jdesc, blkno); } /** @@ -326,11 +330,11 @@ void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, struct bio *bio; int ret; - bio = gfs2_log_get_bio(sdp, blkno); + bio = gfs2_log_get_bio(sdp->sd_jdesc, blkno); ret = bio_add_page(bio, page, size, offset); if (ret == 0) { - gfs2_log_flush_bio(sdp, REQ_OP_WRITE, 0); - bio = gfs2_log_alloc_bio(sdp, blkno); + gfs2_log_flush_bio(&sdp->sd_log_bio, REQ_OP_WRITE, 0); + bio = gfs2_log_alloc_bio(sdp->sd_jdesc, blkno); ret = bio_add_page(bio, page, size, offset); WARN_ON(ret == 0); } diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index e4
[Cluster-devel] [GFS2 PATCH 4/4] gfs2: read journal in large chunks to locate the head
Use bio(s) to read in the journal sequentially in large chunks and locate the head of the journal. This is faster in most cases when compared to the existing bisect method which operates one block at a time. Signed-off-by: Abhi Das --- fs/gfs2/incore.h | 8 +++- fs/gfs2/lops.c | 122 +-- fs/gfs2/lops.h | 1 + fs/gfs2/ops_fstype.c | 1 + fs/gfs2/recovery.c | 115 +--- 5 files changed, 129 insertions(+), 118 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index b96d39c..b24c105 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -529,6 +529,11 @@ struct gfs2_journal_extent { u64 blocks; }; +enum { + JDF_RECOVERY = 1, + JDF_JHEAD= 2, +}; + struct gfs2_jdesc { struct list_head jd_list; struct list_head extent_list; @@ -536,12 +541,13 @@ struct gfs2_jdesc { struct work_struct jd_work; struct inode *jd_inode; unsigned long jd_flags; -#define JDF_RECOVERY 1 unsigned int jd_jid; unsigned int jd_blocks; int jd_recover_error; /* Replay stuff */ + struct gfs2_log_header_host jd_jhead; + struct bio *jd_rd_bio; /* bio used for reading this journal */ unsigned int jd_found_blocks; unsigned int jd_found_revokes; unsigned int jd_replayed_blocks; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 4cc19af..21979b2 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -18,6 +18,7 @@ #include #include +#include "bmap.h" #include "dir.h" #include "gfs2.h" #include "incore.h" @@ -227,6 +228,50 @@ static void gfs2_end_log_write(struct bio *bio) wake_up(&sdp->sd_log_flush_wait); } +static void gfs2_end_log_read(struct bio *bio) +{ + struct gfs2_jdesc *jd = bio->bi_private; + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct page *page; + struct bio_vec *bvec; + int i, last; + + if (bio->bi_status) { + fs_err(sdp, "Error %d reading from journal, jid=%u\n", + bio->bi_status, jd->jd_jid); + } + + bio_for_each_segment_all(bvec, bio, i) { + struct gfs2_log_header_host uninitialized_var(lh); + void *ptr; + + page = bvec->bv_page; + ptr = page_address(page); + last = 
page_private(page); + + if (!test_bit(JDF_JHEAD, &jd->jd_flags)) { + mempool_free(page, gfs2_page_pool); + continue; + } + + if (!__get_log_header(sdp, ptr, 0, &lh)) { + if (lh.lh_sequence > jd->jd_jhead.lh_sequence) + jd->jd_jhead = lh; + else + goto found; + } + + if (last) { + found: + clear_bit(JDF_JHEAD, &jd->jd_flags); + wake_up_bit(&jd->jd_flags, JDF_JHEAD); + } + mempool_free(page, gfs2_page_pool); + } + + bio_put(bio); +} + /** * gfs2_log_flush_bio - Submit any pending log bio * @biop: Address of the bio pointer @@ -241,8 +286,10 @@ void gfs2_log_flush_bio(struct bio **biop, int op, int op_flags) { struct bio *bio = *biop; if (bio) { - struct gfs2_sbd *sdp = bio->bi_private; - atomic_inc(&sdp->sd_log_in_flight); + if (op != REQ_OP_READ) { + struct gfs2_sbd *sdp = bio->bi_private; + atomic_inc(&sdp->sd_log_in_flight); + } bio_set_op_attrs(bio, op, op_flags); submit_bio(bio); *biop = NULL; @@ -253,6 +300,7 @@ void gfs2_log_flush_bio(struct bio **biop, int op, int op_flags) * gfs2_log_alloc_bio - Allocate a new bio for log writing * @jd: The journal descriptor * @blkno: The next device block number we want to write to + * @op: REQ_OP * * This should never be called when there is a cached bio in the * super block. When it returns, there will be a cached bio in the @@ -262,21 +310,24 @@ void gfs2_log_flush_bio(struct bio **biop, int op, int op_flags) * Returns: Newly allocated bio */ -static struct bio *gfs2_log_alloc_bio(struct gfs2_jdesc *jd, u64 blkno) +static struct bio *gfs2_log_alloc_bio(struct gfs2_jdesc *jd, u64 blkno, int op) { struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct super_block *sb = sdp->sd_vfs; struct bio *bio; - BUG_ON(sdp->sd_log_bio); + BUG_ON((op == REQ_OP_READ ? jd->jd_rd_bio : sdp->sd_log_bio)); bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); bio->bi_iter.bi_sector = blkno * (sb->s_blocksiz
[Cluster-devel] [GFS2 PATCH 1/4] gfs2: add timing info to map_journal_extents
Tells you how many milliseconds map_journal_extents takes. Signed-off-by: Abhi Das --- fs/gfs2/bmap.c | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 03128ed..dddb5a4 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -2248,7 +2249,9 @@ int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) unsigned int shift = sdp->sd_sb.sb_bsize_shift; u64 size; int rc; + ktime_t start, end; + start = ktime_get(); lblock_stop = i_size_read(jd->jd_inode) >> shift; size = (lblock_stop - lblock) << shift; jd->nr_extents = 0; @@ -2268,8 +2271,9 @@ int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) lblock += (bh.b_size >> ip->i_inode.i_blkbits); } while(size > 0); - fs_info(sdp, "journal %d mapped with %u extents\n", jd->jd_jid, - jd->nr_extents); + end = ktime_get(); + fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid, + jd->nr_extents, ktime_ms_delta(end, start)); return 0; fail: -- 2.4.11
[Cluster-devel] [GFS2 PATCH 0/4] Speed up journal head lookup
This is the upstream version of the rhel7 patchset I'd posted earlier for review. It is slightly different in parts owing to some bits already being present and the hash/crc computation code being different due to the updated log header structure. Cheers! --Abhi *** BLURB HERE *** Abhi Das (4): gfs2: add timing info to map_journal_extents gfs2: changes to gfs2_log_XXX_bio gfs2: add a helper function to get_log_header that can be used elsewhere gfs2: read journal in large chunks to locate the head fs/gfs2/bmap.c | 8 ++- fs/gfs2/incore.h | 8 ++- fs/gfs2/log.c| 4 +- fs/gfs2/lops.c | 142 --- fs/gfs2/lops.h | 3 +- fs/gfs2/ops_fstype.c | 1 + fs/gfs2/recovery.c | 168 +-- fs/gfs2/recovery.h | 2 + 8 files changed, 184 insertions(+), 152 deletions(-) -- 2.4.11
[Cluster-devel] [GFS2 PATCH] gfs2: getlabel support
Add support for the GETFSLABEL ioctl in gfs2. I tested this patch and it works as expected. Signed-off-by: Steve Whitehouse Tested-by: Abhi Das --- fs/gfs2/file.c | 14 ++ 1 file changed, 14 insertions(+) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 08369c6..6510f4e 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -314,6 +314,17 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr) return do_gfs2_set_flags(filp, gfsflags, mask); } +static int gfs2_getlabel(struct file *filp, char __user *label) +{ + struct inode *inode = file_inode(filp); + struct gfs2_sbd *sdp = GFS2_SB(inode); + + if (copy_to_user(label, sdp->sd_sb.sb_locktable, GFS2_LOCKNAME_LEN)) + return -EFAULT; + + return 0; +} + static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch(cmd) { @@ -323,7 +334,10 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return gfs2_set_flags(filp, (u32 __user *)arg); case FITRIM: return gfs2_fitrim(filp, (void __user *)arg); + case FS_IOC_GETFSLABEL: + return gfs2_getlabel(filp, (char __user *)arg); } + return -ENOTTY; } -- 2.4.11
[Cluster-devel] [RFC v2 PATCH 1/5] gfs2: allow map_journal_extents() to take a journal descriptor as argument
This function now maps the extents for the journal whose descriptor is passed in as argument. Signed-off-by: Abhi Das --- fs/gfs2/log.h| 1 + fs/gfs2/ops_fstype.c | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 92dcbe7..19c93df 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -75,4 +75,5 @@ extern int gfs2_logd(void *data); extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); extern void gfs2_write_revokes(struct gfs2_sbd *sdp); +extern int map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd); #endif /* __LOG_DOT_H__ */ diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 228f38e..cf3e366 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -524,9 +524,8 @@ out: * but since it's only done at mount time, I'm not worried about the * time it takes. */ -static int map_journal_extents(struct gfs2_sbd *sdp) +int map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) { - struct gfs2_jdesc *jd = sdp->sd_jdesc; unsigned int lb; u64 db, prev_db; /* logical block, disk block, prev disk block */ struct gfs2_inode *ip = GFS2_I(jd->jd_inode); @@ -772,7 +771,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); /* Map the extents for this journal's blocks */ - map_journal_extents(sdp); + map_journal_extents(sdp, sdp->sd_jdesc); } trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free)); -- 2.4.11
[Cluster-devel] [RFC v2 PATCH 2/5] gfs2: add timing info for various stages of journal recovery
Tells you how many milliseconds each stage of journal recovery takes. Signed-off-by: Abhi Das --- fs/gfs2/ops_fstype.c | 5 + fs/gfs2/recovery.c | 20 ++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index cf3e366..fd460c1 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -532,7 +532,9 @@ int map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) struct gfs2_journal_extent *jext = NULL; struct buffer_head bh; int rc = 0; + ktime_t start, end; + start = ktime_get(); prev_db = 0; for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) { @@ -564,6 +566,9 @@ int map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) } prev_db = db; } + end = ktime_get(); + fs_info(sdp, "jid=%u: Journal extent mapped in %lldms\n", jd->jd_jid, + ktime_ms_delta(end, start)); return rc; } diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 56dea44..4b042db 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -455,12 +456,13 @@ void gfs2_recover_func(struct work_struct *work) struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct gfs2_log_header_host head; struct gfs2_holder j_gh, ji_gh, t_gh; - unsigned long t; + ktime_t t_start, t_jlck, t_jhd, t_tlck, t_rep; int ro = 0; unsigned int pass; int error; int jlocked = 0; + t_start = ktime_get(); if (sdp->sd_args.ar_spectator || (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) { fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", @@ -492,6 +494,7 @@ void gfs2_recover_func(struct work_struct *work) fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid); } + t_jlck = ktime_get(); fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid); error = gfs2_jdesc_check(jd); @@ -501,13 +504,12 @@ void gfs2_recover_func(struct work_struct *work) error = gfs2_find_jhead(jd, &head); if (error) goto fail_gunlock_ji; + t_jhd = 
ktime_get(); if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n", jd->jd_jid); - t = jiffies; - /* Acquire a shared hold on the transaction lock */ error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, @@ -541,6 +543,7 @@ void gfs2_recover_func(struct work_struct *work) goto fail_gunlock_tr; } + t_tlck = ktime_get(); fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid); for (pass = 0; pass < 2; pass++) { @@ -557,9 +560,14 @@ void gfs2_recover_func(struct work_struct *work) goto fail_gunlock_tr; gfs2_glock_dq_uninit(&t_gh); - t = DIV_ROUND_UP(jiffies - t, HZ); - fs_info(sdp, "jid=%u: Journal replayed in %lus\n", - jd->jd_jid, t); + t_rep = ktime_get(); + fs_info(sdp, "jid=%u: Journal replayed in %lldms [jlck:%lldms, " + "jhead:%lldms, tlck:%lldms, replay:%lldms]\n", + jd->jd_jid, ktime_ms_delta(t_rep, t_start), + ktime_ms_delta(t_jlck, t_start), + ktime_ms_delta(t_jhd, t_jlck), + ktime_ms_delta(t_tlck, t_jhd), + ktime_ms_delta(t_rep, t_tlck)); } gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); -- 2.4.11
[Cluster-devel] [RFC v2 PATCH 3/5] gfs2: changes to gfs2_log_XXX_bio
Change gfs2_log_flush_bio to accept a pointer to the struct bio to be flushed. Change gfs2_log_alloc_bio and gfs2_log_get_bio to take a struct gfs2_jdesc instead of gfs2_sbd. Signed-off-by: Abhi Das --- fs/gfs2/log.c | 4 ++-- fs/gfs2/lops.c | 32 ++-- fs/gfs2/lops.h | 2 +- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 15a3a8c..87b7d87 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -655,7 +655,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); gfs2_log_write_page(sdp, page); - gfs2_log_flush_bio(sdp, rw); + gfs2_log_flush_bio(&sdp->sd_log_bio, rw); log_flush_wait(sdp); if (sdp->sd_log_tail != tail) @@ -699,7 +699,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) gfs2_ordered_write(sdp); lops_before_commit(sdp, tr); - gfs2_log_flush_bio(sdp, WRITE); + gfs2_log_flush_bio(&sdp->sd_log_bio, WRITE); if (sdp->sd_log_head != sdp->sd_log_flush_head) { log_flush_wait(sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 4da6055..0284648 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -230,25 +230,27 @@ static void gfs2_end_log_write(struct bio *bio, int error) /** * gfs2_log_flush_bio - Submit any pending log bio - * @sdp: The superblock + * @biop: Pointer to the bio we want to flush * @rw: The rw flags * * Submit any pending part-built or full bio to the block device. If * there is no pending bio, then this is a no-op. 
*/ -void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw) +void gfs2_log_flush_bio(struct bio **biop, int rw) { - if (sdp->sd_log_bio) { + struct bio *bio = *biop; + if (bio) { + struct gfs2_sbd *sdp = bio->bi_private; atomic_inc(&sdp->sd_log_in_flight); - submit_bio(rw, sdp->sd_log_bio); - sdp->sd_log_bio = NULL; + submit_bio(rw, bio); + *biop = NULL; } } /** * gfs2_log_alloc_bio - Allocate a new bio for log writing - * @sdp: The superblock + * @jd: The journal descriptor * @blkno: The next device block number we want to write to * * This should never be called when there is a cached bio in the @@ -259,8 +261,9 @@ void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw) * Returns: Newly allocated bio */ -static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) +static struct bio *gfs2_log_alloc_bio(struct gfs2_jdesc *jd, u64 blkno) { + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct super_block *sb = sdp->sd_vfs; unsigned nrvecs = bio_get_nr_vecs(sb->s_bdev); struct bio *bio; @@ -286,7 +289,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) /** * gfs2_log_get_bio - Get cached log bio, or allocate a new one - * @sdp: The superblock + * @jd: The journal descriptor * @blkno: The device block number we want to write to * * If there is a cached bio, then if the next block number is sequential @@ -297,8 +300,9 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) * Returns: The bio to use for log writes */ -static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno) +static struct bio *gfs2_log_get_bio(struct gfs2_jdesc *jd, u64 blkno) { + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct bio *bio = sdp->sd_log_bio; u64 nblk; @@ -307,10 +311,10 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno) nblk >>= sdp->sd_fsb2bb_shift; if (blkno == nblk) return bio; - gfs2_log_flush_bio(sdp, WRITE); + gfs2_log_flush_bio(&sdp->sd_log_bio, WRITE); } - return gfs2_log_alloc_bio(sdp, blkno); 
+ return gfs2_log_alloc_bio(sdp->sd_jdesc, blkno); } @@ -333,11 +337,11 @@ static void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, struct bio *bio; int ret; - bio = gfs2_log_get_bio(sdp, blkno); + bio = gfs2_log_get_bio(sdp->sd_jdesc, blkno); ret = bio_add_page(bio, page, size, offset); if (ret == 0) { - gfs2_log_flush_bio(sdp, WRITE); - bio = gfs2_log_alloc_bio(sdp, blkno); + gfs2_log_flush_bio(&sdp->sd_log_bio, WRITE); + bio = gfs2_log_alloc_bio(sdp->sd_jdesc, blkno); ret = bio_add_page(bio, page, size, offset); WARN_ON(ret == 0); } diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 06793e3..3044347 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -28,7 +28,7 @@ extern const struct gfs2_log_operations gfs2_databuf_lops; extern const struct gfs2_log_operations *gfs
[Cluster-devel] [RFC v2 PATCH 0/5] Speed up journal head lookup
This is a revised version of the patch set I'd posted earlier to speed up jhead lookup during recovery. I've made some changes as per Steve's suggestions based on the previous version: https://www.redhat.com/archives/cluster-devel/2018-May/msg00088.html As before, this patchset is based on the latest RHEL7 codebase as it is easier for me to test. Upstream version shouldn't be very dissimilar and I'll post the upstream port if it looks good. I'll do a bit more testing and report some performance numbers shortly. Cheers! --Abhi Abhi Das (5): gfs2: allow map_journal_extents() to take a journal descriptor as argument gfs2: add timing info for various stages of journal recovery gfs2: changes to gfs2_log_XXX_bio gfs2: read journal in large chunks to locate the head gfs2: add tracepoint debugging for gfs2_end_log_read fs/gfs2/incore.h | 9 +++- fs/gfs2/log.c| 4 +- fs/gfs2/log.h| 1 + fs/gfs2/lops.c | 142 +++ fs/gfs2/lops.h | 15 +- fs/gfs2/ops_fstype.c | 12 +++-- fs/gfs2/recovery.c | 138 ++--- fs/gfs2/recovery.h | 1 + fs/gfs2/trace_gfs2.h | 25 + 9 files changed, 208 insertions(+), 139 deletions(-) -- 2.4.11
[Cluster-devel] [RFC v2 PATCH 4/5] gfs2: read journal in large chunks to locate the head
Use bio(s) to read in the journal sequentially in large chunks and locate the head of the journal. This is faster in most cases when compared to the existing bisect method which operates one block at a time. Signed-off-by: Abhi Das --- fs/gfs2/incore.h | 8 +++- fs/gfs2/lops.c | 121 +-- fs/gfs2/lops.h | 13 ++ fs/gfs2/ops_fstype.c | 1 + fs/gfs2/recovery.c | 118 + fs/gfs2/recovery.h | 1 + 6 files changed, 142 insertions(+), 120 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index f303616..31188c0 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -494,18 +494,24 @@ struct gfs2_journal_extent { u64 blocks; }; +enum { + JDF_RECOVERY = 1, + JDF_JHEAD= 2, +}; + struct gfs2_jdesc { struct list_head jd_list; struct list_head extent_list; struct work_struct jd_work; struct inode *jd_inode; unsigned long jd_flags; -#define JDF_RECOVERY 1 unsigned int jd_jid; unsigned int jd_blocks; int jd_recover_error; /* Replay stuff */ + struct gfs2_log_header_host jd_jhead; + struct bio *jd_rd_bio; /* bio used for reading this journal */ unsigned int jd_found_blocks; unsigned int jd_found_revokes; unsigned int jd_replayed_blocks; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 0284648..518b786 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -228,6 +228,53 @@ static void gfs2_end_log_write(struct bio *bio, int error) wake_up(&sdp->sd_log_flush_wait); } +static void gfs2_end_log_read(struct bio *bio, int error) +{ + struct gfs2_jdesc *jd = bio->bi_private; + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct page *page; + struct bio_vec *bvec; + int i, last; + + if (error) { + sdp->sd_log_error = error; + fs_err(sdp, "Error %d reading from journal, jid=%u\n", error, + jd->jd_jid); + } + + bio_for_each_segment_all(bvec, bio, i) { + struct gfs2_log_header_host uninitialized_var(lh); + void *ptr; + + page = bvec->bv_page; + ptr = page_address(page); + error = gfs2_log_header_in(&lh, ptr); + last = page_private(page); + + if (!test_bit(JDF_JHEAD, &jd->jd_flags)) { 
+ mempool_free(page, gfs2_page_pool); + continue; + } + + if (!error && lh.lh_hash == compute_hash(ptr)) { + if (lh.lh_sequence > jd->jd_jhead.lh_sequence) + jd->jd_jhead = lh; + else + goto found; + } + + if (last) { + found: + clear_bit(JDF_JHEAD, &jd->jd_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&jd->jd_flags, JDF_JHEAD); + } + mempool_free(page, gfs2_page_pool); + } + + bio_put(bio); +} + /** * gfs2_log_flush_bio - Submit any pending log bio * @biop: Pointer to the bio we want to flush @@ -241,8 +288,10 @@ void gfs2_log_flush_bio(struct bio **biop, int rw) { struct bio *bio = *biop; if (bio) { - struct gfs2_sbd *sdp = bio->bi_private; - atomic_inc(&sdp->sd_log_in_flight); + if (rw != READ) { + struct gfs2_sbd *sdp = bio->bi_private; + atomic_inc(&sdp->sd_log_in_flight); + } submit_bio(rw, bio); *biop = NULL; } @@ -261,14 +310,14 @@ void gfs2_log_flush_bio(struct bio **biop, int rw) * Returns: Newly allocated bio */ -static struct bio *gfs2_log_alloc_bio(struct gfs2_jdesc *jd, u64 blkno) +static struct bio *gfs2_log_alloc_bio(struct gfs2_jdesc *jd, u64 blkno, int rw) { struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct super_block *sb = sdp->sd_vfs; unsigned nrvecs = bio_get_nr_vecs(sb->s_bdev); struct bio *bio; - BUG_ON(sdp->sd_log_bio); + BUG_ON((rw == READ ? jd->jd_rd_bio : sdp->sd_log_bio)); while (1) { bio = bio_alloc(GFP_NOIO, nrvecs); @@ -279,10 +328,13 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_jdesc *jd, u64 blkno) bio->bi_sector = blkno * (sb->s_blocksize >> 9); bio->bi_bdev = sb->s_bdev; - bio->bi_end_io = gfs2_end_log_write; - bio->bi_private = sdp; + bio->bi_end_io = rw == READ ? gfs2_end_log_read : gfs2_end_log_write; + bio->bi_private = rw == READ ? (void*)jd : (void*)sdp;
[Cluster-devel] [RFC v2 PATCH 5/5] gfs2: add tracepoint debugging for gfs2_end_log_read
Use a tracepoint and a counter in gfs2_jdesc to count the number of outstanding reads (in pages) as we read through a journal to aid debugging. Signed-off-by: Abhi Das --- fs/gfs2/incore.h | 1 + fs/gfs2/lops.c | 3 +++ fs/gfs2/ops_fstype.c | 1 + fs/gfs2/trace_gfs2.h | 25 + 4 files changed, 30 insertions(+) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 31188c0..bb4446d 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -512,6 +512,7 @@ struct gfs2_jdesc { struct gfs2_log_header_host jd_jhead; struct bio *jd_rd_bio; /* bio used for reading this journal */ + atomic_t jd_rd_pg_ct; unsigned int jd_found_blocks; unsigned int jd_found_revokes; unsigned int jd_replayed_blocks; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 518b786..a261398 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -250,6 +250,7 @@ static void gfs2_end_log_read(struct bio *bio, int error) ptr = page_address(page); error = gfs2_log_header_in(&lh, ptr); last = page_private(page); + atomic_dec(&jd->jd_rd_pg_ct); if (!test_bit(JDF_JHEAD, &jd->jd_flags)) { mempool_free(page, gfs2_page_pool); @@ -273,6 +274,7 @@ static void gfs2_end_log_read(struct bio *bio, int error) } bio_put(bio); + trace_gfs2_end_log_read(jd); } /** @@ -454,6 +456,7 @@ void gfs2_log_read_extent(struct gfs2_jdesc *jd, u64 dblock, ret = bio_add_page(bio, page, sb->s_blocksize, 0); WARN_ON(ret == 0); } + atomic_inc(&jd->jd_rd_pg_ct); bio->bi_private = jd; } } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 4a17eaf..ac9855a 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -643,6 +643,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) break; } jd->jd_rd_bio = NULL; + atomic_set(&jd->jd_rd_pg_ct, 0); spin_lock(&sdp->sd_jindex_spin); jd->jd_jid = sdp->sd_journals++; diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index d1de2ed..9f0cc8d 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -613,6 +613,31 @@ TRACE_EVENT(gfs2_rs, 
rs_func_name(__entry->func), (unsigned long)__entry->free) ); +TRACE_EVENT(gfs2_end_log_read, + + TP_PROTO(const struct gfs2_jdesc *jd), + + + TP_ARGS(jd), + + TP_STRUCT__entry( + __field(dev_t,dev ) + __field(unsigned int, jid ) + __field(unsigned int, pages ) + ), + + TP_fast_assign( + __entry->dev= jd->jd_inode->i_sb->s_dev; + __entry->jid= jd->jd_jid; + __entry->pages = atomic_read(&jd->jd_rd_pg_ct); + ), + + TP_printk("%u,%u end_log_read jid:%u outstanding pages:%u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned int)__entry->jid, + (unsigned int)__entry->pages) +); + #endif /* _TRACE_GFS2_H */ /* This part must be outside protection */ -- 2.4.11
[Cluster-devel] [PATCH v3 1/2] gfs2: Pass write offset to gfs2_write_calc_reserv
Pass the offset of the write to gfs2_write_calc_reserv so that we can then compute a better upper bound of the number of indirect blocks required. Fixed comments in quota.c:do_sync() to better explain block reservation calculation for quotas. Signed-off-by: Andreas Gruenbacher Acked-by: Abhi Das --- fs/gfs2/bmap.c | 4 ++-- fs/gfs2/bmap.h | 2 ++ fs/gfs2/file.c | 12 ++-- fs/gfs2/quota.c | 46 -- 4 files changed, 34 insertions(+), 30 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 89f1f7d..7d3bb32 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1003,8 +1003,8 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, alloc_required = unstuff || iomap->type == IOMAP_HOLE; if (alloc_required || gfs2_is_jdata(ip)) - gfs2_write_calc_reserv(ip, iomap->length, &data_blocks, - &ind_blocks); + gfs2_write_calc_reserv(ip, iomap->offset, iomap->length, + &data_blocks, &ind_blocks); if (alloc_required) { struct gfs2_alloc_parms ap = { diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index 6b18fb3..6497053 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -22,6 +22,7 @@ struct page; /** * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file * @ip: the file + * @pos: file offset of the write * @len: the number of bytes to be written to the file * @data_blocks: returns the number of data blocks required * @ind_blocks: returns the number of indirect blocks required @@ -29,6 +30,7 @@ struct page; */ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, + u64 pos, unsigned int len, unsigned int *data_blocks, unsigned int *ind_blocks) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 08369c6..93f59f9 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -435,7 +435,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) if (ret) goto out_unlock; - gfs2_write_calc_reserv(ip, PAGE_SIZE, &data_blocks, &ind_blocks); + gfs2_write_calc_reserv(ip, pos, PAGE_SIZE, &data_blocks, &ind_blocks); ap.target = data_blocks + 
ind_blocks; ret = gfs2_quota_lock_check(ip, &ap); if (ret) @@ -918,7 +918,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, * * Returns: void, but @len, @data_blocks and @ind_blocks are filled in. */ -static void calc_max_reserv(struct gfs2_inode *ip, loff_t *len, +static void calc_max_reserv(struct gfs2_inode *ip, loff_t pos, loff_t *len, unsigned int *data_blocks, unsigned int *ind_blocks, unsigned int max_blocks) { @@ -936,7 +936,7 @@ static void calc_max_reserv(struct gfs2_inode *ip, loff_t *len, *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; if (*len > max) { *len = max; - gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); + gfs2_write_calc_reserv(ip, pos, max, data_blocks, ind_blocks); } } @@ -969,7 +969,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t gfs2_size_hint(file, offset, len); - gfs2_write_calc_reserv(ip, PAGE_SIZE, &data_blocks, &ind_blocks); + gfs2_write_calc_reserv(ip, offset, PAGE_SIZE, &data_blocks, &ind_blocks); ap.min_target = data_blocks + ind_blocks; while (len > 0) { @@ -991,7 +991,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t * calculate a more realistic 'bytes' to serve as a good * starting point for the number of bytes we may be able * to write */ - gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); + gfs2_write_calc_reserv(ip, offset, bytes, &data_blocks, &ind_blocks); ap.target = data_blocks + ind_blocks; error = gfs2_quota_lock_check(ip, &ap); @@ -1014,7 +1014,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t /* Almost done. Calculate bytes that can be written using * max_blks. We also recompute max_bytes, data_blocks and * ind_blocks */ - calc_max_reserv(ip, &max_bytes, &data_blocks, + calc_max_reserv(ip, offset, &max_bytes, &data_blocks, &ind_blocks, max_blks); rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + diff --git a/fs/gfs2/quota.c b/f
[Cluster-devel] [RFC PATCH 2/3] gfs2: add timing info for various stages of journal recovery
Tells you how many milliseconds each stage of journal recovery takes. Signed-off-by: Abhi Das --- fs/gfs2/ops_fstype.c | 5 + fs/gfs2/recovery.c | 20 ++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index cf3e366..fd460c1 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -532,7 +532,9 @@ int map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) struct gfs2_journal_extent *jext = NULL; struct buffer_head bh; int rc = 0; + ktime_t start, end; + start = ktime_get(); prev_db = 0; for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) { @@ -564,6 +566,9 @@ int map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) } prev_db = db; } + end = ktime_get(); + fs_info(sdp, "jid=%u: Journal extent mapped in %lldms\n", jd->jd_jid, + ktime_ms_delta(end, start)); return rc; } diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 56dea44..4b042db 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -455,12 +456,13 @@ void gfs2_recover_func(struct work_struct *work) struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct gfs2_log_header_host head; struct gfs2_holder j_gh, ji_gh, t_gh; - unsigned long t; + ktime_t t_start, t_jlck, t_jhd, t_tlck, t_rep; int ro = 0; unsigned int pass; int error; int jlocked = 0; + t_start = ktime_get(); if (sdp->sd_args.ar_spectator || (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) { fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", @@ -492,6 +494,7 @@ void gfs2_recover_func(struct work_struct *work) fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid); } + t_jlck = ktime_get(); fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid); error = gfs2_jdesc_check(jd); @@ -501,13 +504,12 @@ void gfs2_recover_func(struct work_struct *work) error = gfs2_find_jhead(jd, &head); if (error) goto fail_gunlock_ji; + t_jhd = 
ktime_get(); if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n", jd->jd_jid); - t = jiffies; - /* Acquire a shared hold on the transaction lock */ error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, @@ -541,6 +543,7 @@ void gfs2_recover_func(struct work_struct *work) goto fail_gunlock_tr; } + t_tlck = ktime_get(); fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid); for (pass = 0; pass < 2; pass++) { @@ -557,9 +560,14 @@ void gfs2_recover_func(struct work_struct *work) goto fail_gunlock_tr; gfs2_glock_dq_uninit(&t_gh); - t = DIV_ROUND_UP(jiffies - t, HZ); - fs_info(sdp, "jid=%u: Journal replayed in %lus\n", - jd->jd_jid, t); + t_rep = ktime_get(); + fs_info(sdp, "jid=%u: Journal replayed in %lldms [jlck:%lldms, " + "jhead:%lldms, tlck:%lldms, replay:%lldms]\n", + jd->jd_jid, ktime_ms_delta(t_rep, t_start), + ktime_ms_delta(t_jlck, t_start), + ktime_ms_delta(t_jhd, t_jlck), + ktime_ms_delta(t_tlck, t_jhd), + ktime_ms_delta(t_rep, t_tlck)); } gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); -- 2.4.11
[Cluster-devel] [RFC PATCH 3/3] gfs2: read journal in large chunks to locate the head
Use bio(s) to read in the journal sequentially in large chunks and locate the head of the journal. This is faster in most cases when compared to the existing bisect method which operates one block at a time. Signed-off-by: Abhi Das --- fs/gfs2/incore.h | 7 +++- fs/gfs2/log.c | 2 +- fs/gfs2/log.h | 1 + fs/gfs2/lops.c | 115 +++ fs/gfs2/lops.h | 1 + fs/gfs2/recovery.c | 118 - fs/gfs2/recovery.h | 1 + 7 files changed, 127 insertions(+), 118 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index f303616..82d50a0 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -494,18 +494,23 @@ struct gfs2_journal_extent { u64 blocks; }; +enum { + JDF_RECOVERY= 1, + JDF_JHEAD = 2, +}; + struct gfs2_jdesc { struct list_head jd_list; struct list_head extent_list; struct work_struct jd_work; struct inode *jd_inode; unsigned long jd_flags; -#define JDF_RECOVERY 1 unsigned int jd_jid; unsigned int jd_blocks; int jd_recover_error; /* Replay stuff */ + struct gfs2_log_header_host jd_jhead; unsigned int jd_found_blocks; unsigned int jd_found_revokes; unsigned int jd_replayed_blocks; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 15a3a8c..523cf79 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -469,7 +469,7 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail) } -static void log_flush_wait(struct gfs2_sbd *sdp) +void log_flush_wait(struct gfs2_sbd *sdp) { DEFINE_WAIT(wait); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 19c93df..95b1ea3 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -65,6 +65,7 @@ extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); +extern void log_flush_wait(struct gfs2_sbd *sdp); extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); extern void gfs2_ail1_flush(struct gfs2_sbd 
*sdp, struct writeback_control *wbc); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 4da6055..48d7e6d 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -228,6 +229,60 @@ static void gfs2_end_log_write(struct bio *bio, int error) wake_up(&sdp->sd_log_flush_wait); } +static void gfs2_end_log_read(struct bio *bio, int error) +{ + struct gfs2_jdesc *jd = bio->bi_private; + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct page *page; + struct bio_vec *bvec; + int i, last; + + if (error) { + sdp->sd_log_error = error; + fs_err(sdp, "Error %d reading from journal, jid=%u\n", error, + jd->jd_jid); + wake_up(&sdp->sd_logd_waitq); + } + + bio_for_each_segment_all(bvec, bio, i) { + struct gfs2_log_header_host uninitialized_var(lh); + void *ptr; + const u32 nothing = 0; + u32 hash; + + page = bvec->bv_page; + ptr = page_address(page); + hash = crc32_le((u32)~0, ptr, sizeof(struct gfs2_log_header) - sizeof(u32)); + hash = crc32_le(hash, (unsigned char const *)&nothing, sizeof(nothing)); + hash ^= (u32)~0; + error = gfs2_log_header_in(&lh, ptr); + last = page_private(page); + mempool_free(page, gfs2_page_pool); + + if (!test_bit(JDF_JHEAD, &jd->jd_flags)) + continue; + + if (!error && lh.lh_hash == hash) { + if (lh.lh_sequence > jd->jd_jhead.lh_sequence) + jd->jd_jhead = lh; + else + goto found; + } + + if (last) { + found: + clear_bit(JDF_JHEAD, &jd->jd_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&jd->jd_flags, JDF_JHEAD); + } + } + + bio_put(bio); + + if (atomic_dec_and_test(&sdp->sd_log_in_flight)) + wake_up(&sdp->sd_log_flush_wait); +} + /** * gfs2_log_flush_bio - Submit any pending log bio * @sdp: The superblock @@ -259,7 +314,8 @@ void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw) * Returns: Newly allocated bio */ -static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) +static struct
[Cluster-devel] [RFC PATCH 1/3] gfs2: allow map_journal_extents() to take a journal descriptor as argument
This function now maps the extents for the journal whose descriptor is passed in as argument. Signed-off-by: Abhi Das --- fs/gfs2/log.h| 1 + fs/gfs2/ops_fstype.c | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 92dcbe7..19c93df 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -75,4 +75,5 @@ extern int gfs2_logd(void *data); extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); extern void gfs2_write_revokes(struct gfs2_sbd *sdp); +extern int map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd); #endif /* __LOG_DOT_H__ */ diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 228f38e..cf3e366 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -524,9 +524,8 @@ out: * but since it's only done at mount time, I'm not worried about the * time it takes. */ -static int map_journal_extents(struct gfs2_sbd *sdp) +int map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) { - struct gfs2_jdesc *jd = sdp->sd_jdesc; unsigned int lb; u64 db, prev_db; /* logical block, disk block, prev disk block */ struct gfs2_inode *ip = GFS2_I(jd->jd_inode); @@ -772,7 +771,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); /* Map the extents for this journal's blocks */ - map_journal_extents(sdp); + map_journal_extents(sdp, sdp->sd_jdesc); } trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free)); -- 2.4.11
[Cluster-devel] [RFC PATCH 0/3] Speed up journal head lookup
This patchset uses bio(s) to read in the journal in large chunks to look up the journal head. This will replace the existing bisect method of reading the journal block-by-block to find the head. These are RHEL7 patches because it was simpler for me to test. Upstream versions shouldn't be very dissimilar. I'll port once I get some feedback on this. I'm seeing some promising results with some preliminary testing. I have 5 lock_nolock filesystems saved off using 'gfs2_edit savemeta' after killing the node mid-IO so as to save unrecovered journals. All times are in milliseconds. The table below shows the overall time taken for recovery and the jhead column represents the time taken to find the jhead alone. Finding the jhead is a big chunk of the overall time taken for recovery. bisect | bio overall jhead | overall jhead --|--- fs1 1702 1642 | 394 325 fs2 761 334 | 706 356 fs3 1150 904 | 1356 1089 fs4 953 596 | 691 343 fs5 1262 1233 | 554 524 Abhi Das (3): gfs2: allow map_journal_extents() to take a journal descriptor as argument gfs2: add timing info for various stages of journal recovery gfs2: read journal in large chunks to locate the head fs/gfs2/incore.h | 7 ++- fs/gfs2/log.c| 2 +- fs/gfs2/log.h| 2 + fs/gfs2/lops.c | 115 +++--- fs/gfs2/lops.h | 1 + fs/gfs2/ops_fstype.c | 10 ++-- fs/gfs2/recovery.c | 138 +-- fs/gfs2/recovery.h | 1 + 8 files changed, 149 insertions(+), 127 deletions(-) -- 2.4.11
[Cluster-devel] [PATCH] gfs2: time journal recovery steps accurately
This patch spits out the time taken by the various steps in the journal recovery process. Previously, the journal recovery time didn't account for finding the journal head in the log, which takes up a significant portion of time. Signed-off-by: Abhi Das --- fs/gfs2/recovery.c | 20 ++-- 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index b6b2589..d8b622c 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -409,12 +410,13 @@ void gfs2_recover_func(struct work_struct *work) struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct gfs2_log_header_host head; struct gfs2_holder j_gh, ji_gh, thaw_gh; - unsigned long t; + ktime_t t_start, t_jlck, t_jhd, t_tlck, t_rep; int ro = 0; unsigned int pass; int error; int jlocked = 0; + t_start = ktime_get(); if (sdp->sd_args.ar_spectator || (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) { fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", @@ -446,6 +448,7 @@ void gfs2_recover_func(struct work_struct *work) fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid); } + t_jlck = ktime_get(); fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid); error = gfs2_jdesc_check(jd); @@ -455,13 +458,12 @@ void gfs2_recover_func(struct work_struct *work) error = gfs2_find_jhead(jd, &head); if (error) goto fail_gunlock_ji; + t_jhd = ktime_get(); if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { fs_info(sdp, "jid=%u: Acquiring the transaction lock...\n", jd->jd_jid); - t = jiffies; - /* Acquire a shared hold on the freeze lock */ error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, @@ -495,6 +497,7 @@ void gfs2_recover_func(struct work_struct *work) goto fail_gunlock_thaw; } + t_tlck = ktime_get(); fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid); for (pass = 0; pass < 2; pass++) { @@ -509,9 +512,14 @@ void gfs2_recover_func(struct work_struct *work) clean_journal(jd, 
&head); gfs2_glock_dq_uninit(&thaw_gh); - t = DIV_ROUND_UP(jiffies - t, HZ); - fs_info(sdp, "jid=%u: Journal replayed in %lus\n", - jd->jd_jid, t); + t_rep = ktime_get(); + fs_info(sdp, "jid=%u: Journal replayed in %lldms [jlck:%lldms, " + "jhead:%lldms, tlck:%lldms, replay:%lldms]\n", + jd->jd_jid, ktime_ms_delta(t_rep, t_start), + ktime_ms_delta(t_jlck, t_start), + ktime_ms_delta(t_jhd, t_jlck), + ktime_ms_delta(t_tlck, t_jhd), + ktime_ms_delta(t_rep, t_tlck)); } gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); -- 2.4.11
[Cluster-devel] [GFS2 PATCH] gfs2: Remove inode from ordered write list in gfs2_write_inode()
The vfs clears the I_DIRTY inode flag before calling gfs2_write_inode() having queued any data that needed to be written to disk. This is a good time to remove such inodes from our ordered write list so they don't hang around for long periods of time. Signed-off-by: Abhi Das --- fs/gfs2/super.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index d81d46e..596feb6 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -766,6 +766,12 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) ret = filemap_fdatawait(metamapping); if (ret) mark_inode_dirty_sync(inode); + else { + spin_lock(&inode->i_lock); + if (!(inode->i_flags & I_DIRTY)) + gfs2_ordered_del_inode(ip); + spin_unlock(&inode->i_lock); + } return ret; } -- 2.4.11
[Cluster-devel] [GFS2 PATCH] gfs2: Trim the ordered write list in gfs2_ordered_write()
We iterate through the entire ordered writes list in gfs2_ordered_write() to write out inodes. It's a good place to try and shrink the list by throwing out inodes that don't have any pages. Signed-off-by: Abhi Das --- fs/gfs2/log.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index f72c442..ab18d2c 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -538,9 +538,12 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp) list_sort(NULL, &sdp->sd_log_le_ordered, &ip_cmp); while (!list_empty(&sdp->sd_log_le_ordered)) { ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered); - list_move(&ip->i_ordered, &written); - if (ip->i_inode.i_mapping->nrpages == 0) + if (ip->i_inode.i_mapping->nrpages == 0) { + test_and_clear_bit(GIF_ORDERED, &ip->i_flags); + list_del(&ip->i_ordered); continue; + } + list_move(&ip->i_ordered, &written); spin_unlock(&sdp->sd_ordered_lock); filemap_fdatawrite(ip->i_inode.i_mapping); spin_lock(&sdp->sd_ordered_lock); -- 2.4.11
[Cluster-devel] [RFC RHEL7 GFS2 PATCH 2/3] gfs2: try to free empty mapping inodes from ordered list
Add a new function gfs2_ordered_shrink() that is called when syncfs is run. This function runs through the ordered list of inodes and removes the ones that don't have any pages in need of writing. Signed-off-by: Abhi Das --- fs/gfs2/log.c | 16 fs/gfs2/log.h | 1 + fs/gfs2/super.c | 4 +++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 0257704..6d618a1 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -537,6 +537,22 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp) spin_unlock(&sdp->sd_ordered_lock); } +void gfs2_ordered_shrink(struct gfs2_sbd *sdp, int whence) +{ + struct gfs2_inode *ip, *tmp; + + spin_lock(&sdp->sd_ordered_lock); + list_for_each_entry_safe(ip, tmp, &sdp->sd_log_le_ordered, i_ordered) { + if (ip->i_inode.i_mapping->nrpages != 0) + continue; + if (test_and_clear_bit(GIF_ORDERED, &ip->i_flags)) { + list_del(&ip->i_ordered); + ord_stats_adjust(sdp, -1, whence); + } + } + spin_unlock(&sdp->sd_ordered_lock); +} + void gfs2_ordered_del_inode(struct gfs2_inode *ip, int whence) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index bf7729c..80c8861 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -90,6 +90,7 @@ static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) } } +extern void gfs2_ordered_shrink(struct gfs2_sbd *sdp, int whence); extern void gfs2_ordered_del_inode(struct gfs2_inode *ip, int whence); extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, unsigned int ssize); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index ee15a50..773e98d 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -948,8 +948,10 @@ static int gfs2_sync_fs(struct super_block *sb, int wait) struct gfs2_sbd *sdp = sb->s_fs_info; gfs2_quota_sync(sb, -1); - if (wait && sdp) + if (wait && sdp) { gfs2_log_flush(sdp, NULL); + gfs2_ordered_shrink(sdp, ORD_WHENCE_SYNCFS); + } return sdp->sd_log_error; } -- 2.4.11
[Cluster-devel] [RFC RHEL7 GFS2 PATCH 3/3] gfs2: ordered write list addendum patch
Trim the list in gfs2_ordered_write() as we run through it to write out inodes. Also attempt to remove an inode from the list after it is fsync'ed. Finally, call gfs2_ordered_write() in case we were not able to shrink the list in gfs2_ordered_shrink() in the hopes that it will eventually cause the list to shrink. --- fs/gfs2/file.c | 3 +++ fs/gfs2/incore.h | 1 + fs/gfs2/log.c| 13 +++-- fs/gfs2/log.h| 4 +++- fs/gfs2/quota.c | 4 ++-- 5 files changed, 20 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 757ec66..75f9ac0 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -697,6 +697,9 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, if (mapping->nrpages) ret = filemap_fdatawait_range(mapping, start, end); + if (!ret && !ret1) + gfs2_ordered_del_inode(ip, ORD_WHENCE_FSYNC); + return ret ? ret : ret1; } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 6fcad2a..93da360 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -661,6 +661,7 @@ struct ord_stats { unsigned long os_rm_trunc; unsigned long os_rm_evict; unsigned long os_rm_wait; + unsigned long os_rm_fsync; unsigned long os_rm_syncfs; unsigned long os_rm_write; unsigned long os_rm_setflags; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 6d618a1..4cfef47 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -507,9 +507,13 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp) list_sort(NULL, &sdp->sd_log_le_ordered, &ip_cmp); while (!list_empty(&sdp->sd_log_le_ordered)) { ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered); - list_move(&ip->i_ordered, &written); - if (ip->i_inode.i_mapping->nrpages == 0) + if (ip->i_inode.i_mapping->nrpages == 0) { + test_and_clear_bit(GIF_ORDERED, &ip->i_flags); + list_del(&ip->i_ordered); + ord_stats_adjust(sdp, -1, ORD_WHENCE_ORD_WRITE); continue; + } + list_move(&ip->i_ordered, &written); spin_unlock(&sdp->sd_ordered_lock); filemap_fdatawrite(ip->i_inode.i_mapping); 
spin_lock(&sdp->sd_ordered_lock); @@ -540,17 +544,22 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp) void gfs2_ordered_shrink(struct gfs2_sbd *sdp, int whence) { struct gfs2_inode *ip, *tmp; + bool removed; spin_lock(&sdp->sd_ordered_lock); list_for_each_entry_safe(ip, tmp, &sdp->sd_log_le_ordered, i_ordered) { if (ip->i_inode.i_mapping->nrpages != 0) continue; if (test_and_clear_bit(GIF_ORDERED, &ip->i_flags)) { + removed = true; list_del(&ip->i_ordered); ord_stats_adjust(sdp, -1, whence); } } spin_unlock(&sdp->sd_ordered_lock); + + if (!removed) + gfs2_ordered_write(sdp); } void gfs2_ordered_del_inode(struct gfs2_inode *ip, int whence) diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 80c8861..0bc3620 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -55,7 +55,8 @@ enum { ORD_WHENCE_SYNCFS = 3, ORD_WHENCE_ORD_WRITE= 4, ORD_WHENCE_SETFLAGS = 5, - ORD_WHENCE_ADD = 6, + ORD_WHENCE_FSYNC= 6, + ORD_WHENCE_ADD = 7, }; static inline void ord_stats_adjust(struct gfs2_sbd *sdp, int count, int whence) @@ -70,6 +71,7 @@ static inline void ord_stats_adjust(struct gfs2_sbd *sdp, int count, int whence) case ORD_WHENCE_SYNCFS: os->os_rm_syncfs += -(count); break; case ORD_WHENCE_ORD_WRITE: os->os_rm_write += -(count); break; case ORD_WHENCE_SETFLAGS: os->os_rm_setflags += -(count); break; + case ORD_WHENCE_FSYNC: os->os_rm_fsync += -(count); break; case ORD_WHENCE_ADD:os->os_add += count; break; default: break; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 66c5126..63e3afa 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1575,11 +1575,11 @@ int gfs2_quotad(void *data) printk(KERN_WARNING "Ord list Size:%lu +[add_inode:%lu] " "-[trunc:%lu evict:%lu wait:%lu syncfs:%lu ord_write:%lu" - " setflags:%lu]\n", sdp->sd_ord_stats.os_ct, + " setflags:%lu fsync:%lu]\n", sdp->sd_ord_stats.os_ct, sdp->sd_ord_stats.os_add, sdp->sd_ord_stats.os_rm_trunc, sdp->sd_ord_stats.os_rm_evict, sdp->sd_ord_stats.os_rm_wait, sdp->sd_ord_stats.os_rm_syncfs, 
sdp->sd_ord_stats.os_rm_write, - sdp->sd_ord_stats.os_rm_setflags); + sdp->sd_ord_stats.os_rm_setflags, sdp-
[Cluster-devel] [RFC RHEL7 GFS2 PATCH 1/3] gfs2: ordered list instrumentation
Keep stats on the size of the ordered list and keep track of where it's added to and removed from --- fs/gfs2/bmap.c | 2 +- fs/gfs2/file.c | 2 +- fs/gfs2/incore.h | 12 fs/gfs2/log.c| 7 +-- fs/gfs2/log.h| 35 +-- fs/gfs2/ops_fstype.c | 1 + fs/gfs2/quota.c | 8 fs/gfs2/super.c | 2 +- 8 files changed, 62 insertions(+), 7 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 8569bf3..d9720ce 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1504,7 +1504,7 @@ static int trunc_end(struct gfs2_inode *ip) ip->i_height = 0; ip->i_goal = ip->i_no_addr; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); - gfs2_ordered_del_inode(ip); + gfs2_ordered_del_inode(ip, ORD_WHENCE_TRUNC); } ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 31b5986..757ec66 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -284,7 +284,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) if (error) goto out; if (new_flags & GFS2_DIF_JDATA) - gfs2_ordered_del_inode(ip); + gfs2_ordered_del_inode(ip, ORD_WHENCE_SETFLAGS); } error = gfs2_trans_begin(sdp, RES_DINODE, 0); if (error) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 7e78d8a..6fcad2a 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -655,6 +655,17 @@ struct gfs2_pcpu_lkstats { struct gfs2_lkstats lkstats[10]; }; +struct ord_stats { + unsigned long os_ct; + unsigned long os_add; + unsigned long os_rm_trunc; + unsigned long os_rm_evict; + unsigned long os_rm_wait; + unsigned long os_rm_syncfs; + unsigned long os_rm_write; + unsigned long os_rm_setflags; +}; + struct gfs2_sbd { struct super_block *sd_vfs; struct gfs2_pcpu_lkstats __percpu *sd_lkstats; @@ -773,6 +784,7 @@ struct gfs2_sbd { struct list_head sd_log_le_revoke; struct list_head sd_log_le_ordered; + struct ord_stats sd_ord_stats; spinlock_t sd_ordered_lock; atomic_t sd_log_thresh1; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 
cc3f7d1..0257704 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -526,6 +526,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp) while (!list_empty(&sdp->sd_log_le_ordered)) { ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered); list_del(&ip->i_ordered); + ord_stats_adjust(sdp, -1, ORD_WHENCE_WAIT); WARN_ON(!test_and_clear_bit(GIF_ORDERED, &ip->i_flags)); if (ip->i_inode.i_mapping->nrpages == 0) continue; @@ -536,13 +537,15 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp) spin_unlock(&sdp->sd_ordered_lock); } -void gfs2_ordered_del_inode(struct gfs2_inode *ip) +void gfs2_ordered_del_inode(struct gfs2_inode *ip, int whence) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); spin_lock(&sdp->sd_ordered_lock); - if (test_and_clear_bit(GIF_ORDERED, &ip->i_flags)) + if (test_and_clear_bit(GIF_ORDERED, &ip->i_flags)) { list_del(&ip->i_ordered); + ord_stats_adjust(sdp, -1, whence); + } spin_unlock(&sdp->sd_ordered_lock); } diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 3721663..bf7729c 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -48,18 +48,49 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, sdp->sd_log_head = sdp->sd_log_tail = value; } +enum { + ORD_WHENCE_TRUNC= 0, + ORD_WHENCE_EVICT= 1, + ORD_WHENCE_WAIT = 2, + ORD_WHENCE_SYNCFS = 3, + ORD_WHENCE_ORD_WRITE= 4, + ORD_WHENCE_SETFLAGS = 5, + ORD_WHENCE_ADD = 6, +}; + +static inline void ord_stats_adjust(struct gfs2_sbd *sdp, int count, int whence) +{ + struct ord_stats *os = &sdp->sd_ord_stats; + + os->os_ct += count; + switch (whence) { + case ORD_WHENCE_TRUNC: os->os_rm_trunc += -(count); break; + case ORD_WHENCE_EVICT: os->os_rm_evict += -(count); break; + case ORD_WHENCE_WAIT: os->os_rm_wait += -(count); break; + case ORD_WHENCE_SYNCFS: os->os_rm_syncfs += -(count); break; + case ORD_WHENCE_ORD_WRITE: os->os_rm_write += -(count); break; + case ORD_WHENCE_SETFLAGS: os->os_rm_setflags += -(count); break; + + case ORD_WHENCE_ADD:os->os_add += count; break; + 
default: break; + } +} + static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_ino
[Cluster-devel] [RFC RHEL7 GFS2 PATCH 0/3] Trimming the ordered write inode list
This is a RHEL7 patchset to address bz 1511599. The first patch in this series is just an instrumentation patch to assess the impact of the subsequent patches on the ordered write list. It is not intended to be merged into the code base. The second patch adds a function gfs2_ordered_shrink() which attempts to remove inodes from the ordered list that don't have any pages in need of writing. This function is called from gfs2_sync_fs(). The third patch does a few things: - Call gfs2_ordered_write() in case we didn't trim the ordered write list in gfs2_ordered_shrink(). - Have gfs2_ordered_write() trim the list of inodes that don't have pages that need to be written. Should happen every time on a log flush when gfs2_log_flush() calls gfs2_ordered_write() - Call gfs2_ordered_del_inode() from gfs2_fsync() Other ideas: - Change gfs2_ordered_write() so that it can work on a subset of the ordered list. That way, we can call it to write a percentage (or N inodes), wait for the writes to complete and then remove those inodes from the list. Abhi Das (3): gfs2: ordered list instrumentation gfs2: try to free empty mapping inodes from ordered list gfs2: ordered write list addendum patch fs/gfs2/bmap.c | 2 +- fs/gfs2/file.c | 5 - fs/gfs2/incore.h | 13 + fs/gfs2/log.c| 36 fs/gfs2/log.h| 38 -- fs/gfs2/ops_fstype.c | 1 + fs/gfs2/quota.c | 8 fs/gfs2/super.c | 6 -- 8 files changed, 99 insertions(+), 10 deletions(-) -- 2.4.11
[Cluster-devel] [PATCH] gfs2: forcibly flush ail to relieve memory pressure
On systems with low memory, it is possible for gfs2 to infinitely loop in balance_dirty_pages() under heavy IO (creating sparse files). balance_dirty_pages() attempts to write out the dirty pages via gfs2_writepages() but none are found because these dirty pages are being used by the journaling code in the ail. Normally, the journal has an upper threshold which when hit triggers an automatic flush of the ail. But this threshold can be higher than the number of allowable dirty pages and result in the ail never being flushed. This patch forces an ail flush when gfs2_writepages() fails to write anything. This is a good indication that the ail might be holding some dirty pages. Resolves: rhbz#1389079 Signed-off-by: Abhi Das --- fs/gfs2/aops.c | 14 +- fs/gfs2/incore.h | 1 + fs/gfs2/log.c| 4 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index ed7a2e2..68ed069 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -234,7 +234,19 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc static int gfs2_writepages(struct address_space *mapping, struct writeback_control *wbc) { - return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); + struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); + int ret = mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); + + /* +* Even if we didn't write any pages here, we might still be holding +* dirty pages in the ail. We forcibly flush the ail because we don't +* want balance_dirty_pages() to loop indefinitely trying to write out +* pages held in the ail that it can't find. 
+*/ + if (ret == 0) + set_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags); + + return ret; } /** diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 73fce76..a7b0331 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -606,6 +606,7 @@ enum { SDF_NOJOURNALID = 6, SDF_RORECOVERY = 7, /* read only recovery */ SDF_SKIP_DLM_UNLOCK = 8, + SDF_FORCE_AIL_FLUSH = 9, }; enum gfs2_freeze_state { diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 9a624f6..31585c2 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -898,6 +898,10 @@ static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) { unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free); + + if (test_and_clear_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags)) + return 1; + return used_blocks + atomic_read(&sdp->sd_log_blks_needed) >= atomic_read(&sdp->sd_log_thresh2); } -- 2.4.11
[Cluster-devel] [PATCH] fs: Do not check for valid page->mapping in page_cache_pipe_buf_confirm
If the page is truncated after being spliced into the pipe, it's probably not invalid. For filesystems that invalidate pages, we used to return -ENODATA even though the data is there; it is just possibly different from what was spliced into the pipe. We shouldn't have to throw away the buffer or return an error in this case. Signed-off-by: Abhi Das CC: Miklos Szeredi CC: Jens Axboe CC: Al Viro --- fs/splice.c | 9 - 1 file changed, 9 deletions(-) diff --git a/fs/splice.c b/fs/splice.c index dd9bf7e..b9899b99 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -106,15 +106,6 @@ static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, lock_page(page); /* -* Page got truncated/unhashed. This will cause a 0-byte -* splice, if this is the first page. -*/ - if (!page->mapping) { - err = -ENODATA; - goto error; - } - - /* * Uh oh, read-error from disk. */ if (!PageUptodate(page)) { -- 2.4.3
[Cluster-devel] [GFS2 PATCH] gfs2: use inode_lock/unlock instead of accessing i_mutex directly
i_mutex has been replaced by i_rwsem and directly accessing the non-existent i_mutex breaks the kernel build. Signed-off-by: Abhi Das CC: Stephen Rothwell CC: Al Viro --- fs/gfs2/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index f33fd92..374dd53 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -960,16 +960,16 @@ static ssize_t gfs2_file_splice_read(struct file *in, loff_t *ppos, struct gfs2_holder gh; int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } gfs2_glock_dq_uninit(&gh); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return generic_file_splice_read(in, ppos, pipe, len, flags); } -- 2.4.3
[Cluster-devel] [GFS2 PATCH] gfs2: Use gfs2 wrapper to sync inode before calling generic_file_splice_read()
gfs2_file_splice_read() f_op grabs and releases the cluster-wide inode glock to sync the inode size to the latest. Without this, generic_file_splice_read() uses an older i_size value and can return EOF for valid offsets in the inode. Resolves: rhbz#1300756 Signed-off-by: Abhi Das --- fs/gfs2/file.c | 28 ++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c9384f9..fd9a10e 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -950,6 +950,30 @@ out_uninit: return ret; } +static ssize_t gfs2_file_splice_read(struct file *in, loff_t *ppos, +struct pipe_inode_info *pipe, size_t len, +unsigned int flags) +{ + struct inode *inode = in->f_mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + mutex_lock(&inode->i_mutex); + + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (ret) { + mutex_unlock(&inode->i_mutex); + return ret; + } + + gfs2_glock_dq_uninit(&gh); + mutex_unlock(&inode->i_mutex); + + return generic_file_splice_read(in, ppos, pipe, len, flags); +} + + static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) @@ -1112,7 +1136,7 @@ const struct file_operations gfs2_file_fops = { .fsync = gfs2_fsync, .lock = gfs2_lock, .flock = gfs2_flock, - .splice_read= generic_file_splice_read, + .splice_read= gfs2_file_splice_read, .splice_write = gfs2_file_splice_write, .setlease = simple_nosetlease, .fallocate = gfs2_fallocate, @@ -1140,7 +1164,7 @@ const struct file_operations gfs2_file_fops_nolock = { .open = gfs2_open, .release= gfs2_release, .fsync = gfs2_fsync, - .splice_read= generic_file_splice_read, + .splice_read= gfs2_file_splice_read, .splice_write = gfs2_file_splice_write, .setlease = generic_setlease, .fallocate = gfs2_fallocate, -- 2.4.3
[Cluster-devel] [GFS2 PATCH] gfs2: Automatically set GFS2_DIF_SYSTEM flag on system files
When new files and directories are created inside a parent directory we automatically inherit the GFS2_DIF_SYSTEM flag (if set) and assign it to the new file/dirs. All new system files/dirs created in the metafs by, say gfs2_jadd, will have this flag set because they will have parent directories in the metafs whose GFS2_DIF_SYSTEM flag has already been set (most likely by a previous mkfs.gfs2) Resolves: rhbz#1272086 Signed-off-by: Abhi Das --- fs/gfs2/file.c | 4 ++-- fs/gfs2/inode.c | 5 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 71cd138..63fe948 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -298,9 +298,9 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr) gfsflags &= ~GFS2_DIF_TOPDIR; if (gfsflags & GFS2_DIF_INHERIT_JDATA) gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); - return do_gfs2_set_flags(filp, gfsflags, ~0); + return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_SYSTEM); } - return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); + return do_gfs2_set_flags(filp, gfsflags, ~(GFS2_DIF_SYSTEM | GFS2_DIF_JDATA)); } static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 063fdfc..2c05bc3 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -685,6 +685,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, ip->i_entries = 2; break; } + + /* Force SYSTEM flag on all files and subdirs of a SYSTEM directory */ + if (dip->i_diskflags & GFS2_DIF_SYSTEM) + ip->i_diskflags |= GFS2_DIF_SYSTEM; + gfs2_set_inode_flags(inode); if ((GFS2_I(d_inode(sdp->sd_root_dir)) == dip) || -- 2.4.3
[Cluster-devel] [GFS2 PATCH] gfs2: allow userspace to set GFS2_DIF_SYSTEM using FS_RESERVED_FL
Repurpose the existing VFS FS_RESERVED_FL flag to set GFS2_DIF_SYSTEM using the FS_IOC_SETFLAGS ioctl. Resolves: rhbz#1272086 Signed-off-by: Abhi Das --- fs/gfs2/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 71cd138..ead0d0a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -137,6 +137,7 @@ static const u32 fsflags_to_gfs2[32] = { [12] = GFS2_DIF_EXHASH, [14] = GFS2_DIF_INHERIT_JDATA, [17] = GFS2_DIF_TOPDIR, + [31] = GFS2_DIF_SYSTEM, }; static const u32 gfs2_to_fsflags[32] = { @@ -147,6 +148,7 @@ static const u32 gfs2_to_fsflags[32] = { [gfs2fl_ExHash] = FS_INDEX_FL, [gfs2fl_TopLevel] = FS_TOPDIR_FL, [gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL, + [gfs2fl_System] = FS_RESERVED_FL, }; static int gfs2_get_flags(struct file *filp, u32 __user *ptr) -- 2.4.3
[Cluster-devel] [GFS2] gfs2-utils: make gfs2_jadd set GFS2_DIF_SYSTEM flag on system inodes
Reuse the VFS FS_RESERVED_FL to set the GFS2_DIF_SYSTEM flag on gfs2 metafs inodes (journals and such) created by gfs2_jadd. A companion kernel patch that actually does the mapping from the VFS flag to GFS2 is required in order for this to work correctly. Resolves: rhbz#1272086 Signed-off-by: Abhi Das --- gfs2/mkfs/main_jadd.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/gfs2/mkfs/main_jadd.c b/gfs2/mkfs/main_jadd.c index de5d9c5..538d820 100644 --- a/gfs2/mkfs/main_jadd.c +++ b/gfs2/mkfs/main_jadd.c @@ -274,7 +274,7 @@ static void add_ir(struct jadd_opts *opts) { struct gfs2_inum_range ir; - set_flags(fd, JA_FL_SET, FS_JOURNAL_DATA_FL); + set_flags(fd, JA_FL_SET, FS_JOURNAL_DATA_FL|FS_RESERVED_FL); memset(&ir, 0, sizeof(struct gfs2_inum_range)); if (write(fd, (void*)&ir, sizeof(struct gfs2_inum_range)) != sizeof(struct gfs2_inum_range)) { @@ -303,7 +303,7 @@ static void add_sc(struct jadd_opts *opts) { struct gfs2_statfs_change sc; - set_flags(fd, JA_FL_SET, FS_JOURNAL_DATA_FL); + set_flags(fd, JA_FL_SET, FS_JOURNAL_DATA_FL|FS_RESERVED_FL); memset(&sc, 0, sizeof(struct gfs2_statfs_change)); if (write(fd, (void*)&sc, sizeof(struct gfs2_statfs_change)) != @@ -341,6 +341,7 @@ static void add_qc(struct gfs2_sbd *sdp, struct jadd_opts *opts) dummy_bh.b_data = buf; set_flags(fd, JA_FL_CLEAR, FS_JOURNAL_DATA_FL); + set_flags(fd, JA_FL_SET, FS_RESERVED_FL); memset(buf, 0, sdp->bsize); for (x=0; xbsize); for (x=0; xbsize) != sdp->bsize) { -- 2.4.3
[Cluster-devel] [GFS2 PATCH] gfs2: s64 cast for negative quota value
One-line fix to cast quota value to s64 before comparison. By default the quantity is treated as u64. Signed-off-by: Abhi Das --- fs/gfs2/quota.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index dcd598a..c2607a2 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -798,7 +798,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, loc -= sizeof(q); /* gfs2_internal_read would've advanced the loc ptr */ err = -EIO; be64_add_cpu(&q.qu_value, change); - if (be64_to_cpu(q.qu_value) < 0) + if (((s64)be64_to_cpu(q.qu_value)) < 0) q.qu_value = 0; /* Never go negative on quota usage */ qd->qd_qb.qb_value = q.qu_value; if (fdq) { -- 1.8.1.4
[Cluster-devel] [GFS2 PATCH] gfs2: Don't support fallocate on jdata files
We cannot provide an efficient implementation due to the headers on the data blocks, so there doesn't seem much point in having it. Resolves: rhbz#1221331 Signed-off-by: Abhi Das --- fs/gfs2/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c706c6d..8252115 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -917,7 +917,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le struct gfs2_holder gh; int ret; - if (mode & ~FALLOC_FL_KEEP_SIZE) + if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip)) return -EOPNOTSUPP; mutex_lock(&inode->i_mutex); -- 1.8.1.4
[Cluster-devel] [GFS2 v2 PATCH 1/2] gfs2: fix quota updates on block boundaries
For smaller block sizes (512B, 1K, 2K), some quotas straddle block boundaries such that the usage value is on one block and the rest of the quota is on the previous block. In such cases, the value does not get updated correctly. This patch fixes that by addressing the boundary conditions correctly. This patch also adds a (s64) cast that was missing in a call to gfs2_quota_change() in inode.c Resolves: rhbz#1174295 Signed-off-by: Abhi Das --- fs/gfs2/inode.c | 2 +- fs/gfs2/quota.c | 197 +--- 2 files changed, 119 insertions(+), 80 deletions(-) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 4d809eb..a088e54 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1889,7 +1889,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { - gfs2_quota_change(ip, -ap.target, ouid, ogid); + gfs2_quota_change(ip, -(s64)ap.target, ouid, ogid); gfs2_quota_change(ip, ap.target, nuid, ngid); } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 5c27e48..7c20031 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -652,6 +652,112 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) mutex_unlock(&sdp->sd_quota_mutex); } +static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, + unsigned off, void *buf, unsigned bytes) +{ + struct inode *inode = &ip->i_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct address_space *mapping = inode->i_mapping; + struct page *page; + struct buffer_head *bh; + void *kaddr; + u64 blk; + unsigned bsize = sdp->sd_sb.sb_bsize, bnum = 0, boff = 0; + unsigned to_write = bytes, pg_off = off; + int done = 0; + + blk = index << (PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift); + boff = off % bsize; + + page = find_or_create_page(mapping, index, GFP_NOFS); + if (!page) + return -ENOMEM; + if (!page_has_buffers(page)) + create_empty_buffers(page, bsize, 0); + + bh = page_buffers(page); + while (!done) { + /* Find the beginning 
block within the page */ + if (pg_off >= ((bnum * bsize) + bsize)) { + bh = bh->b_this_page; + bnum++; + blk++; + continue; + } + if (!buffer_mapped(bh)) { + gfs2_block_map(inode, blk, bh, 1); + if (!buffer_mapped(bh)) + goto unlock_out; + /* If it's a newly allocated disk block, zero it */ + if (buffer_new(bh)) + zero_user(page, bnum * bsize, bh->b_size); + } + if (PageUptodate(page)) + set_buffer_uptodate(bh); + if (!buffer_uptodate(bh)) { + ll_rw_block(READ | REQ_META, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + goto unlock_out; + } + gfs2_trans_add_data(ip->i_gl, bh); + + /* If we need to write to the next block as well */ + if (to_write > (bsize - boff)) { + pg_off += (bsize - boff); + to_write -= (bsize - boff); + boff = pg_off % bsize; + continue; + } + done = 1; + } + + /* Write to the page, now that we have setup the buffer(s) */ + kaddr = kmap_atomic(page); + memcpy(kaddr + off, buf, bytes); + flush_dcache_page(page); + kunmap_atomic(kaddr); + unlock_page(page); + page_cache_release(page); + + return 0; + +unlock_out: + unlock_page(page); + page_cache_release(page); + return -EIO; +} + +static int gfs2_write_disk_quota(struct gfs2_inode *ip, struct gfs2_quota *qp, +loff_t loc) +{ + unsigned long pg_beg; + unsigned pg_off, nbytes, overflow = 0; + int pg_oflow = 0, error; + void *ptr; + + nbytes = sizeof(struct gfs2_quota); + + pg_beg = loc >> PAGE_CACHE_SHIFT; + pg_off = loc % PAGE_CACHE_SIZE; + + /* If the quota straddles a page boundary, split the write in two */ + if ((pg_off + nbytes) > PAGE_CACHE_SIZE) { + pg_oflow = 1; + overflow = (pg_off + nbytes) - PAGE_CACHE_SIZE; + } + + ptr = qp; + error = gfs2_write_buf_to_page(ip, pg_beg, pg_off, ptr, + nbytes - overflow); + /* If there's an overflo
[Cluster-devel] [GFS2 v2 PATCH 2/2] gfs2: limit quota log messages
This patch makes the quota subsystem report only once that a particular user/group has exceeded their allotted quota. Previously, it was possible for a program to repeatedly try to exceed its quota (despite receiving EDQUOT) and in turn trigger gfs2 to issue a kernel log message about the quota being exceeded each time. In theory, this could get out of hand and flood the log and the filesystem hosting the log files. Resolves: rhbz#1174295 Signed-off-by: Abhi Das --- fs/gfs2/incore.h | 1 + fs/gfs2/quota.c | 15 +++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 58b75ab..304a223 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -432,6 +432,7 @@ enum { QDF_CHANGE = 1, QDF_LOCKED = 2, QDF_REFRESH = 3, + QDF_QMSG_QUIET = 4, }; struct gfs2_quota_data { diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 7c20031..dcd598a 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -649,6 +649,8 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) slot_hold(qd); } + if (change < 0) /* Reset quiet flag if we freed some blocks */ + clear_bit(QDF_QMSG_QUIET, &qd->qd_flags); mutex_unlock(&sdp->sd_quota_mutex); } @@ -1187,10 +1189,13 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, /* If no min_target specified or we don't meet * min_target, return -EDQUOT */ if (!ap->min_target || ap->min_target > ap->allowed) { - print_message(qd, "exceeded"); - quota_send_warning(qd->qd_id, - sdp->sd_vfs->s_dev, - QUOTA_NL_BHARDWARN); + if (!test_and_set_bit(QDF_QMSG_QUIET, + &qd->qd_flags)) { + print_message(qd, "exceeded"); + quota_send_warning(qd->qd_id, + sdp->sd_vfs->s_dev, + QUOTA_NL_BHARDWARN); + } error = -EDQUOT; break; } @@ -1685,6 +1690,8 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, /* Apply changes */ error = gfs2_adjust_quota(ip, offset, 0, qd, fdq); + if (!error) + clear_bit(QDF_QMSG_QUIET, &qd->qd_flags); gfs2_trans_end(sdp); out_release: -- 1.8.1.4
[Cluster-devel] [GFS2 v2 PATCH 0/2] More quota fixes
This is a follow-up set of patches to fix the issues discovered during testing of my previous set that reworked significant portions of the quota code. It also incorporates some of the fixes Bob suggested earlier. Abhi Das (2): gfs2: fix quota updates on block boundaries gfs2: limit quota log messages fs/gfs2/incore.h | 1 + fs/gfs2/inode.c | 2 +- fs/gfs2/quota.c | 212 +-- 3 files changed, 131 insertions(+), 84 deletions(-) -- 1.8.1.4
[Cluster-devel] [GFS2 PATCH 1/2] gfs2: fix quota updates on block boundaries
For smaller block sizes (512B, 1K, 2K), some quotas straddle block boundaries such that the usage value is on one block and the rest of the quota is on the previous block. In such cases, the value does not get updated correctly. This patch fixes that by addressing the boundary conditions correctly. This patch also adds a (s64) cast that was missing in a call to gfs2_quota_change() in inode.c Resolves: rhbz#1174295 Signed-off-by: Abhi Das --- fs/gfs2/inode.c | 2 +- fs/gfs2/quota.c | 197 +--- 2 files changed, 119 insertions(+), 80 deletions(-) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 4d809eb..a088e54 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1889,7 +1889,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { - gfs2_quota_change(ip, -ap.target, ouid, ogid); + gfs2_quota_change(ip, -(s64)ap.target, ouid, ogid); gfs2_quota_change(ip, ap.target, nuid, ngid); } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 5c27e48..01f4d40 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -652,6 +652,112 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) mutex_unlock(&sdp->sd_quota_mutex); } +static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, + unsigned off, void *buf, unsigned bytes) +{ + struct inode *inode = &ip->i_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct address_space *mapping = inode->i_mapping; + struct page *page; + struct buffer_head *bh; + void *kaddr; + unsigned long fs_blk; + unsigned bsize = sdp->sd_sb.sb_bsize, bnum = 0, boff = 0; + unsigned to_write = bytes, pg_off = off; + int done = 0; + + fs_blk = index << (PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift); + boff = off % bsize; + + page = find_or_create_page(mapping, index, GFP_NOFS); + if (!page) + return -ENOMEM; + if (!page_has_buffers(page)) + create_empty_buffers(page, bsize, 0); + + bh = page_buffers(page); + while (!done) { + /* Find 
the beginning block within the page */ + if (pg_off >= ((bnum * bsize) + bsize)) { + bh = bh->b_this_page; + bnum++; + fs_blk++; + continue; + } + if (!buffer_mapped(bh)) { + gfs2_block_map(inode, fs_blk, bh, 1); + if (!buffer_mapped(bh)) + goto unlock_out; + /* If it's a newly allocated disk block, zero it */ + if (buffer_new(bh)) + zero_user(page, bnum * bsize, bh->b_size); + } + if (PageUptodate(page)) + set_buffer_uptodate(bh); + if (!buffer_uptodate(bh)) { + ll_rw_block(READ | REQ_META, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + goto unlock_out; + } + gfs2_trans_add_data(ip->i_gl, bh); + + /* If we need to write to the next block as well */ + if (to_write > (bsize - boff)) { + pg_off += (bsize - boff); + to_write -= (bsize - boff); + boff = pg_off % bsize; + continue; + } + done = 1; + } + + /* Write to the page, now that we have setup the buffer(s) */ + kaddr = kmap_atomic(page); + memcpy(kaddr + off, buf, bytes); + flush_dcache_page(page); + kunmap_atomic(kaddr); + unlock_page(page); + page_cache_release(page); + + return 0; + +unlock_out: + unlock_page(page); + page_cache_release(page); + return -EIO; +} + +static int gfs2_write_disk_quota(struct gfs2_inode *ip, struct gfs2_quota *qp, +loff_t loc) +{ + unsigned long pg_beg; + unsigned pg_off, nbytes, overflow = 0; + int pg_oflow = 0, error; + void *ptr; + + nbytes = sizeof(struct gfs2_quota); + + pg_beg = loc >> PAGE_CACHE_SHIFT; + pg_off = loc % PAGE_CACHE_SIZE; + + /* If the quota straddles a page boundary, split the write in two */ + if ((pg_off + nbytes) > PAGE_CACHE_SIZE) { + pg_oflow = 1; + overflow = (pg_off + nbytes) - PAGE_CACHE_SIZE; + } + + ptr = qp; + error = gfs2_write_buf_to_page(ip, pg_beg, pg_off, ptr, + nbytes - overflow); + /* If t
[Cluster-devel] [GFS2 PATCH 2/2] gfs2: limit quota log messages
This patch makes the quota subsystem report only once that a particular user/group has exceeded their allotted quota. Previously, it was possible for a program to repeatedly try to exceed its quota (despite receiving EDQUOT) and in turn trigger gfs2 to issue a kernel log message about the quota being exceeded each time. In theory, this could get out of hand and flood the log and the filesystem hosting the log files. Resolves: rhbz#1174295 Signed-off-by: Abhi Das --- fs/gfs2/incore.h | 1 + fs/gfs2/quota.c | 13 + 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 58b75ab..304a223 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -432,6 +432,7 @@ enum { QDF_CHANGE = 1, QDF_LOCKED = 2, QDF_REFRESH = 3, + QDF_QMSG_QUIET = 4, }; struct gfs2_quota_data { diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 01f4d40..3dc13b53 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -649,6 +649,8 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) slot_hold(qd); } + if (change < 0) /* Reset quiet flag if we freed some blocks */ + clear_bit(QDF_QMSG_QUIET, &qd->qd_flags); mutex_unlock(&sdp->sd_quota_mutex); } @@ -1187,10 +1189,13 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, /* If no min_target specified or we don't meet * min_target, return -EDQUOT */ if (!ap->min_target || ap->min_target > ap->allowed) { - print_message(qd, "exceeded"); - quota_send_warning(qd->qd_id, - sdp->sd_vfs->s_dev, - QUOTA_NL_BHARDWARN); + if (!test_bit(QDF_QMSG_QUIET, &qd->qd_flags)) { + print_message(qd, "exceeded"); + quota_send_warning(qd->qd_id, + sdp->sd_vfs->s_dev, + QUOTA_NL_BHARDWARN); + set_bit(QDF_QMSG_QUIET, &qd->qd_flags); + } error = -EDQUOT; break; } -- 1.8.1.4
[Cluster-devel] [GFS2 PATCH 0/2] More quota fixes
This is a follow-up set of patches to fix the issues discovered during testing of my previous set that reworked significant portions of the quota code. Abhi Das (2): gfs2: fix quota updates on block boundaries gfs2: limit quota log messages fs/gfs2/incore.h | 1 + fs/gfs2/inode.c | 2 +- fs/gfs2/quota.c | 210 +-- 3 files changed, 129 insertions(+), 84 deletions(-) -- 1.8.1.4
[Cluster-devel] [GFS2] gfs2: handle NULL rgd in set_rgrp_preferences
The function set_rgrp_preferences() does not handle the (rarely returned) NULL value from gfs2_rgrpd_get_next() and this patch fixes that. The fs image in question is only 150MB in size which allows for only 1 rgrp to be created. The in-memory rb tree has only 1 node and when gfs2_rgrpd_get_next() is called on this sole rgrp, it returns NULL. (Default behavior is to wrap around the rb tree and return the first node to give the illusion of a circular linked list. In the case of only 1 rgrp, we can't have gfs2_rgrpd_get_next() return the same rgrp (first, last, next all point to the same rgrp)... that would cause unintended consequences and infinite loops.) Resolves: rhbz#1211663 Signed-off-by: Abhi Das --- fs/gfs2/rgrp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index cb27065..900e515 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -978,10 +978,10 @@ static void set_rgrp_preferences(struct gfs2_sbd *sdp) rgd->rd_flags |= GFS2_RDF_PREFERRED; for (i = 0; i < sdp->sd_journals; i++) { rgd = gfs2_rgrpd_get_next(rgd); - if (rgd == first) + if (!rgd || rgd == first) break; } - } while (rgd != first); + } while (rgd && rgd != first); } /** -- 1.8.1.4
[Cluster-devel] [GFS2] fsck.gfs2: replace recent i_goal fixes with simple logic
This patch reverses the recent set of i_goal fixes for fsck.gfs2. This is because of two problems. 1. It is not possible to determine if a valid block within the fs is the correct goal block for a given inode. 2. Conversely, given an inode, it is also not possible to accurately determine what its goal block should be. The previous patches assumed that the last block of a file is its goal block, but that is not true if the file is a directory or if its blocks are not allocated sequentially. fsck.gfs2 would flag these inodes incorrectly as having bad i_goal values. This patch takes a simple approach. It checks if the i_goal of a given inode is out of bounds of the fs. If so, we can be certain that it is wrong and we set it to the inode metadata block. This is a safe starting point for gfs2 to determine where to allocate from next. Resolves: rhbz#1186515 Signed-off-by: Abhi Das --- gfs2/fsck/metawalk.c | 92 +++--- gfs2/fsck/metawalk.h | 5 --- gfs2/fsck/pass1.c | 35 --- gfs2/libgfs2/libgfs2.h | 1 - 4 files changed, 34 insertions(+), 99 deletions(-) diff --git a/gfs2/fsck/metawalk.c b/gfs2/fsck/metawalk.c index f05fb51..4d5a660 100644 --- a/gfs2/fsck/metawalk.c +++ b/gfs2/fsck/metawalk.c @@ -1428,8 +1428,7 @@ static int build_and_check_metalist(struct gfs2_inode *ip, osi_list_t *mlp, */ static int check_data(struct gfs2_inode *ip, struct metawalk_fxns *pass, struct gfs2_buffer_head *bh, int head_size, - uint64_t *last_block, uint64_t *blks_checked, - uint64_t *error_blk) + uint64_t *blks_checked, uint64_t *error_blk) { int error = 0, rc = 0; uint64_t block, *ptr; @@ -1444,7 +1443,7 @@ static int check_data(struct gfs2_inode *ip, struct metawalk_fxns *pass, if (skip_this_pass || fsck_abort) return error; - *last_block = block = be64_to_cpu(*ptr); + block = be64_to_cpu(*ptr); /* It's important that we don't call valid_block() and bypass calling check_data on invalid blocks because that would defeat the rangecheck_block related functions in @@ -1548,15 +1547,12 @@ int 
check_metatree(struct gfs2_inode *ip, struct metawalk_fxns *pass) struct gfs2_buffer_head *bh; uint32_t height = ip->i_di.di_height; int i, head_size; - uint64_t blks_checked = 0, last_block = 0; + uint64_t blks_checked = 0; int error, rc; int metadata_clean = 0; uint64_t error_blk = 0; int hit_error_blk = 0; - if (!height && pass->check_i_goal) - pass->check_i_goal(ip, ip->i_di.di_num.no_addr, - pass->private); if (!height && !is_dir(&ip->i_di, ip->i_sbd->gfs1)) return 0; @@ -1575,9 +1571,6 @@ int check_metatree(struct gfs2_inode *ip, struct metawalk_fxns *pass) * comprise the directory hash table, so we perform the directory * checks and exit. */ if (is_dir(&ip->i_di, ip->i_sbd->gfs1)) { - last_block = ip->i_di.di_num.no_addr; - if (pass->check_i_goal) - pass->check_i_goal(ip, last_block, pass->private); if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) goto out; /* check validity of leaf blocks and leaf chains */ @@ -1604,7 +1597,7 @@ int check_metatree(struct gfs2_inode *ip, struct metawalk_fxns *pass) if (pass->check_data) error = check_data(ip, pass, bh, head_size, - &last_block, &blks_checked, &error_blk); + &blks_checked, &error_blk); if (pass->big_file_msg && ip->i_di.di_blocks > COMFORTABLE_BLKS) pass->big_file_msg(ip, blks_checked); } @@ -1616,8 +1609,6 @@ int check_metatree(struct gfs2_inode *ip, struct metawalk_fxns *pass) (unsigned long long)ip->i_di.di_num.no_addr); fflush(stdout); } - if (!error && pass->check_i_goal) - pass->check_i_goal(ip, last_block, pass->private); undo_metalist: if (!error) goto out; @@ -1958,80 +1949,6 @@ static int alloc_leaf(struct gfs2_inode *ip, uint64_t block, void *private) return 0; } -/** - * rgrp_contains_block - Check if the rgrp provided contains the - * given block. Taken directly from the gfs2 kernel code - * @rgd: The rgrp to search within - * @block: The block to search for - * - * Returns: 1 if present, 0 if not. - */ -static inline int rgrp_contains_block(struct rgrp_tree *rgd, uint64_
[Cluster-devel] [GFS2] gfs2: fix quota refresh race in do_glock()
quotad periodically syncs in-memory quotas to the on-disk quota file and sets the QDF_REFRESH flag so that a subsequent read of a synced quota is re-read from disk. gfs2_quota_lock() checks for this flag and sets a 'force' bit to force a re-read from disk if requested. However, there is a race condition here. It is possible for gfs2_quota_lock() to find the QDF_REFRESH flag unset (i.e. force=0) and for quotad to come in immediately afterwards, sync the relevant quota and set the QDF_REFRESH flag. gfs2_quota_lock() then resumes with force=0 and uses the stale in-memory quota usage values, which results in miscalculations. This patch fixes this race by moving the QDF_REFRESH flag check later in the gfs2_quota_lock() process, i.e. into do_glock(), under the protection of the quota glock. Resolves: rhbz#1174295 Signed-off-by: Abhi Das --- fs/gfs2/quota.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 5561468..5c27e48 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -923,6 +923,9 @@ restart: if (error) return error; + if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags)) + force_refresh = FORCE; + qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) { @@ -974,11 +977,8 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) sizeof(struct gfs2_quota_data *), sort_qd, NULL); for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { - int force = NO_FORCE; qd = ip->i_res->rs_qa_qd[x]; - if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags)) - force = FORCE; - error = do_glock(qd, force, &ip->i_res->rs_qa_qd_ghs[x]); + error = do_glock(qd, NO_FORCE, &ip->i_res->rs_qa_qd_ghs[x]); if (error) break; } -- 1.8.1.4
[Cluster-devel] gfs2_utils: more gfs2_convert i_goal fixes
The correct goal was only being set on files that are >= meta height 2. This patch fixes this and sets correct goal values for stuffed, height 1 and jdata files as well Resolves: rhbz#1186847 Signed-off-by: Abhi Das --- gfs2/convert/gfs2_convert.c | 18 -- 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/gfs2/convert/gfs2_convert.c b/gfs2/convert/gfs2_convert.c index 9a42985..bb4e3a2 100644 --- a/gfs2/convert/gfs2_convert.c +++ b/gfs2/convert/gfs2_convert.c @@ -435,9 +435,9 @@ static void jdata_mp_gfs1_to_gfs2(struct gfs2_sbd *sbp, int gfs1_h, int gfs2_h, } } -static void fix_jdatatree(struct gfs2_sbd *sbp, struct gfs2_inode *ip, - struct blocklist *blk, char *srcptr, - unsigned int size) +static uint64_t fix_jdatatree(struct gfs2_sbd *sbp, struct gfs2_inode *ip, + struct blocklist *blk, char *srcptr, + unsigned int size) { uint64_t block; struct gfs2_buffer_head *bh; @@ -499,6 +499,7 @@ static void fix_jdatatree(struct gfs2_sbd *sbp, struct gfs2_inode *ip, amount = size - copied; ptramt = 0; } + return block; } static int get_inode_metablocks(struct gfs2_sbd *sbp, struct gfs2_inode *ip, struct blocklist *blocks) @@ -620,7 +621,7 @@ static int fix_ind_reg_or_dir(struct gfs2_sbd *sbp, struct gfs2_inode *ip, uint3 blk->height -= di_height - gfs2_hgt; if (len) { fix_metatree(sbp, ip, blk, ptr1, len); - ip->i_di.di_goal_data = ip->i_di.di_goal_meta = be64_to_cpu(*ptr2); + ip->i_di.di_goal_meta = be64_to_cpu(*ptr2); } return 0; @@ -687,7 +688,8 @@ static int fix_ind_jdata(struct gfs2_sbd *sbp, struct gfs2_inode *ip, uint32_t d memcpy(&newblk->mp, &gfs2mp, sizeof(struct metapath)); newblk->height -= di_height - gfs2_hgt; if (len) - fix_jdatatree(sbp, ip, newblk, newblk->ptrbuf, len); + ip->i_di.di_goal_meta = fix_jdatatree(sbp, ip, newblk, + newblk->ptrbuf, len); free(newblk->ptrbuf); free(newblk); } @@ -705,12 +707,16 @@ static int adjust_indirect_blocks(struct gfs2_sbd *sbp, struct gfs2_inode *ip) int isdir = S_ISDIR(ip->i_di.di_mode); /* is always jdata */ 
int isjdata = ((GFS2_DIF_JDATA & ip->i_di.di_flags) && !isdir); int isreg = (!isjdata && !isdir); + int issys = (GFS2_DIF_SYSTEM & ip->i_di.di_flags); /* regular files and dirs are same upto height=2 jdata files (not dirs) are same only when height=0 */ if (((isreg||isdir) && ip->i_di.di_height <= 1) || - (isjdata && ip->i_di.di_height == 0)) + (isjdata && ip->i_di.di_height == 0)) { + if (!issys) + ip->i_di.di_goal_meta = ip->i_di.di_num.no_addr; return 0; /* nothing to do */ + } osi_list_init(&blocks.list); -- 1.8.1.4
[Cluster-devel] gfs2-utils: more fsck.gfs2 i_goal fixes
fsck.gfs2 doesn't traverse the metadata tree for dirs in pass1 to be able to get at the last allocated block for it and attempts to set it to the inode block itself when it finds the i_goal value to be outside of the current rgrp. This is not desirable and fsck.gfs2 should probably leave directories alone. This patch simply skips over directories whose goal blocks fall within the boundaries of the fs, assuming they are valid. Resolves: rhbz#1186515 Signed-off-by: Abhi Das --- gfs2/fsck/metawalk.c | 8 1 file changed, 8 insertions(+) diff --git a/gfs2/fsck/metawalk.c b/gfs2/fsck/metawalk.c index 6daaf9f..f05fb51 100644 --- a/gfs2/fsck/metawalk.c +++ b/gfs2/fsck/metawalk.c @@ -1995,6 +1995,14 @@ int check_i_goal(struct gfs2_inode *ip, uint64_t goal_blk, if (sdp->gfs1 || ip->i_di.di_flags & GFS2_DIF_SYSTEM || ip->i_di.di_goal_meta == i_block) return 0; + /* Don't fix directory goal blocks unless we know they're wrong. +* i.e. out of bounds of the fs. Directories can easily have blocks +* outside of the dinode's rgrp and thus we have no way of knowing +* if the goal block is bogus or not. */ + if (is_dir(&ip->i_di, ip->i_sbd->gfs1) && + (ip->i_di.di_goal_meta > sdp->sb_addr && +ip->i_di.di_goal_meta <= sdp->fssize)) + return 0; /* We default to the inode block */ if (!goal_blk) goal_blk = i_block; -- 1.8.1.4
[Cluster-devel] [GFS2 3/3] gfs2: allow fallocate to max out quotas/fs efficiently
We can quickly get an estimate of how many blocks are available for allocation restricted by quota and fs size respectively, using the ap->allowed field in the gfs2_alloc_parms structure. gfs2_quota_check() and gfs2_inplace_reserve() provide these values. Once we have the total number of blocks available to us, we can compute how many bytes of data can be written using those blocks instead of guessing inefficiently. Signed-off-by: Abhi Das --- fs/gfs2/file.c | 70 +++--- 1 file changed, 47 insertions(+), 23 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c569adb..4d31087 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -765,22 +765,30 @@ out: brelse(dibh); return error; } - -static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, - unsigned int *data_blocks, unsigned int *ind_blocks) +/** + * calc_max_reserv() - Reverse of write_calc_reserv. Given a number of + * blocks, determine how many bytes can be written. + * @ip: The inode in question. + * @len: Max cap of bytes. What we return in *len must be <= this. + * @data_blocks: Compute and return the number of data blocks needed + * @ind_blocks: Compute and return the number of indirect blocks needed + * @max_blocks: The total blocks available to work with. + * + * Returns: void, but @len, @data_blocks and @ind_blocks are filled in. 
+ */ +static void calc_max_reserv(struct gfs2_inode *ip, loff_t *len, + unsigned int *data_blocks, unsigned int *ind_blocks, + unsigned int max_blocks) { + loff_t max = *len; const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int max_blocks = ip->i_rgd->rd_free_clone; unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); for (tmp = max_data; tmp > sdp->sd_diptrs;) { tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); max_data -= tmp; } - /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, - so it might end up with fewer data blocks */ - if (max_data <= *data_blocks) - return; + *data_blocks = max_data; *ind_blocks = max_blocks - max_data; *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; @@ -797,7 +805,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_alloc_parms ap = { .aflags = 0, }; unsigned int data_blocks = 0, ind_blocks = 0, rblocks; - loff_t bytes, max_bytes; + loff_t bytes, max_bytes, max_blks = UINT_MAX; int error; const loff_t pos = offset; const loff_t count = len; @@ -819,6 +827,9 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t gfs2_size_hint(file, offset, len); + gfs2_write_calc_reserv(ip, PAGE_SIZE, &data_blocks, &ind_blocks); + ap.min_target = data_blocks + ind_blocks; + while (len > 0) { if (len < bytes) bytes = len; @@ -827,28 +838,41 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t offset += bytes; continue; } -retry: + + /* We need to determine how many bytes we can actually +* fallocate without exceeding quota or going over the +* end of the fs. We start off optimistically by assuming +* we can write max_bytes */ + max_bytes = (len > max_chunk_size) ? 
max_chunk_size : len; + + /* Since max_bytes is most likely a theoretical max, we +* calculate a more realistic 'bytes' to serve as a good +* starting point for the number of bytes we may be able +* to write */ gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); ap.target = data_blocks + ind_blocks; error = gfs2_quota_lock_check(ip, &ap); if (error) return error; + /* ap.allowed tells us how many blocks quota will allow +* us to write. Check if this reduces max_blks */ + if (ap.allowed && ap.allowed < max_blks) + max_blks = ap.allowed; + error = gfs2_inplace_reserve(ip, &ap); - if (error) { - if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { - bytes >>= 1; - bytes &= bsize_mask; - if (bytes == 0) -
[Cluster-devel] [GFS2 1/3] gfs2: perform quota checks against allocation parameters
Use struct gfs2_alloc_parms as an argument to gfs2_quota_check() and gfs2_quota_lock_check() to check for quota violations while accounting for the new blocks requested by the current operation in ap->target. Previously, the number of new blocks requested during an operation was not accounted for during quota_check, which allowed these operations to exceed quota. This was not very apparent since most operations allocated only 1 block at a time and quotas would get violated in the next operation, i.e. the quota excess would only be by 1 block or so. With fallocate (where we allocate a bunch of blocks at once), the quota excess is non-trivial and is addressed by this patch. Resolves: rhbz#1174295 Signed-off-by: Abhi Das --- fs/gfs2/aops.c | 6 +++--- fs/gfs2/bmap.c | 2 +- fs/gfs2/file.c | 15 --- fs/gfs2/incore.h | 2 +- fs/gfs2/inode.c | 18 ++ fs/gfs2/quota.c | 6 +++--- fs/gfs2/quota.h | 8 +--- fs/gfs2/xattr.c | 2 +- 8 files changed, 32 insertions(+), 27 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 4ad4f94..7bc5c82 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -671,12 +671,12 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, if (alloc_required) { struct gfs2_alloc_parms ap = { .aflags = 0, }; - error = gfs2_quota_lock_check(ip); + requested = data_blocks + ind_blocks; + ap.target = requested; + error = gfs2_quota_lock_check(ip, &ap); if (error) goto out_unlock; - requested = data_blocks + ind_blocks; - ap.target = requested; error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_qunlock; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index f0b945a..61296ec 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1224,7 +1224,7 @@ static int do_grow(struct inode *inode, u64 size) if (gfs2_is_stuffed(ip) && (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode { - error = gfs2_quota_lock_check(ip); + error = gfs2_quota_lock_check(ip, &ap); if (error) return error; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index
7353c0a..c569adb 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -429,11 +429,11 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret) goto out_unlock; - ret = gfs2_quota_lock_check(ip); - if (ret) - goto out_unlock; gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); ap.target = data_blocks + ind_blocks; + ret = gfs2_quota_lock_check(ip, &ap); + if (ret) + goto out_unlock; ret = gfs2_inplace_reserve(ip, &ap); if (ret) goto out_quota_unlock; @@ -827,13 +827,13 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t offset += bytes; continue; } - error = gfs2_quota_lock_check(ip); - if (error) - return error; retry: gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); - ap.target = data_blocks + ind_blocks; + + error = gfs2_quota_lock_check(ip, &ap); + if (error) + return error; error = gfs2_inplace_reserve(ip, &ap); if (error) { if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { @@ -841,6 +841,7 @@ retry: bytes &= bsize_mask; if (bytes == 0) bytes = sdp->sd_sb.sb_bsize; + gfs2_quota_unlock(ip); goto retry; } goto out_qunlock; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 7a2dbbc..3a4ea50 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -301,7 +301,7 @@ struct gfs2_blkreserv { * to the allocation code. */ struct gfs2_alloc_parms { - u32 target; + u64 target; u32 aflags; }; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 73c72253..08bc84d 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -382,7 +382,7 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks) struct gfs2_alloc_parms ap = { .target = *dblocks, .aflags = flags, }; int error; - error = gfs2_quota_lock_check(ip); + error = gfs2_quota_lock_check(ip, &ap); if (error) goto out; @@ -525,7 +525,7 @@ static int link_dinode(struct gfs
[Cluster-devel] [GFS2 2/3] gfs2: allow quota_check and inplace_reserve to return available blocks
struct gfs2_alloc_parms is passed to gfs2_quota_check() and gfs2_inplace_reserve() with ap->target containing the number of blocks being requested for allocation in the current operation. We add a new field to struct gfs2_alloc_parms called 'allowed'. gfs2_quota_check() and gfs2_inplace_reserve() return the max blocks allowed by quota and the max blocks allowed by the chosen rgrp respectively in 'allowed'. A new field 'min_target', when non-zero, tells gfs2_quota_check() and gfs2_inplace_reserve() to not return -EDQUOT/-ENOSPC when there are at least 'min_target' blocks allowable/available. The assumption is that the caller is ok with just 'min_target' blocks and will likely proceed with allocating them. Signed-off-by: Abhi Das --- fs/gfs2/incore.h | 2 ++ fs/gfs2/quota.c | 52 +++- fs/gfs2/rgrp.c | 20 +++- fs/gfs2/rgrp.h | 3 ++- 4 files changed, 58 insertions(+), 19 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 3a4ea50..58b75ab 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -302,7 +302,9 @@ struct gfs2_blkreserv { */ struct gfs2_alloc_parms { u64 target; + u32 min_target; u32 aflags; + u64 allowed; }; enum { diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 964a769..5561468 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1094,15 +1094,33 @@ static int print_message(struct gfs2_quota_data *qd, char *type) return 0; } +/** + * gfs2_quota_check - check if allocating new blocks will exceed quota + * @ip: The inode for which this check is being performed + * @uid: The uid to check against + * @gid: The gid to check against + * @ap: The allocation parameters. ap->target contains the requested + * blocks. ap->min_target, if set, contains the minimum blks + * requested. + * + * Returns: 0 on success. + * min_req = ap->min_target ? ap->min_target : ap->target; + * quota must allow atleast min_req blks for success and + * ap->allowed is set to the number of blocks allowed + * + * -EDQUOT otherwise, quota violation. 
ap->allowed is set to number + * of blocks available. + */ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, struct gfs2_alloc_parms *ap) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qd; - s64 value; + s64 value, warn, limit; unsigned int x; int error = 0; + ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */ if (!test_bit(GIF_QD_LOCKED, &ip->i_flags)) return 0; @@ -1116,29 +1134,37 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, qid_eq(qd->qd_id, make_kqid_gid(gid continue; + warn = (s64)be64_to_cpu(qd->qd_qb.qb_warn); + limit = (s64)be64_to_cpu(qd->qd_qb.qb_limit); value = (s64)be64_to_cpu(qd->qd_qb.qb_value); spin_lock(&qd_lock); - value += qd->qd_change + ap->target; + value += qd->qd_change; spin_unlock(&qd_lock); - if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) { - print_message(qd, "exceeded"); - quota_send_warning(qd->qd_id, - sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN); - error = -EDQUOT; - break; - } else if (be64_to_cpu(qd->qd_qb.qb_warn) && - (s64)be64_to_cpu(qd->qd_qb.qb_warn) < value && + if (limit > 0 && (limit - value) < ap->allowed) + ap->allowed = limit - value; + /* If we can't meet the target */ + if (limit && limit < (value + (s64)ap->target)) { + /* If no min_target specified or we don't meet +* min_target, return -EDQUOT */ + if (!ap->min_target || ap->min_target > ap->allowed) { + print_message(qd, "exceeded"); + quota_send_warning(qd->qd_id, + sdp->sd_vfs->s_dev, + QUOTA_NL_BHARDWARN); + error = -EDQUOT; + break; + } + } else if (warn && warn < value && time_after_eq(jiffies, qd->qd_l
[Cluster-devel] [GFS2 0/3] fallocate and quota fixes
This is a revised version of the patches required to properly fix the fallocate quota issue described in bz1174295. patch1: This patch supplies gfs2_quota_check() with the number of blocks the caller intends to allocate in the current operation, resulting in a more accurate quota check. patch2: gfs2_quota_check() and gfs2_inplace_reserve() return the number of blocks available subject to quota limits and rgrp size respectively. These functions don't return an error if at least ap.min_target (if set) blocks are allocatable. patch3: The fallocate() function uses the features of patch2 to determine how many total blocks are available for allocation and uses them right away, instead of guessing/retrying inefficiently. The behavior of quota enforcement is altered by this patchset. i. Quotas are reported as exceeded (a warning message is also issued to syslog) before the actual block usage exceeds the imposed limit. In fact, the actual usage can never exceed the limit. Whenever it is determined that the completion of an operation will cause a quota to exceed its limit, such an operation is aborted with -EDQUOT and a 'quota exceeded' message is dispatched. Note: When min_target is set and allowed blocks are >= min_target, we don't issue an error. It is assumed that the caller will only allocate the allowed blocks. ii. The gfs2_write_calc_reserv()/calc_max_reserv() functions are used to map between available blocks and the data bytes that can be written using them. Typically, for large files, some blocks are used up for metadata and only the remaining blocks can be used for data. Example: To write only a handful of bytes that would easily fit in one block, we might have to allocate an extra bunch of intermediate metadata blocks. If we had only 1 block left in our allotted quota, this operation would likely fail. The functions mentioned in ii. are not very efficient. 
They always compute the worst-case number of extra blocks required and it is often the case that not all those extra blocks are used. We need to find a better algorithm to get a tighter estimate on the blocks needed for a given number of bytes. I've run some basic tests and things seem to be holding up. The failing case in bz1174295 is fixed using this patchset. I'll do a test build and pass it on to Nate to test with. Abhi Das (3): gfs2: perform quota checks against allocation parameters gfs2: allow quota_check and inplace_reserve to return available blocks gfs2: allow fallocate to max out quotas/fs efficiently fs/gfs2/aops.c | 6 ++--- fs/gfs2/bmap.c | 2 +- fs/gfs2/file.c | 81 fs/gfs2/incore.h | 4 ++- fs/gfs2/inode.c | 18 +++-- fs/gfs2/quota.c | 54 +++-- fs/gfs2/quota.h | 8 +++--- fs/gfs2/rgrp.c | 20 ++ fs/gfs2/rgrp.h | 3 ++- fs/gfs2/xattr.c | 2 +- 10 files changed, 133 insertions(+), 65 deletions(-) -- 1.8.1.4
[Cluster-devel] [GFS2 1/3] gfs2: perform quota checks against allocation parameters
Use struct gfs2_alloc_parms as an argument to gfs2_quota_check() and gfs2_quota_lock_check() to check for quota violations while accounting for the new blocks requested by the current operation in ap->target. Previously, the number of new blocks requested during an operation were not accounted for during quota_check and would allow these operations to exceed quota. This was not very apparent since most operations allocated only 1 block at a time and quotas would get violated in the next operation. i.e. quota excess would only be by 1 block or so. With fallocate, (where we allocate a bunch of blocks at once) the quota excess is non-trivial and is addressed by this patch. Resolves: rhbz#1174295 Signed-off-by: Abhi Das --- fs/gfs2/aops.c | 6 +++--- fs/gfs2/bmap.c | 2 +- fs/gfs2/file.c | 15 --- fs/gfs2/incore.h | 2 +- fs/gfs2/inode.c | 18 ++ fs/gfs2/quota.c | 6 +++--- fs/gfs2/quota.h | 8 +--- fs/gfs2/xattr.c | 2 +- 8 files changed, 32 insertions(+), 27 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 4ad4f94..7bc5c82 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -671,12 +671,12 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, if (alloc_required) { struct gfs2_alloc_parms ap = { .aflags = 0, }; - error = gfs2_quota_lock_check(ip); + requested = data_blocks + ind_blocks; + ap.target = requested; + error = gfs2_quota_lock_check(ip, &ap); if (error) goto out_unlock; - requested = data_blocks + ind_blocks; - ap.target = requested; error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_qunlock; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index f0b945a..61296ec 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1224,7 +1224,7 @@ static int do_grow(struct inode *inode, u64 size) if (gfs2_is_stuffed(ip) && (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode { - error = gfs2_quota_lock_check(ip); + error = gfs2_quota_lock_check(ip, &ap); if (error) return error; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 
3e32bb8..96b6526 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -429,11 +429,11 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret) goto out_unlock; - ret = gfs2_quota_lock_check(ip); - if (ret) - goto out_unlock; gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); ap.target = data_blocks + ind_blocks; + ret = gfs2_quota_lock_check(ip, &ap); + if (ret) + goto out_unlock; ret = gfs2_inplace_reserve(ip, &ap); if (ret) goto out_quota_unlock; @@ -827,13 +827,13 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t offset += bytes; continue; } - error = gfs2_quota_lock_check(ip); - if (error) - return error; retry: gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); - ap.target = data_blocks + ind_blocks; + + error = gfs2_quota_lock_check(ip, &ap); + if (error) + return error; error = gfs2_inplace_reserve(ip, &ap); if (error) { if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { @@ -841,6 +841,7 @@ retry: bytes &= bsize_mask; if (bytes == 0) bytes = sdp->sd_sb.sb_bsize; + gfs2_quota_unlock(ip); goto retry; } goto out_qunlock; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 7a2dbbc..3a4ea50 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -301,7 +301,7 @@ struct gfs2_blkreserv { * to the allocation code. */ struct gfs2_alloc_parms { - u32 target; + u64 target; u32 aflags; }; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 73c72253..08bc84d 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -382,7 +382,7 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks) struct gfs2_alloc_parms ap = { .target = *dblocks, .aflags = flags, }; int error; - error = gfs2_quota_lock_check(ip); + error = gfs2_quota_lock_check(ip, &ap); if (error) goto out; @@ -525,7 +525,7 @@ static int link_dinode(struct gfs
[Cluster-devel] [GFS2 3/3] gfs2: allow fallocate to max out quotas/fs efficiently
We can quickly get an estimate of how many blocks are available for allocation restricted by quota and fs size respectively, using the ap->allowed field in the gfs2_alloc_parms structure. gfs2_quota_check() and gfs2_inplace_reserve() provide these values. Once we have the total number of blocks available to us, we can compute how many bytes of data can be written using those blocks instead of guessing inefficiently. Signed-off-by: Abhi Das --- fs/gfs2/file.c | 64 +- 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 96b6526..5ce918a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -765,22 +765,30 @@ out: brelse(dibh); return error; } - -static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, - unsigned int *data_blocks, unsigned int *ind_blocks) +/** + * calc_max_reserv() - Reverse of write_calc_reserv. Given a number of + * blocks, determine how many bytes can be written. + * @ip: The inode in question. + * @len: Max cap of bytes. What we return in *len must be <= this. + * @data_blocks: Compute and return the number of data blocks needed + * @ind_blocks: Compute and return the number of indirect blocks needed + * @max_blocks: The total blocks available to work with. + * + * Returns: void, but @len, @data_blocks and @ind_blocks are filled in. 
+ */ +static void calc_max_reserv(struct gfs2_inode *ip, loff_t *len, + unsigned int *data_blocks, unsigned int *ind_blocks, + unsigned int max_blocks) { + loff_t max = *len; const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int max_blocks = ip->i_rgd->rd_free_clone; unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); for (tmp = max_data; tmp > sdp->sd_diptrs;) { tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); max_data -= tmp; } - /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, - so it might end up with fewer data blocks */ - if (max_data <= *data_blocks) - return; + *data_blocks = max_data; *ind_blocks = max_blocks - max_data; *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; @@ -797,7 +805,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_alloc_parms ap = { .aflags = 0, }; unsigned int data_blocks = 0, ind_blocks = 0, rblocks; - loff_t bytes, max_bytes; + loff_t bytes, max_bytes, max_blks = UINT_MAX; int error; const loff_t pos = offset; const loff_t count = len; @@ -827,28 +835,46 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t offset += bytes; continue; } -retry: + + /* We need to determine how many bytes we can actually +* fallocate without exceeding quota or going over the +* end of the fs. We start off optimistically by assuming +* we can write max_bytes */ + max_bytes = (len > max_chunk_size) ? max_chunk_size : len; + + /* Since max_bytes is most likely a theoretical max, we +* calculate a more realistic 'bytes' to serve as a good +* starting point for the number of bytes we may be able +* to write */ gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); + ap.target = data_blocks + ind_blocks; + ap.aflags |= GFS2_AF_NO_ERR; error = gfs2_quota_lock_check(ip, &ap); if (error) return error; + /* ap.allowed tells us how many blocks quota will allow +* us to write. 
Check if this reduces max_blks */ + if (ap.allowed && ap.allowed < max_blks) + max_blks = ap.allowed; + retry: error = gfs2_inplace_reserve(ip, &ap); if (error) { - if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { - bytes >>= 1; - bytes &= bsize_mask; - if (bytes == 0) - bytes = sdp->sd_sb.sb_bsize; - gfs2_quota_unlock(ip); + if (error == -ENOSPC && ap.allowed) { + ap.target = ap.allowed; goto retry; } goto
[Cluster-devel] [GFS2 0/3] fallocate and quota fixes
This is a revised version of the patches required to properly fix the fallocate quota issue described in bz1174295. patch1: This patch supplies gfs2_quota_check() with the number of blocks the caller intends to allocate in the current operation, resulting in a more accurate quota check. patch2: gfs2_quota_check() and gfs2_inplace_reserve() return the number of blocks available subject to quota limits and rgrp size respectively. The difference from previous versions of this patch is that we return the available blocks even on success. patch3: The fallocate() function uses the features of patch2 to determine how many total blocks are available for allocation and uses them right away, instead of guessing inefficiently. The behavior of quota enforcement is altered by this patchset. i. Quotas are reported as exceeded (a warning message is also issued to syslog) before the actual block usage exceeds the imposed limit. In fact, the actual usage can never exceed the limit. Whenever it is determined that the completion of an operation will cause a quota to exceed its limit, such an operation is aborted with -EDQUOT and a 'quota exceeded' message is dispatched. ii. The gfs2_write_calc_reserv()/calc_max_reserv() functions are used to map between available blocks and the data bytes that can be written using them. Typically, for large files, some blocks are used up for metadata and only the remaining blocks can be used for data. Example: To write only a handful of bytes that would easily fit in one block, we might have to allocate an extra bunch of intermediate metadata blocks. If we had only 1 block left in our allotted quota, this operation would likely fail. The functions mentioned in ii. are not very efficient. They always compute the worst-case number of extra blocks required and it is often the case that not all those extra blocks are used. We need to find a better algorithm to get a tighter estimate on the blocks needed for a given number of bytes. 
I've run some basic tests and things seem to be holding up. The failing case in bz1174295 is fixed using this patchset. I'll do a test build and pass it on to Nate to test with. Abhi Das (3): gfs2: perform quota checks against allocation parameters gfs2: allow quota_check and inplace_reserve to return available blocks gfs2: allow fallocate to max out quotas/fs efficiently fs/gfs2/aops.c | 6 ++--- fs/gfs2/bmap.c | 2 +- fs/gfs2/file.c | 75 ++-- fs/gfs2/incore.h | 3 ++- fs/gfs2/inode.c | 18 -- fs/gfs2/quota.c | 40 ++ fs/gfs2/quota.h | 8 +++--- fs/gfs2/rgrp.c | 12 +++-- fs/gfs2/rgrp.h | 6 +++-- fs/gfs2/xattr.c | 2 +- 10 files changed, 117 insertions(+), 55 deletions(-) -- 1.8.1.4
[Cluster-devel] [GFS2 2/3] gfs2: allow quota_check and inplace_reserve to return available blocks
struct gfs2_alloc_parms is passed to gfs2_quota_check() and gfs2_inplace_reserve() with ap->target containing the number of blocks being requested for allocation in the current operation. We add a new field to struct gfs2_alloc_parms called 'allowed'. gfs2_quota_check() and gfs2_inplace_reserve() return the max blocks allowed by quota and rgrps respectively in 'allowed'. A new flag GFS2_AF_NO_ERR, when set, tells gfs2_quota_check() to not return -EDQUOT when there are only 'x' blocks available to allocate. Where, 0 < x < ap->target. The assumption is that the caller is ok with just 'x' blocks and will likely proceed with allocating them. When there is no quota violation, 'allowed' is set to the maximum number of blocks quotas will allow. If gfs2_inplace_reserve() is successful in finding an rgrp with more than the requested number of free blocks, 'allowed' is set to the total number of free blocks in that rgrp. If not, -ENOSPC is returned and 'allowed' is set to the maximum number of free blocks that were found in any rgrp. Signed-off-by: Abhi Das --- fs/gfs2/incore.h | 1 + fs/gfs2/quota.c | 34 +++--- fs/gfs2/rgrp.c | 12 ++-- fs/gfs2/rgrp.h | 6 -- 4 files changed, 42 insertions(+), 11 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 3a4ea50..bff2d7f 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -303,6 +303,7 @@ struct gfs2_blkreserv { struct gfs2_alloc_parms { u64 target; u32 aflags; + u64 allowed; }; enum { diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 964a769..91e77ae 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1094,15 +1094,32 @@ static int print_message(struct gfs2_quota_data *qd, char *type) return 0; } +/** + * gfs2_quota_check - check if allocating new blocks will exceed quota + * @ip: The inode for which this check is being performed + * @uid: The uid to check against + * @gid: The gid to check against + * @ap: The allocation parameters. ap->target contains the requested + * blocks. 
+ * + * Returns: 0 on success, ap->allowed is set to the number of blocks + * availble + * -EDQUOT on quota violation, ap->allowed is set to number of + * blocks available. Note: If GFS2_AF_NO_ERR is set, + * -EDQUOT is not returned if atleast 1 block can be + * allocated w/o exceeding quota, regardless of what + * was requested in ap->target. + */ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, struct gfs2_alloc_parms *ap) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qd; - s64 value; + s64 value, warn, limit, avail = 0; unsigned int x; int error = 0; + ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */ if (!test_bit(GIF_QD_LOCKED, &ip->i_flags)) return 0; @@ -1116,28 +1133,31 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, qid_eq(qd->qd_id, make_kqid_gid(gid continue; + warn = (s64)be64_to_cpu(qd->qd_qb.qb_warn); + limit = (s64)be64_to_cpu(qd->qd_qb.qb_limit); value = (s64)be64_to_cpu(qd->qd_qb.qb_value); spin_lock(&qd_lock); value += qd->qd_change + ap->target; spin_unlock(&qd_lock); - if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) { + avail = limit - value - (s64)ap->target; + if (limit && limit < value) { print_message(qd, "exceeded"); quota_send_warning(qd->qd_id, sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN); - error = -EDQUOT; + if (!(ap->aflags & GFS2_AF_NO_ERR) || avail <= 0) + error = -EDQUOT; break; - } else if (be64_to_cpu(qd->qd_qb.qb_warn) && - (s64)be64_to_cpu(qd->qd_qb.qb_warn) < value && + } else if (warn && warn < value && time_after_eq(jiffies, qd->qd_last_warn + -gfs2_tune_get(sdp, - gt_quota_warn_period) * HZ)) { +gfs2_tune_get(sdp, gt_quota_warn_period) * HZ)) { quota_send_warning(qd->qd_id, sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN);
[Cluster-devel] [GFS2 PATCH 2/3] gfs2: allow quota_check and inplace_reserve to return available blocks
struct gfs2_alloc_parms is passed to gfs2_quota_check() and gfs2_inplace_reserve() with ap->target containing the number of blocks being requested for allocation in the current operation. We add a new field to struct gfs2_alloc_parms called 'allowed'. gfs2_quota_check() and gfs2_inplace_reserve() return -EDQUOT and -ENOSPC respectively when they can't allow ap->target blocks to be allocated due to quota violations or lack of space on the fs. In such cases, we make these functions return the number of blocks available in ap->allowed. A subsequent call with this value set as ap->target is less likely to fail. Signed-off-by: Abhi Das --- fs/gfs2/incore.h | 1 + fs/gfs2/quota.c | 15 +++ fs/gfs2/rgrp.c | 8 +++- fs/gfs2/rgrp.h | 2 +- 4 files changed, 24 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 3a4ea50..bff2d7f 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -303,6 +303,7 @@ struct gfs2_blkreserv { struct gfs2_alloc_parms { u64 target; u32 aflags; + u64 allowed; }; enum { diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 3a0b780..c0d36e9 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1093,6 +1093,19 @@ static int print_message(struct gfs2_quota_data *qd, char *type) return 0; } +/** + * gfs2_quota_check - check if allocating new blocks will exceed quota + * @ip: The inode for which this check is being performed + * @uid: The uid to check against + * @gid: The gid to check against + * @ap: The allocation parameters. ap->target contains the requested + * blocks. + * + * Returns: 0 on success + * -EDQUOT on quota violation, ap->allowed is set to number of + * blocks available + * error code on any other error. 
+ */ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, struct gfs2_alloc_parms *ap) { @@ -1125,6 +1138,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, quota_send_warning(qd->qd_id, sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN); error = -EDQUOT; + ap->allowed = (s64)be64_to_cpu(qd->qd_qb.qb_limit) - + (value - ap->target); break; } else if (be64_to_cpu(qd->qd_qb.qb_warn) && (s64)be64_to_cpu(qd->qd_qb.qb_warn) < value && diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 9150207..e763f31 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1947,9 +1947,11 @@ static inline int fast_to_acquire(struct gfs2_rgrpd *rgd) * @ap: the allocation parameters * * Returns: errno + * if error is -ENOSPC, ap->allowed is set to the maximum number + * of blocks available for allocation. */ -int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap) +int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *begin = NULL; @@ -1958,6 +1960,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a u64 last_unlinked = NO_BLOCK; int loops = 0; u32 skip = 0; + u32 avail = 0; if (sdp->sd_args.ar_rgrplvb) flags |= GL_SKIP; @@ -2030,6 +2033,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a if (rs->rs_rbm.rgd->rd_free_clone >= ap->target) { ip->i_rgd = rs->rs_rbm.rgd; return 0; + } else if (rs->rs_rbm.rgd->rd_free_clone > avail) { + avail = rs->rs_rbm.rgd->rd_free_clone; } check_rgrp: @@ -2068,6 +2073,7 @@ next_rgrp: gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); } + ap->allowed = avail; return -ENOSPC; } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index b104f4a..d38e0b1 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -41,7 +41,7 @@ extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh); extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); #define GFS2_AF_ORLOV 1 -extern int 
gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap); +extern int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap); extern void gfs2_inplace_release(struct gfs2_inode *ip); extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, -- 1.8.1.4
[Cluster-devel] [GFS2 PATCH 0/3] fallocate quota fixes
This is a revised set of patches for bz1174295 to address fallocate/quota issues. These patches are only compile tested so far. patch 1 - This is the patch that actually addresses the quota-exceeded issue. Quota checks were not being performed against the blocks about to be allocated. patch 2 - Modify gfs2_inplace_reserve() and gfs2_quota_check() to return the available number of blocks in case of failure. patch 3 - Allows fallocate to take advantage of patch 2 to efficiently max out quotas or fill up the fs instead of returning -EDQUOT/-ENOSPC and leaving some available blocks unallocated. Abhi Das (3): gfs2: perform quota checks against allocation parameters gfs2: allow quota_check and inplace_reserve to return available blocks gfs2: allow fallocate to max out quotas/fs efficiently fs/gfs2/aops.c | 6 +++--- fs/gfs2/bmap.c | 2 +- fs/gfs2/file.c | 27 --- fs/gfs2/incore.h | 3 ++- fs/gfs2/inode.c | 18 ++ fs/gfs2/quota.c | 21 ++--- fs/gfs2/quota.h | 8 +--- fs/gfs2/rgrp.c | 8 +++- fs/gfs2/rgrp.h | 2 +- fs/gfs2/xattr.c | 2 +- 10 files changed, 64 insertions(+), 33 deletions(-) -- 1.8.1.4
[Cluster-devel] [GFS2 PATCH 3/3] gfs2: allow fallocate to max out quotas/fs efficiently
We can quickly get an estimate of how many more blocks are available for allocation restricted by quota and fs size respectively using the ap->allowed field in the gfs2_alloc_parms structure. gfs2_quota_check() and gfs2_inplace_reserve() provide these values. By re-trying to allocate what's available instead of guessing, we can max out quotas or the filesystem efficiently. Bear in mind that this applies only when the requested fallocate operation would otherwise error out with -EDQUOT or -ENOSPC without utilizing all the blocks that might still be available. Signed-off-by: Abhi Das --- fs/gfs2/file.c | 18 +++--- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 2ea420a..57129fa 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -829,20 +829,24 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t continue; } ap.target = bytes >> sdp->sd_sb.sb_bsize_shift; + quota_retry: error = gfs2_quota_lock_check(ip, &ap); - if (error) + if (error) { + if (error == -EDQUOT && ap.allowed) { + bytes = ap.allowed << sdp->sd_sb.sb_bsize_shift; + ap.target = ap.allowed; + goto quota_retry; + } return error; -retry: + } + retry: gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); ap.target = data_blocks + ind_blocks; error = gfs2_inplace_reserve(ip, &ap); if (error) { - if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { - bytes >>= 1; - bytes &= bsize_mask; - if (bytes == 0) - bytes = sdp->sd_sb.sb_bsize; + if (error == -ENOSPC && ap.allowed) { + bytes = ap.allowed << sdp->sd_sb.sb_bsize_shift; goto retry; } goto out_qunlock; -- 1.8.1.4
[Cluster-devel] [GFS2 PATCH 1/3] gfs2: perform quota checks against allocation parameters
Use struct gfs2_alloc_parms as an argument to gfs2_quota_check() and gfs2_quota_lock_check() to check for quota violations while accounting for the new blocks requested by the current operation in ap->target. Previously, the number of new blocks requested during an operation were not accounted for during quota_check and would allow these operations to exceed quota. This was not very apparent since most operations allocated only 1 block at a time and quotas would get violated in the next operation. i.e. quota excess would only be by 1 block or so. With fallocate, (where we allocate a bunch of blocks at once) the quota excess is non-trivial and is addressed by this patch. Resolves: rhbz#1174295 Signed-off-by: Abhi Das --- fs/gfs2/aops.c | 6 +++--- fs/gfs2/bmap.c | 2 +- fs/gfs2/file.c | 9 + fs/gfs2/incore.h | 2 +- fs/gfs2/inode.c | 18 ++ fs/gfs2/quota.c | 6 +++--- fs/gfs2/quota.h | 8 +--- fs/gfs2/xattr.c | 2 +- 8 files changed, 29 insertions(+), 24 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 805b37f..0261126 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -671,12 +671,12 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, if (alloc_required) { struct gfs2_alloc_parms ap = { .aflags = 0, }; - error = gfs2_quota_lock_check(ip); + requested = data_blocks + ind_blocks; + ap.target = requested; + error = gfs2_quota_lock_check(ip, &ap); if (error) goto out_unlock; - requested = data_blocks + ind_blocks; - ap.target = requested; error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_qunlock; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index f0b945a..61296ec 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1224,7 +1224,7 @@ static int do_grow(struct inode *inode, u64 size) if (gfs2_is_stuffed(ip) && (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode { - error = gfs2_quota_lock_check(ip); + error = gfs2_quota_lock_check(ip, &ap); if (error) return error; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 
6e600ab..2ea420a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -429,11 +429,11 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret) goto out_unlock; - ret = gfs2_quota_lock_check(ip); - if (ret) - goto out_unlock; gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); ap.target = data_blocks + ind_blocks; + ret = gfs2_quota_lock_check(ip, &ap); + if (ret) + goto out_unlock; ret = gfs2_inplace_reserve(ip, &ap); if (ret) goto out_quota_unlock; @@ -828,7 +828,8 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t offset += bytes; continue; } - error = gfs2_quota_lock_check(ip); + ap.target = bytes >> sdp->sd_sb.sb_bsize_shift; + error = gfs2_quota_lock_check(ip, &ap); if (error) return error; retry: diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 7a2dbbc..3a4ea50 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -301,7 +301,7 @@ struct gfs2_blkreserv { * to the allocation code. */ struct gfs2_alloc_parms { - u32 target; + u64 target; u32 aflags; }; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 73c72253..08bc84d 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -382,7 +382,7 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks) struct gfs2_alloc_parms ap = { .target = *dblocks, .aflags = flags, }; int error; - error = gfs2_quota_lock_check(ip); + error = gfs2_quota_lock_check(ip, &ap); if (error) goto out; @@ -525,7 +525,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, int error; if (da->nr_blocks) { - error = gfs2_quota_lock_check(dip); + error = gfs2_quota_lock_check(dip, &ap); if (error) goto fail_quota_locks; @@ -953,7 +953,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (da.nr_blocks) { struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; - error = gfs2_quota_lock_check(dip); + error = gfs2_quota_lock_check(dip, &ap); if (error) goto out_gunlock; @@ -1470,7 +1470,7 
@@ static int gfs2_renam
[Cluster-devel] [GFS2 PATCH 2/4] gfs2: add new quota check functions
gfs2_quota_chk_ret_allow and gfs2_quota_lck_chk_ret_allow are variants of gfs2_quota_check and gfs2_quota_lock_check respectively. If an operation will not succeed due to a quota violation, these functions will return the number of blocks that quota will actually allow without failing in an extra parameter 'allow' If acceptable to the caller logic, any of the quota_check functions may be called again with the 'allow'ed blocks to try and avoid a quota violation. Signed-off-by: Abhi Das --- fs/gfs2/quota.c | 24 fs/gfs2/quota.h | 20 +--- 2 files changed, 37 insertions(+), 7 deletions(-) diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index e2f86ec..98cdf97 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1092,8 +1092,20 @@ static int print_message(struct gfs2_quota_data *qd, char *type) return 0; } - -int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, s64 exp_change) +/** + * gfs2_quota_chk_ret_allow - Checks if adding the specified number of + * blocks will exceed usr/grp quotas + * @ip: The inode for which this check is being performed + * @uid: The uid to check against + * @gid: The gid to check against + * @exp_change: The expected change in blocks + * @allow: If non-NULL, we should return the number of blocks + * quota will allow if exp_change exceeds limits + * + * Returns: 0 on success, error code otherwise. 
+ */ +int gfs2_quota_chk_ret_allow(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, +s64 exp_change, u32 *allow) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qd; @@ -1119,12 +1131,16 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, s64 exp_chan value += qd->qd_change + exp_change; spin_unlock(&qd_lock); - if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) { + if (be64_to_cpu(qd->qd_qb.qb_limit) && + (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) { print_message(qd, "exceeded"); quota_send_warning(qd->qd_id, sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN); - error = -EDQUOT; + if (allow) { + *allow = (s64)be64_to_cpu(qd->qd_qb.qb_limit) - + (value - exp_change); + } break; } else if (be64_to_cpu(qd->qd_qb.qb_warn) && (s64)be64_to_cpu(qd->qd_qb.qb_warn) < value && diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 1457c66..49d1fd9 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -24,7 +24,14 @@ extern void gfs2_quota_unhold(struct gfs2_inode *ip); extern int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); extern void gfs2_quota_unlock(struct gfs2_inode *ip); -extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, s64 exp_change); +extern int gfs2_quota_chk_ret_allow(struct gfs2_inode *ip, kuid_t uid, + kgid_t gid, s64 exp_change, u32 *allow); +static inline int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, + s64 exp_change) +{ + return gfs2_quota_chk_ret_allow(ip, uid, gid, exp_change, NULL); +} + extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, kuid_t uid, kgid_t gid); @@ -37,7 +44,8 @@ extern int gfs2_quotad(void *data); extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp); -static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, s64 exp_change) +static inline int gfs2_quota_lck_chk_ret_allow(struct gfs2_inode *ip, + s64 exp_change, u32 *allow) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); int ret; @@ -48,12 
+56,18 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, s64 exp_change) return ret; if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) return 0; - ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid, exp_change); + ret = gfs2_quota_chk_ret_allow(ip, ip->i_inode.i_uid, ip->i_inode.i_gid, + exp_change, allow); if (ret) gfs2_quota_unlock(ip); return ret; } +static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, s64 exp_change) +{ + return gfs2_quota_lck_chk_ret_allow(ip, exp_change, NULL); +} + extern const struct quotactl_ops gfs2_quotactl_ops; extern struct shrinker gfs2_qd_shrinker; extern struct list_lru gfs2_qd_lru; -- 1.8.1.4
[Cluster-devel] [GFS2 PATCH 4/4] gfs2: allow fallocate to max out quotas/fs efficiently
With the addition of gfs2_quota_lck_chk_ret_allow() and gfs2_inpl_rsrv_ret_max_avl(), we can quickly get an estimate of how many more blocks are available for allocation restricted by quota and fs size respectively. By trying to allocate what's available instead of guessing, we can max out quotas or the filesystem efficiently. Bear in mind that this applies only when the requested fallocate operation would otherwise error out with -EDQUOT or -ENOSPC without utilizing all the blocks that might still be available. Signed-off-by: Abhi Das --- fs/gfs2/file.c | 24 +++- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c9482ae..9d8e03a 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -805,6 +805,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; loff_t max_chunk_size = UINT_MAX & bsize_mask; + u32 allow; next = (next + 1) << sdp->sd_sb.sb_bsize_shift; @@ -828,20 +829,25 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t offset += bytes; continue; } - error = gfs2_quota_lock_check(ip, bytes >> sdp->sd_sb.sb_bsize_shift); - if (error) + quota_retry: + error = gfs2_quota_lck_chk_ret_allow(ip, +bytes >> sdp->sd_sb.sb_bsize_shift, +&allow); + if (error) { + if (error == -EDQUOT && allow) { + bytes = allow << sdp->sd_sb.sb_bsize_shift; + goto quota_retry; + } return error; -retry: + } + retry: gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); ap.target = data_blocks + ind_blocks; - error = gfs2_inplace_reserve(ip, &ap); + error = gfs2_inpl_rsrv_ret_max_avl(ip, &ap, &allow); if (error) { - if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { - bytes >>= 1; - bytes &= bsize_mask; - if (bytes == 0) - bytes = sdp->sd_sb.sb_bsize; + if (error == -ENOSPC && allow) { + bytes = allow << sdp->sd_sb.sb_bsize_shift; goto retry; } goto 
out_qunlock; -- 1.8.1.4
[Cluster-devel] [GFS2 PATCH 1/4] gfs2: check quota for blocks we're about to allocate
This patch allows gfs2_quota_check() to take an extra argument called 'exp_change'. Prior to any allocation, gfs2_quota_check() or gfs2_quota_lock_check() is called with exp_change containing the number of blocks we expect to allocate in this operation. gfs2_quota_check() will add this number to the current usage and check the sum against the quota warns and limits and fail the operation if necessary. Resolves: rhbz#1174295 Signed-off-by: Abhi Das --- fs/gfs2/aops.c | 6 +++--- fs/gfs2/bmap.c | 2 +- fs/gfs2/file.c | 8 fs/gfs2/inode.c | 14 -- fs/gfs2/quota.c | 4 ++-- fs/gfs2/quota.h | 6 +++--- fs/gfs2/xattr.c | 2 +- 7 files changed, 22 insertions(+), 20 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 805b37f..aa7700a 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -671,12 +671,12 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, if (alloc_required) { struct gfs2_alloc_parms ap = { .aflags = 0, }; - error = gfs2_quota_lock_check(ip); + requested = data_blocks + ind_blocks; + ap.target = requested; + error = gfs2_quota_lock_check(ip, ap.target); if (error) goto out_unlock; - requested = data_blocks + ind_blocks; - ap.target = requested; error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_qunlock; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index f0b945a..86cc7b2 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1224,7 +1224,7 @@ static int do_grow(struct inode *inode, u64 size) if (gfs2_is_stuffed(ip) && (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode { - error = gfs2_quota_lock_check(ip); + error = gfs2_quota_lock_check(ip, ap.target); if (error) return error; diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 6e600ab..c9482ae 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -429,11 +429,11 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret) goto out_unlock; - ret = gfs2_quota_lock_check(ip); - if (ret) - goto out_unlock; gfs2_write_calc_reserv(ip, 
PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); ap.target = data_blocks + ind_blocks; + ret = gfs2_quota_lock_check(ip, ap.target); + if (ret) + goto out_unlock; ret = gfs2_inplace_reserve(ip, &ap); if (ret) goto out_quota_unlock; @@ -828,7 +828,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t offset += bytes; continue; } - error = gfs2_quota_lock_check(ip); + error = gfs2_quota_lock_check(ip, bytes >> sdp->sd_sb.sb_bsize_shift); if (error) return error; retry: diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 73c72253..67ffa07 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -382,7 +382,7 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks) struct gfs2_alloc_parms ap = { .target = *dblocks, .aflags = flags, }; int error; - error = gfs2_quota_lock_check(ip); + error = gfs2_quota_lock_check(ip, ap.target); if (error) goto out; @@ -525,7 +525,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, int error; if (da->nr_blocks) { - error = gfs2_quota_lock_check(dip); + error = gfs2_quota_lock_check(dip, ap.target); if (error) goto fail_quota_locks; @@ -953,7 +953,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (da.nr_blocks) { struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; - error = gfs2_quota_lock_check(dip); + error = gfs2_quota_lock_check(dip, ap.target); if (error) goto out_gunlock; @@ -1470,7 +1470,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (da.nr_blocks) { struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; - error = gfs2_quota_lock_check(ndip); + error = gfs2_quota_lock_check(ndip, ap.target); if (error) goto out_gunlock; @@ -1669,6 +1669,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) kuid_t ouid, nuid; kgid_t ogid, ngid; int error; + u64 blocks; ouid = inode->i_uid; ogid = inode->i_gid; @@ -1696,9 +1697,11 @@ static int setattr_
[Cluster-devel] [GFS2 PATCH 3/4] gfs2: add new function gfs2_inpl_rsrv_ret_max_avl
This is a variant of the existing gfs2_inplace_reserve() function. If the requested number of blocks is not available to be reserved from any of the rgrps, gfs2_inplace_reserve() returns -ENOSPC. gfs2_inpl_rsrv_ret_max_avl() will also return the maximum number of blocks available in an extra parameter 'max_avail'. If acceptable to the caller logic, either of these inplace reserve functions may be called again requesting 'max_avail' blocks to avoid the -ENOSPC error. Signed-off-by: Abhi Das --- fs/gfs2/rgrp.c | 13 +++-- fs/gfs2/rgrp.h | 10 +- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 9150207..0fa9ae3 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -1942,14 +1942,17 @@ static inline int fast_to_acquire(struct gfs2_rgrpd *rgd) } /** - * gfs2_inplace_reserve - Reserve space in the filesystem + * gfs2_inpl_rsrv_ret_max_avl - Reserve space in the filesystem * @ip: the inode to reserve space for * @ap: the allocation parameters + * @max_avail: If non-NULL, return the max available extent if -ENOSPC * * Returns: errno */ -int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap) +int gfs2_inpl_rsrv_ret_max_avl(struct gfs2_inode *ip, + const struct gfs2_alloc_parms *ap, + u32 *max_avail) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *begin = NULL; @@ -1958,6 +1961,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a u64 last_unlinked = NO_BLOCK; int loops = 0; u32 skip = 0; + u32 max_avlbl = 0; if (sdp->sd_args.ar_rgrplvb) flags |= GL_SKIP; @@ -2030,6 +2034,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a if (rs->rs_rbm.rgd->rd_free_clone >= ap->target) { ip->i_rgd = rs->rs_rbm.rgd; return 0; + } else if (rs->rs_rbm.rgd->rd_free_clone > max_avlbl) { + max_avlbl = rs->rs_rbm.rgd->rd_free_clone; } check_rgrp: @@ -2068,6 +2074,9 @@ next_rgrp: gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); } + if (max_avail) +
*max_avail = max_avlbl; + return -ENOSPC; } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index b104f4a..2adffca 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -41,7 +41,15 @@ extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh); extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); #define GFS2_AF_ORLOV 1 -extern int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap); +extern int gfs2_inpl_rsrv_ret_max_avl(struct gfs2_inode *ip, + const struct gfs2_alloc_parms *ap, + u32 *max_avail); +static inline int gfs2_inplace_reserve(struct gfs2_inode *ip, + const struct gfs2_alloc_parms *ap) +{ + return gfs2_inpl_rsrv_ret_max_avl(ip, ap, NULL); +} + extern void gfs2_inplace_release(struct gfs2_inode *ip); extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, -- 1.8.1.4
[Cluster-devel] [GFS2 PATCH 0/4] fallocate quota fixes
These patches are related to bz1174295, where fallocate could exceed quota. I'm posting these for early feedback as these patches are only compile-tested so far. Patch 1 - This is the patch that actually addresses the issue of quota being exceeded. Quota checks were not being performed against the blocks about to be allocated. Patch 2 - Adds new variants of quota check functions that return the number of allowed blocks if quotas are violated by the number of requested blocks. Patch 3 - Adds a new variant of gfs2_inplace_reserve that returns the max number of available blocks if the function returns -ENOSPC due to unavailability of the requested number of blocks. Patch 4 - Allows fallocate to take advantage of patches 2 and 3 to efficiently max out quotas or fill up the fs instead of returning -EDQUOT/-ENOSPC and leaving some available blocks unallocated. Abhi Das (4): gfs2: check quota for blocks we're about to allocate gfs2: add new quota check functions gfs2: add new function gfs2_inpl_rsrv_ret_max_avl gfs2: allow fallocate to max out quotas/fs efficiently fs/gfs2/aops.c | 6 +++--- fs/gfs2/bmap.c | 2 +- fs/gfs2/file.c | 30 ++ fs/gfs2/inode.c | 14 -- fs/gfs2/quota.c | 26 +- fs/gfs2/quota.h | 20 +--- fs/gfs2/rgrp.c | 13 +++-- fs/gfs2/rgrp.h | 10 +- fs/gfs2/xattr.c | 2 +- 9 files changed, 89 insertions(+), 34 deletions(-) -- 1.8.1.4
[Cluster-devel] [GFS2] fsck.gfs2: addendum to fix broken i_goal values in inodes
This patch moves some code around and fixes some corner cases that the previous patches did not address. This patch also fixes some trailing whitespace and removes a test that is no longer valid from test/fsck.at Resolves: rhbz#1149516 Signed-off-by: Abhi Das --- gfs2/fsck/metawalk.c | 70 ++ gfs2/fsck/metawalk.h | 2 ++ gfs2/fsck/pass1.c | 54 -- gfs2/libgfs2/libgfs2.h | 1 + tests/fsck.at | 1 - 5 files changed, 78 insertions(+), 50 deletions(-) diff --git a/gfs2/fsck/metawalk.c b/gfs2/fsck/metawalk.c index 217bb07..5f432d6 100644 --- a/gfs2/fsck/metawalk.c +++ b/gfs2/fsck/metawalk.c @@ -1549,6 +1549,9 @@ int check_metatree(struct gfs2_inode *ip, struct metawalk_fxns *pass) uint64_t error_blk = 0; int hit_error_blk = 0; + if (!height && pass->check_i_goal) + pass->check_i_goal(ip, ip->i_di.di_num.no_addr, + pass->private); if (!height && !is_dir(&ip->i_di, ip->i_sbd->gfs1)) return 0; @@ -1945,6 +1948,72 @@ static int alloc_leaf(struct gfs2_inode *ip, uint64_t block, void *private) return 0; } +/** + * rgrp_contains_block - Check if the rgrp provided contains the + * given block. Taken directly from the gfs2 kernel code + * @rgd: The rgrp to search within + * @block: The block to search for + * + * Returns: 1 if present, 0 if not. + */ +static inline int rgrp_contains_block(struct rgrp_tree *rgd, uint64_t block) +{ + uint64_t first = rgd->ri.ri_data0; + uint64_t last = first + rgd->ri.ri_data; + return first <= block && block < last; +} + +/** + * check_i_goal + * @ip + * @goal_blk: What the goal block should be for this inode + * + * The goal block for a regular file is typically the last + * data block of the file. If we can't get the right value, + * the inode metadata block is the next best thing. 
+ * + * Returns: 0 if corrected, 1 if not corrected + */ +int check_i_goal(struct gfs2_inode *ip, uint64_t goal_blk, + void *private) +{ + struct gfs2_sbd *sdp = ip->i_sbd; + uint64_t i_block = ip->i_di.di_num.no_addr; + + /* Don't fix gfs1 inodes, system inodes or inodes whose goal blocks are +* set to the inode blocks themselves. */ + if (sdp->gfs1 || ip->i_di.di_flags & GFS2_DIF_SYSTEM || + ip->i_di.di_goal_meta == i_block) + return 0; + /* We default to the inode block */ + if (!goal_blk) + goal_blk = i_block; + + if (ip->i_di.di_goal_meta != goal_blk) { + /* If the existing goal block is in the same rgrp as the inode, +* we give the benefit of doubt and assume the value is correct */ + if (ip->i_rgd && + rgrp_contains_block(ip->i_rgd, ip->i_di.di_goal_meta)) + goto skip; + log_err( _("Error: inode %llu (0x%llx) has invalid " + "allocation goal block %llu (0x%llx). Should" + " be %llu (0x%llx)\n"), +(unsigned long long)i_block, (unsigned long long)i_block, +(unsigned long long)ip->i_di.di_goal_meta, +(unsigned long long)ip->i_di.di_goal_meta, +(unsigned long long)goal_blk, (unsigned long long)goal_blk); + if (query( _("Fix the invalid goal block? 
(y/n) "))) { + ip->i_di.di_goal_meta = ip->i_di.di_goal_data = goal_blk; + bmodified(ip->i_bh); + } else { + log_err(_("Invalid goal block not fixed.\n")); + return 1; + } + } +skip: + return 0; +} + struct metawalk_fxns alloc_fxns = { .private = NULL, .check_leaf = alloc_leaf, @@ -1955,6 +2024,7 @@ struct metawalk_fxns alloc_fxns = { .check_dentry = NULL, .check_eattr_entry = NULL, .check_eattr_extentry = NULL, + .check_i_goal = check_i_goal, .finish_eattr_indir = NULL, }; diff --git a/gfs2/fsck/metawalk.h b/gfs2/fsck/metawalk.h index 0d9de3f..aae9121 100644 --- a/gfs2/fsck/metawalk.h +++ b/gfs2/fsck/metawalk.h @@ -51,6 +51,8 @@ extern int _fsck_blockmap_set(struct gfs2_inode *ip, uint64_t bblock, extern int check_n_fix_bitmap(struct gfs2_sbd *sdp, uint64_t blk, int error_on_dinode, enum gfs2_mark_block new_blockmap_state); +extern int check_i_goal(struct gfs2_inode *ip, uint64_t goal_blk, + void *private); extern void reprocess_inode(struct gfs2_inode *ip, const char *desc); extern struct duptree *dupfind(uint64_t bloc
[Cluster-devel] [GFS2 PATCH] gfs2: fix bad inode i_goal values during block allocation
This patch checks whether i_goal is either zero or does not lie within any rgrp (i.e., gfs2_blk2rgrpd() returns NULL). If so, it assigns the ip->i_no_addr block as the i_goal. There are two scenarios where a bad i_goal can result in a -EBADSLT error. 1. Attempting to allocate to an existing inode: Control reaches gfs2_inplace_reserve() and ip->i_goal is bad. We need to fix i_goal here. 2. A new inode is created in a directory whose i_goal is invalid: In this case, the parent dir's i_goal is copied onto the new inode. Since the new inode is not yet created, the ip->i_no_addr field is invalid, so the fix in gfs2_inplace_reserve() as per 1) won't work in this scenario. We need to catch and fix it sooner in the parent dir itself (gfs2_create_inode()), before it is copied to the new inode. Signed-off-by: Abhi Das --- fs/gfs2/inode.c | 1 + fs/gfs2/rgrp.c | 8 fs/gfs2/rgrp.h | 1 + 3 files changed, 10 insertions(+) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index fc8ac2e..9516f5c 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -672,6 +672,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; gfs2_set_inode_blocks(inode, 1); munge_mode_uid_gid(dip, inode); + check_and_update_goal(dip); ip->i_goal = dip->i_goal; ip->i_diskflags = 0; ip->i_eattr = 0; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index f4cb9c0..55ef72d 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -577,6 +577,13 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd) return rgd; } +void check_and_update_goal(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL) + ip->i_goal = ip->i_no_addr; +} + void gfs2_free_clones(struct gfs2_rgrpd *rgd) { int x; @@ -1910,6 +1917,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a } else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
rs->rs_rbm.rgd = begin = ip->i_rgd; } else { + check_and_update_goal(ip); rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); } if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV)) diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 463ab2e..5d8f085 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -80,4 +80,5 @@ static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs) return rs && !RB_EMPTY_NODE(&rs->rs_node); } +extern void check_and_update_goal(struct gfs2_inode *ip); #endif /* __RGRP_DOT_H__ */ -- 1.8.1.4
[Cluster-devel] [GFS2 PATCH] GFS2: check and correct zero i_goal
A GFS1->GFS2 converted filesystem can have the ip->i_goal field set to zero for inodes. This incorrect value results in -EBADSLT when the user attempts to allocate blocks to such inodes. This patch assigns the goal block to be the block address of the inode itself, which serves as a reasonable starting point for the allocation logic to find the next available block. Resolves: rhbz#1130684 Signed-off-by: Abhi Das --- fs/gfs2/glops.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 2ffc67d..799427b 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -349,6 +349,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); ip->i_goal = be64_to_cpu(str->di_goal_meta); + if (!ip->i_goal) /* From a previous gfs2_convert, perhaps */ + ip->i_goal = ip->i_no_addr; + ip->i_generation = be64_to_cpu(str->di_generation); ip->i_diskflags = be32_to_cpu(str->di_flags); -- 1.8.1.4
[Cluster-devel] [RFC PATCH 3/5] gfs2: Add a dynamic buffer backed by a vector of pages
This patch adds a new buffer called 'vbuf' that is backed by a vector of pages. It is dynamic and can be expanded as needed with low overhead. Signed-off-by: Abhi Das --- fs/gfs2/util.c | 299 + fs/gfs2/util.h | 43 + 2 files changed, 342 insertions(+) diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 86d2035..7345489 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -263,3 +263,302 @@ int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, return rv; } +/* + * Fast vector-of-pages backed buffer + */ + +static int vp_alloc_pages(struct vp_ctx *vpx, int start, int end) +{ + int i; + + for (i = start; i < end; i++) { + vpx->vp_pages[i] = alloc_page(GFP_KERNEL | GFP_NOFS); + if (vpx->vp_pages[i] == NULL) + goto free; + } + return 0; +free: + for (i = start; i < end; i++) + if (vpx->vp_pages[i]) { + __free_page(vpx->vp_pages[i]); + vpx->vp_pages[i] = NULL; + } + return -ENOMEM; +} + +static void vp_free_pages(struct vp_ctx *vpx) +{ + int i; + + for (i = 0; i < vpx->vp_size; i++) + if (vpx->vp_pages[i]) { + __free_page(vpx->vp_pages[i]); + vpx->vp_pages[i] = NULL; + } +} + +static int vp_extend(struct vp_ctx *vpx, int size) +{ + struct gfs2_sbd *sdp = vpx->vp_sdp; + + /* first make room for more pointers */ + if (size <= 0) + return -EINVAL; + + vpx->vp_pages = krealloc(vpx->vp_pages, +sizeof(struct page *) * (vpx->vp_size + size), +GFP_KERNEL); + if (vpx->vp_pages == NULL) + goto out; + + /* Zero out the new pointers and allocate pages*/ + memset(&vpx->vp_pages[vpx->vp_size], 0, sizeof(struct page *) * size); + if (vp_alloc_pages(vpx, vpx->vp_size, vpx->vp_size + size)) + goto out; + + vpx->vp_size += size; + return 0; +out: + return -ENOMEM; +} + +int vp_init(struct gfs2_sbd *sdp, struct vbuf *vb, int init_cap) +{ + int cap, err = -ENOMEM; + struct vp_ctx *vpx; + + cap = DIV_ROUND_UP(init_cap, PAGE_SIZE); + + vpx = kmalloc(sizeof(struct vp_ctx), GFP_KERNEL); + if (vpx == NULL) + goto out; + + vpx->vp_magic = VP_MAGIC; + vpx->vp_size = cap; + 
vpx->vp_pages = kzalloc(sizeof(struct page *) * cap, GFP_KERNEL); + if (vpx->vp_pages == NULL) + goto free; + + if (vp_alloc_pages(vpx, 0, cap)) + goto free_all; + + vpx->vp_baseptr = vpx->vp_top = page_address(vpx->vp_pages[0]); + vpx->vp_sdp = sdp; + vb->v_ptr = vpx->vp_baseptr; + vb->v_opaque = vpx; + + err = 0; + goto out; + +free_all: + vp_free_pages(vpx); + kfree(vpx->vp_pages); +free: + kfree(vpx); + vpx = NULL; +out: + return err; +} + +void vp_uninit(struct vbuf *vb) +{ + struct vp_ctx *vpx; + + if (!vb || !vb->v_opaque) + return; + + vpx = vb->v_opaque; + if (vpx->vp_magic != VP_MAGIC) + return; + + vp_free_pages(vpx); + kfree(vpx->vp_pages); + kfree(vpx); + vb->v_ptr = vb->v_opaque = NULL; +} + +static int vp_rw_pages(struct vp_ctx *vpx, void *to, const void *from, + size_t count, int what) +{ + int pg_ind, pg_off, bytes, rw = 0; + + while (count > 0) { + pg_ind = what == VP_READ ? VP_PAGE_INDEX(vpx, from) + : VP_PAGE_INDEX(vpx, to); + pg_off = what == VP_READ ? VP_PAGE_OFFSET(vpx, from) + : VP_PAGE_OFFSET(vpx, to); + bytes = what == VP_READ ? VP_PAGE_BYTES_LEFT(vpx, from) + : VP_PAGE_BYTES_LEFT(vpx, to); + bytes = min(count, (size_t) bytes); + + if (what == VP_READ) + memcpy(to, VP_PAGE_PTR(vpx, pg_ind, pg_off), bytes); + else { + if (what == VP_WRITE) + memcpy(VP_PAGE_PTR(vpx, pg_ind, pg_off), + from, bytes); + else if (what == VP_MEMSET) + memset(VP_PAGE_PTR(vpx, pg_ind, pg_off), + (*(const int*)from), bytes); + if ((to + count) > vpx->vp_top) + vpx->vp_top = to + count; + } + to += bytes; + if (what != VP_MEMSET) + from += bytes; + rw += bytes; +
[Cluster-devel] [RFC PATCH 0/5] xgetdents system call
The xgetdents() system call takes 5 arguments: fd - file descriptor of the directory; flags - flags used by xstat; mask - field mask used to request stat/xattr info; buf - user buffer to return collected info; count - size of user buffer in bytes. xgetdents() should read the directory entries, collect stat and xattr information for each entry as requested and return all the data back to the user in a container structure (linux_xdirent) as part of the supplied user buffer. Abhi Das (5): fs: xstat system call VFS bits fs: Add xgetdents system call and xreaddir file operation gfs2: Add a dynamic buffer backed by a vector of pages gfs2: Add sort functionality with extra parameter gfs2: Add xreaddir file operation and supporting functions arch/x86/syscalls/syscall_32.tbl |3 + arch/x86/syscalls/syscall_64.tbl |3 + fs/gfs2/Makefile |3 +- fs/gfs2/dir.c| 80 +-- fs/gfs2/dir.h| 13 +- fs/gfs2/export.c |2 +- fs/gfs2/file.c | 17 +- fs/gfs2/incore.h |6 + fs/gfs2/inode.c |3 +- fs/gfs2/inode.h |5 + fs/gfs2/ops_fstype.c |4 + fs/gfs2/sys.c| 26 +- fs/gfs2/util.c | 363 ++ fs/gfs2/util.h | 47 ++ fs/gfs2/xattr.c | 27 +- fs/gfs2/xattr.h | 23 + fs/gfs2/xreaddir.c | 1024 ++ fs/gfs2/xreaddir.h | 84 fs/readdir.c | 42 ++ fs/stat.c| 340 - include/linux/fs.h |5 + include/linux/stat.h | 16 +- include/linux/syscalls.h |5 + include/uapi/linux/fcntl.h |1 + include/uapi/linux/stat.h| 143 ++ 25 files changed, 2196 insertions(+), 89 deletions(-) create mode 100644 fs/gfs2/xreaddir.c create mode 100644 fs/gfs2/xreaddir.h -- 1.8.1.4
[Cluster-devel] [RFC PATCH 2/5] fs: Add xgetdents system call and xreaddir file operation
Also add linux_xdirent structure that will be the container for dirent, stat and xattr info. Signed-off-by: Abhi Das --- arch/x86/syscalls/syscall_32.tbl | 1 + arch/x86/syscalls/syscall_64.tbl | 1 + fs/readdir.c | 42 fs/stat.c| 4 +++- include/linux/fs.h | 1 + include/linux/stat.h | 2 ++ include/uapi/linux/stat.h| 33 +++ 7 files changed, 83 insertions(+), 1 deletion(-) diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index 6d6ca37..35723e3 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -362,3 +362,4 @@ 353i386renameat2 sys_renameat2 354i386xstat sys_xstat 355i386fxstat sys_fxstat +356i386xgetdents sys_xgetdents diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 1308ee3..566aab1 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -325,6 +325,7 @@ 316common renameat2 sys_renameat2 317common xstat sys_xstat 318common fxstat sys_fxstat +319common xgetdents sys_xgetdents # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/fs/readdir.c b/fs/readdir.c index 33fd922..d676088 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -224,6 +224,48 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, return error; } +SYSCALL_DEFINE5(xgetdents, unsigned int, fd, unsigned, flags, unsigned int, mask, + void __user *, buf, unsigned int, count) +{ + struct fd f; + struct inode *inode; + int error = -ENOTDIR; + + if (!count) + return -EINVAL; + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + inode = f.file->f_path.dentry->d_inode; + + error = -ENOTSUPP; + if (!f.file->f_op || !f.file->f_op->xreaddir) + goto out; + + error = security_file_permission(f.file, MAY_READ); + if (error) + goto out; + + error = mutex_lock_killable(&inode->i_mutex); + if (error) + goto out; + + error = -ENOENT; + if (!IS_DEADDIR(inode)) { + error = f.file->f_op->xreaddir(f.file, 
flags, mask, buf, count); + file_accessed(f.file); + } + mutex_unlock(&inode->i_mutex); +out: + fdput(f); + return error; +} + struct getdents_callback64 { struct dir_context ctx; struct linux_dirent64 __user * current_dir; diff --git a/fs/stat.c b/fs/stat.c index 1fd0b3e..db45f8b 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -651,7 +651,7 @@ static int xstat_get_params(unsigned int mask, struct xstat __user *buffer, * Otherwise we copy the extended stats to userspace and return the amount of * data written into the buffer (or -EFAULT). */ -static long xstat_set_result(struct kstat *stat, struct xstat __user *buffer) +long xstat_set_result(struct kstat *stat, struct xstat __user *buffer) { u32 mask = stat->result_mask, gran = stat->tv_granularity; @@ -701,6 +701,8 @@ static long xstat_set_result(struct kstat *stat, struct xstat __user *buffer) return 0; } +EXPORT_SYMBOL(xstat_set_result); + /* * System call to get extended stats by path */ diff --git a/include/linux/fs.h b/include/linux/fs.h index b91f235..79c7d39 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1464,6 +1464,7 @@ struct file_operations { ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iterate) (struct file *, struct dir_context *); + size_t (*xreaddir) (struct file *, unsigned int, unsigned int, void __user *, size_t); unsigned int (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); diff --git a/include/linux/stat.h b/include/linux/stat.h index 552e047..75be415 100644 --- a/include/linux/stat.h +++ b/include/linux/stat.h @@ -46,4 +46,6 @@ struct kstat { unsigned char volume_id[16]; /* volume identifier */ }; +long xstat_set_result(struct kstat *stat, struct xstat __user *buffer); + #endif diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 2907352..d7ea6c5 
100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -90,6 +90,14 @@ #define XSTAT_VOLUME_ID0x8000U /* want/got st_volume_id */ #define XSTAT_ALL_STATS0xU /* a