[PATCH] lscp: accelerate backward checkpoint listing
If the minimum checkpoint number of valid checkpoints is large to some extent, lscp -r command takes very long time: $ lscp -r CNODATE TIME MODE FLG BLKCNT ICNT 6541269 2015-02-11 18:38:30 cp- 435 2 6541268 2015-02-11 18:38:25 cp- 484 51 the command looks to enter a busy loop This is because it tries to find younger checkpoints tracking back the checkpoint list in a constant step size. This fixes the issue by lengthening or shortening the step size depending on whether the backward search found a younger checkpoint or not. This patch also inserts a dummy nilfs_get_cpinfo() call before starting the backward search to make successive nilfs_get_cpinfo() calls much faster. Signed-off-by: Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp --- bin/lscp.c | 96 -- 1 file changed, 87 insertions(+), 9 deletions(-) diff --git a/bin/lscp.c b/bin/lscp.c index c855def..023be5c 100644 --- a/bin/lscp.c +++ b/bin/lscp.c @@ -84,6 +84,12 @@ static const struct option long_option[] = { #define LSCP_NCPINFO 512 #define LSCP_MINDELTA 64 /* Minimum delta for reverse direction */ +enum lscp_state { + LSCP_INIT_ST, /* Initial state */ + LSCP_NORMAL_ST, /* Normal state */ + LSCP_ACCEL_ST, /* Accelerate state */ + LSCP_DECEL_ST, /* Decelerate state */ +}; static __u64 param_index; static __u64 param_lines; @@ -176,35 +182,107 @@ static int lscp_backward_cpinfo(struct nilfs *nilfs, struct nilfs_cpinfo *cpi; nilfs_cno_t sidx; /* start index (inclusive) */ nilfs_cno_t eidx; /* end index (exclusive) */ - __u64 rest, delta; + nilfs_cno_t prev_head = 0; + __u64 rest, delta, v; + int state = LSCP_INIT_ST; ssize_t n; rest = param_lines param_lines cpstat-cs_ncps ? param_lines : cpstat-cs_ncps; + if (!rest) + goto out; eidx = param_index param_index cpstat-cs_cno ? param_index + 1 : cpstat-cs_cno; - for ( ; rest 0 eidx NILFS_CNO_MIN; eidx = sidx) { - delta = min_t(__u64, LSCP_NCPINFO, - max_t(__u64, rest, LSCP_MINDELTA)); - sidx = (eidx = NILFS_CNO_MIN + delta) ? eidx - delta : - NILFS_CNO_MIN; +recalc_delta: + delta = min_t(__u64, LSCP_NCPINFO, max_t(__u64, rest, LSCP_MINDELTA)); + v = delta; - n = lscp_get_cpinfo(nilfs, sidx, NILFS_CHECKPOINT, eidx - sidx); + while (eidx NILFS_CNO_MIN) { + if (eidx NILFS_CNO_MIN + v || state == LSCP_INIT_ST) + sidx = NILFS_CNO_MIN; + else + sidx = eidx - v; + + n = lscp_get_cpinfo(nilfs, sidx, NILFS_CHECKPOINT, + state == LSCP_NORMAL_ST ? eidx - sidx : 1); if (n 0) return n; if (!n) break; - for (cpi = cpinfos + n - 1; cpi = cpinfos rest 0; cpi--) { + if (state == LSCP_INIT_ST) { + /* +* This state makes succesive +* nilfs_get_cpinfo() calls much faster by +* setting minimum checkpoint number in nilfs +* struct. +*/ + if (cpinfos[0].ci_cno = eidx) + goto out; /* out of range */ + state = LSCP_NORMAL_ST; + continue; + } else if (cpinfos[0].ci_cno == prev_head) { + /* No younger checkpoint was found */ + + if (sidx == NILFS_CNO_MIN) + break; + + /* go further back */ + switch (state) { + case LSCP_NORMAL_ST: + state = LSCP_ACCEL_ST; + /* fall through */ + case LSCP_ACCEL_ST: + if ((v 1) v) + v = 1; + break; + case LSCP_DECEL_ST: + state = LSCP_NORMAL_ST; + v = delta; + break; + } + eidx = sidx; + continue; + } else { + switch (state) { + case LSCP_ACCEL_ST: + case LSCP_DECEL_ST: + if (cpinfos[n - 1].ci_cno + 1 prev_head) { + /* search again more slowly
[PATCH 3/7] nilfs2: use bgl_lock_ptr()
Simplify nilfs_mdt_bgl_lock() by utilizing bgl_lock_ptr() helper in linux/blockgroup_lock.h. Signed-off-by: Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp --- fs/nilfs2/mdt.h | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index ab172e8..a294ea3 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -111,7 +111,10 @@ static inline __u64 nilfs_mdt_cno(struct inode *inode) return ((struct the_nilfs *)inode-i_sb-s_fs_info)-ns_cno; } -#define nilfs_mdt_bgl_lock(inode, bg) \ - (NILFS_MDT(inode)-mi_bgl-locks[(bg) (NR_BG_LOCKS-1)].lock) +static inline spinlock_t * +nilfs_mdt_bgl_lock(struct inode *inode, unsigned int block_group) +{ + return bgl_lock_ptr(NILFS_MDT(inode)-mi_bgl, block_group); +} #endif /* _NILFS_MDT_H */ -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 9/9] nilfs2: prevent starvation of segments protected by snapshots
/nilfs2_fs.h @@ -222,11 +222,13 @@ struct nilfs_super_block { */ #define NILFS_FEATURE_COMPAT_SUFILE_EXTENSION(1ULL 0) #define NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS (1ULL 1) +#define NILFS_FEATURE_COMPAT_TRACK_SNAPSHOTS (1ULL 2) #define NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT (1ULL 0) #define NILFS_FEATURE_COMPAT_SUPP(NILFS_FEATURE_COMPAT_SUFILE_EXTENSION \ - | NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS) + | NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS \ + | NILFS_FEATURE_COMPAT_TRACK_SNAPSHOTS) #define NILFS_FEATURE_COMPAT_RO_SUPP NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT #define NILFS_FEATURE_INCOMPAT_SUPP 0ULL You don't have to add three compat flags just for this one patchset. Please unify it. #define NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS(1ULL 0) looks to be enough. Regards, Ryusuke Konishi @@ -630,7 +632,7 @@ struct nilfs_segment_usage { __le32 su_nblocks; __le32 su_flags; __le32 su_nlive_blks; - __le32 su_pad; + __le32 su_nsnapshot_blks; __le64 su_nlive_lastmod; }; @@ -682,7 +684,7 @@ nilfs_segment_usage_set_clean(struct nilfs_segment_usage *su, size_t susz) su-su_flags = cpu_to_le32(0); if (susz = NILFS_EXT_SEGMENT_USAGE_SIZE) { su-su_nlive_blks = cpu_to_le32(0); - su-su_pad = cpu_to_le32(0); + su-su_nsnapshot_blks = cpu_to_le32(0); su-su_nlive_lastmod = cpu_to_le64(0); } } @@ -723,7 +725,7 @@ struct nilfs_suinfo { __u32 sui_nblocks; __u32 sui_flags; __u32 sui_nlive_blks; - __u32 sui_pad; + __u32 sui_nsnapshot_blks; __u64 sui_nlive_lastmod; }; @@ -770,6 +772,7 @@ enum { NILFS_SUINFO_UPDATE_FLAGS, NILFS_SUINFO_UPDATE_NLIVE_BLKS, NILFS_SUINFO_UPDATE_NLIVE_LASTMOD, + NILFS_SUINFO_UPDATE_NSNAPSHOT_BLKS, __NR_NILFS_SUINFO_UPDATE_FIELDS, }; @@ -794,6 +797,7 @@ NILFS_SUINFO_UPDATE_FNS(LASTMOD, lastmod) NILFS_SUINFO_UPDATE_FNS(NBLOCKS, nblocks) NILFS_SUINFO_UPDATE_FNS(FLAGS, flags) NILFS_SUINFO_UPDATE_FNS(NLIVE_BLKS, nlive_blks) +NILFS_SUINFO_UPDATE_FNS(NSNAPSHOT_BLKS, nsnapshot_blks) NILFS_SUINFO_UPDATE_FNS(NLIVE_LASTMOD, nlive_lastmod) #define NILFS_MIN_SUINFO_UPDATE_SIZE \ -- 2.3.0 -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/9] nilfs2: add simple tracking of block deletions and updates
) nilfs_segctor_fill_in_super_root(sci, nilfs); } - nilfs_segctor_update_segusage(sci, nilfs-ns_sufile); + nilfs_segctor_update_segusage(sci, nilfs); /* Write partial segments */ nilfs_segctor_prepare_write(sci); Please separate changes below. diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 69bd801..606fdfc 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -630,6 +630,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) get_random_bytes(nilfs-ns_next_generation, sizeof(nilfs-ns_next_generation)); + nilfs-ns_feature_compat = le64_to_cpu(sbp-s_feature_compat); + nilfs-ns_feature_compat_ro = le64_to_cpu(sbp-s_feature_compat_ro); + nilfs-ns_feature_incompat = le64_to_cpu(sbp-s_feature_incompat); + err = nilfs_store_disk_layout(nilfs, sbp); if (err) goto failed_sbh; diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 23778d3..87cab10 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -101,6 +101,9 @@ enum { * @ns_dev_kobj: /sys/fs/nilfs/device * @ns_dev_kobj_unregister: completion state * @ns_dev_subgroups: device subgroups pointer + * @ns_feature_compat: Compatible feature set + * @ns_feature_compat_ro: Read-only compatible feature set + * @ns_feature_incompat: Incompatible feature set */ struct the_nilfs { unsigned long ns_flags; @@ -201,6 +204,11 @@ struct the_nilfs { struct kobject ns_dev_kobj; struct completion ns_dev_kobj_unregister; struct nilfs_sysfs_dev_subgroups *ns_dev_subgroups; + + /* Features */ + __u64 ns_feature_compat; + __u64 ns_feature_compat_ro; + __u64 ns_feature_incompat; }; #define THE_NILFS_FNS(bit, name) \ @@ -393,4 +401,12 @@ static inline int nilfs_flush_device(struct the_nilfs *nilfs) return err; } +static inline int nilfs_feature_track_live_blks(struct the_nilfs *nilfs) +{ + return (nilfs-ns_feature_compat + NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS) + (nilfs-ns_feature_compat + NILFS_FEATURE_COMPAT_SUFILE_EXTENSION); +} + This should be written as below: static inline int nilfs_feature_track_live_blks(struct the_nilfs *nilfs) { const __u64 required_bits = NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS | NILFS_FEATURE_COMPAT_SUFILE_EXTENSION; return ((nilfs-ns_feature_compat required_bits) == required_bits); } Or you can drop the track flag at mount time if NILFS_FEATURE_COMPAT_SUFILE_EXTENSION flag is not set or nilfs_sufile_ext_supported(sufile) is false. Regards, Ryusuke Konishi #endif /* _THE_NILFS_H */ diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index 5d83c55..6ccb2ad 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -221,10 +221,12 @@ struct nilfs_super_block { * doesn't know about, it should refuse to mount the filesystem. */ #define NILFS_FEATURE_COMPAT_SUFILE_EXTENSION(1ULL 0) +#define NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS (1ULL 1) #define NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT (1ULL 0) -#define NILFS_FEATURE_COMPAT_SUPPNILFS_FEATURE_COMPAT_SUFILE_EXTENSION +#define NILFS_FEATURE_COMPAT_SUPP(NILFS_FEATURE_COMPAT_SUFILE_EXTENSION \ + | NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS) #define NILFS_FEATURE_COMPAT_RO_SUPP NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT #define NILFS_FEATURE_INCOMPAT_SUPP 0ULL -- 2.3.0 -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 3/9] nilfs2: extend SUFILE on-disk format to enable counting of live blocks
On Tue, 24 Feb 2015 20:01:38 +0100, Andreas Rohner wrote: *buf, int cleansi, cleansu, dirtysi, dirtysu; long ncleaned = 0, ndirtied = 0; int ret = 0; + bool sup_ext = (supsz = NILFS_EXT_SUINFO_UPDATE_SIZE); + bool su_ext = nilfs_sufile_ext_supported(sufile); if (unlikely(nsup == 0)) return ret; @@ -926,6 +949,9 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, (~0UL __NR_NILFS_SUINFO_UPDATE_FIELDS)) || (nilfs_suinfo_update_nblocks(sup) sup-sup_sui.sui_nblocks + nilfs-ns_blocks_per_segment) + || (nilfs_suinfo_update_nlive_blks(sup) sup_ext + sup-sup_sui.sui_nlive_blks nilfs-ns_blocks_per_segment)) return -EINVAL; } @@ -953,6 +979,14 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, if (nilfs_suinfo_update_nblocks(sup)) su-su_nblocks = cpu_to_le32(sup-sup_sui.sui_nblocks); + if (nilfs_suinfo_update_nlive_blks(sup) sup_ext su_ext) + su-su_nlive_blks = + cpu_to_le32(sup-sup_sui.sui_nlive_blks); + + if (nilfs_suinfo_update_nlive_lastmod(sup) sup_ext su_ext) + su-su_nlive_lastmod = + cpu_to_le64(sup-sup_sui.sui_nlive_lastmod); + if (nilfs_suinfo_update_flags(sup)) { /* * Active flag is a virtual flag projected by running diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index c446325..d56498b 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -28,6 +28,11 @@ #include linux/nilfs2_fs.h #include mdt.h +static inline int +nilfs_sufile_ext_supported(const struct inode *sufile) +{ + return NILFS_MDT(sufile)-mi_entry_size = NILFS_EXT_SEGMENT_USAGE_SIZE; +} static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) { diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index ff3fea3..5d83c55 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -220,9 +220,11 @@ struct nilfs_super_block { * If there is a bit set in the incompatible feature set that the kernel * doesn't know about, it should refuse to mount the filesystem. */ -#define NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT 0x0001ULL +#define NILFS_FEATURE_COMPAT_SUFILE_EXTENSION(1ULL 0) This feature name is not good. sufile can be extended more in a future. You should name it based on the meaning of the extension of this time. As I mentioned in another patch, I think this could be unified to the TRACK_LIVE_BLKS feature that a later patch adds since the live block counting of this patchset is inherently depending on the extention of sufile. -#define NILFS_FEATURE_COMPAT_SUPP0ULL +#define NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT (1ULL 0) + Regards, Ryusuke Konishi -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 6/9] nilfs2: use modification cache to improve performance
); + nilfs_sufile_mc_destroy(mcp); + return; failed: + nilfs_sufile_flush_nlive_blks(nilfs-ns_sufile, mcp); + nilfs_sufile_mc_destroy(mcp); nilfs_warning(ii-vfs_inode.i_sb, __func__, failed to truncate bmap (ino=%lu, err=%d), ii-vfs_inode.i_ino, ret); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 6059f53..dc0070c 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -511,7 +511,8 @@ static int nilfs_collect_file_data(struct nilfs_sc_info *sci, { int err; - err = nilfs_bmap_propagate(NILFS_I(inode)-i_bmap, bh); + err = nilfs_bmap_propagate_with_mc(NILFS_I(inode)-i_bmap, +sci-sc_mc, bh); if (err 0) return err; @@ -526,7 +527,8 @@ static int nilfs_collect_file_node(struct nilfs_sc_info *sci, struct buffer_head *bh, struct inode *inode) { - return nilfs_bmap_propagate(NILFS_I(inode)-i_bmap, bh); + return nilfs_bmap_propagate_with_mc(NILFS_I(inode)-i_bmap, + sci-sc_mc, bh); } static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci, @@ -1386,7 +1388,7 @@ static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci, segbuf-sb_nlive_blks_added = segbuf-sb_sum.nfileblk; if (nilfs_feature_track_live_blks(nilfs)) - nilfs_sufile_mod_nlive_blks(sufile, NULL, + nilfs_sufile_mod_nlive_blks(sufile, sci-sc_mc, segbuf-sb_segnum, segbuf-sb_nlive_blks_added); } @@ -2014,6 +2016,9 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) } nilfs_segctor_update_segusage(sci, nilfs); + nilfs_sufile_flush_nlive_blks(nilfs-ns_sufile, + sci-sc_mc); + /* Write partial segments */ nilfs_segctor_prepare_write(sci); @@ -2603,6 +2608,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb, { struct the_nilfs *nilfs = sb-s_fs_info; struct nilfs_sc_info *sci; + int ret; sci = kzalloc(sizeof(*sci), GFP_KERNEL); if (!sci) @@ -2633,6 +2639,18 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb, sci-sc_interval = HZ * nilfs-ns_interval; if (nilfs-ns_watermark) sci-sc_watermark = nilfs-ns_watermark; + + if (nilfs_feature_track_live_blks(nilfs)) { + sci-sc_mc = kmalloc(sizeof(*(sci-sc_mc)), GFP_KERNEL); + if (sci-sc_mc) { + ret = nilfs_sufile_mc_init(sci-sc_mc, +NILFS_SUFILE_MC_SIZE_EXT); + if (ret) { + kfree(sci-sc_mc); + sci-sc_mc = NULL; + } + } + } return sci; } @@ -2701,6 +2719,8 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) down_write(nilfs-ns_segctor_sem); del_timer_sync(sci-sc_timer); + nilfs_sufile_mc_destroy(sci-sc_mc); + kfree(sci-sc_mc); kfree(sci); } diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index a48d6de..a857527 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -80,6 +80,7 @@ struct nilfs_cstage { }; struct nilfs_segment_buffer; +struct nilfs_sufile_mod_cache; struct nilfs_segsum_pointer { struct buffer_head *bh; @@ -129,6 +130,7 @@ struct nilfs_segsum_pointer { * @sc_watermark: Watermark for the number of dirty buffers * @sc_timer: Timer for segctord * @sc_task: current thread of segctord + * @sc_mc: mod cache to add up updates for SUFILE during seg construction */ struct nilfs_sc_info { struct super_block *sc_super; @@ -185,6 +187,7 @@ struct nilfs_sc_info { struct timer_list sc_timer; struct task_struct *sc_task; + struct nilfs_sufile_mod_cache *sc_mc; }; /* sc_flags */ Again, I really hope you eliminate this changes by hiding the cache in sufile. Regards, Ryusuke Konishi -- 2.3.0 -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/9] nilfs2: add simple cache for modifications to SUFILE
On Tue, 24 Feb 2015 20:01:37 +0100, Andreas Rohner wrote: This patch adds a simple, small cache that can be used to accumulate modifications to SUFILE entries. This is for example useful for keeping track of reclaimable blocks, because most of the modifications consist of small increments or decrements. By adding these up and temporarily storing them in a small cache, the performance can be improved. Additionally lock contention is reduced. Signed-off-by: Andreas Rohner andreas.roh...@gmx.net --- fs/nilfs2/sufile.c | 178 + fs/nilfs2/sufile.h | 44 + 2 files changed, 222 insertions(+) diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 1e8cac6..a369c30 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -1168,6 +1168,184 @@ out_sem: } /** + * nilfs_sufile_mc_init - inits segusg modification cache + * @mc: modification cache + * @capacity: maximum capacity of the mod cache + * + * Description: Allocates memory for an array of nilfs_sufile_mod structures + * according to @capacity. This memory must be freed with + * nilfs_sufile_mc_destroy(). + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid capacity. + */ +int nilfs_sufile_mc_init(struct nilfs_sufile_mod_cache *mc, size_t capacity) +{ + mc-mc_capacity = capacity; + if (!capacity) + return -EINVAL; + + mc-mc_mods = kmalloc(capacity * sizeof(struct nilfs_sufile_mod), + GFP_KERNEL); GFP_NOFS must be used instead of GFP_KERNEL to avoid initiating other filesystem operations. The abbreviation mc is not good, which is already used as the abbreviation of minimum clean in userland. + if (!mc-mc_mods) + return -ENOMEM; + + mc-mc_size = 0; + + return 0; +} + +/** + * nilfs_sufile_mc_add - add signed value to segusg modification cache + * @mc: modification cache + * @segnum: segment number + * @value: signed value (can be positive and negative) + * + * Description: nilfs_sufile_mc_add() tries to add a pair of @segnum and + * @value to the modification cache. If the cache already contains a + * segment number equal to @segnum, then @value is simply added to the + * existing value. This way thousands of small modifications can be + * accumulated into one value. If @segnum cannot be found and the + * capacity allows it, a new element is added to the cache. If the + * capacity is reached an error value is returned. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-ENOSPC - The mod cache has reached its capacity and must be flushed. + */ +static inline int nilfs_sufile_mc_add(struct nilfs_sufile_mod_cache *mc, + __u64 segnum, __s64 value) +{ + struct nilfs_sufile_mod *mods = mc-mc_mods; + int i; + + for (i = 0; i mc-mc_size; ++i, ++mods) { + if (mods-m_segnum == segnum) { + mods-m_value += value; + return 0; + } + } + + if (mc-mc_size mc-mc_capacity) { + mods-m_segnum = segnum; + mods-m_value = value; + mc-mc_size++; + return 0; + } + + return -ENOSPC; +} + +/** + * nilfs_sufile_mc_clear - set mc_size to 0 + * @mc: modification cache + * + * Description: nilfs_sufile_mc_clear() sets mc_size to 0, which enables + * nilfs_sufile_mc_add() to overwrite the elements in @mc. + */ +static inline void nilfs_sufile_mc_clear(struct nilfs_sufile_mod_cache *mc) +{ + mc-mc_size = 0; +} + +/** + * nilfs_sufile_mc_reset - clear cache and add one element + * @mc: modification cache + * @segnum: segment number + * @value: signed value (can be positive and negative) + * + * Description: Clears the modification cache in @mc and adds a new pair of + * @segnum and @value to it at the same time. + */ +static inline void nilfs_sufile_mc_reset(struct nilfs_sufile_mod_cache *mc, + __u64 segnum, __s64 value) +{ + struct nilfs_sufile_mod *mods = mc-mc_mods; + + mods-m_segnum = segnum; + mods-m_value = value; + mc-mc_size = 1; +} The name of this function is confusing. Actual meaning of this function is reset and add, and that can be replaced with mc_clear and mc_add. Remove this function to simplify interface. Regards, Ryusuke Konishi +/** + * nilfs_sufile_mc_flush - flush modification cache + * @sufile: inode of segment usage file + * @mc: modification cache + * @dofunc: primitive operation for the update + * + * Description: nilfs_sufile_mc_flush() flushes the cached modifications + * and applies them to the segment usages on disk
Re: [PATCH 0/9] nilfs2: implementation of cost-benefit GC policy
On Sat, 14 Mar 2015 13:24:25 +0100, Andreas Rohner wrote: Hi Ryusuke, Thank you very much for your detailed review and feedback. I agree with all of your points and I will start working on a rewrite immediately. On 2015-03-12 13:54, Ryusuke Konishi wrote: Hi Andreas, On Tue, 10 Mar 2015 21:37:50 +0100, Andreas Rohner wrote: Hi Ryusuke, Thanks for your thorough review. On 2015-03-10 06:21, Ryusuke Konishi wrote: Hi Andreas, I looked through whole kernel patches and a part of util patches. Overall comments are as follows: [Algorithm] As for algorithm, it looks about OK except for the starvation countermeasure. The stavation countermeasure looks adhoc/hacky, but it's good that it doesn't change kernel/userland interface; we may be able to replace it with better ways in a future or in a revised version of this patchset. (1) Drawback of the starvation countermeasure The patch 9/9 looks to make the execution time of chcp operation worse since it will scan through sufile to modify live block counters. How much does it prolong the execution time ? I'll do some tests, but I haven't noticed any significant performance drop. The GC basically does the same thing, every time it selects segments to reclaim. GC is performed in background by an independent process. What I'm care about it that NILFS_IOCTL_CHANGE_CPMODE ioctl is called from command line interface or application. They differ in this meaning. Was a worse case senario considered in the test ? For example: 1. Fill a TB class drive with data file(s), and make a snapshot on it. 2. Run one pass GC to update snapshot block counts. 3. And do chcp cp If we don't observe noticeable delay on this class of drive, then I think we can put the problem off. Yesterday I did a worst case test as you suggested. I used an old 1 TB hard drive I had lying around. This was my setup: 1. Write a 850GB file 2. Create a snapshot 3. Delete the file 4. Let GC run through all segments 5. Verify with lssu that the GC has updated all SUFILE entries 6. Drop the page cache 7. chcp cp The following results are with the page cache dropped immediately before each call: 1. chcp ss real 0m1.337s user 0m0.017s sys 0m0.030s 2. chcp cp real 0m6.377s user 0m0.023s sys 0m0.053s The following results are without the drop of the page cache: 1. chcp ss real 0m0.137s user 0m0.010s sys 0m0.000s 2. chcp cp real 0m0.016s user 0m0.010s sys 0m0.007s There are 119233 segments in my test. Each SUFILE entry uses 32 bytes. So the worst case for 1 TB with 8 MB segments would be 3.57 MB of random reads and one 3.57 MB continuous write. You only get 6.377s because my hard drive is so slow. You wouldn't notice any difference on a modern SSD. Furthermore the SUFILE is also scanned by the segment allocation algorithm and the GC, so it is very likely already in the page cache. 6.377s is too long because nilfs_sufile_fix_starving_segs() locks sufile mi_sem, and even lengthens lock period of the following locks: - cpfile mi_sem (held at nilfs_cpfile_clear_snapshot()). - transaction lock (held at nilfs_ioctl_change_cpmode()). - ns_snapshot_mount_mutex (held at nilfs_ioctl_change_cpmode()). leading to freeze of all write operations, lssu, lscp, cleanerd, and snapshot mount, etc. It is preferable for the function to be moved outside of them and to release/reacquire transaction lock and sufile mi_sem regularly in some way. Regards, Ryusuke Konishi -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/7] nilfs2: unify type of key arguments in bmap interface
The type of key arguments in block mapping interface varies depending on function. For instance, nilfs_bmap_lookup_at_level() takes __u64 for its key argument whereas nilfs_bmap_lookup() takes unsigned long. This fits them to __u64 to eliminate the variation. Signed-off-by: Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp --- fs/nilfs2/alloc.c | 5 +++-- fs/nilfs2/bmap.c | 17 ++--- fs/nilfs2/bmap.h | 8 fs/nilfs2/inode.c | 6 +++--- 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 741fd02..8df0f3b 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -405,13 +405,14 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, static int nilfs_palloc_count_desc_blocks(struct inode *inode, unsigned long *desc_blocks) { - unsigned long blknum; + __u64 blknum; int ret; ret = nilfs_bmap_last_key(NILFS_I(inode)-i_bmap, blknum); if (likely(!ret)) *desc_blocks = DIV_ROUND_UP( - blknum, NILFS_MDT(inode)-mi_blocks_per_desc_block); + (unsigned long)blknum, + NILFS_MDT(inode)-mi_blocks_per_desc_block); return ret; } diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index aadbd0b..c82f436 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -152,9 +152,7 @@ static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) * * %-EEXIST - A record associated with @key already exist. */ -int nilfs_bmap_insert(struct nilfs_bmap *bmap, - unsigned long key, - unsigned long rec) +int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec) { int ret; @@ -191,19 +189,16 @@ static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key) return bmap-b_ops-bop_delete(bmap, key); } -int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key) +int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp) { - __u64 lastkey; int ret; down_read(bmap-b_sem); - ret = bmap-b_ops-bop_last_key(bmap, lastkey); + ret = bmap-b_ops-bop_last_key(bmap, keyp); up_read(bmap-b_sem); if (ret 0) ret = nilfs_bmap_convert_error(bmap, __func__, ret); - else - *key = lastkey; return ret; } @@ -224,7 +219,7 @@ int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key) * * %-ENOENT - A record associated with @key does not exist. */ -int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key) +int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key) { int ret; @@ -235,7 +230,7 @@ int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key) return nilfs_bmap_convert_error(bmap, __func__, ret); } -static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key) +static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, __u64 key) { __u64 lastkey; int ret; @@ -276,7 +271,7 @@ static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key) * * %-ENOMEM - Insufficient amount of memory available. */ -int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key) +int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key) { int ret; diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index b89e680..9230d33 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h @@ -153,10 +153,10 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned); -int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long); -int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long); -int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *); -int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long); +int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec); +int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key); +int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp); +int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key); void nilfs_bmap_clear(struct nilfs_bmap *); int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *); void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 8b59695..cf9e489 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -106,7 +106,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, err = nilfs_transaction_begin(inode-i_sb, ti, 1); if (unlikely(err)) goto out; - err = nilfs_bmap_insert(ii-i_bmap, (unsigned long)blkoff
[PATCH 1/7] nilfs2: do not use async write flag for segment summary buffers
The async write flag is introduced to nilfs2 in the commit 7f42ec394156 (nilfs2: fix issue with race condition of competition between segments for dirty blocks), but the flag only makes sense for data buffers and btree node buffers. It is not needed for segment summary buffers. This gits rid of the latter uses as part of refactoring of atomic bit operations on buffer state bitmap. Signed-off-by: Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp Cc: Vyacheslav Dubeyko sl...@dubeyko.com --- fs/nilfs2/segment.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 0c3f303..c9a4e60 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1588,7 +1588,6 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) list_for_each_entry(bh, segbuf-sb_segsum_buffers, b_assoc_buffers) { - set_buffer_async_write(bh); if (bh-b_page != bd_page) { if (bd_page) { lock_page(bd_page); @@ -1688,7 +1687,6 @@ static void nilfs_abort_logs(struct list_head *logs, int err) list_for_each_entry(segbuf, logs, sb_list) { list_for_each_entry(bh, segbuf-sb_segsum_buffers, b_assoc_buffers) { - clear_buffer_async_write(bh); if (bh-b_page != bd_page) { if (bd_page) end_page_writeback(bd_page); @@ -1768,7 +1766,6 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) b_assoc_buffers) { set_buffer_uptodate(bh); clear_buffer_dirty(bh); - clear_buffer_async_write(bh); if (bh-b_page != bd_page) { if (bd_page) end_page_writeback(bd_page); -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/9] nilfs2: implementation of cost-benefit GC policy
Hi Andreas, On Tue, 10 Mar 2015 21:37:50 +0100, Andreas Rohner wrote: Hi Ryusuke, Thanks for your thorough review. On 2015-03-10 06:21, Ryusuke Konishi wrote: Hi Andreas, I looked through whole kernel patches and a part of util patches. Overall comments are as follows: [Algorithm] As for algorithm, it looks about OK except for the starvation countermeasure. The stavation countermeasure looks adhoc/hacky, but it's good that it doesn't change kernel/userland interface; we may be able to replace it with better ways in a future or in a revised version of this patchset. (1) Drawback of the starvation countermeasure The patch 9/9 looks to make the execution time of chcp operation worse since it will scan through sufile to modify live block counters. How much does it prolong the execution time ? I'll do some tests, but I haven't noticed any significant performance drop. The GC basically does the same thing, every time it selects segments to reclaim. GC is performed in background by an independent process. What I'm care about it that NILFS_IOCTL_CHANGE_CPMODE ioctl is called from command line interface or application. They differ in this meaning. Was a worse case senario considered in the test ? For example: 1. Fill a TB class drive with data file(s), and make a snapshot on it. 2. Run one pass GC to update snapshot block counts. 3. And do chcp cp If we don't observe noticeable delay on this class of drive, then I think we can put the problem off. In a use case of nilfs, many snapshots are created and they are automatically changed back to plain checkpoints because old snapshots are thinned out over time. The patch 9/9 may impact on such usage. (2) Compatibility What will happen in the following case: 1. Create a file system, use it with the new module, and create snapshots. 2. Mount it with an old module, and release snapshot with chcp cp 3. Mount it with the new module, and cleanerd runs gc with cost benefit or greedy policy. Some segments could be subject to starvation. But it would probably only affect a small number of segments and it could be fixed by chcp ss CP; chcp cp CP. Ok, let's treat this as a restriction for now. If you come up with any good idea, please propose. (3) Durability against unexpected power failures (just a note) The current patchset looks not to cause starvation issue even when unexpected power failure occurs during or after executing chcp cp because nilfs_ioctl_change_cpmode() do changes in a transactional way with nilfs_transaction_begin/commit. We should always think this kind of situtation to keep consistency. [Coding Style] (4) This patchset has several coding style issues. Please fix them and re-check with the latest checkpatch script (script/checkpatch.pl). I'll fix that. Sorry. patch 2: WARNING: Prefer kmalloc_array over kmalloc with multiply #85: FILE: fs/nilfs2/sufile.c:1192: +mc-mc_mods = kmalloc(capacity * sizeof(struct nilfs_sufile_mod), patch 5,6: WARNING: 'aquired' may be misspelled - perhaps 'acquired'? #60: the same semaphore has to be aquired. So if the DAT-Entry belongs to WARNING: 'aquired' may be misspelled - perhaps 'acquired'? #46: be aquired, which blocks the entire SUFILE and effectively turns WARNING: 'aquired' may be misspelled - perhaps 'acquired'? #53: afore mentioned lock only needs to be aquired, if the cache is full (5) sub_sizeof macro: The same definition exists as offsetofend() in vfio.h, and a patch to move it to stddef.h is now proposed. Please use the same name, and redefine it only if it's not defined: #ifndef offsetofend #define offsetofend(TYPE, MEMBER) \ (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)-MEMBER)) #endif Ok I'll change that. [Implementation] (6) b_blocknr Please do not use bh-b_blocknr to store disk block number. This field is used to keep virtual block number except for DAT files. It is only replaced to an actual block number during calling submit_bh(). Keep this policy. As far as I can tell, this is only true for blocks of GC inodes and node blocks. All other buffer_heads are always mapped to on disk blocks by nilfs_get_block(). I only added the mapping in nilfs_segbuf_submit_bh() to correctly set the value in b_blocknr to the new location. nilfs_get_block() is only used for regular files, directories, and so on. Blocks on metadata files are mapped through nilfs_mdt_submit_block(). Anyway, yes, they stores actual disk block number in b_blocknr in the current implementation. But, it is just a cutting corner of the current implementation, which comes from the reason that we have to set actual disk block numbers when reading blocks with vfs/mm functions. Anyway I don't like you touch nilfs_get_block() and nilfs_segbuf_submit_bh() in a part of the big patch. At least
Re: Making Nilfs ZAC Compliant
Hi, On Thu, 26 Feb 2015 19:54:48 +, Benixon Dhas wrote: Hi All, We are trying to make Nilfs work with a SMR Device which adheres to Zoned ATA Commands(ZAC) Specification. One of the restrictions in the specification is reading an unwritten part of the Zone(Segment in Nilfs) will cause a read error. We observe that Nilfs does not write a complete physical segment(we use 256MB segment) always. After digging in the source a while we figured that this is due to the fact that Nilfs requires a certain number of minimum blocks for constructing a partial segment (NILFS_PSEG_MIN_BLOCKS), which currently is 2. So we see some segments where the last block (in our case a block is 4k) is not being written to. For recovery and GC, NILFS needs to insert one or more header blocks before writing payload blocks. Inevitably, the minimum size of a partial segment becomes 2. When some utilities like garbage collector and dump segment reads (May not be an exhaustive list) a segment it tries to read the entire physical segment. This causes read errors in the kernel and hence retries for the last unwritten block in certain segments. The recovery function of NILFS also needs to read entire physical segment. It never reads unwritten blocks if the file system was cleanly unmounted, however, this is not the case for unclean shutdown or panic. Worse yet, if it gets an EIO from the underlying block layer, the recovery will fail and the mount system call will abort. In an attempt to solve this problem we were trying to figure out if we can write some dummy data to the remaining unutilized blocks in the segment. But we are not sure what would be the best way to do this. Another solution we had in mind was to figure out all places where segments are read, and modify it to prevent it from reading unwritten blocks. But we feel this might be more complex solution and might impact performance more. Looks like sufile is available for this purpose. It is maintaining how many blocks are written for each segment. You can see it in the NBLOCKS field of the output of lssu command. One restriction is that this metadata file (sufile) is unavailable until mount system call succeeds. The recovery code cannot use it. Please advise us on the best way to solve the problem. Also what would be architecturally a best place to fix the problem. Writing dummy data to the dead space for SMR devices looks better to me because it's simpler and the performance penalty seems not so high. But, What will happen if an unexpected power failure hits the device ? Does that cause the file system to read unwritten blocks ? If so, it seems that we need translation layer to hide these issues, or a new error code or a new mechanism to make it possible for file systems to know/handle them. Regards, Ryusuke Konishi -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [systemd-devel] nilfs-cleanerd startup on boot
Hi On 2015/03/04 0:44, dennis.mur...@wipro.com wrote: I had mis-typed the address for the nilfs mail group -Original Message- From: Lennart Poettering [mailto:lenn...@poettering.net] Sent: Saturday, February 28, 2015 12:34 PM To: Dennis Murata (WT01 - ENU) Cc: systemd-de...@lists.freedesktop.org; linus-ni...@vger.kernel.org Subject: Re: [systemd-devel] nilfs-cleanerd startup on boot On Fri, 27.02.15 18:31, dennis.mur...@wipro.com (dennis.mur...@wipro.com) wrote: I have a fedora 21 system that where I mount an nilfs2 file system. I use a simple /etc/modules-load.d/nilfs.conf file to load the kernel module and have an entry in the fstab. Creating the modules-load.d snippet should not be necessary, as the kernel should autoload the kernel module for it when it is first required. I did not find this to be the case for fedora 21. Without creating the file to load the module, any attempt I made to mount the file system would get a unknown filetype error. Does this point at adding this module to the initrd file? Is nilfs2.ko installed in your environment? Try modinfo nilfs2 Older fedora needed kernel-modules-extra package. [1] [1] http://nilfs.sourceforge.net/en/pkg_fedora.html The file system mounts on boot as it should, but the nilfs-cleanerd program does not startup. If I umount /nilfs then mount /nilfs the nilfs-cleanerd program starts as it should to cleanup the checkpoints. How is that daemon supposed to be started? Is it forked off /bin/mount? Does systemd use a different mount program at boot? It uses /bin/mount for mounting normal file systems. nilfs_cleanerd is invoked through /sbin/mount.nilfs2 helper. [2] The helper is called from /sbin/mount if it exists. /sbin/mount.nilfs2 is included in nilfs-utils package. nilfs_cleanerd is just a user-land process, so it can be manually invoked if you have root privilege. [3] # /sbin/nilfs_cleanerd device directory But, in this case, you need to kill nilfs_cleanerd manually before umount. So, I recommend running cleanerd through mount.nilfs2. The above explanation may not suit for the recent fedora since nilfs-utils is not yet tuned to systemd environment. [2] http://nilfs.sourceforge.net/en/man8/mount.nilfs2.8.html [3] http://nilfs.sourceforge.net/en/man8/nilfs_cleanerd.8.html Regards, Ryusuke Konishi Is there something else that should be included other than the nilfs.conf file? I have just started using a system with systemd as the init so please forgive my ignorance. I have no idea about nilfs really, and we had no reports about any problems with it before. I wanted to look at the performance of nilfs and f2fs. This is my first try at using these file systems Lennart -- Lennart Poettering, Red Hat -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/1] nilfs2: fix deadlock of segment constructor during recovery
According to a report from Yuxuan Shui, nilfs2 in kernel 3.19 got stuck during recovery at mount time. The code path that caused the deadlock was as follows: nilfs_fill_super() load_nilfs() nilfs_salvage_orphan_logs() * Do roll-forwarding, attach segment constructor for recovery, and kick it. nilfs_segctor_thread() nilfs_segctor_thread_construct() * A lock is held with nilfs_transaction_lock() nilfs_segctor_do_construct() nilfs_segctor_drop_written_files() iput() iput_final() write_inode_now() writeback_single_inode() __writeback_single_inode() do_writepages() nilfs_writepage() nilfs_construct_dsync_segment() nilfs_transaction_lock() -- deadlock This can happen if commit 7ef3ff2fea8b (nilfs2: fix deadlock of segment constructor over I_SYNC flag) is applied and roll-forward recovery was performed at mount time. The roll-forward recovery can happen if datasync write is done and the file system crashes immediately after that. For instance, we can reproduce the issue with the following steps: nilfs2 is mounted on /nilfs (device: /dev/sdb1) # dd if=/dev/zero of=/nilfs/test bs=4k count=1 sync # dd if=/dev/zero of=/nilfs/test conv=notrunc oflag=dsync bs=4k count=1 reboot -nfh the system will immediately reboot # mount -t nilfs2 /dev/sdb1 /nilfs The deadlock occurs because iput() can run segment constructor through writeback_single_inode() if MS_ACTIVE flag is not set on sb-s_flags. The above commit changed segment constructor so that it calls iput() asynchronously for inodes with i_nlink == 0, but that change was imperfect. This fixes the another deadlock by deferring iput() in segment constructor even for the case that mount is not finished, that is, for the case that MS_ACTIVE flag is not set. Reported-by: Yuxuan Shui yshu...@gmail.com Signed-off-by: Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp Cc: Al Viro v...@zeniv.linux.org.uk Tested-by: Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp Cc: sta...@vger.kernel.org --- fs/nilfs2/segment.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 469086b..0c3f303 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1907,6 +1907,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { struct nilfs_inode_info *ii, *n; + int during_mount = !(sci-sc_super-s_flags MS_ACTIVE); int defer_iput = false; spin_lock(nilfs-ns_inode_lock); @@ -1919,10 +1920,10 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, brelse(ii-i_bh); ii-i_bh = NULL; list_del_init(ii-i_dirty); - if (!ii-vfs_inode.i_nlink) { + if (!ii-vfs_inode.i_nlink || during_mount) { /* -* Defer calling iput() to avoid a deadlock -* over I_SYNC flag for inodes with i_nlink == 0 +* Defer calling iput() to avoid deadlocks if +* i_nlink == 0 or mount is not yet finished. */ list_add_tail(ii-i_dirty, sci-sc_iput_queue); defer_iput = true; -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/1] nilfs2: fix deadlock of segment constructor during recovery
Hi Andrew, Please send the following bug fix to upstream. It fixes another deadlock issue of nilfs2 segment constructor, which was recently reported. Thanks, Ryusuke Konishi -- Ryusuke Konishi (1): nilfs2: fix deadlock of segment constructor during recovery fs/nilfs2/segment.c | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/4] nilfs2: unify type of key arguments in bmap interface
The type of key arguments in block mapping interface varies depending on function. For instance, nilfs_bmap_lookup_at_level() takes __u64 for its key argument whereas nilfs_bmap_lookup() takes unsigned long. This fits them to __u64 to eliminate the variation. Signed-off-by: Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp --- fs/nilfs2/alloc.c | 5 +++-- fs/nilfs2/bmap.c | 17 ++--- fs/nilfs2/bmap.h | 8 fs/nilfs2/inode.c | 6 +++--- 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 741fd02..8df0f3b 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -405,13 +405,14 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, static int nilfs_palloc_count_desc_blocks(struct inode *inode, unsigned long *desc_blocks) { - unsigned long blknum; + __u64 blknum; int ret; ret = nilfs_bmap_last_key(NILFS_I(inode)-i_bmap, blknum); if (likely(!ret)) *desc_blocks = DIV_ROUND_UP( - blknum, NILFS_MDT(inode)-mi_blocks_per_desc_block); + (unsigned long)blknum, + NILFS_MDT(inode)-mi_blocks_per_desc_block); return ret; } diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index aadbd0b..c82f436 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -152,9 +152,7 @@ static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) * * %-EEXIST - A record associated with @key already exist. */ -int nilfs_bmap_insert(struct nilfs_bmap *bmap, - unsigned long key, - unsigned long rec) +int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec) { int ret; @@ -191,19 +189,16 @@ static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key) return bmap-b_ops-bop_delete(bmap, key); } -int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key) +int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp) { - __u64 lastkey; int ret; down_read(bmap-b_sem); - ret = bmap-b_ops-bop_last_key(bmap, lastkey); + ret = bmap-b_ops-bop_last_key(bmap, keyp); up_read(bmap-b_sem); if (ret 0) ret = nilfs_bmap_convert_error(bmap, __func__, ret); - else - *key = lastkey; return ret; } @@ -224,7 +219,7 @@ int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key) * * %-ENOENT - A record associated with @key does not exist. */ -int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key) +int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key) { int ret; @@ -235,7 +230,7 @@ int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key) return nilfs_bmap_convert_error(bmap, __func__, ret); } -static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key) +static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, __u64 key) { __u64 lastkey; int ret; @@ -276,7 +271,7 @@ static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key) * * %-ENOMEM - Insufficient amount of memory available. */ -int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key) +int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key) { int ret; diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index b89e680..9230d33 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h @@ -153,10 +153,10 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned); -int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long); -int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long); -int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *); -int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long); +int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec); +int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key); +int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp); +int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key); void nilfs_bmap_clear(struct nilfs_bmap *); int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *); void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 8b59695..cf9e489 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -106,7 +106,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, err = nilfs_transaction_begin(inode-i_sb, ti, 1); if (unlikely(err)) goto out; - err = nilfs_bmap_insert(ii-i_bmap, (unsigned long)blkoff
[PATCH 0/4] nilfs2: shorten execution time of lscp command
The older a filesystem gets, the slower lscp command becomes. This is because nilfs_cpfile_do_get_cpinfo() function meets more hole blocks as the start offset of valid checkpoint numbers gets bigger. This series introduces some helper functions which help to skip hole blocks efficiently, and reduces the overhead with them. A measurement result of this series is as follows: Before: $ time lscp CNODATE TIME MODE FLG BLKCNT ICNT 5769303 2015-02-22 19:31:33 cp- 108 1 5769304 2015-02-22 19:38:54 cp- 108 1 real0m0.182s user0m0.003s sys 0m0.180s After: $ time lscp CNODATE TIME MODE FLG BLKCNT ICNT 5769303 2015-02-22 19:31:33 cp- 108 1 5769304 2015-02-22 19:38:54 cp- 108 1 real0m0.003s user0m0.001s sys 0m0.002s Thanks, Ryusuke Konishi -- Ryusuke Konishi (4): nilfs2: unify type of key arguments in bmap interface nilfs2: add bmap function to seek a valid key nilfs2: add helper to find existent block on metadata file nilfs2: improve execution time of NILFS_IOCTL_GET_CPINFO ioctl fs/nilfs2/alloc.c | 5 +++-- fs/nilfs2/bmap.c | 48 ++- fs/nilfs2/bmap.h | 13 ++- fs/nilfs2/btree.c | 66 ++ fs/nilfs2/cpfile.c | 58 ++- fs/nilfs2/direct.c | 17 ++ fs/nilfs2/inode.c | 6 ++--- fs/nilfs2/mdt.c| 54 fs/nilfs2/mdt.h| 3 +++ 9 files changed, 243 insertions(+), 27 deletions(-) -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/4] nilfs2: improve execution time of NILFS_IOCTL_GET_CPINFO ioctl
The older a filesystem gets, the slower lscp command becomes. This is because nilfs_cpfile_do_get_cpinfo() function meets more hole blocks as the start offset of valid checkpoint numbers gets bigger. This reduces the overhead by skipping hole blocks efficiently with nilfs_mdt_find_block() helper. A measurement result of this patch is as follows: Before: $ time lscp CNODATE TIME MODE FLG BLKCNT ICNT 5769303 2015-02-22 19:31:33 cp- 108 1 5769304 2015-02-22 19:38:54 cp- 108 1 real0m0.182s user0m0.003s sys 0m0.180s After: $ time lscp CNODATE TIME MODE FLG BLKCNT ICNT 5769303 2015-02-22 19:31:33 cp- 108 1 5769304 2015-02-22 19:38:54 cp- 108 1 real0m0.003s user0m0.001s sys 0m0.002s Signed-off-by: Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp --- fs/nilfs2/cpfile.c | 58 -- 1 file changed, 52 insertions(+), 6 deletions(-) diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 0d58075..b6596ca 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -53,6 +53,13 @@ nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno) return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); } +static __u64 nilfs_cpfile_first_checkpoint_in_block(const struct inode *cpfile, + unsigned long blkoff) +{ + return (__u64)nilfs_cpfile_checkpoints_per_block(cpfile) * blkoff + + 1 - NILFS_MDT(cpfile)-mi_first_entry_offset; +} + static unsigned long nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile, __u64 curr, @@ -146,6 +153,44 @@ static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile, create, nilfs_cpfile_block_init, bhp); } +/** + * nilfs_cpfile_find_checkpoint_block - find and get a buffer on cpfile + * @cpfile: inode of cpfile + * @start_cno: start checkpoint number (inclusive) + * @end_cno: end checkpoint number (inclusive) + * @cnop: place to store the next checkpoint number + * @bhp: place to store a pointer to buffer_head struct + * + * Return Value: On success, it returns 0. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-ENOENT - no block exists in the range. + */ +static int nilfs_cpfile_find_checkpoint_block(struct inode *cpfile, + __u64 start_cno, __u64 end_cno, + __u64 *cnop, + struct buffer_head **bhp) +{ + unsigned long start, end, blkoff; + int ret; + + if (unlikely(start_cno end_cno)) + return -ENOENT; + + start = nilfs_cpfile_get_blkoff(cpfile, start_cno); + end = nilfs_cpfile_get_blkoff(cpfile, end_cno); + + ret = nilfs_mdt_find_block(cpfile, start, end, blkoff, bhp); + if (!ret) + *cnop = (blkoff == start) ? start_cno : + nilfs_cpfile_first_checkpoint_in_block(cpfile, blkoff); + return ret; +} + static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile, __u64 cno) { @@ -403,14 +448,15 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, return -ENOENT; /* checkpoint number 0 is invalid */ down_read(NILFS_MDT(cpfile)-mi_sem); - for (n = 0; cno cur_cno n nci; cno += ncps) { - ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno); - ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, bh); + for (n = 0; n nci; cno += ncps) { + ret = nilfs_cpfile_find_checkpoint_block( + cpfile, cno, cur_cno - 1, cno, bh); if (ret 0) { - if (ret != -ENOENT) - goto out; - continue; /* skip hole */ + if (likely(ret == -ENOENT)) + break; + goto out; } + ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno); kaddr = kmap_atomic(bh-b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/1] nilfs2: fix potential memory overrun on inode
On Fri, 20 Feb 2015 13:58:42 -0800, Andrew Morton wrote: On Fri, 20 Feb 2015 22:46:35 +0900 Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp wrote: Each inode of nilfs2 stores a root node of a b-tree, and it turned out to have a memory overrun issue: Each b-tree node of nilfs2 stores a set of key-value pairs and the number of them (in bn_nchildren member of nilfs_btree_node struct), as well as a few other bn_* members. Since the value of bn_nchildren is used for operations on the key-values within the b-tree node, it can cause memory access overrun if a large number is incorrectly set to bn_nchildren. For instance, nilfs_btree_node_lookup() function determines the range of binary search with it, and too large bn_nchildren leads nilfs_btree_node_get_key() in that function to overrun. As for intermediate b-tree nodes, this is prevented by a sanity check performed when each node is read from a drive, however, no sanity check has been done for root nodes stored in inodes. This patch fixes the issue by adding missing sanity check against b-tree root nodes so that it's called when on-memory inodes are read from ifile, inode metadata file. How would one trigger this overrun? Mount an fs with a deliberately corrupted/inconsistent fs image? Yes, this can be triggered by mounting an fs with a corrupted image deliberately or by chance. Memory overrun sounds nasty so I'm thinking we add cc:stable to this one. OK? Agreed. Ryusuke Konishi -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] nilfs2: fix sanity check of btree level in nilfs_btree_root_broken()
The range check for b-tree level parameter in nilfs_btree_root_broken() is wrong; it accepts the case of level == NILFS_BTREE_LEVEL_MAX even though the level is limited to values in the range of 0 to (NILFS_BTREE_LEVEL_MAX - 1). Since the level parameter is read from storage device and used to index nilfs_btree_path array whose element count is NILFS_BTREE_LEVEL_MAX, it can cause memory overrun during btree operations if the boundary value is set to the level parameter on device. This fixes the broken sanity check and adds a comment to clarify that the upper bound NILFS_BTREE_LEVEL_MAX is exclusive. Signed-off-by: Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp Cc: sta...@vger.kernel.org --- fs/nilfs2/btree.c | 2 +- include/linux/nilfs2_fs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 059f371..919fd5b 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -388,7 +388,7 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node, nchildren = nilfs_btree_node_get_nchildren(node); if (unlikely(level NILFS_BTREE_LEVEL_NODE_MIN || -level NILFS_BTREE_LEVEL_MAX || +level = NILFS_BTREE_LEVEL_MAX || nchildren 0 || nchildren NILFS_BTREE_ROOT_NCHILDREN_MAX)) { pr_crit(NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n, diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h index ff3fea3..9abb763 100644 --- a/include/linux/nilfs2_fs.h +++ b/include/linux/nilfs2_fs.h @@ -460,7 +460,7 @@ struct nilfs_btree_node { /* level */ #define NILFS_BTREE_LEVEL_DATA 0 #define NILFS_BTREE_LEVEL_NODE_MIN (NILFS_BTREE_LEVEL_DATA + 1) -#define NILFS_BTREE_LEVEL_MAX 14 +#define NILFS_BTREE_LEVEL_MAX 14 /* Max level (exclusive) */ /** * struct nilfs_palloc_group_desc - block group descriptor -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 4/9] nilfs2: add kmem_cache for SUFILE cache nodes
On Sat, 09 May 2015 21:10:21 +0200, Andreas Rohner wrote: On 2015-05-09 04:41, Ryusuke Konishi wrote: On Sun, 3 May 2015 12:05:17 +0200, Andreas Rohner wrote: +static void nilfs_sufile_cache_node_init_once(void *obj) +{ + memset(obj, 0, sizeof(struct nilfs_sufile_cache_node)); +} + Note that nilfs_sufile_cache_node_init_once() is only called when each cache entry is allocated first time. It doesn't ensure each cache entry is clean when it will be allocated with kmem_cache_alloc() the second time and afterwards. I kind of assumed it would be called for every object returned by kmem_cache_alloc(). In that case I have to do the initialization in nilfs_sufile_alloc_cache_node() and remove this function. Regards, Andreas Rohner You can use kmem_cache_zalloc() instead in that case. Regards, Ryusuke Konishi -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] NILFS2: support NFSv2 export
On Fri, 08 May 2015 10:16:23 +1000, NeilBrown ne...@suse.de wrote: The fh_len passed to -fh_to_* is not guaranteed to be that same as that returned by encode_fh - it may be larger. With NFSv2, the filehandle is fixed length, so it may appear longer than expected and be zero-padded. So we must test that fh_len is at least some value, not exactly equal to it. Signed-off-by: NeilBrown ne...@suse.de --- fs/nilfs2/namei.c |6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 22180836ec22..b65fb79d16fd 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -496,8 +496,8 @@ static struct dentry *nilfs_fh_to_dentry(struct super_block *sb, struct fid *fh, { struct nilfs_fid *fid = (struct nilfs_fid *)fh; - if ((fh_len != NILFS_FID_SIZE_NON_CONNECTABLE - fh_len != NILFS_FID_SIZE_CONNECTABLE) || + if ((fh_len NILFS_FID_SIZE_NON_CONNECTABLE + fh_len NILFS_FID_SIZE_CONNECTABLE) || (fh_type != FILEID_NILFS_WITH_PARENT fh_type != FILEID_NILFS_WITHOUT_PARENT)) return NULL; A bit weird. fh_len NILFS_FID_SIZE_CONNECTABLE implies fh_len NILFS_FID_SIZE_NON_CONNECTABLE. How about the following fix ? if ((fh_type != FILEID_NILFS_WITH_PARENT || fh_len NILFS_FID_SIZE_CONNECTABLE) (fh_type != FILEID_NILFS_WITHOUT_PARENT || fh_len NILFS_FID_SIZE_NON_CONNECTABLE)) return NULL; Regards, Ryusuke Konishi @@ -510,7 +510,7 @@ static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh, { struct nilfs_fid *fid = (struct nilfs_fid *)fh; - if (fh_len != NILFS_FID_SIZE_CONNECTABLE || + if (fh_len NILFS_FID_SIZE_CONNECTABLE || fh_type != FILEID_NILFS_WITH_PARENT) return NULL; -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 1/9] nilfs2: copy file system feature flags to the nilfs object
On Sun, 3 May 2015 12:05:14 +0200, Andreas Rohner wrote: This patch adds three new attributes to the nilfs object, which contain a copy of the feature flags from the super block. This can be used, to efficiently test whether file system feature flags are set or not. Signed-off-by: Andreas Rohner andreas.roh...@gmx.net --- fs/nilfs2/the_nilfs.c | 4 fs/nilfs2/the_nilfs.h | 8 2 files changed, 12 insertions(+) diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 69bd801..606fdfc 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -630,6 +630,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) get_random_bytes(nilfs-ns_next_generation, sizeof(nilfs-ns_next_generation)); + nilfs-ns_feature_compat = le64_to_cpu(sbp-s_feature_compat); + nilfs-ns_feature_compat_ro = le64_to_cpu(sbp-s_feature_compat_ro); + nilfs-ns_feature_incompat = le64_to_cpu(sbp-s_feature_incompat); Consider moving these initialization to just before calling nilfs_check_feature_compatibility(). It uses compat flags, and I'd like to unfold the function using these internal variables sometime. + err = nilfs_store_disk_layout(nilfs, sbp); if (err) goto failed_sbh; diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 23778d3..12cd91d 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -101,6 +101,9 @@ enum { * @ns_dev_kobj: /sys/fs/nilfs/device * @ns_dev_kobj_unregister: completion state * @ns_dev_subgroups: device subgroups pointer + * @ns_feature_compat: Compatible feature set + * @ns_feature_compat_ro: Read-only compatible feature set + * @ns_feature_incompat: Incompatible feature set */ struct the_nilfs { unsigned long ns_flags; @@ -201,6 +204,11 @@ struct the_nilfs { struct kobject ns_dev_kobj; struct completion ns_dev_kobj_unregister; struct nilfs_sysfs_dev_subgroups *ns_dev_subgroups; + + /* Features */ + __u64 ns_feature_compat; + __u64 ns_feature_compat_ro; + __u64 ns_feature_incompat; }; #define THE_NILFS_FNS(bit, name) \ -- 2.3.7 -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/2] nilfs2: fix issues with nilfs_set_inode_flags()
Hi Andrew, Please queue the following changes for the next merge window: Ryusuke Konishi (2): nilfs2: put out gfp mask manipulation from nilfs_set_inode_flags() nilfs2: use inode_set_flags() in nilfs_set_inode_flags() These fix issues related to nilfs_set_inode_flags() function. Thanks, Ryusuke Konishi -- fs/nilfs2/inode.c | 19 ++- 1 file changed, 10 insertions(+), 9 deletions(-) -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/2] nilfs2: use inode_set_flags() in nilfs_set_inode_flags()
Use inode_set_flags() to atomically set i_flags instead of clearing out the S_IMMUTABLE, S_APPEND, etc. flags and then setting them from the FS_IMMUTABLE_FL, FS_APPEND_FL flags to avoid a race where an immutable file has the immutable flag cleared for a brief window of time. This is a similar fix to commit 5f16f3225b06 (ext4: atomically set inode-i_flags in ext4_set_inode_flags()). Signed-off-by: Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp Cc: Theodore Ts'o ty...@mit.edu --- fs/nilfs2/inode.c | 15 --- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 0c28ccb..72c7fbf 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -443,19 +443,20 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) void nilfs_set_inode_flags(struct inode *inode) { unsigned int flags = NILFS_I(inode)-i_flags; + unsigned int new_fl = 0; - inode-i_flags = ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | - S_DIRSYNC); if (flags FS_SYNC_FL) - inode-i_flags |= S_SYNC; + new_fl |= S_SYNC; if (flags FS_APPEND_FL) - inode-i_flags |= S_APPEND; + new_fl |= S_APPEND; if (flags FS_IMMUTABLE_FL) - inode-i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (flags FS_NOATIME_FL) - inode-i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (flags FS_DIRSYNC_FL) - inode-i_flags |= S_DIRSYNC; + new_fl |= S_DIRSYNC; + inode_set_flags(inode, new_fl, S_SYNC | S_APPEND | S_IMMUTABLE | + S_NOATIME | S_DIRSYNC); } int nilfs_read_inode_common(struct inode *inode, -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] nilfs2: fix gcc warning at nilfs_checkpoint_is_mounted()
Fix the following build warning: fs/nilfs2/super.c: In function 'nilfs_checkpoint_is_mounted': fs/nilfs2/super.c:1023:10: warning: comparison of unsigned expression 0 is always false [-Wtype-limits] if (cno 0 || cno nilfs-ns_cno) ^ This warning indicates that the comparision cno 0 is useless because variable cno has an unsigned integer type __u64. Reported-by: David Binderman dcb...@hotmail.com Signed-off-by: Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp --- fs/nilfs2/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 5bc2a1c..c1725f20 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1020,7 +1020,7 @@ int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno) struct dentry *dentry; int ret; - if (cno 0 || cno nilfs-ns_cno) + if (cno nilfs-ns_cno) return false; if (cno = nilfs_last_cno(nilfs)) -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 9/9] nilfs2: prevent starvation of segments protected by snapshots
On Wed, 20 May 2015 23:43:35 +0900 (JST), Ryusuke Konishi wrote: On Sun, 3 May 2015 12:05:22 +0200, Andreas Rohner wrote: It doesn't really matter if the number of reclaimable blocks for a segment is inaccurate, as long as the overall performance is better than the simple timestamp algorithm and starvation is prevented. The following steps will lead to starvation of a segment: 1. The segment is written 2. A snapshot is created 3. The files in the segment are deleted and the number of live blocks for the segment is decremented to a very low value 4. The GC tries to free the segment, but there are no reclaimable blocks, because they are all protected by the snapshot. To prevent an infinite loop the GC has to adjust the number of live blocks to the correct value. 5. The snapshot is converted to a checkpoint and the blocks in the segment are now reclaimable. 6. The GC will never attempt to clean the segment again, because it looks as if it had a high number of live blocks. To prevent this, the already existing padding field of the SUFILE entry is used to track the number of snapshot blocks in the segment. This number is only set by the GC, since it collects the necessary information anyway. So there is no need, to track which block belongs to which segment. In step 4 of the list above the GC will set the new field su_nsnapshot_blks. In step 5 all entries in the SUFILE are checked and entries with a big su_nsnapshot_blks field get their su_nlive_blks field reduced. Signed-off-by: Andreas Rohner andreas.roh...@gmx.net I still don't know whether this workaround is the way we should take or not. This patch has several drawbacks: 1. It introduces overheads to every chcp cp operation due to traversal rewrite of sufile. If the ratio of snapshot protected blocks is high, then this overheads will be big. 2. The traversal rewrite of sufile will causes many sufile blocks will be written out. If most blocks are protected by a snapshot, more than 4MB of sufile blocks will be written per 1TB capacity. Even though this rewrite may not happen for contiguous chcp cp operations, it still has potential for creating sufile write blocks if the application of nilfs manipulates snapshots frequently. 3. The ratio of the threshold max_segblks is hard coded to 50% of blocks_per_segment. It is not clear if the ratio is good (versatile). I will add comments inline below. --- fs/nilfs2/ioctl.c | 50 +++- fs/nilfs2/sufile.c | 85 ++ fs/nilfs2/sufile.h | 3 ++ 3 files changed, 137 insertions(+), 1 deletion(-) diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 40bf74a..431725f 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -200,6 +200,49 @@ static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp) } /** + * nilfs_ioctl_fix_starving_segs - fix potentially starving segments + * @nilfs: nilfs object + * @inode: inode object + * + * Description: Scans for segments, which are potentially starving and + * reduces the number of live blocks to less than half of the maximum + * number of blocks in a segment. This requires a scan of the whole SUFILE, + * which can take a long time on certain devices and under certain conditions. + * To avoid blocking other file system operations for too long the SUFILE is + * scanned in steps of NILFS_SUFILE_STARVING_SEGS_STEP. After each step the + * locks are released and cond_resched() is called. + * + * Return Value: On success, 0 is returned and on error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +static int nilfs_ioctl_fix_starving_segs(struct the_nilfs *nilfs, + struct inode *inode) { This inode argument is meaningless for this routine. Consider passing sb instead. I feel odd for the function name fix starving segs. It looks to give a workaround rather than solve the root problem of gc in nilfs. It looks like what this patch is doing, is calibrating live block count. +struct nilfs_transaction_info ti; +unsigned long i, nsegs = nilfs_sufile_get_nsegments(nilfs-ns_sufile); nsegs is set outside the transaction lock. Since the file system can be resized (both shrinked or extended) outside the lock, nsegs must be initialized or updated in the section where the tranaction lock is held. +int ret = 0; + +for (i = 0; i nsegs; i += NILFS_SUFILE_STARVING_SEGS_STEP) { +nilfs_transaction_begin(inode-i_sb, ti, 0); + +ret = nilfs_sufile_fix_starving_segs(nilfs-ns_sufile, i, +NILFS_SUFILE_STARVING_SEGS_STEP); +if (unlikely(ret 0)) { +nilfs_transaction_abort
Re: NILFS2: double uuid
(CCed to linux-nilfs@vger.kernel.org) Hi Heinz, On 2015/06/08 15:43, Heinz Diehl wrote: Hi, a nilfs2 formatted disk fails to mount via fstab due to double uuid's. See lsblk output below. The logs indicate that the system attempts to mount /dev/sdb rather than /dev/sdb1, which of course fails. In addition, /dev/sdb should not have any uuid at all. Don't know why that happens. The phenomenon is easily reproducible: format a partition with nilfs2, register it with the proper uuid in fstab and reboot. Tried both with USB memory and real HDD. [root@keera ~]# lsblk -f NAMEFSTYPE LABEL UUID MOUNTPOINT sdb ff17dda9-fcae-42e7-a438-9087de58902e `-sdb1 xfs ff17dda9-fcae-42e7-a438-9087de58902e Thanks, Heinz On 2015/06/08 15:49, Heinz Diehl wrote: On 08.06.2015, Heinz Diehl wrote: [root@keera ~]# lsblk -f NAMEFSTYPE LABEL UUID MOUNTPOINT sdb ff17dda9-fcae-42e7-a438-9087de58902e `-sdb1 xfs ff17dda9-fcae-42e7-a438-9087de58902e Copy error: replace xfs with nilfs2. Sorry! I couldn't reproduce the issue (in a CentOS 7 environment). Could you tell us the version information of distro, lsblk, libblkid, nilfs-utils, and kernel you are using ? The following is an example of mine: $ lsblk -f NAME FSTYPE LABEL UUID MOUNTPOINT sda └─sda1 nilfs29dcd01c0-2bc8-41bf-a400-8ad8755aac6a $ lsblk --version lsblk from util-linux 2.23.2 $ lscp -V lscp (nilfs-utils 2.2.3) $ rpm -q libblkid util-linux libblkid-2.23.2-22.el7_1.x86_64 util-linux-2.23.2-22.el7_1.x86_64 $ uname -r 4.1.0-rc7 Regards, Ryusuke Konishi -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: NILFS2: double uuid
On Tue, 9 Jun 2015 16:07:42 +0200, Karel Zak k...@redhat.com wrote: On Tue, Jun 09, 2015 at 10:04:15PM +0900, Ryusuke Konishi wrote: $ sudo nilfs-resize -y /dev/sdb1 1G Partition size = 2146435072 bytes. Shrink the filesystem size from 2146435072 bytes to 1073741824 bytes. 128 segments will be truncated from segnum 127. Moving 103 in-use segments. progress |***| Done. $ sudo umount /test $ sudo mount /dev/sdb1 /test $ sudo LD_LIBRARY_PATH=/usr/local/lib lsblk -f NAME FSTYPE LABEL UUID MOUNTPOINT [...] sdb `-sdb1 /test This blank state continued until I shrank the partition or re-extended the filesystem to the partition size. Could you consider confining the s_dev_size test only to the backup superblock ? Hmm... why nilfs-resize does not update the size in the superblock? It seems like nilfs-resize bug. nilfs-resize (to be exact, RESIZE ioctl of nilfs2) updates s_dev_size in both superblocks. What nilfs-resize doesn't change is the partition size. (It needs help of a partitioning tool) It seems that we don't have to drop the primary super block even if s_dev_size doesn't fit to the partition size. Yes, fixed. I have also enabled the s_dev_size check for whole-disk devices only to minimize number of situations when we rely on the s_dev_size. Karel Thanks again. The updated libblkid/lsblk works frawlessly. Regards, Ryusuke Konishi -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: NILFS2: double uuid
Hi, On 2015/06/09 17:53, Karel Zak wrote: On Tue, Jun 09, 2015 at 12:31:27AM +0900, Ryusuke Konishi wrote: It looks like the backup super block should be dropped from candidates if its device size (sbp-s_dev_size) doesn't match the partition size. Yeah, fixed: http://git.kernel.org/cgit/utils/util-linux/util-linux.git/commit/?id=00817742ce360119e079a33e12cf84118ff7c63e Note that workaround is to not use nilfs2 on the last partition or have a tiny gap (1 sector is enough) between last partition and the end of the whole-disk. Karel Thanks for your quick work! I tested the patch. It almost worked fine. One issue I found is a transient state after fs-resizing. After shrinking the file system, both superblocks dropped and lsblk failed to detect the filesystem: $ sudo LD_LIBRARY_PATH=/usr/local/lib lsblk -f NAME FSTYPE LABEL UUID MOUNTPOINT [...] sdb `-sdb1 nilfs2 2d7cd130-82a0-4a3c-b8a8-4ac5a26f5703 /test $ sudo nilfs-resize -y /dev/sdb1 1G Partition size = 2146435072 bytes. Shrink the filesystem size from 2146435072 bytes to 1073741824 bytes. 128 segments will be truncated from segnum 127. Moving 103 in-use segments. progress |***| Done. $ sudo umount /test $ sudo mount /dev/sdb1 /test $ sudo LD_LIBRARY_PATH=/usr/local/lib lsblk -f NAME FSTYPE LABEL UUID MOUNTPOINT [...] sdb `-sdb1 /test This blank state continued until I shrank the partition or re-extended the filesystem to the partition size. Could you consider confining the s_dev_size test only to the backup superblock ? It seems that we don't have to drop the primary super block even if s_dev_size doesn't fit to the partition size. Regards, Ryusuke Konishi -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 9/9] nilfs2: prevent starvation of segments protected by snapshots
On Fri, 22 May 2015 20:10:05 +0200, Andreas Rohner wrote: On 2015-05-20 16:43, Ryusuke Konishi wrote: On Sun, 3 May 2015 12:05:22 +0200, Andreas Rohner wrote: It doesn't really matter if the number of reclaimable blocks for a segment is inaccurate, as long as the overall performance is better than the simple timestamp algorithm and starvation is prevented. The following steps will lead to starvation of a segment: 1. The segment is written 2. A snapshot is created 3. The files in the segment are deleted and the number of live blocks for the segment is decremented to a very low value 4. The GC tries to free the segment, but there are no reclaimable blocks, because they are all protected by the snapshot. To prevent an infinite loop the GC has to adjust the number of live blocks to the correct value. 5. The snapshot is converted to a checkpoint and the blocks in the segment are now reclaimable. 6. The GC will never attempt to clean the segment again, because it looks as if it had a high number of live blocks. To prevent this, the already existing padding field of the SUFILE entry is used to track the number of snapshot blocks in the segment. This number is only set by the GC, since it collects the necessary information anyway. So there is no need, to track which block belongs to which segment. In step 4 of the list above the GC will set the new field su_nsnapshot_blks. In step 5 all entries in the SUFILE are checked and entries with a big su_nsnapshot_blks field get their su_nlive_blks field reduced. Signed-off-by: Andreas Rohner andreas.roh...@gmx.net I still don't know whether this workaround is the way we should take or not. This patch has several drawbacks: 1. It introduces overheads to every chcp cp operation due to traversal rewrite of sufile. If the ratio of snapshot protected blocks is high, then this overheads will be big. 2. The traversal rewrite of sufile will causes many sufile blocks will be written out. If most blocks are protected by a snapshot, more than 4MB of sufile blocks will be written per 1TB capacity. Even though this rewrite may not happen for contiguous chcp cp operations, it still has potential for creating sufile write blocks if the application of nilfs manipulates snapshots frequently. I could also implement this functionality in nilfs_cleanerd in userspace. Every time a chcp cp happens some kind of permanent flag like snapshot_was_recently_deleted is set at an appropriate location. The flag could be returned with GET_SUSTAT ioctl(). Then nilfs_cleanerd would, at certain intervals and if the flag is set, check all segments with GET_SUINFO ioctl() and set the ones that have potentially invalid values with SET_SUINFO ioctl(). After that it would clear the snapshot_was_recently_deleted flag. What do you think about this idea? Sorry for my late reply. I think moving the functionality to cleanerd and notifying some sort of information to userland through ioctl for that, is a good idea except that I feel the ioctl should be GET_CPSTAT instead of GET_SUINFO because it's checkpoint/snapshot related information. I think the parameter that should be added is a set of statistics information including the number of deleted snapshots since the file system was mounted last (1). The counter (1) can serve as the snapshot_was_recently_deleted flag if it monotonically increases. Although we can use timestamp of when a snapshot was deleted last time, it's not preferable than the counter (1) because the system clock may be rewinded and it also has an issue related to precision. Note that we must add GET_CPSTAT_V2 (or GET_SUSTAT_V2) and the corresponding structure (i.e. nilfs_cpstat_v2, or so) since ioctl codes depend on the size of argument data and it will be changed in both ioctls; unfortunately, neither GET_CPSTAT nor GET_SUSTAT ioctl is expandable. Some ioctls like EVIOCGKEYCODE_V2 will be a reference for this issue. If the policy is timestamp the GC would of course skip this scan, because it is unnecessary. 3. The ratio of the threshold max_segblks is hard coded to 50% of blocks_per_segment. It is not clear if the ratio is good (versatile). The interval and percentage could be set in /etc/nilfs_cleanerd.conf. I chose 50% kind of arbitrarily. My intent was to encourage the GC to check the segment again in the future. I guess anything between 25% and 75% would also work. Sound reasonable. By the way, I am thinking we should move cleanerd into kernel as soon as we can. It's not only inefficient due to a large amount of data exchange between kernel and user-land, but also is hindering changes like we are trying. We have to care compatibility unnecessarily due to the early design mistake (i.e. the separation of gc to user-land). Regards, Ryusuke Konishi -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message
Re: [PATCH v2 9/9] nilfs2: prevent starvation of segments protected by snapshots
On Sun, 31 May 2015 20:13:44 +0200, Andreas Rohner wrote: On 2015-05-31 18:45, Ryusuke Konishi wrote: On Fri, 22 May 2015 20:10:05 +0200, Andreas Rohner wrote: On 2015-05-20 16:43, Ryusuke Konishi wrote: On Sun, 3 May 2015 12:05:22 +0200, Andreas Rohner wrote: [...] 3. The ratio of the threshold max_segblks is hard coded to 50% of blocks_per_segment. It is not clear if the ratio is good (versatile). The interval and percentage could be set in /etc/nilfs_cleanerd.conf. I chose 50% kind of arbitrarily. My intent was to encourage the GC to check the segment again in the future. I guess anything between 25% and 75% would also work. Sound reasonable. By the way, I am thinking we should move cleanerd into kernel as soon as we can. It's not only inefficient due to a large amount of data exchange between kernel and user-land, but also is hindering changes like we are trying. We have to care compatibility unnecessarily due to the early design mistake (i.e. the separation of gc to user-land). I am a bit confused. Is it OK if I implement this functionality in nilfs_cleanerd for this patch set, or would it be better to implement it with a workqueue in the kernel, like you've suggested before? If you intend to move nilfs_cleanerd into the kernel anyway, then the latter would make more sense to me. Which implementation do you prefer for this patch set? If nilfs_cleanerd will remain in userland, then the userland implementation looks better. But, yes, if we will move the cleaner into kernel, then the kernel implementation looks better because we may be able to avoid unnecessary API change. It's a dilemma. Do you have any good idea to reduce or hide overhead of the calibration (i.e. traversal rewrite of sufile) in regard to the kernel implementation ? I'm inclined to leave that in kernel for now. Regards, Ryusuke Konishi Regards, Andreas Rohner Regards, Ryusuke Konishi -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 7/9] nilfs2: ensure that all dirty blocks are written out
On Sun, 10 May 2015 13:04:18 +0200, Andreas Rohner wrote: On 2015-05-09 14:17, Ryusuke Konishi wrote: On Sun, 3 May 2015 12:05:20 +0200, Andreas Rohner wrote: [...] Uum. This still looks to have potential for leak of dirty block collection between DAT and SUFILE since this retry is limited by the fixed retry count. How about adding function temporarily turning off the live block tracking and using it after this propagation loop until log write finishes ? It would reduce the accuracy of live block count, but is it enough ? How do you think ? We have to eliminate the possibility of the leak because it can cause file system corruption. Every checkpoint must be self-contained. How exactly could it lead to file system corruption? Maybe I miss something important here, but it seems to me, that no corruption is possible. The nilfs_sufile_flush_cache_node() function only reads in already existing blocks. No new blocks are created. If I mark those blocks dirty, the btree is not changed at all. If I do not call nilfs_bmap_propagate(), then the btree stays unchanged and there are no dangling pointers. The resulting checkpoint should be self-contained. Good point. As for btree, it looks like no inconsistency issue arises since nilfs_sufile_flush_cache_node() never inserts new blocks as you pointed out. Even though we also must care inconsistency between sufile header and sufile data blocks, and block count in inode as well, fortunately these look to be ok, too. However, I still think it's not good to carry over dirty blocks to the next segment construction to avoid extra checkpoint creation and to simplify things. From this viewpoint, I also prefer that nilfs_sufile_flush_cache() and nilfs_sufile_flush_cache_node() are changed a bit so that they will skip adjusting su_nlive_blks and su_nlive_lastmod if the sufile block that includes the segment usage is not marked dirty and only_mark == 0 as well as turing off live block counting temporarily after the sufile/DAT propagation loop. The only problem would be, that I could lose some nlive_blks updates. Regards, Ryusuke Konishi -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/1] NILFS2: support NFSv2 export
Hi Andrew, please queue the following patch for the next merge window. It fixes an NFSv2 related issue reported in: [1] http://marc.info/?l=linux-fsdevelm=143104630128997 [PATCH 0/3] make BTRFS, UDF, NILFS2 work with NFSv2. Thanks, Ryusuke Konishi -- NeilBrown (1): NILFS2: support NFSv2 export fs/nilfs2/namei.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/9 linux-next] nilfs2: remove dir_pages() declaration
On Sun, 24 May 2015 17:19:42 +0200, Fabian Frederick f...@skynet.be wrote: dir_pages() is now declared in pagemap.h Signed-off-by: Fabian Frederick f...@skynet.be --- fs/nilfs2/dir.c | 5 - 1 file changed, 5 deletions(-) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 0ee0bed..6b8b92b 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -61,11 +61,6 @@ static inline void nilfs_put_page(struct page *page) page_cache_release(page); } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode-i_size+PAGE_CACHE_SIZE-1)PAGE_CACHE_SHIFT; -} - Can you include this and similar changes in the first patch pagemap.h: declare dir_pages() ? The first patch transiently breaks build because it inserts a duplicate definition of the dir_pages() inline function until it gets removed from each file system by the successive patches. This series looks non-divisible except the patch of ufs. Regards, Ryusuke Konishi /* * Return the offset into page `page_nr' of the last valid * byte in that page, plus one. -- 2.4.1 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 9/9] nilfs2: prevent starvation of segments protected by snapshots
; + } + + nilfs_transaction_commit(inode-i_sb); /* never fails */ + cond_resched(); + } + + return ret; +} + +/** * nilfs_ioctl_change_cpmode - change checkpoint mode (checkpoint/snapshot) * @inode: inode object * @filp: file object @@ -224,7 +267,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, struct the_nilfs *nilfs = inode-i_sb-s_fs_info; struct nilfs_transaction_info ti; struct nilfs_cpmode cpmode; - int ret; + int ret, is_snapshot; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -240,6 +283,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, mutex_lock(nilfs-ns_snapshot_mount_mutex); nilfs_transaction_begin(inode-i_sb, ti, 0); + is_snapshot = nilfs_cpfile_is_snapshot(nilfs-ns_cpfile, cpmode.cm_cno); ret = nilfs_cpfile_change_cpmode( nilfs-ns_cpfile, cpmode.cm_cno, cpmode.cm_mode); if (unlikely(ret 0)) @@ -248,6 +292,10 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, nilfs_transaction_commit(inode-i_sb); /* never fails */ mutex_unlock(nilfs-ns_snapshot_mount_mutex); + + if (is_snapshot 0 cpmode.cm_mode == NILFS_CHECKPOINT + nilfs_feature_track_live_blks(nilfs)) + ret = nilfs_ioctl_fix_starving_segs(nilfs, inode); Should we use this return value ? This doesn't relate to the success and failure of chcp operation. nilfs_ioctl_fix_starving_segs() is called every time chcp cp is called. I prefer to delay this extra work with a workqueue and to skip starting a new work if the previous work is still running. out: mnt_drop_write_file(filp); return ret; diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 9cd8820d..47e2c05 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -1215,6 +1215,91 @@ out_sem: } /** + * nilfs_sufile_fix_starving_segs - fix potentially starving segments + * @sufile: inode of segment usage file + * @segnum: segnum to start + * @nsegs: number of segments to check + * + * Description: Scans for segments, which are potentially starving and + * reduces the number of live blocks to less than half of the maximum + * number of blocks in a segment. This way the segment is more likely to be + * chosen by the GC. A segment is marked as potentially starving, if more + * than half of the blocks it contains are protected by snapshots. + * + * Return Value: On success, 0 is returned and on error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + */ +int nilfs_sufile_fix_starving_segs(struct inode *sufile, __u64 segnum, +__u64 nsegs) +{ + struct buffer_head *su_bh; + struct nilfs_segment_usage *su; + size_t n, i, susz = NILFS_MDT(sufile)-mi_entry_size; + struct the_nilfs *nilfs = sufile-i_sb-s_fs_info; + void *kaddr; + unsigned long maxnsegs, segusages_per_block; + __u32 max_segblks = nilfs-ns_blocks_per_segment 1; + int ret = 0, blkdirty, dirty = 0; + + down_write(NILFS_MDT(sufile)-mi_sem); + + maxnsegs = nilfs_sufile_get_nsegments(sufile); + segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile); + nsegs += segnum; + if (nsegs maxnsegs) + nsegs = maxnsegs; + + while (segnum nsegs) { This local variable nsegs is used as an (exclusive) end segment number. It's confusing. You should define end variable separately. It can be simply calculated by: end = min_t(__u64, segnum + nsegs, nilfs_sufile_get_nsegments(sufile)); (maxnsegs can be removed.) Note that the evaluation of each argument will never be done twice in min_t() macro since min_t() temporarily stores the evaluation results to hidden local variables and uses them for comparison. Regards, Ryusuke Konishi + n = nilfs_sufile_segment_usages_in_block(sufile, segnum, + nsegs - 1); + + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, +0, su_bh); + if (ret 0) { + if (ret != -ENOENT) + goto out; + /* hole */ + segnum += n; + continue; + } + + kaddr = kmap_atomic(su_bh-b_page); + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, + su_bh, kaddr); + blkdirty = 0; + for (i = 0; i n; ++i, ++segnum, su = (void *)su + susz) { + if (le32_to_cpu(su-su_nsnapshot_blks) = max_segblks) + continue; + if (le32_to_cpu
[PATCH 0/2] nilfs2: fix build warnings
Hi Andrew, Please send the following fixes to upstream: Ryusuke Konishi (2): nilfs2: fix gcc unused-but-set-variable warnings nilfs2: fix gcc uninitialized-variable warnings in powerpc build These prevent reported warnings in powerpc build and minor warnings during build with "W=1". Both were detected on the mainline. Thanks, Ryusuke Konishi -- fs/nilfs2/alloc.c| 3 +-- fs/nilfs2/btree.c| 7 +-- fs/nilfs2/dat.c | 2 -- fs/nilfs2/recovery.c | 4 ++-- fs/nilfs2/segment.c | 3 +-- fs/nilfs2/sufile.c | 3 +-- fs/nilfs2/super.c| 5 - 7 files changed, 10 insertions(+), 17 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/5] nilfs2: add tracepoints for analyzing sufile manipulation
From: Hitoshi Mitake <mitake.hito...@lab.ntt.co.jp> This patch adds tracepoints which would be useful for analyzing segment usage from a perspective of high level sufile manipulation (check, alloc, free). sufile is an important in-place updated metadata file, so analyzing the behavior would be useful for performance turning. example of usage (a case of allocation): $ sudo bin/tpoint nilfs2:nilfs2_segment_usage_allocated Tracing nilfs2:nilfs2_segment_usage_allocated. Ctrl-C to end. segctord-17800 [002] ...1 10671.867294: nilfs2_segment_usage_allocated: sufile = 880054f908a8 segnum = 2 segctord-17800 [002] ...1 10675.073477: nilfs2_segment_usage_allocated: sufile = 880054f908a8 segnum = 3 Cc: Benixon Dhas <benixon.d...@wdc.com> Cc: TK Kato <tk.k...@wdc.com> Signed-off-by: Hitoshi Mitake <mitake.hito...@lab.ntt.co.jp> Signed-off-by: Ryusuke Konishi <konishi.ryus...@lab.ntt.co.jp> Cc: Steven Rostedt <rost...@goodmis.org> --- fs/nilfs2/sufile.c| 8 ++ include/trace/events/nilfs2.h | 67 +++ 2 files changed, 75 insertions(+) diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 2a869c3..7ff8f15 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -30,6 +30,8 @@ #include "mdt.h" #include "sufile.h" +#include + /** * struct nilfs_sufile_info - on-memory private data of sufile * @mi: on-memory private data of metadata file @@ -358,6 +360,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) break; /* never happens */ } } + trace_nilfs2_segment_usage_check(sufile, segnum, cnt); ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, _bh); if (ret < 0) @@ -388,6 +391,9 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump) nilfs_mdt_mark_dirty(sufile); brelse(su_bh); *segnump = segnum; + + trace_nilfs2_segment_usage_allocated(sufile, segnum); + goto out_header; } @@ -490,6 +496,8 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum, NILFS_SUI(sufile)->ncleansegs++; nilfs_mdt_mark_dirty(sufile); + + trace_nilfs2_segment_usage_freed(sufile, segnum); } /** diff --git a/include/trace/events/nilfs2.h b/include/trace/events/nilfs2.h index e5649ac..1b65ba6 100644 --- a/include/trace/events/nilfs2.h +++ b/include/trace/events/nilfs2.h @@ -95,6 +95,73 @@ TRACE_EVENT(nilfs2_transaction_transition, show_transaction_state(__entry->state)) ); +TRACE_EVENT(nilfs2_segment_usage_check, + TP_PROTO(struct inode *sufile, +__u64 segnum, +unsigned long cnt), + + TP_ARGS(sufile, segnum, cnt), + + TP_STRUCT__entry( + __field(struct inode *, sufile) + __field(__u64, segnum) + __field(unsigned long, cnt) + ), + + TP_fast_assign( + __entry->sufile = sufile; + __entry->segnum = segnum; + __entry->cnt = cnt; + ), + + TP_printk("sufile = %p segnum = %llu cnt = %lu", + __entry->sufile, + __entry->segnum, + __entry->cnt) +); + +TRACE_EVENT(nilfs2_segment_usage_allocated, + TP_PROTO(struct inode *sufile, +__u64 segnum), + + TP_ARGS(sufile, segnum), + + TP_STRUCT__entry( + __field(struct inode *, sufile) + __field(__u64, segnum) + ), + + TP_fast_assign( + __entry->sufile = sufile; + __entry->segnum = segnum; + ), + + TP_printk("sufile = %p segnum = %llu", + __entry->sufile, + __entry->segnum) +); + +TRACE_EVENT(nilfs2_segment_usage_freed, + TP_PROTO(struct inode *sufile, +__u64 segnum), + + TP_ARGS(sufile, segnum), + + TP_STRUCT__entry( + __field(struct inode *, sufile) + __field(__u64, segnum) + ), + + TP_fast_assign( + __entry->sufile = sufile; + __entry->segnum = segnum; + ), + + TP_printk("sufile = %p segnum = %llu", + __entry->sufile, + __entry->segnum) +); + #endif /* _TRACE_NILFS2_H */ /* This part must be outside protection */ -- 1.8.3.1 -- To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/5] MAINTAINERS: nilfs2: add header file for tracing
This adds header file "include/trace/events/nilfs2.h" to maintainer-ship of nilfs2 so that updates to the nilfs2 header file go to the mailing list of nilfs2. Signed-off-by: Ryusuke Konishi <konishi.ryus...@lab.ntt.co.jp> Cc: Hitoshi Mitake <mitake.hito...@lab.ntt.co.jp> --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 797236b..6b15b7a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7371,6 +7371,7 @@ S:Supported F: Documentation/filesystems/nilfs2.txt F: fs/nilfs2/ F: include/linux/nilfs2_fs.h +F: include/trace/events/nilfs2.h NINJA SCSI-3 / NINJA SCSI-32Bi (16bit/CardBus) PCMCIA SCSI HOST ADAPTER DRIVER M: YOKOTA Hiroshi <yok...@netlab.is.tsukuba.ac.jp> -- 1.8.3.1 -- To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: nilfs.org pwned
On Wed, 26 Aug 2015 11:30:49 +0100, csm...@csmith-bm.vm.bytemark.co.uk wrote: The website, www.nilfs.org, appears to have been defaced. I was going to post a link on a forum for it, but decided against it in the end due to the defacement. Otherwise, NILFS generally rocks, keep up the good work :) Christian Please refer to nilfs.sourceforge.net instead. We no longer hosts nilfs.org. Ryusuke Konishi -- To unsubscribe from this list: send the line unsubscribe linux-nilfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 02/39] nilfs2: drop null test before destroy functions
On Sun, 13 Sep 2015 14:14:55 +0200, Julia Lawall <julia.law...@lip6.fr> wrote: > Remove unneeded NULL test. > > The semantic patch that makes this change is as follows: > (http://coccinelle.lip6.fr/) > > // > @@ expression x; @@ > -if (x != NULL) > \(kmem_cache_destroy\|mempool_destroy\|dma_pool_destroy\)(x); > // > > Signed-off-by: Julia Lawall <julia.law...@lip6.fr> Looks OK. I'll queue this in my tree. Thanks, Ryusuke Konishi > > --- > fs/nilfs2/super.c | 12 > 1 file changed, 4 insertions(+), 8 deletions(-) > > diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c > index f47585b..c69455a 100644 > --- a/fs/nilfs2/super.c > +++ b/fs/nilfs2/super.c > @@ -1405,14 +1405,10 @@ static void nilfs_destroy_cachep(void) >*/ > rcu_barrier(); > > - if (nilfs_inode_cachep) > - kmem_cache_destroy(nilfs_inode_cachep); > - if (nilfs_transaction_cachep) > - kmem_cache_destroy(nilfs_transaction_cachep); > - if (nilfs_segbuf_cachep) > - kmem_cache_destroy(nilfs_segbuf_cachep); > - if (nilfs_btree_path_cache) > - kmem_cache_destroy(nilfs_btree_path_cache); > + kmem_cache_destroy(nilfs_inode_cachep); > + kmem_cache_destroy(nilfs_transaction_cachep); > + kmem_cache_destroy(nilfs_segbuf_cachep); > + kmem_cache_destroy(nilfs_btree_path_cache); > } > > static int __init nilfs_init_cachep(void) > > -- > To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 14/17] fs/nilfs2: remove unnecessary new_valid_dev check
On 2015/09/28 23:33, Yaowei Bai wrote: > As new_valid_dev always returns 1, so !new_valid_dev check is not > needed, remove it. > > Signed-off-by: Yaowei Bai <bywxiao...@163.com> Acked-by: Ryusuke Konishi <konishi.ryus...@lab.ntt.co.jp> > --- > fs/nilfs2/namei.c | 3 --- > 1 file changed, 3 deletions(-) > > diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c > index 37dd6b0..c9a1a49 100644 > --- a/fs/nilfs2/namei.c > +++ b/fs/nilfs2/namei.c > @@ -120,9 +120,6 @@ nilfs_mknod(struct inode *dir, struct dentry *dentry, > umode_t mode, dev_t rdev) > struct nilfs_transaction_info ti; > int err; > > - if (!new_valid_dev(rdev)) > - return -EINVAL; > - > err = nilfs_transaction_begin(dir->i_sb, , 1); > if (err) > return err; > -- To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/7] nilfs2: drop null test before destroy functions
From: Julia Lawall <julia.law...@lip6.fr> Remove unneeded NULL test. The semantic patch that makes this change is as follows: (http://coccinelle.lip6.fr/) // @@ expression x; @@ -if (x != NULL) \(kmem_cache_destroy\|mempool_destroy\|dma_pool_destroy\)(x); // Signed-off-by: Julia Lawall <julia.law...@lip6.fr> Signed-off-by: Ryusuke Konishi <konishi.ryus...@lab.ntt.co.jp> --- fs/nilfs2/super.c | 12 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index f47585b..c69455a 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1405,14 +1405,10 @@ static void nilfs_destroy_cachep(void) */ rcu_barrier(); - if (nilfs_inode_cachep) - kmem_cache_destroy(nilfs_inode_cachep); - if (nilfs_transaction_cachep) - kmem_cache_destroy(nilfs_transaction_cachep); - if (nilfs_segbuf_cachep) - kmem_cache_destroy(nilfs_segbuf_cachep); - if (nilfs_btree_path_cache) - kmem_cache_destroy(nilfs_btree_path_cache); + kmem_cache_destroy(nilfs_inode_cachep); + kmem_cache_destroy(nilfs_transaction_cachep); + kmem_cache_destroy(nilfs_segbuf_cachep); + kmem_cache_destroy(nilfs_btree_path_cache); } static int __init nilfs_init_cachep(void) -- 1.8.3.1 -- To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 6/7] nilfs2: add helper functions to delete blocks from dat file
This adds delete functions for data blocks of metadata files using bitmap based allocator. nilfs_palloc_delete_entry_block() deletes an entry block (e.g. block storing dat entries), and nilfs_palloc_delete_bitmap_block() deletes a bitmap block, respectively. These helpers are intended to be used in the successive change on deallocator of block addresses ("nilfs2: free unused dat file blocks during garbage collection"). Signed-off-by: Ryusuke Konishi <konishi.ryus...@lab.ntt.co.jp> --- fs/nilfs2/alloc.c | 50 ++ 1 file changed, 50 insertions(+) diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 5b7ee36..225b797 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -236,6 +236,26 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff, } /** + * nilfs_palloc_delete_block - delete a block on the persistent allocator file + * @inode: inode of metadata file using this allocator + * @blkoff: block offset + * @prev: nilfs_bh_assoc struct of the last used buffer + * @lock: spin lock protecting @prev + */ +static int nilfs_palloc_delete_block(struct inode *inode, unsigned long blkoff, +struct nilfs_bh_assoc *prev, +spinlock_t *lock) +{ + spin_lock(lock); + if (prev->bh && blkoff == prev->blkoff) { + brelse(prev->bh); + prev->bh = NULL; + } + spin_unlock(lock); + return nilfs_mdt_delete_block(inode, blkoff); +} + +/** * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block * @inode: inode of metadata file using this allocator * @group: group number @@ -274,6 +294,22 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode, } /** + * nilfs_palloc_delete_bitmap_block - delete a bitmap block + * @inode: inode of metadata file using this allocator + * @group: group number + */ +static int nilfs_palloc_delete_bitmap_block(struct inode *inode, + unsigned long group) +{ + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_delete_block(inode, +nilfs_palloc_bitmap_blkoff(inode, + group), +>prev_bitmap, >lock); +} + +/** * nilfs_palloc_get_entry_block - get buffer head of an entry block * @inode: inode of metadata file using this allocator * @nr: serial number of the entry (e.g. inode number) @@ -292,6 +328,20 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr, } /** + * nilfs_palloc_delete_entry_block - delete an entry block + * @inode: inode of metadata file using this allocator + * @nr: serial number of the entry + */ +static int nilfs_palloc_delete_entry_block(struct inode *inode, __u64 nr) +{ + struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache; + + return nilfs_palloc_delete_block(inode, +nilfs_palloc_entry_blkoff(inode, nr), +>prev_entry, >lock); +} + +/** * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor * @inode: inode of metadata file using this allocator * @group: group number -- 1.8.3.1 -- To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 7/7] nilfs2: free unused dat file blocks during garbage collection
As a nilfs2 volume ages, the amount of available disk space decreases little by little due to bloat of DAT (disk address translation) metadata file. Even if we delete all files in a file system and free their block addresses from the DAT file through a garbage collection, empty DAT blocks are not freed. This fixes the issue by extending the deallocator of block addresses so that empty data blocks and empty bitmap blocks of DAT are deleted. The following comparison shows the effect of this patch. Each shows disk amount information of a nilfs2 volume that we cleaned out by deleting all files and running gc after having filled 90% of its capacity. Before: Filesystem 1K-blocks Used Available Use% Mounted on /dev/sda1 500105212 3022844 472072192 1% /test After: Filesystem 1K-blocks Used Available Use% Mounted on /dev/sda1 50010521216380 475078656 1% /test Signed-off-by: Ryusuke Konishi <konishi.ryus...@lab.ntt.co.jp> --- fs/nilfs2/alloc.c | 91 --- fs/nilfs2/alloc.h | 1 + 2 files changed, 75 insertions(+), 17 deletions(-) diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 225b797..b335a32 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -154,13 +154,17 @@ nilfs_palloc_group_desc_nfrees(const struct nilfs_palloc_group_desc *desc, * @lock: spin lock protecting @desc * @n: delta to be added */ -static void +static u32 nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc, spinlock_t *lock, u32 n) { + u32 nfree; + spin_lock(lock); le32_add_cpu(>pg_nfrees, n); + nfree = le32_to_cpu(desc->pg_nfrees); spin_unlock(lock); + return nfree; } /** @@ -735,12 +739,18 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) unsigned char *bitmap; void *desc_kaddr, *bitmap_kaddr; unsigned long group, group_offset; - __u64 group_min_nr; + __u64 group_min_nr, last_nrs[8]; const unsigned long epg = nilfs_palloc_entries_per_group(inode); + const unsigned epb = NILFS_MDT(inode)->mi_entries_per_block; + unsigned entry_start, end, pos; spinlock_t *lock; - int i, j, n, ret; + int i, j, k, ret; + u32 nfree; for (i = 0; i < nitems; i = j) { + int change_group = false; + int nempties = 0, n = 0; + group = nilfs_palloc_group(inode, entry_nrs[i], _offset); ret = nilfs_palloc_get_desc_block(inode, group, 0, _bh); if (ret < 0) @@ -755,17 +765,13 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) /* Get the first entry number of the group */ group_min_nr = (__u64)group * epg; - desc_kaddr = kmap(desc_bh->b_page); - desc = nilfs_palloc_block_get_group_desc( - inode, group, desc_bh, desc_kaddr); bitmap_kaddr = kmap(bitmap_bh->b_page); bitmap = bitmap_kaddr + bh_offset(bitmap_bh); lock = nilfs_mdt_bgl_lock(inode, group); - for (j = i, n = 0; -j < nitems && entry_nrs[j] >= group_min_nr && -entry_nrs[j] < group_min_nr + epg; -j++) { - group_offset = entry_nrs[j] - group_min_nr; + + j = i; + entry_start = rounddown(group_offset, epb); + do { if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap)) { nilfs_warning(inode->i_sb, __func__, @@ -775,18 +781,69 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems) } else { n++; } - } - nilfs_palloc_group_desc_add_entries(desc, lock, n); + + j++; + if (j >= nitems || entry_nrs[j] < group_min_nr || + entry_nrs[j] >= group_min_nr + epg) { + change_group = true; + } else { + group_offset = entry_nrs[j] - group_min_nr; + if (group_offset >= entry_start && + group_offset < entry_start + epb) { + /* This entry is in the same block */ + continue; + } + } + + /* Test if the entry block is empty or not */ + end = entry_start + epb; + pos = nilfs_f
Re: [PATCH] nilfs2: add tracepoints for analyzing reading and writing metadata files
On Mon, 5 Oct 2015 19:21:43 +0900, Mitake Hitoshi wrote: > On Sun, Oct 4, 2015 at 10:33 PM, Ryusuke Konishi > <konishi.ryus...@lab.ntt.co.jp> wrote: >> On Sun, 4 Oct 2015 01:02:42 +0900, Mitake Hitoshi wrote: >>> This patch adds tracepoints for analyzing requests of reading and >>> writing metadata files. The tracepoints cover every in-place mdt files >>> (cpfile, sufile, and datfile). >>> >>> Example of tracing mdt_insert_new_block(): >>> cp-14635 [000] ...1 30598.199309: >>> nilfs2_mdt_insert_new_block: inode = 88022a8d0178 ino = 3 block = 155 >>> cp-14635 [000] ...1 30598.199520: >>> nilfs2_mdt_insert_new_block: inode = 88022a8d0178 ino = 3 block = 5 >>> cp-14635 [000] ...1 30598.200828: >>> nilfs2_mdt_insert_new_block: inode = 88022a8d0178 ino = 3 block = 253 >>> >>> Cc: TK Kato <tk.k...@wdc.com> >>> Signed-off-by: Hitoshi Mitake <mitake.hito...@lab.ntt.co.jp> >> >> Applied to the tracepoints branch. Thanks. > > Thanks, could you send the patches in the tracepoints branch to upstream? Sure, I will. Regards, Ryusuke Konishi -- To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html