[PATCH] lscp: accelerate backward checkpoint listing

2015-02-14 Thread Ryusuke Konishi
If the minimum checkpoint number of valid checkpoints is large to some
extent, lscp -r command takes very long time:

 $ lscp -r
 CNODATE TIME  MODE  FLG  BLKCNT   ICNT
 6541269  2015-02-11 18:38:30   cp-  435  2
 6541268  2015-02-11 18:38:25   cp-  484 51
  the command looks to enter a busy loop

This is because it tries to find younger checkpoints tracking back the
checkpoint list in a constant step size.

This fixes the issue by lengthening or shortening the step size
depending on whether the backward search found a younger checkpoint or
not.

This patch also inserts a dummy nilfs_get_cpinfo() call before
starting the backward search to make successive nilfs_get_cpinfo()
calls much faster.

Signed-off-by: Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp
---
 bin/lscp.c | 96 --
 1 file changed, 87 insertions(+), 9 deletions(-)

diff --git a/bin/lscp.c b/bin/lscp.c
index c855def..023be5c 100644
--- a/bin/lscp.c
+++ b/bin/lscp.c
@@ -84,6 +84,12 @@ static const struct option long_option[] = {
 #define LSCP_NCPINFO   512
 #define LSCP_MINDELTA  64  /* Minimum delta for reverse direction */
 
+enum lscp_state {
+   LSCP_INIT_ST,   /* Initial state */
+   LSCP_NORMAL_ST, /* Normal state */
+   LSCP_ACCEL_ST,  /* Accelerate state */
+   LSCP_DECEL_ST,  /* Decelerate state */
+};
 
 static __u64 param_index;
 static __u64 param_lines;
@@ -176,35 +182,107 @@ static int lscp_backward_cpinfo(struct nilfs *nilfs,
struct nilfs_cpinfo *cpi;
nilfs_cno_t sidx; /* start index (inclusive) */
nilfs_cno_t eidx; /* end index (exclusive) */
-   __u64 rest, delta;
+   nilfs_cno_t prev_head = 0;
+   __u64 rest, delta, v;
+   int state = LSCP_INIT_ST;
ssize_t n;
 
rest = param_lines  param_lines  cpstat-cs_ncps ? param_lines :
cpstat-cs_ncps;
+   if (!rest)
+   goto out;
eidx = param_index  param_index  cpstat-cs_cno ? param_index + 1 :
cpstat-cs_cno;
 
-   for ( ; rest  0  eidx  NILFS_CNO_MIN; eidx = sidx) {
-   delta = min_t(__u64, LSCP_NCPINFO,
- max_t(__u64, rest, LSCP_MINDELTA));
-   sidx = (eidx = NILFS_CNO_MIN + delta) ? eidx - delta :
-   NILFS_CNO_MIN;
+recalc_delta:
+   delta = min_t(__u64, LSCP_NCPINFO, max_t(__u64, rest, LSCP_MINDELTA));
+   v = delta;
 
-   n = lscp_get_cpinfo(nilfs, sidx, NILFS_CHECKPOINT, eidx - sidx);
+   while (eidx  NILFS_CNO_MIN) {
+   if (eidx  NILFS_CNO_MIN + v || state == LSCP_INIT_ST)
+   sidx = NILFS_CNO_MIN;
+   else
+   sidx = eidx - v;
+
+   n = lscp_get_cpinfo(nilfs, sidx, NILFS_CHECKPOINT,
+   state == LSCP_NORMAL_ST ? eidx - sidx : 1);
if (n  0)
return n;
if (!n)
break;
 
-   for (cpi = cpinfos + n - 1; cpi = cpinfos  rest  0; cpi--) {
+   if (state == LSCP_INIT_ST) {
+   /*
+* This state makes succesive
+* nilfs_get_cpinfo() calls much faster by
+* setting minimum checkpoint number in nilfs
+* struct.
+*/
+   if (cpinfos[0].ci_cno = eidx)
+   goto out; /* out of range */
+   state = LSCP_NORMAL_ST;
+   continue;
+   } else if (cpinfos[0].ci_cno == prev_head) {
+   /* No younger checkpoint was found */
+
+   if (sidx == NILFS_CNO_MIN)
+   break;
+
+   /* go further back */
+   switch (state) {
+   case LSCP_NORMAL_ST:
+   state = LSCP_ACCEL_ST;
+   /* fall through */
+   case LSCP_ACCEL_ST:
+   if ((v  1)  v)
+   v = 1;
+   break;
+   case LSCP_DECEL_ST:
+   state = LSCP_NORMAL_ST;
+   v = delta;
+   break;
+   }
+   eidx = sidx;
+   continue;
+   } else {
+   switch (state) {
+   case LSCP_ACCEL_ST:
+   case LSCP_DECEL_ST:
+   if (cpinfos[n - 1].ci_cno + 1  prev_head) {
+   /* search again more slowly

[PATCH 3/7] nilfs2: use bgl_lock_ptr()

2015-03-12 Thread Ryusuke Konishi
Simplify nilfs_mdt_bgl_lock() by utilizing bgl_lock_ptr() helper in
linux/blockgroup_lock.h.

Signed-off-by: Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp
---
 fs/nilfs2/mdt.h | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index ab172e8..a294ea3 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -111,7 +111,10 @@ static inline __u64 nilfs_mdt_cno(struct inode *inode)
return ((struct the_nilfs *)inode-i_sb-s_fs_info)-ns_cno;
 }
 
-#define nilfs_mdt_bgl_lock(inode, bg) \
-   (NILFS_MDT(inode)-mi_bgl-locks[(bg)  (NR_BG_LOCKS-1)].lock)
+static inline spinlock_t *
+nilfs_mdt_bgl_lock(struct inode *inode, unsigned int block_group)
+{
+   return bgl_lock_ptr(NILFS_MDT(inode)-mi_bgl, block_group);
+}
 
 #endif /* _NILFS_MDT_H */
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 9/9] nilfs2: prevent starvation of segments protected by snapshots

2015-03-13 Thread Ryusuke Konishi
/nilfs2_fs.h
 @@ -222,11 +222,13 @@ struct nilfs_super_block {
   */
  #define NILFS_FEATURE_COMPAT_SUFILE_EXTENSION(1ULL  0)
  #define NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS (1ULL  1)
 +#define NILFS_FEATURE_COMPAT_TRACK_SNAPSHOTS (1ULL  2)
  
  #define NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT  (1ULL  0)
  
  #define NILFS_FEATURE_COMPAT_SUPP(NILFS_FEATURE_COMPAT_SUFILE_EXTENSION \
 - | NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS)
 + | NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS \
 + | NILFS_FEATURE_COMPAT_TRACK_SNAPSHOTS)
  #define NILFS_FEATURE_COMPAT_RO_SUPP NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT
  #define NILFS_FEATURE_INCOMPAT_SUPP  0ULL
  

You don't have to add three compat flags just for this one patchset.
Please unify it.

#define NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS(1ULL  0)

looks to be enough.

Regards,
Ryusuke Konishi


 @@ -630,7 +632,7 @@ struct nilfs_segment_usage {
   __le32 su_nblocks;
   __le32 su_flags;
   __le32 su_nlive_blks;
 - __le32 su_pad;
 + __le32 su_nsnapshot_blks;
   __le64 su_nlive_lastmod;
  };
  
 @@ -682,7 +684,7 @@ nilfs_segment_usage_set_clean(struct nilfs_segment_usage 
 *su, size_t susz)
   su-su_flags = cpu_to_le32(0);
   if (susz = NILFS_EXT_SEGMENT_USAGE_SIZE) {
   su-su_nlive_blks = cpu_to_le32(0);
 - su-su_pad = cpu_to_le32(0);
 + su-su_nsnapshot_blks = cpu_to_le32(0);
   su-su_nlive_lastmod = cpu_to_le64(0);
   }
  }
 @@ -723,7 +725,7 @@ struct nilfs_suinfo {
   __u32 sui_nblocks;
   __u32 sui_flags;
   __u32 sui_nlive_blks;
 - __u32 sui_pad;
 + __u32 sui_nsnapshot_blks;
   __u64 sui_nlive_lastmod;
  };
  
 @@ -770,6 +772,7 @@ enum {
   NILFS_SUINFO_UPDATE_FLAGS,
   NILFS_SUINFO_UPDATE_NLIVE_BLKS,
   NILFS_SUINFO_UPDATE_NLIVE_LASTMOD,
 + NILFS_SUINFO_UPDATE_NSNAPSHOT_BLKS,
   __NR_NILFS_SUINFO_UPDATE_FIELDS,
  };
  
 @@ -794,6 +797,7 @@ NILFS_SUINFO_UPDATE_FNS(LASTMOD, lastmod)
  NILFS_SUINFO_UPDATE_FNS(NBLOCKS, nblocks)
  NILFS_SUINFO_UPDATE_FNS(FLAGS, flags)
  NILFS_SUINFO_UPDATE_FNS(NLIVE_BLKS, nlive_blks)
 +NILFS_SUINFO_UPDATE_FNS(NSNAPSHOT_BLKS, nsnapshot_blks)
  NILFS_SUINFO_UPDATE_FNS(NLIVE_LASTMOD, nlive_lastmod)
  
  #define NILFS_MIN_SUINFO_UPDATE_SIZE \
 -- 
 2.3.0
 
 --
 To unsubscribe from this list: send the line unsubscribe linux-nilfs in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 5/9] nilfs2: add simple tracking of block deletions and updates

2015-03-13 Thread Ryusuke Konishi
)
  
   nilfs_segctor_fill_in_super_root(sci, nilfs);
   }
 - nilfs_segctor_update_segusage(sci, nilfs-ns_sufile);
 + nilfs_segctor_update_segusage(sci, nilfs);
  
   /* Write partial segments */
   nilfs_segctor_prepare_write(sci);

Please separate changes below.

 diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
 index 69bd801..606fdfc 100644
 --- a/fs/nilfs2/the_nilfs.c
 +++ b/fs/nilfs2/the_nilfs.c
 @@ -630,6 +630,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct 
 super_block *sb, char *data)
   get_random_bytes(nilfs-ns_next_generation,
sizeof(nilfs-ns_next_generation));
  
 + nilfs-ns_feature_compat = le64_to_cpu(sbp-s_feature_compat);
 + nilfs-ns_feature_compat_ro = le64_to_cpu(sbp-s_feature_compat_ro);
 + nilfs-ns_feature_incompat = le64_to_cpu(sbp-s_feature_incompat);
 +
   err = nilfs_store_disk_layout(nilfs, sbp);
   if (err)
   goto failed_sbh;
 diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
 index 23778d3..87cab10 100644
 --- a/fs/nilfs2/the_nilfs.h
 +++ b/fs/nilfs2/the_nilfs.h
 @@ -101,6 +101,9 @@ enum {
   * @ns_dev_kobj: /sys/fs/nilfs/device
   * @ns_dev_kobj_unregister: completion state
   * @ns_dev_subgroups: device subgroups pointer
 + * @ns_feature_compat: Compatible feature set
 + * @ns_feature_compat_ro: Read-only compatible feature set
 + * @ns_feature_incompat: Incompatible feature set
   */
  struct the_nilfs {
   unsigned long   ns_flags;
 @@ -201,6 +204,11 @@ struct the_nilfs {
   struct kobject ns_dev_kobj;
   struct completion ns_dev_kobj_unregister;
   struct nilfs_sysfs_dev_subgroups *ns_dev_subgroups;
 +
 + /* Features */
 + __u64   ns_feature_compat;
 + __u64   ns_feature_compat_ro;
 + __u64   ns_feature_incompat;
  };
  
  #define THE_NILFS_FNS(bit, name) \
 @@ -393,4 +401,12 @@ static inline int nilfs_flush_device(struct the_nilfs 
 *nilfs)
   return err;
  }
  
 +static inline int nilfs_feature_track_live_blks(struct the_nilfs *nilfs)
 +{
 + return (nilfs-ns_feature_compat 
 + NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS) 
 + (nilfs-ns_feature_compat 
 + NILFS_FEATURE_COMPAT_SUFILE_EXTENSION);
 +}
 +

This should be written as below:

static inline int nilfs_feature_track_live_blks(struct the_nilfs *nilfs)
{
const __u64 required_bits = NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS |
NILFS_FEATURE_COMPAT_SUFILE_EXTENSION;

return ((nilfs-ns_feature_compat  required_bits) == required_bits);
}

Or you can drop the track flag at mount time if
NILFS_FEATURE_COMPAT_SUFILE_EXTENSION flag is not set or
nilfs_sufile_ext_supported(sufile) is false.

Regards,
Ryusuke Konishi

  #endif /* _THE_NILFS_H */
 diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
 index 5d83c55..6ccb2ad 100644
 --- a/include/linux/nilfs2_fs.h
 +++ b/include/linux/nilfs2_fs.h
 @@ -221,10 +221,12 @@ struct nilfs_super_block {
   * doesn't know about, it should refuse to mount the filesystem.
   */
  #define NILFS_FEATURE_COMPAT_SUFILE_EXTENSION(1ULL  0)
 +#define NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS (1ULL  1)
  
  #define NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT  (1ULL  0)
  
 -#define NILFS_FEATURE_COMPAT_SUPPNILFS_FEATURE_COMPAT_SUFILE_EXTENSION
 +#define NILFS_FEATURE_COMPAT_SUPP(NILFS_FEATURE_COMPAT_SUFILE_EXTENSION \
 + | NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS)
  #define NILFS_FEATURE_COMPAT_RO_SUPP NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT
  #define NILFS_FEATURE_INCOMPAT_SUPP  0ULL
  
 -- 
 2.3.0
 
 --
 To unsubscribe from this list: send the line unsubscribe linux-nilfs in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/9] nilfs2: extend SUFILE on-disk format to enable counting of live blocks

2015-03-13 Thread Ryusuke Konishi
On Tue, 24 Feb 2015 20:01:38 +0100, Andreas Rohner wrote:
 *buf,
   int cleansi, cleansu, dirtysi, dirtysu;
   long ncleaned = 0, ndirtied = 0;
   int ret = 0;
 + bool sup_ext = (supsz = NILFS_EXT_SUINFO_UPDATE_SIZE);
 + bool su_ext = nilfs_sufile_ext_supported(sufile);
  
   if (unlikely(nsup == 0))
   return ret;
 @@ -926,6 +949,9 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, 
 void *buf,
   (~0UL  __NR_NILFS_SUINFO_UPDATE_FIELDS))
   || (nilfs_suinfo_update_nblocks(sup) 
   sup-sup_sui.sui_nblocks 
 + nilfs-ns_blocks_per_segment)
 + || (nilfs_suinfo_update_nlive_blks(sup)  sup_ext 
 + sup-sup_sui.sui_nlive_blks 
   nilfs-ns_blocks_per_segment))
   return -EINVAL;
   }
 @@ -953,6 +979,14 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, 
 void *buf,
   if (nilfs_suinfo_update_nblocks(sup))
   su-su_nblocks = cpu_to_le32(sup-sup_sui.sui_nblocks);
  
 + if (nilfs_suinfo_update_nlive_blks(sup)  sup_ext  su_ext)
 + su-su_nlive_blks =
 + cpu_to_le32(sup-sup_sui.sui_nlive_blks);
 +
 + if (nilfs_suinfo_update_nlive_lastmod(sup)  sup_ext  su_ext)
 + su-su_nlive_lastmod =
 + cpu_to_le64(sup-sup_sui.sui_nlive_lastmod);
 +
   if (nilfs_suinfo_update_flags(sup)) {
   /*
* Active flag is a virtual flag projected by running
 diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
 index c446325..d56498b 100644
 --- a/fs/nilfs2/sufile.h
 +++ b/fs/nilfs2/sufile.h
 @@ -28,6 +28,11 @@
  #include linux/nilfs2_fs.h
  #include mdt.h
  
 +static inline int
 +nilfs_sufile_ext_supported(const struct inode *sufile)
 +{
 + return NILFS_MDT(sufile)-mi_entry_size = NILFS_EXT_SEGMENT_USAGE_SIZE;
 +}
  
  static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
  {
 diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
 index ff3fea3..5d83c55 100644
 --- a/include/linux/nilfs2_fs.h
 +++ b/include/linux/nilfs2_fs.h
 @@ -220,9 +220,11 @@ struct nilfs_super_block {
   * If there is a bit set in the incompatible feature set that the kernel
   * doesn't know about, it should refuse to mount the filesystem.
   */
 -#define NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT  0x0001ULL
 +#define NILFS_FEATURE_COMPAT_SUFILE_EXTENSION(1ULL  0)

This feature name is not good.  sufile can be extended more in a future.
You should name it based on the meaning of the extension of this time.

As I mentioned in another patch, I think this could be unified to the
TRACK_LIVE_BLKS feature that a later patch adds since the live block
counting of this patchset is inherently depending on the extention of
sufile.

  
 -#define NILFS_FEATURE_COMPAT_SUPP0ULL
 +#define NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT  (1ULL  0)
 +

Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 6/9] nilfs2: use modification cache to improve performance

2015-03-13 Thread Ryusuke Konishi
);
 + nilfs_sufile_mc_destroy(mcp);
 + return;
  failed:
 + nilfs_sufile_flush_nlive_blks(nilfs-ns_sufile, mcp);
 + nilfs_sufile_mc_destroy(mcp);
   nilfs_warning(ii-vfs_inode.i_sb, __func__,
 failed to truncate bmap (ino=%lu, err=%d),
 ii-vfs_inode.i_ino, ret);
 diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
 index 6059f53..dc0070c 100644
 --- a/fs/nilfs2/segment.c
 +++ b/fs/nilfs2/segment.c
 @@ -511,7 +511,8 @@ static int nilfs_collect_file_data(struct nilfs_sc_info 
 *sci,
  {
   int err;
  
 - err = nilfs_bmap_propagate(NILFS_I(inode)-i_bmap, bh);
 + err = nilfs_bmap_propagate_with_mc(NILFS_I(inode)-i_bmap,
 +sci-sc_mc, bh);
   if (err  0)
   return err;
  
 @@ -526,7 +527,8 @@ static int nilfs_collect_file_node(struct nilfs_sc_info 
 *sci,
  struct buffer_head *bh,
  struct inode *inode)
  {
 - return nilfs_bmap_propagate(NILFS_I(inode)-i_bmap, bh);
 + return nilfs_bmap_propagate_with_mc(NILFS_I(inode)-i_bmap,
 + sci-sc_mc, bh);
  }
  
  static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci,
 @@ -1386,7 +1388,7 @@ static void nilfs_segctor_update_segusage(struct 
 nilfs_sc_info *sci,
   segbuf-sb_nlive_blks_added = segbuf-sb_sum.nfileblk;
  
   if (nilfs_feature_track_live_blks(nilfs))
 - nilfs_sufile_mod_nlive_blks(sufile, NULL,
 + nilfs_sufile_mod_nlive_blks(sufile, sci-sc_mc,
   segbuf-sb_segnum,
   segbuf-sb_nlive_blks_added);
   }
 @@ -2014,6 +2016,9 @@ static int nilfs_segctor_do_construct(struct 
 nilfs_sc_info *sci, int mode)
   }
   nilfs_segctor_update_segusage(sci, nilfs);
  
 + nilfs_sufile_flush_nlive_blks(nilfs-ns_sufile,
 +   sci-sc_mc);
 +
   /* Write partial segments */
   nilfs_segctor_prepare_write(sci);
  
 @@ -2603,6 +2608,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct 
 super_block *sb,
  {
   struct the_nilfs *nilfs = sb-s_fs_info;
   struct nilfs_sc_info *sci;
 + int ret;
  
   sci = kzalloc(sizeof(*sci), GFP_KERNEL);
   if (!sci)
 @@ -2633,6 +2639,18 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct 
 super_block *sb,
   sci-sc_interval = HZ * nilfs-ns_interval;
   if (nilfs-ns_watermark)
   sci-sc_watermark = nilfs-ns_watermark;
 +
 + if (nilfs_feature_track_live_blks(nilfs)) {
 + sci-sc_mc = kmalloc(sizeof(*(sci-sc_mc)), GFP_KERNEL);
 + if (sci-sc_mc) {
 + ret = nilfs_sufile_mc_init(sci-sc_mc,
 +NILFS_SUFILE_MC_SIZE_EXT);
 + if (ret) {
 + kfree(sci-sc_mc);
 + sci-sc_mc = NULL;
 + }
 + }
 + }
   return sci;
  }
  
 @@ -2701,6 +2719,8 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info 
 *sci)
   down_write(nilfs-ns_segctor_sem);
  
   del_timer_sync(sci-sc_timer);
 + nilfs_sufile_mc_destroy(sci-sc_mc);
 + kfree(sci-sc_mc);
   kfree(sci);
  }
  
 diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
 index a48d6de..a857527 100644
 --- a/fs/nilfs2/segment.h
 +++ b/fs/nilfs2/segment.h
 @@ -80,6 +80,7 @@ struct nilfs_cstage {
  };
  
  struct nilfs_segment_buffer;
 +struct nilfs_sufile_mod_cache;
  
  struct nilfs_segsum_pointer {
   struct buffer_head *bh;
 @@ -129,6 +130,7 @@ struct nilfs_segsum_pointer {
   * @sc_watermark: Watermark for the number of dirty buffers
   * @sc_timer: Timer for segctord
   * @sc_task: current thread of segctord
 + * @sc_mc: mod cache to add up updates for SUFILE during seg construction
   */
  struct nilfs_sc_info {
   struct super_block *sc_super;
 @@ -185,6 +187,7 @@ struct nilfs_sc_info {
  
   struct timer_list   sc_timer;
   struct task_struct *sc_task;
 + struct nilfs_sufile_mod_cache *sc_mc;
  };
  
  /* sc_flags */

Again, I really hope you eliminate this changes by hiding the cache in
sufile.

Regards,
Ryusuke Konishi

 -- 
 2.3.0
 
 --
 To unsubscribe from this list: send the line unsubscribe linux-nilfs in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/9] nilfs2: add simple cache for modifications to SUFILE

2015-03-13 Thread Ryusuke Konishi
On Tue, 24 Feb 2015 20:01:37 +0100, Andreas Rohner wrote:
 This patch adds a simple, small cache that can be used to accumulate
 modifications to SUFILE entries. This is for example useful for
 keeping track of reclaimable blocks, because most of the
 modifications consist of small increments or decrements. By adding
 these up and temporarily storing them in a small cache, the
 performance can be improved. Additionally lock contention is
 reduced.
 
 Signed-off-by: Andreas Rohner andreas.roh...@gmx.net
 ---
  fs/nilfs2/sufile.c | 178 
 +
  fs/nilfs2/sufile.h |  44 +
  2 files changed, 222 insertions(+)
 
 diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
 index 1e8cac6..a369c30 100644
 --- a/fs/nilfs2/sufile.c
 +++ b/fs/nilfs2/sufile.c
 @@ -1168,6 +1168,184 @@ out_sem:
  }
  
  /**
 + * nilfs_sufile_mc_init - inits segusg modification cache
 + * @mc: modification cache
 + * @capacity: maximum capacity of the mod cache
 + *
 + * Description: Allocates memory for an array of nilfs_sufile_mod structures
 + * according to @capacity. This memory must be freed with
 + * nilfs_sufile_mc_destroy().
 + *
 + * Return Value: On success, 0 is returned. On error, one of the following
 + * negative error codes is returned.
 + *
 + * %-ENOMEM - Insufficient amount of memory available.
 + *
 + * %-EINVAL - Invalid capacity.
 + */
 +int nilfs_sufile_mc_init(struct nilfs_sufile_mod_cache *mc, size_t capacity)
 +{
 + mc-mc_capacity = capacity;
 + if (!capacity)
 + return -EINVAL;
 +
 + mc-mc_mods = kmalloc(capacity * sizeof(struct nilfs_sufile_mod),
 +   GFP_KERNEL);

GFP_NOFS must be used instead of GFP_KERNEL to avoid initiating other
filesystem operations.

The abbreviation mc is not good, which is already used as the
abbreviation of minimum clean in userland.

 + if (!mc-mc_mods)
 + return -ENOMEM;
 +
 + mc-mc_size = 0;
 +
 + return 0;
 +}
 +
 +/**
 + * nilfs_sufile_mc_add - add signed value to segusg modification cache
 + * @mc: modification cache
 + * @segnum: segment number
 + * @value: signed value (can be positive and negative)
 + *
 + * Description: nilfs_sufile_mc_add() tries to add a pair of @segnum and
 + * @value to the modification cache. If the cache already contains a
 + * segment number equal to @segnum, then @value is simply added to the
 + * existing value. This way thousands of small modifications can be
 + * accumulated into one value. If @segnum cannot be found and the
 + * capacity allows it, a new element is added to the cache. If the
 + * capacity is reached an error value is returned.
 + *
 + * Return Value: On success, 0 is returned. On error, one of the following
 + * negative error codes is returned.
 + *
 + * %-ENOSPC - The mod cache has reached its capacity and must be flushed.
 + */
 +static inline int nilfs_sufile_mc_add(struct nilfs_sufile_mod_cache *mc,
 +   __u64 segnum, __s64 value)
 +{
 + struct nilfs_sufile_mod *mods = mc-mc_mods;
 + int i;
 +
 + for (i = 0; i  mc-mc_size; ++i, ++mods) {
 + if (mods-m_segnum == segnum) {
 + mods-m_value += value;
 + return 0;
 + }
 + }
 +
 + if (mc-mc_size  mc-mc_capacity) {
 + mods-m_segnum = segnum;
 + mods-m_value = value;
 + mc-mc_size++;
 + return 0;
 + }
 +
 + return -ENOSPC;
 +}
 +
 +/**
 + * nilfs_sufile_mc_clear - set mc_size to 0
 + * @mc: modification cache
 + *
 + * Description: nilfs_sufile_mc_clear() sets mc_size to 0, which enables
 + * nilfs_sufile_mc_add() to overwrite the elements in @mc.
 + */
 +static inline void nilfs_sufile_mc_clear(struct nilfs_sufile_mod_cache *mc)
 +{
 + mc-mc_size = 0;
 +}
 +
 +/**
 + * nilfs_sufile_mc_reset - clear cache and add one element
 + * @mc: modification cache
 + * @segnum: segment number
 + * @value: signed value (can be positive and negative)
 + *
 + * Description: Clears the modification cache in @mc and adds a new pair of
 + * @segnum and @value to it at the same time.
 + */
 +static inline void nilfs_sufile_mc_reset(struct nilfs_sufile_mod_cache *mc,
 +  __u64 segnum, __s64 value)
 +{
 + struct nilfs_sufile_mod *mods = mc-mc_mods;
 +
 + mods-m_segnum = segnum;
 + mods-m_value = value;
 + mc-mc_size = 1;
 +}

The name of this function is confusing.  Actual meaning of this
function is reset and add, and that can be replaced with mc_clear
and mc_add.  Remove this function to simplify interface.

Regards,
Ryusuke Konishi

 +/**
 + * nilfs_sufile_mc_flush - flush modification cache
 + * @sufile: inode of segment usage file
 + * @mc: modification cache
 + * @dofunc: primitive operation for the update
 + *
 + * Description: nilfs_sufile_mc_flush() flushes the cached modifications
 + * and applies them to the segment usages on disk

Re: [PATCH 0/9] nilfs2: implementation of cost-benefit GC policy

2015-03-14 Thread Ryusuke Konishi
On Sat, 14 Mar 2015 13:24:25 +0100, Andreas Rohner wrote:
 Hi Ryusuke,
 
 Thank you very much for your detailed review and feedback. I agree with
 all of your points and I will start working on a rewrite immediately.
 
 On 2015-03-12 13:54, Ryusuke Konishi wrote:
 Hi Andreas,
 
 On Tue, 10 Mar 2015 21:37:50 +0100, Andreas Rohner wrote:
 Hi Ryusuke,

 Thanks for your thorough review.

 On 2015-03-10 06:21, Ryusuke Konishi wrote:
 Hi Andreas,

 I looked through whole kernel patches and a part of util patches.
 Overall comments are as follows:

 [Algorithm]
 As for algorithm, it looks about OK except for the starvation
 countermeasure.  The stavation countermeasure looks adhoc/hacky, but
 it's good that it doesn't change kernel/userland interface; we may be
 able to replace it with better ways in a future or in a revised
 version of this patchset.

 (1) Drawback of the starvation countermeasure
 The patch 9/9 looks to make the execution time of chcp operation
 worse since it will scan through sufile to modify live block
 counters.  How much does it prolong the execution time ?

 I'll do some tests, but I haven't noticed any significant performance
 drop. The GC basically does the same thing, every time it selects
 segments to reclaim.
 
 GC is performed in background by an independent process.  What I'm
 care about it that NILFS_IOCTL_CHANGE_CPMODE ioctl is called from
 command line interface or application.  They differ in this meaning.
 
 Was a worse case senario considered in the test ?
 
 For example:
 1. Fill a TB class drive with data file(s), and make a snapshot on it.
 2. Run one pass GC to update snapshot block counts.
 3. And do chcp cp
 
 If we don't observe noticeable delay on this class of drive, then I
 think we can put the problem off.
 
 Yesterday I did a worst case test as you suggested. I used an old 1 TB
 hard drive I had lying around. This was my setup:
 
 1. Write a 850GB file
 2. Create a snapshot
 3. Delete the file
 4. Let GC run through all segments
 5. Verify with lssu that the GC has updated all SUFILE entries
 6. Drop the page cache
 7. chcp cp
 
 The following results are with the page cache dropped immediately before
 each call:
 
 1. chcp ss
 real  0m1.337s
 user  0m0.017s
 sys   0m0.030s
 
 2. chcp cp
 real  0m6.377s
 user  0m0.023s
 sys   0m0.053s
 
 The following results are without the drop of the page cache:
 
 1. chcp ss
 real  0m0.137s
 user  0m0.010s
 sys   0m0.000s
 
 2. chcp cp
 real  0m0.016s
 user  0m0.010s
 sys   0m0.007s
 
 There are 119233 segments in my test. Each SUFILE entry uses 32 bytes.
 So the worst case for 1 TB with 8 MB segments would be 3.57 MB of random
 reads and one 3.57 MB continuous write. You only get 6.377s because my
 hard drive is so slow. You wouldn't notice any difference on a modern
 SSD. Furthermore the SUFILE is also scanned by the segment allocation
 algorithm and the GC, so it is very likely already in the page cache.

6.377s is too long because nilfs_sufile_fix_starving_segs() locks
sufile mi_sem, and even lengthens lock period of the following locks:

 - cpfile mi_sem (held at nilfs_cpfile_clear_snapshot()).
 - transaction lock (held at nilfs_ioctl_change_cpmode()).
 - ns_snapshot_mount_mutex (held at nilfs_ioctl_change_cpmode()).

leading to freeze of all write operations, lssu, lscp, cleanerd, and
snapshot mount, etc.

It is preferable for the function to be moved outside of them and to
release/reacquire transaction lock and sufile mi_sem regularly in some
way.

Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/7] nilfs2: unify type of key arguments in bmap interface

2015-03-12 Thread Ryusuke Konishi
The type of key arguments in block mapping interface varies depending
on function.  For instance, nilfs_bmap_lookup_at_level() takes __u64
for its key argument whereas nilfs_bmap_lookup() takes unsigned
long.

This fits them to __u64 to eliminate the variation.

Signed-off-by: Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp
---
 fs/nilfs2/alloc.c |  5 +++--
 fs/nilfs2/bmap.c  | 17 ++---
 fs/nilfs2/bmap.h  |  8 
 fs/nilfs2/inode.c |  6 +++---
 4 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 741fd02..8df0f3b 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -405,13 +405,14 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode 
*inode,
 static int nilfs_palloc_count_desc_blocks(struct inode *inode,
unsigned long *desc_blocks)
 {
-   unsigned long blknum;
+   __u64 blknum;
int ret;
 
ret = nilfs_bmap_last_key(NILFS_I(inode)-i_bmap, blknum);
if (likely(!ret))
*desc_blocks = DIV_ROUND_UP(
-   blknum, NILFS_MDT(inode)-mi_blocks_per_desc_block);
+   (unsigned long)blknum,
+   NILFS_MDT(inode)-mi_blocks_per_desc_block);
return ret;
 }
 
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index aadbd0b..c82f436 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -152,9 +152,7 @@ static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, 
__u64 key, __u64 ptr)
  *
  * %-EEXIST - A record associated with @key already exist.
  */
-int nilfs_bmap_insert(struct nilfs_bmap *bmap,
- unsigned long key,
- unsigned long rec)
+int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec)
 {
int ret;
 
@@ -191,19 +189,16 @@ static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, 
__u64 key)
return bmap-b_ops-bop_delete(bmap, key);
 }
 
-int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
+int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp)
 {
-   __u64 lastkey;
int ret;
 
down_read(bmap-b_sem);
-   ret = bmap-b_ops-bop_last_key(bmap, lastkey);
+   ret = bmap-b_ops-bop_last_key(bmap, keyp);
up_read(bmap-b_sem);
 
if (ret  0)
ret = nilfs_bmap_convert_error(bmap, __func__, ret);
-   else
-   *key = lastkey;
return ret;
 }
 
@@ -224,7 +219,7 @@ int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned 
long *key)
  *
  * %-ENOENT - A record associated with @key does not exist.
  */
-int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
+int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key)
 {
int ret;
 
@@ -235,7 +230,7 @@ int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned 
long key)
return nilfs_bmap_convert_error(bmap, __func__, ret);
 }
 
-static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
+static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, __u64 key)
 {
__u64 lastkey;
int ret;
@@ -276,7 +271,7 @@ static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, 
unsigned long key)
  *
  * %-ENOMEM - Insufficient amount of memory available.
  */
-int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key)
+int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key)
 {
int ret;
 
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index b89e680..9230d33 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -153,10 +153,10 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
 int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
 void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
 int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
-int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
-int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
-int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *);
-int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long);
+int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec);
+int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key);
+int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp);
+int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key);
 void nilfs_bmap_clear(struct nilfs_bmap *);
 int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *);
 void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 8b59695..cf9e489 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -106,7 +106,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
err = nilfs_transaction_begin(inode-i_sb, ti, 1);
if (unlikely(err))
goto out;
-   err = nilfs_bmap_insert(ii-i_bmap, (unsigned long)blkoff

[PATCH 1/7] nilfs2: do not use async write flag for segment summary buffers

2015-03-12 Thread Ryusuke Konishi
The async write flag is introduced to nilfs2 in the commit
7f42ec394156 (nilfs2: fix issue with race condition of competition
between segments for dirty blocks), but the flag only makes sense for
data buffers and btree node buffers.  It is not needed for segment
summary buffers.

This gits rid of the latter uses as part of refactoring of atomic bit
operations on buffer state bitmap.

Signed-off-by: Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp
Cc: Vyacheslav Dubeyko sl...@dubeyko.com
---
 fs/nilfs2/segment.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 0c3f303..c9a4e60 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1588,7 +1588,6 @@ static void nilfs_segctor_prepare_write(struct 
nilfs_sc_info *sci)
 
list_for_each_entry(bh, segbuf-sb_segsum_buffers,
b_assoc_buffers) {
-   set_buffer_async_write(bh);
if (bh-b_page != bd_page) {
if (bd_page) {
lock_page(bd_page);
@@ -1688,7 +1687,6 @@ static void nilfs_abort_logs(struct list_head *logs, int 
err)
list_for_each_entry(segbuf, logs, sb_list) {
list_for_each_entry(bh, segbuf-sb_segsum_buffers,
b_assoc_buffers) {
-   clear_buffer_async_write(bh);
if (bh-b_page != bd_page) {
if (bd_page)
end_page_writeback(bd_page);
@@ -1768,7 +1766,6 @@ static void nilfs_segctor_complete_write(struct 
nilfs_sc_info *sci)
b_assoc_buffers) {
set_buffer_uptodate(bh);
clear_buffer_dirty(bh);
-   clear_buffer_async_write(bh);
if (bh-b_page != bd_page) {
if (bd_page)
end_page_writeback(bd_page);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/9] nilfs2: implementation of cost-benefit GC policy

2015-03-12 Thread Ryusuke Konishi
Hi Andreas,

On Tue, 10 Mar 2015 21:37:50 +0100, Andreas Rohner wrote:
 Hi Ryusuke,
 
 Thanks for your thorough review.
 
 On 2015-03-10 06:21, Ryusuke Konishi wrote:
 Hi Andreas,
 
 I looked through whole kernel patches and a part of util patches.
 Overall comments are as follows:
 
 [Algorithm]
 As for algorithm, it looks about OK except for the starvation
 countermeasure.  The stavation countermeasure looks adhoc/hacky, but
 it's good that it doesn't change kernel/userland interface; we may be
 able to replace it with better ways in a future or in a revised
 version of this patchset.
 
 (1) Drawback of the starvation countermeasure
 The patch 9/9 looks to make the execution time of chcp operation
 worse since it will scan through sufile to modify live block
 counters.  How much does it prolong the execution time ?
 
 I'll do some tests, but I haven't noticed any significant performance
 drop. The GC basically does the same thing, every time it selects
 segments to reclaim.

GC is performed in background by an independent process.  What I'm
care about it that NILFS_IOCTL_CHANGE_CPMODE ioctl is called from
command line interface or application.  They differ in this meaning.

Was a worse case senario considered in the test ?

For example:
1. Fill a TB class drive with data file(s), and make a snapshot on it.
2. Run one pass GC to update snapshot block counts.
3. And do chcp cp

If we don't observe noticeable delay on this class of drive, then I
think we can put the problem off.

 In a use case of nilfs, many snapshots are created and they are
 automatically changed back to plain checkpoints because old
 snapshots are thinned out over time.  The patch 9/9 may impact on
 such usage.

 (2) Compatibility
 What will happen in the following case:
 1. Create a file system, use it with the new module, and
create snapshots.
 2. Mount it with an old module, and release snapshot with chcp cp
 3. Mount it with the new module, and cleanerd runs gc with
cost benefit or greedy policy.
 
 Some segments could be subject to starvation. But it would probably only
 affect a small number of segments and it could be fixed by chcp ss
 CP; chcp cp CP.

Ok, let's treat this as a restriction for now.
If you come up with any good idea, please propose.

 (3) Durability against unexpected power failures (just a note)
 The current patchset looks not to cause starvation issue even when
 unexpected power failure occurs during or after executing chcp
 cp because nilfs_ioctl_change_cpmode() do changes in a
 transactional way with nilfs_transaction_begin/commit.
 We should always think this kind of situtation to keep consistency.
 
 [Coding Style]
 (4) This patchset has several coding style issues. Please fix them and
 re-check with the latest checkpatch script (script/checkpatch.pl).
 
 I'll fix that. Sorry.
 
 patch 2:
 WARNING: Prefer kmalloc_array over kmalloc with multiply
 #85: FILE: fs/nilfs2/sufile.c:1192:
 +mc-mc_mods = kmalloc(capacity * sizeof(struct nilfs_sufile_mod),
 
 patch 5,6:
 WARNING: 'aquired' may be misspelled - perhaps 'acquired'?
 #60: 
 the same semaphore has to be aquired. So if the DAT-Entry belongs to
 
 WARNING: 'aquired' may be misspelled - perhaps 'acquired'?
 #46: 
 be aquired, which blocks the entire SUFILE and effectively turns
 
 WARNING: 'aquired' may be misspelled - perhaps 'acquired'?
 #53: 
 afore mentioned lock only needs to be aquired, if the cache is full
 
 (5) sub_sizeof macro:
 The same definition exists as offsetofend() in vfio.h,
 and a patch to move it to stddef.h is now proposed.
 
 Please use the same name, and redefine it only if it's not
 defined:
 
 #ifndef offsetofend
 #define offsetofend(TYPE, MEMBER) \
 (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)-MEMBER))
 #endif
 
 Ok I'll change that.
 
 [Implementation]
 (6) b_blocknr
 Please do not use bh-b_blocknr to store disk block number.  This
 field is used to keep virtual block number except for DAT files.
 It is only replaced to an actual block number during calling
 submit_bh().  Keep this policy.
 
 As far as I can tell, this is only true for blocks of GC inodes and node
 blocks. All other buffer_heads are always mapped to on disk blocks by
 nilfs_get_block(). I only added the mapping in nilfs_segbuf_submit_bh()
 to correctly set the value in b_blocknr to the new location.
 

nilfs_get_block() is only used for regular files, directories, and so
on.  Blocks on metadata files are mapped through
nilfs_mdt_submit_block().  Anyway, yes, they stores actual disk block
number in b_blocknr in the current implementation.  But, it is just a
cutting corner of the current implementation, which comes from the
reason that we have to set actual disk block numbers when reading
blocks with vfs/mm functions.

Anyway I don't like you touch nilfs_get_block() and
nilfs_segbuf_submit_bh() in a part of the big patch.  At least

Re: Making Nilfs ZAC Compliant

2015-02-27 Thread Ryusuke Konishi
Hi,
On Thu, 26 Feb 2015 19:54:48 +, Benixon Dhas wrote:
 Hi All,
 
 We are trying to make Nilfs work with a SMR Device which adheres to
 Zoned ATA Commands(ZAC) Specification.  One of the restrictions in
 the specification is reading an unwritten part of the Zone(Segment
 in Nilfs) will cause a read error.
 
 We observe that Nilfs does not write a complete physical segment(we
 use 256MB segment) always. After digging in the source a while we
 figured that this is due to the fact that Nilfs requires a certain
 number of minimum blocks for constructing a partial segment
 (NILFS_PSEG_MIN_BLOCKS), which currently is 2.  So we see some
 segments where the last block (in our case a block is 4k) is not
 being written to.

For recovery and GC, NILFS needs to insert one or more header blocks
before writing payload blocks.  Inevitably, the minimum size of a
partial segment becomes 2.

 When some utilities like garbage collector and dump segment reads
 (May not be an exhaustive list) a segment it tries to read the
 entire physical segment. This causes read errors in the kernel and
 hence retries for the last unwritten block in certain segments.

The recovery function of NILFS also needs to read entire physical
segment.  It never reads unwritten blocks if the file system was
cleanly unmounted, however, this is not the case for unclean shutdown
or panic.

Worse yet, if it gets an EIO from the underlying block layer, the
recovery will fail and the mount system call will abort.

 In an attempt to solve this problem we were trying to figure out if
 we can write some dummy data to the remaining unutilized blocks in
 the segment. But we are not sure what would be the best way to do
 this.
 
 Another solution we had in mind was to figure out all places where
 segments are read, and modify it to prevent it from reading
 unwritten blocks. But we feel this might be more complex solution
 and might impact performance more.

Looks like sufile is available for this purpose.  It is maintaining
how many blocks are written for each segment.  You can see it in the
NBLOCKS field of the output of lssu command.

One restriction is that this metadata file (sufile) is unavailable
until mount system call succeeds.  The recovery code cannot use it.

 Please advise us on the best way to solve the problem. Also what
 would be architecturally a best place to fix the problem.

Writing dummy data to the dead space for SMR devices looks better to
me because it's simpler and the performance penalty seems not so high.

But,
What will happen if an unexpected power failure hits the device ?
Does that cause the file system to read unwritten blocks ?

If so, it seems that we need translation layer to hide these issues,
or a new error code or a new mechanism to make it possible for file
systems to know/handle them.

Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [systemd-devel] nilfs-cleanerd startup on boot

2015-03-03 Thread Ryusuke Konishi

Hi

On 2015/03/04 0:44, dennis.mur...@wipro.com wrote:

I had mis-typed the address for the nilfs mail group


-Original Message-
From: Lennart Poettering [mailto:lenn...@poettering.net]
Sent: Saturday, February 28, 2015 12:34 PM
To: Dennis Murata (WT01 - ENU)
Cc: systemd-de...@lists.freedesktop.org; linus-ni...@vger.kernel.org
Subject: Re: [systemd-devel] nilfs-cleanerd startup on boot

On Fri, 27.02.15 18:31, dennis.mur...@wipro.com
(dennis.mur...@wipro.com) wrote:


I have a fedora 21 system that where I mount an nilfs2 file system.
I use a simple /etc/modules-load.d/nilfs.conf file to load the kernel
module and have an entry in the fstab.


Creating the modules-load.d snippet should not be necessary, as the kernel
should autoload the kernel module for it when it is first required.

I did not find this to be the case for fedora 21.

 Without creating the file to load the module, any attempt I made to mount
 the file system would get a unknown filetype error.  Does this point at
 adding this module to the initrd file?

Is nilfs2.ko installed in your environment?

Try modinfo nilfs2

Older fedora needed kernel-modules-extra package. [1]

[1] http://nilfs.sourceforge.net/en/pkg_fedora.html


The file system mounts on boot as it should, but the nilfs-cleanerd
program does not startup.  If I umount /nilfs then mount /nilfs the
nilfs-cleanerd program starts as it should to cleanup the checkpoints.


How is that daemon supposed to be started? Is it forked off /bin/mount?


Does systemd use a different mount program at boot?


It uses /bin/mount for mounting normal file systems.


nilfs_cleanerd is invoked through /sbin/mount.nilfs2 helper. [2]
The helper is called from /sbin/mount if it exists.

/sbin/mount.nilfs2 is included in nilfs-utils package.

nilfs_cleanerd is just a user-land process, so it can be
manually invoked if you have root privilege. [3]

  # /sbin/nilfs_cleanerd device directory

But, in this case, you need to kill nilfs_cleanerd
manually before umount.  So, I recommend running cleanerd
through mount.nilfs2.

The above explanation may not suit for the recent fedora
since nilfs-utils is not yet tuned to systemd environment.

[2] http://nilfs.sourceforge.net/en/man8/mount.nilfs2.8.html
[3] http://nilfs.sourceforge.net/en/man8/nilfs_cleanerd.8.html

Regards,
Ryusuke Konishi


Is there something else that should be included other than the
nilfs.conf file?  I have just started using a system with systemd as
the init so please forgive my ignorance.


I have no idea about nilfs really, and we had no reports about any problems
with it before.

I wanted to look at the performance of nilfs and f2fs.

 This is my first try at using these file systems


Lennart

--
Lennart Poettering, Red Hat

--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/1] nilfs2: fix deadlock of segment constructor during recovery

2015-03-03 Thread Ryusuke Konishi
According to a report from Yuxuan Shui, nilfs2 in kernel 3.19 got
stuck during recovery at mount time.  The code path that caused the
deadlock was as follows:

  nilfs_fill_super()
load_nilfs()
  nilfs_salvage_orphan_logs()
* Do roll-forwarding, attach segment constructor for recovery,
  and kick it.

nilfs_segctor_thread()
  nilfs_segctor_thread_construct()
   * A lock is held with nilfs_transaction_lock()
 nilfs_segctor_do_construct()
   nilfs_segctor_drop_written_files()
 iput()
   iput_final()
 write_inode_now()
   writeback_single_inode()
 __writeback_single_inode()
   do_writepages()
 nilfs_writepage()
   nilfs_construct_dsync_segment()
 nilfs_transaction_lock() -- deadlock

This can happen if commit 7ef3ff2fea8b (nilfs2: fix deadlock of
segment constructor over I_SYNC flag) is applied and roll-forward
recovery was performed at mount time.  The roll-forward recovery can
happen if datasync write is done and the file system crashes
immediately after that.  For instance, we can reproduce the issue with
the following steps:

  nilfs2 is mounted on /nilfs (device: /dev/sdb1) 
 # dd if=/dev/zero of=/nilfs/test bs=4k count=1  sync
 # dd if=/dev/zero of=/nilfs/test conv=notrunc oflag=dsync bs=4k
 count=1  reboot -nfh
  the system will immediately reboot 
 # mount -t nilfs2 /dev/sdb1 /nilfs

The deadlock occurs because iput() can run segment constructor through
writeback_single_inode() if MS_ACTIVE flag is not set on sb-s_flags.
The above commit changed segment constructor so that it calls iput()
asynchronously for inodes with i_nlink == 0, but that change was
imperfect.

This fixes the another deadlock by deferring iput() in segment
constructor even for the case that mount is not finished, that is, for
the case that MS_ACTIVE flag is not set.

Reported-by: Yuxuan Shui yshu...@gmail.com
Signed-off-by: Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp
Cc: Al Viro v...@zeniv.linux.org.uk
Tested-by: Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp
Cc: sta...@vger.kernel.org
---
 fs/nilfs2/segment.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 469086b..0c3f303 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1907,6 +1907,7 @@ static void nilfs_segctor_drop_written_files(struct 
nilfs_sc_info *sci,
 struct the_nilfs *nilfs)
 {
struct nilfs_inode_info *ii, *n;
+   int during_mount = !(sci-sc_super-s_flags  MS_ACTIVE);
int defer_iput = false;
 
spin_lock(nilfs-ns_inode_lock);
@@ -1919,10 +1920,10 @@ static void nilfs_segctor_drop_written_files(struct 
nilfs_sc_info *sci,
brelse(ii-i_bh);
ii-i_bh = NULL;
list_del_init(ii-i_dirty);
-   if (!ii-vfs_inode.i_nlink) {
+   if (!ii-vfs_inode.i_nlink || during_mount) {
/*
-* Defer calling iput() to avoid a deadlock
-* over I_SYNC flag for inodes with i_nlink == 0
+* Defer calling iput() to avoid deadlocks if
+* i_nlink == 0 or mount is not yet finished.
 */
list_add_tail(ii-i_dirty, sci-sc_iput_queue);
defer_iput = true;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/1] nilfs2: fix deadlock of segment constructor during recovery

2015-03-03 Thread Ryusuke Konishi
Hi Andrew,

Please send the following bug fix to upstream.  It fixes another
deadlock issue of nilfs2 segment constructor, which was recently
reported.

Thanks,
Ryusuke Konishi
--
Ryusuke Konishi (1):
  nilfs2: fix deadlock of segment constructor during recovery

 fs/nilfs2/segment.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/4] nilfs2: unify type of key arguments in bmap interface

2015-02-22 Thread Ryusuke Konishi
The type of key arguments in block mapping interface varies depending
on function.  For instance, nilfs_bmap_lookup_at_level() takes __u64
for its key argument whereas nilfs_bmap_lookup() takes unsigned
long.

This fits them to __u64 to eliminate the variation.

Signed-off-by: Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp
---
 fs/nilfs2/alloc.c |  5 +++--
 fs/nilfs2/bmap.c  | 17 ++---
 fs/nilfs2/bmap.h  |  8 
 fs/nilfs2/inode.c |  6 +++---
 4 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 741fd02..8df0f3b 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -405,13 +405,14 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode 
*inode,
 static int nilfs_palloc_count_desc_blocks(struct inode *inode,
unsigned long *desc_blocks)
 {
-   unsigned long blknum;
+   __u64 blknum;
int ret;
 
ret = nilfs_bmap_last_key(NILFS_I(inode)-i_bmap, blknum);
if (likely(!ret))
*desc_blocks = DIV_ROUND_UP(
-   blknum, NILFS_MDT(inode)-mi_blocks_per_desc_block);
+   (unsigned long)blknum,
+   NILFS_MDT(inode)-mi_blocks_per_desc_block);
return ret;
 }
 
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index aadbd0b..c82f436 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -152,9 +152,7 @@ static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, 
__u64 key, __u64 ptr)
  *
  * %-EEXIST - A record associated with @key already exist.
  */
-int nilfs_bmap_insert(struct nilfs_bmap *bmap,
- unsigned long key,
- unsigned long rec)
+int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec)
 {
int ret;
 
@@ -191,19 +189,16 @@ static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, 
__u64 key)
return bmap-b_ops-bop_delete(bmap, key);
 }
 
-int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
+int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp)
 {
-   __u64 lastkey;
int ret;
 
down_read(bmap-b_sem);
-   ret = bmap-b_ops-bop_last_key(bmap, lastkey);
+   ret = bmap-b_ops-bop_last_key(bmap, keyp);
up_read(bmap-b_sem);
 
if (ret  0)
ret = nilfs_bmap_convert_error(bmap, __func__, ret);
-   else
-   *key = lastkey;
return ret;
 }
 
@@ -224,7 +219,7 @@ int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned 
long *key)
  *
  * %-ENOENT - A record associated with @key does not exist.
  */
-int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
+int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key)
 {
int ret;
 
@@ -235,7 +230,7 @@ int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned 
long key)
return nilfs_bmap_convert_error(bmap, __func__, ret);
 }
 
-static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
+static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, __u64 key)
 {
__u64 lastkey;
int ret;
@@ -276,7 +271,7 @@ static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, 
unsigned long key)
  *
  * %-ENOMEM - Insufficient amount of memory available.
  */
-int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key)
+int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key)
 {
int ret;
 
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index b89e680..9230d33 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -153,10 +153,10 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
 int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
 void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
 int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned);
-int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
-int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
-int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *);
-int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long);
+int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec);
+int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key);
+int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp);
+int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key);
 void nilfs_bmap_clear(struct nilfs_bmap *);
 int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *);
 void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *);
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 8b59695..cf9e489 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -106,7 +106,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
err = nilfs_transaction_begin(inode-i_sb, ti, 1);
if (unlikely(err))
goto out;
-   err = nilfs_bmap_insert(ii-i_bmap, (unsigned long)blkoff

[PATCH 0/4] nilfs2: shorten execution time of lscp command

2015-02-22 Thread Ryusuke Konishi
The older a filesystem gets, the slower lscp command becomes.  This is
because nilfs_cpfile_do_get_cpinfo() function meets more hole blocks
as the start offset of valid checkpoint numbers gets bigger.

This series introduces some helper functions which help to skip hole
blocks efficiently, and reduces the overhead with them.

A measurement result of this series is as follows:

Before:
$ time lscp
 CNODATE TIME  MODE  FLG  BLKCNT   ICNT
 5769303  2015-02-22 19:31:33   cp-  108  1
 5769304  2015-02-22 19:38:54   cp-  108  1

real0m0.182s
user0m0.003s
sys 0m0.180s

After:
$ time lscp
 CNODATE TIME  MODE  FLG  BLKCNT   ICNT
 5769303  2015-02-22 19:31:33   cp-  108  1
 5769304  2015-02-22 19:38:54   cp-  108  1

real0m0.003s
user0m0.001s
sys 0m0.002s


Thanks,
Ryusuke Konishi
--
Ryusuke Konishi (4):
  nilfs2: unify type of key arguments in bmap interface
  nilfs2: add bmap function to seek a valid key
  nilfs2: add helper to find existent block on metadata file
  nilfs2: improve execution time of NILFS_IOCTL_GET_CPINFO ioctl

 fs/nilfs2/alloc.c  |  5 +++--
 fs/nilfs2/bmap.c   | 48 ++-
 fs/nilfs2/bmap.h   | 13 ++-
 fs/nilfs2/btree.c  | 66 ++
 fs/nilfs2/cpfile.c | 58 ++-
 fs/nilfs2/direct.c | 17 ++
 fs/nilfs2/inode.c  |  6 ++---
 fs/nilfs2/mdt.c| 54 
 fs/nilfs2/mdt.h|  3 +++
 9 files changed, 243 insertions(+), 27 deletions(-)

--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/4] nilfs2: improve execution time of NILFS_IOCTL_GET_CPINFO ioctl

2015-02-22 Thread Ryusuke Konishi
The older a filesystem gets, the slower lscp command becomes.  This is
because nilfs_cpfile_do_get_cpinfo() function meets more hole blocks
as the start offset of valid checkpoint numbers gets bigger.

This reduces the overhead by skipping hole blocks efficiently with
nilfs_mdt_find_block() helper.

A measurement result of this patch is as follows:

Before:
$ time lscp
 CNODATE TIME  MODE  FLG  BLKCNT   ICNT
 5769303  2015-02-22 19:31:33   cp-  108  1
 5769304  2015-02-22 19:38:54   cp-  108  1

real0m0.182s
user0m0.003s
sys 0m0.180s

After:
$ time lscp
 CNODATE TIME  MODE  FLG  BLKCNT   ICNT
 5769303  2015-02-22 19:31:33   cp-  108  1
 5769304  2015-02-22 19:38:54   cp-  108  1

real0m0.003s
user0m0.001s
sys 0m0.002s

Signed-off-by: Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp
---
 fs/nilfs2/cpfile.c | 58 --
 1 file changed, 52 insertions(+), 6 deletions(-)

diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 0d58075..b6596ca 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -53,6 +53,13 @@ nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 
cno)
return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
 }
 
+static __u64 nilfs_cpfile_first_checkpoint_in_block(const struct inode *cpfile,
+   unsigned long blkoff)
+{
+   return (__u64)nilfs_cpfile_checkpoints_per_block(cpfile) * blkoff
+   + 1 - NILFS_MDT(cpfile)-mi_first_entry_offset;
+}
+
 static unsigned long
 nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile,
  __u64 curr,
@@ -146,6 +153,44 @@ static inline int nilfs_cpfile_get_checkpoint_block(struct 
inode *cpfile,
   create, nilfs_cpfile_block_init, bhp);
 }
 
+/**
+ * nilfs_cpfile_find_checkpoint_block - find and get a buffer on cpfile
+ * @cpfile: inode of cpfile
+ * @start_cno: start checkpoint number (inclusive)
+ * @end_cno: end checkpoint number (inclusive)
+ * @cnop: place to store the next checkpoint number
+ * @bhp: place to store a pointer to buffer_head struct
+ *
+ * Return Value: On success, it returns 0. On error, the following negative
+ * error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ *
+ * %-EIO - I/O error
+ *
+ * %-ENOENT - no block exists in the range.
+ */
+static int nilfs_cpfile_find_checkpoint_block(struct inode *cpfile,
+ __u64 start_cno, __u64 end_cno,
+ __u64 *cnop,
+ struct buffer_head **bhp)
+{
+   unsigned long start, end, blkoff;
+   int ret;
+
+   if (unlikely(start_cno  end_cno))
+   return -ENOENT;
+
+   start = nilfs_cpfile_get_blkoff(cpfile, start_cno);
+   end = nilfs_cpfile_get_blkoff(cpfile, end_cno);
+
+   ret = nilfs_mdt_find_block(cpfile, start, end, blkoff, bhp);
+   if (!ret)
+   *cnop = (blkoff == start) ? start_cno :
+   nilfs_cpfile_first_checkpoint_in_block(cpfile, blkoff);
+   return ret;
+}
+
 static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile,
   __u64 cno)
 {
@@ -403,14 +448,15 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode 
*cpfile, __u64 *cnop,
return -ENOENT; /* checkpoint number 0 is invalid */
down_read(NILFS_MDT(cpfile)-mi_sem);
 
-   for (n = 0; cno  cur_cno  n  nci; cno += ncps) {
-   ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno);
-   ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, bh);
+   for (n = 0; n  nci; cno += ncps) {
+   ret = nilfs_cpfile_find_checkpoint_block(
+   cpfile, cno, cur_cno - 1, cno, bh);
if (ret  0) {
-   if (ret != -ENOENT)
-   goto out;
-   continue; /* skip hole */
+   if (likely(ret == -ENOENT))
+   break;
+   goto out;
}
+   ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno);
 
kaddr = kmap_atomic(bh-b_page);
cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/1] nilfs2: fix potential memory overrun on inode

2015-02-20 Thread Ryusuke Konishi
On Fri, 20 Feb 2015 13:58:42 -0800, Andrew Morton wrote:
 On Fri, 20 Feb 2015 22:46:35 +0900 Ryusuke Konishi 
 konishi.ryus...@lab.ntt.co.jp wrote:
 
 Each inode of nilfs2 stores a root node of a b-tree, and it turned out
 to have a memory overrun issue:
 
 Each b-tree node of nilfs2 stores a set of key-value pairs and the
 number of them (in bn_nchildren member of nilfs_btree_node struct),
 as well as a few other bn_* members.
 
 Since the value of bn_nchildren is used for operations on the
 key-values within the b-tree node, it can cause memory access overrun
 if a large number is incorrectly set to bn_nchildren.
 
 For instance, nilfs_btree_node_lookup() function determines the range
 of binary search with it, and too large bn_nchildren leads
 nilfs_btree_node_get_key() in that function to overrun.
 
 As for intermediate b-tree nodes, this is prevented by a sanity check
 performed when each node is read from a drive, however, no sanity
 check has been done for root nodes stored in inodes.
 
 This patch fixes the issue by adding missing sanity check against
 b-tree root nodes so that it's called when on-memory inodes are read
 from ifile, inode metadata file.
 
 How would one trigger this overrun?  Mount an fs with a deliberately
 corrupted/inconsistent fs image?

Yes, this can be triggered by mounting an fs with a corrupted image
deliberately or by chance.

 Memory overrun sounds nasty so I'm thinking we add cc:stable to this
 one.  OK?

Agreed.

Ryusuke Konishi
--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] nilfs2: fix sanity check of btree level in nilfs_btree_root_broken()

2015-04-29 Thread Ryusuke Konishi
The range check for b-tree level parameter in nilfs_btree_root_broken()
is wrong; it accepts the case of level == NILFS_BTREE_LEVEL_MAX even
though the level is limited to values in the range of 0 to
(NILFS_BTREE_LEVEL_MAX - 1).

Since the level parameter is read from storage device and used to index
nilfs_btree_path array whose element count is NILFS_BTREE_LEVEL_MAX, it
can cause memory overrun during btree operations if the boundary value
is set to the level parameter on device.

This fixes the broken sanity check and adds a comment to clarify that
the upper bound NILFS_BTREE_LEVEL_MAX is exclusive.

Signed-off-by: Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp
Cc: sta...@vger.kernel.org
---
 fs/nilfs2/btree.c | 2 +-
 include/linux/nilfs2_fs.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 059f371..919fd5b 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -388,7 +388,7 @@ static int nilfs_btree_root_broken(const struct 
nilfs_btree_node *node,
nchildren = nilfs_btree_node_get_nchildren(node);
 
if (unlikely(level  NILFS_BTREE_LEVEL_NODE_MIN ||
-level  NILFS_BTREE_LEVEL_MAX ||
+level = NILFS_BTREE_LEVEL_MAX ||
 nchildren  0 ||
 nchildren  NILFS_BTREE_ROOT_NCHILDREN_MAX)) {
pr_crit(NILFS: bad btree root (inode number=%lu): level = %d, 
flags = 0x%x, nchildren = %d\n,
diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h
index ff3fea3..9abb763 100644
--- a/include/linux/nilfs2_fs.h
+++ b/include/linux/nilfs2_fs.h
@@ -460,7 +460,7 @@ struct nilfs_btree_node {
 /* level */
 #define NILFS_BTREE_LEVEL_DATA  0
 #define NILFS_BTREE_LEVEL_NODE_MIN  (NILFS_BTREE_LEVEL_DATA + 1)
-#define NILFS_BTREE_LEVEL_MAX   14
+#define NILFS_BTREE_LEVEL_MAX   14 /* Max level (exclusive) */
 
 /**
  * struct nilfs_palloc_group_desc - block group descriptor
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 4/9] nilfs2: add kmem_cache for SUFILE cache nodes

2015-05-09 Thread Ryusuke Konishi
On Sat, 09 May 2015 21:10:21 +0200, Andreas Rohner wrote:
 On 2015-05-09 04:41, Ryusuke Konishi wrote:
 On Sun,  3 May 2015 12:05:17 +0200, Andreas Rohner wrote:
 +static void nilfs_sufile_cache_node_init_once(void *obj)
 +{
 +   memset(obj, 0, sizeof(struct nilfs_sufile_cache_node));
 +}
 +
 
 Note that nilfs_sufile_cache_node_init_once() is only called when each
 cache entry is allocated first time.  It doesn't ensure each cache
 entry is clean when it will be allocated with kmem_cache_alloc()
 the second time and afterwards.
 
 I kind of assumed it would be called for every object returned by
 kmem_cache_alloc(). In that case I have to do the initialization in
 nilfs_sufile_alloc_cache_node() and remove this function.
 
 Regards,
 Andreas Rohner

You can use kmem_cache_zalloc() instead in that case.

Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] NILFS2: support NFSv2 export

2015-05-10 Thread Ryusuke Konishi
On Fri, 08 May 2015 10:16:23 +1000, NeilBrown ne...@suse.de wrote:
 The fh_len passed to -fh_to_* is not guaranteed to be that same as
 that returned by encode_fh - it may be larger.
 
 With NFSv2, the filehandle is fixed length, so it may appear longer
 than expected and be zero-padded.
 
 So we must test that fh_len is at least some value, not exactly equal
 to it.
 
 Signed-off-by: NeilBrown ne...@suse.de
 ---
  fs/nilfs2/namei.c |6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)
 
 diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
 index 22180836ec22..b65fb79d16fd 100644
 --- a/fs/nilfs2/namei.c
 +++ b/fs/nilfs2/namei.c
 @@ -496,8 +496,8 @@ static struct dentry *nilfs_fh_to_dentry(struct 
 super_block *sb, struct fid *fh,
  {
   struct nilfs_fid *fid = (struct nilfs_fid *)fh;
  
 - if ((fh_len != NILFS_FID_SIZE_NON_CONNECTABLE 
 -  fh_len != NILFS_FID_SIZE_CONNECTABLE) ||

 + if ((fh_len  NILFS_FID_SIZE_NON_CONNECTABLE 
 +  fh_len  NILFS_FID_SIZE_CONNECTABLE) ||
   (fh_type != FILEID_NILFS_WITH_PARENT 
fh_type != FILEID_NILFS_WITHOUT_PARENT))
   return NULL;

A bit weird.  fh_len  NILFS_FID_SIZE_CONNECTABLE implies fh_len 
NILFS_FID_SIZE_NON_CONNECTABLE.

How about the following fix ?

if ((fh_type != FILEID_NILFS_WITH_PARENT ||
 fh_len  NILFS_FID_SIZE_CONNECTABLE) 
(fh_type != FILEID_NILFS_WITHOUT_PARENT ||
 fh_len  NILFS_FID_SIZE_NON_CONNECTABLE))
return NULL;

Regards,
Ryusuke Konishi

 @@ -510,7 +510,7 @@ static struct dentry *nilfs_fh_to_parent(struct 
 super_block *sb, struct fid *fh,
  {
   struct nilfs_fid *fid = (struct nilfs_fid *)fh;
  
 - if (fh_len != NILFS_FID_SIZE_CONNECTABLE ||
 + if (fh_len  NILFS_FID_SIZE_CONNECTABLE ||
   fh_type != FILEID_NILFS_WITH_PARENT)
   return NULL;
  
 
 
 --
 To unsubscribe from this list: send the line unsubscribe linux-nilfs in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 1/9] nilfs2: copy file system feature flags to the nilfs object

2015-05-08 Thread Ryusuke Konishi
On Sun,  3 May 2015 12:05:14 +0200, Andreas Rohner wrote:
 This patch adds three new attributes to the nilfs object, which contain
 a copy of the feature flags from the super block. This can be used, to
 efficiently test whether file system feature flags are set or not.
 
 Signed-off-by: Andreas Rohner andreas.roh...@gmx.net
 ---
  fs/nilfs2/the_nilfs.c | 4 
  fs/nilfs2/the_nilfs.h | 8 
  2 files changed, 12 insertions(+)
 
 diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
 index 69bd801..606fdfc 100644
 --- a/fs/nilfs2/the_nilfs.c
 +++ b/fs/nilfs2/the_nilfs.c
 @@ -630,6 +630,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct 
 super_block *sb, char *data)
   get_random_bytes(nilfs-ns_next_generation,
sizeof(nilfs-ns_next_generation));
  
 + nilfs-ns_feature_compat = le64_to_cpu(sbp-s_feature_compat);
 + nilfs-ns_feature_compat_ro = le64_to_cpu(sbp-s_feature_compat_ro);
 + nilfs-ns_feature_incompat = le64_to_cpu(sbp-s_feature_incompat);

Consider moving these initialization to just before calling
nilfs_check_feature_compatibility().

It uses compat flags, and I'd like to unfold the function using these
internal variables sometime.

 +
   err = nilfs_store_disk_layout(nilfs, sbp);
   if (err)
   goto failed_sbh;
 diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
 index 23778d3..12cd91d 100644
 --- a/fs/nilfs2/the_nilfs.h
 +++ b/fs/nilfs2/the_nilfs.h
 @@ -101,6 +101,9 @@ enum {
   * @ns_dev_kobj: /sys/fs/nilfs/device
   * @ns_dev_kobj_unregister: completion state
   * @ns_dev_subgroups: device subgroups pointer
 + * @ns_feature_compat: Compatible feature set
 + * @ns_feature_compat_ro: Read-only compatible feature set
 + * @ns_feature_incompat: Incompatible feature set
   */
  struct the_nilfs {
   unsigned long   ns_flags;
 @@ -201,6 +204,11 @@ struct the_nilfs {
   struct kobject ns_dev_kobj;
   struct completion ns_dev_kobj_unregister;
   struct nilfs_sysfs_dev_subgroups *ns_dev_subgroups;
 +
 + /* Features */
 + __u64   ns_feature_compat;
 + __u64   ns_feature_compat_ro;
 + __u64   ns_feature_incompat;
  };
  
  #define THE_NILFS_FNS(bit, name) \
 -- 
 2.3.7
 
 --
 To unsubscribe from this list: send the line unsubscribe linux-nilfs in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/2] nilfs2: fix issues with nilfs_set_inode_flags()

2015-04-08 Thread Ryusuke Konishi
Hi Andrew,

Please queue the following changes for the next merge window:

Ryusuke Konishi (2):
  nilfs2: put out gfp mask manipulation from nilfs_set_inode_flags()
  nilfs2: use inode_set_flags() in nilfs_set_inode_flags()

These fix issues related to nilfs_set_inode_flags() function.

Thanks,
Ryusuke Konishi
--

 fs/nilfs2/inode.c | 19 ++-
 1 file changed, 10 insertions(+), 9 deletions(-)

--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] nilfs2: use inode_set_flags() in nilfs_set_inode_flags()

2015-04-08 Thread Ryusuke Konishi
Use inode_set_flags() to atomically set i_flags instead of clearing
out the S_IMMUTABLE, S_APPEND, etc. flags and then setting them from
the FS_IMMUTABLE_FL, FS_APPEND_FL flags to avoid a race where an
immutable file has the immutable flag cleared for a brief window of
time.

This is a similar fix to commit 5f16f3225b06 (ext4: atomically set
inode-i_flags in ext4_set_inode_flags()).

Signed-off-by: Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp
Cc: Theodore Ts'o ty...@mit.edu
---
 fs/nilfs2/inode.c | 15 ---
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 0c28ccb..72c7fbf 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -443,19 +443,20 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t 
mode)
 void nilfs_set_inode_flags(struct inode *inode)
 {
unsigned int flags = NILFS_I(inode)-i_flags;
+   unsigned int new_fl = 0;
 
-   inode-i_flags = ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
-   S_DIRSYNC);
if (flags  FS_SYNC_FL)
-   inode-i_flags |= S_SYNC;
+   new_fl |= S_SYNC;
if (flags  FS_APPEND_FL)
-   inode-i_flags |= S_APPEND;
+   new_fl |= S_APPEND;
if (flags  FS_IMMUTABLE_FL)
-   inode-i_flags |= S_IMMUTABLE;
+   new_fl |= S_IMMUTABLE;
if (flags  FS_NOATIME_FL)
-   inode-i_flags |= S_NOATIME;
+   new_fl |= S_NOATIME;
if (flags  FS_DIRSYNC_FL)
-   inode-i_flags |= S_DIRSYNC;
+   new_fl |= S_DIRSYNC;
+   inode_set_flags(inode, new_fl, S_SYNC | S_APPEND | S_IMMUTABLE |
+   S_NOATIME | S_DIRSYNC);
 }
 
 int nilfs_read_inode_common(struct inode *inode,
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] nilfs2: fix gcc warning at nilfs_checkpoint_is_mounted()

2015-04-02 Thread Ryusuke Konishi
Fix the following build warning:

 fs/nilfs2/super.c: In function 'nilfs_checkpoint_is_mounted':
 fs/nilfs2/super.c:1023:10: warning: comparison of unsigned expression  0 is 
always false [-Wtype-limits]
   if (cno  0 || cno  nilfs-ns_cno)
   ^

This warning indicates that the comparision cno  0 is useless
because variable cno has an unsigned integer type __u64.

Reported-by: David Binderman dcb...@hotmail.com
Signed-off-by: Ryusuke Konishi konishi.ryus...@lab.ntt.co.jp
---
 fs/nilfs2/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 5bc2a1c..c1725f20 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1020,7 +1020,7 @@ int nilfs_checkpoint_is_mounted(struct super_block *sb, 
__u64 cno)
struct dentry *dentry;
int ret;
 
-   if (cno  0 || cno  nilfs-ns_cno)
+   if (cno  nilfs-ns_cno)
return false;
 
if (cno = nilfs_last_cno(nilfs))
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 9/9] nilfs2: prevent starvation of segments protected by snapshots

2015-05-20 Thread Ryusuke Konishi
On Wed, 20 May 2015 23:43:35 +0900 (JST), Ryusuke Konishi wrote:
 On Sun,  3 May 2015 12:05:22 +0200, Andreas Rohner wrote:
 It doesn't really matter if the number of reclaimable blocks for a
 segment is inaccurate, as long as the overall performance is better than
 the simple timestamp algorithm and starvation is prevented.
 
 The following steps will lead to starvation of a segment:
 
 1. The segment is written
 2. A snapshot is created
 3. The files in the segment are deleted and the number of live
blocks for the segment is decremented to a very low value
 4. The GC tries to free the segment, but there are no reclaimable
blocks, because they are all protected by the snapshot. To prevent an
infinite loop the GC has to adjust the number of live blocks to the
correct value.
 5. The snapshot is converted to a checkpoint and the blocks in the
segment are now reclaimable.
 6. The GC will never attempt to clean the segment again, because it
looks as if it had a high number of live blocks.
 
 To prevent this, the already existing padding field of the SUFILE entry
 is used to track the number of snapshot blocks in the segment. This
 number is only set by the GC, since it collects the necessary
 information anyway. So there is no need, to track which block belongs to
 which segment. In step 4 of the list above the GC will set the new field
 su_nsnapshot_blks. In step 5 all entries in the SUFILE are checked and
 entries with a big su_nsnapshot_blks field get their su_nlive_blks field
 reduced.
 
 Signed-off-by: Andreas Rohner andreas.roh...@gmx.net
 
 I still don't know whether this workaround is the way we should take
 or not.  This patch has several drawbacks:
 
  1. It introduces overheads to every chcp cp operation
 due to traversal rewrite of sufile.
 If the ratio of snapshot protected blocks is high, then
 this overheads will be big.
 
  2. The traversal rewrite of sufile will causes many sufile blocks will be
 written out.   If most blocks are protected by a snapshot,
 more than 4MB of sufile blocks will be written per 1TB capacity.
 
 Even though this rewrite may not happen for contiguous chcp cp
 operations, it still has potential for creating sufile write blocks
 if the application of nilfs manipulates snapshots frequently.
 
  3. The ratio of the threshold max_segblks is hard coded to 50%
 of blocks_per_segment.  It is not clear if the ratio is good
 (versatile).
 
 I will add comments inline below.
 
 ---
  fs/nilfs2/ioctl.c  | 50 +++-
  fs/nilfs2/sufile.c | 85 
 ++
  fs/nilfs2/sufile.h |  3 ++
  3 files changed, 137 insertions(+), 1 deletion(-)
 
 diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
 index 40bf74a..431725f 100644
 --- a/fs/nilfs2/ioctl.c
 +++ b/fs/nilfs2/ioctl.c
 @@ -200,6 +200,49 @@ static int nilfs_ioctl_getversion(struct inode *inode, 
 void __user *argp)
  }
  
  /**
 + * nilfs_ioctl_fix_starving_segs - fix potentially starving segments
 + * @nilfs: nilfs object
 + * @inode: inode object
 + *
 + * Description: Scans for segments, which are potentially starving and
 + * reduces the number of live blocks to less than half of the maximum
 + * number of blocks in a segment. This requires a scan of the whole SUFILE,
 + * which can take a long time on certain devices and under certain 
 conditions.
 + * To avoid blocking other file system operations for too long the SUFILE is
 + * scanned in steps of NILFS_SUFILE_STARVING_SEGS_STEP. After each step the
 + * locks are released and cond_resched() is called.
 + *
 + * Return Value: On success, 0 is returned and on error, one of the
 + * following negative error codes is returned.
 + *
 + * %-EIO - I/O error.
 + *
 + * %-ENOMEM - Insufficient amount of memory available.
 + */
 
 +static int nilfs_ioctl_fix_starving_segs(struct the_nilfs *nilfs,
 + struct inode *inode) {
 
 This inode argument is meaningless for this routine.
 Consider passing sb instead.
 
 I feel odd for the function name fix starving segs.  It looks to
 give a workaround rather than solve the root problem of gc in nilfs.
 It looks like what this patch is doing, is calibrating live block
 count.
 
 +struct nilfs_transaction_info ti;
 
 +unsigned long i, nsegs = nilfs_sufile_get_nsegments(nilfs-ns_sufile);
 
 nsegs is set outside the transaction lock.
 
 Since the file system can be resized (both shrinked or extended)
 outside the lock, nsegs must be initialized or updated in the
 section where the tranaction lock is held.
 
 +int ret = 0;
 +
 +for (i = 0; i  nsegs; i += NILFS_SUFILE_STARVING_SEGS_STEP) {
 +nilfs_transaction_begin(inode-i_sb, ti, 0);
 +
 +ret = nilfs_sufile_fix_starving_segs(nilfs-ns_sufile, i,
 +NILFS_SUFILE_STARVING_SEGS_STEP);
 +if (unlikely(ret  0)) {
 +nilfs_transaction_abort

Re: NILFS2: double uuid

2015-06-08 Thread Ryusuke Konishi

(CCed to linux-nilfs@vger.kernel.org)
Hi Heinz,

On 2015/06/08 15:43, Heinz Diehl wrote:

Hi,

a nilfs2 formatted disk fails to mount via fstab due to double uuid's.
See lsblk output below. The logs indicate that the system attempts to
mount /dev/sdb rather than /dev/sdb1, which of course fails. In
addition, /dev/sdb should not have any uuid at all. Don't know why
that happens.

The phenomenon is easily reproducible: format a partition with nilfs2,
register it with the proper uuid in fstab and reboot. Tried both with
USB memory and real HDD.

[root@keera ~]# lsblk -f
NAMEFSTYPE LABEL UUID MOUNTPOINT
sdb  ff17dda9-fcae-42e7-a438-9087de58902e
`-sdb1  xfs  ff17dda9-fcae-42e7-a438-9087de58902e

Thanks,
  Heinz


On 2015/06/08 15:49, Heinz Diehl wrote:
 On 08.06.2015, Heinz Diehl wrote:

 [root@keera ~]# lsblk -f
 NAMEFSTYPE LABEL UUID MOUNTPOINT
 sdb  ff17dda9-fcae-42e7-a438-9087de58902e
 `-sdb1  xfs  ff17dda9-fcae-42e7-a438-9087de58902e

 Copy error: replace xfs with nilfs2. Sorry!

I couldn't reproduce the issue (in a CentOS 7 environment).

Could you tell us the version information of distro,
lsblk, libblkid, nilfs-utils, and kernel you are using ?

The following is an example of mine:

$ lsblk -f
NAME   FSTYPE LABEL  UUID MOUNTPOINT
sda
└─sda1 nilfs29dcd01c0-2bc8-41bf-a400-8ad8755aac6a
$ lsblk --version
lsblk from util-linux 2.23.2

$ lscp -V
lscp (nilfs-utils 2.2.3)

$ rpm -q libblkid util-linux
libblkid-2.23.2-22.el7_1.x86_64
util-linux-2.23.2-22.el7_1.x86_64

$ uname -r
4.1.0-rc7


Regards,
Ryusuke Konishi

--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: NILFS2: double uuid

2015-06-09 Thread Ryusuke Konishi
On Tue, 9 Jun 2015 16:07:42 +0200, Karel Zak k...@redhat.com wrote:
 On Tue, Jun 09, 2015 at 10:04:15PM +0900, Ryusuke Konishi wrote:
 $ sudo nilfs-resize -y /dev/sdb1 1G
 Partition size = 2146435072 bytes.
 Shrink the filesystem size from 2146435072 bytes to 1073741824 bytes.
 128 segments will be truncated from segnum 127.
 Moving 103 in-use segments.
 progress |***|
 Done.
 
 $ sudo umount /test
 $ sudo mount /dev/sdb1 /test
 $ sudo LD_LIBRARY_PATH=/usr/local/lib lsblk -f
 NAME   FSTYPE  LABEL   UUID MOUNTPOINT
 [...]
 sdb
 `-sdb1  /test
 
 This blank state continued until I shrank the partition or
 re-extended the filesystem to the partition size.
 
 Could you consider confining the s_dev_size test only to the
 backup superblock ?
 
 Hmm... why nilfs-resize does not update the size in the superblock?
 It seems like nilfs-resize bug.

nilfs-resize (to be exact, RESIZE ioctl of nilfs2) updates s_dev_size
in both superblocks.  What nilfs-resize doesn't change is the
partition size.  (It needs help of a partitioning tool)

 
 It seems that we don't have to drop the primary super block
 even if s_dev_size doesn't fit to the partition size.
 
 Yes, fixed. I have also enabled the s_dev_size check for whole-disk
 devices only to minimize number of situations when we rely on the
 s_dev_size.
 
 Karel

Thanks again.  The updated libblkid/lsblk works frawlessly.

Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: NILFS2: double uuid

2015-06-09 Thread Ryusuke Konishi

Hi,

On 2015/06/09 17:53, Karel Zak wrote:

On Tue, Jun 09, 2015 at 12:31:27AM +0900, Ryusuke Konishi wrote:

It looks like the backup super block should be dropped from candidates
if its device size (sbp-s_dev_size) doesn't match the partition size.


Yeah, fixed:
http://git.kernel.org/cgit/utils/util-linux/util-linux.git/commit/?id=00817742ce360119e079a33e12cf84118ff7c63e

Note that workaround is to not use nilfs2 on the last partition or
have a tiny gap (1 sector is enough) between last partition and the
end of the whole-disk.

 Karel



Thanks for your quick work!

I tested the patch.  It almost worked fine.
One issue I found is a transient state after fs-resizing.

After shrinking the file system, both superblocks dropped and
lsblk failed to detect the filesystem:

$ sudo LD_LIBRARY_PATH=/usr/local/lib lsblk -f
NAME   FSTYPE  LABEL   UUID 
MOUNTPOINT

[...]
sdb
`-sdb1 nilfs2  2d7cd130-82a0-4a3c-b8a8-4ac5a26f5703 /test

$ sudo nilfs-resize -y /dev/sdb1 1G
Partition size = 2146435072 bytes.
Shrink the filesystem size from 2146435072 bytes to 1073741824 bytes.
128 segments will be truncated from segnum 127.
Moving 103 in-use segments.
progress |***|
Done.

$ sudo umount /test
$ sudo mount /dev/sdb1 /test
$ sudo LD_LIBRARY_PATH=/usr/local/lib lsblk -f
NAME   FSTYPE  LABEL   UUID 
MOUNTPOINT

[...]
sdb
`-sdb1  /test

This blank state continued until I shrank the partition or
re-extended the filesystem to the partition size.

Could you consider confining the s_dev_size test only to the
backup superblock ?

It seems that we don't have to drop the primary super block
even if s_dev_size doesn't fit to the partition size.


Regards,
Ryusuke Konishi

--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 9/9] nilfs2: prevent starvation of segments protected by snapshots

2015-05-31 Thread Ryusuke Konishi
On Fri, 22 May 2015 20:10:05 +0200, Andreas Rohner wrote:
 On 2015-05-20 16:43, Ryusuke Konishi wrote:
 On Sun,  3 May 2015 12:05:22 +0200, Andreas Rohner wrote:
 It doesn't really matter if the number of reclaimable blocks for a
 segment is inaccurate, as long as the overall performance is better than
 the simple timestamp algorithm and starvation is prevented.

 The following steps will lead to starvation of a segment:

 1. The segment is written
 2. A snapshot is created
 3. The files in the segment are deleted and the number of live
blocks for the segment is decremented to a very low value
 4. The GC tries to free the segment, but there are no reclaimable
blocks, because they are all protected by the snapshot. To prevent an
infinite loop the GC has to adjust the number of live blocks to the
correct value.
 5. The snapshot is converted to a checkpoint and the blocks in the
segment are now reclaimable.
 6. The GC will never attempt to clean the segment again, because it
looks as if it had a high number of live blocks.

 To prevent this, the already existing padding field of the SUFILE entry
 is used to track the number of snapshot blocks in the segment. This
 number is only set by the GC, since it collects the necessary
 information anyway. So there is no need, to track which block belongs to
 which segment. In step 4 of the list above the GC will set the new field
 su_nsnapshot_blks. In step 5 all entries in the SUFILE are checked and
 entries with a big su_nsnapshot_blks field get their su_nlive_blks field
 reduced.

 Signed-off-by: Andreas Rohner andreas.roh...@gmx.net
 
 I still don't know whether this workaround is the way we should take
 or not.  This patch has several drawbacks:
 
  1. It introduces overheads to every chcp cp operation
 due to traversal rewrite of sufile.
 If the ratio of snapshot protected blocks is high, then
 this overheads will be big.
 
  2. The traversal rewrite of sufile will causes many sufile blocks will be
 written out.   If most blocks are protected by a snapshot,
 more than 4MB of sufile blocks will be written per 1TB capacity.
 
 Even though this rewrite may not happen for contiguous chcp cp
 operations, it still has potential for creating sufile write blocks
 if the application of nilfs manipulates snapshots frequently.
 
 I could also implement this functionality in nilfs_cleanerd in
 userspace. Every time a chcp cp happens some kind of permanent flag
 like snapshot_was_recently_deleted is set at an appropriate location.
 The flag could be returned with GET_SUSTAT ioctl(). Then nilfs_cleanerd
 would, at certain intervals and if the flag is set, check all segments
 with GET_SUINFO ioctl() and set the ones that have potentially invalid
 values with SET_SUINFO ioctl(). After that it would clear the
 snapshot_was_recently_deleted flag. What do you think about this idea?

Sorry for my late reply.

I think moving the functionality to cleanerd and notifying some sort
of information to userland through ioctl for that, is a good idea
except that I feel the ioctl should be GET_CPSTAT instead of
GET_SUINFO because it's checkpoint/snapshot related information.

I think the parameter that should be added is a set of statistics
information including the number of deleted snapshots since the file
system was mounted last (1).  The counter (1) can serve as the
snapshot_was_recently_deleted flag if it monotonically increases.
Although we can use timestamp of when a snapshot was deleted last
time, it's not preferable than the counter (1) because the system
clock may be rewinded and it also has an issue related to precision.

Note that we must add GET_CPSTAT_V2 (or GET_SUSTAT_V2) and the
corresponding structure (i.e. nilfs_cpstat_v2, or so) since ioctl
codes depend on the size of argument data and it will be changed in
both ioctls; unfortunately, neither GET_CPSTAT nor GET_SUSTAT ioctl is
expandable.  Some ioctls like EVIOCGKEYCODE_V2 will be a reference for
this issue.

 
 If the policy is timestamp the GC would of course skip this scan,
 because it is unnecessary.
 
  3. The ratio of the threshold max_segblks is hard coded to 50%
 of blocks_per_segment.  It is not clear if the ratio is good
 (versatile).
 
 The interval and percentage could be set in /etc/nilfs_cleanerd.conf.
 
 I chose 50% kind of arbitrarily. My intent was to encourage the GC to
 check the segment again in the future. I guess anything between 25% and
 75% would also work.

Sound reasonable.

By the way, I am thinking we should move cleanerd into kernel as soon
as we can.  It's not only inefficient due to a large amount of data
exchange between kernel and user-land, but also is hindering changes
like we are trying.  We have to care compatibility unnecessarily due
to the early design mistake (i.e. the separation of gc to user-land).

Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message

Re: [PATCH v2 9/9] nilfs2: prevent starvation of segments protected by snapshots

2015-05-31 Thread Ryusuke Konishi
On Sun, 31 May 2015 20:13:44 +0200, Andreas Rohner wrote:
 On 2015-05-31 18:45, Ryusuke Konishi wrote:
 On Fri, 22 May 2015 20:10:05 +0200, Andreas Rohner wrote:
 On 2015-05-20 16:43, Ryusuke Konishi wrote:
 On Sun,  3 May 2015 12:05:22 +0200, Andreas Rohner wrote:
[...]
  3. The ratio of the threshold max_segblks is hard coded to 50%
 of blocks_per_segment.  It is not clear if the ratio is good
 (versatile).

 The interval and percentage could be set in /etc/nilfs_cleanerd.conf.

 I chose 50% kind of arbitrarily. My intent was to encourage the GC to
 check the segment again in the future. I guess anything between 25% and
 75% would also work.
 
 Sound reasonable.
 
 By the way, I am thinking we should move cleanerd into kernel as soon
 as we can.  It's not only inefficient due to a large amount of data
 exchange between kernel and user-land, but also is hindering changes
 like we are trying.  We have to care compatibility unnecessarily due
 to the early design mistake (i.e. the separation of gc to user-land).
 
 I am a bit confused. Is it OK if I implement this functionality in
 nilfs_cleanerd for this patch set, or would it be better to implement it
 with a workqueue in the kernel, like you've suggested before?
 
 If you intend to move nilfs_cleanerd into the kernel anyway, then the
 latter would make more sense to me. Which implementation do you prefer
 for this patch set?

If nilfs_cleanerd will remain in userland, then the userland
implementation looks better.  But, yes, if we will move the cleaner
into kernel, then the kernel implementation looks better because we
may be able to avoid unnecessary API change.  It's a dilemma.

Do you have any good idea to reduce or hide overhead of the
calibration (i.e. traversal rewrite of sufile) in regard to the kernel
implementation ?
I'm inclined to leave that in kernel for now.

Regards,
Ryusuke Konishi

 
 Regards,
 Andreas Rohner
 
 Regards,
 Ryusuke Konishi
 --
 To unsubscribe from this list: send the line unsubscribe linux-nilfs in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 
 
 --
 To unsubscribe from this list: send the line unsubscribe linux-nilfs in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 7/9] nilfs2: ensure that all dirty blocks are written out

2015-05-31 Thread Ryusuke Konishi
On Sun, 10 May 2015 13:04:18 +0200, Andreas Rohner wrote:
 On 2015-05-09 14:17, Ryusuke Konishi wrote:
 On Sun,  3 May 2015 12:05:20 +0200, Andreas Rohner wrote:
[...]
 
 Uum. This still looks to have potential for leak of dirty block
 collection between DAT and SUFILE since this retry is limited by
 the fixed retry count.
 
 How about adding function temporarily turning off the live block
 tracking and using it after this propagation loop until log write
 finishes ?
 
 It would reduce the accuracy of live block count, but is it enough ?
 How do you think ?  We have to eliminate the possibility of the leak
 because it can cause file system corruption.  Every checkpoint must be
 self-contained.
 
 How exactly could it lead to file system corruption? Maybe I miss
 something important here, but it seems to me, that no corruption is
 possible.
 
 The nilfs_sufile_flush_cache_node() function only reads in already
 existing blocks. No new blocks are created. If I mark those blocks
 dirty, the btree is not changed at all. If I do not call
 nilfs_bmap_propagate(), then the btree stays unchanged and there are no
 dangling pointers. The resulting checkpoint should be self-contained.

Good point.  As for btree, it looks like no inconsistency issue arises
since nilfs_sufile_flush_cache_node() never inserts new blocks as you
pointed out.  Even though we also must care inconsistency between
sufile header and sufile data blocks, and block count in inode as
well, fortunately these look to be ok, too.

However, I still think it's not good to carry over dirty blocks to the
next segment construction to avoid extra checkpoint creation and to
simplify things.

From this viewpoint, I also prefer that nilfs_sufile_flush_cache() and
nilfs_sufile_flush_cache_node() are changed a bit so that they will
skip adjusting su_nlive_blks and su_nlive_lastmod if the sufile block
that includes the segment usage is not marked dirty and only_mark == 0
as well as turing off live block counting temporarily after the
sufile/DAT propagation loop.

 
 The only problem would be, that I could lose some nlive_blks updates.
 

Regards,
Ryusuke Konishi
--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/1] NILFS2: support NFSv2 export

2015-05-26 Thread Ryusuke Konishi
Hi Andrew,

please queue the following patch for the next merge window.  It fixes
an NFSv2 related issue reported in:

[1] http://marc.info/?l=linux-fsdevelm=143104630128997
[PATCH 0/3] make BTRFS, UDF, NILFS2 work with NFSv2.

Thanks,
Ryusuke Konishi
--
NeilBrown (1):
  NILFS2: support NFSv2 export

 fs/nilfs2/namei.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)


--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/9 linux-next] nilfs2: remove dir_pages() declaration

2015-05-24 Thread Ryusuke Konishi
On Sun, 24 May 2015 17:19:42 +0200, Fabian Frederick f...@skynet.be wrote:
 dir_pages() is now declared in pagemap.h
 
 Signed-off-by: Fabian Frederick f...@skynet.be
 ---
  fs/nilfs2/dir.c | 5 -
  1 file changed, 5 deletions(-)
 
 diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
 index 0ee0bed..6b8b92b 100644
 --- a/fs/nilfs2/dir.c
 +++ b/fs/nilfs2/dir.c
 @@ -61,11 +61,6 @@ static inline void nilfs_put_page(struct page *page)
   page_cache_release(page);
  }
  
 -static inline unsigned long dir_pages(struct inode *inode)
 -{
 - return (inode-i_size+PAGE_CACHE_SIZE-1)PAGE_CACHE_SHIFT;
 -}
 -

Can you include this and similar changes in the first patch
pagemap.h: declare dir_pages() ?

The first patch transiently breaks build because it inserts a
duplicate definition of the dir_pages() inline function until it gets
removed from each file system by the successive patches.

This series looks non-divisible except the patch of ufs.

Regards,
Ryusuke Konishi

  /*
   * Return the offset into page `page_nr' of the last valid
   * byte in that page, plus one.
 -- 
 2.4.1
 
 --
 To unsubscribe from this list: send the line unsubscribe linux-kernel in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 Please read the FAQ at  http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 9/9] nilfs2: prevent starvation of segments protected by snapshots

2015-05-20 Thread Ryusuke Konishi
;
 + }
 +
 + nilfs_transaction_commit(inode-i_sb); /* never fails */
 + cond_resched();
 + }
 +
 + return ret;
 +}
 +
 +/**
   * nilfs_ioctl_change_cpmode - change checkpoint mode (checkpoint/snapshot)
   * @inode: inode object
   * @filp: file object
 @@ -224,7 +267,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, 
 struct file *filp,
   struct the_nilfs *nilfs = inode-i_sb-s_fs_info;
   struct nilfs_transaction_info ti;
   struct nilfs_cpmode cpmode;
 - int ret;
 + int ret, is_snapshot;
  
   if (!capable(CAP_SYS_ADMIN))
   return -EPERM;
 @@ -240,6 +283,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, 
 struct file *filp,
   mutex_lock(nilfs-ns_snapshot_mount_mutex);
  
   nilfs_transaction_begin(inode-i_sb, ti, 0);
 + is_snapshot = nilfs_cpfile_is_snapshot(nilfs-ns_cpfile, cpmode.cm_cno);
   ret = nilfs_cpfile_change_cpmode(
   nilfs-ns_cpfile, cpmode.cm_cno, cpmode.cm_mode);
   if (unlikely(ret  0))
 @@ -248,6 +292,10 @@ static int nilfs_ioctl_change_cpmode(struct inode 
 *inode, struct file *filp,
   nilfs_transaction_commit(inode-i_sb); /* never fails */
  
   mutex_unlock(nilfs-ns_snapshot_mount_mutex);
 +

 + if (is_snapshot  0  cpmode.cm_mode == NILFS_CHECKPOINT 
 + nilfs_feature_track_live_blks(nilfs))
 + ret = nilfs_ioctl_fix_starving_segs(nilfs, inode);

Should we use this return value ?
This doesn't relate to the success and failure of chcp operation.

nilfs_ioctl_fix_starving_segs() is called every time chcp cp is
called.  I prefer to delay this extra work with a workqueue and to
skip starting a new work if the previous work is still running.

  out:
   mnt_drop_write_file(filp);
   return ret;
 diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
 index 9cd8820d..47e2c05 100644
 --- a/fs/nilfs2/sufile.c
 +++ b/fs/nilfs2/sufile.c
 @@ -1215,6 +1215,91 @@ out_sem:
  }
  
  /**
 + * nilfs_sufile_fix_starving_segs - fix potentially starving segments
 + * @sufile: inode of segment usage file
 + * @segnum: segnum to start
 + * @nsegs: number of segments to check
 + *
 + * Description: Scans for segments, which are potentially starving and
 + * reduces the number of live blocks to less than half of the maximum
 + * number of blocks in a segment. This way the segment is more likely to be
 + * chosen by the GC. A segment is marked as potentially starving, if more
 + * than half of the blocks it contains are protected by snapshots.
 + *
 + * Return Value: On success, 0 is returned and on error, one of the
 + * following negative error codes is returned.
 + *
 + * %-EIO - I/O error.
 + *
 + * %-ENOMEM - Insufficient amount of memory available.
 + */
 +int nilfs_sufile_fix_starving_segs(struct inode *sufile, __u64 segnum,
 +__u64 nsegs)
 +{
 + struct buffer_head *su_bh;
 + struct nilfs_segment_usage *su;
 + size_t n, i, susz = NILFS_MDT(sufile)-mi_entry_size;
 + struct the_nilfs *nilfs = sufile-i_sb-s_fs_info;
 + void *kaddr;
 + unsigned long maxnsegs, segusages_per_block;
 + __u32 max_segblks = nilfs-ns_blocks_per_segment  1;
 + int ret = 0, blkdirty, dirty = 0;
 +
 + down_write(NILFS_MDT(sufile)-mi_sem);
 +

 + maxnsegs = nilfs_sufile_get_nsegments(sufile);
 + segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile);
 + nsegs += segnum;
 + if (nsegs  maxnsegs)
 + nsegs = maxnsegs;
 +
 + while (segnum  nsegs) {

This local variable nsegs is used as an (exclusive) end segment number.
It's confusing.   You should define end variable separately.
It can be simply calculated by:

end = min_t(__u64, segnum + nsegs, nilfs_sufile_get_nsegments(sufile));

(maxnsegs can be removed.)

Note that the evaluation of each argument will never be done twice in
min_t() macro since min_t() temporarily stores the evaluation results
to hidden local variables and uses them for comparison.

Regards,
Ryusuke Konishi


 + n = nilfs_sufile_segment_usages_in_block(sufile, segnum,
 +  nsegs - 1);
 +
 + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum,
 +0, su_bh);
 + if (ret  0) {
 + if (ret != -ENOENT)
 + goto out;
 + /* hole */
 + segnum += n;
 + continue;
 + }
 +
 + kaddr = kmap_atomic(su_bh-b_page);
 + su = nilfs_sufile_block_get_segment_usage(sufile, segnum,
 +   su_bh, kaddr);
 + blkdirty = 0;
 + for (i = 0; i  n; ++i, ++segnum, su = (void *)su + susz) {
 + if (le32_to_cpu(su-su_nsnapshot_blks) = max_segblks)
 + continue;
 + if (le32_to_cpu

[PATCH 0/2] nilfs2: fix build warnings

2015-10-11 Thread Ryusuke Konishi
Hi Andrew,

Please send the following fixes to upstream:

Ryusuke Konishi (2):
  nilfs2: fix gcc unused-but-set-variable warnings
  nilfs2: fix gcc uninitialized-variable warnings in powerpc build

These prevent reported warnings in powerpc build and minor warnings
during build with "W=1".  Both were detected on the mainline.

Thanks,
Ryusuke Konishi
--

 fs/nilfs2/alloc.c| 3 +--
 fs/nilfs2/btree.c| 7 +--
 fs/nilfs2/dat.c  | 2 --
 fs/nilfs2/recovery.c | 4 ++--
 fs/nilfs2/segment.c  | 3 +--
 fs/nilfs2/sufile.c   | 3 +--
 fs/nilfs2/super.c| 5 -
 7 files changed, 10 insertions(+), 17 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/5] nilfs2: add tracepoints for analyzing sufile manipulation

2015-10-06 Thread Ryusuke Konishi
From: Hitoshi Mitake <mitake.hito...@lab.ntt.co.jp>

This patch adds tracepoints which would be useful for analyzing
segment usage from a perspective of high level sufile manipulation
(check, alloc, free). sufile is an important in-place updated metadata
file, so analyzing the behavior would be useful for performance
turning.

example of usage (a case of allocation):

$ sudo bin/tpoint nilfs2:nilfs2_segment_usage_allocated
Tracing nilfs2:nilfs2_segment_usage_allocated. Ctrl-C to end.
segctord-17800 [002] ...1 10671.867294: nilfs2_segment_usage_allocated: 
sufile = 880054f908a8 segnum = 2
segctord-17800 [002] ...1 10675.073477: nilfs2_segment_usage_allocated: 
sufile = 880054f908a8 segnum = 3

Cc: Benixon Dhas <benixon.d...@wdc.com>
Cc: TK Kato <tk.k...@wdc.com>
Signed-off-by: Hitoshi Mitake <mitake.hito...@lab.ntt.co.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryus...@lab.ntt.co.jp>
Cc: Steven Rostedt <rost...@goodmis.org>
---
 fs/nilfs2/sufile.c|  8 ++
 include/trace/events/nilfs2.h | 67 +++
 2 files changed, 75 insertions(+)

diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 2a869c3..7ff8f15 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -30,6 +30,8 @@
 #include "mdt.h"
 #include "sufile.h"
 
+#include 
+
 /**
  * struct nilfs_sufile_info - on-memory private data of sufile
  * @mi: on-memory private data of metadata file
@@ -358,6 +360,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
break; /* never happens */
}
}
+   trace_nilfs2_segment_usage_check(sufile, segnum, cnt);
ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1,
   _bh);
if (ret < 0)
@@ -388,6 +391,9 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
nilfs_mdt_mark_dirty(sufile);
brelse(su_bh);
*segnump = segnum;
+
+   trace_nilfs2_segment_usage_allocated(sufile, segnum);
+
goto out_header;
}
 
@@ -490,6 +496,8 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 
segnum,
NILFS_SUI(sufile)->ncleansegs++;
 
nilfs_mdt_mark_dirty(sufile);
+
+   trace_nilfs2_segment_usage_freed(sufile, segnum);
 }
 
 /**
diff --git a/include/trace/events/nilfs2.h b/include/trace/events/nilfs2.h
index e5649ac..1b65ba6 100644
--- a/include/trace/events/nilfs2.h
+++ b/include/trace/events/nilfs2.h
@@ -95,6 +95,73 @@ TRACE_EVENT(nilfs2_transaction_transition,
  show_transaction_state(__entry->state))
 );
 
+TRACE_EVENT(nilfs2_segment_usage_check,
+   TP_PROTO(struct inode *sufile,
+__u64 segnum,
+unsigned long cnt),
+
+   TP_ARGS(sufile, segnum, cnt),
+
+   TP_STRUCT__entry(
+   __field(struct inode *, sufile)
+   __field(__u64, segnum)
+   __field(unsigned long, cnt)
+   ),
+
+   TP_fast_assign(
+   __entry->sufile = sufile;
+   __entry->segnum = segnum;
+   __entry->cnt = cnt;
+   ),
+
+   TP_printk("sufile = %p segnum = %llu cnt = %lu",
+ __entry->sufile,
+ __entry->segnum,
+ __entry->cnt)
+);
+
+TRACE_EVENT(nilfs2_segment_usage_allocated,
+   TP_PROTO(struct inode *sufile,
+__u64 segnum),
+
+   TP_ARGS(sufile, segnum),
+
+   TP_STRUCT__entry(
+   __field(struct inode *, sufile)
+   __field(__u64, segnum)
+   ),
+
+   TP_fast_assign(
+   __entry->sufile = sufile;
+   __entry->segnum = segnum;
+   ),
+
+   TP_printk("sufile = %p segnum = %llu",
+ __entry->sufile,
+ __entry->segnum)
+);
+
+TRACE_EVENT(nilfs2_segment_usage_freed,
+   TP_PROTO(struct inode *sufile,
+__u64 segnum),
+
+   TP_ARGS(sufile, segnum),
+
+   TP_STRUCT__entry(
+   __field(struct inode *, sufile)
+   __field(__u64, segnum)
+   ),
+
+   TP_fast_assign(
+   __entry->sufile = sufile;
+   __entry->segnum = segnum;
+   ),
+
+   TP_printk("sufile = %p segnum = %llu",
+ __entry->sufile,
+ __entry->segnum)
+);
+
 #endif /* _TRACE_NILFS2_H */
 
 /* This part must be outside protection */
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/5] MAINTAINERS: nilfs2: add header file for tracing

2015-10-06 Thread Ryusuke Konishi
This adds header file "include/trace/events/nilfs2.h" to
maintainer-ship of nilfs2 so that updates to the nilfs2 header file go
to the mailing list of nilfs2.

Signed-off-by: Ryusuke Konishi <konishi.ryus...@lab.ntt.co.jp>
Cc: Hitoshi Mitake <mitake.hito...@lab.ntt.co.jp>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 797236b..6b15b7a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7371,6 +7371,7 @@ S:Supported
 F: Documentation/filesystems/nilfs2.txt
 F: fs/nilfs2/
 F: include/linux/nilfs2_fs.h
+F: include/trace/events/nilfs2.h
 
 NINJA SCSI-3 / NINJA SCSI-32Bi (16bit/CardBus) PCMCIA SCSI HOST ADAPTER DRIVER
 M: YOKOTA Hiroshi <yok...@netlab.is.tsukuba.ac.jp>
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: nilfs.org pwned

2015-08-29 Thread Ryusuke Konishi
On Wed, 26 Aug 2015 11:30:49 +0100, csm...@csmith-bm.vm.bytemark.co.uk wrote:
 The website, www.nilfs.org, appears to have been defaced. I was going to post
 a link on a forum for it, but decided against it in the end due to the
 defacement.
 
 Otherwise, NILFS generally rocks, keep up the good work :)
 
 Christian

Please refer to nilfs.sourceforge.net instead.
We no longer hosts nilfs.org.

Ryusuke Konishi
--
To unsubscribe from this list: send the line unsubscribe linux-nilfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 02/39] nilfs2: drop null test before destroy functions

2015-09-13 Thread Ryusuke Konishi
On Sun, 13 Sep 2015 14:14:55 +0200, Julia Lawall <julia.law...@lip6.fr> wrote:
> Remove unneeded NULL test.
> 
> The semantic patch that makes this change is as follows:
> (http://coccinelle.lip6.fr/)
> 
> // 
> @@ expression x; @@
> -if (x != NULL)
>   \(kmem_cache_destroy\|mempool_destroy\|dma_pool_destroy\)(x);
> // 
> 
> Signed-off-by: Julia Lawall <julia.law...@lip6.fr>

Looks OK.  I'll queue this in my tree.

Thanks,
Ryusuke Konishi

> 
> ---
>  fs/nilfs2/super.c |   12 
>  1 file changed, 4 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
> index f47585b..c69455a 100644
> --- a/fs/nilfs2/super.c
> +++ b/fs/nilfs2/super.c
> @@ -1405,14 +1405,10 @@ static void nilfs_destroy_cachep(void)
>*/
>   rcu_barrier();
>  
> - if (nilfs_inode_cachep)
> - kmem_cache_destroy(nilfs_inode_cachep);
> - if (nilfs_transaction_cachep)
> - kmem_cache_destroy(nilfs_transaction_cachep);
> - if (nilfs_segbuf_cachep)
> - kmem_cache_destroy(nilfs_segbuf_cachep);
> - if (nilfs_btree_path_cache)
> - kmem_cache_destroy(nilfs_btree_path_cache);
> + kmem_cache_destroy(nilfs_inode_cachep);
> + kmem_cache_destroy(nilfs_transaction_cachep);
> + kmem_cache_destroy(nilfs_segbuf_cachep);
> + kmem_cache_destroy(nilfs_btree_path_cache);
>  }
>  
>  static int __init nilfs_init_cachep(void)
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 14/17] fs/nilfs2: remove unnecessary new_valid_dev check

2015-09-29 Thread Ryusuke Konishi
On 2015/09/28 23:33, Yaowei Bai wrote:
> As new_valid_dev always returns 1, so !new_valid_dev check is not
> needed, remove it.
> 
> Signed-off-by: Yaowei Bai <bywxiao...@163.com>

Acked-by: Ryusuke Konishi <konishi.ryus...@lab.ntt.co.jp>

> ---
>   fs/nilfs2/namei.c | 3 ---
>   1 file changed, 3 deletions(-)
> 
> diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
> index 37dd6b0..c9a1a49 100644
> --- a/fs/nilfs2/namei.c
> +++ b/fs/nilfs2/namei.c
> @@ -120,9 +120,6 @@ nilfs_mknod(struct inode *dir, struct dentry *dentry, 
> umode_t mode, dev_t rdev)
>   struct nilfs_transaction_info ti;
>   int err;
>   
> - if (!new_valid_dev(rdev))
> - return -EINVAL;
> -
>   err = nilfs_transaction_begin(dir->i_sb, , 1);
>   if (err)
>   return err;
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/7] nilfs2: drop null test before destroy functions

2015-09-19 Thread Ryusuke Konishi
From: Julia Lawall <julia.law...@lip6.fr>

Remove unneeded NULL test.

The semantic patch that makes this change is as follows:
(http://coccinelle.lip6.fr/)

// 
@@ expression x; @@
-if (x != NULL)
  \(kmem_cache_destroy\|mempool_destroy\|dma_pool_destroy\)(x);
// 

Signed-off-by: Julia Lawall <julia.law...@lip6.fr>
Signed-off-by: Ryusuke Konishi <konishi.ryus...@lab.ntt.co.jp>
---
 fs/nilfs2/super.c | 12 
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f47585b..c69455a 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1405,14 +1405,10 @@ static void nilfs_destroy_cachep(void)
 */
rcu_barrier();
 
-   if (nilfs_inode_cachep)
-   kmem_cache_destroy(nilfs_inode_cachep);
-   if (nilfs_transaction_cachep)
-   kmem_cache_destroy(nilfs_transaction_cachep);
-   if (nilfs_segbuf_cachep)
-   kmem_cache_destroy(nilfs_segbuf_cachep);
-   if (nilfs_btree_path_cache)
-   kmem_cache_destroy(nilfs_btree_path_cache);
+   kmem_cache_destroy(nilfs_inode_cachep);
+   kmem_cache_destroy(nilfs_transaction_cachep);
+   kmem_cache_destroy(nilfs_segbuf_cachep);
+   kmem_cache_destroy(nilfs_btree_path_cache);
 }
 
 static int __init nilfs_init_cachep(void)
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 6/7] nilfs2: add helper functions to delete blocks from dat file

2015-09-19 Thread Ryusuke Konishi
This adds delete functions for data blocks of metadata files using
bitmap based allocator.  nilfs_palloc_delete_entry_block() deletes an
entry block (e.g. block storing dat entries), and
nilfs_palloc_delete_bitmap_block() deletes a bitmap block,
respectively.

These helpers are intended to be used in the successive change on
deallocator of block addresses ("nilfs2: free unused dat file blocks
during garbage collection").

Signed-off-by: Ryusuke Konishi <konishi.ryus...@lab.ntt.co.jp>
---
 fs/nilfs2/alloc.c | 50 ++
 1 file changed, 50 insertions(+)

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 5b7ee36..225b797 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -236,6 +236,26 @@ static int nilfs_palloc_get_block(struct inode *inode, 
unsigned long blkoff,
 }
 
 /**
+ * nilfs_palloc_delete_block - delete a block on the persistent allocator file
+ * @inode: inode of metadata file using this allocator
+ * @blkoff: block offset
+ * @prev: nilfs_bh_assoc struct of the last used buffer
+ * @lock: spin lock protecting @prev
+ */
+static int nilfs_palloc_delete_block(struct inode *inode, unsigned long blkoff,
+struct nilfs_bh_assoc *prev,
+spinlock_t *lock)
+{
+   spin_lock(lock);
+   if (prev->bh && blkoff == prev->blkoff) {
+   brelse(prev->bh);
+   prev->bh = NULL;
+   }
+   spin_unlock(lock);
+   return nilfs_mdt_delete_block(inode, blkoff);
+}
+
+/**
  * nilfs_palloc_get_desc_block - get buffer head of a group descriptor block
  * @inode: inode of metadata file using this allocator
  * @group: group number
@@ -274,6 +294,22 @@ static int nilfs_palloc_get_bitmap_block(struct inode 
*inode,
 }
 
 /**
+ * nilfs_palloc_delete_bitmap_block - delete a bitmap block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ */
+static int nilfs_palloc_delete_bitmap_block(struct inode *inode,
+   unsigned long group)
+{
+   struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+   return nilfs_palloc_delete_block(inode,
+nilfs_palloc_bitmap_blkoff(inode,
+   group),
+>prev_bitmap, >lock);
+}
+
+/**
  * nilfs_palloc_get_entry_block - get buffer head of an entry block
  * @inode: inode of metadata file using this allocator
  * @nr: serial number of the entry (e.g. inode number)
@@ -292,6 +328,20 @@ int nilfs_palloc_get_entry_block(struct inode *inode, 
__u64 nr,
 }
 
 /**
+ * nilfs_palloc_delete_entry_block - delete an entry block
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry
+ */
+static int nilfs_palloc_delete_entry_block(struct inode *inode, __u64 nr)
+{
+   struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+   return nilfs_palloc_delete_block(inode,
+nilfs_palloc_entry_blkoff(inode, nr),
+>prev_entry, >lock);
+}
+
+/**
  * nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor
  * @inode: inode of metadata file using this allocator
  * @group: group number
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 7/7] nilfs2: free unused dat file blocks during garbage collection

2015-09-19 Thread Ryusuke Konishi
As a nilfs2 volume ages, the amount of available disk space
decreases little by little due to bloat of DAT (disk address
translation) metadata file.  Even if we delete all files in a file
system and free their block addresses from the DAT file through a
garbage collection, empty DAT blocks are not freed.

This fixes the issue by extending the deallocator of block addresses
so that empty data blocks and empty bitmap blocks of DAT are deleted.

The following comparison shows the effect of this patch.  Each shows
disk amount information of a nilfs2 volume that we cleaned out by
deleting all files and running gc after having filled 90% of its
capacity.

Before:
Filesystem 1K-blocks Used Available Use% Mounted on
/dev/sda1  500105212  3022844 472072192   1% /test

After:
Filesystem 1K-blocks Used Available Use% Mounted on
/dev/sda1  50010521216380 475078656   1% /test

Signed-off-by: Ryusuke Konishi <konishi.ryus...@lab.ntt.co.jp>
---
 fs/nilfs2/alloc.c | 91 ---
 fs/nilfs2/alloc.h |  1 +
 2 files changed, 75 insertions(+), 17 deletions(-)

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 225b797..b335a32 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -154,13 +154,17 @@ nilfs_palloc_group_desc_nfrees(const struct 
nilfs_palloc_group_desc *desc,
  * @lock: spin lock protecting @desc
  * @n: delta to be added
  */
-static void
+static u32
 nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc,
spinlock_t *lock, u32 n)
 {
+   u32 nfree;
+
spin_lock(lock);
le32_add_cpu(>pg_nfrees, n);
+   nfree = le32_to_cpu(desc->pg_nfrees);
spin_unlock(lock);
+   return nfree;
 }
 
 /**
@@ -735,12 +739,18 @@ int nilfs_palloc_freev(struct inode *inode, __u64 
*entry_nrs, size_t nitems)
unsigned char *bitmap;
void *desc_kaddr, *bitmap_kaddr;
unsigned long group, group_offset;
-   __u64 group_min_nr;
+   __u64 group_min_nr, last_nrs[8];
const unsigned long epg = nilfs_palloc_entries_per_group(inode);
+   const unsigned epb = NILFS_MDT(inode)->mi_entries_per_block;
+   unsigned entry_start, end, pos;
spinlock_t *lock;
-   int i, j, n, ret;
+   int i, j, k, ret;
+   u32 nfree;
 
for (i = 0; i < nitems; i = j) {
+   int change_group = false;
+   int nempties = 0, n = 0;
+
group = nilfs_palloc_group(inode, entry_nrs[i], _offset);
ret = nilfs_palloc_get_desc_block(inode, group, 0, _bh);
if (ret < 0)
@@ -755,17 +765,13 @@ int nilfs_palloc_freev(struct inode *inode, __u64 
*entry_nrs, size_t nitems)
/* Get the first entry number of the group */
group_min_nr = (__u64)group * epg;
 
-   desc_kaddr = kmap(desc_bh->b_page);
-   desc = nilfs_palloc_block_get_group_desc(
-   inode, group, desc_bh, desc_kaddr);
bitmap_kaddr = kmap(bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
lock = nilfs_mdt_bgl_lock(inode, group);
-   for (j = i, n = 0;
-j < nitems && entry_nrs[j] >= group_min_nr &&
-entry_nrs[j] < group_min_nr + epg;
-j++) {
-   group_offset = entry_nrs[j] - group_min_nr;
+
+   j = i;
+   entry_start = rounddown(group_offset, epb);
+   do {
if (!nilfs_clear_bit_atomic(lock, group_offset,
bitmap)) {
nilfs_warning(inode->i_sb, __func__,
@@ -775,18 +781,69 @@ int nilfs_palloc_freev(struct inode *inode, __u64 
*entry_nrs, size_t nitems)
} else {
n++;
}
-   }
-   nilfs_palloc_group_desc_add_entries(desc, lock, n);
+
+   j++;
+   if (j >= nitems || entry_nrs[j] < group_min_nr ||
+   entry_nrs[j] >= group_min_nr + epg) {
+   change_group = true;
+   } else {
+   group_offset = entry_nrs[j] - group_min_nr;
+   if (group_offset >= entry_start &&
+   group_offset < entry_start + epb) {
+   /* This entry is in the same block */
+   continue;
+   }
+   }
+
+   /* Test if the entry block is empty or not */
+   end = entry_start + epb;
+   pos = nilfs_f

Re: [PATCH] nilfs2: add tracepoints for analyzing reading and writing metadata files

2015-10-05 Thread Ryusuke Konishi
On Mon, 5 Oct 2015 19:21:43 +0900, Mitake Hitoshi wrote:
> On Sun, Oct 4, 2015 at 10:33 PM, Ryusuke Konishi
> <konishi.ryus...@lab.ntt.co.jp> wrote:
>> On Sun,  4 Oct 2015 01:02:42 +0900, Mitake Hitoshi wrote:
>>> This patch adds tracepoints for analyzing requests of reading and
>>> writing metadata files. The tracepoints cover every in-place mdt files
>>> (cpfile, sufile, and datfile).
>>>
>>> Example of tracing mdt_insert_new_block():
>>>   cp-14635 [000] ...1 30598.199309: 
>>> nilfs2_mdt_insert_new_block: inode = 88022a8d0178 ino = 3 block = 155
>>>   cp-14635 [000] ...1 30598.199520: 
>>> nilfs2_mdt_insert_new_block: inode = 88022a8d0178 ino = 3 block = 5
>>>   cp-14635 [000] ...1 30598.200828: 
>>> nilfs2_mdt_insert_new_block: inode = 88022a8d0178 ino = 3 block = 253
>>>
>>> Cc: TK Kato <tk.k...@wdc.com>
>>> Signed-off-by: Hitoshi Mitake <mitake.hito...@lab.ntt.co.jp>
>>
>> Applied to the tracepoints branch.  Thanks.
> 
> Thanks, could you send the patches in the tracepoints branch to upstream?

Sure, I will.

Regards,
Ryusuke Konishi

--
To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


<    1   2   3   4   5   6