[PATCH 1/1] f2fs: update multi-dev metadata in resize_fs

2019-09-18 Thread sunqiuyang
From: Qiuyang Sun 

Multi-device metadata should be updated in resize_fs as well.

Also, check that the new FS size still reaches the last device; otherwise,
the shrink would leave that device entirely outside the filesystem.

Signed-off-by: Qiuyang Sun 
---
 fs/f2fs/gc.c | 32 ++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 5877bd7..a2b8cbe 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -1431,26 +1431,46 @@ static void update_sb_metadata(struct f2fs_sb_info 
*sbi, int secs)
int segment_count_main = le32_to_cpu(raw_sb->segment_count_main);
long long block_count = le64_to_cpu(raw_sb->block_count);
int segs = secs * sbi->segs_per_sec;
+   int ndevs = sbi->s_ndevs;
 
raw_sb->section_count = cpu_to_le32(section_count + secs);
raw_sb->segment_count = cpu_to_le32(segment_count + segs);
raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs);
raw_sb->block_count = cpu_to_le64(block_count +
(long long)segs * sbi->blocks_per_seg);
+   if (ndevs > 1) {
+   int dev_segs =
+   le32_to_cpu(raw_sb->devs[ndevs - 1].total_segments);
+
+   raw_sb->devs[ndevs - 1].total_segments =
+   cpu_to_le32(dev_segs + segs);
+   }
 }
 
 static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
 {
int segs = secs * sbi->segs_per_sec;
+   long long blks = (long long)segs * sbi->blocks_per_seg;
long long user_block_count =
le64_to_cpu(F2FS_CKPT(sbi)->user_block_count);
+   int ndevs = sbi->s_ndevs;
 
SM_I(sbi)->segment_count = (int)SM_I(sbi)->segment_count + segs;
MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs;
FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs;
FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs;
-   F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count +
-   (long long)segs * sbi->blocks_per_seg);
+   F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + blks);
+
+   if (ndevs > 1) {
+   FDEV(ndevs - 1).total_segments =
+   (int)FDEV(ndevs - 1).total_segments + segs;
+   FDEV(ndevs - 1).end_blk =
+   (long long)FDEV(ndevs - 1).end_blk + blks;
+#ifdef CONFIG_BLK_DEV_ZONED
+   FDEV(ndevs - 1).nr_blkz = (int)FDEV(ndevs - 1).nr_blkz +
+   (int)(blks >> sbi->log_blocks_per_blkz);
+#endif
+   }
 }
 
 int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
@@ -1465,6 +1485,14 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 
block_count)
if (block_count > old_block_count)
return -EINVAL;
 
+   if (sbi->s_ndevs > 1) {
+   __u64 last_segs = FDEV(sbi->s_ndevs - 1).total_segments;
+
+   if (block_count + last_segs * sbi->blocks_per_seg <=
+   old_block_count)
+   return -EINVAL;
+   }
+
/* new fs size should align to section size */
div_u64_rem(block_count, BLKS_PER_SEC(sbi), &rem);
if (rem)
-- 
1.8.3.1
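
For illustration, the new check can be exercised with a minimal,
self-contained sketch (variable names follow the patch; the numeric values
are made-up examples, not f2fs defaults). The last device occupies the final
last_segs * blocks_per_seg blocks of the old FS, so a valid new size must
still reach past that device's start block:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* mirrors: if (block_count + last_segs * blocks_per_seg <= old_block_count)
 *                  return -EINVAL;                                         */
static bool reaches_last_dev(uint64_t block_count,     /* requested new size */
			     uint64_t old_block_count, /* current total size */
			     uint64_t last_segs,       /* segs on last device */
			     uint64_t blocks_per_seg)
{
	uint64_t last_dev_start = old_block_count - last_segs * blocks_per_seg;

	return block_count > last_dev_start;
}

int main(void)
{
	/* example: 10000-block FS whose last device holds 8 segments of 512
	 * blocks each, i.e. blocks [5904, 10000) */
	printf("%d\n", reaches_last_dev(6000, 10000, 8, 512)); /* 1: allowed */
	printf("%d\n", reaches_last_dev(5000, 10000, 8, 512)); /* 0: -EINVAL */
	return 0;
}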



RE: [PATCH 1/1] mm/migrate: fix list corruption in migration of non-LRU movable pages

2019-09-04 Thread sunqiuyang



From: Michal Hocko [mho...@kernel.org]
Sent: Wednesday, September 04, 2019 20:52
To: sunqiuyang
Cc: linux-kernel@vger.kernel.org; linux...@kvack.org
Subject: Re: [PATCH 1/1] mm/migrate: fix list corruption in migration of 
non-LRU movable pages

On Wed 04-09-19 12:19:11, sunqiuyang wrote:
> > Do not top post please
> >
> > On Wed 04-09-19 07:27:25, sunqiuyang wrote:
> > > isolate_migratepages_block() from another thread may try to isolate the 
> > > page again:
> > >
> > > for (; low_pfn < end_pfn; low_pfn++) {
> > >   /* ... */
> > >   page = pfn_to_page(low_pfn);
> > >  /* ... */
> > >   if (!PageLRU(page)) {
> > > if (unlikely(__PageMovable(page)) && !PageIsolated(page)) {
> > > /* ... */
> > > if (!isolate_movable_page(page, isolate_mode))
> > >   goto isolate_success;
> > >   /*... */
> > > isolate_success:
> > >  list_add(&page->lru, &cc->migratepages);
> > >
> > > And this page will be added to another list.
> > > Or, do you see any reason that the page cannot go through this path?
> >
> > The page shouldn't be __PageMovable after the migration is done. All the
> > state should have been transferred to the new page IIUC.
> >
>
> I don't see where page->mapping is modified after the migration is done.
>
> Actually, the last comment in move_to_new_page() says,
> "Anonymous and movable page->mapping will be cleard by
> free_pages_prepare so don't reset it here for keeping
> the type to work PageAnon, for example. "
>
> Or did I miss something? Thanks,

This talks about mapping rather than flags stored in the mapping.
I can see that in-tree migration handlers (z3fold_page_migrate,
vmballoon_migratepage via balloon_page_delete, zs_page_migrate via
reset_page) all reset the movable flag. I am not sure whether that is a
documented requirement or just a coincidence. Maybe it should be
documented. I would like to hear from Minchan.

---
I checked the three migration handlers and only found __ClearPageMovable,
which clears the registered address_space value while keeping the
PAGE_MAPPING_MOVABLE flag, so the page should still be __PageMovable when
caught by another migration thread. Right?

---

--
Michal Hocko
SUSE Labs


RE: [PATCH 1/1] mm/migrate: fix list corruption in migration of non-LRU movable pages

2019-09-04 Thread sunqiuyang



From: Michal Hocko [mho...@kernel.org]
Sent: Wednesday, September 04, 2019 16:14
To: sunqiuyang
Cc: linux-kernel@vger.kernel.org; linux...@kvack.org
Subject: Re: [PATCH 1/1] mm/migrate: fix list corruption in migration of 
non-LRU movable pages

Do not top post please

On Wed 04-09-19 07:27:25, sunqiuyang wrote:
> isolate_migratepages_block() from another thread may try to isolate the page 
> again:
>
> for (; low_pfn < end_pfn; low_pfn++) {
>   /* ... */
>   page = pfn_to_page(low_pfn);
>  /* ... */
>   if (!PageLRU(page)) {
> if (unlikely(__PageMovable(page)) && !PageIsolated(page)) {
> /* ... */
> if (!isolate_movable_page(page, isolate_mode))
>   goto isolate_success;
>   /*... */
> isolate_success:
>  list_add(&page->lru, &cc->migratepages);
>
> And this page will be added to another list.
> Or, do you see any reason that the page cannot go through this path?

The page shouldn't be __PageMovable after the migration is done. All the
state should have been transferred to the new page IIUC.


I don't see where page->mapping is modified after the migration is done. 

Actually, the last comment in move_to_new_page() says,
"Anonymous and movable page->mapping will be cleard by
free_pages_prepare so don't reset it here for keeping
the type to work PageAnon, for example. "

Or did I miss something? Thanks,

--
Michal Hocko
SUSE Labs


RE: [PATCH 1/1] mm/migrate: fix list corruption in migration of non-LRU movable pages

2019-09-04 Thread sunqiuyang
isolate_migratepages_block() from another thread may try to isolate the page 
again:

for (; low_pfn < end_pfn; low_pfn++) {
  /* ... */
  page = pfn_to_page(low_pfn);
 /* ... */
  if (!PageLRU(page)) {
if (unlikely(__PageMovable(page)) && !PageIsolated(page)) {
/* ... */
if (!isolate_movable_page(page, isolate_mode))
  goto isolate_success;
  /*... */
isolate_success:
 list_add(&page->lru, &cc->migratepages);

And this page will be added to another list.
Or, do you see any reason that the page cannot go through this path?

From: Michal Hocko [mho...@kernel.org]
Sent: Wednesday, September 04, 2019 14:38
To: sunqiuyang
Cc: linux-kernel@vger.kernel.org; linux...@kvack.org
Subject: Re: [PATCH 1/1] mm/migrate: fix list corruption in migration of 
non-LRU movable pages

On Wed 04-09-19 02:18:38, sunqiuyang wrote:
> The isolate path of non-lru movable pages:
>
> isolate_migratepages_block
>   isolate_movable_page
>   trylock_page
>   // if PageIsolated, goto out_no_isolated
>   a_ops->isolate_page
>   __SetPageIsolated
>   unlock_page
>   list_add(&page->lru, &cc->migratepages)
>
> The migration path:
>
> unmap_and_move
>   __unmap_and_move
>   lock_page
>   move_to_new_page
>   a_ops->migratepage
>   __ClearPageIsolated
>   unlock_page
>   /* here, the page could be isolated again by another thread, and added 
> into another cc->migratepages,
>   since PG_Isolated has been cleared, and not protected by page_lock */
>   list_del(&page->lru)

But the page has been migrated already and not freed yet because there
is still a pin on it. So nobody should be touching it at this stage.
Or do I still miss something?
--
Michal Hocko
SUSE Labs


RE: [PATCH 1/1] mm/migrate: fix list corruption in migration of non-LRU movable pages

2019-09-03 Thread sunqiuyang
The isolate path of non-lru movable pages:

isolate_migratepages_block
isolate_movable_page
trylock_page
// if PageIsolated, goto out_no_isolated
a_ops->isolate_page
__SetPageIsolated
unlock_page
list_add(&page->lru, &cc->migratepages)

The migration path:

unmap_and_move
__unmap_and_move
lock_page
move_to_new_page
a_ops->migratepage
__ClearPageIsolated
unlock_page
/* here, the page could be isolated again by another thread, and added 
into another cc->migratepages,
since PG_Isolated has been cleared, and not protected by page_lock */
list_del(&page->lru)

Suppose thread A isolates three pages in the order p1, p2, p3; A's
cc->migratepages will look like
head_A - p3 - p2 - p1
After p2 is migrated (but before list_del), it is isolated by another thread
B. Then A's list_del will delete p2 from the cc->migratepages of B (instead
of A), leaving p2's list pointers poisoned. When A continues to migrate and
delete p1, it will find:
p1->prev == p2
p2->next == LIST_POISON1

So we will end up with a bug like
"list_del corruption. prev->next should be ffbf0a1eb8e0, but was
dead000000000100"
(see __list_del_entry_valid).
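
To make the failure mode concrete, here is a standalone user-space sketch of
the above sequence (run sequentially rather than with real threads; the
kernel list macros are reduced to the minimum, and the poison constants use
the raw offsets from include/linux/poison.h, so 0x100 stands in for
dead000000000100):

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };
#define LIST_POISON1 ((struct list_head *)0x100)
#define LIST_POISON2 ((struct list_head *)0x122)

static void list_add(struct list_head *new, struct list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

static void list_del(struct list_head *entry)
{
	/* simplified __list_del_entry_valid() check */
	if (entry->prev->next != entry)
		printf("list_del corruption. prev->next should be %p, but was %p\n",
		       (void *)entry, (void *)entry->prev->next);
	entry->next->prev = entry->prev;
	entry->prev->next = entry->next;
	entry->next = LIST_POISON1;
	entry->prev = LIST_POISON2;
}

int main(void)
{
	struct list_head head_A = { &head_A, &head_A };	/* A's cc->migratepages */
	struct list_head head_B = { &head_B, &head_B };	/* B's cc->migratepages */
	struct list_head p1, p2, p3;

	/* thread A isolates p1, p2, p3: head_A - p3 - p2 - p1 */
	list_add(&p1, &head_A);
	list_add(&p2, &head_A);
	list_add(&p3, &head_A);

	/* A migrates p2; before A's list_del, B re-isolates p2, rewiring
	 * p2's pointers into B's list while p3 and p1 still point at p2 */
	list_add(&p2, &head_B);
	list_del(&p2);		/* A's deferred list_del empties B's list */

	/* A migrates p1: p1->prev is the stale, now-poisoned p2 */
	list_del(&p1);		/* prints the corruption message */
	return 0;
}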



From: Michal Hocko [mho...@kernel.org]
Sent: Tuesday, September 03, 2019 21:17
To: sunqiuyang
Cc: linux-kernel@vger.kernel.org; linux...@kvack.org
Subject: Re: [PATCH 1/1] mm/migrate: fix list corruption in migration of 
non-LRU movable pages

On Tue 03-09-19 16:27:46, sunqiuyang wrote:
> From: Qiuyang Sun 
>
> Currently, after a page is migrated, it
> 1) has its PG_isolated flag cleared in move_to_new_page(), and
> 2) is deleted from its LRU list (cc->migratepages) in unmap_and_move().
> However, between steps 1) and 2), the page could be isolated by another
> thread in isolate_movable_page(), and added to another LRU list, leading
> to list_del corruption later.

Care to explain the race? Both paths use page_lock AFAICS
>
> This patch fixes the bug by moving list_del into the critical section
> protected by lock_page(), so that a page will not be isolated again before
> it has been deleted from its LRU list.
>
> Signed-off-by: Qiuyang Sun 
> ---
>  mm/migrate.c | 11 +++
>  1 file changed, 3 insertions(+), 8 deletions(-)
>
> diff --git a/mm/migrate.c b/mm/migrate.c
> index a42858d..c58a606 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -1124,6 +1124,8 @@ static int __unmap_and_move(struct page *page, struct 
> page *newpage,
>   /* Drop an anon_vma reference if we took one */
>   if (anon_vma)
>   put_anon_vma(anon_vma);
> + if (rc != -EAGAIN)
> + list_del(&page->lru);
>   unlock_page(page);
>  out:
>   /*
> @@ -1190,6 +1192,7 @@ static ICE_noinline int unmap_and_move(new_page_t 
> get_new_page,
>   put_new_page(newpage, private);
>   else
>   put_page(newpage);
> + list_del(&page->lru);
>   goto out;
>   }
>
> @@ -1200,14 +1203,6 @@ static ICE_noinline int unmap_and_move(new_page_t 
> get_new_page,
>  out:
>   if (rc != -EAGAIN) {
>   /*
> -  * A page that has been migrated has all references
> -  * removed and will be freed. A page that has not been
> -  * migrated will have kepts its references and be
> -  * restored.
> -  */
> - list_del(&page->lru);
> -
> - /*
>* Compaction can migrate also non-LRU pages which are
>* not accounted to NR_ISOLATED_*. They can be recognized
>* as __PageMovable
> --
> 1.8.3.1

--
Michal Hocko
SUSE Labs


[PATCH 1/1] mm/migrate: fix list corruption in migration of non-LRU movable pages

2019-09-03 Thread sunqiuyang
From: Qiuyang Sun 

Currently, after a page is migrated, it
1) has its PG_isolated flag cleared in move_to_new_page(), and
2) is deleted from its LRU list (cc->migratepages) in unmap_and_move().
However, between steps 1) and 2), the page could be isolated by another
thread in isolate_movable_page(), and added to another LRU list, leading
to list_del corruption later.

This patch fixes the bug by moving list_del into the critical section
protected by lock_page(), so that a page will not be isolated again before
it has been deleted from its LRU list.

Signed-off-by: Qiuyang Sun 
---
 mm/migrate.c | 11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index a42858d..c58a606 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1124,6 +1124,8 @@ static int __unmap_and_move(struct page *page, struct 
page *newpage,
/* Drop an anon_vma reference if we took one */
if (anon_vma)
put_anon_vma(anon_vma);
+   if (rc != -EAGAIN)
+   list_del(&page->lru);
unlock_page(page);
 out:
/*
@@ -1190,6 +1192,7 @@ static ICE_noinline int unmap_and_move(new_page_t 
get_new_page,
put_new_page(newpage, private);
else
put_page(newpage);
+   list_del(&page->lru);
goto out;
}
 
@@ -1200,14 +1203,6 @@ static ICE_noinline int unmap_and_move(new_page_t 
get_new_page,
 out:
if (rc != -EAGAIN) {
/*
-* A page that has been migrated has all references
-* removed and will be freed. A page that has not been
-* migrated will have kepts its references and be
-* restored.
-*/
-   list_del(&page->lru);
-
-   /*
 * Compaction can migrate also non-LRU pages which are
 * not accounted to NR_ISOLATED_*. They can be recognized
 * as __PageMovable
-- 
1.8.3.1



[PATCH v3 1/1] f2fs: ioctl for removing a range from F2FS

2019-05-20 Thread sunqiuyang
From: Qiuyang Sun 

This ioctl shrinks a given length (aligned to sections) from the end of the
main area. Any cursegs and valid blocks will be moved out before
invalidating the range.

This feature can be used for adjusting partition sizes online.

Changelog v1 ==> v2:

Sahitya Tummala:
 - Add this ioctl for f2fs_compat_ioctl() as well.
 - Fix debugfs status to reflect the online resize changes.
 - Fix potential race between online resize path and allocate new data
   block path or gc path.

Others:
 - Rename some identifiers.
 - Add some error handling branches.
 - Clear sbi->next_victim_seg[BG_GC/FG_GC] in shrinking range.

Changelog v2 ==> v3:
Implement this interface the same way as ext4's, and change the parameter
from the number of bytes to shrink to the new total block count of F2FS.

Signed-off-by: Qiuyang Sun 
Signed-off-by: Sahitya Tummala 
---
 fs/f2fs/debug.c   |   7 
 fs/f2fs/f2fs.h|   4 ++
 fs/f2fs/file.c|  28 ++
 fs/f2fs/gc.c  | 113 +-
 fs/f2fs/segment.c |  49 +--
 fs/f2fs/segment.h |   1 +
 fs/f2fs/super.c   |   1 +
 7 files changed, 190 insertions(+), 13 deletions(-)

diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 99e9a5c..7706049 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -27,8 +27,15 @@
 static void update_general_status(struct f2fs_sb_info *sbi)
 {
struct f2fs_stat_info *si = F2FS_STAT(sbi);
+   struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi);
int i;
 
+   /* these will be changed if online resize is done */
+   si->main_area_segs = le32_to_cpu(raw_super->segment_count_main);
+   si->main_area_sections = le32_to_cpu(raw_super->section_count);
+   si->main_area_zones = si->main_area_sections /
+   le32_to_cpu(raw_super->secs_per_zone);
+
/* validation check of the segment numbers */
si->hit_largest = atomic64_read(&sbi->read_hit_largest);
si->hit_cached = atomic64_read(&sbi->read_hit_cached);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index a205d4d..065f917 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -423,6 +423,7 @@ static inline bool __has_cursum_space(struct f2fs_journal 
*journal,
 #define F2FS_IOC_SET_PIN_FILE  _IOW(F2FS_IOCTL_MAGIC, 13, __u32)
 #define F2FS_IOC_GET_PIN_FILE  _IOR(F2FS_IOCTL_MAGIC, 14, __u32)
 #define F2FS_IOC_PRECACHE_EXTENTS  _IO(F2FS_IOCTL_MAGIC, 15)
+#define F2FS_IOC_RESIZE_FS _IOW(F2FS_IOCTL_MAGIC, 16, __u64)
 
 #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
 #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
@@ -1309,6 +1310,7 @@ struct f2fs_sb_info {
unsigned int segs_per_sec;  /* segments per section */
unsigned int secs_per_zone; /* sections per zone */
unsigned int total_sections;/* total section count */
+   unsigned int current_total_sections;/* for shrink resize */
unsigned int total_node_count;  /* total node block count */
unsigned int total_valid_node_count;/* valid node block count */
loff_t max_file_blocks; /* max block index of file */
@@ -3175,6 +3177,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
 int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable);
 void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
 int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
+void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type);
 void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
 int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
 bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
@@ -3318,6 +3321,7 @@ int f2fs_migrate_page(struct address_space *mapping, 
struct page *newpage,
 int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background,
unsigned int segno);
 void f2fs_build_gc_manager(struct f2fs_sb_info *sbi);
+int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count);
 
 /*
  * recovery.c
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index d05ac21..a37a0d4 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -3013,6 +3013,31 @@ static int f2fs_ioc_precache_extents(struct file *filp, 
unsigned long arg)
return f2fs_precache_extents(file_inode(filp));
 }
 
+static int f2fs_ioc_resize_fs(struct file *filp, unsigned long arg)
+{
+   struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp));
+   __u64 block_count;
+   int ret;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   if (f2fs_readonly(sbi->sb))
+   return -EROFS;
+
+   if (copy_from_user(&block_count, (__u64 __user *)arg, sizeof(__u64)))
+   return -EFAULT;
+
+   ret = mnt_want_write_file(filp);
+   if (ret)
+   return ret;
+
+   ret = f2fs_resize_fs(sbi, block_count);
+   mnt_drop_write_file(filp);
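
For reference, a hypothetical user-space caller of this interface could look
like the sketch below (it assumes F2FS_IOCTL_MAGIC is 0xf5 as in
fs/f2fs/f2fs.h; per the checks above, the caller needs CAP_SYS_ADMIN and a
read-write mount, and the ioctl can be issued on any file of the F2FS):

#include <fcntl.h>
#include <linux/types.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define F2FS_IOCTL_MAGIC	0xf5
#define F2FS_IOC_RESIZE_FS	_IOW(F2FS_IOCTL_MAGIC, 16, __u64)

int main(int argc, char **argv)
{
	__u64 block_count;
	int fd;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <file-on-f2fs> <new-block-count>\n",
			argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	block_count = strtoull(argv[2], NULL, 0);
	if (ioctl(fd, F2FS_IOC_RESIZE_FS, &block_count) < 0)
		perror("F2FS_IOC_RESIZE_FS");

	close(fd);
	return 0;
}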

[RFC PATCH 1/1] f2fs-dev: ioctl for removing a range from F2FS

2019-02-20 Thread sunqiuyang
From: Qiuyang Sun 

This ioctl shrinks a given length (aligned to sections) from the end of the
main area. Any cursegs and valid blocks will be moved out before
invalidating the range.

This feature can be used for adjusting partition sizes online.

Signed-off-by: Qiuyang Sun 
---
 fs/f2fs/f2fs.h|  9 ++
 fs/f2fs/file.c| 28 +++
 fs/f2fs/gc.c  | 83 +--
 fs/f2fs/segment.c | 47 +++
 fs/f2fs/segment.h |  1 +
 fs/f2fs/super.c   |  1 +
 6 files changed, 156 insertions(+), 13 deletions(-)

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 8c69e12..fd7f3ba 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -406,6 +406,8 @@ static inline bool __has_cursum_space(struct f2fs_journal 
*journal,
 #define F2FS_IOC_SET_PIN_FILE  _IOW(F2FS_IOCTL_MAGIC, 13, __u32)
 #define F2FS_IOC_GET_PIN_FILE  _IOR(F2FS_IOCTL_MAGIC, 14, __u32)
 #define F2FS_IOC_PRECACHE_EXTENTS  _IO(F2FS_IOCTL_MAGIC, 15)
+#define F2FS_IOC_RESIZE_FROM_END   _IOWR(F2FS_IOCTL_MAGIC, 16, \
+   struct f2fs_resize_from_end)
 
 #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY
 #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY
@@ -457,6 +459,10 @@ struct f2fs_flush_device {
u32 segments;   /* # of segments to flush */
 };
 
+struct f2fs_resize_from_end {
+   u64 len;/* bytes to shrink */
+};
+
 /* for inline stuff */
 #define DEF_INLINE_RESERVED_SIZE   1
 static inline int get_extra_isize(struct inode *inode);
@@ -1226,6 +1232,7 @@ struct f2fs_sb_info {
unsigned int segs_per_sec;  /* segments per section */
unsigned int secs_per_zone; /* sections per zone */
unsigned int total_sections;/* total section count */
+   unsigned int new_total_sections;/* for resize from end */
unsigned int total_node_count;  /* total node block count */
unsigned int total_valid_node_count;/* valid node block count */
loff_t max_file_blocks; /* max block index of file */
@@ -3008,6 +3015,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
 int f2fs_disable_cp_again(struct f2fs_sb_info *sbi);
 void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
 int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
+void allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type);
 void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
 int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
 bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
@@ -3146,6 +3154,7 @@ int f2fs_migrate_page(struct address_space *mapping, 
struct page *newpage,
 int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background,
unsigned int segno);
 void f2fs_build_gc_manager(struct f2fs_sb_info *sbi);
+int f2fs_resize_from_end(struct f2fs_sb_info *sbi, size_t resize_len);
 
 /*
  * recovery.c
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index b8f5d12..29e70fd 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -2968,6 +2968,32 @@ static int f2fs_ioc_precache_extents(struct file *filp, 
unsigned long arg)
return f2fs_precache_extents(file_inode(filp));
 }
 
+static int f2fs_ioc_resize_from_end(struct file *filp, unsigned long arg)
+{
+   struct f2fs_sb_info *sbi = F2FS_I_SB(file_inode(filp));
+   struct f2fs_resize_from_end param;
+   int ret;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   if (f2fs_readonly(sbi->sb))
+   return -EROFS;
+
+   if (copy_from_user(&param, (struct f2fs_resize_from_end __user *)arg,
+   sizeof(param)))
+   return -EFAULT;
+
+   ret = mnt_want_write_file(filp);
+   if (ret)
+   return ret;
+
+   ret = f2fs_resize_from_end(sbi, param.len);
+   mnt_drop_write_file(filp);
+
+   return ret;
+}
+
 long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp)))))
@@ -3024,6 +3050,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, 
unsigned long arg)
return f2fs_ioc_set_pin_file(filp, arg);
case F2FS_IOC_PRECACHE_EXTENTS:
return f2fs_ioc_precache_extents(filp, arg);
+   case F2FS_IOC_RESIZE_FROM_END:
+   return f2fs_ioc_resize_from_end(filp, arg);
default:
return -ENOTTY;
}
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 195cf0f..3877e99 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -311,7 +311,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
struct sit_info *sm = SIT_I(sbi);
struct victim_sel_policy p;
unsigned int secno, last_victim;
-   unsigned int last_segment = MAIN_SEGS(sbi);
+   unsi

[PATCH v3 1/2] f2fs-dev: support multi-device direct IO

2019-01-06 Thread sunqiuyang
From: Qiuyang Sun 

Changelog v1 ==> v2:
1. Modify the definition of update_device_state(),
   and call it in direct write;
2. Move some local variables into branches where they are used.

Changelog v2 ==> v3:
Rename update_device_state() to f2fs_update_device_state() like other
exported function names; otherwise it would pollute the kernel's global
function namespace.

Signed-off-by: Qiuyang Sun 
---
 fs/f2fs/data.c| 25 -
 fs/f2fs/f2fs.h|  4 ++--
 fs/f2fs/segment.c | 12 ++--
 3 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index e5cd3fd..b35e042 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1076,6 +1076,7 @@ int f2fs_map_blocks(struct inode *inode, struct 
f2fs_map_blocks *map,
struct extent_info ei = {0,0,0};
block_t blkaddr;
unsigned int start_pgofs;
+   block_t end_blk;
 
if (!maxblocks)
return 0;
@@ -1207,8 +1208,17 @@ int f2fs_map_blocks(struct inode *inode, struct 
f2fs_map_blocks *map,
 
map->m_pblk = blkaddr;
map->m_len = 1;
+
+   if (sbi->s_ndevs && blkaddr != NEW_ADDR &&
+   blkaddr != NULL_ADDR) {
+   int devi;
+
+   devi = f2fs_target_device_index(sbi, blkaddr);
+   end_blk = FDEV(devi).end_blk;
+   }
} else if ((map->m_pblk != NEW_ADDR &&
-   blkaddr == (map->m_pblk + ofs)) ||
+   blkaddr == (map->m_pblk + ofs) &&
+   (!sbi->s_ndevs || blkaddr <= end_blk)) ||
(map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) ||
flag == F2FS_GET_BLOCK_PRE_DIO) {
ofs++;
@@ -1322,6 +1332,7 @@ static int __get_data_block(struct inode *inode, sector_t 
iblock,
 {
struct f2fs_map_blocks map;
int err;
+   struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
@@ -1333,6 +1344,18 @@ static int __get_data_block(struct inode *inode, 
sector_t iblock,
err = f2fs_map_blocks(inode, &map, create, flag);
if (!err) {
map_bh(bh, inode->i_sb, map.m_pblk);
+   if (sbi->s_ndevs) {
+   int devi;
+
+   devi = f2fs_target_device_index(sbi, map.m_pblk);
+   if (devi) {
+   bh->b_bdev = FDEV(devi).bdev;
+   bh->b_blocknr -= FDEV(devi).start_blk;
+   }
+   if (may_write)
+   f2fs_update_device_state(sbi, inode->i_ino,
+   map.m_pblk);
+   }
bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags;
bh->b_size = (u64)map.m_len << inode->i_blkbits;
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index eeede26..7e423e4 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -3056,6 +3056,8 @@ int f2fs_lookup_journal_in_cursum(struct f2fs_journal 
*journal, int type,
 int f2fs_rw_hint_to_seg_type(enum rw_hint hint);
 enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi,
enum page_type type, enum temp_type temp);
+void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino,
+   block_t blkaddr);
 
 /*
  * checkpoint.c
@@ -3595,8 +3597,6 @@ static inline bool f2fs_force_buffered_io(struct inode 
*inode,
 
if (f2fs_post_read_required(inode))
return true;
-   if (sbi->s_ndevs)
-   return true;
/*
 * for blkzoned device, fallback direct IO to buffered IO, so
 * all IOs can be serialized by log-structured write.
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index a361d61..2a21d87 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -3050,18 +3050,18 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, 
struct page *page,
up_read(&SM_I(sbi)->curseg_lock);
 }
 
-static void update_device_state(struct f2fs_io_info *fio)
+void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino,
+   block_t blkaddr)
 {
-   struct f2fs_sb_info *sbi = fio->sbi;
unsigned int devidx;
 
if (!sbi->s_ndevs)
return;
 
-   devidx = f2fs_target_device_index(sbi, fio->new_blkaddr);
+   devidx = f2fs_target_device_index(sbi, blkaddr);
 
/* update device state for fsync */
-   f2fs_set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO);
+   f2fs_set_dirty_device(sbi, ino, devidx, FLUSH_INO);
 
/* update device state for checkpoint */
if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) {
@@ -3092,7 +3092,7 @@ static void do_write_page(struct f2fs_sum

[PATCH v2 1/2] f2fs-dev: support multi-device direct IO

2019-01-04 Thread sunqiuyang
From: Qiuyang Sun 

Changelog v1 ==> v2:
1. Modify the definition of update_device_state(),
   and call it in direct write;
2. Move some local variables into branches where they are used.

Signed-off-by: Qiuyang Sun 
---
 fs/f2fs/data.c| 25 -
 fs/f2fs/f2fs.h|  3 +--
 fs/f2fs/segment.c | 11 +--
 3 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index e5cd3fd..010300c 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1076,6 +1076,7 @@ int f2fs_map_blocks(struct inode *inode, struct 
f2fs_map_blocks *map,
struct extent_info ei = {0,0,0};
block_t blkaddr;
unsigned int start_pgofs;
+   block_t end_blk;
 
if (!maxblocks)
return 0;
@@ -1207,8 +1208,17 @@ int f2fs_map_blocks(struct inode *inode, struct 
f2fs_map_blocks *map,
 
map->m_pblk = blkaddr;
map->m_len = 1;
+
+   if (sbi->s_ndevs && blkaddr != NEW_ADDR &&
+   blkaddr != NULL_ADDR) {
+   int devi;
+
+   devi = f2fs_target_device_index(sbi, blkaddr);
+   end_blk = FDEV(devi).end_blk;
+   }
} else if ((map->m_pblk != NEW_ADDR &&
-   blkaddr == (map->m_pblk + ofs)) ||
+   blkaddr == (map->m_pblk + ofs) &&
+   (!sbi->s_ndevs || blkaddr <= end_blk)) ||
(map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) ||
flag == F2FS_GET_BLOCK_PRE_DIO) {
ofs++;
@@ -1322,6 +1332,7 @@ static int __get_data_block(struct inode *inode, sector_t 
iblock,
 {
struct f2fs_map_blocks map;
int err;
+   struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
@@ -1333,6 +1344,18 @@ static int __get_data_block(struct inode *inode, 
sector_t iblock,
err = f2fs_map_blocks(inode, &map, create, flag);
if (!err) {
map_bh(bh, inode->i_sb, map.m_pblk);
+   if (sbi->s_ndevs) {
+   int devi;
+
+   devi = f2fs_target_device_index(sbi, map.m_pblk);
+   if (devi) {
+   bh->b_bdev = FDEV(devi).bdev;
+   bh->b_blocknr -= FDEV(devi).start_blk;
+   }
+   if (may_write)
+   update_device_state(sbi, inode->i_ino,
+   map.m_pblk);
+   }
bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags;
bh->b_size = (u64)map.m_len << inode->i_blkbits;
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index eeede26..659e1e0 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -3056,6 +3056,7 @@ int f2fs_lookup_journal_in_cursum(struct f2fs_journal 
*journal, int type,
 int f2fs_rw_hint_to_seg_type(enum rw_hint hint);
 enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi,
enum page_type type, enum temp_type temp);
+void update_device_state(struct f2fs_sb_info *sbi, nid_t ino, block_t blkaddr);
 
 /*
  * checkpoint.c
@@ -3595,8 +3596,6 @@ static inline bool f2fs_force_buffered_io(struct inode 
*inode,
 
if (f2fs_post_read_required(inode))
return true;
-   if (sbi->s_ndevs)
-   return true;
/*
 * for blkzoned device, fallback direct IO to buffered IO, so
 * all IOs can be serialized by log-structured write.
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index a361d61..eec5db1 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -3050,18 +3050,17 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, 
struct page *page,
up_read(&SM_I(sbi)->curseg_lock);
 }
 
-static void update_device_state(struct f2fs_io_info *fio)
+void update_device_state(struct f2fs_sb_info *sbi, nid_t ino, block_t blkaddr)
 {
-   struct f2fs_sb_info *sbi = fio->sbi;
unsigned int devidx;
 
if (!sbi->s_ndevs)
return;
 
-   devidx = f2fs_target_device_index(sbi, fio->new_blkaddr);
+   devidx = f2fs_target_device_index(sbi, blkaddr);
 
/* update device state for fsync */
-   f2fs_set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO);
+   f2fs_set_dirty_device(sbi, ino, devidx, FLUSH_INO);
 
/* update device state for checkpoint */
if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) {
@@ -3092,7 +3091,7 @@ static void do_write_page(struct f2fs_summary *sum, 
struct f2fs_io_info *fio)
goto reallocate;
}
 
-   update_device_state(fio);
+   update_device_state(fio->sbi, fio->ino, fio->new_blkaddr);
 
if (keep_order)
up_read(&fio->sbi->io_order_lock);
@@ -3168,7 

[PATCH 2/2] fs: support direct IO in a multi-device FS

2019-01-02 Thread sunqiuyang
From: Qiuyang Sun 

Don't use the bdev pointer in struct buffer_head for dio_bio_alloc(),
since it may have been changed to another device of the FS by
get_more_blocks().

Signed-off-by: Qiuyang Sun 
---
 fs/direct-io.c | 19 ++-
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 722d17c..6cd6029 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -99,6 +99,7 @@ struct dio_submit {
unsigned cur_page_offset;   /* Offset into it, in bytes */
unsigned cur_page_len;  /* Nr of bytes at cur_page_offset */
sector_t cur_page_block;/* Where it starts */
+   struct block_device *cur_page_dev;
loff_t cur_page_fs_offset;  /* Offset in file */
 
struct iov_iter *iter;
@@ -729,7 +730,7 @@ static int get_more_blocks(struct dio *dio, struct 
dio_submit *sdio,
  * There is no bio.  Make one now.
  */
 static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio,
-   sector_t start_sector, struct buffer_head *map_bh)
+   sector_t start_sector)
 {
sector_t sector;
int ret, nr_pages;
@@ -740,7 +741,7 @@ static inline int dio_new_bio(struct dio *dio, struct 
dio_submit *sdio,
sector = start_sector << (sdio->blkbits - 9);
nr_pages = min(sdio->pages_in_io, BIO_MAX_PAGES);
BUG_ON(nr_pages <= 0);
-   dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages);
+   dio_bio_alloc(dio, sdio, sdio->cur_page_dev, sector, nr_pages);
sdio->boundary = 0;
 out:
return ret;
@@ -785,8 +786,7 @@ static inline int dio_bio_add_page(struct dio_submit *sdio)
  * The caller of this function is responsible for removing cur_page from the
  * dio, and for dropping the refcount which came from that presence.
  */
-static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio,
-   struct buffer_head *map_bh)
+static inline int dio_send_cur_page(struct dio *dio, struct dio_submit *sdio)
 {
int ret = 0;
 
@@ -815,14 +815,14 @@ static inline int dio_send_cur_page(struct dio *dio, 
struct dio_submit *sdio,
}
 
if (sdio->bio == NULL) {
-   ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
+   ret = dio_new_bio(dio, sdio, sdio->cur_page_block);
if (ret)
goto out;
}
 
if (dio_bio_add_page(sdio) != 0) {
dio_bio_submit(dio, sdio);
-   ret = dio_new_bio(dio, sdio, sdio->cur_page_block, map_bh);
+   ret = dio_new_bio(dio, sdio, sdio->cur_page_block);
if (ret == 0) {
ret = dio_bio_add_page(sdio);
BUG_ON(ret != 0);
@@ -878,7 +878,7 @@ static inline int dio_send_cur_page(struct dio *dio, struct 
dio_submit *sdio,
 * If there's a deferred page already there then send it.
 */
if (sdio->cur_page) {
-   ret = dio_send_cur_page(dio, sdio, map_bh);
+   ret = dio_send_cur_page(dio, sdio);
put_page(sdio->cur_page);
sdio->cur_page = NULL;
if (ret)
@@ -890,6 +890,7 @@ static inline int dio_send_cur_page(struct dio *dio, struct 
dio_submit *sdio,
sdio->cur_page_offset = offset;
sdio->cur_page_len = len;
sdio->cur_page_block = blocknr;
+   sdio->cur_page_dev = map_bh->b_bdev;
sdio->cur_page_fs_offset = sdio->block_in_file << sdio->blkbits;
 out:
/*
@@ -897,7 +898,7 @@ static inline int dio_send_cur_page(struct dio *dio, struct 
dio_submit *sdio,
 * avoid metadata seeks.
 */
if (sdio->boundary) {
-   ret = dio_send_cur_page(dio, sdio, map_bh);
+   ret = dio_send_cur_page(dio, sdio);
if (sdio->bio)
dio_bio_submit(dio, sdio);
put_page(sdio->cur_page);
@@ -1348,7 +1349,7 @@ static inline int drop_refcount(struct dio *dio)
if (sdio.cur_page) {
ssize_t ret2;
 
-   ret2 = dio_send_cur_page(dio, &sdio, &map_bh);
+   ret2 = dio_send_cur_page(dio, &sdio);
if (retval == 0)
retval = ret2;
put_page(sdio.cur_page);
-- 
1.8.3.1



[PATCH 1/2] f2fs-dev: support multi-device direct IO

2019-01-02 Thread sunqiuyang
From: Qiuyang Sun 

The physical blocks in struct f2fs_map_blocks must be on the same device.

Signed-off-by: Qiuyang Sun 
---
 fs/f2fs/data.c | 20 +++-
 fs/f2fs/f2fs.h |  2 --
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index e5cd3fd..7a6369e 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1076,6 +1076,8 @@ int f2fs_map_blocks(struct inode *inode, struct 
f2fs_map_blocks *map,
struct extent_info ei = {0,0,0};
block_t blkaddr;
unsigned int start_pgofs;
+   int devi;
+   block_t end_blk;
 
if (!maxblocks)
return 0;
@@ -1207,8 +1209,15 @@ int f2fs_map_blocks(struct inode *inode, struct 
f2fs_map_blocks *map,
 
map->m_pblk = blkaddr;
map->m_len = 1;
+
+   if (sbi->s_ndevs && blkaddr != NEW_ADDR &&
+   blkaddr != NULL_ADDR) {
+   devi = f2fs_target_device_index(sbi, blkaddr);
+   end_blk = FDEV(devi).end_blk;
+   }
} else if ((map->m_pblk != NEW_ADDR &&
-   blkaddr == (map->m_pblk + ofs)) ||
+   blkaddr == (map->m_pblk + ofs) &&
+   (!sbi->s_ndevs || blkaddr <= end_blk)) ||
(map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) ||
flag == F2FS_GET_BLOCK_PRE_DIO) {
ofs++;
@@ -1322,6 +1331,8 @@ static int __get_data_block(struct inode *inode, sector_t 
iblock,
 {
struct f2fs_map_blocks map;
int err;
+   struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+   int devi;
 
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
@@ -1333,6 +1344,13 @@ static int __get_data_block(struct inode *inode, 
sector_t iblock,
err = f2fs_map_blocks(inode, &map, create, flag);
if (!err) {
map_bh(bh, inode->i_sb, map.m_pblk);
+   if (sbi->s_ndevs) {
+   devi = f2fs_target_device_index(sbi, map.m_pblk);
+   if (devi) {
+   bh->b_bdev = FDEV(devi).bdev;
+   bh->b_blocknr -= FDEV(devi).start_blk;
+   }
+   }
bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags;
bh->b_size = (u64)map.m_len << inode->i_blkbits;
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index eeede26..b311471 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -3595,8 +3595,6 @@ static inline bool f2fs_force_buffered_io(struct inode 
*inode,
 
if (f2fs_post_read_required(inode))
return true;
-   if (sbi->s_ndevs)
-   return true;
/*
 * for blkzoned device, fallback direct IO to buffered IO, so
 * all IOs can be serialized by log-structured write.
-- 
1.8.3.1
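
The device-boundary condition above can be pictured with a small standalone
sketch (structure and field names mirror struct f2fs_dev_info and
f2fs_target_device_index(); the block ranges are made-up examples). Each
device owns a contiguous range of the FS-wide block address space, so a
mapping may only be extended while the next block still falls inside the
same device's [start_blk, end_blk] range:

#include <stdint.h>
#include <stdio.h>

typedef uint32_t block_t;

struct dev_info {
	block_t start_blk;	/* first FS-wide block on this device */
	block_t end_blk;	/* last FS-wide block on this device */
};

static int target_device_index(const struct dev_info *devs, int ndevs,
			       block_t blkaddr)
{
	for (int i = 0; i < ndevs; i++)
		if (blkaddr >= devs[i].start_blk && blkaddr <= devs[i].end_blk)
			return i;
	return 0;
}

int main(void)
{
	struct dev_info devs[2] = {
		{ .start_blk = 0,    .end_blk = 4095 },
		{ .start_blk = 4096, .end_blk = 8191 },
	};
	block_t blkaddr = 4095;
	int devi = target_device_index(devs, 2, blkaddr);

	/* extending the mapping past end_blk would cross into device 1 */
	printf("block %u -> dev %d, can extend: %d\n", blkaddr, devi,
	       blkaddr + 1 <= devs[devi].end_blk);
	return 0;
}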



[PATCH] f2fs: fix block address for __check_sit_bitmap

2018-12-18 Thread sunqiuyang
From: Qiuyang Sun 

We should use lstart (the logical start address) instead of start (the
in-device address) here. This fixes a bug in multi-device scenarios.

Signed-off-by: Qiuyang Sun 
---
 fs/f2fs/segment.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 204d31e..1a11e7e 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1150,7 +1150,7 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi,
list_move_tail(&dc->list, wait_list);
 
/* sanity check on discard range */
-   __check_sit_bitmap(sbi, start, start + len);
+   __check_sit_bitmap(sbi, lstart, lstart + len);
 
bio->bi_private = dc;
bio->bi_end_io = f2fs_submit_discard_endio;
-- 
1.8.3.1
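
A small worked example of why lstart is the right argument (the relation
start = lstart - FDEV(devi).start_blk is assumed from how f2fs maps FS-wide
addresses to in-device ones; the numbers are invented):

#include <stdint.h>
#include <stdio.h>

typedef uint32_t block_t;

int main(void)
{
	block_t dev_start_blk = 4096;	/* FDEV(devi).start_blk, 2nd device */
	block_t lstart = 5000;		/* FS-wide (logical) discard address */
	block_t start = lstart - dev_start_blk;	/* in-device address: 904 */

	/* the SIT bitmap is indexed by FS-wide addresses, so the sanity
	 * check must be __check_sit_bitmap(sbi, lstart, lstart + len);
	 * passing start would test a completely different segment */
	printf("check SIT at block %u, not %u\n", lstart, start);
	return 0;
}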



[PATCH 1/1] f2fs: release locks before return in f2fs_ioc_gc_range()

2018-03-13 Thread sunqiuyang
From: Qiuyang Sun 

Currently, we leave the kernel with locks still held when the gc_range is
invalid: the early return skips the cleanup path. This patch fixes the bug
by branching to that path instead.

Signed-off-by: Qiuyang Sun 
---
 fs/f2fs/file.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index c4c27e6..ee88058 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -2059,8 +2059,10 @@ static int f2fs_ioc_gc_range(struct file *filp, unsigned 
long arg)
return ret;
 
end = range.start + range.len;
-   if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi))
-   return -EINVAL;
+   if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) {
+   ret = -EINVAL;
+   goto out;
+   }
 do_more:
if (!range.sync) {
if (!mutex_trylock(&sbi->gc_mutex)) {
-- 
2.5.0
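
The shape of the fix, as a generic standalone sketch (not the literal f2fs
code): once something has been acquired, error paths must branch to a
cleanup label instead of returning directly:

#include <errno.h>
#include <stdio.h>

static int held;			/* stand-in for the held lock/ref */

static int take(void)     { held = 1; return 0; }
static void release(void) { held = 0; }

static int op(int range_valid)
{
	int ret = take();

	if (ret)
		return ret;		/* nothing held yet: plain return ok */

	if (!range_valid) {
		ret = -EINVAL;		/* was: return -EINVAL, leaking it */
		goto out;
	}

	ret = 0;			/* ... the actual work ... */
out:
	release();
	return ret;
}

int main(void)
{
	op(0);
	printf("held after invalid-range call: %d\n", held);	/* now 0 */
	return 0;
}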



[PATCH 1/1] ext4: remove redundant assignment in ext4_iomap_begin()

2017-11-27 Thread sunqiuyang
From: Qiuyang Sun 

This line will not change the value of map.m_lblk in any case: when
es.es_lblk < map.m_lblk, offs is exactly map.m_lblk - es.es_lblk, so
es.es_lblk + offs == map.m_lblk; otherwise offs stays 0 and
es.es_lblk == map.m_lblk, since the looked-up extent contains map.m_lblk.

Signed-off-by: Qiuyang Sun 
---
 fs/ext4/inode.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9f836e2..d4a42b1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3443,7 +3443,6 @@ static int ext4_iomap_begin(struct inode *inode, loff_t 
offset, loff_t length,
 
if (es.es_lblk < map.m_lblk)
offs = map.m_lblk - es.es_lblk;
-   map.m_lblk = es.es_lblk + offs;
map.m_len = es.es_len - offs;
delalloc = true;
}
-- 
2.5.0



[PATCH RFC 1/1] f2fs: add per-device superblocks

2017-11-08 Thread sunqiuyang
From: Qiuyang Sun 

Currently, a multi-device F2FS only has superblocks written on its first
device, but not the others. Thus, we cannot tell whether a single device is
part of an F2FS by reading that device alone, which may be unsafe in
scenarios like transferring devices between computer systems. This patch
embeds per-device superblock sections into the main area.

- In the main area, we reserve the start section of each device for extra
  copies of the superblock on its 0th and 1st blocks. All blocks in such
  segments are counted as valid, preventing them from being allocated for
  other uses. These segments cannot be GCed.
- Reserve a bit in f2fs_super_block::feature to represent whether
  superblock sections exist on all devices. This feature is backward
  compatible via a mount option "-o per_dev_sb": any valid data or node
  blocks in the target sections will be moved out by calling
  do_garbage_collect(); if this succeeds, new superblock sections will be
  built, otherwise the mount fails.
- With this feature, an F2FS can be mounted from any of its devices.

TODO:
- f2fs-tools:
  - mkfs.f2fs: allow building per-device superblocks offline when
formatting a F2FS;
  - fsck.f2fs: treat the per-device superblock segments correctly.
- Identify each device in a F2FS by its UUID in struct f2fs_super_block,
  instead of the path.

Signed-off-by: Qiuyang Sun 
---
 fs/f2fs/f2fs.h|  18 
 fs/f2fs/gc.c  |   7 ++-
 fs/f2fs/gc.h  |   5 --
 fs/f2fs/segment.c | 102 +
 fs/f2fs/segment.h |   1 +
 fs/f2fs/super.c   | 133 +-
 6 files changed, 248 insertions(+), 18 deletions(-)

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index e1d3a94..f994dab 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -96,6 +96,7 @@ extern char *fault_name[FAULT_MAX];
 #define F2FS_MOUNT_PRJQUOTA0x0020
 #define F2FS_MOUNT_QUOTA   0x0040
 #define F2FS_MOUNT_INLINE_XATTR_SIZE   0x0080
+#define F2FS_MOUNT_PER_DEV_SB  0x0100
 
 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option)
 #define set_opt(sbi, option)   ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -123,6 +124,7 @@ struct f2fs_mount_info {
 #define F2FS_FEATURE_INODE_CHKSUM  0x0020
 #define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR 0x0040
 #define F2FS_FEATURE_QUOTA_INO 0x0080
+#define F2FS_FEATURE_PER_DEV_SB0x0100
 
 #define F2FS_HAS_FEATURE(sb, mask) \
((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0)
@@ -969,6 +971,7 @@ struct f2fs_dev_info {
unsigned int nr_blkz;   /* Total number of zones */
u8 *blkz_type;  /* Array of zones type */
 #endif
+   bool sb_valid[2];   /* Validity of two per-device superblocks */
 };
 
 enum inode_type {
@@ -2648,6 +2651,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi);
 void destroy_segment_manager(struct f2fs_sb_info *sbi);
 int __init create_segment_manager_caches(void);
 void destroy_segment_manager_caches(void);
+int build_device_sb_sections(struct f2fs_sb_info *sbi);
+void set_per_device_sb_sentries(struct f2fs_sb_info *sbi);
 
 /*
  * checkpoint.c
@@ -2732,12 +2737,20 @@ int f2fs_migrate_page(struct address_space *mapping, 
struct page *newpage,
 /*
  * gc.c
  */
+struct gc_inode_list {
+   struct list_head ilist;
+   struct radix_tree_root iroot;
+};
+
 int start_gc_thread(struct f2fs_sb_info *sbi);
 void stop_gc_thread(struct f2fs_sb_info *sbi);
 block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode);
 int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background,
unsigned int segno);
 void build_gc_manager(struct f2fs_sb_info *sbi);
+void put_gc_inode(struct gc_inode_list *gc_list);
+int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int start_segno,
+   struct gc_inode_list *gc_list, int gc_type);
 
 /*
  * recovery.c
@@ -3083,6 +3096,11 @@ static inline int f2fs_sb_has_quota_ino(struct 
super_block *sb)
return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_QUOTA_INO);
 }
 
+static inline int f2fs_sb_has_per_device_sb(struct super_block *sb)
+{
+   return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_PER_DEV_SB);
+}
+
 #ifdef CONFIG_BLK_DEV_ZONED
 static inline int get_blkz_type(struct f2fs_sb_info *sbi,
struct block_device *bdev, block_t blkaddr)
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index c7b1d70..2f665e2 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -438,7 +438,7 @@ static void add_gc_inode(struct gc_inode_list *gc_list, 
struct inode *inode)
list_add_tail(&new_ie->list, &gc_list->ilist);
 }
 
-static void put_gc_inode(struct gc_inode_list *gc_list)
+void put_gc_inode(struct gc_inode_list *gc_list)
 {
struct inode_entry *ie, *next_ie;
list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) {
@@ -90

[PATCH 1/1] f2fs: merge equivalent flags F2FS_GET_BLOCK_[READ|DIO]

2017-08-09 Thread sunqiuyang
From: Qiuyang Sun 

Currently, the two flags F2FS_GET_BLOCK_[READ|DIO] are totally equivalent
and can be used interchangeably in all scenarios they are involved in.
Neither of the flags is referenced in f2fs_map_blocks(), making them both 
the default case. To remove the ambiguity, this patch merges both flags
into F2FS_GET_BLOCK_DEFAULT, and introduces an enum for all distinct flags.

Signed-off-by: Qiuyang Sun 
---
 fs/f2fs/data.c |  4 ++--
 fs/f2fs/f2fs.h | 13 +++--
 fs/f2fs/file.c |  4 ++--
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index c43262d..67da4f6 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1044,7 +1044,7 @@ static int get_data_block_dio(struct inode *inode, 
sector_t iblock,
struct buffer_head *bh_result, int create)
 {
return __get_data_block(inode, iblock, bh_result, create,
-   F2FS_GET_BLOCK_DIO, NULL);
+   F2FS_GET_BLOCK_DEFAULT, NULL);
 }
 
 static int get_data_block_bmap(struct inode *inode, sector_t iblock,
@@ -1244,7 +1244,7 @@ static int f2fs_mpage_readpages(struct address_space 
*mapping,
map.m_len = last_block - block_in_file;
 
if (f2fs_map_blocks(inode, &map, 0,
-   F2FS_GET_BLOCK_READ))
+   F2FS_GET_BLOCK_DEFAULT))
goto set_error_page;
}
 got_it:
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index cea329f..2f20b6b 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -514,12 +514,13 @@ struct f2fs_map_blocks {
 };
 
 /* for flag in get_data_block */
-#define F2FS_GET_BLOCK_READ0
-#define F2FS_GET_BLOCK_DIO 1
-#define F2FS_GET_BLOCK_FIEMAP  2
-#define F2FS_GET_BLOCK_BMAP3
-#define F2FS_GET_BLOCK_PRE_DIO 4
-#define F2FS_GET_BLOCK_PRE_AIO 5
+enum {
+   F2FS_GET_BLOCK_DEFAULT,
+   F2FS_GET_BLOCK_FIEMAP,
+   F2FS_GET_BLOCK_BMAP,
+   F2FS_GET_BLOCK_PRE_DIO,
+   F2FS_GET_BLOCK_PRE_AIO,
+};
 
 /*
  * i_advise uses FADVISE_XXX_BIT. We can add additional hints later.
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index e2b33b8..3eebd49 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -2074,7 +2074,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
 */
while (map.m_lblk < pg_end) {
map.m_len = pg_end - map.m_lblk;
-   err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
+   err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT);
if (err)
goto out;
 
@@ -2116,7 +2116,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
 
 do_map:
map.m_len = pg_end - map.m_lblk;
-   err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
+   err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT);
if (err)
goto clear_out;
 
-- 
1.8.3.1



[PATCH 1/1] f2fs: merge equivalent flags F2FS_GET_BLOCK_[READ|DIO]

2017-08-08 Thread sunqiuyang
From: Qiuyang Sun 

Currently, the two flags F2FS_GET_BLOCK_[READ|DIO] are totally equivalent
and can be used interchangeably in all scenarios they are involved in. This
patch deletes F2FS_GET_BLOCK_READ and uses F2FS_GET_BLOCK_DIO instead.

Signed-off-by: Qiuyang Sun 
---
 fs/f2fs/data.c | 2 +-
 fs/f2fs/f2fs.h | 1 -
 fs/f2fs/file.c | 4 ++--
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index c43262d..e0a59bf 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1244,7 +1244,7 @@ static int f2fs_mpage_readpages(struct address_space 
*mapping,
map.m_len = last_block - block_in_file;
 
if (f2fs_map_blocks(inode, &map, 0,
-   F2FS_GET_BLOCK_READ))
+   F2FS_GET_BLOCK_DIO))
goto set_error_page;
}
 got_it:
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index cea329f..0593ca7 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -514,7 +514,6 @@ struct f2fs_map_blocks {
 };
 
 /* for flag in get_data_block */
-#define F2FS_GET_BLOCK_READ0
 #define F2FS_GET_BLOCK_DIO 1
 #define F2FS_GET_BLOCK_FIEMAP  2
 #define F2FS_GET_BLOCK_BMAP3
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index e2b33b8..8271cb5 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -2074,7 +2074,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
 */
while (map.m_lblk < pg_end) {
map.m_len = pg_end - map.m_lblk;
-   err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
+   err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DIO);
if (err)
goto out;
 
@@ -2116,7 +2116,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
 
 do_map:
map.m_len = pg_end - map.m_lblk;
-   err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
+   err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DIO);
if (err)
goto clear_out;
 
-- 
1.8.3.1



RE: [PATCH v8 1/1] f2fs: dax: implement direct access

2017-07-25 Thread sunqiuyang
Hi, 

Considering the current interfaces of F2FS and EXT4, my thought is that we can 
define a generic user-modifiable flag FS_DAX_FL, which can be included in the 
i_flags field of [f2fs | ext4]_inode_info. Thus, DAX can be enabled in either 
of the two ways below: 

1) mount the FS with a "dax" option, so that all files created will have the 
flag S_DAX set in the VFS inode, and the flag FS_DAX_FL set in [f2fs | 
ext4]_inode_info, by default.

2) mount the FS without "dax", and enable DAX per-inode from 
f2fs_ioctl_setflags() => f2fs_set_inode_flags()
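
As a sketch of the idea only (FS_DAX_FL does not exist yet, and the flag
values below are illustrative placeholders rather than real kernel
constants), the translation both paths would funnel through could look like
this user-space model of f2fs_set_inode_flags():

#include <stdio.h>

#define FS_DAX_FL	0x02000000	/* assumed new user-modifiable bit */
#define S_DAX		0x00002000	/* placeholder for the VFS flag */

/* model of the translation: per-inode FS flag -> VFS inode flag */
static unsigned int vfs_flags_from_fs_flags(unsigned int i_flags)
{
	unsigned int new_fl = 0;

	if (i_flags & FS_DAX_FL)
		new_fl |= S_DAX;
	/* ... existing FS_SYNC_FL etc. translation elided ... */
	return new_fl;
}

int main(void)
{
	printf("S_DAX set: %d\n",
	       !!(vfs_flags_from_fs_flags(FS_DAX_FL) & S_DAX));
	return 0;
}

With this, mounting with "dax" would set FS_DAX_FL by default at inode
creation, while f2fs_ioctl_setflags() would toggle it per inode.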
 
Thanks,


From: Jaegeuk Kim [jaeg...@kernel.org]
Sent: Wednesday, July 26, 2017 10:16
To: Dan Williams
Cc: sunqiuyang; Linux Kernel Mailing List; linux-fsdevel; 
linux-f2fs-de...@lists.sourceforge.net; linux-nvd...@lists.01.org
Subject: Re: [PATCH v8 1/1] f2fs: dax: implement direct access

Hi Dan,

On 07/25, Dan Williams wrote:
> [ adding linux-nvdimm ]
>
> On Thu, Jul 20, 2017 at 5:10 AM, sunqiuyang  wrote:
> > From: Qiuyang Sun 
> >
> > This patch implements Direct Access (DAX) in F2FS, including:
> >  - a mount option to choose whether to enable DAX or not
>
> We're in the process of walking back and potentially deprecating the
> use of the dax mount option for xfs and ext4 since dax can have
> negative performance implications if page cache memory happens to be
> faster than pmem. It should be limited to applications that
> specifically want the semantic, not globally enabled for the entire
> mount. xfs went ahead and added the XFS_DIFLAG2_DAX inode flag for
> per-inode enabling of dax.

Thank you so much for pointing this out. So, is there a plan to define a
generic inode flag to enable dax via inode_set_flags()? Or does each
filesystem need to handle it individually, like xfs does?

>
> I'm wondering if any new filesystem that adds dax support at this
> point should do so with inode flags and not a mount option?

Anyway, in that case, I have to postpone merging this patch for a while.

Thanks,


[PATCH v8 1/1] f2fs: dax: implement direct access

2017-07-20 Thread sunqiuyang
From: Qiuyang Sun 

This patch implements Direct Access (DAX) in F2FS, including:
 - a mount option to choose whether to enable DAX or not
 - read/write and mmap of regular files in the DAX way
 - zero-out of unaligned partial blocks in the DAX way
 - garbage collection of DAX files, by mapping both old and new physical
   addresses of a data page into memory and copying data between them directly
 - incompatibility of DAX with inline data, atomic or volatile write, 
   collapse|insert_range, etc.

Signed-off-by: Qiuyang Sun 
---
Changelog v7 -> v8:
 - Introduce the macro f2fs_dax_file() to tell whether a file is DAX,
   covering both the CONFIG_FS_DAX=y and =n cases
 - Return -ENOTSUPP when an operation does not support DAX
 - In f2fs_iomap_begin(), convert the inline data of an inode (if any) 
   before mapping blocks
 - Minor cleanups
---
 Documentation/filesystems/f2fs.txt |   2 +
 fs/f2fs/data.c | 132 +-
 fs/f2fs/f2fs.h |  15 +++
 fs/f2fs/file.c | 183 -
 fs/f2fs/gc.c   | 103 -
 fs/f2fs/inline.c   |   3 +
 fs/f2fs/inode.c|   8 +-
 fs/f2fs/namei.c|   5 +
 fs/f2fs/super.c|  15 +++
 9 files changed, 454 insertions(+), 12 deletions(-)

diff --git a/Documentation/filesystems/f2fs.txt 
b/Documentation/filesystems/f2fs.txt
index 273ccb2..c86c421 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -164,6 +164,8 @@ io_bits=%u Set the bit size of write IO 
requests. It should be set
with "mode=lfs".
 usrquota   Enable plain user disk quota accounting.
 grpquota   Enable plain group disk quota accounting.
+dax        Use direct access (no page cache). See
+   Documentation/filesystems/dax.txt.
 
 

 DEBUGFS ENTRIES
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 87c1f41..4eb4b76 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -910,6 +910,15 @@ int f2fs_map_blocks(struct inode *inode, struct 
f2fs_map_blocks *map,
err = -EIO;
goto sync_out;
}
+   /*
+* If newly allocated blocks are to be zeroed out later,
+* a single f2fs_map_blocks must not contain both old
+* and new blocks at the same time.
+*/
+   if (flag == F2FS_GET_BLOCK_ZERO
+   && (map->m_flags & F2FS_MAP_MAPPED)
+   && !(map->m_flags & F2FS_MAP_NEW))
+   goto sync_out;
if (flag == F2FS_GET_BLOCK_PRE_AIO) {
if (blkaddr == NULL_ADDR) {
prealloc++;
@@ -938,6 +947,8 @@ int f2fs_map_blocks(struct inode *inode, struct 
f2fs_map_blocks *map,
blkaddr != NEW_ADDR)
goto sync_out;
}
+   } else if (flag == F2FS_GET_BLOCK_ZERO && map->m_flags & F2FS_MAP_NEW) {
+   goto sync_out;
}
 
if (flag == F2FS_GET_BLOCK_PRE_AIO)
@@ -996,6 +1007,12 @@ int f2fs_map_blocks(struct inode *inode, struct 
f2fs_map_blocks *map,
goto next_dnode;
 
 sync_out:
+   if (flag == F2FS_GET_BLOCK_ZERO && map->m_flags & F2FS_MAP_NEW) {
+   clean_bdev_aliases(inode->i_sb->s_bdev,
+   map->m_pblk, map->m_len);
+   err = sb_issue_zeroout(inode->i_sb, map->m_pblk,
+   map->m_len, GFP_NOFS);
+   }
f2fs_put_dnode(&dn);
 unlock_out:
if (create) {
@@ -1808,16 +1825,19 @@ static int f2fs_write_data_pages(struct address_space 
*mapping,
return 0;
 }
 
-static void f2fs_write_failed(struct address_space *mapping, loff_t to)
+static void f2fs_write_failed(struct address_space *mapping, loff_t to,
+   bool lock)
 {
struct inode *inode = mapping->host;
loff_t i_size = i_size_read(inode);
 
if (to > i_size) {
-   down_write(&F2FS_I(inode)->i_mmap_sem);
+   if (lock)
+   down_write(&F2FS_I(inode)->i_mmap_sem);
truncate_pagecache(inode, i_size);
truncate_blocks(inode, i_size, true);
-   up_write(&F2FS_I(inode)->i_mmap_sem);
+   if (lock)
+   up_write(&F2FS_I(inode)->i_mmap_sem);
}
 }
 
@@ -2000,7 +2020,7 @@ static int f2fs_write_begin(struct file *file, struct 
address_space *mapping,
 
 fail:
f2fs_put_page

[PATCH v7 1/1] f2fs: dax: implement direct access

2017-07-17 Thread sunqiuyang
From: Qiuyang Sun 

This patch implements Direct Access (DAX) in F2FS, including:
- a mount option to choose whether to enable DAX or not
- read/write and mmap of regular files in the DAX way
- zero-out of unaligned partial blocks in the DAX way
- garbage collection of DAX files, by mapping both old and new physical
  addresses of a data page into memory and copying data between them directly
- incompatibility of DAX with inline data, atomic or volatile write, etc.

Signed-off-by: Qiuyang Sun 
---
Changelog v6 -> v7:
- Document the mount option "dax" for this feature in f2fs.txt
- Minor cleanup in dax_move_data_page()

---
 Documentation/filesystems/f2fs.txt |   2 +
 fs/f2fs/data.c | 132 +++--
 fs/f2fs/f2fs.h |   9 ++
 fs/f2fs/file.c | 192 -
 fs/f2fs/gc.c   | 105 +++-
 fs/f2fs/inline.c   |   4 +
 fs/f2fs/inode.c|   8 +-
 fs/f2fs/namei.c|   5 +
 fs/f2fs/super.c|  15 +++
 9 files changed, 459 insertions(+), 13 deletions(-)

diff --git a/Documentation/filesystems/f2fs.txt 
b/Documentation/filesystems/f2fs.txt
index 273ccb2..c86c421 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -164,6 +164,8 @@ io_bits=%u Set the bit size of write IO 
requests. It should be set
with "mode=lfs".
 usrquota   Enable plain user disk quota accounting.
 grpquota   Enable plain group disk quota accounting.
+dax        Use direct access (no page cache). See
+   Documentation/filesystems/dax.txt.
 
 

 DEBUGFS ENTRIES
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 87c1f41..26b908a 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -910,6 +910,15 @@ int f2fs_map_blocks(struct inode *inode, struct 
f2fs_map_blocks *map,
err = -EIO;
goto sync_out;
}
+   /*
+* If newly allocated blocks are to be zeroed out later,
+* a single f2fs_map_blocks must not contain both old
+* and new blocks at the same time.
+*/
+   if (flag == F2FS_GET_BLOCK_ZERO
+   && (map->m_flags & F2FS_MAP_MAPPED)
+   && !(map->m_flags & F2FS_MAP_NEW))
+   goto sync_out;
if (flag == F2FS_GET_BLOCK_PRE_AIO) {
if (blkaddr == NULL_ADDR) {
prealloc++;
@@ -938,7 +947,8 @@ int f2fs_map_blocks(struct inode *inode, struct 
f2fs_map_blocks *map,
blkaddr != NEW_ADDR)
goto sync_out;
}
-   }
+   } else if (flag == F2FS_GET_BLOCK_ZERO && map->m_flags & F2FS_MAP_NEW)
+   goto sync_out;
 
if (flag == F2FS_GET_BLOCK_PRE_AIO)
goto skip;
@@ -996,6 +1006,12 @@ int f2fs_map_blocks(struct inode *inode, struct 
f2fs_map_blocks *map,
goto next_dnode;
 
 sync_out:
+   if (flag == F2FS_GET_BLOCK_ZERO && map->m_flags & F2FS_MAP_NEW) {
+   clean_bdev_aliases(inode->i_sb->s_bdev,
+   map->m_pblk, map->m_len);
+   err = sb_issue_zeroout(inode->i_sb, map->m_pblk,
+   map->m_len, GFP_NOFS);
+   }
f2fs_put_dnode(&dn);
 unlock_out:
if (create) {
@@ -1808,16 +1824,19 @@ static int f2fs_write_data_pages(struct address_space *mapping,
return 0;
 }
 
-static void f2fs_write_failed(struct address_space *mapping, loff_t to)
+static void f2fs_write_failed(struct address_space *mapping, loff_t to,
+   bool lock)
 {
struct inode *inode = mapping->host;
loff_t i_size = i_size_read(inode);
 
if (to > i_size) {
-   down_write(&F2FS_I(inode)->i_mmap_sem);
+   if (lock)
+   down_write(&F2FS_I(inode)->i_mmap_sem);
truncate_pagecache(inode, i_size);
truncate_blocks(inode, i_size, true);
-   up_write(&F2FS_I(inode)->i_mmap_sem);
+   if (lock)
+   up_write(&F2FS_I(inode)->i_mmap_sem);
}
 }
 
@@ -2000,7 +2019,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
 
 fail:
f2fs_put_page(page, 1);
-   f2fs_write_failed(mapping, pos + len);
+   f2fs_write_failed(mapping, pos + len, true);
return err;
 }
 
@@ -2077,7 +2096,7 @@ static ssize_t f2fs_direct_IO(struc

[PATCH v6 1/1] f2fs: dax: implement direct access

2017-07-12 Thread sunqiuyang
From: Qiuyang Sun 

This patch implements Direct Access (DAX) in F2FS, including:
- a mount option to choose whether to enable DAX or not
- read/write and mmap of regular files in the DAX way
- zero-out of unaligned partial blocks in the DAX way
- garbage collection of DAX files, by mapping both old and new physical
  addresses of a data page into memory and copying data between them directly
- incompatibility of DAX with inline data, atomic or volatile write, etc.

Signed-off-by: Qiuyang Sun 
---
Changelog v5 -> v6:
- In f2fs_map_blocks(), optimize the separation of newly allocated and old
  mapped blocks for the flag F2FS_GET_BLOCK_ZERO, and check the return
  value of zeroout;
- In f2fs_iomap_begin(), cover the truncation of failed allocation with the
  rwsemaphore i_mmap_sem when necessary;
- Optimize the order of exception handling in dax_move_data_page().
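
Point (b) in sketch form (assumed shape of the fix, not the exact hunk):
when allocation fails after i_size was transiently extended, the rollback
truncation is covered by i_mmap_sem so it cannot race with a page fault;
callers that already hold the semaphore skip taking it via the new "lock"
argument of f2fs_write_failed().

	ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_ZERO);
	if (i_size_read(inode) > original_i_size) {
		f2fs_i_size_write(inode, original_i_size);
		if (ret) {
			/* roll back under i_mmap_sem to fence off faults */
			down_write(&F2FS_I(inode)->i_mmap_sem);
			truncate_pagecache(inode, original_i_size);
			truncate_blocks(inode, original_i_size, true);
			up_write(&F2FS_I(inode)->i_mmap_sem);
		}
	}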

---
 fs/f2fs/data.c   | 132 --
 fs/f2fs/f2fs.h   |   9 +++
 fs/f2fs/file.c   | 192 ++-
 fs/f2fs/gc.c | 105 --
 fs/f2fs/inline.c |   4 ++
 fs/f2fs/inode.c  |   8 ++-
 fs/f2fs/namei.c  |   5 ++
 fs/f2fs/super.c  |  15 +
 8 files changed, 457 insertions(+), 13 deletions(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 87c1f41..26b908a 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -910,6 +910,15 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
err = -EIO;
goto sync_out;
}
+   /*
+* If newly allocated blocks are to be zeroed out later,
+* a single f2fs_map_blocks must not contain both old
+* and new blocks at the same time.
+*/
+   if (flag == F2FS_GET_BLOCK_ZERO
+   && (map->m_flags & F2FS_MAP_MAPPED)
+   && !(map->m_flags & F2FS_MAP_NEW))
+   goto sync_out;
if (flag == F2FS_GET_BLOCK_PRE_AIO) {
if (blkaddr == NULL_ADDR) {
prealloc++;
@@ -938,7 +947,8 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
blkaddr != NEW_ADDR)
goto sync_out;
}
-   }
+   } else if (flag == F2FS_GET_BLOCK_ZERO && map->m_flags & F2FS_MAP_NEW)
+   goto sync_out;
 
if (flag == F2FS_GET_BLOCK_PRE_AIO)
goto skip;
@@ -996,6 +1006,12 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
goto next_dnode;
 
 sync_out:
+   if (flag == F2FS_GET_BLOCK_ZERO && map->m_flags & F2FS_MAP_NEW) {
+   clean_bdev_aliases(inode->i_sb->s_bdev,
+   map->m_pblk, map->m_len);
+   err = sb_issue_zeroout(inode->i_sb, map->m_pblk,
+   map->m_len, GFP_NOFS);
+   }
f2fs_put_dnode(&dn);
 unlock_out:
if (create) {
@@ -1808,16 +1824,19 @@ static int f2fs_write_data_pages(struct address_space *mapping,
return 0;
 }
 
-static void f2fs_write_failed(struct address_space *mapping, loff_t to)
+static void f2fs_write_failed(struct address_space *mapping, loff_t to,
+   bool lock)
 {
struct inode *inode = mapping->host;
loff_t i_size = i_size_read(inode);
 
if (to > i_size) {
-   down_write(&F2FS_I(inode)->i_mmap_sem);
+   if (lock)
+   down_write(&F2FS_I(inode)->i_mmap_sem);
truncate_pagecache(inode, i_size);
truncate_blocks(inode, i_size, true);
-   up_write(&F2FS_I(inode)->i_mmap_sem);
+   if (lock)
+   up_write(&F2FS_I(inode)->i_mmap_sem);
}
 }
 
@@ -2000,7 +2019,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
 
 fail:
f2fs_put_page(page, 1);
-   f2fs_write_failed(mapping, pos + len);
+   f2fs_write_failed(mapping, pos + len, true);
return err;
 }
 
@@ -2077,7 +2096,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
if (err > 0)
set_inode_flag(inode, FI_UPDATE_WRITE);
else if (err < 0)
-   f2fs_write_failed(mapping, offset + count);
+   f2fs_write_failed(mapping, offset + count, true);
}
 
trace_f2fs_direct_IO_exit(inode, offset, count, rw, err);
@@ -2274,3 +2293,104 @@ int f2fs_migrate_page(struct address_space *mapping,
.migratepage= f2fs_migrate_page,
 #endif
 };
+
+#ifdef CONFIG_FS_DAX
+#include 
+#include 
+
+static int f2fs_io

[PATCH v5 1/1] f2fs: dax: implement direct access

2017-06-27 Thread sunqiuyang
From: Qiuyang Sun 

This patch implements Direct Access (DAX) in F2FS, including:
- a mount option to choose whether to enable DAX or not
- read/write and mmap of regular files in the DAX way
- zero-out of unaligned partial blocks in the DAX way
- garbage collection of DAX files, by mapping both old and new physical
  addresses of a data page into memory and copying data between them directly
- incompatibility of DAX with inline data, atomic or volatile write, etc.

Signed-off-by: Qiuyang Sun 
---
Changelog v4 -> v5:

In DAX write, to avoid stale data, the newly allocated blocks should be
zeroed out before writing to them. We introduce a new flag
F2FS_GET_BLOCK_ZERO for f2fs_map_blocks(), for which a single struct
f2fs_map_blocks can never contain old and new blocks at the same time, so
that zeroing out new blocks would not affect old ones. This flag is used in
the write path of f2fs_iomap_begin() for DAX I/O, replacing 
F2FS_GET_BLOCK_PRE_DIO in earlier versions.
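
Caller-side, the invariant can be pictured with this hedged sketch
(start_lblk and end_lblk are illustrative names): every extent returned
under F2FS_GET_BLOCK_ZERO is homogeneous, so zeroing a new extent can
never clobber existing data.

	map.m_lblk = start_lblk;
	map.m_next_pgofs = NULL;
	while (map.m_lblk <= end_lblk) {
		map.m_len = end_lblk - map.m_lblk + 1;
		err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_ZERO);
		if (err || !map.m_len)
			break;
		/*
		 * F2FS_MAP_NEW set:   the whole extent was just allocated
		 *                     and has been zeroed out at sync_out.
		 * F2FS_MAP_NEW clear: the whole extent is pre-existing data
		 *                     and was left untouched.
		 */
		map.m_lblk += map.m_len;
	}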

---
 fs/f2fs/data.c   | 115 +
 fs/f2fs/f2fs.h   |   9 +++
 fs/f2fs/file.c   | 192 ++-
 fs/f2fs/gc.c | 104 --
 fs/f2fs/inline.c |   4 ++
 fs/f2fs/inode.c  |   8 ++-
 fs/f2fs/namei.c  |   5 ++
 fs/f2fs/super.c  |  15 +
 8 files changed, 445 insertions(+), 7 deletions(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7d3af48..b28d97a 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -908,6 +908,15 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
err = -EIO;
goto sync_out;
}
+   /*
+* If newly allocated blocks are to be zeroed out later,
+* a single f2fs_map_blocks must not contain both old
+* and new blocks at the same time.
+*/
+   if (flag == F2FS_GET_BLOCK_ZERO
+   && (map->m_flags & F2FS_MAP_MAPPED)
+   && !(map->m_flags & F2FS_MAP_NEW))
+   goto sync_out;
if (flag == F2FS_GET_BLOCK_PRE_AIO) {
if (blkaddr == NULL_ADDR) {
prealloc++;
@@ -994,6 +1003,12 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
goto next_dnode;
 
 sync_out:
+   if (flag == F2FS_GET_BLOCK_ZERO && map->m_flags & F2FS_MAP_NEW) {
+   clean_bdev_aliases(inode->i_sb->s_bdev,
+   map->m_pblk, map->m_len);
+   sb_issue_zeroout(inode->i_sb, map->m_pblk,
+   map->m_len, GFP_NOFS);
+   }
f2fs_put_dnode(&dn);
 unlock_out:
if (create) {
@@ -2257,3 +2272,103 @@ int f2fs_migrate_page(struct address_space *mapping,
.migratepage= f2fs_migrate_page,
 #endif
 };
+
+#ifdef CONFIG_FS_DAX
+#include 
+#include 
+
+static int f2fs_iomap_begin(struct inode *inode, loff_t offset,
+   loff_t length, unsigned int flags, struct iomap *iomap)
+{
+   struct block_device *bdev;
+   unsigned long first_block = F2FS_BYTES_TO_BLK(offset);
+   unsigned long last_block = F2FS_BYTES_TO_BLK(offset + length - 1);
+   struct f2fs_map_blocks map;
+   int ret;
+
+   if (WARN_ON_ONCE(f2fs_has_inline_data(inode)))
+   return -ERANGE;
+
+   map.m_lblk = first_block;
+   map.m_len = last_block - first_block + 1;
+   map.m_next_pgofs = NULL;
+
+   if (!(flags & IOMAP_WRITE))
+   ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP);
+   else {
+   /* i_size should be kept here and changed later in f2fs_iomap_end */
+   loff_t original_i_size = i_size_read(inode);
+
+   ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_ZERO);
+   if (i_size_read(inode) > original_i_size) {
+   f2fs_i_size_write(inode, original_i_size);
+   if (ret) {
+   truncate_pagecache(inode, original_i_size);
+   truncate_blocks(inode, original_i_size, true);
+   }
+   }
+   }
+
+   if (ret)
+   return ret;
+
+   iomap->flags = 0;
+   bdev = inode->i_sb->s_bdev;
+   iomap->bdev = bdev;
+   if (blk_queue_dax(bdev->bd_queue))
+   iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+   else
+   iomap->dax_dev = NULL;
+   iomap->offset = F2FS_BLK_TO_BYTES((u64)first_block);
+
+   if (map.m_len == 0) {
+   iomap->type = IOMAP_HOLE;
+   iomap->blkno = IOMAP_NULL_BLOCK;
+   iomap->length = F2FS_BLKSIZE;
+   } else {
+   if (map.m_flags & F2FS_MAP_UNWRITTEN)
+

[PATCH v4 1/1] f2fs: dax: implement direct access

2017-06-15 Thread sunqiuyang
From: Qiuyang Sun 

This patch implements Direct Access (DAX) in F2FS.

Signed-off-by: Qiuyang Sun 
---

Changelog v3 -> v4:


  In f2fs_iomap_begin():
- For the write branch, if f2fs_map_blocks() returns error (probably due to
  ENOSPC), the allocated blocks beyond original_i_size are truncated.
- For the read branch, use F2FS_GET_BLOCK_FIEMAP instead of READ for
  f2fs_map_blocks(), so that contiguous unwritten blocks can be handled as
  a batch. Accordingly, check F2FS_MAP_UNWRITTEN before F2FS_MAP_MAPPED
  when setting iomap->type.

- Add a call of f2fs_update_time() in f2fs_iomap_end().


- In f2fs_move_file_range() and f2fs_ioc_defragment(), return -EINVAL for
  DAX files, as the current implementation uses page cache.
- Call f2fs_bug_on() in f2fs_ioc_commit_atomic_write() and 
  f2fs_ioc_(release|abort)_volatile_write() when the inode is DAX, which 
  should not happen.


- Optimize the logic in dax_move_data_page().


- Enable setting the S_DAX flag for an inode in f2fs_set_inode_flags().

The v4 patch is based on the f2fs dev-test branch.
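
The file.c hunk that wires reads up to the iomap infrastructure is cut
off below; a minimal sketch of its assumed shape (the function name here
is illustrative, not confirmed by the diff):

	static ssize_t f2fs_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
	{
		struct inode *inode = file_inode(iocb->ki_filp);
		ssize_t ret;

		inode_lock_shared(inode);
		/* dax_iomap_rw() walks the extent map via f2fs_iomap_ops */
		ret = dax_iomap_rw(iocb, to, &f2fs_iomap_ops);
		inode_unlock_shared(inode);

		file_accessed(iocb->ki_filp);
		return ret;
	}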

---
 fs/f2fs/data.c   | 100 +
 fs/f2fs/f2fs.h   |   8 +++
 fs/f2fs/file.c   | 192 ++-
 fs/f2fs/gc.c | 104 --
 fs/f2fs/inline.c |   4 ++
 fs/f2fs/inode.c  |   8 ++-
 fs/f2fs/namei.c  |   5 ++
 fs/f2fs/super.c  |  15 +
 8 files changed, 429 insertions(+), 7 deletions(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7d3af48..58efce0 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2257,3 +2257,103 @@ int f2fs_migrate_page(struct address_space *mapping,
.migratepage= f2fs_migrate_page,
 #endif
 };
+
+#ifdef CONFIG_FS_DAX
+#include 
+#include 
+
+static int f2fs_iomap_begin(struct inode *inode, loff_t offset,
+   loff_t length, unsigned int flags, struct iomap *iomap)
+{
+   struct block_device *bdev;
+   unsigned long first_block = F2FS_BYTES_TO_BLK(offset);
+   unsigned long last_block = F2FS_BYTES_TO_BLK(offset + length - 1);
+   struct f2fs_map_blocks map;
+   int ret;
+
+   if (WARN_ON_ONCE(f2fs_has_inline_data(inode)))
+   return -ERANGE;
+
+   map.m_lblk = first_block;
+   map.m_len = last_block - first_block + 1;
+   map.m_next_pgofs = NULL;
+
+   if (!(flags & IOMAP_WRITE))
+   ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP);
+   else {
+   /* i_size should be kept here and changed later in f2fs_iomap_end */
+   loff_t original_i_size = i_size_read(inode);
+
+   ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
+   if (i_size_read(inode) > original_i_size) {
+   f2fs_i_size_write(inode, original_i_size);
+   if (ret) {
+   truncate_pagecache(inode, original_i_size);
+   truncate_blocks(inode, original_i_size, true);
+   }
+   }
+   }
+
+   if (ret)
+   return ret;
+
+   iomap->flags = 0;
+   bdev = inode->i_sb->s_bdev;
+   iomap->bdev = bdev;
+   if (blk_queue_dax(bdev->bd_queue))
+   iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+   else
+   iomap->dax_dev = NULL;
+   iomap->offset = F2FS_BLK_TO_BYTES((u64)first_block);
+
+   if (map.m_len == 0) {
+   iomap->type = IOMAP_HOLE;
+   iomap->blkno = IOMAP_NULL_BLOCK;
+   iomap->length = F2FS_BLKSIZE;
+   } else {
+   if (map.m_flags & F2FS_MAP_UNWRITTEN) {
+   iomap->type = IOMAP_UNWRITTEN;
+   } else if (map.m_flags & F2FS_MAP_MAPPED) {
+   iomap->type = IOMAP_MAPPED;
+   } else {
+   WARN_ON_ONCE(1);
+   return -EIO;
+   }
+   iomap->blkno =
+   (sector_t)map.m_pblk << F2FS_LOG_SECTORS_PER_BLOCK;
+   iomap->length = F2FS_BLK_TO_BYTES((u64)map.m_len);
+   }
+
+   if (map.m_flags & F2FS_MAP_NEW)
+   iomap->flags |= IOMAP_F_NEW;
+   return 0;
+}
+
+static int f2fs_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+   ssize_t written, unsigned int flags, struct iomap *iomap)
+{
+   put_dax(iomap->dax_dev);
+   if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
+   return 0;
+
+   if (offset + written > i_size_read(inode))
+   f2fs_i_size_write(inode, offset + written);
+
+   if (iomap->offset + iomap->length >
+   ALIGN(i_size_read(inode), F2FS_BLKSIZE)) {
+   block_t written_blk = F2FS_BYTES_TO_BLK(offset + written);
+   block_t end_blk = F2FS_BYTES_TO_BLK(offset + length);
+
+   if (written_blk < end_blk)
+   f2fs_write_failed(inode->i_mapping, offset + length);
+   }
+
+   f2fs_update

[PATCH v3] f2fs: dax: implement direct access

2017-06-07 Thread sunqiuyang
From: Qiuyang Sun 

This is a new version of PATCH v2 2/2 with the following minor changes:
- In dax_move_data_page(), the call of allocate_data_block() is changed
  according to the new definition of this function in f2fs-dev, and the 
  usage of wio_mutex is removed;
- put_dax() is added in f2fs_iomap_end().

Signed-off-by: Qiuyang Sun 
---
 fs/f2fs/data.c   |  93 ++
 fs/f2fs/f2fs.h   |   8 +++
 fs/f2fs/file.c   | 194 ++-
 fs/f2fs/gc.c |  93 --
 fs/f2fs/inline.c |   4 ++
 fs/f2fs/namei.c  |   6 ++
 fs/f2fs/super.c  |  15 +
 7 files changed, 407 insertions(+), 6 deletions(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7d3af48..2285a10 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2257,3 +2257,96 @@ int f2fs_migrate_page(struct address_space *mapping,
.migratepage= f2fs_migrate_page,
 #endif
 };
+
+#ifdef CONFIG_FS_DAX
+#include 
+#include 
+
+static int f2fs_iomap_begin(struct inode *inode, loff_t offset,
+   loff_t length, unsigned int flags, struct iomap *iomap)
+{
+   struct block_device *bdev;
+   unsigned long first_block = F2FS_BYTES_TO_BLK(offset);
+   unsigned long last_block = F2FS_BYTES_TO_BLK(offset + length - 1);
+   struct f2fs_map_blocks map;
+   int ret;
+   loff_t original_i_size = i_size_read(inode);
+
+   if (WARN_ON_ONCE(f2fs_has_inline_data(inode)))
+   return -ERANGE;
+
+   map.m_lblk = first_block;
+   map.m_len = last_block - first_block + 1;
+   map.m_next_pgofs = NULL;
+
+   if (!(flags & IOMAP_WRITE))
+   ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
+   else {
+   ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
+   /* i_size should be kept here and changed later in f2fs_iomap_end */
+   if (i_size_read(inode) != original_i_size)
+   f2fs_i_size_write(inode, original_i_size);
+   }
+
+   if (ret)
+   return ret;
+
+   iomap->flags = 0;
+   bdev = inode->i_sb->s_bdev;
+   iomap->bdev = bdev;
+   if (blk_queue_dax(bdev->bd_queue))
+   iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+   else
+   iomap->dax_dev = NULL;
+   iomap->offset = F2FS_BLK_TO_BYTES((u64)first_block);
+
+   if (map.m_len == 0) {
+   iomap->type = IOMAP_HOLE;
+   iomap->blkno = IOMAP_NULL_BLOCK;
+   iomap->length = F2FS_BLKSIZE;
+   } else {
+   if (map.m_flags & F2FS_MAP_MAPPED) {
+   iomap->type = IOMAP_MAPPED;
+   } else if (map.m_flags & F2FS_MAP_UNWRITTEN) {
+   iomap->type = IOMAP_UNWRITTEN;
+   } else {
+   WARN_ON_ONCE(1);
+   return -EIO;
+   }
+   iomap->blkno =
+   (sector_t)map.m_pblk << F2FS_LOG_SECTORS_PER_BLOCK;
+   iomap->length = F2FS_BLK_TO_BYTES((u64)map.m_len);
+   }
+
+   if (map.m_flags & F2FS_MAP_NEW)
+   iomap->flags |= IOMAP_F_NEW;
+   return 0;
+}
+
+static int f2fs_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+   ssize_t written, unsigned int flags, struct iomap *iomap)
+{
+   put_dax(iomap->dax_dev);
+   if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
+   return 0;
+
+   if (offset + written > i_size_read(inode))
+   f2fs_i_size_write(inode, offset + written);
+
+   if (iomap->offset + iomap->length >
+   ALIGN(i_size_read(inode), F2FS_BLKSIZE)) {
+   block_t written_blk = F2FS_BYTES_TO_BLK(offset + written);
+   block_t end_blk = F2FS_BYTES_TO_BLK(offset + length);
+
+   if (written_blk < end_blk)
+   f2fs_write_failed(inode->i_mapping, offset + length);
+   }
+
+   return 0;
+}
+
+struct iomap_ops f2fs_iomap_ops = {
+   .iomap_begin= f2fs_iomap_begin,
+   .iomap_end  = f2fs_iomap_end,
+};
+#endif
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index da70964..e3c2ed4 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -88,6 +88,11 @@ struct f2fs_fault_info {
 #define F2FS_MOUNT_FAULT_INJECTION 0x0001
 #define F2FS_MOUNT_ADAPTIVE0x0002
 #define F2FS_MOUNT_LFS 0x0004
+#ifdef CONFIG_FS_DAX
+#define F2FS_MOUNT_DAX 0x0008 /* Direct Access */
+#else
+#define F2FS_MOUNT_DAX 0
+#endif
 
 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option)
 #define set_opt(sbi, option)   ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -2387,6 +2392,9 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset,
 int f2fs_migrate_page(struct address_space *mapping, struct page *newpage,
struct page *page, enum migrate

[PATCH v2 2/2] f2fs: dax: Implement direct access

2017-05-17 Thread sunqiuyang
From: Qiuyang Sun 

This patch implements Direct Access (DAX) in F2FS, including:
- a mount option to choose whether to enable DAX or not
- read/write and mmap of regular files in the DAX way
- zero-out of unaligned partial blocks in the DAX way
- garbage collection of DAX files, by mapping both old and new physical
  addresses of a data page into memory and copying data between them directly
- incompatibility of DAX with inline data, atomic or volatile write

TODO: The current implementation of f2fs_collapse/insert_range() does not
apply to DAX files, as filemap_write_and_wait_range() works in a different
way for such files, and thus the data pages cannot be moved correctly. In
this patch the two functions simply return -EINVAL. A possible solution is
to enable using the page cache temporarily.

Signed-off-by: Qiuyang Sun 
---
Changelog v1 -> v2:
- Remove [PATCH v1 2/3] which exported the interfaces of
  dax_map/unmap_atomic(), as the two functions no longer exist since 4.12.
  This patch is a newer version of v1 3/3.
- For dax_move_data_page() in gc.c, call dax_direct_access() to map the
  device into memory, instead of dax_map_atomic() in v1.
- Move the functions f2fs_iomap_begin/end() from inode.c to data.c,
  and make changes according to the new interfaces for DAX in 4.12, e.g.,
  struct iomap with an additional field (struct dax_device *dax_dev).

The v2 patches are based on 4.12-rc1.
---
 fs/f2fs/data.c   |  90 +
 fs/f2fs/f2fs.h   |   8 +++
 fs/f2fs/file.c   | 197 ++-
 fs/f2fs/gc.c |  95 +--
 fs/f2fs/inline.c |   4 ++
 fs/f2fs/namei.c  |   6 ++
 fs/f2fs/super.c  |  15 +
 7 files changed, 409 insertions(+), 6 deletions(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index c9a3fbd..8362a9f 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "f2fs.h"
 #include "node.h"
@@ -2209,3 +2210,92 @@ int f2fs_migrate_page(struct address_space *mapping,
.migratepage= f2fs_migrate_page,
 #endif
 };
+
+#ifdef CONFIG_FS_DAX
+static int f2fs_iomap_begin(struct inode *inode, loff_t offset,
+   loff_t length, unsigned int flags, struct iomap *iomap)
+{
+   struct block_device *bdev;
+   unsigned long first_block = F2FS_BYTES_TO_BLK(offset);
+   unsigned long last_block = F2FS_BYTES_TO_BLK(offset + length - 1);
+   struct f2fs_map_blocks map;
+   int ret;
+   loff_t original_i_size = i_size_read(inode);
+
+   if (WARN_ON_ONCE(f2fs_has_inline_data(inode)))
+   return -ERANGE;
+
+   map.m_lblk = first_block;
+   map.m_len = last_block - first_block + 1;
+   map.m_next_pgofs = NULL;
+
+   if (!(flags & IOMAP_WRITE))
+   ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
+   else {
+   ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO);
+   /* i_size should be kept here and changed later in f2fs_iomap_end */
+   if (i_size_read(inode) != original_i_size)
+   f2fs_i_size_write(inode, original_i_size);
+   }
+
+   if (ret)
+   return ret;
+
+   iomap->flags = 0;
+   bdev = inode->i_sb->s_bdev;
+   iomap->bdev = bdev;
+   if (blk_queue_dax(bdev->bd_queue))
+   iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+   else
+   iomap->dax_dev = NULL;
+   iomap->offset = F2FS_BLK_TO_BYTES((u64)first_block);
+
+   if (map.m_len == 0) {
+   iomap->type = IOMAP_HOLE;
+   iomap->blkno = IOMAP_NULL_BLOCK;
+   iomap->length = F2FS_BLKSIZE;
+   } else {
+   if (map.m_flags & F2FS_MAP_MAPPED) {
+   iomap->type = IOMAP_MAPPED;
+   } else if (map.m_flags & F2FS_MAP_UNWRITTEN) {
+   iomap->type = IOMAP_UNWRITTEN;
+   } else {
+   WARN_ON_ONCE(1);
+   return -EIO;
+   }
+   iomap->blkno =
+   (sector_t)map.m_pblk << F2FS_LOG_SECTORS_PER_BLOCK;
+   iomap->length = F2FS_BLK_TO_BYTES((u64)map.m_len);
+   }
+
+   if (map.m_flags & F2FS_MAP_NEW)
+   iomap->flags |= IOMAP_F_NEW;
+   return 0;
+}
+
+static int f2fs_iomap_end(struct inode *inode, loff_t offset, loff_t length,
+   ssize_t written, unsigned int flags, struct iomap *iomap)
+{
+   if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
+   return 0;
+
+   if (offset + written > i_size_read(inode))
+   f2fs_i_size_write(inode, offset + written);
+
+   if (iomap->offset + iomap->length >
+   ALIGN(i_size_read(inode), F2FS_BLKSIZE)) {
+   block_t written_blk = F2FS_BYTES_TO_BLK(offset + written);
+   block_t end_blk = F2FS_BYTES_TO_BLK(offset + length);
+
+   if (written_blk < end_blk)
+

[PATCH v2 1/2] f2fs: dax: fix races between page faults and truncating pages

2017-05-17 Thread sunqiuyang
From: Qiuyang Sun 

Currently in F2FS, page faults and operations that truncate the pagecache
or data blocks are completely unsynchronized. This can result in a page
fault faulting a page into a range that we are changing after
truncating, and thus we can end up with a page mapped to disk blocks that
will shortly be freed. Filesystem corruption will shortly follow.

This patch fixes the problem by creating a new rw semaphore i_mmap_sem in
f2fs_inode_info and grabbing it for functions removing blocks from the
extent tree and for reads over page faults. The mechanism is similar to
that in ext4.
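
In sketch form, the pairing the patch establishes (condensed from the
diff below):

	/* page-fault side takes the semaphore shared */
	down_read(&F2FS_I(inode)->i_mmap_sem);
	err = filemap_fault(vmf);
	up_read(&F2FS_I(inode)->i_mmap_sem);

	/* truncating paths, e.g. f2fs_write_failed(), take it exclusive */
	down_write(&F2FS_I(inode)->i_mmap_sem);
	truncate_pagecache(inode, i_size);
	truncate_blocks(inode, i_size, true);
	up_write(&F2FS_I(inode)->i_mmap_sem);

With this, a fault can no longer instantiate a page over blocks that a
concurrent truncate is in the middle of freeing.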

Signed-off-by: Qiuyang Sun 
---
Changelog v1 -> v2:

- Apply the new rw semaphore in some other necessary scenarios:
f2fs_write_failed
f2fs_filemap_fault (new function)
f2fs_vm_page_mkwrite
f2fs_setattr
(f2fs_add_inline_entries() does not need this rw semaphore, as dir is a
directory file and its pages would not be mmap'ed.)

- Lock coverage in the scenarios below is reconsidered:
punch_hole
f2fs_collapse_range
f2fs_zero_range
f2fs_insert_range

The v2 patches are based on 4.12-rc1.
---
 fs/f2fs/data.c  |  2 ++
 fs/f2fs/f2fs.h  |  1 +
 fs/f2fs/file.c  | 48 +++-
 fs/f2fs/super.c |  1 +
 4 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7c0f6bd..c9a3fbd 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1753,8 +1753,10 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to)
loff_t i_size = i_size_read(inode);
 
if (to > i_size) {
+   down_write(&F2FS_I(inode)->i_mmap_sem);
truncate_pagecache(inode, i_size);
truncate_blocks(inode, i_size, true);
+   up_write(&F2FS_I(inode)->i_mmap_sem);
}
 }
 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 2185c7a..8095f4f 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -519,6 +519,7 @@ struct f2fs_inode_info {
struct mutex inmem_lock;/* lock for inmemory pages */
struct extent_tree *extent_tree;/* cached extent_tree entry */
struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */
+   struct rw_semaphore i_mmap_sem;
 };
 
 static inline void get_extent_info(struct extent_info *ext,
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 61af721..0b0115c 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -33,6 +33,18 @@
 #include "trace.h"
 #include 
 
+static int f2fs_filemap_fault(struct vm_fault *vmf)
+{
+   struct inode *inode = file_inode(vmf->vma->vm_file);
+   int err;
+
+   down_read(&F2FS_I(inode)->i_mmap_sem);
+   err = filemap_fault(vmf);
+   up_read(&F2FS_I(inode)->i_mmap_sem);
+
+   return err;
+}
+
 static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
 {
struct page *page = vmf->page;
@@ -59,13 +71,14 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
f2fs_balance_fs(sbi, dn.node_changed);
 
file_update_time(vmf->vma->vm_file);
+   down_read(&F2FS_I(inode)->i_mmap_sem);
lock_page(page);
if (unlikely(page->mapping != inode->i_mapping ||
page_offset(page) > i_size_read(inode) ||
!PageUptodate(page))) {
unlock_page(page);
err = -EFAULT;
-   goto out;
+   goto out_sem;
}
 
/*
@@ -94,6 +107,8 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr);
 
+out_sem:
+   up_read(&F2FS_I(inode)->i_mmap_sem);
 out:
sb_end_pagefault(inode->i_sb);
f2fs_update_time(sbi, REQ_TIME);
@@ -101,7 +116,7 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
 }
 
 static const struct vm_operations_struct f2fs_file_vm_ops = {
-   .fault  = filemap_fault,
+   .fault  = f2fs_filemap_fault,
.map_pages  = filemap_map_pages,
.page_mkwrite   = f2fs_vm_page_mkwrite,
 };
@@ -687,8 +702,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
return -EACCES;
 
if (attr->ia_size <= i_size_read(inode)) {
+   down_write(&F2FS_I(inode)->i_mmap_sem);
truncate_setsize(inode, attr->ia_size);
err = f2fs_truncate(inode);
+   up_write(&F2FS_I(inode)->i_mmap_sem);
if (err)
return err;
} else {
@@ -696,7 +713,9 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 * do not trim all blocks after i_size if target size is
 * larger than i_size.
 */
+   down_write(&F2FS_I(inode)->i_mmap_sem);

[PATCH 3/3] f2fs:dax: Implement direct access

2017-05-03 Thread sunqiuyang
This patch implements Direct Access (DAX) in F2FS, including:
 - a mount option to enable DAX
 - read/write and mmap of regular files in the DAX way
 - zero-out of non-aligned partial blocks in the DAX way
 - garbage collection of DAX files
 - incompatibility of DAX with inline data, atomic or volatile write

TODO: We may need a new implementation of f2fs_collapse/insert_range() for
DAX files, as filemap_write_and_wait_range() does not work for DAX files,
and thus the data pages cannot be moved correctly.
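
The super.c hunk is not visible in the excerpt below; a sketch of its
assumed shape (the Opt_dax token name and message text are illustrative):

	/* in parse_options() */
	case Opt_dax:
#ifdef CONFIG_FS_DAX
		set_opt(sbi, DAX);
#else
		f2fs_msg(sb, KERN_INFO, "dax option not supported");
#endif
		break;

	/* and in f2fs_set_inode_flags(), regular files gain S_DAX */
	if (test_opt(F2FS_I_SB(inode), DAX) && S_ISREG(inode->i_mode))
		new_fl |= S_DAX;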

Signed-off-by: Qiuyang Sun 
---
 fs/f2fs/f2fs.h   |   8 +++
 fs/f2fs/file.c   | 197 ++-
 fs/f2fs/gc.c |  69 +--
 fs/f2fs/inline.c |   4 ++
 fs/f2fs/inode.c  |  88 +
 fs/f2fs/namei.c  |   7 ++
 fs/f2fs/super.c  |  16 +
 7 files changed, 383 insertions(+), 6 deletions(-)

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index f7957ca..d0e8af5 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -87,6 +87,11 @@ struct f2fs_fault_info {
 #define F2FS_MOUNT_FAULT_INJECTION 0x0001
 #define F2FS_MOUNT_ADAPTIVE0x0002
 #define F2FS_MOUNT_LFS 0x0004
+#ifdef CONFIG_FS_DAX
+#define F2FS_MOUNT_DAX 0x0008 /* Direct Access */
+#else
+#define F2FS_MOUNT_DAX 0
+#endif
 
 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
 #define set_opt(sbi, option)   (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -2063,6 +2068,9 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
 int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc);
 void f2fs_evict_inode(struct inode *inode);
 void handle_failed_inode(struct inode *inode);
+#ifdef CONFIG_FS_DAX
+extern struct iomap_ops f2fs_iomap_ops;
+#endif
 
 /*
  * namei.c
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 165acbf..4eeb17b 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -23,6 +23,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include "f2fs.h"
 #include "node.h"
@@ -106,6 +108,64 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
.page_mkwrite   = f2fs_vm_page_mkwrite,
 };
 
+#ifdef CONFIG_FS_DAX
+static int f2fs_dax_huge_fault(struct vm_fault *vmf,
+   enum page_entry_size pe_size)
+{
+   int result;
+   struct inode *inode = file_inode(vmf->vma->vm_file);
+   struct super_block *sb = inode->i_sb;
+   bool write = vmf->flags & FAULT_FLAG_WRITE;
+
+   if (write) {
+   sb_start_pagefault(sb);
+   file_update_time(vmf->vma->vm_file);
+   }
+   down_read(&F2FS_I(inode)->i_mmap_sem);
+   result = dax_iomap_fault(vmf, pe_size, &f2fs_iomap_ops);
+   up_read(&F2FS_I(inode)->i_mmap_sem);
+   if (write)
+   sb_end_pagefault(sb);
+
+   return result;
+}
+
+static int f2fs_dax_fault(struct vm_fault *vmf)
+{
+   return f2fs_dax_huge_fault(vmf, PE_SIZE_PTE);
+}
+
+static int f2fs_dax_pfn_mkwrite(struct vm_fault *vmf)
+{
+   struct inode *inode = file_inode(vmf->vma->vm_file);
+   struct super_block *sb = inode->i_sb;
+   loff_t size;
+   int ret;
+
+   sb_start_pagefault(sb);
+   file_update_time(vmf->vma->vm_file);
+   down_read(&F2FS_I(inode)->i_mmap_sem);
+   size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+   if (vmf->pgoff >= size)
+   ret = VM_FAULT_SIGBUS;
+   else
+   ret = dax_pfn_mkwrite(vmf);
+   up_read(&F2FS_I(inode)->i_mmap_sem);
+   sb_end_pagefault(sb);
+
+   return ret;
+}
+
+static const struct vm_operations_struct f2fs_dax_vm_ops = {
+   .fault  = f2fs_dax_fault,
+   .huge_fault = f2fs_dax_huge_fault,
+   .page_mkwrite   = f2fs_dax_fault,
+   .pfn_mkwrite= f2fs_dax_pfn_mkwrite,
+};
+#else
+#define f2fs_dax_vm_ops f2fs_file_vm_ops
+#endif
+
 static int get_parent_ino(struct inode *inode, nid_t *pino)
 {
struct dentry *dentry;
@@ -434,7 +494,13 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
return err;
 
file_accessed(file);
-   vma->vm_ops = &f2fs_file_vm_ops;
+   if (IS_DAX(file_inode(file))) {
+   vma->vm_ops = &f2fs_dax_vm_ops;
+   vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+   } else {
+   vma->vm_ops = &f2fs_file_vm_ops;
+   }
+
return 0;
 }
 
@@ -518,6 +584,18 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
if (!offset && !cache_only)
return 0;
 
+#ifdef CONFIG_FS_DAX
+   if (IS_DAX(inode)) {
+   int ret;
+
+   down_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
+   ret = iomap_zero_range(inode, from, PAGE_SIZE - offset,
+   NULL, &f2fs_iomap_ops);
+   up_read(&F2FS_I(inode)->dio_rwsem[WRITE]);
+   return ret;
+   }
+#endif
+
if (cache_only) {
   

[PATCH 2/3] f2fs:dax: Export interfaces: dax_map/unmap_atomic()

2017-05-03 Thread sunqiuyang
Export the interfaces dax_map/unmap_atomic() for use in moving data
pages of DAX files during garbage collection in F2FS.
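
For reference, a hedged sketch of how the exported pair is consumed by
the GC copy in patch 3/3 (old_sector and new_sector are illustrative):

	struct blk_dax_ctl src = { .sector = old_sector, .size = PAGE_SIZE };
	struct blk_dax_ctl dst = { .sector = new_sector, .size = PAGE_SIZE };

	/* map both physical locations; .addr is filled in on success */
	if (dax_map_atomic(bdev, &src) < 0)
		return -EIO;
	if (dax_map_atomic(bdev, &dst) < 0) {
		dax_unmap_atomic(bdev, &src);
		return -EIO;
	}
	memcpy(dst.addr, src.addr, PAGE_SIZE);
	dax_unmap_atomic(bdev, &dst);
	dax_unmap_atomic(bdev, &src);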

Signed-off-by: Qiuyang Sun 
---
 fs/dax.c| 6 --
 include/linux/dax.h | 3 +++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 85abd74..615a4c1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -55,7 +55,7 @@ static int __init init_dax_wait_table(void)
 }
 fs_initcall(init_dax_wait_table);
 
-static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
+long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
 {
struct request_queue *q = bdev->bd_queue;
long rc = -EIO;
@@ -72,14 +72,16 @@ static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
}
return rc;
 }
+EXPORT_SYMBOL_GPL(dax_map_atomic);
 
-static void dax_unmap_atomic(struct block_device *bdev,
+void dax_unmap_atomic(struct block_device *bdev,
const struct blk_dax_ctl *dax)
 {
if (IS_ERR(dax->addr))
return;
blk_queue_exit(bdev->bd_queue);
 }
+EXPORT_SYMBOL_GPL(dax_unmap_atomic);
 
 static int dax_is_pmd_entry(void *entry)
 {
diff --git a/include/linux/dax.h b/include/linux/dax.h
index d8a3dc0..b6451b2 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -7,6 +7,7 @@
 #include 
 
 struct iomap_ops;
+struct blk_dax_ctl;
 
 /*
  * We use lowest available bit in exceptional entry for locking, one bit for
@@ -36,6 +37,8 @@ static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
RADIX_DAX_ENTRY_LOCK);
 }
 
+long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax);
+void dax_unmap_atomic(struct block_device *bdev, const struct blk_dax_ctl *dax);
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops);
 int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
-- 
1.8.3.1



[PATCH 1/3] f2fs:dax: fix races between page faults and fallocate

2017-05-03 Thread sunqiuyang
Currently in F2FS, page faults and fallocate operations, like punch_hole
and collapse/insert/zero_range, are completely unsynchronized. This can
result in a page fault faulting a page into a range that we are changing
after truncating the pagecache, and thus we can end up with a page mapped
to disk blocks that will shortly be freed. Filesystem corruption will
shortly follow.

This patch fixes the problem by creating a new rw semaphore i_mmap_sem in
f2fs_inode_info and grabbing it for functions removing blocks from the
extent tree and for reads over page faults. The mechanism is similar to
that in ext4.

Signed-off-by: Qiuyang Sun 
---
 fs/f2fs/f2fs.h  |  1 +
 fs/f2fs/file.c  | 30 +-
 fs/f2fs/super.c |  1 +
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 0a6e115..f7957ca 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -474,6 +474,7 @@ struct f2fs_inode_info {
struct mutex inmem_lock;/* lock for inmemory pages */
struct extent_tree *extent_tree;/* cached extent_tree entry */
struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */
+   struct rw_semaphore i_mmap_sem;
 };
 
 static inline void get_extent_info(struct extent_info *ext,
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 5f73178..165acbf 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -813,22 +813,23 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
off_start = offset & (PAGE_SIZE - 1);
off_end = (offset + len) & (PAGE_SIZE - 1);
 
+   down_write(&F2FS_I(inode)->i_mmap_sem);
if (pg_start == pg_end) {
ret = fill_zero(inode, pg_start, off_start,
off_end - off_start);
if (ret)
-   return ret;
+   goto out;
} else {
if (off_start) {
ret = fill_zero(inode, pg_start++, off_start,
PAGE_SIZE - off_start);
if (ret)
-   return ret;
+   goto out;
}
if (off_end) {
ret = fill_zero(inode, pg_end, 0, off_end);
if (ret)
-   return ret;
+   goto out;
}
 
if (pg_start < pg_end) {
@@ -849,6 +850,8 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
}
 
+out:
+   up_write(&F2FS_I(inode)->i_mmap_sem);
return ret;
 }
 
@@ -1084,16 +1087,17 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
pg_start = offset >> PAGE_SHIFT;
pg_end = (offset + len) >> PAGE_SHIFT;
 
+   down_write(&F2FS_I(inode)->i_mmap_sem);
/* write out all dirty pages from offset */
ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
if (ret)
-   return ret;
+   goto out;
 
truncate_pagecache(inode, offset);
 
ret = f2fs_do_collapse(inode, pg_start, pg_end);
if (ret)
-   return ret;
+   goto out;
 
/* write out all moved pages, if possible */
filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
@@ -1106,6 +1110,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
if (!ret)
f2fs_i_size_write(inode, new_size);
 
+out:
+   up_write(&F2FS_I(inode)->i_mmap_sem);
return ret;
 }
 
@@ -1182,11 +1188,12 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
off_start = offset & (PAGE_SIZE - 1);
off_end = (offset + len) & (PAGE_SIZE - 1);
 
+   down_write(&F2FS_I(inode)->i_mmap_sem);
if (pg_start == pg_end) {
ret = fill_zero(inode, pg_start, off_start,
off_end - off_start);
if (ret)
-   return ret;
+   goto unlock;
 
if (offset + len > new_size)
new_size = offset + len;
@@ -1196,7 +1203,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
ret = fill_zero(inode, pg_start++, off_start,
PAGE_SIZE - off_start);
if (ret)
-   return ret;
+   goto unlock;
 
new_size = max_t(loff_t, new_size,
(loff_t)pg_start << PAGE_SHIFT);
@@ -1245,6 +1252,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
 out:
if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size)