From: Goldwyn Rodrigues <rgold...@suse.com>

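This is a long patch. It adds an iomap-based buffered write path in
fs/btrfs/iomap.c and switches the buffered write callers in fs/btrfs/file.c
over to it. Most of the code is "inspired" by fs/btrfs/file.c. To keep the
size small, all removals are done in the following patches.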

Signed-off-by: Goldwyn Rodrigues <rgold...@suse.com>
---
 fs/btrfs/Kconfig  |   1 +
 fs/btrfs/Makefile |   2 +-
 fs/btrfs/ctree.h  |   1 +
 fs/btrfs/file.c   |   4 +-
 fs/btrfs/iomap.c  | 383 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 388 insertions(+), 3 deletions(-)
 create mode 100644 fs/btrfs/iomap.c

diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 38651fae7f21..ba87d21885ca 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -14,6 +14,7 @@ config BTRFS_FS
        select RAID6_PQ
        select XOR_BLOCKS
        select SRCU
+       select FS_IOMAP
 
        help
          Btrfs is a general purpose copy-on-write filesystem with extents,
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 76a843198bcb..f88e696b0698 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -11,7 +11,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o 
root-tree.o dir-item.o \
           compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
           reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
           uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
-          block-rsv.o delalloc-space.o
+          block-rsv.o delalloc-space.o iomap.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 94660063a162..9c501c7826b4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3243,6 +3243,7 @@ int btrfs_fdatawrite_range(struct inode *inode, loff_t 
start, loff_t end);
 loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
                              struct file *file_out, loff_t pos_out,
                              loff_t len, unsigned int remap_flags);
+size_t btrfs_buffered_iomap_write(struct kiocb *iocb, struct iov_iter *from);
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4466a09f2d98..0707db04d3cc 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1829,7 +1829,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, 
struct iov_iter *from)
                return written;
 
        pos = iocb->ki_pos;
-       written_buffered = btrfs_buffered_write(iocb, from);
+       written_buffered = btrfs_buffered_iomap_write(iocb, from);
        if (written_buffered < 0) {
                err = written_buffered;
                goto out;
@@ -1966,7 +1966,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
        if (iocb->ki_flags & IOCB_DIRECT) {
                num_written = __btrfs_direct_write(iocb, from);
        } else {
-               num_written = btrfs_buffered_write(iocb, from);
+               num_written = btrfs_buffered_iomap_write(iocb, from);
                if (num_written > 0)
                        iocb->ki_pos = pos + num_written;
                if (clean_page)
diff --git a/fs/btrfs/iomap.c b/fs/btrfs/iomap.c
new file mode 100644
index 000000000000..025ccbf471bf
--- /dev/null
+++ b/fs/btrfs/iomap.c
@@ -0,0 +1,383 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * iomap support for BTRFS
+ *
+ * Copyright (c) 2019  SUSE Linux
+ * Author: Goldwyn Rodrigues <rgold...@suse.com>
+ */
+
+#include <linux/iomap.h>
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "disk-io.h"
+#include "delalloc-space.h"
+
+struct btrfs_iomap {      /* state handed from iomap_begin to iomap_end */
+       u64 start;      /* first byte of the locked range (valid if extents_locked) */
+       u64 end;        /* last byte of the locked range (valid if extents_locked) */
+       bool nocow;     /* data reservation failed, check_can_nocow() succeeded */
+       int extents_locked;     /* result of lock_and_cleanup_extent(): 1 locked, 0 not */
+       ssize_t reserved_bytes; /* sector-rounded byte count reserved for this write */
+       struct extent_changeset *data_reserved; /* passed to the data space reserve/release calls */
+       struct extent_state *cached_state;      /* passed to the extent lock/unlock calls */
+};
+
+
+/*
+ * Lock the sector-aligned range covering [pos, pos + write_bytes) in the
+ * inode's io_tree and make sure no ordered extent overlaps it, so that the
+ * pages can be modified safely.  The lock is only taken when the range
+ * starts below i_size.
+ *
+ * If an overlapping ordered extent is found the range is unlocked again,
+ * btrfs_start_ordered_extent() is called on it and -EAGAIN is returned so
+ * that the caller can retry.
+ *
+ * Return value:
+ * 1       - the range is locked, *lockstart / *lockend hold its bounds
+ * 0       - the range starts at or beyond i_size, nothing was locked
+ * -EAGAIN - an ordered extent was in the way, the caller has to retry
+ */
+static noinline int
+lock_and_cleanup_extent(struct btrfs_inode *inode, loff_t pos,
+                        size_t write_bytes,
+                        u64 *lockstart, u64 *lockend,
+                        struct extent_state **cached_state)
+{
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       struct btrfs_ordered_extent *ordered;
+       u64 range_start;
+       u64 range_end;
+
+       range_start = round_down(pos, fs_info->sectorsize);
+       range_end = range_start
+               + round_up(pos + write_bytes - range_start,
+                          fs_info->sectorsize) - 1;
+
+       /* Writes starting at or past EOF do not need the extent lock. */
+       if (range_start >= inode->vfs_inode.i_size)
+               return 0;
+
+       lock_extent_bits(&inode->io_tree, range_start, range_end,
+                       cached_state);
+       ordered = btrfs_lookup_ordered_range(inode, range_start,
+                                            range_end - range_start + 1);
+       if (ordered) {
+               if (ordered->file_offset + ordered->len > range_start &&
+                   ordered->file_offset <= range_end) {
+                       /* Drop the lock before dealing with the ordered extent. */
+                       unlock_extent_cached(&inode->io_tree, range_start,
+                                       range_end, cached_state);
+                       btrfs_start_ordered_extent(&inode->vfs_inode,
+                                       ordered, 1);
+                       btrfs_put_ordered_extent(ordered);
+                       return -EAGAIN;
+               }
+               btrfs_put_ordered_extent(ordered);
+       }
+
+       *lockstart = range_start;
+       *lockend = range_end;
+       return 1;
+}
+
+/*
+ * Check whether [pos, pos + *write_bytes) can be written in place.
+ *
+ * The sector-aligned range is locked, retrying until no ordered extent is
+ * found in it, and can_nocow_extent() is then asked about the range.
+ *
+ * Return value:
+ * > 0     - nocow is possible; *write_bytes is trimmed to what
+ *           can_nocow_extent() reported and the no-snapshotting write
+ *           started here is left for the caller to end with
+ *           btrfs_end_write_no_snapshotting()
+ * 0       - nocow is not possible
+ * -ENOSPC - btrfs_start_write_no_snapshotting() returned 0
+ */
+static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
+                                   size_t *write_bytes)
+{
+       struct btrfs_root *root = inode->root;
+       struct btrfs_fs_info *fs_info = inode->root->fs_info;
+       u64 range_start, range_end;
+       u64 nocow_bytes;
+       int ret;
+
+       if (!btrfs_start_write_no_snapshotting(root))
+               return -ENOSPC;
+
+       range_start = round_down(pos, fs_info->sectorsize);
+       range_end = round_up(pos + *write_bytes,
+                            fs_info->sectorsize) - 1;
+
+       /* Leave the loop with the range locked and free of ordered extents. */
+       for (;;) {
+               struct btrfs_ordered_extent *ordered;
+
+               lock_extent(&inode->io_tree, range_start, range_end);
+               ordered = btrfs_lookup_ordered_range(inode, range_start,
+                                               range_end - range_start + 1);
+               if (!ordered)
+                       break;
+
+               unlock_extent(&inode->io_tree, range_start, range_end);
+               btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
+               btrfs_put_ordered_extent(ordered);
+       }
+
+       nocow_bytes = range_end - range_start + 1;
+       ret = can_nocow_extent(&inode->vfs_inode, range_start, &nocow_bytes,
+                       NULL, NULL, NULL);
+       if (ret > 0) {
+               *write_bytes = min_t(size_t, *write_bytes,
+                                    nocow_bytes - pos + range_start);
+       } else {
+               /* Errors are folded into "cannot nocow". */
+               ret = 0;
+               btrfs_end_write_no_snapshotting(root);
+       }
+
+       unlock_extent(&inode->io_tree, range_start, range_end);
+
+       return ret;
+}
+
+/*
+ * Walk the extent maps covering [start, start + len) and set
+ * EXTENT_DELALLOC_NEW in the inode's io_tree on every part of the range that
+ * is backed by a hole (EXTENT_MAP_HOLE).
+ *
+ * Returns 0 on success, the error from btrfs_get_extent() or the non-zero
+ * result of set_extent_bit().
+ */
+static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
+                                        const u64 start,
+                                        const u64 len,
+                                        struct extent_state **cached_state)
+{
+       const u64 end = start + len - 1;
+       u64 cur = start;
+
+       while (cur < end) {
+               const u64 remaining = end - cur + 1;
+               struct extent_map *em;
+               int ret = 0;
+
+               em = btrfs_get_extent(inode, NULL, 0, cur, remaining, 0);
+               if (IS_ERR(em))
+                       return PTR_ERR(em);
+
+               if (em->block_start == EXTENT_MAP_HOLE) {
+                       u64 hole_len = em->len;
+
+                       /* Clamp the hole to the part inside [cur, end]. */
+                       if (em->start < cur)
+                               hole_len -= cur - em->start;
+                       if (hole_len > remaining)
+                               hole_len = remaining;
+
+                       ret = set_extent_bit(&inode->io_tree, cur,
+                                            cur + hole_len - 1,
+                                            EXTENT_DELALLOC_NEW,
+                                            NULL, cached_state, GFP_NOFS);
+               }
+
+               /* Advance past this extent map before dropping it. */
+               cur = extent_map_end(em);
+               free_extent_map(em);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
+/*
+ * ->page_done hook for buffered writes: mark the page up to date, clear its
+ * Checked flag, dirty it and take an extra reference on it.  A NULL page is
+ * ignored.  The inode, pos, copied and iomap arguments are not used.
+ */
+static void btrfs_buffered_page_done(struct inode *inode, loff_t pos,
+               unsigned copied, struct page *page,
+               struct iomap *iomap)
+{
+       if (page) {
+               SetPageUptodate(page);
+               ClearPageChecked(page);
+               set_page_dirty(page);
+               /*
+                * NOTE(review): presumably balances a put_page() done by the
+                * iomap core around this hook -- confirm against the iomap
+                * write_end path, otherwise this leaks a page reference.
+                */
+               get_page(page);
+       }
+}
+
+
+static const struct iomap_page_ops btrfs_buffered_page_ops = { /* set as iomap->page_ops in iomap_begin */
+       .page_done = btrfs_buffered_page_done,  /* the only hook provided */
+};
+
+
+/*
+ * ->iomap_begin for buffered writes.
+ *
+ * Reserves data space for the write; if that fails and the inode is
+ * NODATACOW or PREALLOC and check_can_nocow() allows it, the write proceeds
+ * as nocow instead.  Metadata is reserved next, the extent range is locked
+ * via lock_and_cleanup_extent() and the range is described to the iomap core
+ * as an IOMAP_DELALLOC mapping.
+ *
+ * The bookkeeping (struct btrfs_iomap) is allocated here, stored in
+ * iomap->private on success and consumed by btrfs_buffered_iomap_end().
+ *
+ * Returns 0 on success or a negative errno.  On failure the reservations
+ * taken here are dropped, the bookkeeping is freed and iomap->private is
+ * not set.
+ */
+static int btrfs_buffered_iomap_begin(struct inode *inode, loff_t pos,
+               loff_t length, unsigned flags, struct iomap *iomap,
+               struct iomap *srcmap)
+{
+       int ret;
+       size_t write_bytes = length;
+       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+       size_t sector_offset = pos & (fs_info->sectorsize - 1);
+       struct btrfs_iomap *bi;
+
+       bi = kzalloc(sizeof(*bi), GFP_NOFS);
+       if (!bi)
+               return -ENOMEM;
+
+       bi->reserved_bytes = round_up(write_bytes + sector_offset,
+                       fs_info->sectorsize);
+
+       /* Reserve data space */
+       ret = btrfs_check_data_free_space(inode, &bi->data_reserved, pos,
+                       write_bytes);
+       if (ret < 0) {
+               /*
+                * Space allocation failed. Let's check if we can
+                * continue I/O without allocations
+                */
+               if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+                                               BTRFS_INODE_PREALLOC)) &&
+                               check_can_nocow(BTRFS_I(inode), pos,
+                                       &write_bytes) > 0) {
+                       bi->nocow = true;
+                       /*
+                        * our prealloc extent may be smaller than
+                        * write_bytes, so scale down.
+                        */
+                       bi->reserved_bytes = round_up(write_bytes +
+                                       sector_offset,
+                                       fs_info->sectorsize);
+               } else {
+                       goto error;
+               }
+       }
+
+       WARN_ON(bi->reserved_bytes == 0);
+
+       /* We have the data space allocated, reserve the metadata now */
+       ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
+                       bi->reserved_bytes);
+       if (ret) {
+               struct btrfs_root *root = BTRFS_I(inode)->root;
+               if (!bi->nocow)
+                       btrfs_free_reserved_data_space(inode,
+                                       bi->data_reserved, pos,
+                                       write_bytes);
+               else
+                       btrfs_end_write_no_snapshotting(root);
+               goto error;
+       }
+
+       /* Retry for as long as an ordered extent is in the way. */
+       do {
+               ret = lock_and_cleanup_extent(
+                               BTRFS_I(inode), pos, write_bytes, &bi->start,
+                               &bi->end, &bi->cached_state);
+       } while (ret == -EAGAIN);
+
+       if (ret < 0) {
+               btrfs_delalloc_release_extents(BTRFS_I(inode),
+                                              bi->reserved_bytes, true);
+               goto release;
+       } else {
+               bi->extents_locked = ret;
+       }
+       iomap->private = bi;
+       /*
+        * NOTE(review): the mapping length is rounded up from write_bytes
+        * only, while reserved_bytes also accounts for sector_offset, so for
+        * an unaligned pos the mapping can end before pos + write_bytes.
+        */
+       iomap->length = round_up(write_bytes, fs_info->sectorsize);
+       iomap->offset = round_down(pos, fs_info->sectorsize);
+       iomap->addr = IOMAP_NULL_ADDR;
+       iomap->type = IOMAP_DELALLOC;
+       iomap->bdev = fs_info->fs_devices->latest_bdev;
+       iomap->page_ops = &btrfs_buffered_page_ops;
+       return 0;
+release:
+       if (bi->extents_locked)
+               unlock_extent_cached(&BTRFS_I(inode)->io_tree, bi->start,
+                               bi->end, &bi->cached_state);
+       if (bi->nocow) {
+               struct btrfs_root *root = BTRFS_I(inode)->root;
+               btrfs_end_write_no_snapshotting(root);
+               btrfs_delalloc_release_metadata(BTRFS_I(inode),
+                               bi->reserved_bytes, true);
+       } else {
+               btrfs_delalloc_release_space(inode, bi->data_reserved,
+                               round_down(pos, fs_info->sectorsize),
+                               bi->reserved_bytes, true);
+       }
+
+error:
+       /*
+        * The changeset can have been allocated even when the reservation
+        * itself failed, so it has to be freed on every failure path, not
+        * only on the ones that go through "release".  A NULL changeset is
+        * accepted by extent_changeset_free().
+        */
+       extent_changeset_free(bi->data_reserved);
+       kfree(bi);
+       return ret;
+}
+
+/*
+ * ->iomap_end for buffered writes.
+ *
+ * Consumes the struct btrfs_iomap that btrfs_buffered_iomap_begin() stored
+ * in iomap->private:
+ *  - returns the part of the reservation that was not dirtied,
+ *  - marks the dirtied, sector-aligned range as delalloc,
+ *  - unlocks the extent range if it was locked,
+ *  - ends the no-snapshotting write for nocow writes,
+ *  - frees the bookkeeping.
+ *
+ * @written is the number of bytes actually copied starting at @pos; when it
+ * is 0 nothing was dirtied, so the whole reservation is returned and no
+ * extent bits are touched.
+ *
+ * Returns 0 on success or the error from btrfs_find_new_delalloc_bytes() /
+ * btrfs_set_extent_delalloc().
+ */
+static int btrfs_buffered_iomap_end(struct inode *inode, loff_t pos,
+               loff_t length, ssize_t written, unsigned flags,
+               struct iomap *iomap)
+{
+       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+       struct btrfs_iomap *bi = iomap->private;
+       ssize_t release_bytes;
+       unsigned int extra_bits = 0;
+       u64 start_pos = pos & ~((u64) fs_info->sectorsize - 1);
+       u64 num_bytes = 0;
+       u64 end_of_last_block = 0;
+       int ret = 0;
+
+       /* Sector-aligned range that was dirtied; empty if nothing was copied. */
+       if (written > 0) {
+               num_bytes = round_up(written + pos - start_pos,
+                               fs_info->sectorsize);
+               end_of_last_block = start_pos + num_bytes - 1;
+       }
+
+       /*
+        * Give back everything that was reserved beyond the dirtied sectors.
+        * This has to be based on the sector-rounded dirtied range rather
+        * than on the raw byte count, otherwise a short copy at an unaligned
+        * pos can release the reservation of a sector that was dirtied.
+        */
+       release_bytes = bi->reserved_bytes - num_bytes;
+       if (release_bytes > 0) {
+               if (bi->nocow) {
+                       btrfs_delalloc_release_metadata(BTRFS_I(inode),
+                                       release_bytes, true);
+               } else {
+                       btrfs_delalloc_release_space(inode, bi->data_reserved,
+                                       start_pos + num_bytes, release_bytes,
+                                       true);
+               }
+       }
+
+       /* Nothing was dirtied: there is no range to mark as delalloc. */
+       if (written <= 0)
+               goto unlock;
+
+       /*
+        * The pages may have already been dirty, clear out old accounting so
+        * we can set things up properly
+        */
+       clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block,
+                       EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+                       EXTENT_DEFRAG, 0, 0, &bi->cached_state);
+
+       if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
+               if (start_pos >= i_size_read(inode) &&
+                   !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) {
+                       /*
+                        * There can't be any extents following eof in this case
+                        * so just set the delalloc new bit for the range
+                        * directly.
+                        */
+                       extra_bits |= EXTENT_DELALLOC_NEW;
+               } else {
+                       ret = btrfs_find_new_delalloc_bytes(BTRFS_I(inode),
+                                       start_pos, num_bytes,
+                                       &bi->cached_state);
+                       if (ret)
+                               goto unlock;
+               }
+       }
+
+       /*
+        * NOTE(review): if marking the range fails, the reservation for the
+        * dirtied sectors is not returned here -- confirm whether the error
+        * path should release it as well.
+        */
+       ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
+                       extra_bits, &bi->cached_state, 0);
+unlock:
+       if (bi->extents_locked)
+               unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+                               bi->start, bi->end, &bi->cached_state);
+
+       if (bi->nocow) {
+               struct btrfs_root *root = BTRFS_I(inode)->root;
+               btrfs_end_write_no_snapshotting(root);
+               if (written > 0) {
+                       /* Same sector-aligned range that was dirtied above. */
+                       set_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
+                                       end_of_last_block,
+                                       EXTENT_NORESERVE, NULL, NULL, GFP_NOFS);
+               }
+
+       }
+       btrfs_delalloc_release_extents(BTRFS_I(inode), bi->reserved_bytes,
+                       true);
+
+       if (written < fs_info->nodesize)
+               btrfs_btree_balance_dirty(fs_info);
+
+       extent_changeset_free(bi->data_reserved);
+       kfree(bi);
+       return ret;
+}
+
+static const struct iomap_ops btrfs_buffered_iomap_ops = {     /* used by btrfs_buffered_iomap_write() */
+       .iomap_begin            = btrfs_buffered_iomap_begin,   /* reserve space, lock the range */
+       .iomap_end              = btrfs_buffered_iomap_end,     /* mark delalloc, release, unlock */
+};
+
+/*
+ * Buffered write entry point: run the write through
+ * iomap_file_buffered_write() with btrfs_buffered_iomap_ops.
+ *
+ * On success iocb->ki_pos is advanced by the number of bytes written and
+ * i_size is extended if the write went past it.  When nothing was written
+ * neither ki_pos nor i_size is touched, so a failed write that was
+ * positioned beyond EOF does not grow the file.
+ *
+ * Returns the number of bytes written or a negative errno.
+ *
+ * NOTE(review): the return type is size_t although negative errnos are
+ * returned; the callers store the result in a signed variable and test it
+ * for < 0.  It should become ssize_t together with the prototype in ctree.h.
+ */
+size_t btrfs_buffered_iomap_write(struct kiocb *iocb, struct iov_iter *from)
+{
+       struct inode *inode = file_inode(iocb->ki_filp);
+       ssize_t written;
+
+       written = iomap_file_buffered_write(iocb, from,
+                                           &btrfs_buffered_iomap_ops);
+       if (written > 0) {
+               iocb->ki_pos += written;
+               if (iocb->ki_pos > i_size_read(inode))
+                       i_size_write(inode, iocb->ki_pos);
+       }
+       return written;
+}
+
-- 
2.16.4

Reply via email to