From: Dmitry Monakhov <[email protected]>

Add the EXT4_IOC_MFSYNC ioctl, which allows syncing a given set of files
in an optimized way (only one barrier is required in the best case).

https://jira.sw.ru/browse/PSBM-18567

Signed-off-by: Dmitry Monakhov <[email protected]>

+++
Comment on rebasing to rh7 kernel-3.10.0-229.7.2.el7:

1) compile fix for ext4-add-mfsync-support

   ext4_flush_unwritten_io was removed in rh7-3.10.0-229.7.2

   https://jira.sw.ru/browse/PSBM-34909

2) compile fix for ext4-add-mfsync-support part2

   __sync_inode was removed in rh7-3.10.0-229.7.2
   It is more honest to simply disable mfsync in nojournal mode, since we
   do not test nojournal mode at all.

   https://jira.sw.ru/browse/PSBM-34910

Signed-off-by: Dmitry Monakhov <[email protected]>

Rebase to vz8 kernel note:
  mutex_unlock(&inode->i_mutex) -> inode_lock_shared(inode)

Signed-off-by: Konstantin Khorenko <[email protected]>
---
 fs/ext4/ext4.h              |   8 ++-
 fs/ext4/fsync.c             | 108 ++++++++++++++++++++++++++++++++++++
 fs/ext4/ioctl.c             |  56 +++++++++++++++++++
 include/trace/events/ext4.h |  54 ++++++++++++++++++
 4 files changed, 225 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 56e6bfecd6b4..c42cfaa8ed0f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -536,6 +536,17 @@ struct compat_ext4_new_group_input {
 };
 #endif
 
+/*
+ * Argument of EXT4_IOC_MFSYNC: a counted list of file descriptors to sync.
+ * The ioctl reads @size entries from @fd; bit 31 of an entry asks for
+ * datasync handling of that file and is masked off before the descriptor
+ * is looked up.
+ */
+struct ext4_ioc_mfsync_info {
+       __u32 size;     /* number of entries in fd[] */
+       __u32 fd[];     /* descriptors; bit 31 set = datasync */
+};
+
 /* The struct ext4_new_group_input in kernel space, with free_blocks_count */
 struct ext4_new_group_data {
         __u32 group;
@@ -645,6 +650,7 @@ enum {
 
 #define EXT4_IOC_FSGETXATTR            FS_IOC_FSGETXATTR
 #define EXT4_IOC_FSSETXATTR            FS_IOC_FSSETXATTR
+#define EXT4_IOC_MFSYNC                        _IO('f', 43)
 
 #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
 
@@ -655,7 +661,6 @@ enum {
 #define EXT4_GOING_FLAGS_LOGFLUSH              0x1     /* flush log but not data */
 #define EXT4_GOING_FLAGS_NOLOGFLUSH            0x2     /* don't flush log nor data */
 
-
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
  * ioctl commands in 32 bit emulation
@@ -2363,6 +2368,7 @@ extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh,
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
+extern int ext4_sync_files(struct file **, unsigned int *, unsigned int);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 26a7fe5c4fd3..41a2a7057a6e 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -162,3 +162,152 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        trace_ext4_sync_file_exit(inode, ret);
        return ret;
 }
+
+/*
+ * ext4_sync_files - sync several files while sharing one journal commit
+ * @files:    array of @nr_files open files
+ * @flags:    per-file flag; non-zero selects the inode's i_datasync_tid
+ *            instead of i_sync_tid as the transaction to wait for
+ * @nr_files: number of entries in @files and @flags
+ *
+ * The super block and journal are taken from files[0].  Writeback is started
+ * on every mapping first and waited for afterwards; then only the latest
+ * transaction id collected from the inodes is completed, followed by at most
+ * one explicit cache flush.
+ *
+ * Returns 0 on success or a negative errno: -EROFS on an aborted read-only
+ * fs, -EXDEV if a file is not on the super block of files[0], -ENOTSUPP when
+ * there is no journal, otherwise the first error seen, except that a later
+ * -EIO takes precedence.
+ */
+int ext4_sync_files(struct file **files, unsigned int *flags,
+                   unsigned int nr_files)
+{
+       struct super_block *sb;
+       journal_t *journal;
+       unsigned int i, j, nr_started = 0;
+       int err = 0, err2 = 0;
+       int force_commit = 0;
+       tid_t commit_tid = 0;
+       bool need_barrier = false;
+
+       J_ASSERT(ext4_journal_current_handle() == NULL);
+       if (!nr_files)
+               return 0;
+
+       sb = files[0]->f_mapping->host->i_sb;
+       journal = EXT4_SB(sb)->s_journal;
+       if (sb->s_flags & MS_RDONLY) {
+               /* Make sure that we read updated s_mount_flags value */
+               smp_rmb();
+               if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+                       return -EROFS;
+               return 0;
+       }
+
+       /* Pass 1: start writeback on every file before waiting on any. */
+       for (i = 0; i < nr_files; i++) {
+               struct address_space *mapping = files[i]->f_mapping;
+
+               /*
+                * The files are picked by the ioctl caller, so a file from
+                * another super block is an error to report, not a BUG().
+                */
+               if (mapping->host->i_sb != sb) {
+                       err = -EXDEV;
+                       break;
+               }
+
+               /* From here on file i takes part in the wait pass below. */
+               nr_started = i + 1;
+               if (!mapping->nrpages)
+                       continue;
+
+               err = filemap_fdatawrite(mapping);
+               if (err)
+                       break;
+       }
+       /*
+        * Even if the above returned error, the pages may be
+        * written partially (e.g. -ENOSPC), so we wait for it.
+        * But the -EIO is special case, it may indicate the worst
+        * thing (e.g. bug) happened, so we avoid waiting for it.
+        */
+       if (err == -EIO)
+               goto out;
+
+       /* Pass 2: wait for the data and pick the transaction to complete. */
+       for (j = 0; j < nr_started; j++) {
+               struct address_space *mapping = files[j]->f_mapping;
+               struct inode *inode = mapping->host;
+               struct ext4_inode_info *ei = EXT4_I(inode);
+               unsigned int datasync = flags[j];
+               tid_t tid;
+
+               if (mapping->nrpages) {
+                       err2 = filemap_fdatawait(mapping);
+                       if (!err || err2 == -EIO)
+                               err = err2;
+               }
+
+               inode_lock_shared(inode);
+               force_commit |= ext4_should_journal_data(inode);
+               tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+               inode_unlock_shared(inode);
+               trace_ext4_sync_files_iterate(files[j]->f_path.dentry, tid,
+                                             datasync);
+               /* Keep the latest tid (per tid_geq()) as the one to complete. */
+               if (j == 0 || !tid_geq(commit_tid, tid))
+                       commit_tid = tid;
+       }
+
+       /* Ext4 specific stuff starts here */
+       if (!journal)
+               return -ENOTSUPP;
+
+       if (force_commit) {
+               /* data=journal:
+                *  filemap_fdatawrite won't do anything (the buffers are clean).
+                *  ext4_force_commit will write the file data into the journal
+                *  and will wait on that.
+                *  filemap_fdatawait() will encounter a ton of newly-dirtied
+                *  pages (they were dirtied by commit).  But that's OK - the
+                *  blocks are safe in-journal, which is all fsync() needs to
+                *  ensure.
+                */
+               err2 = ext4_force_commit(sb);
+       } else {
+               /*
+                * data=writeback,ordered:
+                * The caller's filemap_fdatawrite()/wait will sync the data.
+                * Metadata is in the journal, we wait for proper transaction to
+                * commit here.
+                */
+               if ((journal->j_flags & JBD2_BARRIER) &&
+                   !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+                       need_barrier = true;
+
+               err2 = jbd2_complete_transaction(journal, commit_tid);
+               /* Even if we had to wait for commit completion, it does not
+                * mean a flush has been issued after data demanded by this
+                * fsync were written back. Commit could be in state after
+                * it is already done, but not yet in state where we should
+                * not wait.
+                */
+               if (need_barrier) {
+                       int flush_err;
+
+                       flush_err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL,
+                                                      NULL);
+                       /* A failed commit must not be hidden by the flush. */
+                       if (!err2)
+                               err2 = flush_err;
+               }
+       }
+out:
+       trace_ext4_sync_files_exit(files[0]->f_path.dentry, commit_tid,
+                                  need_barrier);
+       if (!err || err2 == -EIO)
+               err = err2;
+       return err;
+}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index fbd0e79e0edc..f63237823064 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1041,6 +1041,76 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
        }
        case EXT4_IOC_SHUTDOWN:
                return ext4_shutdown(sb, arg);
+       case EXT4_IOC_MFSYNC:
+       {
+               /*
+                * Sync a caller-supplied set of files in one go.  arg points
+                * to a struct ext4_ioc_mfsync_info: a count followed by that
+                * many 32-bit descriptors.  Bit 31 of a descriptor requests
+                * datasync handling for that file.
+                */
+               struct ext4_ioc_mfsync_info __user *uinfo =
+                       (struct ext4_ioc_mfsync_info __user *)arg;
+               struct ext4_ioc_mfsync_info mfsync;
+               struct file **filpp;
+               unsigned int *flags;
+               unsigned int i;
+               int err;
+
+               if (!ve_is_super(get_exec_env()))
+                       return -ENOTSUPP;
+               if (copy_from_user(&mfsync, uinfo, sizeof(mfsync)))
+                       return -EFAULT;
+
+               if (mfsync.size == 0)
+                       return 0;
+               if (mfsync.size > NR_FILE)
+                       return -ENFILE;
+
+               /* kcalloc() zeroes both arrays, so unused slots stay NULL. */
+               filpp = kcalloc(mfsync.size, sizeof(*filpp), GFP_KERNEL);
+               if (!filpp)
+                       return -ENOMEM;
+               flags = kcalloc(mfsync.size, sizeof(*flags), GFP_KERNEL);
+               if (!flags) {
+                       kfree(filpp);
+                       return -ENOMEM;
+               }
+               for (i = 0; i < mfsync.size; i++) {
+                       __u32 fd;
+
+                       err = -EFAULT;
+                       if (get_user(fd, &uinfo->fd[i]))
+                               goto mfsync_fput;
+
+                       /* bit 31 set ("negative" fd) means fdata_sync */
+                       flags[i] = (fd & (1U << 31)) != 0;
+                       fd &= ~(1U << 31);
+
+                       err = -EBADF;
+                       filpp[i] = fget(fd);
+                       if (!filpp[i])
+                               goto mfsync_fput;
+
+                       /*
+                        * ext4_sync_files() handles every file as an ext4
+                        * inode of a single super block, so reject files
+                        * that do not live on this one.
+                        */
+                       err = -EXDEV;
+                       if (filpp[i]->f_mapping->host->i_sb != sb)
+                               goto mfsync_fput;
+               }
+               err = ext4_sync_files(filpp, flags, mfsync.size);
+mfsync_fput:
+               /* Drop every reference taken above, on success and on error. */
+               for (i = 0; i < mfsync.size; i++)
+                       if (filpp[i])
+                               fput(filpp[i]);
+               kfree(filpp);
+               kfree(flags);
+               return err;
+       }
        default:
                return -ENOTTY;
        }
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 0e31eb136c57..57c10e20dd68 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -909,6 +909,70 @@ TRACE_EVENT(ext4_sync_file_exit,
                  __entry->ret)
 );
 
+/*
+ * Emitted by ext4_sync_files() once per file, after the wait for its data,
+ * with the transaction id picked for that file.
+ */
+TRACE_EVENT(ext4_sync_files_iterate,
+       TP_PROTO(struct dentry *dentry, tid_t tid, int datasync),
+
+       TP_ARGS(dentry, tid, datasync),
+
+       TP_STRUCT__entry(
+               __field(        dev_t,  dev                     )
+               __field(        ino_t,  ino                     )
+               __field(        ino_t,  parent                  )
+               __field(        int,    datasync                )
+               __field(        unsigned int,   tid             )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = dentry->d_inode->i_sb->s_dev;
+               __entry->ino            = dentry->d_inode->i_ino;
+               __entry->datasync       = datasync;
+               __entry->parent         = dentry->d_parent->d_inode->i_ino;
+               __entry->tid            = tid;
+       ),
+
+       TP_printk("dev %d,%d ino %lu parent %lu datasync %d tid %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long) __entry->ino,
+                 (unsigned long) __entry->parent, __entry->datasync,
+                 __entry->tid)
+);
+
+/*
+ * Emitted at the end of ext4_sync_files(), identified by the first file;
+ * barrier is non-zero when an explicit cache flush was required.
+ */
+TRACE_EVENT(ext4_sync_files_exit,
+       TP_PROTO(struct dentry *dentry, tid_t tid, int barrier),
+
+       TP_ARGS(dentry, tid, barrier),
+
+       TP_STRUCT__entry(
+               __field(        dev_t,  dev                     )
+               __field(        ino_t,  ino                     )
+               __field(        ino_t,  parent                  )
+               __field(        int,    barrier                 )
+               __field(        unsigned int,   tid             )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = dentry->d_inode->i_sb->s_dev;
+               __entry->ino            = dentry->d_inode->i_ino;
+               __entry->parent         = dentry->d_parent->d_inode->i_ino;
+               __entry->tid            = tid;
+               __entry->barrier        = barrier;
+       ),
+
+       TP_printk("dev %d,%d ino %lu parent %lu explicit_barrier %d tid %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long) __entry->ino,
+                 (unsigned long) __entry->parent, __entry->barrier,
+                 __entry->tid)
+);
+
 TRACE_EVENT(ext4_sync_fs,
        TP_PROTO(struct super_block *sb, int wait),
 
-- 
2.18.2

_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to