Stefano
commit cf7824f08a431ad5a2e1e2d20499734f0632b12d
Author: Stefano Panella <stefano.pane...@citrix.com>
Date: Wed Aug 15 15:32:52 2018 +0000
Add allocation_quantum to gfs2_write_iter
On gfs2_write_iter we know the size of the write and how much
more space we would need to complete the write but this information
is not used and instead all the space needed will be allocate 4kB
at the time from gfs2_write_begin. This behaviour is causing a massive
contention while growing hundereds of files from different nodes.
In an attempt to mitigate this problem a module parameter has been
added to configure a different allocation behaviour in gfs2_write_iter.
The module parameter is called gfs2_allocation_quantum and it has got
the following semantic:
-1: will not attempt to fallocate and not change the existing behaviour
0: (default) will only fallocate without zeroing the part which is
going to be written any way
>0: same as zero but will round up the allocation size by this value
interpreted as as kBytes. This can remove substantially the cost
of growing files at the expense of wasting more storage and having
part of the fallocated region not initialised. This option is meant
to help the use case where every file is backing up a Virtual Machine
qcow2 image for example where the file will grow linearly all the
time and is potentially going to be huge. For the fact that the newly
allocated region is uninitialised the image format wil make sure that
the guest will never see that.
The way the change has been implemented was to refactor the gfs2_fallocate
function to get also a flags parameter which can be set to include
GFS2_FALLOCATE_NO_ZERO in order to avoid writing zeros to the allocated
region. In case of the fallocate called from gfs2_write_iter the flag is
set but is not set otherwise so the behaviour of a normal fallocate will
be to still zero the range.
The performance improvement of the use case of many concurrent writes from
different nodes of very big files grown linearly is massive.
I am including the fio job which we have run on every host concurrently but
on different set of files
[ten-files]
directory=a0:a1:a2:a3:a4:a5:a6:a7:a8:a9
nrfiles=1
size=22G
bs=256k
rw=write
buffered=0
ioengine=libaio
fallocate=none
overwrite=1
numjobs=10
Signed-off-by: Stefano Panella <stefano.pane...@citrix.com>
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4c0ebff..9db9105 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -26,6 +26,7 @@
#include <linux/dlm.h>
#include <linux/dlm_plock.h>
#include <linux/delay.h>
+#include <linux/module.h>
#include "gfs2.h"
#include "incore.h"
@@ -41,6 +42,17 @@
#include "trans.h"
#include "util.h"
+static int gfs2_allocation_quantum __read_mostly = 0;
+module_param_named(allocation_quantum, gfs2_allocation_quantum, int, 0644);
+MODULE_PARM_DESC(allocation_quantum, "Allocation quantum for gfs2 writes in kBytes,
"
+ "-1 will not attempt to fallocate, "
+ "0 will only fallocate without zeroing the part which is going to
be written any way, "
+ "bigger than 0 is same as zero but will round up the allocation
size by this value interpreted as kBytes");
+
+#define GFS2_FALLOCATE_NO_ZERO 1
+static long __gfs2_fallocate_internal(struct file *file, int mode, loff_t
offset,
+ loff_t len, int flags);
+
/**
* gfs2_llseek - seek to a location in a file
* @file: the file
@@ -695,13 +707,42 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct gfs2_inode *ip = GFS2_I(file_inode(file));
+ u64 offset, size, cluster;
int ret;
ret = gfs2_rsqa_alloc(ip);
if (ret)
return ret;
- gfs2_size_hint(file, iocb->ki_pos, iov_iter_count(from));
+ offset = iocb->ki_pos;
+ size = iov_iter_count(from);
+
+ gfs2_size_hint(file, offset, size);
+
+ if (gfs2_allocation_quantum > -1) {
+ /* Make an attempt to fallocate the full size of the write if
+ * not already allocated in the file, this will decrease
+ * fragmentation and improve overall performance over allocating
+ * a page size at the time as it would happen in
gfs2_write_begin */
+ if (gfs2_write_alloc_required(ip, offset, size)) {
+ if (gfs2_allocation_quantum) {
+ /* if gfs2_allocation_quantum > 0 we do not just
+ * allocate the part missing that is just to be
+ * written but we round it up to the allocation
+ * quantum in an attempt to do few growing
+ * operations at the expense of wasting a bit
+ * more space
+ */
+ cluster = gfs2_allocation_quantum * 1024;
+ size = (((offset + size + cluster - 1) /
+ cluster) * cluster) - offset;
+ }
+ ret = __gfs2_fallocate_internal(file, 0, offset, size,
+ GFS2_FALLOCATE_NO_ZERO);
+ if (ret)
+ return ret;
+ }
+ }
if (iocb->ki_flags & IOCB_APPEND) {
struct gfs2_holder gh;
@@ -716,7 +757,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb,
struct iov_iter *from)
}
static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
- int mode)
+ int mode, int flags)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct buffer_head *dibh;
@@ -739,7 +780,10 @@ static int fallocate_chunk(struct inode *inode, loff_t
offset, loff_t len,
while (len) {
struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
bh_map.b_size = len;
- set_buffer_zeronew(&bh_map);
+ if (flags & GFS2_FALLOCATE_NO_ZERO)
+ clear_buffer_zeronew(&bh_map);
+ else
+ set_buffer_zeronew(&bh_map);
error = gfs2_block_map(inode, lblock, &bh_map, 1);
if (unlikely(error))
@@ -749,9 +793,11 @@ static int fallocate_chunk(struct inode *inode, loff_t
offset, loff_t len,
lblock += nr_blks;
if (!buffer_new(&bh_map))
continue;
- if (unlikely(!buffer_zeronew(&bh_map))) {
- error = -EIO;
- goto out;
+ if (!(flags & GFS2_FALLOCATE_NO_ZERO)) {
+ if (unlikely(!buffer_zeronew(&bh_map))) {
+ error = -EIO;
+ goto out;
+ }
}
}
out:
@@ -791,7 +837,8 @@ static void calc_max_reserv(struct gfs2_inode *ip, loff_t
*len,
}
}
-static long __gfs2_fallocate(struct file *file, int mode, loff_t offset,
loff_t len)
+static long __gfs2_fallocate(struct file *file, int mode, loff_t offset,
loff_t len,
+ int flags)
{
struct inode *inode = file_inode(file);
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -877,7 +924,7 @@ static long __gfs2_fallocate(struct file *file, int mode,
loff_t offset, loff_t
if (error)
goto out_trans_fail;
- error = fallocate_chunk(inode, offset, max_bytes, mode);
+ error = fallocate_chunk(inode, offset, max_bytes, mode, flags);
gfs2_trans_end(sdp);
if (error)
@@ -904,7 +951,8 @@ out_qunlock:
return error;
}
-static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
len)
+static long __gfs2_fallocate_internal(struct file *file, int mode, loff_t
offset,
+ loff_t len, int flags)
{
struct inode *inode = file_inode(file);
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -940,7 +988,7 @@ static long gfs2_fallocate(struct file *file, int mode,
loff_t offset, loff_t le
if (ret)
goto out_putw;
- ret = __gfs2_fallocate(file, mode, offset, len);
+ ret = __gfs2_fallocate(file, mode, offset, len, flags);
if (ret)
gfs2_rs_deltree(&ip->i_res);
@@ -954,6 +1002,11 @@ out_uninit:
return ret;
}
+static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
len)
+{
+ return __gfs2_fallocate_internal(file, mode, offset, len, 0);
+}
+
static ssize_t gfs2_file_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)