ср, 31 окт. 2018 г. в 21:12, Nick Terrell <terre...@fb.com>: > > From: Jennifer Liu <jenniferliu...@fb.com> > > Adds zstd compression level support to btrfs. Zstd requires > different amounts of memory for each level, so the design had > to be modified to allow set_level() to allocate memory. We > preallocate one workspace of the maximum size to guarantee > forward progress. This feature is expected to be useful for > read-mostly filesystems, or when creating images. > > Benchmarks run in qemu on Intel x86 with a single core. > The benchmark measures the time to copy the Silesia corpus [0] to > a btrfs filesystem 10 times, then read it back. > > The two important things to note are: > - The decompression speed and memory remains constant. > The memory required to decompress is the same as level 1. > - The compression speed and ratio will vary based on the source. > > Level Ratio Compression Decompression Compression Memory > 1 2.59 153 MB/s 112 MB/s 0.8 MB > 2 2.67 136 MB/s 113 MB/s 1.0 MB > 3 2.72 106 MB/s 115 MB/s 1.3 MB > 4 2.78 86 MB/s 109 MB/s 0.9 MB > 5 2.83 69 MB/s 109 MB/s 1.4 MB > 6 2.89 53 MB/s 110 MB/s 1.5 MB > 7 2.91 40 MB/s 112 MB/s 1.4 MB > 8 2.92 34 MB/s 110 MB/s 1.8 MB > 9 2.93 27 MB/s 109 MB/s 1.8 MB > 10 2.94 22 MB/s 109 MB/s 1.8 MB > 11 2.95 17 MB/s 114 MB/s 1.8 MB > 12 2.95 13 MB/s 113 MB/s 1.8 MB > 13 2.95 10 MB/s 111 MB/s 2.3 MB > 14 2.99 7 MB/s 110 MB/s 2.6 MB > 15 3.03 6 MB/s 110 MB/s 2.6 MB > > [0] http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia > > Signed-off-by: Jennifer Liu <jenniferliu...@fb.com> > Signed-off-by: Nick Terrell <terre...@fb.com> > Reviewed-by: Omar Sandoval <osan...@fb.com> > --- > v1 -> v2: > - Don't reflow the unchanged line. 
> > fs/btrfs/compression.c | 169 +++++++++++++++++++++++++---------------- > fs/btrfs/compression.h | 18 +++-- > fs/btrfs/lzo.c | 5 +- > fs/btrfs/super.c | 7 +- > fs/btrfs/zlib.c | 33 ++++---- > fs/btrfs/zstd.c | 74 +++++++++++++----- > 6 files changed, 202 insertions(+), 104 deletions(-) > > diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c > index 2955a4ea2fa8..b46652cb653e 100644 > --- a/fs/btrfs/compression.c > +++ b/fs/btrfs/compression.c > @@ -822,9 +822,12 @@ void __init btrfs_init_compress(void) > > /* > * Preallocate one workspace for each compression type so > - * we can guarantee forward progress in the worst case > + * we can guarantee forward progress in the worst case. > + * Provide the maximum compression level to guarantee large > + * enough workspace. > */ > - workspace = btrfs_compress_op[i]->alloc_workspace(); > + workspace = btrfs_compress_op[i]->alloc_workspace( > + btrfs_compress_op[i]->max_level); > if (IS_ERR(workspace)) { > pr_warn("BTRFS: cannot preallocate compression > workspace, will try later\n"); > } else { > @@ -835,23 +838,78 @@ void __init btrfs_init_compress(void) > } > } > > +/* > + * put a workspace struct back on the list or free it if we have enough > + * idle ones sitting around > + */ > +static void __free_workspace(int type, struct list_head *workspace, > + bool heuristic) > +{ > + int idx = type - 1; > + struct list_head *idle_ws; > + spinlock_t *ws_lock; > + atomic_t *total_ws; > + wait_queue_head_t *ws_wait; > + int *free_ws; > + > + if (heuristic) { > + idle_ws = &btrfs_heuristic_ws.idle_ws; > + ws_lock = &btrfs_heuristic_ws.ws_lock; > + total_ws = &btrfs_heuristic_ws.total_ws; > + ws_wait = &btrfs_heuristic_ws.ws_wait; > + free_ws = &btrfs_heuristic_ws.free_ws; > + } else { > + idle_ws = &btrfs_comp_ws[idx].idle_ws; > + ws_lock = &btrfs_comp_ws[idx].ws_lock; > + total_ws = &btrfs_comp_ws[idx].total_ws; > + ws_wait = &btrfs_comp_ws[idx].ws_wait; > + free_ws = &btrfs_comp_ws[idx].free_ws; > + } > + > + 
spin_lock(ws_lock); > + if (*free_ws <= num_online_cpus()) { > + list_add(workspace, idle_ws); > + (*free_ws)++; > + spin_unlock(ws_lock); > + goto wake; > + } > + spin_unlock(ws_lock); > + > + if (heuristic) > + free_heuristic_ws(workspace); > + else > + btrfs_compress_op[idx]->free_workspace(workspace); > + atomic_dec(total_ws); > +wake: > + cond_wake_up(ws_wait); > +} > + > +static void free_workspace(int type, struct list_head *ws) > +{ > + return __free_workspace(type, ws, false); > +} > + > /* > * This finds an available workspace or allocates a new one. > * If it's not possible to allocate a new one, waits until there's one. > * Preallocation makes a forward progress guarantees and we do not return > * errors. > */ > -static struct list_head *__find_workspace(int type, bool heuristic) > +static struct list_head *__find_workspace(unsigned int type_level, > + bool heuristic) > { > struct list_head *workspace; > int cpus = num_online_cpus(); > + int type = type_level & 0xF; > int idx = type - 1; > - unsigned nofs_flag; > + unsigned int level = (type_level & 0xF0) >> 4; > + unsigned int nofs_flag; > struct list_head *idle_ws; > spinlock_t *ws_lock; > atomic_t *total_ws; > wait_queue_head_t *ws_wait; > int *free_ws; > + int ret; > > if (heuristic) { > idle_ws = &btrfs_heuristic_ws.idle_ws; > @@ -874,8 +932,17 @@ static struct list_head *__find_workspace(int type, bool > heuristic) > list_del(workspace); > (*free_ws)--; > spin_unlock(ws_lock); > + if (!heuristic) { > + nofs_flag = memalloc_nofs_save(); > + ret = btrfs_compress_op[idx]->set_level(workspace, > + level); > + memalloc_nofs_restore(nofs_flag); > + if (!ret) { > + free_workspace(type, workspace); > + goto again; > + } > + } > return workspace; > - > } > if (atomic_read(total_ws) > cpus) { > DEFINE_WAIT(wait); > @@ -899,7 +966,8 @@ static struct list_head *__find_workspace(int type, bool > heuristic) > if (heuristic) > workspace = alloc_heuristic_ws(); > else > - workspace = 
btrfs_compress_op[idx]->alloc_workspace(); > + workspace = btrfs_compress_op[idx]->alloc_workspace(level); > + > memalloc_nofs_restore(nofs_flag); > > if (IS_ERR(workspace)) { > @@ -930,60 +998,22 @@ static struct list_head *__find_workspace(int type, > bool heuristic) > return workspace; > } > > -static struct list_head *find_workspace(int type) > +static struct list_head *find_workspace(unsigned int type_level) > { > - return __find_workspace(type, false); > + return __find_workspace(type_level, false); > } > > -/* > - * put a workspace struct back on the list or free it if we have enough > - * idle ones sitting around > - */ > -static void __free_workspace(int type, struct list_head *workspace, > - bool heuristic) > +static struct list_head *find_decompression_workspace(unsigned int type) > { > - int idx = type - 1; > - struct list_head *idle_ws; > - spinlock_t *ws_lock; > - atomic_t *total_ws; > - wait_queue_head_t *ws_wait; > - int *free_ws; > + /* > + * Use the lowest level for decompression, since we don't need to > + * compress. This can help us save memory when using levels lower than > + * the default level. 
> + */ > + const unsigned int level = 1; > + const unsigned int type_level = (level << 4) | (type & 0xF); > > - if (heuristic) { > - idle_ws = &btrfs_heuristic_ws.idle_ws; > - ws_lock = &btrfs_heuristic_ws.ws_lock; > - total_ws = &btrfs_heuristic_ws.total_ws; > - ws_wait = &btrfs_heuristic_ws.ws_wait; > - free_ws = &btrfs_heuristic_ws.free_ws; > - } else { > - idle_ws = &btrfs_comp_ws[idx].idle_ws; > - ws_lock = &btrfs_comp_ws[idx].ws_lock; > - total_ws = &btrfs_comp_ws[idx].total_ws; > - ws_wait = &btrfs_comp_ws[idx].ws_wait; > - free_ws = &btrfs_comp_ws[idx].free_ws; > - } > - > - spin_lock(ws_lock); > - if (*free_ws <= num_online_cpus()) { > - list_add(workspace, idle_ws); > - (*free_ws)++; > - spin_unlock(ws_lock); > - goto wake; > - } > - spin_unlock(ws_lock); > - > - if (heuristic) > - free_heuristic_ws(workspace); > - else > - btrfs_compress_op[idx]->free_workspace(workspace); > - atomic_dec(total_ws); > -wake: > - cond_wake_up(ws_wait); > -} > - > -static void free_workspace(int type, struct list_head *ws) > -{ > - return __free_workspace(type, ws, false); > + return find_workspace(type_level); > } > > /* > @@ -1044,9 +1074,7 @@ int btrfs_compress_pages(unsigned int type_level, > struct address_space *mapping, > int ret; > int type = type_level & 0xF; > > - workspace = find_workspace(type); > - > - btrfs_compress_op[type - 1]->set_level(workspace, type_level); > + workspace = find_workspace(type_level); > ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, > start, pages, > out_pages, > @@ -1075,7 +1103,7 @@ static int btrfs_decompress_bio(struct compressed_bio > *cb) > int ret; > int type = cb->compress_type; > > - workspace = find_workspace(type); > + workspace = find_decompression_workspace(type); > ret = btrfs_compress_op[type - 1]->decompress_bio(workspace, cb); > free_workspace(type, workspace); > > @@ -1093,7 +1121,7 @@ int btrfs_decompress(int type, unsigned char *data_in, > struct page *dest_page, > struct list_head *workspace; > 
int ret; > > - workspace = find_workspace(type); > + workspace = find_decompression_workspace(type); > > ret = btrfs_compress_op[type-1]->decompress(workspace, data_in, > dest_page, start_byte, > @@ -1591,12 +1619,23 @@ int btrfs_compress_heuristic(struct inode *inode, u64 > start, u64 end) > > unsigned int btrfs_compress_str2level(const char *str) > { > - if (strncmp(str, "zlib", 4) != 0) > + int ret; > + int type; > + unsigned int level; > + > + if (strncmp(str, "zlib", 4) == 0) > + type = BTRFS_COMPRESS_ZLIB; > + else if (strncmp(str, "zstd", 4) == 0) > + type = BTRFS_COMPRESS_ZSTD; > + else > return 0; > > - /* Accepted form: zlib:1 up to zlib:9 and nothing left after the > number */ > - if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0) > - return str[5] - '0'; > + if (str[4] == ':') { > + ret = kstrtouint(str + 5, 10, &level); > + if (ret == 0 && 0 < level && > + level <= btrfs_compress_op[type-1]->max_level) > + return level; > + } > > - return BTRFS_ZLIB_DEFAULT_LEVEL; > + return btrfs_compress_op[type-1]->default_level; > } > diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h > index ddda9b80bf20..a582a4483077 100644 > --- a/fs/btrfs/compression.h > +++ b/fs/btrfs/compression.h > @@ -23,8 +23,6 @@ > /* Maximum size of data before compression */ > #define BTRFS_MAX_UNCOMPRESSED (SZ_128K) > > -#define BTRFS_ZLIB_DEFAULT_LEVEL 3 > - > struct compressed_bio { > /* number of bios pending for this compressed extent */ > refcount_t pending_bios; > @@ -87,7 +85,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode > *inode, u64 start, > blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio > *bio, > int mirror_num, unsigned long bio_flags); > > -unsigned btrfs_compress_str2level(const char *str); > +unsigned int btrfs_compress_str2level(const char *str); > > enum btrfs_compression_type { > BTRFS_COMPRESS_NONE = 0, > @@ -98,7 +96,7 @@ enum btrfs_compression_type { > }; > > struct btrfs_compress_op { > - struct 
list_head *(*alloc_workspace)(void); > + struct list_head *(*alloc_workspace)(unsigned int level); > > void (*free_workspace)(struct list_head *workspace); > > @@ -119,7 +117,17 @@ struct btrfs_compress_op { > unsigned long start_byte, > size_t srclen, size_t destlen); > > - void (*set_level)(struct list_head *ws, unsigned int type); > + /* > + * Check if memory allocated in workspace is greater than > + * or equal to memory needed to compress with given level. > + * If not, try to reallocate memory for the compression level. > + * Returns true on success. > + */ > + bool (*set_level)(struct list_head *ws, unsigned int level); > + > + unsigned int max_level; > + > + unsigned int default_level; > }; > > extern const struct btrfs_compress_op btrfs_zlib_compress; > diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c > index b6a4cc178bee..ed9f0da8ceda 100644 > --- a/fs/btrfs/lzo.c > +++ b/fs/btrfs/lzo.c > @@ -71,7 +71,7 @@ static void lzo_free_workspace(struct list_head *ws) > kfree(workspace); > } > > -static struct list_head *lzo_alloc_workspace(void) > +static struct list_head *lzo_alloc_workspace(unsigned int level) > { > struct workspace *workspace; > > @@ -485,8 +485,9 @@ static int lzo_decompress(struct list_head *ws, unsigned > char *data_in, > return ret; > } > > -static void lzo_set_level(struct list_head *ws, unsigned int type) > +static bool lzo_set_level(struct list_head *ws, unsigned int level) > { > + return true; > } > > const struct btrfs_compress_op btrfs_lzo_compress = { > diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c > index b362b45dd757..77ebd69371f1 100644 > --- a/fs/btrfs/super.c > +++ b/fs/btrfs/super.c > @@ -520,7 +520,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char > *options, > compress_type = "zlib"; > > info->compress_type = BTRFS_COMPRESS_ZLIB; > - info->compress_level = > BTRFS_ZLIB_DEFAULT_LEVEL; > + info->compress_level = > + btrfs_zlib_compress.default_level; > /* > * args[0] contains uninitialized data since > * for these 
tokens we don't expect any > @@ -542,9 +543,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char > *options, > btrfs_clear_opt(info->mount_opt, NODATASUM); > btrfs_set_fs_incompat(info, COMPRESS_LZO); > no_compress = 0; > - } else if (strcmp(args[0].from, "zstd") == 0) { > + } else if (strncmp(args[0].from, "zstd", 4) == 0) { > compress_type = "zstd"; > info->compress_type = BTRFS_COMPRESS_ZSTD; > + info->compress_level = > + > btrfs_compress_str2level(args[0].from); > btrfs_set_opt(info->mount_opt, COMPRESS); > btrfs_clear_opt(info->mount_opt, NODATACOW); > btrfs_clear_opt(info->mount_opt, NODATASUM); > diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c > index 970ff3e35bb3..4c30a99b80f6 100644 > --- a/fs/btrfs/zlib.c > +++ b/fs/btrfs/zlib.c > @@ -20,6 +20,9 @@ > #include <linux/refcount.h> > #include "compression.h" > > +#define BTRFS_ZLIB_DEFAULT_LEVEL 3 > +#define BTRFS_ZLIB_MAX_LEVEL 9 > + > struct workspace { > z_stream strm; > char *buf; > @@ -36,7 +39,19 @@ static void zlib_free_workspace(struct list_head *ws) > kfree(workspace); > } > > -static struct list_head *zlib_alloc_workspace(void) > +static bool zlib_set_level(struct list_head *ws, unsigned int level) > +{ > + struct workspace *workspace = list_entry(ws, struct workspace, list); > + > + if (level > BTRFS_ZLIB_MAX_LEVEL) > + level = BTRFS_ZLIB_MAX_LEVEL; > + > + workspace->level = level > 0 ? 
level : BTRFS_ZLIB_DEFAULT_LEVEL; > + > + return true; > +} > + > +static struct list_head *zlib_alloc_workspace(unsigned int level) > { > struct workspace *workspace; > int workspacesize; > @@ -53,6 +68,7 @@ static struct list_head *zlib_alloc_workspace(void) > goto fail; > > INIT_LIST_HEAD(&workspace->list); > + zlib_set_level(&workspace->list, level); > > return &workspace->list; > fail: > @@ -390,22 +406,13 @@ static int zlib_decompress(struct list_head *ws, > unsigned char *data_in, > return ret; > } > > -static void zlib_set_level(struct list_head *ws, unsigned int type) > -{ > - struct workspace *workspace = list_entry(ws, struct workspace, list); > - unsigned level = (type & 0xF0) >> 4; > - > - if (level > 9) > - level = 9; > - > - workspace->level = level > 0 ? level : 3; > -} > - > const struct btrfs_compress_op btrfs_zlib_compress = { > .alloc_workspace = zlib_alloc_workspace, > .free_workspace = zlib_free_workspace, > .compress_pages = zlib_compress_pages, > .decompress_bio = zlib_decompress_bio, > .decompress = zlib_decompress, > - .set_level = zlib_set_level, > + .set_level = zlib_set_level, > + .max_level = BTRFS_ZLIB_MAX_LEVEL, > + .default_level = BTRFS_ZLIB_DEFAULT_LEVEL, > }; > diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c > index af6ec59972f5..e5d7c2eae65c 100644 > --- a/fs/btrfs/zstd.c > +++ b/fs/btrfs/zstd.c > @@ -19,12 +19,13 @@ > > #define ZSTD_BTRFS_MAX_WINDOWLOG 17 > #define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) > -#define ZSTD_BTRFS_DEFAULT_LEVEL 3 > +#define BTRFS_ZSTD_DEFAULT_LEVEL 3 > +#define BTRFS_ZSTD_MAX_LEVEL 15 > > -static ZSTD_parameters zstd_get_btrfs_parameters(size_t src_len) > +static ZSTD_parameters zstd_get_btrfs_parameters(size_t src_len, > + unsigned int level) > { > - ZSTD_parameters params = ZSTD_getParams(ZSTD_BTRFS_DEFAULT_LEVEL, > - src_len, 0); > + ZSTD_parameters params = ZSTD_getParams(level, src_len, 0); > > if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG) > params.cParams.windowLog = 
ZSTD_BTRFS_MAX_WINDOWLOG; > @@ -37,10 +38,25 @@ struct workspace { > size_t size; > char *buf; > struct list_head list; > + unsigned int level; > ZSTD_inBuffer in_buf; > ZSTD_outBuffer out_buf; > }; > > +static bool zstd_reallocate_mem(struct workspace *workspace, int size) > +{ > + void *new_mem; > + > + new_mem = kvmalloc(size, GFP_KERNEL); > + if (new_mem) { > + kvfree(workspace->mem); > + workspace->mem = new_mem; > + workspace->size = size; > + return true; > + } > + return false; > +} > + > static void zstd_free_workspace(struct list_head *ws) > { > struct workspace *workspace = list_entry(ws, struct workspace, list); > @@ -50,10 +66,34 @@ static void zstd_free_workspace(struct list_head *ws) > kfree(workspace); > } > > -static struct list_head *zstd_alloc_workspace(void) > +static bool zstd_set_level(struct list_head *ws, unsigned int level) > +{ > + struct workspace *workspace = list_entry(ws, struct workspace, list); > + ZSTD_parameters params; > + int size; > + > + if (level > BTRFS_ZSTD_MAX_LEVEL) > + level = BTRFS_ZSTD_MAX_LEVEL; > + > + if (level == 0) > + level = BTRFS_ZSTD_DEFAULT_LEVEL; > + > + params = ZSTD_getParams(level, ZSTD_BTRFS_MAX_INPUT, 0); > + size = max_t(size_t, > + ZSTD_CStreamWorkspaceBound(params.cParams), > + ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT)); > + if (size > workspace->size) { > + if (!zstd_reallocate_mem(workspace, size)) > + return false; > + } > + workspace->level = level; > + return true; > +} > + > +static struct list_head *zstd_alloc_workspace(unsigned int level) > { > ZSTD_parameters params = > - zstd_get_btrfs_parameters(ZSTD_BTRFS_MAX_INPUT); > + zstd_get_btrfs_parameters(ZSTD_BTRFS_MAX_INPUT, > level); > struct workspace *workspace; > > workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); > @@ -69,6 +109,7 @@ static struct list_head *zstd_alloc_workspace(void) > goto fail; > > INIT_LIST_HEAD(&workspace->list); > + zstd_set_level(&workspace->list, level); > > return &workspace->list; > fail: > @@ -95,7 
+136,8 @@ static int zstd_compress_pages(struct list_head *ws, > unsigned long len = *total_out; > const unsigned long nr_dest_pages = *out_pages; > unsigned long max_out = nr_dest_pages * PAGE_SIZE; > - ZSTD_parameters params = zstd_get_btrfs_parameters(len); > + ZSTD_parameters params = zstd_get_btrfs_parameters(len, > + workspace->level); > > *out_pages = 0; > *total_out = 0; > @@ -419,15 +461,13 @@ static int zstd_decompress(struct list_head *ws, > unsigned char *data_in, > return ret; > } > > -static void zstd_set_level(struct list_head *ws, unsigned int type) > -{ > -} > - > const struct btrfs_compress_op btrfs_zstd_compress = { > - .alloc_workspace = zstd_alloc_workspace, > - .free_workspace = zstd_free_workspace, > - .compress_pages = zstd_compress_pages, > - .decompress_bio = zstd_decompress_bio, > - .decompress = zstd_decompress, > - .set_level = zstd_set_level, > + .alloc_workspace = zstd_alloc_workspace, > + .free_workspace = zstd_free_workspace, > + .compress_pages = zstd_compress_pages, > + .decompress_bio = zstd_decompress_bio, > + .decompress = zstd_decompress, > + .set_level = zstd_set_level, > + .max_level = BTRFS_ZSTD_MAX_LEVEL, > + .default_level = BTRFS_ZSTD_DEFAULT_LEVEL, > }; > -- > 2.17.1
Reviewed-by: Timofey Titovets <nefelim...@gmail.com> One thing you didn't mention, so let me ask: did you measure the compression ratio/performance with compress-force, or only with compress? Thanks. -- Have a nice day, Timofey.