On Thu, Aug 29, 2024 at 8:29 PM Gao Xiang <hsiang...@linux.alibaba.com> wrote: > > Since EROFS only needs to handle read requests in simple contexts, > Just directly use vfs_iocb_iter_read() for data I/Os. > > Signed-off-by: Gao Xiang <hsiang...@linux.alibaba.com> > --- > v2: > - fix redundant refcount which cause hanging on chunked inodes. > > fs/erofs/Makefile | 1 + > fs/erofs/data.c | 50 +++++++++++- > fs/erofs/fileio.c | 181 ++++++++++++++++++++++++++++++++++++++++++++ > fs/erofs/inode.c | 17 +++-- > fs/erofs/internal.h | 7 +- > fs/erofs/zdata.c | 46 ++--------- > 6 files changed, 251 insertions(+), 51 deletions(-) > create mode 100644 fs/erofs/fileio.c > > diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile > index 097d672e6b14..4331d53c7109 100644 > --- a/fs/erofs/Makefile > +++ b/fs/erofs/Makefile > @@ -7,4 +7,5 @@ erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o > zutil.o > erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o > erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o > erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o > +erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o > erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o > diff --git a/fs/erofs/data.c b/fs/erofs/data.c > index 0fb31c588ae0..b4c07ce7a294 100644 > --- a/fs/erofs/data.c > +++ b/fs/erofs/data.c > @@ -132,7 +132,7 @@ int erofs_map_blocks(struct inode *inode, struct > erofs_map_blocks *map) > if (map->m_la >= inode->i_size) { > /* leave out-of-bound access unmapped */ > map->m_flags = 0; > - map->m_plen = 0; > + map->m_plen = map->m_llen; > goto out; > } > > @@ -197,8 +197,13 @@ static void erofs_fill_from_devinfo(struct erofs_map_dev > *map, > struct erofs_device_info *dif) > { > map->m_bdev = NULL; > - if (dif->file && S_ISBLK(file_inode(dif->file)->i_mode)) > - map->m_bdev = file_bdev(dif->file); > + map->m_fp = NULL; > + if (dif->file) { > + if (S_ISBLK(file_inode(dif->file)->i_mode)) > + map->m_bdev = file_bdev(dif->file); > + else > + map->m_fp = 
dif->file; > + } > map->m_daxdev = dif->dax_dev; > map->m_dax_part_off = dif->dax_part_off; > map->m_fscache = dif->fscache; > @@ -215,6 +220,7 @@ int erofs_map_dev(struct super_block *sb, struct > erofs_map_dev *map) > map->m_daxdev = EROFS_SB(sb)->dax_dev; > map->m_dax_part_off = EROFS_SB(sb)->dax_part_off; > map->m_fscache = EROFS_SB(sb)->s_fscache; > + map->m_fp = EROFS_SB(sb)->fdev; > > if (map->m_deviceid) { > down_read(&devs->rwsem); > @@ -250,6 +256,42 @@ int erofs_map_dev(struct super_block *sb, struct > erofs_map_dev *map) > return 0; > } > > +/* > + * bit 30: I/O error occurred on this folio > + * bit 0 - 29: remaining parts to complete this folio > + */ > +#define EROFS_ONLINEFOLIO_EIO (1 << 30) > + > +void erofs_onlinefolio_init(struct folio *folio) > +{ > + union { > + atomic_t o; > + void *v; > + } u = { .o = ATOMIC_INIT(1) }; > + > + folio->private = u.v; /* valid only if file-backed folio is locked > */ > +} > + > +void erofs_onlinefolio_split(struct folio *folio) > +{ > + atomic_inc((atomic_t *)&folio->private); > +} > + > +void erofs_onlinefolio_end(struct folio *folio, int err) > +{ > + int orig, v; > + > + do { > + orig = atomic_read((atomic_t *)&folio->private); > + v = (orig - 1) | (err ? 
EROFS_ONLINEFOLIO_EIO : 0); > + } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != > orig); > + > + if (v & ~EROFS_ONLINEFOLIO_EIO) > + return; > + folio->private = 0; > + folio_end_read(folio, !(v & EROFS_ONLINEFOLIO_EIO)); > +} > + > static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t > length, > unsigned int flags, struct iomap *iomap, struct iomap *srcmap) > { > @@ -399,7 +441,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, > struct iov_iter *to) > } > > /* for uncompressed (aligned) files and raw access for other files */ > -const struct address_space_operations erofs_raw_access_aops = { > +const struct address_space_operations erofs_aops = { > .read_folio = erofs_read_folio, > .readahead = erofs_readahead, > .bmap = erofs_bmap, > diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c > new file mode 100644 > index 000000000000..eab52b8abd0b > --- /dev/null > +++ b/fs/erofs/fileio.c > @@ -0,0 +1,181 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* > + * Copyright (C) 2024, Alibaba Cloud > + */ > +#include "internal.h" > +#include <trace/events/erofs.h> > + > +struct erofs_fileio_rq { > + struct bio_vec bvecs[BIO_MAX_VECS]; > + struct bio bio; > + struct kiocb iocb; > +}; > + > +struct erofs_fileio { > + struct erofs_map_blocks map; > + struct erofs_map_dev dev; > + struct erofs_fileio_rq *rq; > +}; > + > +static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret) > +{ > + struct erofs_fileio_rq *rq = > + container_of(iocb, struct erofs_fileio_rq, iocb); > + struct folio_iter fi; > + > + DBG_BUGON(rq->bio.bi_end_io); > + if (ret > 0) { > + if (ret != rq->bio.bi_iter.bi_size) { > + bio_advance(&rq->bio, ret); > + zero_fill_bio(&rq->bio); > + } > + ret = 0; > + } > + bio_for_each_folio_all(fi, &rq->bio) { > + DBG_BUGON(folio_test_uptodate(fi.folio)); > + erofs_onlinefolio_end(fi.folio, ret); > + } > + kfree(rq); > +} > + > +static void erofs_fileio_rq_submit(struct erofs_fileio_rq *rq) > +{ > + struct 
iov_iter iter; > + int ret; > + > + if (!rq) > + return; > + rq->iocb.ki_pos = rq->bio.bi_iter.bi_sector << 9; > + rq->iocb.ki_ioprio = get_current_ioprio(); > + rq->iocb.ki_complete = erofs_fileio_ki_complete; > + rq->iocb.ki_flags = (rq->iocb.ki_filp->f_mode & FMODE_CAN_ODIRECT) ? > + IOCB_DIRECT : 0; Hi Gao, Does this mean that direct I/O is used by default if the backing file supports it (technically, if the filesystem where the image/backing file resides supports it)?
Thanks, Sandeep. > + iov_iter_bvec(&iter, ITER_DEST, rq->bvecs, rq->bio.bi_vcnt, > + rq->bio.bi_iter.bi_size); > + ret = vfs_iocb_iter_read(rq->iocb.ki_filp, &rq->iocb, &iter); > + if (ret != -EIOCBQUEUED) > + erofs_fileio_ki_complete(&rq->iocb, ret); > +} > + > +static struct erofs_fileio_rq *erofs_fileio_rq_alloc(struct erofs_map_dev > *mdev) > +{ > + struct erofs_fileio_rq *rq = kzalloc(sizeof(*rq), GFP_KERNEL); > + > + if (!rq) > + return NULL; > + bio_init(&rq->bio, NULL, rq->bvecs, BIO_MAX_VECS, REQ_OP_READ); > + rq->iocb.ki_filp = mdev->m_fp; > + return rq; > +} > + > +static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio > *folio) > +{ > + struct inode *inode = folio_inode(folio); > + struct erofs_map_blocks *map = &io->map; > + unsigned int cur = 0, end = folio_size(folio), len, attached = 0; > + loff_t pos = folio_pos(folio), ofs; > + struct iov_iter iter; > + struct bio_vec bv; > + int err = 0; > + > + erofs_onlinefolio_init(folio); > + while (cur < end) { > + if (!in_range(pos + cur, map->m_la, map->m_llen)) { > + map->m_la = pos + cur; > + map->m_llen = end - cur; > + err = erofs_map_blocks(inode, map); > + if (err) > + break; > + } > + > + ofs = folio_pos(folio) + cur - map->m_la; > + len = min_t(loff_t, map->m_llen - ofs, end - cur); > + if (map->m_flags & EROFS_MAP_META) { > + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; > + void *src; > + > + src = erofs_read_metabuf(&buf, inode->i_sb, > + map->m_pa + ofs, EROFS_KMAP); > + if (IS_ERR(src)) { > + err = PTR_ERR(src); > + break; > + } > + bvec_set_folio(&bv, folio, len, cur); > + iov_iter_bvec(&iter, ITER_DEST, &bv, 1, len); > + if (copy_to_iter(src, len, &iter) != len) { > + erofs_put_metabuf(&buf); > + err = -EIO; > + break; > + } > + erofs_put_metabuf(&buf); > + } else if (!(map->m_flags & EROFS_MAP_MAPPED)) { > + folio_zero_segment(folio, cur, cur + len); > + } else { > + if (io->rq && (map->m_pa + ofs != io->dev.m_pa || > + map->m_deviceid != > io->dev.m_deviceid)) { > 
+io_retry: > + erofs_fileio_rq_submit(io->rq); > + io->rq = NULL; > + } > + > + if (!io->rq) { > + io->dev = (struct erofs_map_dev) { > + .m_pa = io->map.m_pa + ofs, > + .m_deviceid = io->map.m_deviceid, > + }; > + err = erofs_map_dev(inode->i_sb, &io->dev); > + if (err) > + break; > + io->rq = erofs_fileio_rq_alloc(&io->dev); > + if (!io->rq) { > + err = -ENOMEM; > + break; > + } > + io->rq->bio.bi_iter.bi_sector = io->dev.m_pa > >> 9; > + attached = 0; > + } > + if (!attached++) > + erofs_onlinefolio_split(folio); > + if (!bio_add_folio(&io->rq->bio, folio, len, cur)) > + goto io_retry; > + io->dev.m_pa += len; > + } > + cur += len; > + } > + erofs_onlinefolio_end(folio, err); > + return err; > +} > + > +static int erofs_fileio_read_folio(struct file *file, struct folio *folio) > +{ > + struct erofs_fileio io = {}; > + int err; > + > + trace_erofs_read_folio(folio, false); > + err = erofs_fileio_scan_folio(&io, folio); > + erofs_fileio_rq_submit(io.rq); > + return err; > +} > + > +static void erofs_fileio_readahead(struct readahead_control *rac) > +{ > + struct inode *inode = rac->mapping->host; > + struct erofs_fileio io = {}; > + struct folio *folio; > + int err; > + > + trace_erofs_readpages(inode, readahead_index(rac), > + readahead_count(rac), false); > + while ((folio = readahead_folio(rac))) { > + err = erofs_fileio_scan_folio(&io, folio); > + if (err && err != -EINTR) > + erofs_err(inode->i_sb, "readahead error at folio %lu > @ nid %llu", > + folio->index, EROFS_I(inode)->nid); > + } > + erofs_fileio_rq_submit(io.rq); > +} > + > +const struct address_space_operations erofs_fileio_aops = { > + .read_folio = erofs_fileio_read_folio, > + .readahead = erofs_fileio_readahead, > +}; > diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c > index d05b9e59f122..4a902e6e69a5 100644 > --- a/fs/erofs/inode.c > +++ b/fs/erofs/inode.c > @@ -258,11 +258,14 @@ static int erofs_fill_inode(struct inode *inode) > } > > mapping_set_large_folios(inode->i_mapping); > - if 
(erofs_is_fileio_mode(EROFS_SB(inode->i_sb))) { > - /* XXX: data I/Os will be implemented in the following > patches */ > - err = -EOPNOTSUPP; > - } else if (erofs_inode_is_data_compressed(vi->datalayout)) { > + if (erofs_inode_is_data_compressed(vi->datalayout)) { > #ifdef CONFIG_EROFS_FS_ZIP > +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE > + if (erofs_is_fileio_mode(EROFS_SB(inode->i_sb))) { > + err = -EOPNOTSUPP; > + goto out_unlock; > + } > +#endif > DO_ONCE_LITE_IF(inode->i_blkbits != PAGE_SHIFT, > erofs_info, inode->i_sb, > "EXPERIMENTAL EROFS subpage compressed block > support in use. Use at your own risk!"); > @@ -271,10 +274,14 @@ static int erofs_fill_inode(struct inode *inode) > err = -EOPNOTSUPP; > #endif > } else { > - inode->i_mapping->a_ops = &erofs_raw_access_aops; > + inode->i_mapping->a_ops = &erofs_aops; > #ifdef CONFIG_EROFS_FS_ONDEMAND > if (erofs_is_fscache_mode(inode->i_sb)) > inode->i_mapping->a_ops = &erofs_fscache_access_aops; > +#endif > +#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE > + if (erofs_is_fileio_mode(EROFS_SB(inode->i_sb))) > + inode->i_mapping->a_ops = &erofs_fileio_aops; > #endif > } > out_unlock: > diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h > index 9bf4fb1cfa09..9bc4dcfd06d7 100644 > --- a/fs/erofs/internal.h > +++ b/fs/erofs/internal.h > @@ -372,6 +372,7 @@ struct erofs_map_dev { > struct erofs_fscache *m_fscache; > struct block_device *m_bdev; > struct dax_device *m_daxdev; > + struct file *m_fp; > u64 m_dax_part_off; > > erofs_off_t m_pa; > @@ -380,7 +381,8 @@ struct erofs_map_dev { > > extern const struct super_operations erofs_sops; > > -extern const struct address_space_operations erofs_raw_access_aops; > +extern const struct address_space_operations erofs_aops; > +extern const struct address_space_operations erofs_fileio_aops; > extern const struct address_space_operations z_erofs_aops; > extern const struct address_space_operations erofs_fscache_access_aops; > > @@ -411,6 +413,9 @@ int erofs_map_dev(struct super_block 
*sb, struct > erofs_map_dev *dev); > int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, > u64 start, u64 len); > int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map); > +void erofs_onlinefolio_init(struct folio *folio); > +void erofs_onlinefolio_split(struct folio *folio); > +void erofs_onlinefolio_end(struct folio *folio, int err); > struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid); > int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, > struct kstat *stat, u32 request_mask, > diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c > index 424f656cd765..350612f32ac6 100644 > --- a/fs/erofs/zdata.c > +++ b/fs/erofs/zdata.c > @@ -122,42 +122,6 @@ static bool erofs_folio_is_managed(struct erofs_sb_info > *sbi, struct folio *fo) > return fo->mapping == MNGD_MAPPING(sbi); > } > > -/* > - * bit 30: I/O error occurred on this folio > - * bit 0 - 29: remaining parts to complete this folio > - */ > -#define Z_EROFS_FOLIO_EIO (1 << 30) > - > -static void z_erofs_onlinefolio_init(struct folio *folio) > -{ > - union { > - atomic_t o; > - void *v; > - } u = { .o = ATOMIC_INIT(1) }; > - > - folio->private = u.v; /* valid only if file-backed folio is locked > */ > -} > - > -static void z_erofs_onlinefolio_split(struct folio *folio) > -{ > - atomic_inc((atomic_t *)&folio->private); > -} > - > -static void z_erofs_onlinefolio_end(struct folio *folio, int err) > -{ > - int orig, v; > - > - do { > - orig = atomic_read((atomic_t *)&folio->private); > - v = (orig - 1) | (err ? 
Z_EROFS_FOLIO_EIO : 0); > - } while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != > orig); > - > - if (v & ~Z_EROFS_FOLIO_EIO) > - return; > - folio->private = 0; > - folio_end_read(folio, !(v & Z_EROFS_FOLIO_EIO)); > -} > - > #define Z_EROFS_ONSTACK_PAGES 32 > > /* > @@ -965,7 +929,7 @@ static int z_erofs_scan_folio(struct > z_erofs_decompress_frontend *f, > int err = 0; > > tight = (bs == PAGE_SIZE); > - z_erofs_onlinefolio_init(folio); > + erofs_onlinefolio_init(folio); > do { > if (offset + end - 1 < map->m_la || > offset + end - 1 >= map->m_la + map->m_llen) { > @@ -1024,7 +988,7 @@ static int z_erofs_scan_folio(struct > z_erofs_decompress_frontend *f, > if (err) > break; > > - z_erofs_onlinefolio_split(folio); > + erofs_onlinefolio_split(folio); > if (f->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) > f->pcl->multibases = true; > if (f->pcl->length < offset + end - map->m_la) { > @@ -1044,7 +1008,7 @@ static int z_erofs_scan_folio(struct > z_erofs_decompress_frontend *f, > tight = (bs == PAGE_SIZE); > } > } while ((end = cur) > 0); > - z_erofs_onlinefolio_end(folio, err); > + erofs_onlinefolio_end(folio, err); > return err; > } > > @@ -1147,7 +1111,7 @@ static void z_erofs_fill_other_copies(struct > z_erofs_decompress_backend *be, > cur += len; > } > kunmap_local(dst); > - z_erofs_onlinefolio_end(page_folio(bvi->bvec.page), err); > + erofs_onlinefolio_end(page_folio(bvi->bvec.page), err); > list_del(p); > kfree(bvi); > } > @@ -1302,7 +1266,7 @@ static int z_erofs_decompress_pcluster(struct > z_erofs_decompress_backend *be, > > DBG_BUGON(z_erofs_page_is_invalidated(page)); > if (!z_erofs_is_shortlived_page(page)) { > - z_erofs_onlinefolio_end(page_folio(page), err); > + erofs_onlinefolio_end(page_folio(page), err); > continue; > } > if (pcl->algorithmformat != Z_EROFS_COMPRESSION_LZ4) { > -- > 2.43.5 >