On Thu, Jan 25, 2024 at 4:01 AM Gao Xiang <hsiang...@linux.alibaba.com> wrote:
>
> I encountered a race issue after lengthy (~594647 sec) stress tests on
> a 64k-page arm64 VM with several 4k-block EROFS images.  The timing
> is as follows:
>
>  z_erofs_try_inplace_io              z_erofs_fill_bio_vec
>   cmpxchg(&compressed_bvecs[].page,
>           NULL, ..)
>                                       [access bufvec]
>   compressed_bvecs[] = *bvec;
>
> Previously, z_erofs_submit_queue() accessed only bufvec->page, so the
> other fields in bufvec didn't matter.  After subpage block support
> landed, .offset and .end can be used too, but filling a bufvec isn't
> an atomic operation, which can cause inconsistency.
>
> Let's use a spinlock to keep each bufvec atomic.  More specifically,
> just reuse the existing spinlock `pcl->obj.lockref.lock`, since it's
> rarely used (and held only briefly when it is) as long as the pcluster
> has a reference.
>
> Fixes: 192351616a9d ("erofs: support I/O submission for sub-page compressed blocks")
> Signed-off-by: Gao Xiang <hsiang...@linux.alibaba.com>
> ---
>  fs/erofs/zdata.c | 74 +++++++++++++++++++++++++-----------------------
>  1 file changed, 38 insertions(+), 36 deletions(-)
>
> diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
> index 583c062cd0e4..c1c77166b30f 100644
> --- a/fs/erofs/zdata.c
> +++ b/fs/erofs/zdata.c
> @@ -563,21 +563,19 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
>                 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
>         unsigned int i;
>
> -       if (i_blocksize(fe->inode) != PAGE_SIZE)
> -               return;
> -       if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
> +       if (i_blocksize(fe->inode) != PAGE_SIZE ||
> +           fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
>                 return;
>
>         for (i = 0; i < pclusterpages; ++i) {
>                 struct page *page, *newpage;
>                 void *t;        /* mark pages just found for debugging */
>
> -               /* the compressed page was loaded before */
> +               /* Inaccurate check w/o locking to avoid unneeded lookups */
>                 if (READ_ONCE(pcl->compressed_bvecs[i].page))
>                         continue;
>
>                 page = find_get_page(mc, pcl->obj.index + i);
> -
>                 if (page) {
>                         t = (void *)((unsigned long)page | 1);
>                         newpage = NULL;
> @@ -597,9 +595,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
>                         set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
>                         t = (void *)((unsigned long)newpage | 1);
>                 }
> -
> -               if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
> +               spin_lock(&pcl->obj.lockref.lock);
> +               if (!pcl->compressed_bvecs[i].page) {
> +                       pcl->compressed_bvecs[i].page = t;
> +                       spin_unlock(&pcl->obj.lockref.lock);
>                         continue;
> +               }
> +               spin_unlock(&pcl->obj.lockref.lock);
>
>                 if (page)
>                         put_page(page);
> @@ -718,31 +720,25 @@ int erofs_init_managed_cache(struct super_block *sb)
>         return 0;
>  }
>
> -static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
> -                                  struct z_erofs_bvec *bvec)
> -{
> -       struct z_erofs_pcluster *const pcl = fe->pcl;
> -
> -       while (fe->icur > 0) {
> -               if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
> -                            NULL, bvec->page)) {
> -                       pcl->compressed_bvecs[fe->icur] = *bvec;
> -                       return true;
> -               }
> -       }
> -       return false;
> -}
> -
>  /* callers must be with pcluster lock held */
>  static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
>                                struct z_erofs_bvec *bvec, bool exclusive)
>  {
> +       struct z_erofs_pcluster *pcl = fe->pcl;
>         int ret;
>
>         if (exclusive) {
>                 /* give priority for inplaceio to use file pages first */
> -               if (z_erofs_try_inplace_io(fe, bvec))
> +               spin_lock(&pcl->obj.lockref.lock);
> +               while (fe->icur > 0) {
> +                       if (pcl->compressed_bvecs[--fe->icur].page)
> +                               continue;
> +                       pcl->compressed_bvecs[fe->icur] = *bvec;
> +                       spin_unlock(&pcl->obj.lockref.lock);
>                         return 0;
> +               }
> +               spin_unlock(&pcl->obj.lockref.lock);
> +
>                 /* otherwise, check if it can be used as a bvpage */
>                 if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
>                     !fe->candidate_bvpage)
> @@ -1423,23 +1419,26 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>  {
>         gfp_t gfp = mapping_gfp_mask(mc);
>         bool tocache = false;
> -       struct z_erofs_bvec *zbv = pcl->compressed_bvecs + nr;
> +       struct z_erofs_bvec zbv;
>         struct address_space *mapping;
> -       struct page *page, *oldpage;
> +       struct page *page;
>         int justfound, bs = i_blocksize(f->inode);
>
>         /* Except for inplace pages, the entire page can be used for I/Os */
>         bvec->bv_offset = 0;
>         bvec->bv_len = PAGE_SIZE;
>  repeat:
> -       oldpage = READ_ONCE(zbv->page);
> -       if (!oldpage)
> +       spin_lock(&pcl->obj.lockref.lock);
> +       zbv = pcl->compressed_bvecs[nr];
> +       page = zbv.page;
> +       justfound = (unsigned long)page & 1UL;
> +       page = (struct page *)((unsigned long)page & ~1UL);
> +       pcl->compressed_bvecs[nr].page = page;
> +       spin_unlock(&pcl->obj.lockref.lock);
> +       if (!page)
>                 goto out_allocpage;
>
> -       justfound = (unsigned long)oldpage & 1UL;
> -       page = (struct page *)((unsigned long)oldpage & ~1UL);
>         bvec->bv_page = page;
> -
>         DBG_BUGON(z_erofs_is_shortlived_page(page));
>         /*
>          * Handle preallocated cached pages. We tried to allocate such pages
> @@ -1448,7 +1447,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>          */
>         if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
>                 set_page_private(page, 0);
> -               WRITE_ONCE(zbv->page, page);
>                 tocache = true;
>                 goto out_tocache;
>         }
> @@ -1459,9 +1457,9 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>          * therefore it is impossible for `mapping` to be NULL.
>          */
>         if (mapping && mapping != mc) {
> -               if (zbv->offset < 0)
> -                       bvec->bv_offset = round_up(-zbv->offset, bs);
> -               bvec->bv_len = round_up(zbv->end, bs) - bvec->bv_offset;
> +               if (zbv.offset < 0)
> +                       bvec->bv_offset = round_up(-zbv.offset, bs);
> +               bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
>                 return;
>         }
>
> @@ -1471,7 +1469,6 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>
>         /* the cached page is still in managed cache */
>         if (page->mapping == mc) {
> -               WRITE_ONCE(zbv->page, page);
>                 /*
>                  * The cached page is still available but without a valid
>                  * `->private` pcluster hint. Let's reconnect them.
> @@ -1503,11 +1500,15 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
>                 put_page(page);
>  out_allocpage:
>         page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
> -       if (oldpage != cmpxchg(&zbv->page, oldpage, page)) {
> +       spin_lock(&pcl->obj.lockref.lock);
> +       if (pcl->compressed_bvecs[nr].page) {
>                 erofs_pagepool_add(&f->pagepool, page);
> +               spin_unlock(&pcl->obj.lockref.lock);
>                 cond_resched();
>                 goto repeat;
>         }
> +       pcl->compressed_bvecs[nr].page = page;
> +       spin_unlock(&pcl->obj.lockref.lock);
>         bvec->bv_page = page;
>  out_tocache:
>         if (!tocache || bs != PAGE_SIZE ||
> @@ -1685,6 +1686,7 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
>
>                 if (cur + bvec.bv_len > end)
>                         bvec.bv_len = end - cur;
> +               DBG_BUGON(bvec.bv_len < sb->s_blocksize);
>                 if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
>                                   bvec.bv_offset))
>                         goto submit_bio_retry;
> --
> 2.39.3
>
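For anyone else following along, here is a minimal userspace sketch of the
race, not EROFS code: `slot`, writer_old()/writer_new() and the pthread
spinlock are simplified stand-ins for one compressed_bvecs[] entry, the old
z_erofs_try_inplace_io() and pcl->obj.lockref.lock. With writer_old(), a
reader can observe .page already set while .offset/.end are still stale;
filling the whole entry under the lock, as this patch does, closes that
window:

#include <pthread.h>
#include <stdio.h>

struct bvec {
        void *page;             /* published via cmpxchg in the old code */
        long offset, end;       /* filled in afterwards, non-atomically */
};

static struct bvec slot;                /* one compressed_bvecs[] entry */
static pthread_spinlock_t lock;         /* stand-in for pcl->obj.lockref.lock */

/* Old scheme: .page becomes visible before .offset/.end are written. */
static void writer_old(struct bvec *b)
{
        if (__sync_bool_compare_and_swap(&slot.page, NULL, b->page))
                slot = *b;      /* a reader may run between these two lines */
}

/* New scheme (as in this patch): fill the whole entry under the lock. */
static void writer_new(struct bvec *b)
{
        pthread_spin_lock(&lock);
        if (!slot.page)
                slot = *b;
        pthread_spin_unlock(&lock);
}

/* Reader, mirroring z_erofs_fill_bio_vec(): snapshot under the lock. */
static void *reader(void *unused)
{
        struct bvec snap;

        (void)unused;
        do {
                pthread_spin_lock(&lock);
                snap = slot;    /* like: zbv = pcl->compressed_bvecs[nr]; */
                pthread_spin_unlock(&lock);
        } while (!snap.page);
        /* With writer_old() this can print 0/0; never with writer_new(). */
        printf("offset=%ld end=%ld\n", snap.offset, snap.end);
        return NULL;
}

int main(void)
{
        struct bvec b = { .page = &b, .offset = -4096, .end = 8192 };
        pthread_t t;

        pthread_spin_init(&lock, PTHREAD_PROCESS_PRIVATE);
        pthread_create(&t, NULL, reader, NULL);
        writer_old(&b);         /* swap in writer_new(&b) to close the race */
        pthread_join(t, NULL);
        pthread_spin_destroy(&lock);
        return 0;
}

Note the reader side only stays consistent because it takes the same lock
for its snapshot, which is why the patch also converts the READ_ONCE() /
cmpxchg() sequence in z_erofs_fill_bio_vec() into a locked copy of the
whole bvec.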
LGTM!
Reviewed-by: Sandeep Dhavale <dhav...@google.com>