Maxim Patlasov <mpatla...@virtuozzo.com> writes: > Dima, > > I agree with general approach of this patch, but there are some > (easy-to-fix) issues. See, please, inline comments below... > > On 06/20/2016 11:58 AM, Dmitry Monakhov wrote: >> barrier code is broken in many ways: >> Currently only ->dio_submit() handles PLOOP_REQ_FORCE_{FLUSH,FUA} correctly. >> But request also can goes though ->dio_submit_alloc()->dio_submit_pad and >> write_page (for indexes) >> So in case of grow_dev we have following sequance: >> >> E_RELOC_DATA_READ: >> ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state); >> ->delta->allocate >> ->io->submit_allloc: dio_submit_alloc >> ->dio_submit_pad >> E_DATA_WBI : data written, time to update index >> ->delta->allocate_complete:ploop_index_update >> ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state); >> ->write_page >> ->ploop_map_wb_complete >> ->ploop_wb_complete_post_process >> ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state); >> E_RELOC_NULLIFY: >> >> ->submit() >> >> BUG#2: currecntly kaio write_page silently ignores REQ_FUA > > Sorry, I can't agree, it actually does not ignore: I've misstyped. I ment to say REQ_FLUSH. > >> static void >> kaio_write_page(struct ploop_io * io, struct ploop_request * preq, >> struct page * page, sector_t sec, int fua) >> { >> /* No FUA in kaio, convert it to fsync */ >> if (fua) >> set_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state); > > >> BUG#3: io_direct:dio_submit if fua_delay is not possible we MUST tag all >> bios via REQ_FUA >> not just latest one. > > No need to tag *all*. See inline comments below. > >> This patch unify barrier handling like follows: >> - Get rid of FORCE_{FLUSH,FUA} >> - Introduce DELAYED_FLUSH, currecntly it supported only by io_direct >> - fix up fua handling for dio_submit >> >> This makes reloc sequence optimal: >> io_direct >> RELOC_S: R1, W2, WBI:FLUSH|FUA >> RELOC_A: R1, W2, WBI:FLUSH|FUA, W1:NULLIFY|FUA >> io_kaio >> RELOC_S: R1, W2:FUA, WBI:FUA >> RELOC_A: R1, W2:FUA, WBI:FUA, W1:NULLIFY|FUA >> >> https://jira.sw.ru/browse/PSBM-47107 >> Signed-off-by: Dmitry Monakhov <dmonak...@openvz.org> >> --- >> drivers/block/ploop/dev.c | 8 +++++--- >> drivers/block/ploop/io_direct.c | 29 +++++++++----------------- >> drivers/block/ploop/io_kaio.c | 17 ++++++++++------ >> drivers/block/ploop/map.c | 45 >> ++++++++++++++++++++++------------------- >> include/linux/ploop/ploop.h | 8 ++++---- >> 5 files changed, 54 insertions(+), 53 deletions(-) >> >> diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c >> index 96f7850..fbc5f2f 100644 >> --- a/drivers/block/ploop/dev.c >> +++ b/drivers/block/ploop/dev.c >> @@ -1224,6 +1224,9 @@ static void ploop_complete_request(struct >> ploop_request * preq) >> >> __TRACE("Z %p %u\n", preq, preq->req_cluster); >> >> + if (!preq->error) { >> + WARN_ON(test_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state)); >> + } >> while (preq->bl.head) { >> struct bio * bio = preq->bl.head; >> preq->bl.head = bio->bi_next; >> @@ -2530,9 +2533,8 @@ restart: >> top_delta = ploop_top_delta(plo); >> sbl.head = sbl.tail = preq->aux_bio; >> >> - /* Relocated data write required sync before BAT updatee */ >> - set_bit(PLOOP_REQ_FORCE_FUA, &preq->state); >> - >> + /* Relocated data write required sync before BAT updatee >> + * this will happen inside index_update */ >> if (test_bit(PLOOP_REQ_RELOC_S, &preq->state)) { >> preq->eng_state = PLOOP_E_DATA_WBI; >> plo->st.bio_out++; >> diff --git a/drivers/block/ploop/io_direct.c >> b/drivers/block/ploop/io_direct.c >> index a6d83fe..d7ecd4a 100644 >> --- a/drivers/block/ploop/io_direct.c >> +++ b/drivers/block/ploop/io_direct.c >> @@ -90,21 +90,12 @@ dio_submit(struct ploop_io *io, struct ploop_request * >> preq, >> trace_submit(preq); >> >> preflush = !!(rw & REQ_FLUSH); >> - >> - if (test_and_clear_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state)) >> - preflush = 1; >> - >> - if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state)) >> - postfua = 1; >> - >> - if (!postfua && ploop_req_delay_fua_possible(rw, preq)) { >> - >> + postfua = !!(rw & REQ_FUA); >> + if (ploop_req_delay_fua_possible(rw, preq)) { >> /* Mark req that delayed flush required */ >> - set_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state); >> - } else if (rw & REQ_FUA) { >> - postfua = 1; >> + set_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state); >> + postfua = 0; >> } > > "postfua" is a horrible name, let us see if we can get rid of it > completely. Also, the way how ploop_req_delay_fua_possible implemented > is prone to errors (see below an issue in kaio_complete_io_state). Let's > rework it like this: Let it be postflush. > >> static inline bool ploop_req_delay_fua_possible(struct ploop_request >> *preq) >> { >> return preq->eng_state == PLOOP_E_DATA_WBI; >> } > > Then, that chunk in the dio_submit above might look as: > >> /* If we can delay, mark req that delayed flush required */ >> if ((rw & REQ_FUA) && ploop_req_delay_fua_possible(preq)) >> set_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state); > > > >> - >> rw &= ~(REQ_FLUSH | REQ_FUA); >> >> >> @@ -238,14 +229,15 @@ flush_bio: >> rw2 |= REQ_FLUSH; >> preflush = 0; >> } >> - if (unlikely(postfua && !bl.head)) >> - rw2 |= (REQ_FUA | ((bio_num) ? REQ_FLUSH : 0)); >> + /* Very unlikely, but correct. >> + * TODO: Optimize postfua via DELAY_FLUSH for any req state */ >> + if (unlikely(!postfua)) >> + rw2 |= REQ_FUA; > > If test_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state), do nothing here -- > that's obvious. So let's assume !test_bit(PLOOP_REQ_DELAYED_FLUSH, > &preq->state)... > > There is a call to bio_add_page in dio_submit before that loop. The > moment bio_add_page returned zero (i.e. OK), we know for sure which bio > it came from: it is bw.cur. Then it's enough to mark new bio like this: OK >> bio->bi_rw |= bw.cur & REQ_FUA; > > Then, instead of "submit_bio(rw2, b);", use "submit_bio(rw2 | b->bi_rw, > b);". > > This way we'll mark with FUA only those of new bio-s that has at least > some data (pages) from original bio-s with FUA. Makes sense? > > Btw, dio_io_page() must be fixed similarly (get rid of postfua, mark > with FUA only what we really need). > >> >> ploop_acc_ff_out(preq->plo, rw2 | b->bi_rw); >> submit_bio(rw2, b); >> bio_num++; >> } >> - > > I liked this empty line, please keep it! ;) > >> ploop_complete_io_request(preq); >> return; >> >> @@ -1520,15 +1512,14 @@ dio_read_page(struct ploop_io * io, struct >> ploop_request * preq, >> >> static void >> dio_write_page(struct ploop_io * io, struct ploop_request * preq, >> - struct page * page, sector_t sec, int fua) >> + struct page * page, sector_t sec, unsigned long rw) >> { >> if (!(io->files.file->f_mode & FMODE_WRITE)) { >> PLOOP_FAIL_REQUEST(preq, -EBADF); >> return; >> } >> >> - dio_io_page(io, WRITE | (fua ? REQ_FUA : 0) | REQ_SYNC, >> - preq, page, sec); >> + dio_io_page(io, rw | WRITE | REQ_SYNC, preq, page, sec); >> } >> >> static int >> diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c >> index de26319..0b93e13 100644 >> --- a/drivers/block/ploop/io_kaio.c >> +++ b/drivers/block/ploop/io_kaio.c >> @@ -72,13 +72,17 @@ static void kaio_complete_io_state(struct ploop_request >> * preq) >> } >> >> /* Convert requested fua to fsync */ >> - if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state) || >> + if (test_and_clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state) || > > io_kaio has nothing to do with DELAYED_FLUSH (only dio sets it). Hence, > no need to check it here. Why not. delayed flush is possible for !reloc writes. In such cases we may postpone data flush till index_update. So from kaio point of view it will looks like follows: W1,WBI,FSYNC. AFAIU this is legal because even if power failure happens we may have WBI, but not W1. In such case index will point to zeroed data > >> test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state)) >> post_fsync = 1; >> >> if (!post_fsync && >> - !ploop_req_delay_fua_possible(preq->req_rw, preq) && >> - (preq->req_rw & REQ_FUA)) >> + !ploop_req_delay_fua_possible(preq->req_rw, preq)) >> + post_fsync = 1; >> + /* Reloc requires FLUSH_FUA from ->index_update for delayed_fua >> + which is not supporded by ->kaio_write_page */ >> + if (test_and_clear_bit(PLOOP_REQ_RELOC_S, &preq->state) || >> + test_and_clear_bit(PLOOP_REQ_RELOC_S, &preq->state)) >> post_fsync = 1; > > I'm sorry, this looks like a mess. You can't freely clear this bit, and > RELOC_A missed. Can we replace all this lengthy code: CRAP. Off course I just want to test_bit, not clear. Same on me. > >> /* Convert requested fua to fsync */ >> if (test_and_clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state) || >> test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state)) >> post_fsync = 1; >> >> if (!post_fsync && >> !ploop_req_delay_fua_possible(preq->req_rw, preq)) >> post_fsync = 1; >> /* Reloc requires FLUSH_FUA from ->index_update for delayed_fua >> which is not supporded by ->kaio_write_page */ >> if (test_and_clear_bit(PLOOP_REQ_RELOC_S, &preq->state) || >> test_and_clear_bit(PLOOP_REQ_RELOC_S, &preq->state)) >> post_fsync = 1; > > with something more brief like this: > >> bool fua = preq->req_rw & REQ_FUA; >> unsigned long state = READ_ONCE(preq->state); >> bool reloc = state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL); >> >> if (test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state) || >> (reloc && ploop_req_delay_fua_possible(preq)) || >> (fua && !ploop_req_delay_fua_possible(preq))) >> post_fsync = 1; > > Note, how it fixes a bug: if !(preq->req_rw & REQ_FUA), > delay_fua_possible(rw, preq) returned zero anyway. Hence we did > post_sync=1 even though the request has nothing to do with FUA! > > >> >> preq->req_rw &= ~REQ_FUA; >> @@ -608,12 +612,13 @@ kaio_read_page(struct ploop_io * io, struct >> ploop_request * preq, >> >> static void >> kaio_write_page(struct ploop_io * io, struct ploop_request * preq, >> - struct page * page, sector_t sec, int fua) >> + struct page * page, sector_t sec, unsigned long rw) >> { >> ploop_prepare_tracker(preq, sec); >> - >> + /* This is async call , preflush is not supported */ >> + BUG_ON(rw & REQ_FLUSH); >> /* No FUA in kaio, convert it to fsync */ >> - if (fua) >> + if (rw & REQ_FUA) >> set_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state); >> >> kaio_io_page(io, IOCB_CMD_WRITE_ITER, preq, page, sec); >> diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c >> index 3a6365d..de9bb32 100644 >> --- a/drivers/block/ploop/map.c >> +++ b/drivers/block/ploop/map.c >> @@ -896,6 +896,8 @@ void ploop_index_update(struct ploop_request * preq) >> struct ploop_device * plo = preq->plo; >> struct map_node * m = preq->map; >> struct ploop_delta * top_delta = map_top_delta(m->parent); >> + unsigned long rw = preq->req_rw & REQ_FUA; >> + unsigned long state = READ_ONCE(preq->state); >> u32 idx; >> map_index_t blk; >> int old_level; >> @@ -953,13 +955,14 @@ void ploop_index_update(struct ploop_request * preq) >> __TRACE("wbi %p %u %p\n", preq, preq->req_cluster, m); >> plo->st.map_single_writes++; >> top_delta->ops->map_index(top_delta, m->mn_start, &sec); >> - /* Relocate requires consistent writes, mark such reqs appropriately */ >> - if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) || >> - test_bit(PLOOP_REQ_RELOC_S, &preq->state)) >> - set_bit(PLOOP_REQ_FORCE_FUA, &preq->state); >> - >> - top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, >> - !!(preq->req_rw & REQ_FUA)); >> + /* Relocate requires consistent index update */ >> + if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) >> + rw |= REQ_FUA; >> + if (state & PLOOP_REQ_DELAYED_FLUSH_FL) >> + rw |= REQ_FLUSH; >> + if (rw) >> + clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state); > > Can you please move clear_bit upward, like this: > >> if (state & PLOOP_REQ_DELAYED_FLUSH_FL) { >> rw |= REQ_FLUSH; >> clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state); >> } > >> + top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, rw | >> WRITE); >> put_page(page); >> return; >> >> @@ -1063,7 +1066,7 @@ static void map_wb_complete_post_process(struct >> ploop_map *map, >> * (see dio_submit()). So fsync of EXT4 image doesnt help us. >> * We need to force sync of nullified blocks. >> */ >> - set_bit(PLOOP_REQ_FORCE_FUA, &preq->state); >> + preq->req_rw |= REQ_FUA; >> top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw, >> &sbl, preq->iblock, 1<<plo->cluster_log); >> } >> @@ -1074,11 +1077,12 @@ static void map_wb_complete(struct map_node * m, int >> err) >> struct ploop_delta * top_delta = map_top_delta(m->parent); >> struct list_head * cursor, * tmp; >> struct ploop_request * main_preq; >> + unsigned long rw = 0; >> struct page * page = NULL; >> int delayed = 0; >> unsigned int idx; >> sector_t sec; >> - int fua, force_fua; >> + int fua; >> >> /* First, complete processing of written back indices, >> * finally instantiate indices in mapping cache. >> @@ -1149,10 +1153,10 @@ static void map_wb_complete(struct map_node * m, int >> err) >> >> main_preq = NULL; >> fua = 0; >> - force_fua = 0; >> >> list_for_each_safe(cursor, tmp, &m->io_queue) { >> struct ploop_request * preq; >> + unsigned long state; >> >> preq = list_entry(cursor, struct ploop_request, list); >> >> @@ -1167,13 +1171,15 @@ static void map_wb_complete(struct map_node * m, int >> err) >> spin_unlock_irq(&plo->lock); >> break; >> } >> - >> - if (preq->req_rw & REQ_FUA) >> - fua = 1; >> - >> - if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) || >> - test_bit(PLOOP_REQ_RELOC_S, &preq->state)) >> - force_fua = 1; >> + state = READ_ONCE(preq->state); >> + /* Relocate requires consistent index update */ >> + rw |= preq->req_rw & REQ_FUA; >> + if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) >> + rw |= REQ_FUA; >> + if (state & PLOOP_REQ_DELAYED_FLUSH_FL) >> + rw |= REQ_FLUSH; >> + if (rw) >> + clear_bit(PLOOP_REQ_DELAYED_FLUSH, >> &preq->state); > > The same as above -- easier to read if clear_bit is inside DELAYED_FLUSH
> "if". > > Thanks, > Maxim > >> >> preq->eng_state = PLOOP_E_INDEX_WB; >> get_page(page); >> @@ -1200,10 +1206,7 @@ static void map_wb_complete(struct map_node * m, int >> err) >> plo->st.map_multi_writes++; >> top_delta->ops->map_index(top_delta, m->mn_start, &sec); >> >> - if (force_fua) >> - set_bit(PLOOP_REQ_FORCE_FUA, &main_preq->state); >> - >> - top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec, >> fua); >> + top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec, rw >> | WRITE); >> put_page(page); >> } >> >> diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h >> index 0fba25e..2f26824 100644 >> --- a/include/linux/ploop/ploop.h >> +++ b/include/linux/ploop/ploop.h >> @@ -157,7 +157,7 @@ struct ploop_io_ops >> void (*read_page)(struct ploop_io * io, struct ploop_request * preq, >> struct page * page, sector_t sec); >> void (*write_page)(struct ploop_io * io, struct ploop_request * preq, >> - struct page * page, sector_t sec, int fua); >> + struct page * page, sector_t sec, unsigned long >> rw); >> >> >> int (*sync_read)(struct ploop_io * io, struct page * page, >> @@ -465,8 +465,7 @@ enum >> PLOOP_REQ_ZERO, >> PLOOP_REQ_DISCARD, >> PLOOP_REQ_RSYNC, >> - PLOOP_REQ_FORCE_FUA, /*force fua of req write I/O by engine */ >> - PLOOP_REQ_FORCE_FLUSH, /*force flush by engine */ >> + PLOOP_REQ_DELAYED_FLUSH,/* REQ_FLUSH is delayed */ >> PLOOP_REQ_KAIO_FSYNC, /*force image fsync by KAIO module */ >> PLOOP_REQ_POST_SUBMIT, /* preq needs post_submit processing */ >> PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */ >> @@ -477,6 +476,7 @@ enum >> #define PLOOP_REQ_MERGE_FL (1 << PLOOP_REQ_MERGE) >> #define PLOOP_REQ_RELOC_A_FL (1 << PLOOP_REQ_RELOC_A) >> #define PLOOP_REQ_RELOC_S_FL (1 << PLOOP_REQ_RELOC_S) >> +#define PLOOP_REQ_DELAYED_FLUSH_FL (1 << PLOOP_REQ_DELAYED_FLUSH) >> #define PLOOP_REQ_DISCARD_FL (1 << PLOOP_REQ_DISCARD) >> #define PLOOP_REQ_ZERO_FL (1 << PLOOP_REQ_ZERO) >> >> @@ -614,7 +614,7 @@ static inline int ploop_req_delay_fua_possible(unsigned >> long rw, >> * fua. >> */ >> if (rw & REQ_FUA) { >> - if (preq->eng_state != PLOOP_E_COMPLETE) >> + if (preq->eng_state == PLOOP_E_DATA_WBI) >> delay_fua = 1; >> } >> return delay_fua;
signature.asc
Description: PGP signature
_______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel