barrier code is broken in many ways: Currently only ->dio_submit() handles PLOOP_REQ_FORCE_{FLUSH,FUA} correctly. But a request can also go through ->dio_submit_alloc()->dio_submit_pad and write_page (for indexes). So in the case of grow_dev we have the following sequence:
E_RELOC_DATA_READ: ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state); ->delta->allocate ->io->submit_allloc: dio_submit_alloc ->dio_submit_pad E_DATA_WBI : data written, time to update index ->delta->allocate_complete:ploop_index_update ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state); ->write_page ->ploop_map_wb_complete ->ploop_wb_complete_post_process ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state); E_RELOC_NULLIFY: ->submit() BUG#2: currecntly kaio write_page silently ignores REQ_FLUSH BUG#3: io_direct:dio_submit if fua_delay is not possible we MUST tag all bios via REQ_FUA not just latest one. This patch unify barrier handling like follows: - Get rid of FORCE_{FLUSH,FUA} - Introduce DELAYED_FLUSH - fix fua handling for dio_submit - BUG_ON for REQ_FLUSH in kaio_page_write This makes reloc sequence optimal: io_direct RELOC_S: R1, W2, WBI:FLUSH|FUA RELOC_A: R1, W2, WBI:FLUSH|FUA, W1:NULLIFY|FUA io_kaio RELOC_S: R1, W2:FUA, WBI:FUA RELOC_A: R1, W2:FUA, WBI:FUA, W1:NULLIFY|FUA https://jira.sw.ru/browse/PSBM-47107 Signed-off-by: Dmitry Monakhov <dmonak...@openvz.org> --- drivers/block/ploop/dev.c | 8 +++++--- drivers/block/ploop/io_direct.c | 30 ++++++++++----------------- drivers/block/ploop/io_kaio.c | 23 +++++++++++++-------- drivers/block/ploop/map.c | 45 ++++++++++++++++++++++------------------- include/linux/ploop/ploop.h | 19 +++++------------ 5 files changed, 60 insertions(+), 65 deletions(-) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index 96f7850..fbc5f2f 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -1224,6 +1224,9 @@ static void ploop_complete_request(struct ploop_request * preq) __TRACE("Z %p %u\n", preq, preq->req_cluster); + if (!preq->error) { + WARN_ON(test_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state)); + } while (preq->bl.head) { struct bio * bio = preq->bl.head; preq->bl.head = bio->bi_next; @@ -2530,9 +2533,8 @@ restart: top_delta = ploop_top_delta(plo); sbl.head = sbl.tail = preq->aux_bio; - /* Relocated 
data write required sync before BAT updatee */ - set_bit(PLOOP_REQ_FORCE_FUA, &preq->state); - + /* Relocated data write required sync before BAT updatee + * this will happen inside index_update */ if (test_bit(PLOOP_REQ_RELOC_S, &preq->state)) { preq->eng_state = PLOOP_E_DATA_WBI; plo->st.bio_out++; diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c index a6d83fe..303eb70 100644 --- a/drivers/block/ploop/io_direct.c +++ b/drivers/block/ploop/io_direct.c @@ -83,28 +83,19 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq, int err; struct bio_list_walk bw; int preflush; - int postfua = 0; + int fua = 0; int write = !!(rw & REQ_WRITE); int bio_num; trace_submit(preq); preflush = !!(rw & REQ_FLUSH); - - if (test_and_clear_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state)) - preflush = 1; - - if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state)) - postfua = 1; - - if (!postfua && ploop_req_delay_fua_possible(rw, preq)) { - + fua = !!(rw & REQ_FUA); + if (fua && ploop_req_delay_fua_possible(rw, preq)) { /* Mark req that delayed flush required */ - set_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state); - } else if (rw & REQ_FUA) { - postfua = 1; + set_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state); + fua = 0; } - rw &= ~(REQ_FLUSH | REQ_FUA); @@ -238,8 +229,10 @@ flush_bio: rw2 |= REQ_FLUSH; preflush = 0; } - if (unlikely(postfua && !bl.head)) - rw2 |= (REQ_FUA | ((bio_num) ? REQ_FLUSH : 0)); + /* Very unlikely, but correct. 
+ * TODO: Optimize postfua via DELAY_FLUSH for any req state */ + if (unlikely(fua)) + rw2 |= REQ_FUA; ploop_acc_ff_out(preq->plo, rw2 | b->bi_rw); submit_bio(rw2, b); @@ -1520,15 +1513,14 @@ dio_read_page(struct ploop_io * io, struct ploop_request * preq, static void dio_write_page(struct ploop_io * io, struct ploop_request * preq, - struct page * page, sector_t sec, int fua) + struct page * page, sector_t sec, unsigned long rw) { if (!(io->files.file->f_mode & FMODE_WRITE)) { PLOOP_FAIL_REQUEST(preq, -EBADF); return; } - dio_io_page(io, WRITE | (fua ? REQ_FUA : 0) | REQ_SYNC, - preq, page, sec); + dio_io_page(io, rw | WRITE | REQ_SYNC, preq, page, sec); } static int diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c index de26319..c3bdd20 100644 --- a/drivers/block/ploop/io_kaio.c +++ b/drivers/block/ploop/io_kaio.c @@ -61,6 +61,7 @@ static void kaio_complete_io_state(struct ploop_request * preq) struct ploop_device * plo = preq->plo; unsigned long flags; int post_fsync = 0; + unsigned long state = READ_ONCE(preq->state); if (preq->error || !(preq->req_rw & REQ_FUA) || preq->eng_state == PLOOP_E_INDEX_READ || @@ -72,18 +73,23 @@ static void kaio_complete_io_state(struct ploop_request * preq) } /* Convert requested fua to fsync */ - if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state) || - test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state)) + if (test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state)) post_fsync = 1; - if (!post_fsync && - !ploop_req_delay_fua_possible(preq->req_rw, preq) && - (preq->req_rw & REQ_FUA)) + if ((preq->req_rw & REQ_FUA) && + !ploop_req_delay_fua_possible(preq->req_rw, preq)) + post_fsync = 1; + /* Usually we may delay fua for regular requets, but relocation + * requires REQ_FLUSH_FUA for index_update which is not supported + * by kaio_write_page(). Flush it now. 
+ */ + if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) post_fsync = 1; preq->req_rw &= ~REQ_FUA; if (post_fsync) { + test_and_clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state); spin_lock_irqsave(&plo->lock, flags); kaio_queue_fsync_req(preq); plo->st.bio_syncwait++; @@ -608,12 +614,13 @@ kaio_read_page(struct ploop_io * io, struct ploop_request * preq, static void kaio_write_page(struct ploop_io * io, struct ploop_request * preq, - struct page * page, sector_t sec, int fua) + struct page * page, sector_t sec, unsigned long rw) { ploop_prepare_tracker(preq, sec); - + /* This is async call , preflush is not supported */ + BUG_ON(rw & REQ_FLUSH); /* No FUA in kaio, convert it to fsync */ - if (fua) + if (rw & REQ_FUA) set_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state); kaio_io_page(io, IOCB_CMD_WRITE_ITER, preq, page, sec); diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c index 3a6365d..62f7d79 100644 --- a/drivers/block/ploop/map.c +++ b/drivers/block/ploop/map.c @@ -896,6 +896,8 @@ void ploop_index_update(struct ploop_request * preq) struct ploop_device * plo = preq->plo; struct map_node * m = preq->map; struct ploop_delta * top_delta = map_top_delta(m->parent); + unsigned long rw = preq->req_rw & REQ_FUA; + unsigned long state = READ_ONCE(preq->state); u32 idx; map_index_t blk; int old_level; @@ -953,13 +955,14 @@ void ploop_index_update(struct ploop_request * preq) __TRACE("wbi %p %u %p\n", preq, preq->req_cluster, m); plo->st.map_single_writes++; top_delta->ops->map_index(top_delta, m->mn_start, &sec); - /* Relocate requires consistent writes, mark such reqs appropriately */ - if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) || - test_bit(PLOOP_REQ_RELOC_S, &preq->state)) - set_bit(PLOOP_REQ_FORCE_FUA, &preq->state); - - top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, - !!(preq->req_rw & REQ_FUA)); + /* Relocate requires consistent index update */ + if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) + rw |= REQ_FUA; + if 
(state & PLOOP_REQ_DELAYED_FLUSH_FL) { + rw |= REQ_FLUSH; + clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state); + } + top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, rw | WRITE); put_page(page); return; @@ -1063,7 +1066,7 @@ static void map_wb_complete_post_process(struct ploop_map *map, * (see dio_submit()). So fsync of EXT4 image doesnt help us. * We need to force sync of nullified blocks. */ - set_bit(PLOOP_REQ_FORCE_FUA, &preq->state); + preq->req_rw |= REQ_FUA; top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw, &sbl, preq->iblock, 1<<plo->cluster_log); } @@ -1074,11 +1077,12 @@ static void map_wb_complete(struct map_node * m, int err) struct ploop_delta * top_delta = map_top_delta(m->parent); struct list_head * cursor, * tmp; struct ploop_request * main_preq; + unsigned long rw = 0; struct page * page = NULL; int delayed = 0; unsigned int idx; sector_t sec; - int fua, force_fua; + int fua; /* First, complete processing of written back indices, * finally instantiate indices in mapping cache. 
@@ -1149,10 +1153,10 @@ static void map_wb_complete(struct map_node * m, int err) main_preq = NULL; fua = 0; - force_fua = 0; list_for_each_safe(cursor, tmp, &m->io_queue) { struct ploop_request * preq; + unsigned long state; preq = list_entry(cursor, struct ploop_request, list); @@ -1167,13 +1171,15 @@ static void map_wb_complete(struct map_node * m, int err) spin_unlock_irq(&plo->lock); break; } - - if (preq->req_rw & REQ_FUA) - fua = 1; - - if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) || - test_bit(PLOOP_REQ_RELOC_S, &preq->state)) - force_fua = 1; + state = READ_ONCE(preq->state); + /* Relocate requires consistent index update */ + rw |= preq->req_rw & REQ_FUA; + if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) + rw |= REQ_FUA; + if (state & PLOOP_REQ_DELAYED_FLUSH_FL) { + rw |= REQ_FLUSH; + clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state); + } preq->eng_state = PLOOP_E_INDEX_WB; get_page(page); @@ -1200,10 +1206,7 @@ static void map_wb_complete(struct map_node * m, int err) plo->st.map_multi_writes++; top_delta->ops->map_index(top_delta, m->mn_start, &sec); - if (force_fua) - set_bit(PLOOP_REQ_FORCE_FUA, &main_preq->state); - - top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec, fua); + top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec, rw | WRITE); put_page(page); } diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h index 0fba25e..5af32ba 100644 --- a/include/linux/ploop/ploop.h +++ b/include/linux/ploop/ploop.h @@ -157,7 +157,7 @@ struct ploop_io_ops void (*read_page)(struct ploop_io * io, struct ploop_request * preq, struct page * page, sector_t sec); void (*write_page)(struct ploop_io * io, struct ploop_request * preq, - struct page * page, sector_t sec, int fua); + struct page * page, sector_t sec, unsigned long rw); int (*sync_read)(struct ploop_io * io, struct page * page, @@ -465,8 +465,7 @@ enum PLOOP_REQ_ZERO, PLOOP_REQ_DISCARD, PLOOP_REQ_RSYNC, - PLOOP_REQ_FORCE_FUA, /*force fua of req 
write I/O by engine */ - PLOOP_REQ_FORCE_FLUSH, /*force flush by engine */ + PLOOP_REQ_DELAYED_FLUSH,/* REQ_FLUSH is delayed */ PLOOP_REQ_KAIO_FSYNC, /*force image fsync by KAIO module */ PLOOP_REQ_POST_SUBMIT, /* preq needs post_submit processing */ PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */ @@ -477,6 +476,7 @@ enum #define PLOOP_REQ_MERGE_FL (1 << PLOOP_REQ_MERGE) #define PLOOP_REQ_RELOC_A_FL (1 << PLOOP_REQ_RELOC_A) #define PLOOP_REQ_RELOC_S_FL (1 << PLOOP_REQ_RELOC_S) +#define PLOOP_REQ_DELAYED_FLUSH_FL (1 << PLOOP_REQ_DELAYED_FLUSH) #define PLOOP_REQ_DISCARD_FL (1 << PLOOP_REQ_DISCARD) #define PLOOP_REQ_ZERO_FL (1 << PLOOP_REQ_ZERO) @@ -607,17 +607,8 @@ void ploop_preq_drop(struct ploop_device * plo, struct list_head *drop_list, static inline int ploop_req_delay_fua_possible(unsigned long rw, struct ploop_request *preq) { - int delay_fua = 0; - - /* In case of eng_state != COMPLETE, we'll do FUA in - * ploop_index_update(). Otherwise, we should post - * fua. - */ - if (rw & REQ_FUA) { - if (preq->eng_state != PLOOP_E_COMPLETE) - delay_fua = 1; - } - return delay_fua; + /* data may be flushed during ploop_index_update() */ + return (preq->eng_state == PLOOP_E_DATA_WBI); } static inline void ploop_req_set_error(struct ploop_request * preq, int err) -- 1.8.3.1 _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel