barrier code is broken in many ways: Currently only ->dio_submit() handles PLOOP_REQ_FORCE_{FLUSH,FUA} correctly. But a request can also go through ->dio_submit_alloc()->dio_submit_pad and write_page (for indexes). So in case of grow_dev we have the following sequence:
E_RELOC_DATA_READ: ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state); ->delta->allocate ->io->submit_alloc: dio_submit_alloc ->dio_submit_pad E_DATA_WBI : data written, time to update index ->delta->allocate_complete:ploop_index_update ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state); ->write_page ->ploop_map_wb_complete ->ploop_wb_complete_post_process ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state); E_RELOC_NULLIFY: ->submit() BUG#2: currently kaio write_page silently ignores REQ_FUA BUG#3: io_direct:dio_submit if fua_delay is not possible we MUST tag all bios via REQ_FUA not just the latest one. This patch unifies barrier handling as follows: - Get rid of FORCE_{FLUSH,FUA} - Introduce DELAYED_FLUSH, currently it is supported only by io_direct - fix up fua handling for dio_submit This makes reloc sequence optimal: io_direct RELOC_S: R1, W2, WBI:FLUSH|FUA RELOC_A: R1, W2, WBI:FLUSH|FUA, W1:NULLIFY|FUA io_kaio RELOC_S: R1, W2:FUA, WBI:FUA RELOC_A: R1, W2:FUA, WBI:FUA, W1:NULLIFY|FUA https://jira.sw.ru/browse/PSBM-47107 Signed-off-by: Dmitry Monakhov <dmonak...@openvz.org> --- drivers/block/ploop/dev.c | 8 +++++--- drivers/block/ploop/io_direct.c | 29 +++++++++----------------- drivers/block/ploop/io_kaio.c | 17 ++++++++++------ drivers/block/ploop/map.c | 45 ++++++++++++++++++++++------------------- include/linux/ploop/ploop.h | 8 ++++---- 5 files changed, 54 insertions(+), 53 deletions(-) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index 96f7850..fbc5f2f 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -1224,6 +1224,9 @@ static void ploop_complete_request(struct ploop_request * preq) __TRACE("Z %p %u\n", preq, preq->req_cluster); + if (!preq->error) { + WARN_ON(test_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state)); + } while (preq->bl.head) { struct bio * bio = preq->bl.head; preq->bl.head = bio->bi_next; @@ -2530,9 +2533,8 @@ restart: top_delta = ploop_top_delta(plo); sbl.head = sbl.tail = preq->aux_bio; - /* Relocated data write 
required sync before BAT updatee */ - set_bit(PLOOP_REQ_FORCE_FUA, &preq->state); - + /* Relocated data write required sync before BAT update + * this will happen inside index_update */ if (test_bit(PLOOP_REQ_RELOC_S, &preq->state)) { preq->eng_state = PLOOP_E_DATA_WBI; plo->st.bio_out++; diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c index a6d83fe..d7ecd4a 100644 --- a/drivers/block/ploop/io_direct.c +++ b/drivers/block/ploop/io_direct.c @@ -90,21 +90,12 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq, trace_submit(preq); preflush = !!(rw & REQ_FLUSH); - - if (test_and_clear_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state)) - preflush = 1; - - if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state)) - postfua = 1; - - if (!postfua && ploop_req_delay_fua_possible(rw, preq)) { - + postfua = !!(rw & REQ_FUA); + if (ploop_req_delay_fua_possible(rw, preq)) { /* Mark req that delayed flush required */ - set_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state); - } else if (rw & REQ_FUA) { - postfua = 1; + set_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state); + postfua = 0; } - rw &= ~(REQ_FLUSH | REQ_FUA); @@ -238,14 +229,15 @@ flush_bio: rw2 |= REQ_FLUSH; preflush = 0; } - if (unlikely(postfua && !bl.head)) - rw2 |= (REQ_FUA | ((bio_num) ? REQ_FLUSH : 0)); + /* Very unlikely, but correct: when fua delay is not possible + * every bio must carry REQ_FUA, not just the last one (BUG#3). + * TODO: Optimize postfua via DELAY_FLUSH for any req state */ + if (unlikely(postfua)) + rw2 |= REQ_FUA; ploop_acc_ff_out(preq->plo, rw2 | b->bi_rw); submit_bio(rw2, b); bio_num++; } - ploop_complete_io_request(preq); return; @@ -1520,15 +1512,14 @@ dio_read_page(struct ploop_io * io, struct ploop_request * preq, static void dio_write_page(struct ploop_io * io, struct ploop_request * preq, - struct page * page, sector_t sec, int fua) + struct page * page, sector_t sec, unsigned long rw) { if (!(io->files.file->f_mode & FMODE_WRITE)) { PLOOP_FAIL_REQUEST(preq, -EBADF); return; } - dio_io_page(io, WRITE | (fua ?
REQ_FUA : 0) | REQ_SYNC, - preq, page, sec); + dio_io_page(io, rw | WRITE | REQ_SYNC, preq, page, sec); } static int diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c index de26319..0b93e13 100644 --- a/drivers/block/ploop/io_kaio.c +++ b/drivers/block/ploop/io_kaio.c @@ -72,13 +72,17 @@ static void kaio_complete_io_state(struct ploop_request * preq) } /* Convert requested fua to fsync */ - if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state) || + if (test_and_clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state) || test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state)) post_fsync = 1; if (!post_fsync && - !ploop_req_delay_fua_possible(preq->req_rw, preq) && - (preq->req_rw & REQ_FUA)) + (preq->req_rw & REQ_FUA) && !ploop_req_delay_fua_possible(preq->req_rw, preq)) + post_fsync = 1; + /* Reloc requires FLUSH_FUA from ->index_update for delayed_fua + which is not supported by ->kaio_write_page */ + if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) || + test_bit(PLOOP_REQ_RELOC_S, &preq->state)) post_fsync = 1; preq->req_rw &= ~REQ_FUA; @@ -608,12 +612,13 @@ kaio_read_page(struct ploop_io * io, struct ploop_request * preq, static void kaio_write_page(struct ploop_io * io, struct ploop_request * preq, - struct page * page, sector_t sec, int fua) + struct page * page, sector_t sec, unsigned long rw) { ploop_prepare_tracker(preq, sec); - + /* This is an async call, preflush is not supported */ + BUG_ON(rw & REQ_FLUSH); /* No FUA in kaio, convert it to fsync */ - if (fua) + if (rw & REQ_FUA) set_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state); kaio_io_page(io, IOCB_CMD_WRITE_ITER, preq, page, sec); diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c index 3a6365d..de9bb32 100644 --- a/drivers/block/ploop/map.c +++ b/drivers/block/ploop/map.c @@ -896,6 +896,8 @@ void ploop_index_update(struct ploop_request * preq) struct ploop_device * plo = preq->plo; struct map_node * m = preq->map; struct ploop_delta * top_delta = map_top_delta(m->parent); + unsigned 
long rw = preq->req_rw & REQ_FUA; + unsigned long state = READ_ONCE(preq->state); u32 idx; map_index_t blk; int old_level; @@ -953,13 +955,14 @@ void ploop_index_update(struct ploop_request * preq) __TRACE("wbi %p %u %p\n", preq, preq->req_cluster, m); plo->st.map_single_writes++; top_delta->ops->map_index(top_delta, m->mn_start, &sec); - /* Relocate requires consistent writes, mark such reqs appropriately */ - if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) || - test_bit(PLOOP_REQ_RELOC_S, &preq->state)) - set_bit(PLOOP_REQ_FORCE_FUA, &preq->state); - - top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, - !!(preq->req_rw & REQ_FUA)); + /* Relocate requires consistent index update */ + if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) + rw |= REQ_FUA; + if (state & PLOOP_REQ_DELAYED_FLUSH_FL) + rw |= REQ_FLUSH; + if (rw) + clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state); + top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, rw | WRITE); put_page(page); return; @@ -1063,7 +1066,7 @@ static void map_wb_complete_post_process(struct ploop_map *map, * (see dio_submit()). So fsync of EXT4 image doesnt help us. * We need to force sync of nullified blocks. */ - set_bit(PLOOP_REQ_FORCE_FUA, &preq->state); + preq->req_rw |= REQ_FUA; top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw, &sbl, preq->iblock, 1<<plo->cluster_log); } @@ -1074,11 +1077,12 @@ static void map_wb_complete(struct map_node * m, int err) struct ploop_delta * top_delta = map_top_delta(m->parent); struct list_head * cursor, * tmp; struct ploop_request * main_preq; + unsigned long rw = 0; struct page * page = NULL; int delayed = 0; unsigned int idx; sector_t sec; - int fua, force_fua; + int fua; /* First, complete processing of written back indices, * finally instantiate indices in mapping cache. 
@@ -1149,10 +1153,10 @@ static void map_wb_complete(struct map_node * m, int err) main_preq = NULL; fua = 0; - force_fua = 0; list_for_each_safe(cursor, tmp, &m->io_queue) { struct ploop_request * preq; + unsigned long state; preq = list_entry(cursor, struct ploop_request, list); @@ -1167,13 +1171,15 @@ static void map_wb_complete(struct map_node * m, int err) spin_unlock_irq(&plo->lock); break; } - - if (preq->req_rw & REQ_FUA) - fua = 1; - - if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) || - test_bit(PLOOP_REQ_RELOC_S, &preq->state)) - force_fua = 1; + state = READ_ONCE(preq->state); + /* Relocate requires consistent index update */ + rw |= preq->req_rw & REQ_FUA; + if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) + rw |= REQ_FUA; + if (state & PLOOP_REQ_DELAYED_FLUSH_FL) + rw |= REQ_FLUSH; + if (rw) + clear_bit(PLOOP_REQ_DELAYED_FLUSH, &preq->state); preq->eng_state = PLOOP_E_INDEX_WB; get_page(page); @@ -1200,10 +1206,7 @@ static void map_wb_complete(struct map_node * m, int err) plo->st.map_multi_writes++; top_delta->ops->map_index(top_delta, m->mn_start, &sec); - if (force_fua) - set_bit(PLOOP_REQ_FORCE_FUA, &main_preq->state); - - top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec, fua); + top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec, rw | WRITE); put_page(page); } diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h index 0fba25e..2f26824 100644 --- a/include/linux/ploop/ploop.h +++ b/include/linux/ploop/ploop.h @@ -157,7 +157,7 @@ struct ploop_io_ops void (*read_page)(struct ploop_io * io, struct ploop_request * preq, struct page * page, sector_t sec); void (*write_page)(struct ploop_io * io, struct ploop_request * preq, - struct page * page, sector_t sec, int fua); + struct page * page, sector_t sec, unsigned long rw); int (*sync_read)(struct ploop_io * io, struct page * page, @@ -465,8 +465,7 @@ enum PLOOP_REQ_ZERO, PLOOP_REQ_DISCARD, PLOOP_REQ_RSYNC, - PLOOP_REQ_FORCE_FUA, /*force fua of 
req write I/O by engine */ - PLOOP_REQ_FORCE_FLUSH, /*force flush by engine */ + PLOOP_REQ_DELAYED_FLUSH,/* REQ_FLUSH is delayed */ PLOOP_REQ_KAIO_FSYNC, /*force image fsync by KAIO module */ PLOOP_REQ_POST_SUBMIT, /* preq needs post_submit processing */ PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */ @@ -477,6 +476,7 @@ enum #define PLOOP_REQ_MERGE_FL (1 << PLOOP_REQ_MERGE) #define PLOOP_REQ_RELOC_A_FL (1 << PLOOP_REQ_RELOC_A) #define PLOOP_REQ_RELOC_S_FL (1 << PLOOP_REQ_RELOC_S) +#define PLOOP_REQ_DELAYED_FLUSH_FL (1 << PLOOP_REQ_DELAYED_FLUSH) #define PLOOP_REQ_DISCARD_FL (1 << PLOOP_REQ_DISCARD) #define PLOOP_REQ_ZERO_FL (1 << PLOOP_REQ_ZERO) @@ -614,7 +614,7 @@ static inline int ploop_req_delay_fua_possible(unsigned long rw, * fua. */ if (rw & REQ_FUA) { - if (preq->eng_state != PLOOP_E_COMPLETE) + if (preq->eng_state == PLOOP_E_DATA_WBI) delay_fua = 1; } return delay_fua; -- 1.8.3.1 _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel