[Devel] [PATCH rh7 8/9] ploop: fix barriers for PLOOP_E_RELOC_NULLIFY
The last step of processing of RELOC_A request is nullifying BAT block. We smartly noticed, that flush needed after that, but fsync is not enough: > /* >* Lately we think we does sync of nullified blocks at format >* driver by image fsync before header update. >* But we write this data directly into underlying device >* bypassing EXT4 by usage of extent map tree >* (see dio_submit()). So fsync of EXT4 image doesnt help us. >* We need to force sync of nullified blocks. >*/ > set_bit(PLOOP_REQ_FORCE_FUA, >state); > top_delta->io.ops->submit(_delta->io, preq, preq->req_rw, > , preq->iblock, 1issue_flush to flush nullified block. Signed-off-by: Maxim Patlasov --- drivers/block/ploop/dev.c | 11 ++- drivers/block/ploop/io_direct.c |3 ++- drivers/block/ploop/map.c |4 +++- include/linux/ploop/ploop.h |1 + 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index 557ddba..2b60dfa 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -1305,6 +1305,8 @@ static void ploop_complete_request(struct ploop_request * preq) } preq->bl.tail = NULL; + WARN_ON(!preq->error && test_bit(PLOOP_REQ_ISSUE_FLUSH, >state)); + if (test_bit(PLOOP_REQ_RELOC_A, >state) || test_bit(PLOOP_REQ_RELOC_S, >state)) { if (preq->error) @@ -2429,6 +2431,13 @@ static void ploop_req_state_process(struct ploop_request * preq) preq->eng_io = NULL; } + if (test_bit(PLOOP_REQ_ISSUE_FLUSH, >state)) { + preq->eng_io->ops->issue_flush(preq->eng_io, preq); + clear_bit(PLOOP_REQ_ISSUE_FLUSH, >state); + preq->eng_io = NULL; + goto out; + } + restart: BUG_ON(test_bit(PLOOP_REQ_POST_SUBMIT, >state)); __TRACE("ST %p %u %lu\n", preq, preq->req_cluster, preq->eng_state); @@ -2705,7 +2714,7 @@ restart: default: BUG(); } - +out: if (release_ioc) { struct io_context * ioc = current->io_context; current->io_context = saved_ioc; diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c index 94936c7..c4d0f63 100644 --- 
a/drivers/block/ploop/io_direct.c +++ b/drivers/block/ploop/io_direct.c @@ -413,6 +413,7 @@ try_again: preq->iblock = iblk; preq->eng_io = io; + BUG_ON(test_bit(PLOOP_REQ_ISSUE_FLUSH, >state)); set_bit(PLOOP_REQ_POST_SUBMIT, >state); dio_submit_pad(io, preq, sbl, size, em); err = 0; @@ -1819,7 +1820,7 @@ static void dio_issue_flush(struct ploop_io * io, struct ploop_request *preq) atomic_inc(>io_count); ploop_acc_ff_out(io->plo, preq->req_rw | bio->bi_rw); - submit_bio(preq->req_rw, bio); + submit_bio(WRITE_FLUSH, bio); ploop_complete_io_request(preq); } diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c index f87fb08..915a216 100644 --- a/drivers/block/ploop/map.c +++ b/drivers/block/ploop/map.c @@ -1077,7 +1077,9 @@ static void map_wb_complete_post_process(struct ploop_map *map, * (see dio_submit()). So fsync of EXT4 image doesnt help us. * We need to force sync of nullified blocks. */ - set_bit(PLOOP_REQ_FORCE_FUA, >state); + preq->eng_io = _delta->io; + BUG_ON(test_bit(PLOOP_REQ_POST_SUBMIT, >state)); + set_bit(PLOOP_REQ_ISSUE_FLUSH, >state); top_delta->io.ops->submit(_delta->io, preq, preq->req_rw, , preq->iblock, 1 fsync() */ + PLOOP_REQ_ISSUE_FLUSH, /* preq needs ->issue_flush before completing */ }; #define PLOOP_REQ_MERGE_FL (1 << PLOOP_REQ_MERGE) ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 9/9] ploop: fixup barrier handling during relocation
Rebase Dima's patch on top of rh7-3.10.0-327.18.2.vz7.14.19, but without the help of the delayed_flush engine: To ensure consistency on crash/power outage/hard reboot events, ploop must implement the following barrier logic for RELOC_A|S requests: 1) After we store data to the new place, but before updating BAT on disk, we have to FLUSH everything (in fact, flushing only those data would be enough, but it is simpler to flush everything). 2) We should not proceed handling RELOC_A|S until we are 100% sure the new BAT value went to the disk platters. Since the new BAT is only one page, it's OK to mark the corresponding bio with the FUA flag for the io_direct case. For io_kaio, not having a FUA API, we have to post_fsync the BAT update. PLOOP_REQ_FORCE_FLUSH/PLOOP_REQ_FORCE_FUA, introduced a long time ago, were probably intended to ensure the logic above, but they actually didn't. The patch removes PLOOP_REQ_FORCE_FLUSH/PLOOP_REQ_FORCE_FUA, and implements barriers in a straightforward and simple way: check for RELOC_A|S explicitly and issue FLUSH/FUA where needed. 
Signed-off-by: Maxim Patlasov--- drivers/block/ploop/dev.c |4 ++-- drivers/block/ploop/io_direct.c |7 --- drivers/block/ploop/io_kaio.c |8 +--- drivers/block/ploop/map.c | 22 ++ include/linux/ploop/ploop.h |1 - 5 files changed, 17 insertions(+), 25 deletions(-) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index 2b60dfa..40768b6 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -2610,8 +2610,8 @@ restart: top_delta = ploop_top_delta(plo); sbl.head = sbl.tail = preq->aux_bio; - /* Relocated data write required sync before BAT updatee */ - set_bit(PLOOP_REQ_FORCE_FUA, >state); + /* Relocated data write required sync before BAT update +* this will happen inside index_update */ if (test_bit(PLOOP_REQ_RELOC_S, >state)) { preq->eng_state = PLOOP_E_DATA_WBI; diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c index c4d0f63..266f041 100644 --- a/drivers/block/ploop/io_direct.c +++ b/drivers/block/ploop/io_direct.c @@ -89,15 +89,11 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq, sector_t sec, nsec; int err; struct bio_list_walk bw; - int postfua = 0; int write = !!(rw & REQ_WRITE); int delayed_fua = 0; trace_submit(preq); - if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, >state)) - postfua = 1; - if ((rw & REQ_FUA) && ploop_req_delay_fua_possible(preq)) { /* Mark req that delayed flush required */ preq->req_rw |= (REQ_FLUSH | REQ_FUA); @@ -233,9 +229,6 @@ flush_bio: b->bi_private = preq; b->bi_end_io = dio_endio_async; - if (unlikely(postfua && !bl.head)) - rw2 |= REQ_FUA; - ploop_acc_ff_out(preq->plo, rw2 | b->bi_rw); submit_bio(rw2, b); } diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c index ed550f4..85863df 100644 --- a/drivers/block/ploop/io_kaio.c +++ b/drivers/block/ploop/io_kaio.c @@ -69,6 +69,8 @@ static void kaio_complete_io_state(struct ploop_request * preq) unsigned long flags; int post_fsync = 0; int need_fua = !!(preq->req_rw & REQ_FUA); + unsigned 
long state = READ_ONCE(preq->state); + int reloc = !!(state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)); if (preq->error || !(preq->req_rw & REQ_FUA) || preq->eng_state == PLOOP_E_INDEX_READ || @@ -80,9 +82,9 @@ static void kaio_complete_io_state(struct ploop_request * preq) } /* Convert requested fua to fsync */ - if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, >state) || - test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, >state) || - (need_fua && !ploop_req_delay_fua_possible(preq))) { + if (test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, >state) || + (need_fua && !ploop_req_delay_fua_possible(preq)) || + (reloc && ploop_req_delay_fua_possible(preq))) { post_fsync = 1; preq->req_rw &= ~REQ_FUA; } diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c index 915a216..1883674 100644 --- a/drivers/block/ploop/map.c +++ b/drivers/block/ploop/map.c @@ -909,6 +909,7 @@ void ploop_index_update(struct ploop_request * preq) struct page * page; sector_t sec; unsigned long rw; + unsigned long state = READ_ONCE(preq->state); /* No way back, we are going to initiate index write. */ @@ -961,10 +962,6 @@ void ploop_index_update(struct ploop_request * preq) __TRACE("wbi %p %u %p\n", preq, preq->req_cluster, m); plo->st.map_single_writes++; top_delta->ops->map_index(top_delta,
[Devel] [PATCH rh7 5/9] ploop: resurrect delay_fua for io_direct
Recent commit c2247f3745 while fixing barriers for ordinary requests, accidentally smashed delay_fua optimization for io_direct by: > + bio->bi_rw |= bw.cur->bi_rw & (REQ_FLUSH | REQ_FUA); The idea is the following: if at least one incoming bio is marked as FUA (it is actually equivalent to (rw & REQ_FUA) check), and eng_state == E_DATA_WBI, we can delay FUA until index update and implement it there by REQ_FLUSH. It is not clear if this optimization provides any benefits, but if we lived with it for long so far, let's keep it for now. The patch removes PLOOP_REQ_FORCE_FLUSH thoroughly because it's easier to use REQ_FLUSH bit in preq->req_rw instead. Signed-off-by: Maxim Patlasov--- drivers/block/ploop/io_direct.c | 15 ++- drivers/block/ploop/map.c | 25 ++--- include/linux/ploop/ploop.h |1 - 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c index db82a61..1ea2008 100644 --- a/drivers/block/ploop/io_direct.c +++ b/drivers/block/ploop/io_direct.c @@ -92,23 +92,19 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq, int preflush; int postfua = 0; int write = !!(rw & REQ_WRITE); + int delayed_fua = 0; trace_submit(preq); preflush = !!(rw & REQ_FLUSH); - if (test_and_clear_bit(PLOOP_REQ_FORCE_FLUSH, >state)) - preflush = 1; - if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, >state)) postfua = 1; - if (!postfua && ploop_req_delay_fua_possible(preq) && (rw & REQ_FUA)) { - + if ((rw & REQ_FUA) && ploop_req_delay_fua_possible(preq)) { /* Mark req that delayed flush required */ - set_bit(PLOOP_REQ_FORCE_FLUSH, >state); - } else if (rw & REQ_FUA) { - postfua = 1; + preq->req_rw |= (REQ_FLUSH | REQ_FUA); + delayed_fua = 1; } rw &= ~(REQ_FLUSH | REQ_FUA); @@ -222,7 +218,8 @@ flush_bio: goto flush_bio; } - bio->bi_rw |= bw.cur->bi_rw & (REQ_FLUSH | REQ_FUA); + bio->bi_rw |= bw.cur->bi_rw & + (REQ_FLUSH | delayed_fua ? 
0 : REQ_FUA); bw.bv_off += copy; size -= copy >> 9; sec += copy >> 9; diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c index ae6cc15..f87fb08 100644 --- a/drivers/block/ploop/map.c +++ b/drivers/block/ploop/map.c @@ -908,6 +908,7 @@ void ploop_index_update(struct ploop_request * preq) int old_level; struct page * page; sector_t sec; + unsigned long rw; /* No way back, we are going to initiate index write. */ @@ -965,8 +966,14 @@ void ploop_index_update(struct ploop_request * preq) test_bit(PLOOP_REQ_RELOC_S, >state)) set_bit(PLOOP_REQ_FORCE_FUA, >state); - top_delta->io.ops->write_page(_delta->io, preq, page, sec, - preq->req_rw & REQ_FUA); + rw = (preq->req_rw & (REQ_FUA | REQ_FLUSH)); + + /* We've just set REQ_FLUSH in rw, ->write_page() below + will do the FLUSH */ + preq->req_rw &= ~REQ_FLUSH; + + top_delta->io.ops->write_page(_delta->io, preq, page, sec, rw); + put_page(page); return; @@ -1085,7 +1092,8 @@ static void map_wb_complete(struct map_node * m, int err) int delayed = 0; unsigned int idx; sector_t sec; - int fua, force_fua; + int force_fua; + unsigned long rw; /* First, complete processing of written back indices, * finally instantiate indices in mapping cache. 
@@ -1155,7 +1163,7 @@ static void map_wb_complete(struct map_node * m, int err) copy_index_for_wb(page, m, top_delta->level); main_preq = NULL; - fua = 0; + rw = 0; force_fua = 0; list_for_each_safe(cursor, tmp, >io_queue) { @@ -1175,8 +1183,11 @@ static void map_wb_complete(struct map_node * m, int err) break; } - if (preq->req_rw & REQ_FUA) - fua = 1; + rw |= (preq->req_rw & (REQ_FLUSH | REQ_FUA)); + + /* We've just set REQ_FLUSH in rw, ->write_page() below + will do the FLUSH */ + preq->req_rw &= ~REQ_FLUSH; if (test_bit(PLOOP_REQ_RELOC_A, >state) || test_bit(PLOOP_REQ_RELOC_S, >state)) @@ -1211,7 +1222,7 @@ static void map_wb_complete(struct map_node * m, int err) set_bit(PLOOP_REQ_FORCE_FUA, _preq->state); top_delta->io.ops->write_page(_delta->io, main_preq, page, sec, - fua ? REQ_FUA : 0); +
[Devel] [PATCH rh7 6/9] ploop: remove preflush from dio_submit
After commit c2247f3745 fixing barriers for ordinary requests and previous patch fixing delay_fua, that legacy code in dio_submit processing (preq->req_rw & REQ_FLUSH) by setting REQ_FLUSH in the first outgoing bio must die: it is incorrect anyway (we don't wait for completion of the first bio before sending others). Signed-off-by: Maxim Patlasov--- drivers/block/ploop/io_direct.c |7 --- 1 file changed, 7 deletions(-) diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c index 1ea2008..ee3cd5c 100644 --- a/drivers/block/ploop/io_direct.c +++ b/drivers/block/ploop/io_direct.c @@ -89,15 +89,12 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq, sector_t sec, nsec; int err; struct bio_list_walk bw; - int preflush; int postfua = 0; int write = !!(rw & REQ_WRITE); int delayed_fua = 0; trace_submit(preq); - preflush = !!(rw & REQ_FLUSH); - if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, >state)) postfua = 1; @@ -236,10 +233,6 @@ flush_bio: b->bi_private = preq; b->bi_end_io = dio_endio_async; - if (unlikely(preflush)) { - rw2 |= REQ_FLUSH; - preflush = 0; - } if (unlikely(postfua && !bl.head)) rw2 |= REQ_FUA; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 4/9] ploop: minor rework of ->write_page() io method
From: Dmitry MonakhovNo functional changes. Next patch will use this rework to pass REQ_FLUSH to dio_write_page(). The patch is actually a part of Dima's patch: > [PATCH 3/3] ploop: fixup FORCE_{FLUSH,FUA} handling v3 Signed-off-by: Maxim Patlasov --- drivers/block/ploop/io_direct.c |5 ++--- drivers/block/ploop/io_kaio.c |8 +--- drivers/block/ploop/map.c |5 +++-- include/linux/ploop/ploop.h |2 +- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c index 0907540..db82a61 100644 --- a/drivers/block/ploop/io_direct.c +++ b/drivers/block/ploop/io_direct.c @@ -1505,15 +1505,14 @@ dio_read_page(struct ploop_io * io, struct ploop_request * preq, static void dio_write_page(struct ploop_io * io, struct ploop_request * preq, - struct page * page, sector_t sec, int fua) + struct page * page, sector_t sec, unsigned long rw) { if (!(io->files.file->f_mode & FMODE_WRITE)) { PLOOP_FAIL_REQUEST(preq, -EBADF); return; } - dio_io_page(io, WRITE | (fua ? REQ_FUA : 0) | REQ_SYNC, - preq, page, sec); + dio_io_page(io, rw | WRITE | REQ_SYNC, preq, page, sec); } static int diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c index e4e4411..73edc5e 100644 --- a/drivers/block/ploop/io_kaio.c +++ b/drivers/block/ploop/io_kaio.c @@ -612,12 +612,14 @@ kaio_read_page(struct ploop_io * io, struct ploop_request * preq, static void kaio_write_page(struct ploop_io * io, struct ploop_request * preq, -struct page * page, sector_t sec, int fua) +struct page * page, sector_t sec, unsigned long rw) { ploop_prepare_tracker(preq, sec); - /* No FUA in kaio, convert it to fsync */ - if (fua) + /* No FUA in kaio, convert it to fsync. Don't care + about REQ_FLUSH: only io_direct relies on it, + io_kaio implements delay_fua in another way... 
*/ + if (rw & REQ_FUA) set_bit(PLOOP_REQ_KAIO_FSYNC, >state); kaio_io_page(io, IOCB_CMD_WRITE_ITER, preq, page, sec); diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c index 3ba8a22..ae6cc15 100644 --- a/drivers/block/ploop/map.c +++ b/drivers/block/ploop/map.c @@ -966,7 +966,7 @@ void ploop_index_update(struct ploop_request * preq) set_bit(PLOOP_REQ_FORCE_FUA, >state); top_delta->io.ops->write_page(_delta->io, preq, page, sec, - !!(preq->req_rw & REQ_FUA)); + preq->req_rw & REQ_FUA); put_page(page); return; @@ -1210,7 +1210,8 @@ static void map_wb_complete(struct map_node * m, int err) if (force_fua) set_bit(PLOOP_REQ_FORCE_FUA, _preq->state); - top_delta->io.ops->write_page(_delta->io, main_preq, page, sec, fua); + top_delta->io.ops->write_page(_delta->io, main_preq, page, sec, + fua ? REQ_FUA : 0); put_page(page); } diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h index e1d8686..3e53b35 100644 --- a/include/linux/ploop/ploop.h +++ b/include/linux/ploop/ploop.h @@ -164,7 +164,7 @@ struct ploop_io_ops void(*read_page)(struct ploop_io * io, struct ploop_request * preq, struct page * page, sector_t sec); void(*write_page)(struct ploop_io * io, struct ploop_request * preq, - struct page * page, sector_t sec, int fua); + struct page * page, sector_t sec, unsigned long rw); int (*sync_read)(struct ploop_io * io, struct page * page, ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 3/9] ploop: resurrect delayed_fua for io_kaio
After long thinking it now seems to be clear how delayed_fua was supposed to work for io_kaio: 1) An incoming bio marked as REQ_FUA leads to this bit set in preq->req_rw. 2) kaio_submit does nothing with this bit in preq->req_rw. It only initiates sending data by aio_kernel_submit. 3) When userspace ACKs this WRITE, kaio_complete_io_state discovers that even though REQ_FUA bit is set in preq->req_rw, eng_state == E_DATA_WBI, so we can delay flush until index update. NB: It is crucial here, that preq->req_rw still needs REQ_FUA bit set! 4) index update calls ->write_page() with fua=1 because it detects REQ_FUA bit set in preq->req_rw. 5) kaio_write_page observes fua=1 and so set PLOOP_REQ_KAIO_FSYNC in preq->state. Then it initiates sending data (BAT update). 6) When userspace ACKs this WRITE (BAT update), kaio_complete_io_state detects PLOOP_REQ_KAIO_FSYNC bit set, so it clears it and enforces post_fsync=1. The patch fixes 3) that was broken so far. Signed-off-by: Maxim Patlasov--- drivers/block/ploop/io_kaio.c | 12 +--- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c index 69df456..e4e4411 100644 --- a/drivers/block/ploop/io_kaio.c +++ b/drivers/block/ploop/io_kaio.c @@ -68,6 +68,7 @@ static void kaio_complete_io_state(struct ploop_request * preq) struct ploop_device * plo = preq->plo; unsigned long flags; int post_fsync = 0; + int need_fua = !!(preq->req_rw & REQ_FUA); if (preq->error || !(preq->req_rw & REQ_FUA) || preq->eng_state == PLOOP_E_INDEX_READ || @@ -80,14 +81,11 @@ static void kaio_complete_io_state(struct ploop_request * preq) /* Convert requested fua to fsync */ if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, >state) || - test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, >state)) + test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, >state) || + (need_fua && !ploop_req_delay_fua_possible(preq))) { post_fsync = 1; - - if (!post_fsync && - !(ploop_req_delay_fua_possible(preq) && (preq->req_rw & 
REQ_FUA))) - post_fsync = 1; - - preq->req_rw &= ~REQ_FUA; + preq->req_rw &= ~REQ_FUA; + } if (post_fsync) { spin_lock_irqsave(>lock, flags); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 2/9] ploop: minor rework of ploop_req_delay_fua_possible
No functional changes. The patch simplifies ploop_req_delay_fua_possible to make it more suitable for next patch. As was recently discussed, "eng_state == E_DATA_WBI" is lesser prone to errors than "eng_state != E_COMPLETE". Note, how the patch makes a bug in kaio_complete_io_state() obvious: if !(preq->req_rw & REQ_FUA), it must not matter what ploop_req_delay_fua_possible() returns! I.e., eng_state==E_COMPLETE is not sufficient ground for post_fsync=1 if no REQ_FUA set. Signed-off-by: Maxim Patlasov--- drivers/block/ploop/io_direct.c |2 +- drivers/block/ploop/io_kaio.c |3 +-- include/linux/ploop/ploop.h | 15 ++- 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c index 3acae79..0907540 100644 --- a/drivers/block/ploop/io_direct.c +++ b/drivers/block/ploop/io_direct.c @@ -103,7 +103,7 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq, if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, >state)) postfua = 1; - if (!postfua && ploop_req_delay_fua_possible(rw, preq)) { + if (!postfua && ploop_req_delay_fua_possible(preq) && (rw & REQ_FUA)) { /* Mark req that delayed flush required */ set_bit(PLOOP_REQ_FORCE_FLUSH, >state); diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c index 81da1c5..69df456 100644 --- a/drivers/block/ploop/io_kaio.c +++ b/drivers/block/ploop/io_kaio.c @@ -84,8 +84,7 @@ static void kaio_complete_io_state(struct ploop_request * preq) post_fsync = 1; if (!post_fsync && - !ploop_req_delay_fua_possible(preq->req_rw, preq) && - (preq->req_rw & REQ_FUA)) + !(ploop_req_delay_fua_possible(preq) && (preq->req_rw & REQ_FUA))) post_fsync = 1; preq->req_rw &= ~REQ_FUA; diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h index 3441e7e..e1d8686 100644 --- a/include/linux/ploop/ploop.h +++ b/include/linux/ploop/ploop.h @@ -613,20 +613,9 @@ void ploop_preq_drop(struct ploop_device * plo, struct list_head *drop_list, int keep_locked); 
-static inline int ploop_req_delay_fua_possible(unsigned long rw, - struct ploop_request *preq) +static inline int ploop_req_delay_fua_possible(struct ploop_request *preq) { - int delay_fua = 0; - - /* In case of eng_state != COMPLETE, we'll do FUA in -* ploop_index_update(). Otherwise, we should post -* fua. -*/ - if (rw & REQ_FUA) { - if (preq->eng_state != PLOOP_E_COMPLETE) - delay_fua = 1; - } - return delay_fua; + return preq->eng_state == PLOOP_E_DATA_WBI; } static inline void ploop_req_set_error(struct ploop_request * preq, int err) ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 1/9] ploop: deadcode cleanup
From: Dmitry MonakhovRebase Dima's patch on top of rh7-3.10.0-327.18.2.vz7.14.19: (rw & REQ_FUA) branch is impossible because REQ_FUA was cleared line above. Logic was moved to ploop_req_delay_fua_possible() long time ago. Signed-off-by: Dmitry Monakhov Signed-off-by: Maxim Patlasov --- drivers/block/ploop/io_direct.c | 10 -- 1 file changed, 10 deletions(-) diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c index 50c0ed1..3acae79 100644 --- a/drivers/block/ploop/io_direct.c +++ b/drivers/block/ploop/io_direct.c @@ -113,16 +113,6 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq, rw &= ~(REQ_FLUSH | REQ_FUA); - - /* In case of eng_state != COMPLETE, we'll do FUA in -* ploop_index_update(). Otherwise, we should mark -* last bio as FUA here. */ - if (rw & REQ_FUA) { - rw &= ~REQ_FUA; - if (preq->eng_state == PLOOP_E_COMPLETE) - postfua = 1; - } - bio_list_init(); if (iblk == PLOOP_ZERO_INDEX) ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [RH7 PATCH 5/6] ploop: fixup barrier handling during relocation
No reasons to keep it along with optimization patches. See please the port I'll send soon today. On 06/23/2016 10:25 AM, Dmitry Monakhov wrote: barrier code is broken in many ways: Currently only ->dio_submit() handles PLOOP_REQ_FORCE_{FLUSH,FUA} correctly. But request also can goes though ->dio_submit_alloc()->dio_submit_pad and write_page (for indexes) So in case of grow_dev we have following sequance: E_RELOC_DATA_READ: ->set_bit(PLOOP_REQ_FORCE_FUA, >state); ->delta->allocate ->io->submit_allloc: dio_submit_alloc ->dio_submit_pad E_DATA_WBI : data written, time to update index ->delta->allocate_complete:ploop_index_update ->set_bit(PLOOP_REQ_FORCE_FUA, >state); ->write_page ->ploop_map_wb_complete ->ploop_wb_complete_post_process ->set_bit(PLOOP_REQ_FORCE_FUA, >state); E_RELOC_NULLIFY: ->submit() Once we have delayed_flush engine it is easy to implement correct scheme for both engines. E_RELOC_DATA_READ ->submit_allloc => wait->post_submit->issue_flush E_DATA_WBI ->ploop_index_update with FUA E_RELOC_NULLIFY ->submit: => wait->post_submit->issue_flush This makes reloc sequence optimal: RELOC_S: R1, W2,WAIT,FLUSH, WBI:FUA RELOC_A: R1, W2,WAIT,FLUSH, WBI:FUA, W1:NULLIFY,WAIT, FLUSH https://jira.sw.ru/browse/PSBM-47107 Signed-off-by: Dmitry Monakhov--- drivers/block/ploop/dev.c | 2 +- drivers/block/ploop/io_kaio.c | 3 +-- drivers/block/ploop/map.c | 28 ++-- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index 95e3067..090cd2d 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -2533,7 +2533,7 @@ restart: sbl.head = sbl.tail = preq->aux_bio; /* Relocated data write required sync before BAT updatee */ - set_bit(PLOOP_REQ_FORCE_FUA, >state); + preq->req_rw |= REQ_FUA; if (test_bit(PLOOP_REQ_RELOC_S, >state)) { preq->eng_state = PLOOP_E_DATA_WBI; diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c index 5341fd5..5217ab4 100644 --- 
a/drivers/block/ploop/io_kaio.c +++ b/drivers/block/ploop/io_kaio.c @@ -72,8 +72,7 @@ static void kaio_complete_io_state(struct ploop_request * preq) } /* Convert requested fua to fsync */ - if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, >state) || - test_and_clear_bit(PLOOP_REQ_DEL_FLUSH, >state) || + if (test_and_clear_bit(PLOOP_REQ_DEL_FLUSH, >state) || test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, >state)) post_fsync = 1; diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c index 3a6365d..ef351fb 100644 --- a/drivers/block/ploop/map.c +++ b/drivers/block/ploop/map.c @@ -901,6 +901,8 @@ void ploop_index_update(struct ploop_request * preq) int old_level; struct page * page; sector_t sec; + int fua = !!(preq->req_rw & REQ_FUA); + unsigned long state = READ_ONCE(preq->state); /* No way back, we are going to initiate index write. */ @@ -954,12 +956,11 @@ void ploop_index_update(struct ploop_request * preq) plo->st.map_single_writes++; top_delta->ops->map_index(top_delta, m->mn_start, ); /* Relocate requires consistent writes, mark such reqs appropriately */ - if (test_bit(PLOOP_REQ_RELOC_A, >state) || - test_bit(PLOOP_REQ_RELOC_S, >state)) - set_bit(PLOOP_REQ_FORCE_FUA, >state); - - top_delta->io.ops->write_page(_delta->io, preq, page, sec, - !!(preq->req_rw & REQ_FUA)); + if (state & (PLOOP_REQ_RELOC_A_FL | PLOOP_REQ_RELOC_S_FL)) { + WARN_ON(state & PLOOP_REQ_DEL_FLUSH_FL); + fua = 1; + } + top_delta->io.ops->write_page(_delta->io, preq, page, sec, fua); put_page(page); return; @@ -1063,7 +1064,7 @@ static void map_wb_complete_post_process(struct ploop_map *map, * (see dio_submit()). So fsync of EXT4 image doesnt help us. * We need to force sync of nullified blocks. 
*/ - set_bit(PLOOP_REQ_FORCE_FUA, >state); + preq->req_rw |= REQ_FUA; top_delta->io.ops->submit(_delta->io, preq, preq->req_rw, , preq->iblock, 1 io_queue) { struct ploop_request * preq; + unsigned long state; preq = list_entry(cursor, struct ploop_request, list); + state = READ_ONCE(preq->state); switch (preq->eng_state) { case PLOOP_E_INDEX_DELAY: @@ -1171,9 +1174,10 @@ static void
Re: [Devel] [RH7 PATCH 6/6] patch ploop_state_debugging.patch
OK On 06/23/2016 10:25 AM, Dmitry Monakhov wrote: Signed-off-by: Dmitry Monakhov--- drivers/block/ploop/dev.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index 090cd2d..9bf8592 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -1232,6 +1232,12 @@ static void ploop_complete_request(struct ploop_request * preq) } preq->bl.tail = NULL; + if (!preq->error) { + unsigned long state = READ_ONCE(preq->state); + WARN_ON(state & (PLOOP_REQ_POST_SUBMIT_FL| +PLOOP_REQ_DEL_CONV_FL | +PLOOP_REQ_DEL_FLUSH_FL )); + } if (test_bit(PLOOP_REQ_RELOC_A, >state) || test_bit(PLOOP_REQ_RELOC_S, >state)) { if (preq->error) ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [RH7 PATCH 4/6] ploop: io_kaio support PLOOP_REQ_DEL_FLUSH
On 06/23/2016 10:25 AM, Dmitry Monakhov wrote: Currently no one tags preqs with such a bit, but let it be here for symmetry I hate dead code (things that are impossible to verify by any test). Can we add this "symmetry" check later, along with a patch for kaio setting this bit? (i.e.: kaio doesn't set it; ergo it must not check it) Signed-off-by: Dmitry Monakhov--- drivers/block/ploop/io_kaio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c index bee2cee..5341fd5 100644 --- a/drivers/block/ploop/io_kaio.c +++ b/drivers/block/ploop/io_kaio.c @@ -73,6 +73,7 @@ static void kaio_complete_io_state(struct ploop_request * preq) /* Convert requested fua to fsync */ if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, >state) || + test_and_clear_bit(PLOOP_REQ_DEL_FLUSH, >state) || test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, >state)) post_fsync = 1; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [RH7 PATCH 3/6] ploop: add delayed flush support
On 06/23/2016 10:25 AM, Dmitry Monakhov wrote: dio_submit and dio_submit_pad may produce several bios. This makes processing of REQ_FUA complicated because in order to preserve correctness we have to TAG each bio with FUA flag which is suboptimal. Obviously there is a room for optimization here: once all bios was acknowledged by lower layer we may issue empty barrier aka ->issue_flush(). post_submit call back is the place where we all bios completed already. b1:FUA, b2:FUA, b3:FUA => b1,b2,b3,wait_for_bios,bX:FLUSH This allow us to remove all this REQ_FORCE_{FLUSH,FUA} crap and It seems we can remove REQ_FORCE_{FLUSH,FUA} right now. Only RELOC_A|S needs it and we can fix them in a simple and straightforward way -- essentially your 5th patch must be enough enough. Btw, this patch doesn't disable the logic of passing FUA from incoming bio-s to outgoing (commit c2247f3745). Was it by a miss or deliberately? Signed-off-by: Dmitry Monakhov--- drivers/block/ploop/io_direct.c | 48 + include/linux/ploop/ploop.h | 2 ++ 2 files changed, 22 insertions(+), 28 deletions(-) diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c index 195d318..752a9c3e 100644 --- a/drivers/block/ploop/io_direct.c +++ b/drivers/block/ploop/io_direct.c @@ -82,31 +82,13 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq, sector_t sec, nsec; int err; struct bio_list_walk bw; - int preflush; - int postfua = 0; + int preflush = !!(rw & REQ_FLUSH); + int postflush = !!(rw & REQ_FUA); int write = !!(rw & REQ_WRITE); trace_submit(preq); - preflush = !!(rw & REQ_FLUSH); - - if (test_and_clear_bit(PLOOP_REQ_FORCE_FLUSH, >state)) - preflush = 1; - - if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, >state)) - postfua = 1; - - if (!postfua && ploop_req_delay_fua_possible(rw, preq)) { - - /* Mark req that delayed flush required */ - set_bit(PLOOP_REQ_FORCE_FLUSH, >state); - } else if (rw & REQ_FUA) { - postfua = 1; - } - rw &= ~(REQ_FLUSH | REQ_FUA); - - bio_list_init(); if (iblk 
== PLOOP_ZERO_INDEX) @@ -237,13 +219,14 @@ flush_bio: rw2 |= REQ_FLUSH; preflush = 0; } - if (unlikely(postfua && !bl.head)) - rw2 |= REQ_FUA; - ploop_acc_ff_out(preq->plo, rw2 | b->bi_rw); submit_bio(rw2, b); } - + /* TODO: minor optimization is possible for single bio case */ + if (postflush) { + set_bit(PLOOP_REQ_DEL_FLUSH, >state); + ploop_add_post_submit(io, preq); + } ploop_complete_io_request(preq); return; @@ -523,9 +506,10 @@ dio_convert_extent(struct ploop_io *io, struct ploop_request * preq) (loff_t)sec << 9, clu_siz); /* highly unlikely case: FUA coming to a block not provisioned yet */ - if (!err && force_sync) + if (!err && force_sync) { + clear_bit(PLOOP_REQ_DEL_FLUSH, >state); err = io->ops->sync(io); - + } if (!force_sync) { spin_lock_irq(>lock); io->io_count++; @@ -546,7 +530,12 @@ dio_post_submit(struct ploop_io *io, struct ploop_request * preq) if (test_and_clear_bit(PLOOP_REQ_DEL_CONV, >state)) dio_convert_extent(io, preq); + if (test_and_clear_bit(PLOOP_REQ_DEL_FLUSH, >state)) { + io->ops->issue_flush(io, preq); + return 1; + } return 0; + } /* Submit the whole cluster. 
If preq contains only partial data @@ -562,7 +551,6 @@ dio_submit_pad(struct ploop_io *io, struct ploop_request * preq, sector_t sec, end_sec, nsec, start, end; struct bio_list_walk bw; int err; - bio_list_init(); /* sec..end_sec is the range which we are going to write */ @@ -694,7 +682,11 @@ flush_bio: ploop_acc_ff_out(preq->plo, rw | b->bi_rw); submit_bio(rw, b); } - + /* TODO: minor optimization is possible for single bio case */ + if (preq->req_rw & REQ_FUA) { + set_bit(PLOOP_REQ_DEL_FLUSH, >state); + ploop_add_post_submit(io, preq); + } ploop_complete_io_request(preq); return; diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h index 4c52a40..5076f16 100644 --- a/include/linux/ploop/ploop.h +++ b/include/linux/ploop/ploop.h @@ -472,6 +472,7 @@ enum PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */ PLOOP_REQ_ALLOW_READS, /* READs are allowed for given req_cluster */ PLOOP_REQ_DEL_CONV,/* post_submit: conversion required */ + PLOOP_REQ_DEL_FLUSH,
Re: [Devel] [RH7 PATCH 1/6] ploop: generalize post_submit stage
On 06/23/2016 10:25 AM, Dmitry Monakhov wrote: Currently post_submit() used only for convert_unwritten_extents. But post_submit() is good transition point where all submitted data was completed by lower layer, and new state about to be processed. Iyt is ideal point where we can perform transition actions For example: io_direct: Convert unwritten extents io_direct: issue empty barrier bio in order to simulate postflush io_direct,io_kaio: queue to fsync queue Etc. This patch does not change anything, but prepare post_submit for more logic which will be added later. If we decide to have DEL_FLUSH, I'm OK with this approach. Maybe with some renaming: s/PLOOP_REQ_DEL_FLUSH/PLOOP_REQ_FLUSH_DELAYED s/PLOOP_REQ_DEL_CONV/PLOOP_REQ_CONV_DELAYED s/post_submit/pre_process s/PLOOP_REQ_POST_SUBMIT/PLOOP_REQ_PRE_PROCESS Signed-off-by: Dmitry Monakhov--- drivers/block/ploop/dev.c | 10 ++ drivers/block/ploop/io_direct.c | 15 --- include/linux/ploop/ploop.h | 12 +++- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index e405232..e8b0304 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -2351,10 +2351,12 @@ static void ploop_req_state_process(struct ploop_request * preq) preq->prealloc_size = 0; /* only for sanity */ } - if (test_bit(PLOOP_REQ_POST_SUBMIT, >state)) { - preq->eng_io->ops->post_submit(preq->eng_io, preq); - clear_bit(PLOOP_REQ_POST_SUBMIT, >state); + if (test_and_clear_bit(PLOOP_REQ_POST_SUBMIT, >state)) { + struct ploop_io *io = preq->eng_io; + preq->eng_io = NULL; + if (preq->eng_io->ops->post_submit(io, preq)) + goto out; } restart: @@ -2633,7 +2635,7 @@ restart: default: BUG(); } - +out: if (release_ioc) { struct io_context * ioc = current->io_context; current->io_context = saved_ioc; diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c index f1812fe..ec905b4 100644 --- a/drivers/block/ploop/io_direct.c +++ b/drivers/block/ploop/io_direct.c @@ 
-416,8 +416,8 @@ try_again: } preq->iblock = iblk; - preq->eng_io = io; - set_bit(PLOOP_REQ_POST_SUBMIT, >state); + set_bit(PLOOP_REQ_DEL_CONV, >state); + ploop_add_post_submit(io, preq); dio_submit_pad(io, preq, sbl, size, em); err = 0; goto end_write; @@ -501,7 +501,7 @@ end_write: } static void -dio_post_submit(struct ploop_io *io, struct ploop_request * preq) +dio_convert_extent(struct ploop_io *io, struct ploop_request * preq) { struct ploop_device *plo = preq->plo; sector_t sec = (sector_t)preq->iblock << preq->plo->cluster_log; @@ -540,6 +540,15 @@ dio_post_submit(struct ploop_io *io, struct ploop_request * preq) } } +static int +dio_post_submit(struct ploop_io *io, struct ploop_request * preq) +{ + if (test_and_clear_bit(PLOOP_REQ_DEL_CONV, >state)) + dio_convert_extent(io, preq); + + return 0; +} + /* Submit the whole cluster. If preq contains only partial data * within the cluster, pad the rest of cluster with zeros. */ diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h index 0fba25e..4c52a40 100644 --- a/include/linux/ploop/ploop.h +++ b/include/linux/ploop/ploop.h @@ -148,7 +148,7 @@ struct ploop_io_ops struct bio_list *sbl, iblock_t iblk, unsigned int size); void(*submit_alloc)(struct ploop_io *, struct ploop_request *, struct bio_list *sbl, unsigned int size); - void(*post_submit)(struct ploop_io *, struct ploop_request *); + int (*post_submit)(struct ploop_io *, struct ploop_request *); int (*disable_merge)(struct ploop_io * io, sector_t isector, unsigned int len); int (*fastmap)(struct ploop_io * io, struct bio *orig_bio, @@ -471,6 +471,7 @@ enum PLOOP_REQ_POST_SUBMIT, /* preq needs post_submit processing */ PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */ PLOOP_REQ_ALLOW_READS, /* READs are allowed for given req_cluster */ + PLOOP_REQ_DEL_CONV,/* post_submit: conversion required */ PLOOP_REQ_FSYNC_DONE, /* fsync_thread() performed f_op->fsync() */ }; @@ -479,6 +480,8 @@ enum #define PLOOP_REQ_RELOC_S_FL (1 << 
PLOOP_REQ_RELOC_S) #define PLOOP_REQ_DISCARD_FL (1 << PLOOP_REQ_DISCARD) #define PLOOP_REQ_ZERO_FL (1 << PLOOP_REQ_ZERO) +#define PLOOP_REQ_POST_SUBMIT_FL (1 << PLOOP_REQ_POST_SUBMIT) +#define PLOOP_REQ_DEL_CONV_FL (1 << PLOOP_REQ_DEL_CONV) enum { @@ -767,6 +770,13 @@ static inline void ploop_entry_qlen_dec(struct
Re: [Devel] [RH7 PATCH 0/6] RFC ploop: Barrier fix patch set v3
Dima, On 06/23/2016 10:25 AM, Dmitry Monakhov wrote: Here is 3'rd version of barrier fix patches based on recent fixes. This is an RFC version. I do not have time to test it before tomorrow, Max please review is briefly and tell be your oppinion about general idea. It's hard to review w/o context, and the series fail to apply to our vz7 tree. So, I spent pretty long while trying to find a tag or commit where it's possible to apply your patches w/o rejects. The first patch wants PLOOP_REQ_ALLOW_READS in ploop.h: @@ -471,6 +471,7 @@ enum PLOOP_REQ_POST_SUBMIT, /* preq needs post_submit processing */ PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */ PLOOP_REQ_ALLOW_READS, /* READs are allowed for given req_cluster */ +PLOOP_REQ_DEL_CONV,/* post_submit: conversion required */ PLOOP_REQ_FSYNC_DONE, /* fsync_thread() performed f_op->fsync() */ }; We removed ALLOW_READS by 06e7586 (Jun 3), so you must have rh7-3.10.0-327.18.2.vz7.14.11 or earlier. But the third patch has: @@ -562,7 +551,6 @@ dio_submit_pad(struct ploop_io *io, struct ploop_request * preq, sector_t sec, end_sec, nsec, start, end; struct bio_list_walk bw; int err; - bio_list_init(); /* sec..end_sec is the range which we are going to write */ while after applying the first and the second, it looks like: static void dio_submit_pad(struct ploop_io *io, struct ploop_request * preq, struct bio_list * sbl, unsigned int size, struct extent_map *em) { struct bio_list bl; struct bio * bio = NULL; sector_t sec, end_sec, nsec, start, end; struct bio_list_walk bw; int err; int preflush = !!(preq->req_rw & REQ_FLUSH); bio_list_init(); So the tree you used didn't have that "int preflush = !!(preq->req_rw & REQ_FLUSH);" line. But the patch removing this line was committed only yesterday, Jun 22 (c2247f3745). 
After applying c2247f3745 before your series, another conflict happens while applying the third patch: the first hunk assumes the following lines in dio_submit: rw &= ~(REQ_FLUSH | REQ_FUA); - - bio_list_init(); But we always (since 2013) had: rw &= ~(REQ_FLUSH | REQ_FUA); /* In case of eng_state != COMPLETE, we'll do FUA in * ploop_index_update(). Otherwise, we should mark * last bio as FUA here. */ if (rw & REQ_FUA) { rw &= ~REQ_FUA; if (preq->eng_state == PLOOP_E_COMPLETE) postfua = 1; } bio_list_init(); Hence, I drew conclusion, we need one of your previous patches to be applied at first. After very long row of trials and errors I eventually succeeded: $ git show c2247f3745 > /tmp/my.diff $ git checkout rh7-3.10.0-327.18.2.vz7.14.11 $ patch -p1 < ploop-fix-barriers-for-ordinary-requests $ patch -p1 < ploop-skip-redundant-fsync-for-REQ_FUA-in-post_submit $ patch -p1 < ploop-deadcode-cleanup $ patch -p1 < ploop-generalize-post_submit-stage $ patch -p1 < ploop-generalize-issue_flush $ patch -p1 < ploop-add-delayed-flush-support $ patch -p1 < ploop-io_kaio-support-PLOOP_REQ_DEL_FLUSH $ patch -p1 < ploop-fixup-barrier-handling-during-relocation $ patch -p1 < patch-ploop_state_debugging.patch Basic idea is to use post_submit state to issue empty FLUSH barrier in order to complete FUA requests. This allow us to unify all engines (direct and kaio). This makes FUA processing optimal: SUBMIT:FUA :W1{b1,b2,b3,b4..},WAIT,post_submit:FLUSH SUBMIT_ALLOC:FUA :W1{b1,b2,b3,b4..},WAIT,post_submit:FLUSH, WBI:FUA The above would be optimal only if all three statements below are true: 1) Lower layers process FUA as post-FLUSH. I remember that you wrote that in real (linux kernel) life it is always true, but somehow I'm not sure about this "always"... Of course, we can investigate and eventually nail down the question, but for now I'm not convinced. 2) The list of (incoming) bio-s has more than one marked as FUA. Otherwise (i.e. 
if only one is FUA), it must be equivalent (from performance perspective): to submit FUA now, or to submit FLUSH later (modulo 1) above). For now, I know only two entities generating FUA: ext4 writes superblock and jbd2 commits transaction. In both cases it is one page-sized bio and no way to have more than one FUA in queue. Do you know other cases? (Of course, we know about fsync(), but it generates zero-size FLUSH. The way how ploop processes it is not affected by the patches we discussed.) 3) The benefits of delayed FLUSH must overweight the performance loss we'll have from extra WAIT introduced. I have not verified it yet, but I suspect it must be observable (e.g. by blktrace) that one page-sized bio marked as FLUSH|FUA completes faster than: submit one page-sized bio marked as FLUSH, wait for completion, submit zero-size FLUSH, wait for completion again. Makes sense? If the three statements above are correct, and given some complexity added by the series, and possible
[Devel] [RH7 PATCH 4/6] ploop: io_kaio support PLOOP_REQ_DEL_FLUSH
Currently no one tags preqs with such a bit, but let it be here for symmetry Signed-off-by: Dmitry Monakhov--- drivers/block/ploop/io_kaio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c index bee2cee..5341fd5 100644 --- a/drivers/block/ploop/io_kaio.c +++ b/drivers/block/ploop/io_kaio.c @@ -73,6 +73,7 @@ static void kaio_complete_io_state(struct ploop_request * preq) /* Convert requested fua to fsync */ if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, >state) || + test_and_clear_bit(PLOOP_REQ_DEL_FLUSH, >state) || test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, >state)) post_fsync = 1; -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [RH7 PATCH 3/6] ploop: add delayed flush support
dio_submit and dio_submit_pad may produce several bios. This makes processing of REQ_FUA complicated because in order to preserve correctness we have to TAG each bio with FUA flag which is suboptimal. Obviously there is a room for optimization here: once all bios was acknowledged by lower layer we may issue empty barrier aka ->issue_flush(). post_submit call back is the place where we all bios completed already. b1:FUA, b2:FUA, b3:FUA => b1,b2,b3,wait_for_bios,bX:FLUSH This allow us to remove all this REQ_FORCE_{FLUSH,FUA} crap and Signed-off-by: Dmitry Monakhov--- drivers/block/ploop/io_direct.c | 48 + include/linux/ploop/ploop.h | 2 ++ 2 files changed, 22 insertions(+), 28 deletions(-) diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c index 195d318..752a9c3e 100644 --- a/drivers/block/ploop/io_direct.c +++ b/drivers/block/ploop/io_direct.c @@ -82,31 +82,13 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq, sector_t sec, nsec; int err; struct bio_list_walk bw; - int preflush; - int postfua = 0; + int preflush = !!(rw & REQ_FLUSH); + int postflush = !!(rw & REQ_FUA); int write = !!(rw & REQ_WRITE); trace_submit(preq); - preflush = !!(rw & REQ_FLUSH); - - if (test_and_clear_bit(PLOOP_REQ_FORCE_FLUSH, >state)) - preflush = 1; - - if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, >state)) - postfua = 1; - - if (!postfua && ploop_req_delay_fua_possible(rw, preq)) { - - /* Mark req that delayed flush required */ - set_bit(PLOOP_REQ_FORCE_FLUSH, >state); - } else if (rw & REQ_FUA) { - postfua = 1; - } - rw &= ~(REQ_FLUSH | REQ_FUA); - - bio_list_init(); if (iblk == PLOOP_ZERO_INDEX) @@ -237,13 +219,14 @@ flush_bio: rw2 |= REQ_FLUSH; preflush = 0; } - if (unlikely(postfua && !bl.head)) - rw2 |= REQ_FUA; - ploop_acc_ff_out(preq->plo, rw2 | b->bi_rw); submit_bio(rw2, b); } - + /* TODO: minor optimization is possible for single bio case */ + if (postflush) { + set_bit(PLOOP_REQ_DEL_FLUSH, >state); + ploop_add_post_submit(io, preq); + } 
ploop_complete_io_request(preq); return; @@ -523,9 +506,10 @@ dio_convert_extent(struct ploop_io *io, struct ploop_request * preq) (loff_t)sec << 9, clu_siz); /* highly unlikely case: FUA coming to a block not provisioned yet */ - if (!err && force_sync) + if (!err && force_sync) { + clear_bit(PLOOP_REQ_DEL_FLUSH, >state); err = io->ops->sync(io); - + } if (!force_sync) { spin_lock_irq(>lock); io->io_count++; @@ -546,7 +530,12 @@ dio_post_submit(struct ploop_io *io, struct ploop_request * preq) if (test_and_clear_bit(PLOOP_REQ_DEL_CONV, >state)) dio_convert_extent(io, preq); + if (test_and_clear_bit(PLOOP_REQ_DEL_FLUSH, >state)) { + io->ops->issue_flush(io, preq); + return 1; + } return 0; + } /* Submit the whole cluster. If preq contains only partial data @@ -562,7 +551,6 @@ dio_submit_pad(struct ploop_io *io, struct ploop_request * preq, sector_t sec, end_sec, nsec, start, end; struct bio_list_walk bw; int err; - bio_list_init(); /* sec..end_sec is the range which we are going to write */ @@ -694,7 +682,11 @@ flush_bio: ploop_acc_ff_out(preq->plo, rw | b->bi_rw); submit_bio(rw, b); } - + /* TODO: minor optimization is possible for single bio case */ + if (preq->req_rw & REQ_FUA) { + set_bit(PLOOP_REQ_DEL_FLUSH, >state); + ploop_add_post_submit(io, preq); + } ploop_complete_io_request(preq); return; diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h index 4c52a40..5076f16 100644 --- a/include/linux/ploop/ploop.h +++ b/include/linux/ploop/ploop.h @@ -472,6 +472,7 @@ enum PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */ PLOOP_REQ_ALLOW_READS, /* READs are allowed for given req_cluster */ PLOOP_REQ_DEL_CONV,/* post_submit: conversion required */ + PLOOP_REQ_DEL_FLUSH, /* post_submit: REQ_FLUSH required */ PLOOP_REQ_FSYNC_DONE, /* fsync_thread() performed f_op->fsync() */ }; @@ -482,6 +483,7 @@ enum #define PLOOP_REQ_ZERO_FL (1 << PLOOP_REQ_ZERO) #define PLOOP_REQ_POST_SUBMIT_FL (1 << PLOOP_REQ_POST_SUBMIT) #define 
PLOOP_REQ_DEL_CONV_FL (1 << PLOOP_REQ_DEL_CONV) +#define PLOOP_REQ_DEL_FLUSH_FL (1 << PLOOP_REQ_DEL_FLUSH)
[Devel] [RH7 PATCH 5/6] ploop: fixup barrier handling during relocation
barrier code is broken in many ways: Currently only ->dio_submit() handles PLOOP_REQ_FORCE_{FLUSH,FUA} correctly. But request also can goes though ->dio_submit_alloc()->dio_submit_pad and write_page (for indexes) So in case of grow_dev we have following sequance: E_RELOC_DATA_READ: ->set_bit(PLOOP_REQ_FORCE_FUA, >state); ->delta->allocate ->io->submit_allloc: dio_submit_alloc ->dio_submit_pad E_DATA_WBI : data written, time to update index ->delta->allocate_complete:ploop_index_update ->set_bit(PLOOP_REQ_FORCE_FUA, >state); ->write_page ->ploop_map_wb_complete ->ploop_wb_complete_post_process ->set_bit(PLOOP_REQ_FORCE_FUA, >state); E_RELOC_NULLIFY: ->submit() Once we have delayed_flush engine it is easy to implement correct scheme for both engines. E_RELOC_DATA_READ ->submit_allloc => wait->post_submit->issue_flush E_DATA_WBI ->ploop_index_update with FUA E_RELOC_NULLIFY ->submit: => wait->post_submit->issue_flush This makes reloc sequence optimal: RELOC_S: R1, W2,WAIT,FLUSH, WBI:FUA RELOC_A: R1, W2,WAIT,FLUSH, WBI:FUA, W1:NULLIFY,WAIT, FLUSH https://jira.sw.ru/browse/PSBM-47107 Signed-off-by: Dmitry Monakhov--- drivers/block/ploop/dev.c | 2 +- drivers/block/ploop/io_kaio.c | 3 +-- drivers/block/ploop/map.c | 28 ++-- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index 95e3067..090cd2d 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -2533,7 +2533,7 @@ restart: sbl.head = sbl.tail = preq->aux_bio; /* Relocated data write required sync before BAT updatee */ - set_bit(PLOOP_REQ_FORCE_FUA, >state); + preq->req_rw |= REQ_FUA; if (test_bit(PLOOP_REQ_RELOC_S, >state)) { preq->eng_state = PLOOP_E_DATA_WBI; diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c index 5341fd5..5217ab4 100644 --- a/drivers/block/ploop/io_kaio.c +++ b/drivers/block/ploop/io_kaio.c @@ -72,8 +72,7 @@ static void kaio_complete_io_state(struct ploop_request * preq) } /* Convert 
requested fua to fsync */ - if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, >state) || - test_and_clear_bit(PLOOP_REQ_DEL_FLUSH, >state) || + if (test_and_clear_bit(PLOOP_REQ_DEL_FLUSH, >state) || test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, >state)) post_fsync = 1; diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c index 3a6365d..ef351fb 100644 --- a/drivers/block/ploop/map.c +++ b/drivers/block/ploop/map.c @@ -901,6 +901,8 @@ void ploop_index_update(struct ploop_request * preq) int old_level; struct page * page; sector_t sec; + int fua = !!(preq->req_rw & REQ_FUA); + unsigned long state = READ_ONCE(preq->state); /* No way back, we are going to initiate index write. */ @@ -954,12 +956,11 @@ void ploop_index_update(struct ploop_request * preq) plo->st.map_single_writes++; top_delta->ops->map_index(top_delta, m->mn_start, ); /* Relocate requires consistent writes, mark such reqs appropriately */ - if (test_bit(PLOOP_REQ_RELOC_A, >state) || - test_bit(PLOOP_REQ_RELOC_S, >state)) - set_bit(PLOOP_REQ_FORCE_FUA, >state); - - top_delta->io.ops->write_page(_delta->io, preq, page, sec, - !!(preq->req_rw & REQ_FUA)); + if (state & (PLOOP_REQ_RELOC_A_FL | PLOOP_REQ_RELOC_S_FL)) { + WARN_ON(state & PLOOP_REQ_DEL_FLUSH_FL); + fua = 1; + } + top_delta->io.ops->write_page(_delta->io, preq, page, sec, fua); put_page(page); return; @@ -1063,7 +1064,7 @@ static void map_wb_complete_post_process(struct ploop_map *map, * (see dio_submit()). So fsync of EXT4 image doesnt help us. * We need to force sync of nullified blocks. */ - set_bit(PLOOP_REQ_FORCE_FUA, >state); + preq->req_rw |= REQ_FUA; top_delta->io.ops->submit(_delta->io, preq, preq->req_rw, , preq->iblock, 1 io_queue) { struct ploop_request * preq; + unsigned long state; preq = list_entry(cursor, struct ploop_request, list); + state = READ_ONCE(preq->state); switch (preq->eng_state) { case PLOOP_E_INDEX_DELAY: @@ -1171,9 +1174,10 @@ static void map_wb_complete(struct map_node * m, int err) if (preq->req_rw & REQ_FUA)
[Devel] [RH7 PATCH 2/6] ploop: generalize issue_flush
Currently io->ops->issue_flush is called only from single place, but it has potential to generic. Patch does not change actual logic, but allow to call ->issue_flush from various places Signed-off-by: Dmitry Monakhov--- drivers/block/ploop/dev.c | 1 + drivers/block/ploop/io_direct.c | 1 - drivers/block/ploop/io_kaio.c | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index e8b0304..95e3067 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -1989,6 +1989,7 @@ ploop_entry_request(struct ploop_request * preq) if (preq->req_size == 0) { if (preq->req_rw & REQ_FLUSH && !test_bit(PLOOP_REQ_FSYNC_DONE, >state)) { + preq->eng_state = PLOOP_E_COMPLETE; if (top_io->ops->issue_flush) { top_io->ops->issue_flush(top_io, preq); return; diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c index ec905b4..195d318 100644 --- a/drivers/block/ploop/io_direct.c +++ b/drivers/block/ploop/io_direct.c @@ -1836,7 +1836,6 @@ static void dio_issue_flush(struct ploop_io * io, struct ploop_request *preq) bio->bi_private = preq; atomic_inc(>io_count); - preq->eng_state = PLOOP_E_COMPLETE; ploop_acc_ff_out(io->plo, preq->req_rw | bio->bi_rw); submit_bio(preq->req_rw, bio); ploop_complete_io_request(preq); diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c index de26319..bee2cee 100644 --- a/drivers/block/ploop/io_kaio.c +++ b/drivers/block/ploop/io_kaio.c @@ -951,7 +951,6 @@ static void kaio_issue_flush(struct ploop_io * io, struct ploop_request *preq) { struct ploop_delta *delta = container_of(io, struct ploop_delta, io); - preq->eng_state = PLOOP_E_COMPLETE; preq->req_rw &= ~REQ_FLUSH; spin_lock_irq(>plo->lock); -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [RH7 PATCH 1/6] ploop: generalize post_submit stage
Currently post_submit() used only for convert_unwritten_extents. But post_submit() is good transition point where all submitted data was completed by lower layer, and new state about to be processed. Iyt is ideal point where we can perform transition actions For example: io_direct: Convert unwritten extents io_direct: issue empty barrier bio in order to simulate postflush io_direct,io_kaio: queue to fsync queue Etc. This patch does not change anything, but prepare post_submit for more logic which will be added later. Signed-off-by: Dmitry Monakhov--- drivers/block/ploop/dev.c | 10 ++ drivers/block/ploop/io_direct.c | 15 --- include/linux/ploop/ploop.h | 12 +++- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index e405232..e8b0304 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -2351,10 +2351,12 @@ static void ploop_req_state_process(struct ploop_request * preq) preq->prealloc_size = 0; /* only for sanity */ } - if (test_bit(PLOOP_REQ_POST_SUBMIT, >state)) { - preq->eng_io->ops->post_submit(preq->eng_io, preq); - clear_bit(PLOOP_REQ_POST_SUBMIT, >state); + if (test_and_clear_bit(PLOOP_REQ_POST_SUBMIT, >state)) { + struct ploop_io *io = preq->eng_io; + preq->eng_io = NULL; + if (preq->eng_io->ops->post_submit(io, preq)) + goto out; } restart: @@ -2633,7 +2635,7 @@ restart: default: BUG(); } - +out: if (release_ioc) { struct io_context * ioc = current->io_context; current->io_context = saved_ioc; diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c index f1812fe..ec905b4 100644 --- a/drivers/block/ploop/io_direct.c +++ b/drivers/block/ploop/io_direct.c @@ -416,8 +416,8 @@ try_again: } preq->iblock = iblk; - preq->eng_io = io; - set_bit(PLOOP_REQ_POST_SUBMIT, >state); + set_bit(PLOOP_REQ_DEL_CONV, >state); + ploop_add_post_submit(io, preq); dio_submit_pad(io, preq, sbl, size, em); err = 0; goto end_write; @@ -501,7 +501,7 @@ end_write: } static void 
-dio_post_submit(struct ploop_io *io, struct ploop_request * preq) +dio_convert_extent(struct ploop_io *io, struct ploop_request * preq) { struct ploop_device *plo = preq->plo; sector_t sec = (sector_t)preq->iblock << preq->plo->cluster_log; @@ -540,6 +540,15 @@ dio_post_submit(struct ploop_io *io, struct ploop_request * preq) } } +static int +dio_post_submit(struct ploop_io *io, struct ploop_request * preq) +{ + if (test_and_clear_bit(PLOOP_REQ_DEL_CONV, >state)) + dio_convert_extent(io, preq); + + return 0; +} + /* Submit the whole cluster. If preq contains only partial data * within the cluster, pad the rest of cluster with zeros. */ diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h index 0fba25e..4c52a40 100644 --- a/include/linux/ploop/ploop.h +++ b/include/linux/ploop/ploop.h @@ -148,7 +148,7 @@ struct ploop_io_ops struct bio_list *sbl, iblock_t iblk, unsigned int size); void(*submit_alloc)(struct ploop_io *, struct ploop_request *, struct bio_list *sbl, unsigned int size); - void(*post_submit)(struct ploop_io *, struct ploop_request *); + int (*post_submit)(struct ploop_io *, struct ploop_request *); int (*disable_merge)(struct ploop_io * io, sector_t isector, unsigned int len); int (*fastmap)(struct ploop_io * io, struct bio *orig_bio, @@ -471,6 +471,7 @@ enum PLOOP_REQ_POST_SUBMIT, /* preq needs post_submit processing */ PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */ PLOOP_REQ_ALLOW_READS, /* READs are allowed for given req_cluster */ + PLOOP_REQ_DEL_CONV,/* post_submit: conversion required */ PLOOP_REQ_FSYNC_DONE, /* fsync_thread() performed f_op->fsync() */ }; @@ -479,6 +480,8 @@ enum #define PLOOP_REQ_RELOC_S_FL (1 << PLOOP_REQ_RELOC_S) #define PLOOP_REQ_DISCARD_FL (1 << PLOOP_REQ_DISCARD) #define PLOOP_REQ_ZERO_FL (1 << PLOOP_REQ_ZERO) +#define PLOOP_REQ_POST_SUBMIT_FL (1 << PLOOP_REQ_POST_SUBMIT) +#define PLOOP_REQ_DEL_CONV_FL (1 << PLOOP_REQ_DEL_CONV) enum { @@ -767,6 +770,13 @@ static inline void 
ploop_entry_qlen_dec(struct ploop_request * preq) preq->plo->read_sync_reqs--; } } +static inline +void ploop_add_post_submit(struct ploop_io *io, struct ploop_request * preq) +{ + BUG_ON(preq->eng_io && preq->eng_io != io); + preq->eng_io = io; + set_bit(PLOOP_REQ_POST_SUBMIT, >state); +} static inline
[Devel] [RH7 PATCH 6/6] patch ploop_state_debugging.patch
Signed-off-by: Dmitry Monakhov--- drivers/block/ploop/dev.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index 090cd2d..9bf8592 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -1232,6 +1232,12 @@ static void ploop_complete_request(struct ploop_request * preq) } preq->bl.tail = NULL; + if (!preq->error) { + unsigned long state = READ_ONCE(preq->state); + WARN_ON(state & (PLOOP_REQ_POST_SUBMIT_FL| +PLOOP_REQ_DEL_CONV_FL | +PLOOP_REQ_DEL_FLUSH_FL )); + } if (test_bit(PLOOP_REQ_RELOC_A, >state) || test_bit(PLOOP_REQ_RELOC_S, >state)) { if (preq->error) -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [RH7 PATCH 0/6] RFC ploop: Barrier fix patch set v3
Here is 3'rd version of barrier fix patches based on recent fixes. This is an RFC version. I do not have time to test it before tomorrow, Max please review it briefly and tell me your opinion about the general idea. Basic idea is to use post_submit state to issue empty FLUSH barrier in order to complete FUA requests. This allows us to unify all engines (direct and kaio). This makes FUA processing optimal: SUBMIT:FUA :W1{b1,b2,b3,b4..},WAIT,post_submit:FLUSH SUBMIT_ALLOC:FUA :W1{b1,b2,b3,b4..},WAIT,post_submit:FLUSH, WBI:FUA RELOC_S: R1, W2,WAIT,post_submit:FLUSH, WBI:FUA RELOC_A: R1, W2,WAIT,post_submit:FLUSH, WBI:FUA, W1:NULLIFY,WAIT,post_submit:FLUSH #POST_SUBMIT CHANGES: ploop-generalize-post_submit-stage.patch ploop-generalize-issue_flush.patch ploop-add-delayed-flush-support.patch ploop-io_kaio-support-PLOOP_REQ_DEL_FLUSH.patch #RELOC_XXX FIXES ploop-fixup-barrier-handling-during-relocation.patch patch-ploop_state_debugging.patch.patch ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH rh7] ve/fs: namespace -- Don't fail on permissions if @ve->devmnt_list is empty
On Thu, Jun 23, 2016 at 01:34:18PM +0300, Cyrill Gorcunov wrote: > In commit 7eeb5b4afa8db5a2f2e1e47ab6b84e55fc8c5661 I addressed > first half of a problem, but I happen to work with dirty copy > of libvzctl where mount_opts cgroup has been c/r'ed manually, > so I missed the case where @devmnt_list is empty on restore > (just like it is in vanilla libvzctl). So fix the second half. > > https://jira.sw.ru/browse/PSBM-48188 > > Reported-by: Igor Sukhih> Signed-off-by: Cyrill Gorcunov > CC: Vladimir Davydov > CC: Konstantin Khorenko Reviewed-by: Vladimir Davydov ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ve/binfmt_misc: Allow mount if capable(CAP_SYS_ADMIN)
The commit is pushed to "branch-rh7-3.10.0-327.18.2.vz7.14.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-327.18.2.vz7.14.19 --> commit b188e9ef18fc3630568eaab2923238df31babc94 Author: Kirill TkhaiDate: Thu Jun 23 19:06:40 2016 +0400 ve/binfmt_misc: Allow mount if capable(CAP_SYS_ADMIN) The patch allows to mount binfmt_misc in a CT with ve0's admin caps, and it's need that for CRIU dump. This time, unmounted binfmt_misc may be forced mounted back, and we don't want to change CRIU's user_ns to do that. https://jira.sw.ru/browse/PSBM-47737 Signed-off-by: Kirill Tkhai Reviewed-by: Andrey Ryabinin --- fs/binfmt_misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index fd5227f..e259022 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -735,7 +735,7 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent) static struct dentry *bm_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - if (!current_user_ns_initial()) + if (!current_user_ns_initial() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); return mount_ns(fs_type, flags, get_exec_env(), bm_fill_super); } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7] vecalls: kill VZCTL_SETDEVPERMS ioctl
All device permissions are controlled via device cgroup now. We don't need this ioctl() anymore. Signed-off-by: Andrey Ryabinin--- include/uapi/linux/vzcalluser.h | 2 +- kernel/ve/vecalls.c | 37 - 2 files changed, 1 insertion(+), 38 deletions(-) diff --git a/include/uapi/linux/vzcalluser.h b/include/uapi/linux/vzcalluser.h index 2b340cf..85761c7 100644 --- a/include/uapi/linux/vzcalluser.h +++ b/include/uapi/linux/vzcalluser.h @@ -161,7 +161,7 @@ struct vzctl_cpustatctl { #define VZCTLTYPE '.' #define VZCTL_OLD_ENV_CREATE _IOW(VZCTLTYPE, 0, struct vzctl_old_env_create) #define VZCTL_MARK_ENV_TO_DOWN _IOW(VZCTLTYPE, 1, struct vzctl_mark_env_to_down) -#define VZCTL_SETDEVPERMS _IOW(VZCTLTYPE, 2, struct vzctl_setdevperms) +#define VZCTL_SETDEVPERMS _IOW(VZCTLTYPE, 2, struct vzctl_setdevperms) /* DEPRECATED */ #define VZCTL_ENV_CREATE_CID _IOW(VZCTLTYPE, 4, struct vzctl_env_create_cid) #define VZCTL_ENV_CREATE _IOW(VZCTLTYPE, 5, struct vzctl_env_create) #define VZCTL_GET_CPU_STAT _IOW(VZCTLTYPE, 6, struct vzctl_cpustatctl) diff --git a/kernel/ve/vecalls.c b/kernel/ve/vecalls.c index 4742576..5ce3e29 100644 --- a/kernel/ve/vecalls.c +++ b/kernel/ve/vecalls.c @@ -97,29 +97,6 @@ out_put_ve: return retval; } -static int real_setdevperms(envid_t veid, unsigned type, - dev_t dev, unsigned mask) -{ - struct ve_struct *ve; - int err; - - if (!capable_setveid() || veid == 0) - return -EPERM; - - if ((ve = get_ve_by_id(veid)) == NULL) - return -ESRCH; - - down_read(>op_sem); - - err = -EAGAIN; - if (ve->is_running) - err = devcgroup_set_perms_ve(ve, type, dev, mask); - - up_read(>op_sem); - put_ve(ve); - return err; -} - /** ** * @@ -604,20 +581,6 @@ int vzcalls_ioctl(struct file *file, unsigned int cmd, unsigned long arg) err = 0; } break; - case VZCTL_SETDEVPERMS: { - /* Device type was mistakenly declared as dev_t -* in the old user-kernel interface. -* That's wrong, dev_t is a kernel internal type. -* I use `unsigned' not having anything better in mind. 
-* 2001/08/11 SAW */ - struct vzctl_setdevperms s; - err = -EFAULT; - if (copy_from_user(, (void __user *)arg, sizeof(s))) - break; - err = real_setdevperms(s.veid, s.type, - new_decode_dev(s.dev), s.mask); - } - break; #ifdef CONFIG_INET case VZCTL_VE_NETDEV: { struct vzctl_ve_netdev d; -- 2.7.3 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH RH7] enable ipproto_icmp inside containers
On 06/23/2016 03:26 PM, Vasily Averin wrote: iputils-ping 20150815 fails inside containers because socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP) is restricted by vz_security_protocol_check() The patch enables creation of such sockets inside containers. By default sys_socket still fails because default setting of sysctl net.ipv4.ping_group_range, however it's enough for iputils-ping 20150815. its fallback handles this situation and successfully creates RAW socket. In mainline it is enabled in MS kernel v3.13+, see: commit fd2d5356d902 ("ipv4: Allow unprivileged users to use per net sysctls") in future we're going to backport this patch and add its save/restore into criu. https://bugs.openvz.org/browse/OVZ-6744 Signed-off-by: Vasily AverinAcked-by: Pavel Tikhomirov diff-vz7-enable-ipproto_icmp-inside-containers diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index 53fa12d..ef521cf 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -222,6 +222,7 @@ int vz_security_protocol_check(struct net *net, int protocol) switch (protocol) { case IPPROTO_IP: + case IPPROTO_ICMP: case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_RAW: -- Best regards, Tikhomirov Pavel Software Developer, Virtuozzo. ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH7] enable ipproto_icmp inside containers
iputils-ping 20150815 fails inside containers because socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP) is restricted by vz_security_protocol_check() The patch enables creation of such sockets inside containers. By default sys_socket still fails because of the default setting of sysctl net.ipv4.ping_group_range, however it's enough for iputils-ping 20150815. Its fallback handles this situation and successfully creates a RAW socket. In mainline it is enabled in MS kernel v3.13+, see: commit fd2d5356d902 ("ipv4: Allow unprivileged users to use per net sysctls") in future we're going to backport this patch and add its save/restore into criu. https://bugs.openvz.org/browse/OVZ-6744 Signed-off-by: Vasily Averin diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index 53fa12d..ef521cf 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -222,6 +222,7 @@ int vz_security_protocol_check(struct net *net, int protocol) switch (protocol) { case IPPROTO_IP: + case IPPROTO_ICMP: case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_RAW: ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH RH7] enable ipproto_icmp inside containers
On 06/23/2016 03:08 PM, Vasily Averin wrote: iputils-ping 20150815 fails inside containers because socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP) is restricted by vz_security_protocol_check() The patch enables creation of such sockets inside containers. By default sys_socket still fails because of the default setting of sysctl net.ipv4.ping_group_range, however it's enough for iputils-ping 20150815. Its fallback handles this situation and successfully creates a RAW socket. According to ptikhomirov@ this sysctl will be enabled inside containers soon, and in future it will be saved/restored by criu. It is enabled in MS kernel v3.13+, see: commit fd2d5356d902 ("ipv4: Allow unprivileged users to use per net sysctls") https://bugs.openvz.org/browse/OVZ-6744 Signed-off-by: Vasily Averin patch is empty? -- Best regards, Tikhomirov Pavel Software Developer, Virtuozzo. ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH7] enable ipproto_icmp inside containers
iputils-ping 20150815 fails inside containers because socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP) is restricted by vz_security_protocol_check() The patch enables creation of such sockets inside containers. By default sys_socket still fails because of the default setting of sysctl net.ipv4.ping_group_range, however it's enough for iputils-ping 20150815. Its fallback handles this situation and successfully creates a RAW socket. According to ptikhomirov@ this sysctl will be enabled inside containers soon, and in future it will be saved/restored by criu. https://bugs.openvz.org/browse/OVZ-6744 Signed-off-by: Vasily Averin ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH rh7 1/4] net: ipip: enable in container
On 23.06.2016 11:57, Vladimir Davydov wrote: Currently, we fail to init ipip per-net in a ve, because it has neither NETIF_F_VIRTUAL nor NETIF_F_NETNS_LOCAL: ipip_init_net ip_tunnel_init_net __ip_tunnel_create register_netdevice ve_is_dev_movable In PCS6 ipip has NETIF_F_NETNS_LOCAL, so everything works fine there, but this restriction was removed in RH7 kernel, so we fail to start a container if ipip is loaded (or load ipip if there are containers running). Mark ipip as NETIF_F_VIRTUAL to fix this issue. https://jira.sw.ru/browse/PSBM-48608 Signed-off-by: Vladimir Davydov--- net/ipv4/ipip.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index e556a1df5a57..7842dcb2fd65 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -301,6 +301,7 @@ static void ipip_tunnel_setup(struct net_device *dev) netif_keep_dst(dev); dev->features|= IPIP_FEATURES; + dev->features|= NETIF_F_VIRTUAL; dev->hw_features |= IPIP_FEATURES; ip_tunnel_setup(dev, ipip_net_id); } All 4 patches look good to me and I have checked that they do fix the described problems, at least on my machine. So, for the whole patch series: Tested-by: Evgenii Shatokhin Regards, Evgenii ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7] ve/fs: namespace -- Don't fail on permissions if @ve->devmnt_list is empty
In commit 7eeb5b4afa8db5a2f2e1e47ab6b84e55fc8c5661 I addressed first half of a problem, but I happen to work with dirty copy of libvzctl where mount_opts cgroup has been c/r'ed manually, so I missed the case where @devmnt_list is empty on restore (just like it is in vanilla libvzctl). So fix the second half. https://jira.sw.ru/browse/PSBM-48188 Reported-by: Igor SukhihSigned-off-by: Cyrill Gorcunov CC: Vladimir Davydov CC: Konstantin Khorenko --- fs/namespace.c | 18 ++ 1 file changed, 14 insertions(+), 4 deletions(-) Index: linux-pcs7.git/fs/namespace.c === --- linux-pcs7.git.orig/fs/namespace.c +++ linux-pcs7.git/fs/namespace.c @@ -1954,10 +1954,20 @@ again: goto again; case 1: if (*data_pp) { - ve_printk(VE_LOG_BOTH, KERN_WARNING "VE%s: no allowed " - "mount options found for device %u:%u\n", - ve->ve_name, MAJOR(dev), MINOR(dev)); - err = -EPERM; + /* +* Same as in chunk above but for case where +* ve->devmnt_list is empty. Depending on +* the way userspace tool restore container +* it might be nonempty as well. +*/ + if (ve->is_pseudosuper) { + err = 0; + } else { + ve_printk(VE_LOG_BOTH, KERN_WARNING "VE%s: no allowed " + "mount options found for device %u:%u\n", + ve->ve_name, MAJOR(dev), MINOR(dev)); + err = -EPERM; + } } else err = 0; break; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 3/4] net: ipip: fix crash on newlink if VE_FEATURE_IPIP is disabled
In this case net_generic returns NULL. We must handle this gracefully. Signed-off-by: Vladimir Davydov--- net/ipv4/ipip.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 7842dcb2fd65..b1004fb7539c 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -357,6 +357,9 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev, { struct ip_tunnel_parm p; + if (net_generic(dev_net(dev), ipip_net_id) == NULL) + return -EACCES; + ipip_netlink_parms(data, ); return ip_tunnel_newlink(dev, tb, ); } -- 2.1.4 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 4/4] net: sit: fix crash on newlink if VE_FEATURE_SIT is disabled
In this case net_generic returns NULL. We must handle this gracefully. Signed-off-by: Vladimir Davydov--- net/ipv6/sit.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 2a73b520d3bf..6b1ae3b06be9 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1441,6 +1441,9 @@ static int ipip6_newlink(struct net *src_net, struct net_device *dev, #endif int err; + if (net_generic(net, sit_net_id) == NULL) + return -EACCES; + nt = netdev_priv(dev); ipip6_netlink_parms(data, >parms); -- 2.1.4 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 1/4] net: ipip: enable in container
Currently, we fail to init ipip per-net in a ve, because it has neither NETIF_F_VIRTUAL nor NETIF_F_NETNS_LOCAL: ipip_init_net ip_tunnel_init_net __ip_tunnel_create register_netdevice ve_is_dev_movable In PCS6 ipip has NETIF_F_NETNS_LOCAL, so everything works fine there, but this restriction was removed in RH7 kernel, so we fail to start a container if ipip is loaded (or load ipip if there are containers running). Mark ipip as NETIF_F_VIRTUAL to fix this issue. https://jira.sw.ru/browse/PSBM-48608 Signed-off-by: Vladimir Davydov--- net/ipv4/ipip.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index e556a1df5a57..7842dcb2fd65 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -301,6 +301,7 @@ static void ipip_tunnel_setup(struct net_device *dev) netif_keep_dst(dev); dev->features |= IPIP_FEATURES; + dev->features |= NETIF_F_VIRTUAL; dev->hw_features|= IPIP_FEATURES; ip_tunnel_setup(dev, ipip_net_id); } -- 2.1.4 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 2/4] net: ip_vti: skip per net init in ve
ip_vti devices lack NETIF_F_VIRTUAL, so they can't be created inside a container. Problem is a device of this kind is created on net ns init if the module is loaded, as a result a container start fails with EPERM. We could allow ip_vti inside container (as well as other net devices, which I would really like to do), but this is insecure and might break migration, so let's keep it disabled and fix the issue by silently skipping ip_vti per net init if running inside a ve. https://jira.sw.ru/browse/PSBM-48698 Signed-off-by: Vladimir Davydov--- net/ipv4/ip_vti.c | 15 +++ 1 file changed, 15 insertions(+) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index ce80a9a1be9d..3158100646ed 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -58,6 +58,9 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + if (itn == NULL) + return -EINVAL; + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, iph->saddr, iph->daddr, 0); if (tunnel != NULL) { @@ -256,6 +259,9 @@ static int vti4_err(struct sk_buff *skb, u32 info) int protocol = iph->protocol; struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + if (itn == NULL) + return -1; + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, iph->daddr, iph->saddr, 0); if (!tunnel) @@ -413,6 +419,9 @@ static int __net_init vti_init_net(struct net *net) int err; struct ip_tunnel_net *itn; + if (!ve_is_super(net->owner_ve)) + return net_assign_generic(net, vti_net_id, NULL); + err = ip_tunnel_init_net(net, vti_net_id, _link_ops, "ip_vti0"); if (err) return err; @@ -424,6 +433,9 @@ static int __net_init vti_init_net(struct net *net) static void __net_exit vti_exit_net(struct net *net) { struct ip_tunnel_net *itn = net_generic(net, vti_net_id); + + if (itn == NULL) + return; ip_tunnel_delete_net(itn, _link_ops); } @@ -473,6 +485,9 @@ static int vti_newlink(struct net *src_net, struct net_device 
*dev, { struct ip_tunnel_parm parms; + if (net_generic(dev_net(dev), vti_net_id) == NULL) + return -EACCES; + vti_netlink_parms(data, ); return ip_tunnel_newlink(dev, tb, ); } -- 2.1.4 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel