Commit 9f860e606 introduced an engine to delay fsync: when doing fallocate(FALLOC_FL_CONVERT_UNWRITTEN), dio_post_submit marks io as PLOOP_IO_FSYNC_DELAYED to ensure that fsync happens later, when an incoming FLUSH|FUA arrives.
That was deemed important because (PSBM-47026): > This optimization becomes more important due to the fact that customers tend > to use pcompact heavily => ploop images grow each day. Now, we can easily re-use the engine to delay fsync for reloc requests as well. As explained in the description of commit 5aa3fe09: > 1->read_data_from_old_pos > 2->write_to_new_pos > ->submit_alloc > ->submit_pad > ->post_submit->convert_unwritten > 3->update_index ->write_page with FLUSH|FUA > 4->nullify_old_pos > 5->issue_flush by the time of step 3, extent conversion is not yet stable because it belongs to an uncommitted transaction. But instead of doing fsync inside ->post_submit, we can fsync later, as the very first step of write_page for index_update. Changed in v2: - process delayed fsync asynchronously, via PLOOP_E_FSYNC_PENDED eng_state Changed in v3: - use extra arg for ploop_index_wb_proceed_or_delay() instead of ad-hoc PLOOP_REQ_FSYNC_IF_DELAYED https://jira.sw.ru/browse/PSBM-47026 Signed-off-by: Maxim Patlasov <mpatla...@virtuozzo.com> --- drivers/block/ploop/dev.c | 9 +++++++-- drivers/block/ploop/map.c | 32 ++++++++++++++++++++++++++++---- include/linux/ploop/ploop.h | 1 + 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index df3eec9..ed60b1f 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -2720,6 +2720,11 @@ restart: ploop_index_wb_complete(preq); break; + case PLOOP_E_FSYNC_PENDED: + /* fsync done */ + ploop_index_wb_proceed(preq); + break; + default: BUG(); } @@ -4106,7 +4111,7 @@ static void ploop_relocate(struct ploop_device * plo) preq->bl.tail = preq->bl.head = NULL; preq->req_cluster = 0; preq->req_size = 0; - preq->req_rw = WRITE_SYNC|REQ_FUA; + preq->req_rw = WRITE_SYNC; preq->eng_state = PLOOP_E_ENTRY; preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_RELOC_A); preq->error = 0; @@ -4410,7 +4415,7 @@ static void ploop_relocblks_process(struct ploop_device *plo) 
preq->bl.tail = preq->bl.head = NULL; preq->req_cluster = ~0U; /* uninitialized */ preq->req_size = 0; - preq->req_rw = WRITE_SYNC|REQ_FUA; + preq->req_rw = WRITE_SYNC; preq->eng_state = PLOOP_E_ENTRY; preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_RELOC_S); preq->error = 0; diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c index 5f7fd66..715dc15 100644 --- a/drivers/block/ploop/map.c +++ b/drivers/block/ploop/map.c @@ -915,6 +915,24 @@ void ploop_index_wb_proceed(struct ploop_request * preq) put_page(page); } +static void ploop_index_wb_proceed_or_delay(struct ploop_request * preq, + int do_fsync_if_delayed) +{ + if (do_fsync_if_delayed) { + struct map_node * m = preq->map; + struct ploop_delta * top_delta = map_top_delta(m->parent); + struct ploop_io * top_io = &top_delta->io; + + if (test_bit(PLOOP_IO_FSYNC_DELAYED, &top_io->io_state)) { + preq->eng_state = PLOOP_E_FSYNC_PENDED; + ploop_add_req_to_fsync_queue(preq); + return; + } + } + + ploop_index_wb_proceed(preq); +} + /* Data write is commited. Now we need to update index. */ void ploop_index_update(struct ploop_request * preq) @@ -927,6 +945,7 @@ void ploop_index_update(struct ploop_request * preq) int old_level; struct page * page; unsigned long state = READ_ONCE(preq->state); + int do_fsync_if_delayed = 0; /* No way back, we are going to initiate index write. 
*/ @@ -985,10 +1004,12 @@ void ploop_index_update(struct ploop_request * preq) preq->req_rw &= ~REQ_FLUSH; /* Relocate requires consistent index update */ - if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) + if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) { preq->req_index_update_rw |= (REQ_FLUSH | REQ_FUA); + do_fsync_if_delayed = 1; + } - ploop_index_wb_proceed(preq); + ploop_index_wb_proceed_or_delay(preq, do_fsync_if_delayed); return; enomem: @@ -1109,6 +1130,7 @@ static void map_wb_complete(struct map_node * m, int err) int delayed = 0; unsigned int idx; unsigned long rw; + int do_fsync_if_delayed = 0; /* First, complete processing of written back indices, * finally instantiate indices in mapping cache. @@ -1206,8 +1228,10 @@ static void map_wb_complete(struct map_node * m, int err) state = READ_ONCE(preq->state); /* Relocate requires consistent index update */ - if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) + if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) { rw |= (REQ_FLUSH | REQ_FUA); + do_fsync_if_delayed = 1; + } preq->eng_state = PLOOP_E_INDEX_WB; get_page(page); @@ -1234,7 +1258,7 @@ static void map_wb_complete(struct map_node * m, int err) plo->st.map_multi_writes++; main_preq->req_index_update_rw = rw; - ploop_index_wb_proceed(main_preq); + ploop_index_wb_proceed_or_delay(main_preq, do_fsync_if_delayed); } void diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h index d8e01b6..33733e9 100644 --- a/include/linux/ploop/ploop.h +++ b/include/linux/ploop/ploop.h @@ -513,6 +513,7 @@ enum PLOOP_E_ZERO_INDEX, /* Zeroing index of free block; original request can use .submit on completion */ PLOOP_E_DELTA_ZERO_INDEX,/* the same but for PLOOP_E_DELTA_READ */ + PLOOP_E_FSYNC_PENDED, /* INDEX_WB needs io->ops->sync() to proceed */ }; #define BIO_BDEV_REUSED 14 /* io_context is stored in bi_bdev */ _______________________________________________ Devel mailing list Devel@openvz.org 
https://lists.openvz.org/mailman/listinfo/devel