[Devel] [PATCH rh7 8/9] ploop: fix barriers for PLOOP_E_RELOC_NULLIFY

2016-06-23 Thread Maxim Patlasov
The last step of processing a RELOC_A request is
nullifying the BAT block. We smartly noticed that a flush
is needed after that, but fsync is not enough:

>   /*
>* Lately we think we does sync of nullified blocks at format
>* driver by image fsync before header update.
>* But we write this data directly into underlying device
>* bypassing EXT4 by usage of extent map tree
>* (see dio_submit()). So fsync of EXT4 image doesnt help us.
>* We need to force sync of nullified blocks.
>*/
>   set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
>   top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
> &sbl, preq->iblock, 1<<plo->cluster_log);

So the patch makes ploop call ->issue_flush to flush the
nullified block.
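
For orientation, an empty-flush submission in this kernel generation looks roughly like the sketch below. It is illustrative only, assuming a 3.10-era block layer; the real code is dio_issue_flush(), and everything except submit_bio()/WRITE_FLUSH is a stand-in name:

/* Sketch: a zero-page bio carrying only a cache flush.
 * Helper and callback names here are hypothetical. */
static void example_issue_empty_flush(struct block_device *bdev,
				      bio_end_io_t *endio, void *priv)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 0);	/* no data pages */

	bio->bi_bdev    = bdev;		/* flush the whole underlying device */
	bio->bi_end_io  = endio;	/* completion callback */
	bio->bi_private = priv;		/* e.g. the ploop_request being flushed */

	submit_bio(WRITE_FLUSH, bio);	/* empty barrier: cache flush only */
}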

Signed-off-by: Maxim Patlasov 
---
 drivers/block/ploop/dev.c   |   11 ++-
 drivers/block/ploop/io_direct.c |3 ++-
 drivers/block/ploop/map.c   |4 +++-
 include/linux/ploop/ploop.h |1 +
 4 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index 557ddba..2b60dfa 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -1305,6 +1305,8 @@ static void ploop_complete_request(struct ploop_request * preq)
}
preq->bl.tail = NULL;
 
+   WARN_ON(!preq->error && test_bit(PLOOP_REQ_ISSUE_FLUSH, &preq->state));
+
	if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
	    test_bit(PLOOP_REQ_RELOC_S, &preq->state)) {
if (preq->error)
@@ -2429,6 +2431,13 @@ static void ploop_req_state_process(struct ploop_request * preq)
preq->eng_io = NULL;
}
 
+   if (test_bit(PLOOP_REQ_ISSUE_FLUSH, &preq->state)) {
+   preq->eng_io->ops->issue_flush(preq->eng_io, preq);
+   clear_bit(PLOOP_REQ_ISSUE_FLUSH, &preq->state);
+   preq->eng_io = NULL;
+   goto out;
+   }
+
 restart:
	BUG_ON(test_bit(PLOOP_REQ_POST_SUBMIT, &preq->state));
__TRACE("ST %p %u %lu\n", preq, preq->req_cluster, preq->eng_state);
@@ -2705,7 +2714,7 @@ restart:
default:
BUG();
}
-
+out:
if (release_ioc) {
struct io_context * ioc = current->io_context;
current->io_context = saved_ioc;
diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index 94936c7..c4d0f63 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -413,6 +413,7 @@ try_again:
 
preq->iblock = iblk;
preq->eng_io = io;
+   BUG_ON(test_bit(PLOOP_REQ_ISSUE_FLUSH, &preq->state));
set_bit(PLOOP_REQ_POST_SUBMIT, >state);
dio_submit_pad(io, preq, sbl, size, em);
err = 0;
@@ -1819,7 +1820,7 @@ static void dio_issue_flush(struct ploop_io * io, struct ploop_request *preq)
 
	atomic_inc(&preq->io_count);
ploop_acc_ff_out(io->plo, preq->req_rw | bio->bi_rw);
-   submit_bio(preq->req_rw, bio);
+   submit_bio(WRITE_FLUSH, bio);
ploop_complete_io_request(preq);
 }
 
diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c
index f87fb08..915a216 100644
--- a/drivers/block/ploop/map.c
+++ b/drivers/block/ploop/map.c
@@ -1077,7 +1077,9 @@ static void map_wb_complete_post_process(struct ploop_map *map,
 * (see dio_submit()). So fsync of EXT4 image doesnt help us.
 * We need to force sync of nullified blocks.
 */
-   set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
+   preq->eng_io = &top_delta->io;
+   BUG_ON(test_bit(PLOOP_REQ_POST_SUBMIT, &preq->state));
+   set_bit(PLOOP_REQ_ISSUE_FLUSH, &preq->state);
	top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
				  &sbl, preq->iblock, 1<<plo->cluster_log);
[...]
 	PLOOP_REQ_FSYNC_DONE,  /* fsync_thread() performed f_op->fsync() */
+	PLOOP_REQ_ISSUE_FLUSH, /* preq needs ->issue_flush before completing */
 };
 
 #define PLOOP_REQ_MERGE_FL (1 << PLOOP_REQ_MERGE)



[Devel] [PATCH rh7 9/9] ploop: fixup barrier handling during relocation

2016-06-23 Thread Maxim Patlasov
Rebase Dima's patch on top of rh7-3.10.0-327.18.2.vz7.14.19,
but without help of delayed_flush engine:

To ensure consistency on crash/power outage/hard reboot
events, ploop must implement the following barrier logic
for RELOC_A|S requests:

1) After we store data to the new place, but before updating
the BAT on disk, we have to FLUSH everything (in fact, flushing
only those data would be enough, but it is simpler to flush
everything).

2) We should not proceed with handling RELOC_A|S until we are
100% sure the new BAT value went to the disk platters. Since the
new BAT is only one page, it's OK to mark the corresponding
bio with the FUA flag in the io_direct case. For io_kaio, which has
no FUA API, we have to post-fsync the BAT update.

PLOOP_REQ_FORCE_FLUSH/PLOOP_REQ_FORCE_FUA, introduced
a long time ago, were probably intended to ensure the
logic above, but they actually didn't.

The patch removes PLOOP_REQ_FORCE_FLUSH/PLOOP_REQ_FORCE_FUA,
and implements barriers in a straightforward and simple way:
check for RELOC_A|S explicitly and make FLUSH/FUA where
needed.
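
Condensed into a toy model (names below are illustrative, not the driver's identifiers), the per-engine rule for making the single-page BAT update durable is:

#include <stdbool.h>

/* Toy model of the rule above. Enum/struct names are illustrative only. */
enum engine { IO_DIRECT, IO_KAIO };

struct bat_barrier {
	bool fua;         /* io_direct: tag the one-page BAT bio with FUA */
	bool post_fsync;  /* io_kaio: no FUA API, fsync after the write   */
};

static struct bat_barrier reloc_bat_barrier(enum engine e)
{
	struct bat_barrier b = { .fua = false, .post_fsync = false };

	if (e == IO_DIRECT)
		b.fua = true;
	else
		b.post_fsync = true;
	return b;
}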

Signed-off-by: Maxim Patlasov 
---
 drivers/block/ploop/dev.c   |4 ++--
 drivers/block/ploop/io_direct.c |7 ---
 drivers/block/ploop/io_kaio.c   |8 +---
 drivers/block/ploop/map.c   |   22 ++
 include/linux/ploop/ploop.h |1 -
 5 files changed, 17 insertions(+), 25 deletions(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index 2b60dfa..40768b6 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -2610,8 +2610,8 @@ restart:
top_delta = ploop_top_delta(plo);
sbl.head = sbl.tail = preq->aux_bio;
 
-   /* Relocated data write required sync before BAT updatee */
-   set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
+   /* Relocated data write required sync before BAT update
+* this will happen inside index_update */
 
	if (test_bit(PLOOP_REQ_RELOC_S, &preq->state)) {
preq->eng_state = PLOOP_E_DATA_WBI;
diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index c4d0f63..266f041 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -89,15 +89,11 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq,
sector_t sec, nsec;
int err;
struct bio_list_walk bw;
-   int postfua = 0;
int write = !!(rw & REQ_WRITE);
int delayed_fua = 0;
 
trace_submit(preq);
 
-   if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state))
-   postfua = 1;
-
if ((rw & REQ_FUA) && ploop_req_delay_fua_possible(preq)) {
/* Mark req that delayed flush required */
preq->req_rw |= (REQ_FLUSH | REQ_FUA);
@@ -233,9 +229,6 @@ flush_bio:
b->bi_private = preq;
b->bi_end_io = dio_endio_async;
 
-   if (unlikely(postfua && !bl.head))
-   rw2 |= REQ_FUA;
-
ploop_acc_ff_out(preq->plo, rw2 | b->bi_rw);
submit_bio(rw2, b);
}
diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c
index ed550f4..85863df 100644
--- a/drivers/block/ploop/io_kaio.c
+++ b/drivers/block/ploop/io_kaio.c
@@ -69,6 +69,8 @@ static void kaio_complete_io_state(struct ploop_request * preq)
unsigned long flags;
int post_fsync = 0;
int need_fua = !!(preq->req_rw & REQ_FUA);
+   unsigned long state = READ_ONCE(preq->state);
+   int reloc = !!(state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL));
 
if (preq->error || !(preq->req_rw & REQ_FUA) ||
preq->eng_state == PLOOP_E_INDEX_READ ||
@@ -80,9 +82,9 @@ static void kaio_complete_io_state(struct ploop_request * preq)
}
 
/* Convert requested fua to fsync */
-   if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state) ||
-   test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state) ||
-   (need_fua && !ploop_req_delay_fua_possible(preq))) {
+   if (test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state) ||
+   (need_fua && !ploop_req_delay_fua_possible(preq)) ||
+   (reloc && ploop_req_delay_fua_possible(preq))) {
post_fsync = 1;
preq->req_rw &= ~REQ_FUA;
}
diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c
index 915a216..1883674 100644
--- a/drivers/block/ploop/map.c
+++ b/drivers/block/ploop/map.c
@@ -909,6 +909,7 @@ void ploop_index_update(struct ploop_request * preq)
struct page * page;
sector_t sec;
unsigned long rw;
+   unsigned long state = READ_ONCE(preq->state);
 
/* No way back, we are going to initiate index write. */
 
@@ -961,10 +962,6 @@ void ploop_index_update(struct ploop_request * preq)
__TRACE("wbi %p %u %p\n", preq, preq->req_cluster, m);
plo->st.map_single_writes++;
	top_delta->ops->map_index(top_delta, m->mn_start, &sec);

[Devel] [PATCH rh7 5/9] ploop: resurrect delay_fua for io_direct

2016-06-23 Thread Maxim Patlasov
Recent commit c2247f3745, while fixing barriers for ordinary
requests, accidentally smashed the delay_fua optimization for
io_direct by:

> +   bio->bi_rw |= bw.cur->bi_rw & (REQ_FLUSH | REQ_FUA);

The idea is the following: if at least one incoming bio is marked as
FUA (this is actually equivalent to the (rw & REQ_FUA) check), and
eng_state == E_DATA_WBI, we can delay FUA until index update and
implement it there via REQ_FLUSH.

It is not clear whether this optimization provides any benefits, but
since we have lived with it for so long, let's keep it for now.

The patch removes PLOOP_REQ_FORCE_FLUSH entirely because it's
easier to use the REQ_FLUSH bit in preq->req_rw instead.
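
A compact model of that decision (toy code with illustrative names, not the driver's functions): FUA is delayed only while the request still has an index write-back ahead of it.

#include <stdbool.h>

#define E_DATA_WBI  1   /* illustrative stand-ins for the engine states */
#define E_COMPLETE  2

/* Toy model: an incoming FUA may be postponed to the index update
 * (and realized there as a FLUSH) only when eng_state == E_DATA_WBI. */
static bool can_delay_fua(int eng_state, bool incoming_fua)
{
	return incoming_fua && eng_state == E_DATA_WBI;
}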

Signed-off-by: Maxim Patlasov 
---
 drivers/block/ploop/io_direct.c |   15 ++-
 drivers/block/ploop/map.c   |   25 ++---
 include/linux/ploop/ploop.h |1 -
 3 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index db82a61..1ea2008 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -92,23 +92,19 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq,
int preflush;
int postfua = 0;
int write = !!(rw & REQ_WRITE);
+   int delayed_fua = 0;
 
trace_submit(preq);
 
preflush = !!(rw & REQ_FLUSH);
 
-   if (test_and_clear_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state))
-   preflush = 1;
-
	if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state))
postfua = 1;
 
-   if (!postfua && ploop_req_delay_fua_possible(preq) && (rw & REQ_FUA)) {
-
+   if ((rw & REQ_FUA) && ploop_req_delay_fua_possible(preq)) {
/* Mark req that delayed flush required */
-   set_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state);
-   } else if (rw & REQ_FUA) {
-   postfua = 1;
+   preq->req_rw |= (REQ_FLUSH | REQ_FUA);
+   delayed_fua = 1;
}
 
rw &= ~(REQ_FLUSH | REQ_FUA);
@@ -222,7 +218,8 @@ flush_bio:
goto flush_bio;
}
 
-   bio->bi_rw |= bw.cur->bi_rw & (REQ_FLUSH | REQ_FUA);
+   bio->bi_rw |= bw.cur->bi_rw &
+   (REQ_FLUSH | delayed_fua ? 0 : REQ_FUA);
bw.bv_off += copy;
size -= copy >> 9;
sec += copy >> 9;
diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c
index ae6cc15..f87fb08 100644
--- a/drivers/block/ploop/map.c
+++ b/drivers/block/ploop/map.c
@@ -908,6 +908,7 @@ void ploop_index_update(struct ploop_request * preq)
int old_level;
struct page * page;
sector_t sec;
+   unsigned long rw;
 
/* No way back, we are going to initiate index write. */
 
@@ -965,8 +966,14 @@ void ploop_index_update(struct ploop_request * preq)
	    test_bit(PLOOP_REQ_RELOC_S, &preq->state))
		set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
 
-   top_delta->io.ops->write_page(_delta->io, preq, page, sec,
- preq->req_rw & REQ_FUA);
+   rw = (preq->req_rw & (REQ_FUA | REQ_FLUSH));
+
+   /* We've just set REQ_FLUSH in rw, ->write_page() below
+  will do the FLUSH */
+   preq->req_rw &= ~REQ_FLUSH;
+
+   top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, rw);
+
put_page(page);
return;
 
@@ -1085,7 +1092,8 @@ static void map_wb_complete(struct map_node * m, int err)
int delayed = 0;
unsigned int idx;
sector_t sec;
-   int fua, force_fua;
+   int force_fua;
+   unsigned long rw;
 
/* First, complete processing of written back indices,
 * finally instantiate indices in mapping cache.
@@ -1155,7 +1163,7 @@ static void map_wb_complete(struct map_node * m, int err)
copy_index_for_wb(page, m, top_delta->level);
 
main_preq = NULL;
-   fua = 0;
+   rw = 0;
force_fua = 0;
 
	list_for_each_safe(cursor, tmp, &m->io_queue) {
@@ -1175,8 +1183,11 @@ static void map_wb_complete(struct map_node * m, int err)
break;
}
 
-   if (preq->req_rw & REQ_FUA)
-   fua = 1;
+   rw |= (preq->req_rw & (REQ_FLUSH | REQ_FUA));
+
+   /* We've just set REQ_FLUSH in rw, ->write_page() below
+  will do the FLUSH */
+   preq->req_rw &= ~REQ_FLUSH;
 
	if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
	    test_bit(PLOOP_REQ_RELOC_S, &preq->state))
@@ -1211,7 +1222,7 @@ static void map_wb_complete(struct map_node * m, int err)
		set_bit(PLOOP_REQ_FORCE_FUA, &main_preq->state);
 
	top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec,
- fua ? REQ_FUA : 0);
+ rw);

[Devel] [PATCH rh7 6/9] ploop: remove preflush from dio_submit

2016-06-23 Thread Maxim Patlasov
After commit c2247f3745, which fixed barriers for ordinary
requests, and the previous patch, which fixed delay_fua,
the legacy code in dio_submit that processed
(preq->req_rw & REQ_FLUSH) by setting REQ_FLUSH on
the first outgoing bio must die: it was incorrect
anyway (we don't wait for completion of the first
bio before sending the others).
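
To spell out why the removed code was not a real barrier, here is a schematic of the submission order (pseudo-sequence; b1..b3 are the outgoing bios of one request):

/* What the removed code did:
 *
 *   submit_bio(rw | REQ_FLUSH, b1);  - cache flush ordered only against b1
 *   submit_bio(rw, b2);              - submitted without waiting for b1
 *   submit_bio(rw, b3);              - ditto
 *
 * Since b2/b3 are sent before b1 (and its flush) completes, the FLUSH
 * on b1 does not act as a barrier for the whole request.
 */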

Signed-off-by: Maxim Patlasov 
---
 drivers/block/ploop/io_direct.c |7 ---
 1 file changed, 7 deletions(-)

diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index 1ea2008..ee3cd5c 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -89,15 +89,12 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq,
sector_t sec, nsec;
int err;
struct bio_list_walk bw;
-   int preflush;
int postfua = 0;
int write = !!(rw & REQ_WRITE);
int delayed_fua = 0;
 
trace_submit(preq);
 
-   preflush = !!(rw & REQ_FLUSH);
-
	if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state))
postfua = 1;
 
@@ -236,10 +233,6 @@ flush_bio:
b->bi_private = preq;
b->bi_end_io = dio_endio_async;
 
-   if (unlikely(preflush)) {
-   rw2 |= REQ_FLUSH;
-   preflush = 0;
-   }
if (unlikely(postfua && !bl.head))
rw2 |= REQ_FUA;
 



[Devel] [PATCH rh7 4/9] ploop: minor rework of ->write_page() io method

2016-06-23 Thread Maxim Patlasov
From: Dmitry Monakhov 

No functional changes. Next patch will use this
rework to pass REQ_FLUSH to dio_write_page().

The patch is actually a part of Dima's patch:

> [PATCH 3/3] ploop: fixup FORCE_{FLUSH,FUA} handling v3

Signed-off-by: Maxim Patlasov 
---
 drivers/block/ploop/io_direct.c |5 ++---
 drivers/block/ploop/io_kaio.c   |8 +---
 drivers/block/ploop/map.c   |5 +++--
 include/linux/ploop/ploop.h |2 +-
 4 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index 0907540..db82a61 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -1505,15 +1505,14 @@ dio_read_page(struct ploop_io * io, struct ploop_request * preq,
 
 static void
 dio_write_page(struct ploop_io * io, struct ploop_request * preq,
-  struct page * page, sector_t sec, int fua)
+  struct page * page, sector_t sec, unsigned long rw)
 {
if (!(io->files.file->f_mode & FMODE_WRITE)) {
PLOOP_FAIL_REQUEST(preq, -EBADF);
return;
}
 
-   dio_io_page(io, WRITE | (fua ? REQ_FUA : 0) | REQ_SYNC,
-   preq, page, sec);
+   dio_io_page(io, rw | WRITE | REQ_SYNC, preq, page, sec);
 }
 
 static int
diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c
index e4e4411..73edc5e 100644
--- a/drivers/block/ploop/io_kaio.c
+++ b/drivers/block/ploop/io_kaio.c
@@ -612,12 +612,14 @@ kaio_read_page(struct ploop_io * io, struct ploop_request * preq,
 
 static void
 kaio_write_page(struct ploop_io * io, struct ploop_request * preq,
-struct page * page, sector_t sec, int fua)
+struct page * page, sector_t sec, unsigned long rw)
 {
ploop_prepare_tracker(preq, sec);
 
-   /* No FUA in kaio, convert it to fsync */
-   if (fua)
+   /* No FUA in kaio, convert it to fsync. Don't care
+  about REQ_FLUSH: only io_direct relies on it,
+  io_kaio implements delay_fua in another way... */
+   if (rw & REQ_FUA)
		set_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state);
 
kaio_io_page(io, IOCB_CMD_WRITE_ITER, preq, page, sec);
diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c
index 3ba8a22..ae6cc15 100644
--- a/drivers/block/ploop/map.c
+++ b/drivers/block/ploop/map.c
@@ -966,7 +966,7 @@ void ploop_index_update(struct ploop_request * preq)
		set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
 
	top_delta->io.ops->write_page(&top_delta->io, preq, page, sec,
- !!(preq->req_rw & REQ_FUA));
+ preq->req_rw & REQ_FUA);
put_page(page);
return;
 
@@ -1210,7 +1210,8 @@ static void map_wb_complete(struct map_node * m, int err)
if (force_fua)
		set_bit(PLOOP_REQ_FORCE_FUA, &main_preq->state);
 
-   top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec, fua);
+   top_delta->io.ops->write_page(&top_delta->io, main_preq, page, sec,
+ fua ? REQ_FUA : 0);
put_page(page);
 }
 
diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h
index e1d8686..3e53b35 100644
--- a/include/linux/ploop/ploop.h
+++ b/include/linux/ploop/ploop.h
@@ -164,7 +164,7 @@ struct ploop_io_ops
void(*read_page)(struct ploop_io * io, struct ploop_request * preq,
 struct page * page, sector_t sec);
void(*write_page)(struct ploop_io * io, struct ploop_request * preq,
- struct page * page, sector_t sec, int fua);
+ struct page * page, sector_t sec, unsigned long rw);
 
 
int (*sync_read)(struct ploop_io * io, struct page * page,



[Devel] [PATCH rh7 3/9] ploop: resurrect delayed_fua for io_kaio

2016-06-23 Thread Maxim Patlasov
After long thinking it now seems to be clear how
delayed_fua was supposed to work for io_kaio:

1) An incoming bio marked as REQ_FUA leads to this bit
set in preq->req_rw.

2) kaio_submit does nothing with this bit in preq->req_rw.
It only initiates sending data by aio_kernel_submit.

3) When userspace ACKs this WRITE, kaio_complete_io_state
discovers that even though REQ_FUA bit is set in preq->req_rw,
eng_state == E_DATA_WBI, so we can delay flush until index
update.

NB: It is crucial here, that preq->req_rw still needs REQ_FUA
bit set!

4) index update calls ->write_page() with fua=1 because
it detects REQ_FUA bit set in preq->req_rw.

5) kaio_write_page observes fua=1 and so sets PLOOP_REQ_KAIO_FSYNC
in preq->state. Then it initiates sending data (BAT update).

6) When userspace ACKs this WRITE (BAT update),
kaio_complete_io_state detects PLOOP_REQ_KAIO_FSYNC bit set,
so it clears it and enforces post_fsync=1.

The patch fixes 3) that was broken so far.
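
Steps 3) and 6) condense into the following toy decision helper (illustrative parameter names; the real logic lives in kaio_complete_io_state()):

#include <stdbool.h>

/* Toy model of when io_kaio converts FUA to an fsync after a WRITE
 * has been ACKed by userspace. */
static bool kaio_needs_post_fsync(bool req_fua, bool kaio_fsync_bit,
				  bool delay_fua_possible)
{
	if (kaio_fsync_bit)                 /* step 6: BAT update just ACKed */
		return true;
	if (req_fua && !delay_fua_possible) /* step 3: cannot delay any more */
		return true;
	return false;                       /* step 3: delay until index update */
}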

Signed-off-by: Maxim Patlasov 
---
 drivers/block/ploop/io_kaio.c |   12 +---
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c
index 69df456..e4e4411 100644
--- a/drivers/block/ploop/io_kaio.c
+++ b/drivers/block/ploop/io_kaio.c
@@ -68,6 +68,7 @@ static void kaio_complete_io_state(struct ploop_request * preq)
struct ploop_device * plo   = preq->plo;
unsigned long flags;
int post_fsync = 0;
+   int need_fua = !!(preq->req_rw & REQ_FUA);
 
if (preq->error || !(preq->req_rw & REQ_FUA) ||
preq->eng_state == PLOOP_E_INDEX_READ ||
@@ -80,14 +81,11 @@ static void kaio_complete_io_state(struct ploop_request * preq)
 
/* Convert requested fua to fsync */
	if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state) ||
-   test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state))
+   test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state) ||
+   (need_fua && !ploop_req_delay_fua_possible(preq))) {
post_fsync = 1;
-
-   if (!post_fsync &&
-   !(ploop_req_delay_fua_possible(preq) && (preq->req_rw & REQ_FUA)))
-   post_fsync = 1;
-
-   preq->req_rw &= ~REQ_FUA;
+   preq->req_rw &= ~REQ_FUA;
+   }
 
if (post_fsync) {
		spin_lock_irqsave(&plo->lock, flags);



[Devel] [PATCH rh7 2/9] ploop: minor rework of ploop_req_delay_fua_possible

2016-06-23 Thread Maxim Patlasov
No functional changes. The patch simplifies ploop_req_delay_fua_possible
to make it more suitable for the next patch. As was recently discussed,
"eng_state == E_DATA_WBI" is less error-prone than
"eng_state != E_COMPLETE".

Note how the patch makes a bug in kaio_complete_io_state() obvious:
if !(preq->req_rw & REQ_FUA), it must not matter what
ploop_req_delay_fua_possible() returns! I.e., eng_state == E_COMPLETE is
not sufficient ground for post_fsync=1 if REQ_FUA is not set.

Signed-off-by: Maxim Patlasov 
---
 drivers/block/ploop/io_direct.c |2 +-
 drivers/block/ploop/io_kaio.c   |3 +--
 include/linux/ploop/ploop.h |   15 ++-
 3 files changed, 4 insertions(+), 16 deletions(-)

diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index 3acae79..0907540 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -103,7 +103,7 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq,
	if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state))
postfua = 1;
 
-   if (!postfua && ploop_req_delay_fua_possible(rw, preq)) {
+   if (!postfua && ploop_req_delay_fua_possible(preq) && (rw & REQ_FUA)) {
 
/* Mark req that delayed flush required */
		set_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state);
diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c
index 81da1c5..69df456 100644
--- a/drivers/block/ploop/io_kaio.c
+++ b/drivers/block/ploop/io_kaio.c
@@ -84,8 +84,7 @@ static void kaio_complete_io_state(struct ploop_request * preq)
post_fsync = 1;
 
if (!post_fsync &&
-   !ploop_req_delay_fua_possible(preq->req_rw, preq) &&
-   (preq->req_rw & REQ_FUA))
+   !(ploop_req_delay_fua_possible(preq) && (preq->req_rw & REQ_FUA)))
post_fsync = 1;
 
preq->req_rw &= ~REQ_FUA;
diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h
index 3441e7e..e1d8686 100644
--- a/include/linux/ploop/ploop.h
+++ b/include/linux/ploop/ploop.h
@@ -613,20 +613,9 @@ void ploop_preq_drop(struct ploop_device * plo, struct list_head *drop_list,
  int keep_locked);
 
 
-static inline int ploop_req_delay_fua_possible(unsigned long rw,
-   struct ploop_request *preq)
+static inline int ploop_req_delay_fua_possible(struct ploop_request *preq)
 {
-   int delay_fua = 0;
-
-   /* In case of eng_state != COMPLETE, we'll do FUA in
-* ploop_index_update(). Otherwise, we should post
-* fua.
-*/
-   if (rw & REQ_FUA) {
-   if (preq->eng_state != PLOOP_E_COMPLETE)
-   delay_fua = 1;
-   }
-   return delay_fua;
+   return preq->eng_state == PLOOP_E_DATA_WBI;
 }
 
 static inline void ploop_req_set_error(struct ploop_request * preq, int err)



[Devel] [PATCH rh7 1/9] ploop: deadcode cleanup

2016-06-23 Thread Maxim Patlasov
From: Dmitry Monakhov 

Rebase Dima's patch on top of rh7-3.10.0-327.18.2.vz7.14.19:

The (rw & REQ_FUA) branch is impossible because REQ_FUA was cleared on the line above.
The logic was moved to ploop_req_delay_fua_possible() a long time ago.

Signed-off-by: Dmitry Monakhov 
Signed-off-by: Maxim Patlasov 
---
 drivers/block/ploop/io_direct.c |   10 --
 1 file changed, 10 deletions(-)

diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index 50c0ed1..3acae79 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -113,16 +113,6 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq,
 
rw &= ~(REQ_FLUSH | REQ_FUA);
 
-
-   /* In case of eng_state != COMPLETE, we'll do FUA in
-* ploop_index_update(). Otherwise, we should mark
-* last bio as FUA here. */
-   if (rw & REQ_FUA) {
-   rw &= ~REQ_FUA;
-   if (preq->eng_state == PLOOP_E_COMPLETE)
-   postfua = 1;
-   }
-
	bio_list_init(&bl);
 
if (iblk == PLOOP_ZERO_INDEX)



Re: [Devel] [RH7 PATCH 5/6] ploop: fixup barrier handling during relocation

2016-06-23 Thread Maxim Patlasov
No reason to keep it along with the optimization patches. Please see the
port I'll send later today.


On 06/23/2016 10:25 AM, Dmitry Monakhov wrote:

The barrier code is broken in many ways:
Currently only ->dio_submit() handles PLOOP_REQ_FORCE_{FLUSH,FUA} correctly.
But a request can also go through ->dio_submit_alloc()->dio_submit_pad and
write_page (for indexes).
So in case of grow_dev we have the following sequence:

E_RELOC_DATA_READ:
  ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
   ->delta->allocate
  ->io->submit_allloc: dio_submit_alloc
->dio_submit_pad
E_DATA_WBI : data written, time to update index
   ->delta->allocate_complete:ploop_index_update
 ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
 ->write_page
 ->ploop_map_wb_complete
   ->ploop_wb_complete_post_process
 ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
E_RELOC_NULLIFY:

->submit()

Once we have delayed_flush engine it is easy to implement correct scheme for
both engines.

E_RELOC_DATA_READ ->submit_allloc => wait->post_submit->issue_flush
E_DATA_WBI ->ploop_index_update with FUA
E_RELOC_NULLIFY ->submit: => wait->post_submit->issue_flush

This makes reloc sequence optimal:
RELOC_S: R1, W2,WAIT,FLUSH, WBI:FUA
RELOC_A: R1, W2,WAIT,FLUSH, WBI:FUA, W1:NULLIFY,WAIT, FLUSH

https://jira.sw.ru/browse/PSBM-47107
Signed-off-by: Dmitry Monakhov 
---
  drivers/block/ploop/dev.c |  2 +-
  drivers/block/ploop/io_kaio.c |  3 +--
  drivers/block/ploop/map.c | 28 ++--
  3 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index 95e3067..090cd2d 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -2533,7 +2533,7 @@ restart:
sbl.head = sbl.tail = preq->aux_bio;
  
  		/* Relocated data write required sync before BAT updatee */

-   set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
+   preq->req_rw |= REQ_FUA;
  
   		if (test_bit(PLOOP_REQ_RELOC_S, &preq->state)) {

preq->eng_state = PLOOP_E_DATA_WBI;
diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c
index 5341fd5..5217ab4 100644
--- a/drivers/block/ploop/io_kaio.c
+++ b/drivers/block/ploop/io_kaio.c
@@ -72,8 +72,7 @@ static void kaio_complete_io_state(struct ploop_request * preq)
}
  
  	/* Convert requested fua to fsync */

-   if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state) ||
-   test_and_clear_bit(PLOOP_REQ_DEL_FLUSH, &preq->state) ||
+   if (test_and_clear_bit(PLOOP_REQ_DEL_FLUSH, &preq->state) ||
    test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state))
post_fsync = 1;
  
diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c

index 3a6365d..ef351fb 100644
--- a/drivers/block/ploop/map.c
+++ b/drivers/block/ploop/map.c
@@ -901,6 +901,8 @@ void ploop_index_update(struct ploop_request * preq)
int old_level;
struct page * page;
sector_t sec;
+   int fua = !!(preq->req_rw & REQ_FUA);
+   unsigned long state = READ_ONCE(preq->state);
  
  	/* No way back, we are going to initiate index write. */
  
@@ -954,12 +956,11 @@ void ploop_index_update(struct ploop_request * preq)

plo->st.map_single_writes++;
	top_delta->ops->map_index(top_delta, m->mn_start, &sec);
/* Relocate requires consistent writes, mark such reqs appropriately */
-   if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
-   test_bit(PLOOP_REQ_RELOC_S, &preq->state))
-   set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
-
-   top_delta->io.ops->write_page(&top_delta->io, preq, page, sec,
- !!(preq->req_rw & REQ_FUA));
+   if (state & (PLOOP_REQ_RELOC_A_FL | PLOOP_REQ_RELOC_S_FL)) {
+   WARN_ON(state & PLOOP_REQ_DEL_FLUSH_FL);
+   fua = 1;
+   }
+   top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, fua);
put_page(page);
return;
  
@@ -1063,7 +1064,7 @@ static void map_wb_complete_post_process(struct ploop_map *map,

 * (see dio_submit()). So fsync of EXT4 image doesnt help us.
 * We need to force sync of nullified blocks.
 */
-   set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
+   preq->req_rw |= REQ_FUA;
	top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
				  &sbl, preq->iblock, 1<<plo->cluster_log);
[...]
 	list_for_each_safe(cursor, tmp, &m->io_queue) {

struct ploop_request * preq;
+   unsigned long state;
  
  		preq = list_entry(cursor, struct ploop_request, list);

+   state = READ_ONCE(preq->state);
  
  		switch (preq->eng_state) {

case PLOOP_E_INDEX_DELAY:
@@ -1171,9 +1174,10 @@ static void map_wb_complete(struct map_node * m, int err)

Re: [Devel] [RH7 PATCH 6/6] patch ploop_state_debugging.patch

2016-06-23 Thread Maxim Patlasov

OK

On 06/23/2016 10:25 AM, Dmitry Monakhov wrote:

Signed-off-by: Dmitry Monakhov 
---
  drivers/block/ploop/dev.c | 6 ++
  1 file changed, 6 insertions(+)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index 090cd2d..9bf8592 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -1232,6 +1232,12 @@ static void ploop_complete_request(struct ploop_request * preq)
}
preq->bl.tail = NULL;
  
+	if (!preq->error) {

+   unsigned long state = READ_ONCE(preq->state);
+   WARN_ON(state & (PLOOP_REQ_POST_SUBMIT_FL|
+PLOOP_REQ_DEL_CONV_FL |
+PLOOP_REQ_DEL_FLUSH_FL ));
+   }
	if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
	    test_bit(PLOOP_REQ_RELOC_S, &preq->state)) {
if (preq->error)




Re: [Devel] [RH7 PATCH 4/6] ploop: io_kaio support PLOOP_REQ_DEL_FLUSH

2016-06-23 Thread Maxim Patlasov

On 06/23/2016 10:25 AM, Dmitry Monakhov wrote:

Currently no one tags preqs with such a bit, but let it be here for symmetry


I hate dead code (things that are impossible to verify by any test). Can we
add this "symmetry" check later, along with a patch that makes kaio set
this bit? (i.e.: kaio doesn't set it; ergo it must not check it)




Signed-off-by: Dmitry Monakhov 
---
  drivers/block/ploop/io_kaio.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c
index bee2cee..5341fd5 100644
--- a/drivers/block/ploop/io_kaio.c
+++ b/drivers/block/ploop/io_kaio.c
@@ -73,6 +73,7 @@ static void kaio_complete_io_state(struct ploop_request * preq)
  
  	/* Convert requested fua to fsync */

	if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state) ||
+   test_and_clear_bit(PLOOP_REQ_DEL_FLUSH, &preq->state) ||
	    test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state))
post_fsync = 1;
  




Re: [Devel] [RH7 PATCH 3/6] ploop: add delayed flush support

2016-06-23 Thread Maxim Patlasov

On 06/23/2016 10:25 AM, Dmitry Monakhov wrote:

dio_submit and dio_submit_pad may produce several bios. This makes
processing of REQ_FUA complicated because, in order to preserve correctness,
we have to tag each bio with the FUA flag, which is suboptimal.
Obviously there is room for optimization here: once all bios have been
acknowledged by the lower layer, we may issue an empty barrier aka ->issue_flush().
The post_submit callback is the place where all bios have already completed.

b1:FUA, b2:FUA, b3:FUA =>  b1,b2,b3,wait_for_bios,bX:FLUSH

This allows us to remove all this REQ_FORCE_{FLUSH,FUA} crap and


It seems we can remove REQ_FORCE_{FLUSH,FUA} right now. Only RELOC_A|S
needs it, and we can fix them in a simple and straightforward way --
essentially, your 5th patch must be enough.


Btw, this patch doesn't disable the logic of passing FUA from incoming
bio-s to outgoing ones (commit c2247f3745). Was that by mistake or deliberate?




Signed-off-by: Dmitry Monakhov 
---
  drivers/block/ploop/io_direct.c | 48 +
  include/linux/ploop/ploop.h |  2 ++
  2 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index 195d318..752a9c3e 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -82,31 +82,13 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq,
sector_t sec, nsec;
int err;
struct bio_list_walk bw;
-   int preflush;
-   int postfua = 0;
+   int preflush = !!(rw & REQ_FLUSH);
+   int postflush = !!(rw & REQ_FUA);
int write = !!(rw & REQ_WRITE);
  
  	trace_submit(preq);
  
-	preflush = !!(rw & REQ_FLUSH);

-
-   if (test_and_clear_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state))
-   preflush = 1;
-
-   if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state))
-   postfua = 1;
-
-   if (!postfua && ploop_req_delay_fua_possible(rw, preq)) {
-
-   /* Mark req that delayed flush required */
-   set_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state);
-   } else if (rw & REQ_FUA) {
-   postfua = 1;
-   }
-
rw &= ~(REQ_FLUSH | REQ_FUA);
-
-
	bio_list_init(&bl);
  
  	if (iblk == PLOOP_ZERO_INDEX)

@@ -237,13 +219,14 @@ flush_bio:
rw2 |= REQ_FLUSH;
preflush = 0;
}
-   if (unlikely(postfua && !bl.head))
-   rw2 |= REQ_FUA;
-
ploop_acc_ff_out(preq->plo, rw2 | b->bi_rw);
submit_bio(rw2, b);
}
-
+   /* TODO: minor optimization is possible for single bio case */
+   if (postflush) {
+   set_bit(PLOOP_REQ_DEL_FLUSH, &preq->state);
+   ploop_add_post_submit(io, preq);
+   }
ploop_complete_io_request(preq);
return;
  
@@ -523,9 +506,10 @@ dio_convert_extent(struct ploop_io *io, struct ploop_request * preq)

  (loff_t)sec << 9, clu_siz);
  
  	/* highly unlikely case: FUA coming to a block not provisioned yet */

-   if (!err && force_sync)
+   if (!err && force_sync) {
+   clear_bit(PLOOP_REQ_DEL_FLUSH, &preq->state);
err = io->ops->sync(io);
-
+   }
if (!force_sync) {
	spin_lock_irq(&plo->lock);
io->io_count++;
@@ -546,7 +530,12 @@ dio_post_submit(struct ploop_io *io, struct ploop_request * preq)
	if (test_and_clear_bit(PLOOP_REQ_DEL_CONV, &preq->state))
dio_convert_extent(io, preq);
  
+	if (test_and_clear_bit(PLOOP_REQ_DEL_FLUSH, &preq->state)) {

+   io->ops->issue_flush(io, preq);
+   return 1;
+   }
return 0;
+
  }
  
  /* Submit the whole cluster. If preq contains only partial data

@@ -562,7 +551,6 @@ dio_submit_pad(struct ploop_io *io, struct ploop_request * preq,
sector_t sec, end_sec, nsec, start, end;
struct bio_list_walk bw;
int err;
-
	bio_list_init(&bl);
  
  	/* sec..end_sec is the range which we are going to write */

@@ -694,7 +682,11 @@ flush_bio:
ploop_acc_ff_out(preq->plo, rw | b->bi_rw);
submit_bio(rw, b);
}
-
+   /* TODO: minor optimization is possible for single bio case */
+   if (preq->req_rw &  REQ_FUA) {
+   set_bit(PLOOP_REQ_DEL_FLUSH, &preq->state);
+   ploop_add_post_submit(io, preq);
+   }
ploop_complete_io_request(preq);
return;
  
diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h

index 4c52a40..5076f16 100644
--- a/include/linux/ploop/ploop.h
+++ b/include/linux/ploop/ploop.h
@@ -472,6 +472,7 @@ enum
PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */
PLOOP_REQ_ALLOW_READS, /* READs are allowed for given req_cluster */
PLOOP_REQ_DEL_CONV,/* post_submit: conversion required */
+   PLOOP_REQ_DEL_FLUSH,   /* post_submit: REQ_FLUSH required */

Re: [Devel] [RH7 PATCH 1/6] ploop: generalize post_submit stage

2016-06-23 Thread Maxim Patlasov

On 06/23/2016 10:25 AM, Dmitry Monakhov wrote:

Currently post_submit() is used only for convert_unwritten_extents.
But post_submit() is a good transition point where all submitted
data has been completed by the lower layer and a new state is about to be
processed. It is an ideal point where we can perform transition actions.
For example:
  io_direct: Convert unwritten extents
  io_direct: issue empty barrier bio in order to simulate postflush
  io_direct,io_kaio: queue to fsync queue
  Etc.

This patch does not change anything, but prepare post_submit for
more logic which will be added later.


If we decide to have DEL_FLUSH, I'm OK with this approach. Maybe with 
some renaming:


s/PLOOP_REQ_DEL_FLUSH/PLOOP_REQ_FLUSH_DELAYED
s/PLOOP_REQ_DEL_CONV/PLOOP_REQ_CONV_DELAYED
s/post_submit/pre_process
s/PLOOP_REQ_POST_SUBMIT/PLOOP_REQ_PRE_PROCESS



Signed-off-by: Dmitry Monakhov 
---
  drivers/block/ploop/dev.c   | 10 ++
  drivers/block/ploop/io_direct.c | 15 ---
  include/linux/ploop/ploop.h | 12 +++-
  3 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index e405232..e8b0304 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -2351,10 +2351,12 @@ static void ploop_req_state_process(struct ploop_request * preq)
preq->prealloc_size = 0; /* only for sanity */
}
  
-	if (test_bit(PLOOP_REQ_POST_SUBMIT, &preq->state)) {

-   preq->eng_io->ops->post_submit(preq->eng_io, preq);
-   clear_bit(PLOOP_REQ_POST_SUBMIT, &preq->state);
+   if (test_and_clear_bit(PLOOP_REQ_POST_SUBMIT, &preq->state)) {
+   struct ploop_io *io = preq->eng_io;
+
preq->eng_io = NULL;
+   if (preq->eng_io->ops->post_submit(io, preq))
+   goto out;
}
  
  restart:

@@ -2633,7 +2635,7 @@ restart:
default:
BUG();
}
-
+out:
if (release_ioc) {
struct io_context * ioc = current->io_context;
current->io_context = saved_ioc;
diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index f1812fe..ec905b4 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -416,8 +416,8 @@ try_again:
}
  
  		preq->iblock = iblk;

-   preq->eng_io = io;
-   set_bit(PLOOP_REQ_POST_SUBMIT, &preq->state);
+   set_bit(PLOOP_REQ_DEL_CONV, &preq->state);
+   ploop_add_post_submit(io, preq);
dio_submit_pad(io, preq, sbl, size, em);
err = 0;
goto end_write;
@@ -501,7 +501,7 @@ end_write:
  }
  
  static void

-dio_post_submit(struct ploop_io *io, struct ploop_request * preq)
+dio_convert_extent(struct ploop_io *io, struct ploop_request * preq)
  {
struct ploop_device *plo = preq->plo;
sector_t sec = (sector_t)preq->iblock << preq->plo->cluster_log;
@@ -540,6 +540,15 @@ dio_post_submit(struct ploop_io *io, struct ploop_request * preq)
}
  }
  
+static int

+dio_post_submit(struct ploop_io *io, struct ploop_request * preq)
+{
+   if (test_and_clear_bit(PLOOP_REQ_DEL_CONV, &preq->state))
+   dio_convert_extent(io, preq);
+
+   return 0;
+}
+
  /* Submit the whole cluster. If preq contains only partial data
   * within the cluster, pad the rest of cluster with zeros.
   */
diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h
index 0fba25e..4c52a40 100644
--- a/include/linux/ploop/ploop.h
+++ b/include/linux/ploop/ploop.h
@@ -148,7 +148,7 @@ struct ploop_io_ops
			  struct bio_list *sbl, iblock_t iblk, unsigned int size);
void(*submit_alloc)(struct ploop_io *, struct ploop_request *,
struct bio_list *sbl, unsigned int size);
-   void(*post_submit)(struct ploop_io *, struct ploop_request *);
+   int (*post_submit)(struct ploop_io *, struct ploop_request *);
  
  	int	(*disable_merge)(struct ploop_io * io, sector_t isector, unsigned int len);

int (*fastmap)(struct ploop_io * io, struct bio *orig_bio,
@@ -471,6 +471,7 @@ enum
PLOOP_REQ_POST_SUBMIT, /* preq needs post_submit processing */
PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */
PLOOP_REQ_ALLOW_READS, /* READs are allowed for given req_cluster */
+   PLOOP_REQ_DEL_CONV,/* post_submit: conversion required */
PLOOP_REQ_FSYNC_DONE,  /* fsync_thread() performed f_op->fsync() */
  };
  
@@ -479,6 +480,8 @@ enum

  #define PLOOP_REQ_RELOC_S_FL (1 << PLOOP_REQ_RELOC_S)
  #define PLOOP_REQ_DISCARD_FL (1 << PLOOP_REQ_DISCARD)
  #define PLOOP_REQ_ZERO_FL (1 << PLOOP_REQ_ZERO)
+#define PLOOP_REQ_POST_SUBMIT_FL (1 << PLOOP_REQ_POST_SUBMIT)
+#define PLOOP_REQ_DEL_CONV_FL (1 << PLOOP_REQ_DEL_CONV)
  
  enum

  {
@@ -767,6 +770,13 @@ static inline void ploop_entry_qlen_dec(struct ploop_request * preq)

Re: [Devel] [RH7 PATCH 0/6] RFC ploop: Barrier fix patch set v3

2016-06-23 Thread Maxim Patlasov

Dima,

On 06/23/2016 10:25 AM, Dmitry Monakhov wrote:

Here is 3'rd version of barrier fix patches based on recent fixes.
This is an RFC version. I do not have time to test it before tomorrow,
Max please review is briefly and tell be your oppinion about general idea.


It's hard to review w/o context, and the series fails to apply to our vz7
tree. So I spent a pretty long while trying to find a tag or commit where
it's possible to apply your patches w/o rejects. The first patch wants
PLOOP_REQ_ALLOW_READS in ploop.h:



@@ -471,6 +471,7 @@ enum
 PLOOP_REQ_POST_SUBMIT, /* preq needs post_submit processing */
 PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */
 PLOOP_REQ_ALLOW_READS, /* READs are allowed for given req_cluster */
+PLOOP_REQ_DEL_CONV,/* post_submit: conversion required */
 PLOOP_REQ_FSYNC_DONE,  /* fsync_thread() performed f_op->fsync() */
 };


We removed ALLOW_READS by 06e7586 (Jun 3), so you must have 
rh7-3.10.0-327.18.2.vz7.14.11 or earlier. But the third patch has:


@@ -562,7 +551,6 @@ dio_submit_pad(struct ploop_io *io, struct ploop_request * preq,

 sector_t sec, end_sec, nsec, start, end;
 struct bio_list_walk bw;
 int err;
-
 	bio_list_init(&bl);

 /* sec..end_sec is the range which we are going to write */


while after applying the first and the second, it looks like:


static void
dio_submit_pad(struct ploop_io *io, struct ploop_request * preq,
   struct bio_list * sbl, unsigned int size,
   struct extent_map *em)
{
struct bio_list bl;
struct bio * bio = NULL;
sector_t sec, end_sec, nsec, start, end;
struct bio_list_walk bw;
int err;
int preflush = !!(preq->req_rw & REQ_FLUSH);

	bio_list_init(&bl);


So the tree you used didn't have that "int preflush = !!(preq->req_rw & 
REQ_FLUSH);" line. But the patch removing this line was committed only 
yesterday, Jun 22 (c2247f3745).


After applying c2247f3745 before your series, another conflict happens 
while applying the third patch: the first hunk assumes the following 
lines in dio_submit:



 rw &= ~(REQ_FLUSH | REQ_FUA);
-
-
 bio_list_init();


But we always (since 2013) had:


rw &= ~(REQ_FLUSH | REQ_FUA);


/* In case of eng_state != COMPLETE, we'll do FUA in
 * ploop_index_update(). Otherwise, we should mark
 * last bio as FUA here. */
if (rw & REQ_FUA) {
rw &= ~REQ_FUA;
if (preq->eng_state == PLOOP_E_COMPLETE)
postfua = 1;
}

bio_list_init();


Hence, I drew the conclusion that we need one of your previous patches to be
applied first. After a very long row of trials and errors I eventually
succeeded:


$ git show c2247f3745 > /tmp/my.diff
$ git checkout rh7-3.10.0-327.18.2.vz7.14.11
$ patch -p1 < ploop-fix-barriers-for-ordinary-requests
$ patch -p1 < ploop-skip-redundant-fsync-for-REQ_FUA-in-post_submit
$ patch -p1 < ploop-deadcode-cleanup
$ patch -p1 < ploop-generalize-post_submit-stage
$ patch -p1 < ploop-generalize-issue_flush
$ patch -p1 < ploop-add-delayed-flush-support
$ patch -p1 < ploop-io_kaio-support-PLOOP_REQ_DEL_FLUSH
$ patch -p1 < ploop-fixup-barrier-handling-during-relocation
$ patch -p1 < patch-ploop_state_debugging.patch



The basic idea is to use the post_submit state to issue an empty FLUSH barrier
in order to complete FUA requests. This allows us to unify all engines (direct and kaio).

This makes FUA processing optimal:
SUBMIT:FUA   :W1{b1,b2,b3,b4..},WAIT,post_submit:FLUSH
SUBMIT_ALLOC:FUA :W1{b1,b2,b3,b4..},WAIT,post_submit:FLUSH, WBI:FUA


The above would be optimal only if all three statements below are true:

1) Lower layers process FUA as post-FLUSH.

I remember that you wrote that in real (linux kernel) life it is always 
true, but somehow I'm not sure about this "always"... Of course, we can 
investigate and eventually nail down the question, but for now I'm not 
convinced.


2) The list of (incoming) bio-s has more than one marked as FUA. 
Otherwise (i.e. if only one is FUA), it must be equivalent (from 
performance perspective): to submit FUA now, or to submit FLUSH later 
(modulo 1) above).


For now, I know only two entities generating FUA: ext4 writes superblock 
and jbd2 commits transaction. In both cases it is one page-sized bio and 
no way to have more than one FUA in queue. Do you know other cases?
(Of course, we know about fsync(), but it generates zero-size FLUSH. The 
way how ploop processes it is not affected by the patches we discussed.)


3) The benefits of delayed FLUSH must outweigh the performance loss
we'll have from the extra WAIT introduced. I have not verified it yet, but I
suspect it must be observable (e.g. by blktrace) that one page-sized bio
marked as FLUSH|FUA completes faster than: submit one page-sized bio
marked as FLUSH, wait for completion, submit a zero-size FLUSH, wait for
completion again. Makes sense?
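
In block-layer terms, the comparison in 3) is between the following two pseudo-sequences (the rw flag names are real 3.10-era macros, the rest is schematic):

/* Pattern A (single FUA-capable page update):
 *   submit_bio(WRITE_FLUSH_FUA, page_bio);   wait for completion;
 *
 * Pattern B (delayed-flush series):
 *   submit_bio(WRITE_FLUSH, page_bio);       wait for completion;
 *   submit_bio(WRITE_FLUSH, empty_bio);      wait for completion again;
 */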


If the three statements above are correct, and given some complexity 
added by the series, and possible 

[Devel] [RH7 PATCH 4/6] ploop: io_kaio support PLOOP_REQ_DEL_FLUSH

2016-06-23 Thread Dmitry Monakhov
Currently no one tags preqs with such a bit, but let it be here for symmetry

Signed-off-by: Dmitry Monakhov 
---
 drivers/block/ploop/io_kaio.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c
index bee2cee..5341fd5 100644
--- a/drivers/block/ploop/io_kaio.c
+++ b/drivers/block/ploop/io_kaio.c
@@ -73,6 +73,7 @@ static void kaio_complete_io_state(struct ploop_request * preq)
 
/* Convert requested fua to fsync */
	if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state) ||
+   test_and_clear_bit(PLOOP_REQ_DEL_FLUSH, &preq->state) ||
	    test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state))
post_fsync = 1;
 
-- 
1.8.3.1



[Devel] [RH7 PATCH 3/6] ploop: add delayed flush support

2016-06-23 Thread Dmitry Monakhov
dio_submit and dio_submit_pad may produce several bios. This makes
processing of REQ_FUA complicated because, in order to preserve correctness,
we have to tag each bio with the FUA flag, which is suboptimal.
Obviously there is room for optimization here: once all bios have been
acknowledged by the lower layer, we may issue an empty barrier aka ->issue_flush().
The post_submit callback is the place where all bios have already completed.

b1:FUA, b2:FUA, b3:FUA =>  b1,b2,b3,wait_for_bios,bX:FLUSH

This allows us to remove all this REQ_FORCE_{FLUSH,FUA} crap and
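
Schematically (pseudo-calls, not the driver's code), the transformation is:

/* Before (per-bio FUA):                    After (delayed flush):
 *   submit_bio(WRITE | REQ_FUA, b1);         submit_bio(WRITE, b1);
 *   submit_bio(WRITE | REQ_FUA, b2);         submit_bio(WRITE, b2);
 *   submit_bio(WRITE | REQ_FUA, b3);         submit_bio(WRITE, b3);
 *                                            wait for b1..b3 to complete;
 *                                            submit_bio(WRITE_FLUSH, empty_bio);
 */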

Signed-off-by: Dmitry Monakhov 
---
 drivers/block/ploop/io_direct.c | 48 +
 include/linux/ploop/ploop.h |  2 ++
 2 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index 195d318..752a9c3e 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -82,31 +82,13 @@ dio_submit(struct ploop_io *io, struct ploop_request * preq,
sector_t sec, nsec;
int err;
struct bio_list_walk bw;
-   int preflush;
-   int postfua = 0;
+   int preflush = !!(rw & REQ_FLUSH);
+   int postflush = !!(rw & REQ_FUA);
int write = !!(rw & REQ_WRITE);
 
trace_submit(preq);
 
-   preflush = !!(rw & REQ_FLUSH);
-
-   if (test_and_clear_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state))
-   preflush = 1;
-
-   if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state))
-   postfua = 1;
-
-   if (!postfua && ploop_req_delay_fua_possible(rw, preq)) {
-
-   /* Mark req that delayed flush required */
-   set_bit(PLOOP_REQ_FORCE_FLUSH, &preq->state);
-   } else if (rw & REQ_FUA) {
-   postfua = 1;
-   }
-
rw &= ~(REQ_FLUSH | REQ_FUA);
-
-
	bio_list_init(&bl);
 
if (iblk == PLOOP_ZERO_INDEX)
@@ -237,13 +219,14 @@ flush_bio:
rw2 |= REQ_FLUSH;
preflush = 0;
}
-   if (unlikely(postfua && !bl.head))
-   rw2 |= REQ_FUA;
-
ploop_acc_ff_out(preq->plo, rw2 | b->bi_rw);
submit_bio(rw2, b);
}
-
+   /* TODO: minor optimization is possible for single bio case */
+   if (postflush) {
+   set_bit(PLOOP_REQ_DEL_FLUSH, &preq->state);
+   ploop_add_post_submit(io, preq);
+   }
ploop_complete_io_request(preq);
return;
 
@@ -523,9 +506,10 @@ dio_convert_extent(struct ploop_io *io, struct ploop_request * preq)
  (loff_t)sec << 9, clu_siz);
 
/* highly unlikely case: FUA coming to a block not provisioned yet */
-   if (!err && force_sync)
+   if (!err && force_sync) {
+   clear_bit(PLOOP_REQ_DEL_FLUSH, &preq->state);
err = io->ops->sync(io);
-
+   }
if (!force_sync) {
	spin_lock_irq(&plo->lock);
io->io_count++;
@@ -546,7 +530,12 @@ dio_post_submit(struct ploop_io *io, struct ploop_request * preq)
	if (test_and_clear_bit(PLOOP_REQ_DEL_CONV, &preq->state))
dio_convert_extent(io, preq);
 
+   if (test_and_clear_bit(PLOOP_REQ_DEL_FLUSH, &preq->state)) {
+   io->ops->issue_flush(io, preq);
+   return 1;
+   }
return 0;
+
 }
 
 /* Submit the whole cluster. If preq contains only partial data
@@ -562,7 +551,6 @@ dio_submit_pad(struct ploop_io *io, struct ploop_request * preq,
sector_t sec, end_sec, nsec, start, end;
struct bio_list_walk bw;
int err;
-
	bio_list_init(&bl);
 
/* sec..end_sec is the range which we are going to write */
@@ -694,7 +682,11 @@ flush_bio:
ploop_acc_ff_out(preq->plo, rw | b->bi_rw);
submit_bio(rw, b);
}
-
+   /* TODO: minor optimization is possible for single bio case */
+   if (preq->req_rw &  REQ_FUA) {
+   set_bit(PLOOP_REQ_DEL_FLUSH, &preq->state);
+   ploop_add_post_submit(io, preq);
+   }
ploop_complete_io_request(preq);
return;
 
diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h
index 4c52a40..5076f16 100644
--- a/include/linux/ploop/ploop.h
+++ b/include/linux/ploop/ploop.h
@@ -472,6 +472,7 @@ enum
PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */
PLOOP_REQ_ALLOW_READS, /* READs are allowed for given req_cluster */
PLOOP_REQ_DEL_CONV,/* post_submit: conversion required */
+   PLOOP_REQ_DEL_FLUSH,   /* post_submit: REQ_FLUSH required */
PLOOP_REQ_FSYNC_DONE,  /* fsync_thread() performed f_op->fsync() */
 };
 
@@ -482,6 +483,7 @@ enum
 #define PLOOP_REQ_ZERO_FL (1 << PLOOP_REQ_ZERO)
 #define PLOOP_REQ_POST_SUBMIT_FL (1 << PLOOP_REQ_POST_SUBMIT)
 #define PLOOP_REQ_DEL_CONV_FL (1 << PLOOP_REQ_DEL_CONV)
+#define PLOOP_REQ_DEL_FLUSH_FL (1 << PLOOP_REQ_DEL_FLUSH)
 
 

[Devel] [RH7 PATCH 5/6] ploop: fixup barrier handling during relocation

2016-06-23 Thread Dmitry Monakhov
The barrier code is broken in many ways:
Currently only ->dio_submit() handles PLOOP_REQ_FORCE_{FLUSH,FUA} correctly.
But a request can also go through ->dio_submit_alloc()->dio_submit_pad and
write_page (for indexes).
So in case of grow_dev we have the following sequence:

E_RELOC_DATA_READ:
 ->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
  ->delta->allocate
 ->io->submit_allloc: dio_submit_alloc
   ->dio_submit_pad
E_DATA_WBI : data written, time to update index
  ->delta->allocate_complete:ploop_index_update
->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
->write_page
->ploop_map_wb_complete
  ->ploop_wb_complete_post_process
->set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
E_RELOC_NULLIFY:

   ->submit()

Once we have delayed_flush engine it is easy to implement correct scheme for
both engines.

E_RELOC_DATA_READ ->submit_allloc => wait->post_submit->issue_flush
E_DATA_WBI ->ploop_index_update with FUA
E_RELOC_NULLIFY ->submit: => wait->post_submit->issue_flush

This makes reloc sequence optimal:
RELOC_S: R1, W2,WAIT,FLUSH, WBI:FUA
RELOC_A: R1, W2,WAIT,FLUSH, WBI:FUA, W1:NULLIFY,WAIT, FLUSH
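
Read the shorthand above as follows (an informal legend, inferred from the relocation flow described in this series):

/* R1         - read the data being relocated from its old location
 * W2         - write that data to the new location
 * WAIT       - wait for all submitted bios to complete
 * FLUSH      - empty barrier issued from post_submit (->issue_flush)
 * WBI:FUA    - write back the updated BAT index page with FUA
 * W1:NULLIFY - RELOC_A only: nullify the block's old location
 */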

https://jira.sw.ru/browse/PSBM-47107
Signed-off-by: Dmitry Monakhov 
---
 drivers/block/ploop/dev.c |  2 +-
 drivers/block/ploop/io_kaio.c |  3 +--
 drivers/block/ploop/map.c | 28 ++--
 3 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index 95e3067..090cd2d 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -2533,7 +2533,7 @@ restart:
sbl.head = sbl.tail = preq->aux_bio;
 
/* Relocated data write required sync before BAT updatee */
-   set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
+   preq->req_rw |= REQ_FUA;
 
	if (test_bit(PLOOP_REQ_RELOC_S, &preq->state)) {
preq->eng_state = PLOOP_E_DATA_WBI;
diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c
index 5341fd5..5217ab4 100644
--- a/drivers/block/ploop/io_kaio.c
+++ b/drivers/block/ploop/io_kaio.c
@@ -72,8 +72,7 @@ static void kaio_complete_io_state(struct ploop_request * preq)
}
 
/* Convert requested fua to fsync */
-   if (test_and_clear_bit(PLOOP_REQ_FORCE_FUA, &preq->state) ||
-   test_and_clear_bit(PLOOP_REQ_DEL_FLUSH, &preq->state) ||
+   if (test_and_clear_bit(PLOOP_REQ_DEL_FLUSH, &preq->state) ||
    test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state))
post_fsync = 1;
 
diff --git a/drivers/block/ploop/map.c b/drivers/block/ploop/map.c
index 3a6365d..ef351fb 100644
--- a/drivers/block/ploop/map.c
+++ b/drivers/block/ploop/map.c
@@ -901,6 +901,8 @@ void ploop_index_update(struct ploop_request * preq)
int old_level;
struct page * page;
sector_t sec;
+   int fua = !!(preq->req_rw & REQ_FUA);
+   unsigned long state = READ_ONCE(preq->state);
 
/* No way back, we are going to initiate index write. */
 
@@ -954,12 +956,11 @@ void ploop_index_update(struct ploop_request * preq)
plo->st.map_single_writes++;
	top_delta->ops->map_index(top_delta, m->mn_start, &sec);
/* Relocate requires consistent writes, mark such reqs appropriately */
-   if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
-   test_bit(PLOOP_REQ_RELOC_S, &preq->state))
-   set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
-
-   top_delta->io.ops->write_page(&top_delta->io, preq, page, sec,
- !!(preq->req_rw & REQ_FUA));
+   if (state & (PLOOP_REQ_RELOC_A_FL | PLOOP_REQ_RELOC_S_FL)) {
+   WARN_ON(state & PLOOP_REQ_DEL_FLUSH_FL);
+   fua = 1;
+   }
+   top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, fua);
put_page(page);
return;
 
@@ -1063,7 +1064,7 @@ static void map_wb_complete_post_process(struct ploop_map *map,
 * (see dio_submit()). So fsync of EXT4 image doesnt help us.
 * We need to force sync of nullified blocks.
 */
-   set_bit(PLOOP_REQ_FORCE_FUA, &preq->state);
+   preq->req_rw |= REQ_FUA;
	top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
				  &sbl, preq->iblock, 1<<plo->cluster_log);
[...]
 	list_for_each_safe(cursor, tmp, &m->io_queue) {
struct ploop_request * preq;
+   unsigned long state;
 
preq = list_entry(cursor, struct ploop_request, list);
+   state = READ_ONCE(preq->state);
 
switch (preq->eng_state) {
case PLOOP_E_INDEX_DELAY:
@@ -1171,9 +1174,10 @@ static void map_wb_complete(struct map_node * m, int err)
if (preq->req_rw & REQ_FUA)
   

[Devel] [RH7 PATCH 2/6] ploop: generalize issue_flush

2016-06-23 Thread Dmitry Monakhov
Currently io->ops->issue_flush is called from only a single place,
but it has the potential to be generic. The patch does not change the actual
logic, but allows ->issue_flush to be called from various places.
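
After this change the caller owns the engine-state transition; the call-site pattern (a sketch based on the ploop_entry_request hunk below, not a new API) becomes:

	/* Sketch: the caller picks the next engine state, then flushes. */
	preq->eng_state = PLOOP_E_COMPLETE;	/* moved out of *_issue_flush() */
	if (top_io->ops->issue_flush) {
		top_io->ops->issue_flush(top_io, preq);
		return;
	}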

Signed-off-by: Dmitry Monakhov 
---
 drivers/block/ploop/dev.c   | 1 +
 drivers/block/ploop/io_direct.c | 1 -
 drivers/block/ploop/io_kaio.c   | 1 -
 3 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index e8b0304..95e3067 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -1989,6 +1989,7 @@ ploop_entry_request(struct ploop_request * preq)
if (preq->req_size == 0) {
if (preq->req_rw & REQ_FLUSH &&
	    !test_bit(PLOOP_REQ_FSYNC_DONE, &preq->state)) {
+   preq->eng_state = PLOOP_E_COMPLETE;
if (top_io->ops->issue_flush) {
top_io->ops->issue_flush(top_io, preq);
return;
diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index ec905b4..195d318 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -1836,7 +1836,6 @@ static void dio_issue_flush(struct ploop_io * io, struct ploop_request *preq)
bio->bi_private = preq;
 
	atomic_inc(&preq->io_count);
-   preq->eng_state = PLOOP_E_COMPLETE;
ploop_acc_ff_out(io->plo, preq->req_rw | bio->bi_rw);
submit_bio(preq->req_rw, bio);
ploop_complete_io_request(preq);
diff --git a/drivers/block/ploop/io_kaio.c b/drivers/block/ploop/io_kaio.c
index de26319..bee2cee 100644
--- a/drivers/block/ploop/io_kaio.c
+++ b/drivers/block/ploop/io_kaio.c
@@ -951,7 +951,6 @@ static void kaio_issue_flush(struct ploop_io * io, struct ploop_request *preq)
 {
struct ploop_delta *delta = container_of(io, struct ploop_delta, io);
 
-   preq->eng_state = PLOOP_E_COMPLETE;
preq->req_rw &= ~REQ_FLUSH;
 
	spin_lock_irq(&io->plo->lock);
-- 
1.8.3.1



[Devel] [RH7 PATCH 1/6] ploop: generalize post_submit stage

2016-06-23 Thread Dmitry Monakhov
Currently post_submit() is used only for convert_unwritten_extents.
But post_submit() is a good transition point where all submitted
data has been completed by the lower layer and a new state is about to be
processed. It is an ideal point where we can perform transition actions.
For example:
 io_direct: Convert unwritten extents
 io_direct: issue empty barrier bio in order to simulate postflush
 io_direct,io_kaio: queue to fsync queue
 Etc.

This patch does not change anything, but prepare post_submit for
more logic which will be added later.

Signed-off-by: Dmitry Monakhov 
---
 drivers/block/ploop/dev.c   | 10 ++
 drivers/block/ploop/io_direct.c | 15 ---
 include/linux/ploop/ploop.h | 12 +++-
 3 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index e405232..e8b0304 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -2351,10 +2351,12 @@ static void ploop_req_state_process(struct ploop_request * preq)
preq->prealloc_size = 0; /* only for sanity */
}
 
-   if (test_bit(PLOOP_REQ_POST_SUBMIT, &preq->state)) {
-   preq->eng_io->ops->post_submit(preq->eng_io, preq);
-   clear_bit(PLOOP_REQ_POST_SUBMIT, &preq->state);
+   if (test_and_clear_bit(PLOOP_REQ_POST_SUBMIT, &preq->state)) {
+   struct ploop_io *io = preq->eng_io;
+
preq->eng_io = NULL;
+   if (preq->eng_io->ops->post_submit(io, preq))
+   goto out;
}
 
 restart:
@@ -2633,7 +2635,7 @@ restart:
default:
BUG();
}
-
+out:
if (release_ioc) {
struct io_context * ioc = current->io_context;
current->io_context = saved_ioc;
diff --git a/drivers/block/ploop/io_direct.c b/drivers/block/ploop/io_direct.c
index f1812fe..ec905b4 100644
--- a/drivers/block/ploop/io_direct.c
+++ b/drivers/block/ploop/io_direct.c
@@ -416,8 +416,8 @@ try_again:
}
 
preq->iblock = iblk;
-   preq->eng_io = io;
-   set_bit(PLOOP_REQ_POST_SUBMIT, &preq->state);
+   set_bit(PLOOP_REQ_DEL_CONV, &preq->state);
+   ploop_add_post_submit(io, preq);
dio_submit_pad(io, preq, sbl, size, em);
err = 0;
goto end_write;
@@ -501,7 +501,7 @@ end_write:
 }
 
 static void
-dio_post_submit(struct ploop_io *io, struct ploop_request * preq)
+dio_convert_extent(struct ploop_io *io, struct ploop_request * preq)
 {
struct ploop_device *plo = preq->plo;
sector_t sec = (sector_t)preq->iblock << preq->plo->cluster_log;
@@ -540,6 +540,15 @@ dio_post_submit(struct ploop_io *io, struct ploop_request * preq)
}
 }
 
+static int
+dio_post_submit(struct ploop_io *io, struct ploop_request * preq)
+{
+   if (test_and_clear_bit(PLOOP_REQ_DEL_CONV, &preq->state))
+   dio_convert_extent(io, preq);
+
+   return 0;
+}
+
 /* Submit the whole cluster. If preq contains only partial data
  * within the cluster, pad the rest of cluster with zeros.
  */
diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h
index 0fba25e..4c52a40 100644
--- a/include/linux/ploop/ploop.h
+++ b/include/linux/ploop/ploop.h
@@ -148,7 +148,7 @@ struct ploop_io_ops
 struct bio_list *sbl, iblock_t iblk, unsigned int size);
void(*submit_alloc)(struct ploop_io *, struct ploop_request *,
struct bio_list *sbl, unsigned int size);
-   void(*post_submit)(struct ploop_io *, struct ploop_request *);
+   int (*post_submit)(struct ploop_io *, struct ploop_request *);
 
int (*disable_merge)(struct ploop_io * io, sector_t isector, unsigned int len);
int (*fastmap)(struct ploop_io * io, struct bio *orig_bio,
@@ -471,6 +471,7 @@ enum
PLOOP_REQ_POST_SUBMIT, /* preq needs post_submit processing */
PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */
PLOOP_REQ_ALLOW_READS, /* READs are allowed for given req_cluster */
+   PLOOP_REQ_DEL_CONV,/* post_submit: conversion required */
PLOOP_REQ_FSYNC_DONE,  /* fsync_thread() performed f_op->fsync() */
 };
 
@@ -479,6 +480,8 @@ enum
 #define PLOOP_REQ_RELOC_S_FL (1 << PLOOP_REQ_RELOC_S)
 #define PLOOP_REQ_DISCARD_FL (1 << PLOOP_REQ_DISCARD)
 #define PLOOP_REQ_ZERO_FL (1 << PLOOP_REQ_ZERO)
+#define PLOOP_REQ_POST_SUBMIT_FL (1 << PLOOP_REQ_POST_SUBMIT)
+#define PLOOP_REQ_DEL_CONV_FL (1 << PLOOP_REQ_DEL_CONV)
 
 enum
 {
@@ -767,6 +770,13 @@ static inline void ploop_entry_qlen_dec(struct ploop_request * preq)
preq->plo->read_sync_reqs--;
}
 }
+static inline
+void ploop_add_post_submit(struct ploop_io *io, struct ploop_request * preq)
+{
+   BUG_ON(preq->eng_io && preq->eng_io != io);
+   preq->eng_io = io;
+   set_bit(PLOOP_REQ_POST_SUBMIT, &preq->state);
+}
 
 static inline 

[Devel] [RH7 PATCH 6/6] patch ploop_state_debugging.patch

2016-06-23 Thread Dmitry Monakhov
Signed-off-by: Dmitry Monakhov 
---
 drivers/block/ploop/dev.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c
index 090cd2d..9bf8592 100644
--- a/drivers/block/ploop/dev.c
+++ b/drivers/block/ploop/dev.c
@@ -1232,6 +1232,12 @@ static void ploop_complete_request(struct ploop_request * preq)
}
preq->bl.tail = NULL;
 
+   if (!preq->error) {
+   unsigned long state = READ_ONCE(preq->state);
+   WARN_ON(state & (PLOOP_REQ_POST_SUBMIT_FL |
+                    PLOOP_REQ_DEL_CONV_FL |
+                    PLOOP_REQ_DEL_FLUSH_FL));
+   }
if (test_bit(PLOOP_REQ_RELOC_A, >state) ||
test_bit(PLOOP_REQ_RELOC_S, >state)) {
if (preq->error)
-- 
1.8.3.1

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [RH7 PATCH 0/6] RFC ploop: Barrier fix patch set v3

2016-06-23 Thread Dmitry Monakhov

Here is the 3rd version of the barrier fix patches, based on recent fixes.
This is an RFC version. I do not have time to test it before tomorrow;
Max, please review it briefly and tell me your opinion about the general idea.
The basic idea is to use the post_submit state to issue an empty FLUSH barrier
in order to complete FUA requests. This allows us to unify all engines (direct and kaio).

This makes FUA processing optimal:
SUBMIT:FUA   :W1{b1,b2,b3,b4..},WAIT,post_submit:FLUSH
SUBMIT_ALLOC:FUA :W1{b1,b2,b3,b4..},WAIT,post_submit:FLUSH, WBI:FUA
RELOC_S: R1, W2,WAIT,post_submit:FLUSH, WBI:FUA
RELOC_A: R1, W2,WAIT,post_submit:FLUSH, WBI:FUA, W1:NULLIFY,WAIT,post_submit:FLUSH
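
For readers unfamiliar with the shorthand above, the intended ordering can be
modeled as follows (a sketch only: the helper functions are placeholders, not
ploop API, and merely print the step they stand for). Data bios are submitted
without FUA, we wait for them, post_submit issues one empty FLUSH, and only the
small index write-back keeps FUA:

#include <stdio.h>

/* Hypothetical stand-ins for the real submission paths. */
static void submit_data(void)         { puts("W1{b1..bN}: data writes, no FUA per bio"); }
static void wait_for_completion(void) { puts("WAIT: all data bios completed"); }
static void issue_empty_flush(void)   { puts("post_submit: one empty FLUSH barrier"); }
static void writeback_index_fua(void) { puts("WBI:FUA: small index write-back carries FUA"); }

int main(void)
{
	submit_data();
	wait_for_completion();
	issue_empty_flush();
	writeback_index_fua();
	return 0;
}

The win over tagging every data bio with FUA is that a single flush covers the
whole batch.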


#POST_SUBMIT CHANGES:
ploop-generalize-post_submit-stage.patch
ploop-generalize-issue_flush.patch
ploop-add-delayed-flush-support.patch
ploop-io_kaio-support-PLOOP_REQ_DEL_FLUSH.patch
#RELOC_XXX FIXES
ploop-fixup-barrier-handling-during-relocation.patch
patch-ploop_state_debugging.patch.patch


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH rh7] ve/fs: namespace -- Don't fail on permissions if @ve->devmnt_list is empty

2016-06-23 Thread Vladimir Davydov
On Thu, Jun 23, 2016 at 01:34:18PM +0300, Cyrill Gorcunov wrote:
> In commit 7eeb5b4afa8db5a2f2e1e47ab6b84e55fc8c5661 I addressed the
> first half of a problem, but I happened to work with a dirty copy
> of libvzctl where the mount_opts cgroup had been c/r'ed manually,
> so I missed the case where @devmnt_list is empty on restore
> (just like it is in vanilla libvzctl). So fix the second half.
> 
> https://jira.sw.ru/browse/PSBM-48188
> 
> Reported-by: Igor Sukhih 
> Signed-off-by: Cyrill Gorcunov 
> CC: Vladimir Davydov 
> CC: Konstantin Khorenko 

Reviewed-by: Vladimir Davydov 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RHEL7 COMMIT] ve/binfmt_misc: Allow mount if capable(CAP_SYS_ADMIN)

2016-06-23 Thread Konstantin Khorenko
The commit is pushed to "branch-rh7-3.10.0-327.18.2.vz7.14.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-327.18.2.vz7.14.19
-->
commit b188e9ef18fc3630568eaab2923238df31babc94
Author: Kirill Tkhai 
Date:   Thu Jun 23 19:06:40 2016 +0400

ve/binfmt_misc: Allow mount if capable(CAP_SYS_ADMIN)

The patch allows mounting binfmt_misc in a CT with ve0's admin caps,
which is needed for CRIU dump. This way, an unmounted binfmt_misc
may be force-mounted back, and we don't want to change CRIU's user_ns
to do that.

https://jira.sw.ru/browse/PSBM-47737

Signed-off-by: Kirill Tkhai 
Reviewed-by: Andrey Ryabinin 
---
 fs/binfmt_misc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index fd5227f..e259022 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -735,7 +735,7 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent)
 static struct dentry *bm_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
 {
-   if (!current_user_ns_initial())
+   if (!current_user_ns_initial() && !capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
return mount_ns(fs_type, flags, get_exec_env(), bm_fill_super);
 }
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7] vecalls: kill VZCTL_SETDEVPERMS ioctl

2016-06-23 Thread Andrey Ryabinin
All device permissions are controlled via device cgroup now.
We don't need this ioctl() anymore.

Signed-off-by: Andrey Ryabinin 
---
 include/uapi/linux/vzcalluser.h |  2 +-
 kernel/ve/vecalls.c | 37 -
 2 files changed, 1 insertion(+), 38 deletions(-)

diff --git a/include/uapi/linux/vzcalluser.h b/include/uapi/linux/vzcalluser.h
index 2b340cf..85761c7 100644
--- a/include/uapi/linux/vzcalluser.h
+++ b/include/uapi/linux/vzcalluser.h
@@ -161,7 +161,7 @@ struct vzctl_cpustatctl {
 #define VZCTLTYPE  '.'
 #define VZCTL_OLD_ENV_CREATE   _IOW(VZCTLTYPE,  0, struct vzctl_old_env_create)
 #define VZCTL_MARK_ENV_TO_DOWN _IOW(VZCTLTYPE,  1, struct vzctl_mark_env_to_down)
-#define VZCTL_SETDEVPERMS  _IOW(VZCTLTYPE,  2, struct vzctl_setdevperms)
+#define VZCTL_SETDEVPERMS  _IOW(VZCTLTYPE,  2, struct vzctl_setdevperms) /* DEPRECATED */
 #define VZCTL_ENV_CREATE_CID   _IOW(VZCTLTYPE,  4, struct vzctl_env_create_cid)
 #define VZCTL_ENV_CREATE   _IOW(VZCTLTYPE,  5, struct vzctl_env_create)
 #define VZCTL_GET_CPU_STAT _IOW(VZCTLTYPE,  6, struct vzctl_cpustatctl)
diff --git a/kernel/ve/vecalls.c b/kernel/ve/vecalls.c
index 4742576..5ce3e29 100644
--- a/kernel/ve/vecalls.c
+++ b/kernel/ve/vecalls.c
@@ -97,29 +97,6 @@ out_put_ve:
return retval;
 }
 
-static int real_setdevperms(envid_t veid, unsigned type,
-   dev_t dev, unsigned mask)
-{
-   struct ve_struct *ve;
-   int err;
-
-   if (!capable_setveid() || veid == 0)
-   return -EPERM;
-
-   if ((ve = get_ve_by_id(veid)) == NULL)
-   return -ESRCH;
-
-   down_read(&ve->op_sem);
-
-   err = -EAGAIN;
-   if (ve->is_running)
-   err = devcgroup_set_perms_ve(ve, type, dev, mask);
-
-   up_read(&ve->op_sem);
-   put_ve(ve);
-   return err;
-}
-
 /**
  **
  *
@@ -604,20 +581,6 @@ int vzcalls_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
err = 0;
}
break;
-   case VZCTL_SETDEVPERMS: {
-   /* Device type was mistakenly declared as dev_t
-* in the old user-kernel interface.
-* That's wrong, dev_t is a kernel internal type.
-* I use `unsigned' not having anything better in mind.
-* 2001/08/11  SAW  */
-   struct vzctl_setdevperms s;
-   err = -EFAULT;
-   if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
-   break;
-   err = real_setdevperms(s.veid, s.type,
-   new_decode_dev(s.dev), s.mask);
-   }
-   break;
 #ifdef CONFIG_INET
case VZCTL_VE_NETDEV: {
struct vzctl_ve_netdev d;
-- 
2.7.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH RH7] enable ipproto_icmp inside containers

2016-06-23 Thread Pavel Tikhomirov

On 06/23/2016 03:26 PM, Vasily Averin wrote:

iputils-ping 20150815 fails inside containers
because socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP)
is restricted by vz_security_protocol_check().

The patch enables creation of such sockets inside containers.
By default sys_socket still fails because of the default setting
of the sysctl net.ipv4.ping_group_range; however, that is enough
for iputils-ping 20150815: its fallback handles this situation
and successfully creates a RAW socket.

In mainline this is enabled since kernel v3.13, see:
commit fd2d5356d902 ("ipv4: Allow unprivileged users to use per net sysctls").
In future we're going to backport this patch and add its save/restore to criu.

https://bugs.openvz.org/browse/OVZ-6744

Signed-off-by:  Vasily Averin 


Acked-by: Pavel Tikhomirov 




diff-vz7-enable-ipproto_icmp-inside-containers


diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 53fa12d..ef521cf 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -222,6 +222,7 @@ int vz_security_protocol_check(struct net *net, int protocol)

switch (protocol) {
case  IPPROTO_IP:
+   case  IPPROTO_ICMP:
case  IPPROTO_TCP:
case  IPPROTO_UDP:
case  IPPROTO_RAW:



--
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RH7] enable ipproto_icmp inside containers

2016-06-23 Thread Vasily Averin
iputils-ping 20150815 fails inside containers
because socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP)
is restricted by vz_security_protocol_check().

The patch enables creation of such sockets inside containers.
By default sys_socket still fails because of the default setting
of the sysctl net.ipv4.ping_group_range; however, that is enough
for iputils-ping 20150815: its fallback handles this situation
and successfully creates a RAW socket.
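
For reference, the fallback described above is plain socket API usage and can be
reproduced with the following standalone snippet (not part of the patch):

#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
	/* Unprivileged "ping" socket; only succeeds if the caller's group
	 * falls into net.ipv4.ping_group_range. */
	int fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP);

	if (fd < 0) {
		fprintf(stderr, "SOCK_DGRAM/ICMP: %s, falling back to RAW\n",
			strerror(errno));
		/* Fallback used by iputils-ping; requires CAP_NET_RAW. */
		fd = socket(PF_INET, SOCK_RAW, IPPROTO_ICMP);
	}
	if (fd < 0) {
		perror("socket");
		return 1;
	}
	printf("got an ICMP socket, fd=%d\n", fd);
	return 0;
}

With the patch applied, the first socket() call is no longer rejected by
vz_security_protocol_check(); whether it succeeds then depends only on
net.ipv4.ping_group_range.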

In mainline this is enabled since kernel v3.13, see:
commit fd2d5356d902 ("ipv4: Allow unprivileged users to use per net sysctls").
In future we're going to backport this patch and add its save/restore to criu.

https://bugs.openvz.org/browse/OVZ-6744

Signed-off-by:  Vasily Averin 

diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 53fa12d..ef521cf 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -222,6 +222,7 @@ int vz_security_protocol_check(struct net *net, int protocol)
 
switch (protocol) {
case  IPPROTO_IP:
+   case  IPPROTO_ICMP:
case  IPPROTO_TCP:
case  IPPROTO_UDP:
case  IPPROTO_RAW:
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH RH7] enable ipproto_icmp inside containers

2016-06-23 Thread Pavel Tikhomirov



On 06/23/2016 03:08 PM, Vasily Averin wrote:

iputils-ping 20150815 fails inside containers
because socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP)
is restricted by vz_security_protocol_check().

The patch enables creation of such sockets inside containers.
By default sys_socket still fails because of the default setting
of the sysctl net.ipv4.ping_group_range; however, that is enough
for iputils-ping 20150815: its fallback handles this situation
and successfully creates a RAW socket.

According to ptikhomirov@ this sysctl will be enabled inside containers soon,
and in future it will be saved/restored by criu.


It is enabled in MS kernel v3.13+, see:
commit fd2d5356d902 ("ipv4: Allow unprivileged users to use per net 
sysctls")




https://bugs.openvz.org/browse/OVZ-6744

Signed-off-by:  Vasily Averin 



patch is empty?

--
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RH7] enable ipproto_icmp inside containers

2016-06-23 Thread Vasily Averin
iputils-ping 20150815 fails inside containers
because socket(PF_INET, SOCK_DGRAM, IPPROTO_ICMP)
is restricted by vz_security_protocol_check().

The patch enables creation of such sockets inside containers.
By default sys_socket still fails because of the default setting
of the sysctl net.ipv4.ping_group_range; however, that is enough
for iputils-ping 20150815: its fallback handles this situation
and successfully creates a RAW socket.

According to ptikhomirov@ this sysctl will be enabled inside containers soon,
and in future it will be saved/restored by criu.

https://bugs.openvz.org/browse/OVZ-6744

Signed-off-by:  Vasily Averin 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH rh7 1/4] net: ipip: enable in container

2016-06-23 Thread Evgenii Shatokhin

On 23.06.2016 11:57, Vladimir Davydov wrote:

Currently, we fail to init ipip per-net in a ve, because it has neither
NETIF_F_VIRTUAL nor NETIF_F_NETNS_LOCAL:

  ipip_init_net
   ip_tunnel_init_net
__ip_tunnel_create
 register_netdevice
  ve_is_dev_movable

In PCS6 ipip has NETIF_F_NETNS_LOCAL, so everything works fine there,
but this restriction was removed in RH7 kernel, so we fail to start a
container if ipip is loaded (or load ipip if there are containers
running).

Mark ipip as NETIF_F_VIRTUAL to fix this issue.

https://jira.sw.ru/browse/PSBM-48608

Signed-off-by: Vladimir Davydov 
---
  net/ipv4/ipip.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index e556a1df5a57..7842dcb2fd65 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -301,6 +301,7 @@ static void ipip_tunnel_setup(struct net_device *dev)
netif_keep_dst(dev);

dev->features   |= IPIP_FEATURES;
+   dev->features   |= NETIF_F_VIRTUAL;
dev->hw_features |= IPIP_FEATURES;
ip_tunnel_setup(dev, ipip_net_id);
  }



All 4 patches look good to me and I have checked that they do fix the 
described problems, at least on my machine.


So, for the whole patch series:

Tested-by: Evgenii Shatokhin 

Regards,
Evgenii
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7] ve/fs: namespace -- Don't fail on permissions if @ve->devmnt_list is empty

2016-06-23 Thread Cyrill Gorcunov
In commit 7eeb5b4afa8db5a2f2e1e47ab6b84e55fc8c5661 I addressed the
first half of a problem, but I happened to work with a dirty copy
of libvzctl where the mount_opts cgroup had been c/r'ed manually,
so I missed the case where @devmnt_list is empty on restore
(just like it is in vanilla libvzctl). So fix the second half.

https://jira.sw.ru/browse/PSBM-48188

Reported-by: Igor Sukhih 
Signed-off-by: Cyrill Gorcunov 
CC: Vladimir Davydov 
CC: Konstantin Khorenko 
---
 fs/namespace.c |   18 ++
 1 file changed, 14 insertions(+), 4 deletions(-)

Index: linux-pcs7.git/fs/namespace.c
===
--- linux-pcs7.git.orig/fs/namespace.c
+++ linux-pcs7.git/fs/namespace.c
@@ -1954,10 +1954,20 @@ again:
goto again;
case 1:
if (*data_pp) {
-   ve_printk(VE_LOG_BOTH, KERN_WARNING "VE%s: no allowed "
- "mount options found for device %u:%u\n",
- ve->ve_name, MAJOR(dev), MINOR(dev));
-   err = -EPERM;
+   /*
+    * Same as in the chunk above but for the case where
+    * ve->devmnt_list is empty. Depending on the way the
+    * userspace tool restores the container, it might be
+    * nonempty as well.
+    */
+   if (ve->is_pseudosuper) {
+   err = 0;
+   } else {
+   ve_printk(VE_LOG_BOTH, KERN_WARNING "VE%s: no allowed "
+ "mount options found for device %u:%u\n",
+ ve->ve_name, MAJOR(dev), MINOR(dev));
+   err = -EPERM;
+   }
} else
err = 0;
break;
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 3/4] net: ipip: fix crash on newlink if VE_FEATURE_IPIP is disabled

2016-06-23 Thread Vladimir Davydov
In this case net_generic returns NULL. We must handle this gracefully.

Signed-off-by: Vladimir Davydov 
---
 net/ipv4/ipip.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 7842dcb2fd65..b1004fb7539c 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -357,6 +357,9 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev,
 {
struct ip_tunnel_parm p;
 
+   if (net_generic(dev_net(dev), ipip_net_id) == NULL)
+   return -EACCES;
+
ipip_netlink_parms(data, &p);
return ip_tunnel_newlink(dev, tb, );
 }
-- 
2.1.4

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 4/4] net: sit: fix crash on newlink if VE_FEATURE_SIT is disabled

2016-06-23 Thread Vladimir Davydov
In this case net_generic returns NULL. We must handle this gracefully.

Signed-off-by: Vladimir Davydov 
---
 net/ipv6/sit.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 2a73b520d3bf..6b1ae3b06be9 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1441,6 +1441,9 @@ static int ipip6_newlink(struct net *src_net, struct net_device *dev,
 #endif
int err;
 
+   if (net_generic(net, sit_net_id) == NULL)
+   return -EACCES;
+
nt = netdev_priv(dev);
ipip6_netlink_parms(data, &nt->parms);
 
-- 
2.1.4

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 1/4] net: ipip: enable in container

2016-06-23 Thread Vladimir Davydov
Currently, we fail to init ipip per-net in a ve, because it has neither
NETIF_F_VIRTUAL nor NETIF_F_NETNS_LOCAL:

 ipip_init_net
  ip_tunnel_init_net
   __ip_tunnel_create
    register_netdevice
     ve_is_dev_movable

In PCS6 ipip has NETIF_F_NETNS_LOCAL, so everything works fine there,
but this restriction was removed in RH7 kernel, so we fail to start a
container if ipip is loaded (or load ipip if there are containers
running).

Mark ipip as NETIF_F_VIRTUAL to fix this issue.
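
To illustrate the gating (a simplified model; the exact behaviour of
ve_is_dev_movable() is an assumption here, and the flag macros are redefined
only for readability): a container netns is assumed to accept a device only if
it is either VE-virtual or local to its netns, so setting NETIF_F_VIRTUAL on
ipip lets its per-net device register.

#include <stdio.h>

#define NETIF_F_VIRTUAL     (1u << 0)  /* device may live inside a VE    */
#define NETIF_F_NETNS_LOCAL (1u << 1)  /* device never leaves its netns  */

/* Toy stand-in for the check done on register_netdevice() in a VE. */
static int ve_allows_register(unsigned features, int in_container)
{
	if (!in_container)
		return 1;
	return (features & (NETIF_F_VIRTUAL | NETIF_F_NETNS_LOCAL)) != 0;
}

int main(void)
{
	unsigned ipip_old = 0;                /* before this patch */
	unsigned ipip_new = NETIF_F_VIRTUAL;  /* after this patch  */

	printf("old ipip in CT: %s\n", ve_allows_register(ipip_old, 1) ? "ok" : "rejected");
	printf("new ipip in CT: %s\n", ve_allows_register(ipip_new, 1) ? "ok" : "rejected");
	return 0;
}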

https://jira.sw.ru/browse/PSBM-48608

Signed-off-by: Vladimir Davydov 
---
 net/ipv4/ipip.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index e556a1df5a57..7842dcb2fd65 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -301,6 +301,7 @@ static void ipip_tunnel_setup(struct net_device *dev)
netif_keep_dst(dev);
 
dev->features   |= IPIP_FEATURES;
+   dev->features   |= NETIF_F_VIRTUAL;
dev->hw_features|= IPIP_FEATURES;
ip_tunnel_setup(dev, ipip_net_id);
 }
-- 
2.1.4

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 2/4] net: ip_vti: skip per net init in ve

2016-06-23 Thread Vladimir Davydov
ip_vti devices lack NETIF_F_VIRTUAL, so they can't be created inside a
container. The problem is that a device of this kind is created on net
ns init if the module is loaded; as a result, container start fails
with EPERM.

We could allow ip_vti inside a container (as well as other net devices,
which I would really like to do), but this is insecure and might break
migration, so let's keep it disabled and fix the issue by silently
skipping ip_vti per-net init if running inside a ve.
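
The approach can be summarized with a small userspace model (illustrative only;
net_generic()/net_assign_generic() below are stand-ins for the kernel helpers,
and -13 stands for -EACCES): inside a ve the per-net slot stays NULL, and every
entry point that would dereference it checks for NULL first.

#include <stdio.h>

struct net { void *gen[8]; int is_ve0; };

static int host_tunnels;	/* pretend per-net tunnel state for the host */

static void *net_generic(struct net *net, int id) { return net->gen[id]; }
static void net_assign_generic(struct net *net, int id, void *p) { net->gen[id] = p; }

enum { VTI_NET_ID = 3 };

static int vti_init_net(struct net *net)
{
	if (!net->is_ve0) {
		/* Inside a container: register nothing, leave the slot NULL. */
		net_assign_generic(net, VTI_NET_ID, NULL);
		return 0;
	}
	net_assign_generic(net, VTI_NET_ID, &host_tunnels);
	return 0;
}

static int vti_newlink(struct net *net)
{
	/* Guard every entry point that would use the per-net state. */
	if (net_generic(net, VTI_NET_ID) == NULL)
		return -13;	/* vti is not available in a CT */
	return 0;
}

int main(void)
{
	struct net host = { .is_ve0 = 1 };
	struct net ct   = { .is_ve0 = 0 };

	vti_init_net(&host);
	vti_init_net(&ct);
	printf("newlink on host: %d\n", vti_newlink(&host));
	printf("newlink in a CT: %d\n", vti_newlink(&ct));
	return 0;
}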

https://jira.sw.ru/browse/PSBM-48698

Signed-off-by: Vladimir Davydov 
---
 net/ipv4/ip_vti.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index ce80a9a1be9d..3158100646ed 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -58,6 +58,9 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
struct net *net = dev_net(skb->dev);
struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
 
+   if (itn == NULL)
+   return -EINVAL;
+
tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
  iph->saddr, iph->daddr, 0);
if (tunnel != NULL) {
@@ -256,6 +259,9 @@ static int vti4_err(struct sk_buff *skb, u32 info)
int protocol = iph->protocol;
struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
 
+   if (itn == NULL)
+   return -1;
+
tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
  iph->daddr, iph->saddr, 0);
if (!tunnel)
@@ -413,6 +419,9 @@ static int __net_init vti_init_net(struct net *net)
int err;
struct ip_tunnel_net *itn;
 
+   if (!ve_is_super(net->owner_ve))
+   return net_assign_generic(net, vti_net_id, NULL);
+
err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0");
if (err)
return err;
@@ -424,6 +433,9 @@ static int __net_init vti_init_net(struct net *net)
 static void __net_exit vti_exit_net(struct net *net)
 {
struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+
+   if (itn == NULL)
+   return;
ip_tunnel_delete_net(itn, &vti_link_ops);
 }
 
@@ -473,6 +485,9 @@ static int vti_newlink(struct net *src_net, struct net_device *dev,
 {
struct ip_tunnel_parm parms;
 
+   if (net_generic(dev_net(dev), vti_net_id) == NULL)
+   return -EACCES;
+
vti_netlink_parms(data, &parms);
return ip_tunnel_newlink(dev, tb, );
 }
-- 
2.1.4

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel