from:"Kirill Tkhai"

Re: [Devel] [PATCH rh9 v2] dm-zero-req: Introduce zero request based target

2022-04-06 Thread Kirill Tkhai

On 05.04.2022 16:52, Konstantin Khorenko wrote:
> This driver is like "dm-zero", but request based rather than bio based
> like original "dm-zero".
> 
> This driver will be used on a block device during the Container configuration
> stage: we need to construct a block device which honors the CBT mask (stored
> in the ploop image), and for that we first need to create a dummy block device
> (a purely technical issue: otherwise the CBT mask is dropped).
> 
> dm-ploop/dm-qcow2 are request based, thus we need zero target
> also to be request based.
> 
> https://jira.sw.ru/browse/PSBM-134130
> 
> Signed-off-by: Konstantin Khorenko 
> Feature: cbt: changed block tracking (for backup)

Reviewed-by: Kirill Tkhai 
 
> ---
> v2: * dropped extra kernel config option, put new modules under CONFIG_DM_ZERO
> * dropped "readahead of null bytes" optimization
> ---
>  drivers/md/Makefile  |  1 +
>  drivers/md/dm-zero-req.c | 91 
>  2 files changed, 92 insertions(+)
>  create mode 100644 drivers/md/dm-zero-req.c
> 
> diff --git a/drivers/md/Makefile b/drivers/md/Makefile
> index 94134440cf70..3197d24c0e75 100644
> --- a/drivers/md/Makefile
> +++ b/drivers/md/Makefile
> @@ -76,6 +76,7 @@ obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/
>  obj-$(CONFIG_DM_MIRROR)  += dm-mirror.o dm-log.o dm-region-hash.o
>  obj-$(CONFIG_DM_LOG_USERSPACE)   += dm-log-userspace.o
>  obj-$(CONFIG_DM_ZERO)+= dm-zero.o
> +obj-$(CONFIG_DM_ZERO)+= dm-zero-req.o
>  obj-$(CONFIG_DM_RAID)+= dm-raid.o
>  obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
>  obj-$(CONFIG_DM_VERITY)  += dm-verity.o
> diff --git a/drivers/md/dm-zero-req.c b/drivers/md/dm-zero-req.c
> new file mode 100644
> index ..9e44de15dcd6
> --- /dev/null
> +++ b/drivers/md/dm-zero-req.c
> @@ -0,0 +1,91 @@
> +/*
> + * Copyright (C) 2003 Jana Saout 
> + *
> + * This file is released under the GPL.
> + */
> +
> +#include 
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include "dm-rq.h"
> +
> +#define DM_MSG_PREFIX "zero"
> +
> +/*
> + * Construct a dummy mapping that only returns zeros
> + */
> +static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv)
> +{
> + if (argc != 0) {
> + ti->error = "No arguments required";
> + return -EINVAL;
> + }
> +
> + /*
> +  * Silently drop discards, avoiding -EOPNOTSUPP.
> +  */
> + ti->num_discard_bios = 1;
> +
> + return 0;
> +}
> +
> +static int zero_clone_and_map_rq(struct dm_target *ti, struct request *rq,
> +  union map_info *map_context,
> +  struct request **clone)
> +{
> + struct bio *bio = rq->bio;
> +
> + switch (bio_op(bio)) {
> + case REQ_OP_READ:
> + while (bio) {
> + zero_fill_bio(bio);
> + bio = bio->bi_next;
> + }
> +
> + break;
> + case REQ_OP_WRITE:
> + /* writes get silently dropped */
> + break;
> + default:
> + return DM_MAPIO_KILL;
> + }
> +
> + dm_complete_request(rq, BLK_STS_OK);
> +
> + /* accepted rq, don't make new request */
> + return DM_MAPIO_SUBMITTED;
> +}
> +
> +static struct target_type zero_target = {
> + .name   = "zero-rq",
> + .version = {1, 1, 0},
> + .features = DM_TARGET_NOWAIT,
> + .module = THIS_MODULE,
> + .ctr= zero_ctr,
> + .clone_and_map_rq = zero_clone_and_map_rq,
> +};
> +
> +static int __init dm_zero_init(void)
> +{
> + int r = dm_register_target(&zero_target);
> +
> + if (r < 0)
> + DMERR("register failed %d", r);
> +
> + return r;
> +}
> +
> +static void __exit dm_zero_exit(void)
> +{
> + dm_unregister_target(&zero_target);
> +}
> +
> +module_init(dm_zero_init)
> +module_exit(dm_zero_exit)
> +
> +MODULE_AUTHOR("Jana Saout ");
> +MODULE_DESCRIPTION(DM_NAME " dummy request based target returning zeros");
> +MODULE_LICENSE("GPL");

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 4/9] dm-qcow2: Prepare handle_md_page() for calling not only from main kwork

2022-03-06 Thread Kirill Tkhai

A parallel handle_md_page() call may fail because the page has just been added.
Teach it to repeat the search.

Signed-off-by: Kirill Tkhai 
---
 drivers/md/dm-qcow2-map.c|5 ++---
 drivers/md/dm-qcow2-target.c |   14 ++
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/md/dm-qcow2-map.c b/drivers/md/dm-qcow2-map.c
index 4e04505810fc..4edd63e47a3d 100644
--- a/drivers/md/dm-qcow2-map.c
+++ b/drivers/md/dm-qcow2-map.c
@@ -1526,10 +1526,8 @@ static int submit_read_md_page(struct qcow2 *qcow2, 
struct qio **qio,
int ret;
 
ret = alloc_and_insert_md_page(qcow2, page_id, &md);
-   if (ret < 0) {
-   pr_err("Can't alloc: ret=%d, page_id=%llu\n", ret, page_id);
+   if (ret < 0)
return ret;
-   }
 
spin_lock_irq(&qcow2->md_pages_lock);
list_add_tail(&(*qio)->link, &md->wait_list);
@@ -1543,6 +1541,7 @@ static int submit_read_md_page(struct qcow2 *qcow2, 
struct qio **qio,
 /*
  * This may be called with @qio == NULL, in case of we are
  * interesting in searching cached in memory md only.
+ * This is aimed to be called not only from main kwork.
  */
 static int handle_md_page(struct qcow2 *qcow2, u64 page_id,
 struct qio **qio, struct md_page **ret_md)
diff --git a/drivers/md/dm-qcow2-target.c b/drivers/md/dm-qcow2-target.c
index 6c550cbe2579..795d64516507 100644
--- a/drivers/md/dm-qcow2-target.c
+++ b/drivers/md/dm-qcow2-target.c
@@ -318,7 +318,7 @@ struct md_page *md_page_find_or_postpone(struct qcow2 
*qcow2, unsigned int id,
return md;
 }
 
-static void md_page_insert(struct qcow2 *qcow2, struct md_page *new_md)
+static int md_page_try_insert(struct qcow2 *qcow2, struct md_page *new_md)
 {
struct rb_root *root = &qcow2->md_pages;
unsigned int new_id = new_md->id;
@@ -337,11 +337,12 @@ static void md_page_insert(struct qcow2 *qcow2, struct 
md_page *new_md)
else if (new_id > md->id)
node = &parent->rb_right;
else
-   BUG();
+   return -EEXIST;
}
 
rb_link_node(&new_md->node, parent, node);
rb_insert_color(&new_md->node, root);
+   return 0;
 }
 
 void md_page_erase(struct qcow2 *qcow2, struct md_page *md)
@@ -361,7 +362,8 @@ struct md_page *md_page_renumber(struct qcow2 *qcow2, 
unsigned int id,
WARN_ON_ONCE(!list_empty(&md->wait_list));
md_page_erase(qcow2, md);
md->id = new_id;
-   md_page_insert(qcow2, md);
+   if (WARN_ON(md_page_try_insert(qcow2, md) < 0))
+   md = NULL;
}
return md;
 }
@@ -396,10 +398,14 @@ int alloc_and_insert_md_page(struct qcow2 *qcow2, u64 
index, struct md_page **md
INIT_LIST_HEAD(&(*md)->wb_link);
 
spin_lock_irq(&qcow2->md_pages_lock);
-   md_page_insert(qcow2, *md);
+   ret = md_page_try_insert(qcow2, *md);
spin_unlock_irq(&qcow2->md_pages_lock);
+   if (ret)
+   goto err_putpage;
return 0;
 
+err_putpage:
+   put_page((*md)->page);
 err_kfree:
kfree(*md);
return ret;


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9] dm-ploop: Remove tracking code

2021-10-27 Thread Kirill Tkhai

We use generic driver instead.

Signed-off-by: Kirill Tkhai 
---
 drivers/md/dm-ploop-cmd.c|  165 --
 drivers/md/dm-ploop-map.c|   38 --
 drivers/md/dm-ploop-target.c |3 -
 drivers/md/dm-ploop.h|   13 ---
 4 files changed, 219 deletions(-)

diff --git a/drivers/md/dm-ploop-cmd.c b/drivers/md/dm-ploop-cmd.c
index 3ba866cb0ec0..ed46da98b8d7 100644
--- a/drivers/md/dm-ploop-cmd.c
+++ b/drivers/md/dm-ploop-cmd.c
@@ -249,7 +249,6 @@ static int ploop_write_cluster_sync(struct ploop *ploop, 
struct pio *pio,
if (pio->bi_status)
return blk_status_to_errno(pio->bi_status);
 
-   /* track_bio(ploop, bio); */
return vfs_fsync(file, 0);
 }
 
@@ -982,166 +981,6 @@ static int ploop_set_falloc_new_clu(struct ploop *ploop, 
u64 val)
return 0;
 }
 
-static int process_tracking_start(struct ploop *ploop, void *tracking_bitmap,
- u32 tb_nr)
-{
-   u32 i, nr_pages, end, *bat_entries, dst_clu, nr;
-   struct rb_node *node;
-   struct md_page *md;
-   int ret = 0;
-
-   write_lock_irq(&ploop->bat_rwlock);
-   ploop->tracking_bitmap = tracking_bitmap;
-   ploop->tb_nr = tb_nr;
-
-   for_each_clear_bit(i, ploop->holes_bitmap, ploop->hb_nr)
-   set_bit(i, tracking_bitmap);
-   nr_pages = bat_clu_to_page_nr(ploop->nr_bat_entries - 1) + 1;
-   nr = 0;
-
-   ploop_for_each_md_page(ploop, md, node) {
-   ploop_init_be_iter(ploop, md->id, &i, &end);
-   bat_entries = kmap_atomic(md->page);
-   for (; i <= end; i++) {
-   dst_clu = bat_entries[i];
-   if (dst_clu == BAT_ENTRY_NONE ||
-   md->bat_levels[i] != top_level(ploop))
-   continue;
-   if (WARN_ON(dst_clu >= tb_nr)) {
-   ret = -EIO;
-   break;
-   }
-   set_bit(dst_clu, tracking_bitmap);
-   }
-   kunmap_atomic(bat_entries);
-   if (ret)
-   break;
-   nr++;
-   }
-   write_unlock_irq(&ploop->bat_rwlock);
-
-   BUG_ON(ret == 0 && nr != nr_pages);
-   return ret;
-}
-
-static int tracking_get_next(struct ploop *ploop, char *result,
-unsigned int maxlen)
-{
-   unsigned int i, sz = 0, tb_nr = ploop->tb_nr, prev = ploop->tb_cursor;
-   void *tracking_bitmap = ploop->tracking_bitmap;
-   int ret = -EAGAIN;
-
-   if (WARN_ON_ONCE(prev > tb_nr - 1))
-   prev = 0;
-
-   write_lock_irq(&ploop->bat_rwlock);
-   i = find_next_bit(tracking_bitmap, tb_nr, prev + 1);
-   if (i < tb_nr)
-   goto found;
-   i = find_first_bit(tracking_bitmap, prev + 1);
-   if (i >= prev + 1)
-   goto unlock;
-found:
-   ret = (DMEMIT("%u\n", i)) ? 1 : 0;
-   if (ret)
-   clear_bit(i, tracking_bitmap);
-unlock:
-   write_unlock_irq(&ploop->bat_rwlock);
-   if (ret > 0)
-   ploop->tb_cursor = i;
-   return ret;
-}
-
-static u32 max_dst_clu_in_top_delta(struct ploop *ploop)
-{
-   u32 i, nr_pages, nr = 0, end, *bat_entries, dst_clu = 0;
-   struct rb_node *node;
-   struct md_page *md;
-
-   nr_pages = bat_clu_to_page_nr(ploop->nr_bat_entries - 1) + 1;
-
-   read_lock_irq(&ploop->bat_rwlock);
-   ploop_for_each_md_page(ploop, md, node) {
-   ploop_init_be_iter(ploop, md->id, &i, &end);
-   bat_entries = kmap_atomic(md->page);
-   for (; i <= end; i++) {
-   if (dst_clu < bat_entries[i] &&
-   md->bat_levels[i] == top_level(ploop))
-   dst_clu = bat_entries[i];
-   }
-   kunmap_atomic(bat_entries);
-   nr++;
-   }
-   read_unlock_irq(&ploop->bat_rwlock);
-
-   BUG_ON(nr != nr_pages);
-   return dst_clu;
-}
-
-static int ploop_tracking_cmd(struct ploop *ploop, const char *suffix,
- char *result, unsigned int maxlen)
-{
-   void *tracking_bitmap = NULL;
-   unsigned int tb_nr, size;
-   int ret = 0;
-
-   if (ploop_is_ro(ploop))
-   return -EROFS;
-
-   if (!strcmp(suffix, "get_next")) {
-   if (!ploop->tracking_bitmap)
-   return -ENOENT;
-   return tracking_get_next(ploop, result, maxlen);
-   }
-
-   if (!strcmp(suffix, "start")) {
-   if (ploop->tracking_bitmap)
-   return -EEXIST;
-   if (ploop->maintaince)
-

[Devel] [PATCH RH9 2/2] dm-tracking: Do not return EAGAIN in case of there is no changed clu

2021-10-27 Thread Kirill Tkhai

Return nothing (we do not call DMEMIT() in this case) instead.
EAGAIN may confuse a user.

Signed-off-by: Kirill Tkhai 
---
 drivers/md/dm-tracking.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm-tracking.c b/drivers/md/dm-tracking.c
index a8880a83d270..e66060246acc 100644
--- a/drivers/md/dm-tracking.c
+++ b/drivers/md/dm-tracking.c
@@ -213,7 +213,7 @@ static int tracking_get_next(struct dm_tracking *dmt, char 
*result,
 {
unsigned int i, sz = 0, nr_clus = dmt->nr_clus, prev = dmt->cursor;
void *bitmap = dmt->bitmap;
-   int ret = -EAGAIN;
+   int ret = 0;
 
if (WARN_ON_ONCE(prev > nr_clus - 1))
prev = 0;


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 1/2] dm-tracking: Track request after it completed

2021-10-27 Thread Kirill Tkhai

Otherwise there is a race: userspace may call
"tracking_get_next" and dump the cluster before the request
has been written completely.

Signed-off-by: Kirill Tkhai 
---
 drivers/md/dm-tracking.c |   47 +++---
 1 file changed, 40 insertions(+), 7 deletions(-)

diff --git a/drivers/md/dm-tracking.c b/drivers/md/dm-tracking.c
index d723596fee44..a8880a83d270 100644
--- a/drivers/md/dm-tracking.c
+++ b/drivers/md/dm-tracking.c
@@ -34,18 +34,23 @@ struct dm_tracking {
struct mutex ctl_mutex;
 };
 
+struct treq {
+   sector_t pos;
+   u32 bytes;
+};
+
 static sector_t get_dev_size(struct dm_dev *dev)
 {
return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
 }
 
-static void track_rq_clus(struct dm_tracking *dmt, struct request *rq)
+static void track_rq_clus(struct dm_tracking *dmt, struct treq *treq)
 {
-   loff_t off = to_bytes(blk_rq_pos(rq));
+   loff_t off = to_bytes(treq->pos);
u64 start_clu, end_clu, clu;
 
start_clu = off / dmt->clu_size;
-   end_clu = (off + blk_rq_bytes(rq) - 1) / dmt->clu_size;
+   end_clu = (off + treq->bytes - 1) / dmt->clu_size;
 
for (clu = start_clu; clu <= end_clu; clu++) {
set_bit(clu, dmt->bitmap);
@@ -61,20 +66,25 @@ static int dmt_clone_and_map(struct dm_target *ti, struct 
request *rq,
 {
struct dm_tracking *dmt = ti->private;
struct block_device *bdev = dmt->origin_dev->bdev;
+   struct treq *treq = NULL;
struct request_queue *q;
struct request *clone;
 
+   map_context->ptr = NULL;
if (blk_rq_bytes(rq) && op_is_write(req_op(rq))) {
-   spin_lock_irq(&dmt->lock);
-   if (dmt->bitmap)
-   track_rq_clus(dmt, rq);
-   spin_unlock_irq(&dmt->lock);
+   treq = kmalloc(sizeof(*treq), GFP_ATOMIC);
+   if (!treq)
+   return DM_MAPIO_REQUEUE;
+   treq->pos = blk_rq_pos(rq);
+   treq->bytes = blk_rq_bytes(rq);
+   map_context->ptr = treq;
}
 
q = bdev_get_queue(bdev);
clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE,
BLK_MQ_REQ_NOWAIT);
if (IS_ERR(clone)) {
+   kfree(treq);
/* EBUSY, ENODEV or EWOULDBLOCK: requeue */
if (blk_queue_dying(q))
return DM_MAPIO_DELAY_REQUEUE;
@@ -91,9 +101,31 @@ static int dmt_clone_and_map(struct dm_target *ti, struct 
request *rq,
 static void dmt_release_clone(struct request *clone,
  union map_info *map_context)
 {
+   if (unlikely(map_context)) {
+   struct treq *treq = map_context->ptr;
+   kfree(treq);
+   }
+
blk_put_request(clone);
 }
 
+static int dmt_end_io(struct dm_target *ti, struct request *clone,
+ blk_status_t error, union map_info *map_context)
+{
+   struct treq *treq = map_context->ptr;
+   struct dm_tracking *dmt = ti->private;
+
+   if (treq) {
+   spin_lock_irq(&dmt->lock);
+   if (dmt->bitmap)
+   track_rq_clus(dmt, treq);
+   spin_unlock_irq(&dmt->lock);
+   kfree(treq);
+   }
+
+   return DM_ENDIO_DONE;
+}
+
 static void dmt_destroy(struct dm_tracking *dmt)
 {
if (dmt->origin_dev)
@@ -320,6 +352,7 @@ static struct target_type dmt_target = {
.dtr = dmt_dtr,
.clone_and_map_rq = dmt_clone_and_map,
.release_clone_rq = dmt_release_clone,
+   .rq_end_io = dmt_end_io,
.message = dmt_message,
.iterate_devices = dmt_iterate_devices,
.status = dmt_status,


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9] dm-ploop: Fix usage of bio_vec on stack

2021-10-26 Thread Kirill Tkhai

Previously, writing the BAT page was synchronous, so
we could use an on-stack bio_vec for that.
But after it became asynchronous, we can't do that anymore.
Strange that this has not fired earlier.

https://jira.sw.ru/browse/PSBM-135137
Fixes: bfc5eaaba897 "ploop: Async md writeback"
Signed-off-by: Kirill Tkhai 
---
 drivers/md/dm-ploop-map.c |   12 ++--
 drivers/md/dm-ploop.h |1 +
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index 4cadf6e45b4d..a558445a1bec 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -1595,17 +1595,17 @@ void ploop_index_wb_submit(struct ploop *ploop, struct 
ploop_index_wb *piwb)
 {
loff_t pos = (loff_t)piwb->page_id << PAGE_SHIFT;
struct pio *pio = piwb->pio;
-   struct bio_vec bvec = {
-   .bv_page = piwb->bat_page,
-   .bv_len = PAGE_SIZE,
-   .bv_offset = 0,
-   };
+   struct bio_vec *bvec = &piwb->aux_bvec;
+
+   bvec->bv_page = piwb->bat_page;
+   bvec->bv_len = PAGE_SIZE;
+   bvec->bv_offset = 0;
 
pio->bi_iter.bi_sector = to_sector(pos);
pio->bi_iter.bi_size = PAGE_SIZE;
pio->bi_iter.bi_idx = 0;
pio->bi_iter.bi_bvec_done = 0;
-   pio->bi_io_vec = &bvec;
+   pio->bi_io_vec = bvec;
pio->level = top_level(ploop);
pio->endio_cb = md_write_endio;
pio->endio_cb_data = piwb;
diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h
index a7ca942c4670..0a4c6b78e20e 100644
--- a/drivers/md/dm-ploop.h
+++ b/drivers/md/dm-ploop.h
@@ -106,6 +106,7 @@ struct ploop_index_wb {
bool completed;
blk_status_t bi_status;
u32 page_id;
+   struct bio_vec aux_bvec;
 };
 
 /* Metadata page */


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 4/4] xfs: Provide a balloon nipple for management

2021-10-22 Thread Kirill Tkhai

Add a new ioctl() to open the balloon file.

Signed-off-by: Kirill Tkhai 
---
 fs/xfs/libxfs/xfs_fs.h |1 +
 fs/xfs/xfs_ioctl.c |   63 
 2 files changed, 64 insertions(+)

diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index bde2b4c64dbe..2293e1b757b3 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -839,6 +839,7 @@ struct xfs_scrub_metadata {
 #define XFS_IOC_INUMBERS_IOR ('X', 128, struct xfs_inumbers_req)
 /* XFS_IOC_GETFSUUID -- deprecated 140  */
 
+#define XFS_IOC_OPEN_BALLOON   _IO('X', 255)
 
 #ifndef HAVE_BBMACROS
 /*
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 16039ea10ac9..1282d9412f92 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1935,6 +1935,63 @@ xfs_fs_eofblocks_from_user(
return 0;
 }
 
+static int xfs_open_balloon(struct xfs_mount *mp, struct vfsmount *mnt)
+{
+   u64 balloon_ino = READ_ONCE(mp->m_balloon_ino);
+   struct xfs_inode *ip;
+   struct inode *inode;
+   int err, fd;
+   struct file *filp;
+   struct dentry *de;
+   struct path path;
+   fmode_t mode;
+
+   if (!balloon_ino)
+   return -ENOENT;
+   ip = xfs_balloon_get(mp, balloon_ino, 0);
+   if (IS_ERR(ip))
+   return PTR_ERR(ip);
+   inode = VFS_I(ip);
+
+   err = fd = get_unused_fd_flags(0);
+   if (err < 0)
+   goto err_put_ip;
+
+   __iget(inode);
+   de = d_obtain_alias(inode);
+   err = PTR_ERR(de);
+   if (IS_ERR(de))
+   goto err_put_fd;
+
+   path.dentry = de;
+   path.mnt = mntget(mnt);
+   err = mnt_want_write(path.mnt);
+   if (err)
+   mode = O_RDONLY;
+   else
+   mode = O_RDWR;
+   filp = alloc_file(&path, mode, &xfs_file_operations);
+   if (filp->f_mode & FMODE_WRITE)
+   mnt_drop_write(path.mnt);
+   if (IS_ERR(filp)) {
+   err = PTR_ERR(filp);
+   goto err_put_path;
+   }
+
+   filp->f_flags |= O_LARGEFILE;
+   fd_install(fd, filp);
+   xfs_irele(ip);
+   return fd;
+
+err_put_path:
+   path_put(&path);
+err_put_fd:
+   put_unused_fd(fd);
+err_put_ip:
+   xfs_irele(ip);
+   return err;
+}
+
 /*
  * Note: some of the ioctl's return positive numbers as a
  * byte count indicating success, such as readlink_by_handle.
@@ -2216,6 +2273,12 @@ xfs_file_ioctl(
return error;
}
 
+case XFS_IOC_OPEN_BALLOON:
+if (!capable(CAP_SYS_ADMIN))
+return -EACCES;
+
+return xfs_open_balloon(mp, filp->f_path.mnt);
+
default:
return -ENOTTY;
}


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 3/4] xfs: Don't show the active balloon to user

2021-10-22 Thread Kirill Tkhai

Prohibit an unprivileged user from reaching the balloon.
Prohibit anyone from unlinking it.

Signed-off-by: Kirill Tkhai 
---
 fs/xfs/xfs_inode.c |4 
 fs/xfs/xfs_iops.c  |4 
 2 files changed, 8 insertions(+)

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 990b72ae3635..32f99876dc19 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -670,6 +670,10 @@ xfs_lookup(
if (error)
goto out_unlock;
 
+   error = -EPERM;
+   if (unlikely(inum == READ_ONCE(dp->i_mount->m_balloon_ino)))
+   goto out_free_name;
+
error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
if (error)
goto out_free_name;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 93c082db04b7..09211e1d08ad 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -382,6 +382,10 @@ xfs_vn_unlink(
struct xfs_name name;
int error;
 
+   if (unlikely(d_inode(dentry)->i_ino ==
+   READ_ONCE(XFS_I(dir)->i_mount->m_balloon_ino)))
+   return -EPERM;
+
xfs_dentry_to_name(&name, dentry);
 
error = xfs_remove(XFS_I(dir), &name, XFS_I(d_inode(dentry)));


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 0/4] xfs: Add balloon support

2021-10-22 Thread Kirill Tkhai

https://jira.sw.ru/browse/PSBM-133811
---

Kirill Tkhai (4):
  xfs: Teach the fs where the balloon inode is
  xfs: Never show balloon in readdir results
  xfs: Don't show the active balloon to user
  xfs: Provide a balloon nipple for management


 fs/xfs/libxfs/xfs_da_btree.h  |1 +
 fs/xfs/libxfs/xfs_dir2_priv.h |1 +
 fs/xfs/libxfs/xfs_fs.h|1 +
 fs/xfs/xfs_dir2_readdir.c |   23 +++-
 fs/xfs/xfs_file.c |2 +
 fs/xfs/xfs_inode.c|4 ++
 fs/xfs/xfs_ioctl.c|   63 +
 fs/xfs/xfs_iops.c |4 ++
 fs/xfs/xfs_mount.h|2 +
 fs/xfs/xfs_super.c|   79 +
 fs/xfs/xfs_super.h|2 +
 11 files changed, 180 insertions(+), 2 deletions(-)

--
Signed-off-by: Kirill Tkhai 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 2/4] xfs: Never show balloon in readdir results

2021-10-22 Thread Kirill Tkhai

Note that xfs_readdir() may be called from many
places. To distinguish the case when it's called
from normal readdir syscalls (not from xfs service
functionality), and to avoid adding a new argument
to xfs_readdir(), we introduce a special value:
XFS_FAKE_TRANS_IGNORE_BALLOON.

Signed-off-by: Kirill Tkhai 
---
 fs/xfs/libxfs/xfs_da_btree.h  |1 +
 fs/xfs/libxfs/xfs_dir2_priv.h |1 +
 fs/xfs/xfs_dir2_readdir.c |   23 ++-
 fs/xfs/xfs_file.c |2 +-
 4 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index ad5dd324631a..3aa2dfd533ed 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -55,6 +55,7 @@ enum xfs_dacmp {
 typedef struct xfs_da_args {
struct xfs_da_geometry *geo;/* da block geometry */
const uint8_t   *name;  /* string (maybe not NULL 
terminated) */
+   uint8_t ignore_balloon:1;
int namelen;/* length of string (maybe no NULL) */
uint8_t filetype;   /* filetype of inode for directories */
void*value; /* set of bytes (maybe contain NULLs) */
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 94943ce49cab..e78fc1667836 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -184,6 +184,7 @@ void xfs_dir2_sf_put_ftype(struct xfs_mount *mp,
struct xfs_dir2_sf_entry *sfep, uint8_t ftype);
 
 /* xfs_dir2_readdir.c */
+#define XFS_FAKE_TRANS_IGNORE_BALLOON ((void *)1)
 extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp,
   struct dir_context *ctx, size_t bufsize);
 
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index da1cc683560c..0dfba9054e3d 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -121,9 +121,13 @@ xfs_dir2_sf_getdents(
   !xfs_dir2_namecheck(sfep->name,
   sfep->namelen)))
return -EFSCORRUPTED;
+   if (unlikely(ino == READ_ONCE(dp->i_mount->m_balloon_ino) &&
+args->ignore_balloon))
+   goto next;
if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino,
xfs_dir3_get_dtype(mp, filetype)))
return 0;
+next:
sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
}
 
@@ -214,6 +218,12 @@ xfs_dir2_block_getdents(
error = -EFSCORRUPTED;
goto out_rele;
}
+
+   if (unlikely(be64_to_cpu(dep->inumber) ==
+   READ_ONCE(dp->i_mount->m_balloon_ino) &&
+args->ignore_balloon))
+   continue;
+
if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
be64_to_cpu(dep->inumber),
xfs_dir3_get_dtype(dp->i_mount, filetype)))
@@ -465,11 +475,17 @@ xfs_dir2_leaf_getdents(
error = -EFSCORRUPTED;
break;
}
+
+   if (unlikely(be64_to_cpu(dep->inumber) ==
+   READ_ONCE(dp->i_mount->m_balloon_ino) &&
+args->ignore_balloon))
+   goto next;
+
if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
be64_to_cpu(dep->inumber),
xfs_dir3_get_dtype(dp->i_mount, filetype)))
break;
-
+next:
/*
 * Advance to next entry in the block.
 */
@@ -510,6 +526,11 @@ xfs_readdir(
int rval;
int v;
 
+   if (tp == XFS_FAKE_TRANS_IGNORE_BALLOON) {
+   args.ignore_balloon = true;
+   tp = NULL;
+   }
+
trace_xfs_readdir(dp);
 
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index cc3cfb12df53..1164184cd1b0 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1266,7 +1266,7 @@ xfs_file_readdir(
 */
bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);
 
-   return xfs_readdir(NULL, ip, ctx, bufsize);
+   return xfs_readdir(XFS_FAKE_TRANS_IGNORE_BALLOON, ip, ctx, bufsize);
 }
 
 STATIC loff_t


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 1/4] xfs: Teach the fs where the balloon inode is

2021-10-22 Thread Kirill Tkhai

This adds the balloon_ino=XXX mount option for xfs.

Signed-off-by: Kirill Tkhai 
---
 fs/xfs/xfs_mount.h |2 +
 fs/xfs/xfs_super.c |   79 
 fs/xfs/xfs_super.h |2 +
 3 files changed, 83 insertions(+)

diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index c78b63fe779a..4eb318bb44ac 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -154,6 +154,8 @@ typedef struct xfs_mount {
uint8_t m_rt_checked;
uint8_t m_rt_sick;
 
+   uint64_tm_balloon_ino;
+
/*
 * End of read-mostly variables. Frequently written variables and locks
 * should be placed below this comment from now on. The first variable
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 304875c0d3cc..aba14f5adc4e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -95,6 +95,7 @@ enum {
Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota,
Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum,
+   Opt_balloon_ino,
 };
 
 static const struct fs_parameter_spec xfs_fs_parameters[] = {
@@ -139,6 +140,7 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = 
{
fsparam_flag("nodiscard",   Opt_nodiscard),
fsparam_flag("dax", Opt_dax),
fsparam_enum("dax", Opt_dax_enum, dax_param_enums),
+   fsparam_u64("balloon_ino",  Opt_balloon_ino),
{}
 };
 
@@ -171,6 +173,7 @@ xfs_fs_show_options(
};
struct xfs_mount*mp = XFS_M(root->d_sb);
struct proc_xfs_info*xfs_infop;
+   u64 balloon_ino;
 
for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) {
if (mp->m_flags & xfs_infop->flag)
@@ -224,6 +227,9 @@ xfs_fs_show_options(
if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
seq_puts(m, ",noquota");
 
+   if ((balloon_ino = READ_ONCE(mp->m_balloon_ino)) != 0)
+   seq_printf(m, ",balloon_ino=%llu",
+   balloon_ino);
return 0;
 }
 
@@ -776,6 +782,41 @@ xfs_fs_sync_fs(
return 0;
 }
 
+struct xfs_inode *
+xfs_balloon_get(struct xfs_mount *mp, u64 balloon_ino, uint flags)
+{
+   struct xfs_inode *ip;
+   struct inode *inode;
+   int error;
+
+   if (!xfs_verify_dir_ino(mp, balloon_ino))
+   return ERR_PTR(-EINVAL);
+
+   error = xfs_iget(mp, NULL, balloon_ino, flags, 0, &ip);
+   if (error)
+   return ERR_PTR(error);
+   inode = VFS_I(ip);
+   if (!S_ISREG(inode->i_mode) || IS_IMMUTABLE(inode))
+   return ERR_PTR(-EINVAL);
+
+   return ip;
+}
+
+STATIC int
+xfs_balloon_check(struct xfs_mount *mp, u64 balloon_ino)
+{
+   struct xfs_inode *ip;
+
+   if (!balloon_ino)
+   return 0;
+
+   ip = xfs_balloon_get(mp, balloon_ino, XFS_IGET_UNTRUSTED);
+   if (IS_ERR(ip))
+   return PTR_ERR(ip);
+   xfs_irele(ip);
+   return 0;
+}
+
 STATIC int
 xfs_fs_statfs(
struct dentry   *dentry,
@@ -790,6 +831,7 @@ xfs_fs_statfs(
uint64_tfdblocks;
xfs_extlen_tlsize;
int64_t ffree;
+   u64 balloon_ino;
 
statp->f_type = XFS_SUPER_MAGIC;
statp->f_namelen = MAXNAMELEN - 1;
@@ -840,6 +882,17 @@ xfs_fs_statfs(
sbp->sb_frextents * sbp->sb_rextsize;
}
 
+   if ((balloon_ino = READ_ONCE(mp->m_balloon_ino)) != 0) {
+   struct xfs_inode *ip;
+
+   ip = xfs_balloon_get(mp, balloon_ino, 0);
+   if (ip) {
+   /* Note, i_nblocks also contains metadata blocks */
+   statp->f_blocks -= ip->i_nblocks + ip->i_delayed_blks;
+   xfs_irele(ip);
+   }
+   }
+
return 0;
 }
 
@@ -1273,6 +1326,9 @@ xfs_fs_parse_param(
xfs_mount_set_dax_mode(parsing_mp, result.uint_32);
return 0;
 #endif
+   case Opt_balloon_ino:
+   parsing_mp->m_balloon_ino = result.uint_64;
+   return 0;
/* Following mount options will be removed in September 2025 */
case Opt_ikeep:
xfs_fs_warn_deprecated(fc, param, XFS_MOUNT_IKEEP, true);
@@ -1603,6 +1659,10 @@ xfs_fs_fill_super(
if (error)
goto out_filestream_unmount;
 
+   error = xfs_balloon_check(mp, mp->m_balloon_ino);
+   if (error)
+   goto out_unmount;
+
root = igrab(VFS_I(mp->m_rootip));
if (!root) {
error = -ENOENT;
@@ -1809,6 +1869,25 @@ xfs_fs_reconfigure(
return error;
}

Re: [Devel] [PATCH RH9] ploop: simplify ploop_status

2021-10-22 Thread Kirill Tkhai

On 21.10.2021 21:32, Cyrill Gorcunov wrote:
> From: Cyrill Gorcunov 
> 
> We can get rid of sprintf usage when encoding the status.
> Just fill the string directly.
> 
> Cc: Kirill Tkhai 
> Signed-off-by: Cyrill Gorcunov 

Acked-by: Kirill Tkhai 

> ---
>  drivers/md/dm-ploop-target.c |   13 +++--
>  1 file changed, 7 insertions(+), 6 deletions(-)
> 
> --- vzkernel.orig/drivers/md/dm-ploop-target.c
> +++ vzkernel/drivers/md/dm-ploop-target.c
> @@ -435,20 +435,21 @@ static void ploop_status(struct dm_targe
>unsigned int maxlen)
>  {
>   struct ploop *ploop = ti->private;
> - char stat[16] = { 0 }, *p = stat;
> + char stat[16], *p = stat;
>   ssize_t sz = 0;
>  
>   down_read(&ploop->ctl_rwsem);
>   if (ploop->falloc_new_clu)
> - p += sprintf(p, "f");
> + *p++ = 'f';
>   if (ploop->tracking_bitmap)
> - p += sprintf(p, "t");
> + *p++ = 't';
>   if (READ_ONCE(ploop->noresume))
> - p += sprintf(p, "n");
> + *p++ = 'n';
>   if (READ_ONCE(ploop->event_enospc))
> - p += sprintf(p, "s");
> + *p++ = 's';
>   if (p == stat)
> - p += sprintf(p, "o");
> + *p++ = 'o';
> + *p++ = '\0';
>   up_read(&ploop->ctl_rwsem);
>  
>   BUG_ON(p - stat >= sizeof(stat));
> 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH RH8] ploop: Provide more info about ENOSPC

2021-10-21 Thread Kirill Tkhai

On 20.10.2021 22:22, Cyrill Gorcunov wrote:
> On Wed, Oct 20, 2021 at 06:13:01PM +0300, Kirill Tkhai wrote:
> ...
>> diff --git a/drivers/md/dm-ploop-target.c b/drivers/md/dm-ploop-target.c
>> index 327095f75359..bd68d5fb272b 100644
>> --- a/drivers/md/dm-ploop-target.c
>> +++ b/drivers/md/dm-ploop-target.c
>> @@ -455,6 +455,8 @@ static void ploop_status(struct dm_target *ti, 
>> status_type_t type,
>>  p += sprintf(p, "t");
>>  if (READ_ONCE(ploop->noresume))
>>  p += sprintf(p, "n");
>> +if (READ_ONCE(ploop->event_enospc))
>> +p += sprintf(p, "s");
>>  if (p == stat)
>>  p += sprintf(p, "o");
>>  if (ploop->skip_off)
> 
> While I have no clue what is going on here with this status, I wonder why
> we use sprintf here at all. sprintf is a _very_ heavy function which
> consumes too many cycles for nothing; we don't even need any formatting
> here. Why not something simple like
> 
> static void ploop_status(struct dm_target *ti, status_type_t type,
>unsigned int status_flags, char *result,
>unsigned int maxlen)
> {
>   struct ploop *ploop = ti->private;
>   char stat[16], *p = stat;
>   ssize_t sz = 0;
> 
>   down_read(&ploop->ctl_rwsem);
>   if (ploop->falloc_new_clu)
>   *p++ = 'f';
>   if (ploop->tracking_bitmap)
>   *p++ = 't';
>   if (READ_ONCE(ploop->noresume))
>   *p++ = 'n';
>   if (p == stat)
>   *p++ = 'o';
>   *p = '\0';
>   up_read(&ploop->ctl_rwsem);
> 
>   BUG_ON(p - stat >= sizeof(stat));
>   DMEMIT("%u v2 %u %s", ploop->nr_deltas, (u32)CLU_TO_SEC(ploop, 1), 
> stat);
> }
> 
> or am I missing something obvious?

Good idea. Could you please provide a proper patch reworking this function on 
top of my patch?
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH8] ploop: Provide more info about ENOSPC

2021-10-20 Thread Kirill Tkhai

Add info to status and print to dmesg once.

https://jira.sw.ru/browse/PSBM-135007

Signed-off-by: Kirill Tkhai 
---
 drivers/md/dm-ploop-map.c|1 +
 drivers/md/dm-ploop-target.c |2 ++
 2 files changed, 3 insertions(+)

diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index 79142acddecc..4cadf6e45b4d 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -169,6 +169,7 @@ static bool ploop_try_delay_enospc(struct ploop_rq *prq, 
struct pio *pio)
 
init_prq_and_embedded_pio(ploop, prq->rq, prq, pio);
 
+   pr_err_once("ploop: underlying disk is almost full\n");
ploop->event_enospc = true;
list_add_tail(&pio->list, &ploop->enospc_pios);
 unlock:
diff --git a/drivers/md/dm-ploop-target.c b/drivers/md/dm-ploop-target.c
index 327095f75359..bd68d5fb272b 100644
--- a/drivers/md/dm-ploop-target.c
+++ b/drivers/md/dm-ploop-target.c
@@ -455,6 +455,8 @@ static void ploop_status(struct dm_target *ti, 
status_type_t type,
p += sprintf(p, "t");
if (READ_ONCE(ploop->noresume))
p += sprintf(p, "n");
+   if (READ_ONCE(ploop->event_enospc))
+   p += sprintf(p, "s");
if (p == stat)
p += sprintf(p, "o");
if (ploop->skip_off)


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9] ploop: Provide more info about ENOSPC

2021-10-20 Thread Kirill Tkhai

Add info to status and print to dmesg once.

https://jira.sw.ru/browse/PSBM-135007

Signed-off-by: Kirill Tkhai 
---
 drivers/md/dm-ploop-map.c|1 +
 drivers/md/dm-ploop-target.c |2 ++
 2 files changed, 3 insertions(+)

diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index 79142acddecc..4cadf6e45b4d 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -169,6 +169,7 @@ static bool ploop_try_delay_enospc(struct ploop_rq *prq, 
struct pio *pio)
 
init_prq_and_embedded_pio(ploop, prq->rq, prq, pio);
 
+   pr_err_once("ploop: underlying disk is almost full\n");
ploop->event_enospc = true;
list_add_tail(&pio->list, &ploop->enospc_pios);
 unlock:
diff --git a/drivers/md/dm-ploop-target.c b/drivers/md/dm-ploop-target.c
index 327095f75359..bd68d5fb272b 100644
--- a/drivers/md/dm-ploop-target.c
+++ b/drivers/md/dm-ploop-target.c
@@ -455,6 +455,8 @@ static void ploop_status(struct dm_target *ti, 
status_type_t type,
p += sprintf(p, "t");
if (READ_ONCE(ploop->noresume))
p += sprintf(p, "n");
+   if (READ_ONCE(ploop->event_enospc))
+   p += sprintf(p, "s");
if (p == stat)
p += sprintf(p, "o");
if (ploop->skip_off)


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH RH9 0/6] part 18: port release agent virtualization

2021-10-19 Thread Kirill Tkhai

On 18.10.2021 15:50, Pavel Tikhomirov wrote:
> Patches are massively reworked, see inpatch comments.
> 
> https://jira.sw.ru/browse/PSBM-134002
> 
> Pavel Tikhomirov (1):
>   ve/cgroup: fix cgroup_mark_ve_roots naming
> 
> Valeriy Vdovin (5):
>   cgroup/cfs: added 'activate' option to cgroup_add_file
>   ve/cgroup: Implement per-ve workqueue
>   ve/cgroup: Move release_agent from system_wq to per-ve workqueues
>   ve/cgroup: Private per-cgroup-root data container
>   ve/cgroup: Set release_agent_path for root cgroups separately
> 
>  include/linux/cgroup-defs.h |  11 +-
>  include/linux/cgroup.h  |   4 +-
>  include/linux/ve.h  |  27 
>  kernel/cgroup/cgroup-internal.h |   2 +
>  kernel/cgroup/cgroup-v1.c   | 165 +++---
>  kernel/cgroup/cgroup.c  |  53 ++-
>  kernel/ve/ve.c  | 242 +++-
>  7 files changed, 442 insertions(+), 62 deletions(-)

Looks OK to me.

Reviewed-by: Kirill Tkhai 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH7] pfcache: Fix unitialized s_csum_partial

2021-10-14 Thread Kirill Tkhai

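Because of an inverted error check, s_csum_partial was initialized only
when the preceding percpu_counter_init() call had failed, i.e. never in
the normal case. Since percpu_counter::counters is then not allocated,
add and sub operations write to percpu memory with 0 offset.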

In scope of https://jira.sw.ru/browse/PSBM-134639
Fixes: 1204e364ca05 "pfcache: add hashed peers for ext4"
Signed-off-by: Kirill Tkhai 
---
 fs/ext4/super.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 810556737675..2d7c1d7c4190 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4540,7 +4540,7 @@ static int ext4_fill_super(struct super_block *sb, void 
*data, int silent)
sbi->s_err_report.data = (unsigned long) sb;
 
err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL);
-   if (err)
+   if (!err)
err = percpu_counter_init(&sbi->s_csum_partial, 0, GFP_KERNEL);
if (!err)
err = percpu_counter_init(&sbi->s_csum_complete, 0, GFP_KERNEL);


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH RH9] x86/cpu: init_cpu_flags -- use raw spinlock

2021-10-14 Thread Kirill Tkhai

On 14.10.2021 11:20, Cyrill Gorcunov wrote:
> The @cpu_flags_lock spinlock guards manipulations with
> per-cpu @cpu_flags which is used to hide some features
> in cpuinfo output inside VE. Still the init_cpu_flags
> is called from irq context leading to the following
> 
>  | [   13.827635] =
>  | [   13.827636] [ BUG: Invalid wait context ]
>  | [   13.827637] 5.14.0.ovz9.10.1+ #41 Tainted: G C X - 
> --- 
>  | [   13.827638] -
>  | [   13.827638] systemd/1 is trying to lock:
>  | [   13.827639] a4c9d258 (cpu_flags_lock){}-{3:3}, at: 
> init_cpu_flags+0xc8/0x220
>  | [   13.827649] other info that might help us debug this:
>  | [   13.827651] context-{2:2}
>  | [   13.827651] 3 locks held by systemd/1:
>  | [   13.827652]  #0: a56e8c60 (dup_mmap_sem){.+.+}-{0:0}, at: 
> dup_mm+0x83/0x5f0
>  | [   13.827660]  #1: 97574a37d138 (&mm->mmap_lock#2){}-{4:4}, at: 
> dup_mm+0x9c/0x5f0
>  | [   13.827664]  #2: 97574489c138 (&mm->mmap_lock/1){+.+.}-{4:4}, at: 
> dup_mm+0xd5/0x5f0
>  | [   13.827667] stack backtrace:
>  | [   13.827668] CPU: 0 PID: 1 Comm: systemd ve: / Tainted: G C 
> X - ---
>  | [   13.827670] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
> 1.14.0-4.fc34 04/01/2014
>  | [   13.827672] Call Trace:
>  | [   13.827673]  
>  | [   13.827675]  dump_stack_lvl+0x57/0x7d
>  | [   13.827686]  __lock_acquire.cold+0x28b/0x2cd
>  | [   13.827694]  lock_acquire+0xca/0x300
>  | [   13.827700]  ? init_cpu_flags+0xc8/0x220
>  | [   13.827703]  _raw_spin_lock+0x34/0x80
>  | [   13.827708]  ? init_cpu_flags+0xc8/0x220
>  | [   13.827710]  init_cpu_flags+0xc8/0x220
>  | [   13.827713]  flush_smp_call_function_queue+0x13f/0x1e0
>  | [   13.827717]  __sysvec_call_function_single+0x43/0x1c0
>  | [   13.827722]  sysvec_call_function_single+0x9d/0xd0
>  | [   13.827724]  
>  | [   13.827724]  asm_sysvec_call_function_single+0x12/0x20
>  | [   13.827728] RIP: 0010:lock_release+0x178/0x460
>  | ...
>  | [   13.827741]  up_write+0x2f/0x1c0
>  | [   13.827743]  anon_vma_clone+0x158/0x1f0
>  | [   13.827749]  anon_vma_fork+0x33/0x180
>  | [   13.827751]  dup_mm+0x45b/0x5f0
>  | [   13.827755]  copy_process+0x1e5a/0x2050
>  | [   13.827758]  kernel_clone+0x9b/0x3f0
>  | [   13.827760]  ? vfs_statx+0x74/0x130
>  | [   13.827766]  __do_sys_clone+0x60/0x80
>  | [   13.827769]  do_syscall_64+0x3b/0x90
>  | [   13.827771]  entry_SYSCALL_64_after_hwframe+0x44/0xae
> 
> The problem is rather coming from the rt camp, where spinlocks
> become sleepable and thus can't be used in irq context (and for our kernel
> it requires CONFIG_PROVE_RAW_LOCK_NESTING to be set). Since
> we know that we're operating in irq context, let's use raw spinlocks
> instead.
> 
> https://jira.sw.ru/browse/PSBM-134761
> 
> CC: Kirill Tkhai 
> Signed-off-by: Cyrill Gorcunov 

Reviewed-by: Kirill Tkhai 

> ---
>  arch/x86/kernel/cpu/proc.c |   10 +-
>  1 file changed, 5 insertions(+), 5 deletions(-)
> 
> --- vzkernel.orig/arch/x86/kernel/cpu/proc.c
> +++ vzkernel/arch/x86/kernel/cpu/proc.c
> @@ -69,7 +69,7 @@ struct cpu_flags {
>  };
>  
>  static DEFINE_PER_CPU(struct cpu_flags, cpu_flags);
> -static DEFINE_SPINLOCK(cpu_flags_lock);
> +static DEFINE_RAW_SPINLOCK(cpu_flags_lock);
>  
>  static void init_cpu_flags(void *dummy)
>  {
> @@ -107,9 +107,9 @@ static void init_cpu_flags(void *dummy)
>   flags.val[10] &= eax;
>   }
>  
> - spin_lock(&cpu_flags_lock);
> + raw_spin_lock(&cpu_flags_lock);
>   memcpy(&per_cpu(cpu_flags, cpu), &flags, sizeof(flags));
> - spin_unlock(&cpu_flags_lock);
> + raw_spin_unlock(&cpu_flags_lock);
>  }
>  
>  static int show_cpuinfo(struct seq_file *m, void *v)
> @@ -158,9 +158,9 @@ static int show_cpuinfo(struct seq_file
>   show_cpuinfo_misc(m, c);
>  
>   if (!is_super) {
> - spin_lock_irq(&cpu_flags_lock);
> + raw_spin_lock_irq(&cpu_flags_lock);
>   memcpy(&ve_flags, &per_cpu(cpu_flags, cpu), sizeof(ve_flags));
> - spin_unlock_irq(&cpu_flags_lock);
> + raw_spin_unlock_irq(&cpu_flags_lock);
>   }
>  
>  
> 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH RH9] sched/ve: calc_load_ve -- use raw spinlock

2021-10-14 Thread Kirill Tkhai

On 14.10.2021 11:20, Cyrill Gorcunov wrote:
> The @load_ve_lock spinlock guards manipulations of @ve_root_list;
> at the same time calc_load_ve() is executed from irq context, which
> triggers an "Invalid wait context" bug
> 
>  | [5.195868] =
>  | [5.195877] [ BUG: Invalid wait context ]
>  | [5.195887] 5.14.0.ovz9.10.1 #37 Tainted: G C X - 
> --- 
>  | [5.195902] -
>  | [5.195911] swapper/0/0 is trying to lock:
>  | [5.196327] 872d8438 (load_ve_lock){}-{3:3}, at: 
> calc_load_ve+0x15/0x1c0
>  | [5.196742] other info that might help us debug this:
>  | [5.196807] context-{2:2}
>  | [5.196807] no locks held by swapper/0/0.
>  | [5.196807] stack backtrace:
>  | [5.196807] CPU: 0 PID: 0 Comm: swapper/0 ve: / Tainted: G C
>  X - ---  5.14.0.ovz9.10.1 #37 10.1
>  | [5.196807] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
> 1.14.0-4.fc34 04/01/2014
>  | [5.196807] Call Trace:
>  | [5.196807]  
>  | [5.196807]  dump_stack_lvl+0x57/0x7d
>  | [5.196807]  __lock_acquire.cold+0x28b/0x2cd
>  | [5.196807]  ? __lock_acquire+0x3b1/0x1f20
>  | [5.196807]  lock_acquire+0xca/0x300
>  | [5.196807]  ? calc_load_ve+0x15/0x1c0
>  | [5.196807]  ? kvm_sched_clock_read+0x14/0x40
>  | [5.196807]  ? sched_clock_local+0xe/0x80
>  | [5.196807]  ? sched_clock_cpu+0xa5/0xc0
>  | [5.196807]  _raw_spin_lock+0x34/0x80
>  | [5.196807]  ? calc_load_ve+0x15/0x1c0
>  | [5.196807]  calc_load_ve+0x15/0x1c0
>  | [5.196807]  tick_do_update_jiffies64+0x115/0x150
>  | [5.196807]  tick_irq_enter+0x6c/0xe0
>  | [5.196807]  irq_enter_rcu+0x79/0x80
>  | [5.196807]  sysvec_apic_timer_interrupt+0x95/0xd0
>  | [5.196807]  
>  | [5.196807]  asm_sysvec_apic_timer_interrupt+0x12/0x20
>  | [5.196807] RIP: 0010:default_idle+0x10/0x20
>  | [5.196807] RSP: 0018:87203ea8 EFLAGS: 0202
>  | [5.196807] RAX: 86380df0 RBX:  RCX: 
> 0001
>  | [5.196807] RDX:  RSI: 86ee3980 RDI: 
> 86e1050e
>  | [5.196807] RBP: 87260a00 R08: 0001 R09: 
> 0001
>  | [5.196807] R10: 0001 R11:  R12: 
> 
>  | [5.196807] R13:  R14: 87260120 R15: 
> 
>  | [5.196807]  ? mwait_idle+0x70/0x70
>  | [5.196807]  ? mwait_idle+0x70/0x70
>  | [5.196807]  default_idle_call+0x59/0x90
>  | [5.196807]  do_idle+0x217/0x2b0
>  | [5.196807]  cpu_startup_entry+0x19/0x20
>  | [5.196807]  start_kernel+0x997/0x9bc
>  | [5.196807]  ? copy_bootdata+0x18/0x55
>  | [5.196807]  secondary_startup_64_no_verify+0xc2/0xcb
> 
> Note that the problem is rather coming from the rt camp, where spinlocks
> become sleepable and thus can't be used in irq context (and for our kernel
> it requires CONFIG_PROVE_RAW_LOCK_NESTING to be set). Since
> we know that we're operating in irq context, let's use raw spinlocks
> instead.
> 
> Also, I make the unlock happen earlier because there is no need to
> hold the lock once we've finished traversing the @ve_root_list list.
> 
> https://jira.sw.ru/browse/PSBM-134756
> 
> CC: Kirill Tkhai 
> Signed-off-by: Cyrill Gorcunov 

Acked-by: Kirill Tkhai 

> ---
>  kernel/sched/core.c|   10 +-
>  kernel/sched/loadavg.c |6 +++---
>  2 files changed, 8 insertions(+), 8 deletions(-)
> 
> --- vzkernel.orig/kernel/sched/core.c
> +++ vzkernel/kernel/sched/core.c
> @@ -10036,18 +10036,18 @@ static u64 cpu_shares_read_u64(struct cg
>  
>  #ifdef CONFIG_VE
>  LIST_HEAD(ve_root_list);
> -DEFINE_SPINLOCK(load_ve_lock);
> +DEFINE_RAW_SPINLOCK(load_ve_lock);
>  
>  void link_ve_root_cpu_cgroup(struct cgroup_subsys_state *css)
>  {
>   struct task_group *tg = css_tg(css);
>   unsigned long flags;
>  
> - spin_lock_irqsave(&load_ve_lock, flags);
> + raw_spin_lock_irqsave(&load_ve_lock, flags);
>   BUG_ON(!(css->flags & CSS_ONLINE));
>   if (list_empty(&tg->ve_root_list))
>   list_add(&tg->ve_root_list, &ve_root_list);
> - spin_unlock_irqrestore(&load_ve_lock, flags);
> + raw_spin_unlock_irqrestore(&load_ve_lock, flags);
>  }
>  
>  void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css)
> @@ -10055,9 +10055,9 @@ void unlink_ve_root_cpu_cgroup(struct cg
> struct task_group *tg = css_tg(css);
> unsigned long flags;
>  
> -   spin_lock_irqsave(&load_ve_lock, flags);
&g

[Devel] [PATCH RH9 2/2] mm/backing-dev: associate writeback with correct blkcg

2021-10-13 Thread Kirill Tkhai

From: Andrey Zhadchenko 

Use cgroup_get_e_ve_css to get correct blkcg_css for writeback instances.

https://jira.sw.ru/browse/PSBM-131253

Signed-off-by: Andrey Zhadchenko 
Reviewed-by: Kirill Tkhai 

v2:
khorenko@: introduce a wrapper for getting blkcg_css from memcg_css.

==
mm/writeback: Adopt cgroup-v2 writeback (limit per-memcg dirty memory)

In cgroup-v1 all writeback IO is accounted to root blkcg by design. With
cgroup-v2 it became possible to link memcg and blkcg, so writeback code
was enhanced to
 1) consider balancing dirty pages per memory cgroup
 2) account writeback generated IO to blkcg

In vz7 writeback was balanced by means of the beancounter cgroup. However,
we dropped it.

In vz8 @aryabinin tried to enable cgroup-v2 writeback with 5cc286c98ee20
("mm, cgroup, writeback: Enable per-cgroup writeback for v1 cgroup."),
but cgroup_get_e_css(), which is used to find blkcg based on memcg,
does not work well with cgroup-v1 and always returns root blkcg.

However we can implement a new function to associate blkcg with memcg via
ve css_set.

Test results with 256M container without patch:
===
 # echo "253:22358 1" > 
/sys/fs/cgroup/blkio/machine.slice/1/blkio.throttle.write_bps_device
 # vzctl exec 1 dd if=/dev/zero of=/test bs=1M count=1000
 # 1048576000 bytes (1.0 GB, 1000 MiB) copied, 1.35522 s, 774 MB/s

Since dirty balancing is global, the Container can dirty more than its RAM
and blkio limits are not respected.

With patch:
===
 # echo "253:22765 1" > 
/sys/fs/cgroup/blkio/machine.slice/1/blkio.throttle.write_bps_device
 # vzctl exec 1 dd if=/dev/zero of=/test bs=1M count=1000
 # 1048576000 bytes (1.0 GB, 1000 MiB) copied, 10.2267 s, 103 MB/s

Per-ve dirty balancing and throttling work as expected.

v2:
Since ve->ve_ns points to the task nsproxy, it can be changed during the ve
lifetime. We already have a helper ve_get_init_css() that handles this
case, so I decided to reuse its code in the new cgroup_get_e_ve_css().

Additionally I have added two patches that improve current code:
 1) drop 'get' from css_get_local_root() name since get with css functions
usually results in taking reference
 2) drop duplicate code and reuse css_local_root() helper in
ve_get_init_css()

Andrey Zhadchenko (4):
  kernel/cgroup: rename css_get_local_root
  kernel/ve: simplify ve_get_init_css
  kernel/cgroup: implement cgroup_get_e_ve_css
  mm/backing-dev: associate writeback with correct blkcg
---
 mm/backing-dev.c |   22 --
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f5561ea7d90a..9c1a128199e6 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -434,6 +434,22 @@ static void cgwb_remove_from_bdi_list(struct bdi_writeback 
*wb)
spin_unlock_irq(&cgwb_lock);
 }
 
+static inline struct cgroup_subsys_state *
+cgroup_get_e_css_virtialized(struct cgroup *cgroup,
+struct cgroup_subsys *ss)
+{
+   struct cgroup_subsys_state *css;
+
+#ifdef CONFIG_VE
+   if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+   css = cgroup_get_e_ve_css(cgroup, ss);
+   else
+#endif
+   css = cgroup_get_e_css(cgroup, ss);
+
+   return css;
+}
+
 static int cgwb_create(struct backing_dev_info *bdi,
   struct cgroup_subsys_state *memcg_css, gfp_t gfp)
 {
@@ -446,7 +462,8 @@ static int cgwb_create(struct backing_dev_info *bdi,
int ret = 0;
 
memcg = mem_cgroup_from_css(memcg_css);
-   blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
+   blkcg_css = cgroup_get_e_css_virtialized(memcg_css->cgroup,
+&io_cgrp_subsys);
blkcg = css_to_blkcg(blkcg_css);
memcg_cgwb_list = &memcg->cgwb_list;
blkcg_cgwb_list = &blkcg->cgwb_list;
@@ -566,7 +583,8 @@ struct bdi_writeback *wb_get_lookup(struct backing_dev_info 
*bdi,
struct cgroup_subsys_state *blkcg_css;
 
/* see whether the blkcg association has changed */
-   blkcg_css = cgroup_get_e_css(memcg_css->cgroup, 
&io_cgrp_subsys);
+   blkcg_css = cgroup_get_e_css_virtialized(memcg_css->cgroup,
+&io_cgrp_subsys);
if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
wb = NULL;
css_put(blkcg_css);


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 1/2] kernel/cgroup: implement cgroup_get_e_ve_css

2021-10-13 Thread Kirill Tkhai

From: Andrey Zhadchenko 

Existing cgroup_get_e_css() is not suited for cgroup-v1 and will always
return root cgroup css. Implement new cgroup_get_e_ve_css to return
ve css.

https://jira.sw.ru/browse/PSBM-131253

Signed-off-by: Andrey Zhadchenko 
Reviewed-by: Kirill Tkhai 

==
mm/writeback: Adopt cgroup-v2 writeback (limit per-memcg dirty memory)

In cgroup-v1 all writeback IO is accounted to root blkcg by design. With
cgroup-v2 it became possible to link memcg and blkcg, so writeback code
was enhanced to
 1) consider balancing dirty pages per memory cgroup
 2) account writeback generated IO to blkcg

In vz7 writeback was balanced by means of the beancounter cgroup. However,
we dropped it.

In vz8 @aryabinin tried to enable cgroup-v2 writeback with 5cc286c98ee20
("mm, cgroup, writeback: Enable per-cgroup writeback for v1 cgroup."),
but cgroup_get_e_css(), which is used to find blkcg based on memcg,
does not work well with cgroup-v1 and always returns root blkcg.

However we can implement a new function to associate blkcg with memcg via
ve css_set.

Test results with 256M container without patch:
===
 # echo "253:22358 1" > 
/sys/fs/cgroup/blkio/machine.slice/1/blkio.throttle.write_bps_device
 # vzctl exec 1 dd if=/dev/zero of=/test bs=1M count=1000
 # 1048576000 bytes (1.0 GB, 1000 MiB) copied, 1.35522 s, 774 MB/s

Since dirty balancing is global, the Container can dirty more than its RAM
and blkio limits are not respected.

With patch:
===
 # echo "253:22765 1" > 
/sys/fs/cgroup/blkio/machine.slice/1/blkio.throttle.write_bps_device
 # vzctl exec 1 dd if=/dev/zero of=/test bs=1M count=1000
 # 1048576000 bytes (1.0 GB, 1000 MiB) copied, 10.2267 s, 103 MB/s

Per-ve dirty balancing and throttling work as expected.

v2:
Since ve->ve_ns points to the task nsproxy, it can be changed during the ve
lifetime. We already have a helper ve_get_init_css() that handles this
case, so I decided to reuse its code in the new cgroup_get_e_ve_css().

Additionally I have added two patches that improve current code:
 1) drop 'get' from css_get_local_root() name since get with css functions
usually results in taking reference
 2) drop duplicate code and reuse css_local_root() helper in
ve_get_init_css()

Andrey Zhadchenko (4):
  kernel/cgroup: rename css_get_local_root
  kernel/ve: simplify ve_get_init_css
  kernel/cgroup: implement cgroup_get_e_ve_css
  mm/backing-dev: associate writeback with correct blkcg

Signed-off-by: Kirill Tkhai 
---
 include/linux/cgroup.h |2 ++
 kernel/cgroup/cgroup.c |   19 +++
 2 files changed, 21 insertions(+)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 05b4688cf949..892362bde6b1 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -962,6 +962,8 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
 #ifdef CONFIG_VE
 int ve_hide_cgroups(struct cgroup_root *root);
 struct ve_struct *get_curr_ve(void);
+struct cgroup_subsys_state *cgroup_get_e_ve_css(struct cgroup *cgrp,
+   struct cgroup_subsys *ss);
 #endif
 
 #endif /* _LINUX_CGROUP_H */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index bb9bce3de45a..c08497c7eb5d 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -610,6 +610,25 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup 
*cgrp,
 }
 EXPORT_SYMBOL_GPL(cgroup_get_e_css);
 
+#ifdef CONFIG_VE
+struct cgroup_subsys_state *cgroup_get_e_ve_css(struct cgroup *cgrp,
+   struct cgroup_subsys *ss)
+{
+   struct cgroup_subsys_state *css;
+   struct ve_struct *ve;
+
+   rcu_read_lock();
+
+   ve = cgroup_ve_owner(cgrp);
+   if (!ve)
+   ve = get_ve0();
+   css = ve_get_init_css(ve, ss->id);
+
+   rcu_read_unlock();
+   return css;
+}
+#endif
+
 static void cgroup_get_live(struct cgroup *cgrp)
 {
WARN_ON_ONCE(cgroup_is_dead(cgrp));


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 0/2] part23 part2

2021-10-13 Thread Kirill Tkhai




---

Andrey Zhadchenko (2):
  kernel/cgroup: implement cgroup_get_e_ve_css
  mm/backing-dev: associate writeback with correct blkcg


 include/linux/cgroup.h |2 ++
 kernel/cgroup/cgroup.c |   19 +++
 mm/backing-dev.c   |   22 --
 3 files changed, 41 insertions(+), 2 deletions(-)

--
Signed-off-by: Kirill Tkhai 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9] ploop: Introduce option "off=" to cut beginning of device

2021-10-12 Thread Kirill Tkhai

This is like the functionality of dm-linear.

https://jira.sw.ru/browse/PSBM-132445

Signed-off-by: Kirill Tkhai 
---
 drivers/md/dm-ploop-map.c|9 +++--
 drivers/md/dm-ploop-target.c |   28 
 drivers/md/dm-ploop.h|1 +
 3 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index e3cf0ab73c98..79142acddecc 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -42,6 +42,11 @@ static unsigned int pio_nr_segs(struct pio *pio)
 return nr_segs;
 }
 
+static sector_t ploop_rq_pos(struct ploop *ploop, struct request *rq)
+{
+   return blk_rq_pos(rq) + ploop->skip_off;
+}
+
 void ploop_index_wb_init(struct ploop_index_wb *piwb, struct ploop *ploop)
 {
piwb->ploop = ploop;
@@ -85,7 +90,7 @@ void init_pio(struct ploop *ploop, unsigned int bi_op, struct 
pio *pio)
 /* Get clu related to pio sectors */
 static int ploop_rq_valid(struct ploop *ploop, struct request *rq)
 {
-   sector_t sector = blk_rq_pos(rq);
+   sector_t sector = ploop_rq_pos(ploop, rq);
loff_t end_byte;
u32 end_clu;
 
@@ -1651,7 +1656,6 @@ static void prepare_one_embedded_pio(struct ploop *ploop, 
struct pio *pio,
goto err_nomem;
prq->bvec = bvec;
 skip_bvec:
-   pio->bi_iter.bi_sector = blk_rq_pos(rq);
pio->bi_iter.bi_size = blk_rq_bytes(rq);
pio->bi_iter.bi_idx = 0;
pio->bi_iter.bi_bvec_done = 0;
@@ -1661,6 +1665,7 @@ static void prepare_one_embedded_pio(struct ploop *ploop, 
struct pio *pio,
 
pio->bi_iter = rq->bio->bi_iter;
}
+   pio->bi_iter.bi_sector = ploop_rq_pos(ploop, rq);
pio->bi_io_vec = bvec;
 
pio->queue_list_id = PLOOP_LIST_DEFERRED;
diff --git a/drivers/md/dm-ploop-target.c b/drivers/md/dm-ploop-target.c
index ec0efddef2ac..327095f75359 100644
--- a/drivers/md/dm-ploop-target.c
+++ b/drivers/md/dm-ploop-target.c
@@ -389,16 +389,26 @@ static int ploop_ctr(struct dm_target *ti, unsigned int 
argc, char **argv)
goto err;
}
 
-   /* Optional parameter */
-   if (strcmp(argv[0], "falloc_new_clu") == 0) {
-   if (argc < 2) {
-   ret = -EINVAL;
-   goto err;
+   ret = -EINVAL;
+   /* Optional parameters */
+   while (argc > 0) {
+   if (strcmp(argv[0], "falloc_new_clu") == 0) {
+   ploop->falloc_new_clu = true;
+   EAT_ARG(argc, argv);
+   continue;
+   }
+   if (strncmp(argv[0], "off=", 4) == 0) {
+   if (kstrtou64(argv[0] + 4, 10, &ploop->skip_off) < 0)
+   goto err;
+   EAT_ARG(argc, argv);
+   continue;
}
-   ploop->falloc_new_clu = true;
-   EAT_ARG(argc, argv);
+   break;
}
 
+   if (argc <= 0)
+   goto err;
+
ret = ploop_add_deltas_stack(ploop, &argv[0], argc);
if (ret)
goto err;
@@ -435,7 +445,7 @@ static void ploop_status(struct dm_target *ti, 
status_type_t type,
 unsigned int maxlen)
 {
struct ploop *ploop = ti->private;
-   char stat[16] = { 0 }, *p = stat;
+   char stat[32] = { 0 }, *p = stat;
ssize_t sz = 0;
 
down_read(&ploop->ctl_rwsem);
@@ -447,6 +457,8 @@ static void ploop_status(struct dm_target *ti, 
status_type_t type,
p += sprintf(p, "n");
if (p == stat)
p += sprintf(p, "o");
+   if (ploop->skip_off)
+   p += sprintf(p, " off=%llu", ploop->skip_off);
up_read(&ploop->ctl_rwsem);
 
BUG_ON(p - stat >= sizeof(stat));
diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h
index 8de2a28b2dec..a7ca942c4670 100644
--- a/drivers/md/dm-ploop.h
+++ b/drivers/md/dm-ploop.h
@@ -148,6 +148,7 @@ struct ploop {
bool falloc_new_clu; /* fallocate() instead of truncate() */
u32 nr_bat_entries;
unsigned int cluster_log; /* In sectors */
+   sector_t skip_off; /* To cut beginning of ploop device */
 
u8 m_Sig[16]; /* Signature */
u32 m_Type; /* Disk type */


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH8] ploop: Introduce option "off=" to cut beginning of device

2021-10-12 Thread Kirill Tkhai

This is like the functionality of dm-linear.

https://jira.sw.ru/browse/PSBM-132445

Signed-off-by: Kirill Tkhai 
---
 drivers/md/dm-ploop-map.c|9 +++--
 drivers/md/dm-ploop-target.c |   28 
 drivers/md/dm-ploop.h|1 +
 3 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index e3cf0ab73c98..79142acddecc 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -42,6 +42,11 @@ static unsigned int pio_nr_segs(struct pio *pio)
 return nr_segs;
 }
 
+static sector_t ploop_rq_pos(struct ploop *ploop, struct request *rq)
+{
+   return blk_rq_pos(rq) + ploop->skip_off;
+}
+
 void ploop_index_wb_init(struct ploop_index_wb *piwb, struct ploop *ploop)
 {
piwb->ploop = ploop;
@@ -85,7 +90,7 @@ void init_pio(struct ploop *ploop, unsigned int bi_op, struct 
pio *pio)
 /* Get clu related to pio sectors */
 static int ploop_rq_valid(struct ploop *ploop, struct request *rq)
 {
-   sector_t sector = blk_rq_pos(rq);
+   sector_t sector = ploop_rq_pos(ploop, rq);
loff_t end_byte;
u32 end_clu;
 
@@ -1651,7 +1656,6 @@ static void prepare_one_embedded_pio(struct ploop *ploop, 
struct pio *pio,
goto err_nomem;
prq->bvec = bvec;
 skip_bvec:
-   pio->bi_iter.bi_sector = blk_rq_pos(rq);
pio->bi_iter.bi_size = blk_rq_bytes(rq);
pio->bi_iter.bi_idx = 0;
pio->bi_iter.bi_bvec_done = 0;
@@ -1661,6 +1665,7 @@ static void prepare_one_embedded_pio(struct ploop *ploop, 
struct pio *pio,
 
pio->bi_iter = rq->bio->bi_iter;
}
+   pio->bi_iter.bi_sector = ploop_rq_pos(ploop, rq);
pio->bi_io_vec = bvec;
 
pio->queue_list_id = PLOOP_LIST_DEFERRED;
diff --git a/drivers/md/dm-ploop-target.c b/drivers/md/dm-ploop-target.c
index ec0efddef2ac..327095f75359 100644
--- a/drivers/md/dm-ploop-target.c
+++ b/drivers/md/dm-ploop-target.c
@@ -389,16 +389,26 @@ static int ploop_ctr(struct dm_target *ti, unsigned int 
argc, char **argv)
goto err;
}
 
-   /* Optional parameter */
-   if (strcmp(argv[0], "falloc_new_clu") == 0) {
-   if (argc < 2) {
-   ret = -EINVAL;
-   goto err;
+   ret = -EINVAL;
+   /* Optional parameters */
+   while (argc > 0) {
+   if (strcmp(argv[0], "falloc_new_clu") == 0) {
+   ploop->falloc_new_clu = true;
+   EAT_ARG(argc, argv);
+   continue;
+   }
+   if (strncmp(argv[0], "off=", 4) == 0) {
+   if (kstrtou64(argv[0] + 4, 10, &ploop->skip_off) < 0)
+   goto err;
+   EAT_ARG(argc, argv);
+   continue;
}
-   ploop->falloc_new_clu = true;
-   EAT_ARG(argc, argv);
+   break;
}
 
+   if (argc <= 0)
+   goto err;
+
ret = ploop_add_deltas_stack(ploop, &argv[0], argc);
if (ret)
goto err;
@@ -435,7 +445,7 @@ static void ploop_status(struct dm_target *ti, 
status_type_t type,
 unsigned int maxlen)
 {
struct ploop *ploop = ti->private;
-   char stat[16] = { 0 }, *p = stat;
+   char stat[32] = { 0 }, *p = stat;
ssize_t sz = 0;
 
down_read(&ploop->ctl_rwsem);
@@ -447,6 +457,8 @@ static void ploop_status(struct dm_target *ti, 
status_type_t type,
p += sprintf(p, "n");
if (p == stat)
p += sprintf(p, "o");
+   if (ploop->skip_off)
+   p += sprintf(p, " off=%llu", ploop->skip_off);
up_read(&ploop->ctl_rwsem);
 
BUG_ON(p - stat >= sizeof(stat));
diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h
index 8de2a28b2dec..a7ca942c4670 100644
--- a/drivers/md/dm-ploop.h
+++ b/drivers/md/dm-ploop.h
@@ -148,6 +148,7 @@ struct ploop {
bool falloc_new_clu; /* fallocate() instead of truncate() */
u32 nr_bat_entries;
unsigned int cluster_log; /* In sectors */
+   sector_t skip_off; /* To cut beginning of ploop device */
 
u8 m_Sig[16]; /* Signature */
u32 m_Type; /* Disk type */


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 4/5] cgroup/net_prio: virtualize ifpriomap per-ve

2021-10-12 Thread Kirill Tkhai

From: Pavel Tikhomirov 

Ifpriomap is a map of net_prio cgroup id to device prio. Each process is
in some netprio cgroup and all sockets of this process have the prio cgroup
id of this cgroup. When a packet from such a socket goes through the network
stack we choose the priority for the packet on each device we go through based
on this device+id->prio map.

Previously we were able to set map for each net_prio cgroup on the
system, but only for devices of host init network namespace. This patch
adds mapping for ve init netns devices. VE can only get/change device
map for ve init netns, Host can only get/change device map for host's
init netns.

We can have for same cgroup both mappings setup by host for host net
devices and mappings setup by ve for ve net devices.

When new cgroup is created it either copies only mappings for host
network devices if done from host, or copies also mappings for ve
network devices if done from ve.

If ve is not running (ve_ns is NULL), even while in ve we would operate
with host ifpriomap.

https://jira.sw.ru/browse/PSBM-123766

Signed-off-by: Pavel Tikhomirov 


cgroup: ifpriomap virtualization

I've also added a get_curr_ve() helper, as it looks like in many places we
rely on get_exec_env() giving us a ve which will not be freed under us, but
all processes can easily be moved out of this ve in parallel and the ve can
then be freed, AFAICS.

https://jira.sw.ru/browse/PSBM-123766

Signed-off-by: Kirill Tkhai 
---
 net/core/netprio_cgroup.c |   73 -
 1 file changed, 71 insertions(+), 2 deletions(-)

diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 99a431c56f23..0ab8c37c42b8 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -145,6 +146,7 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
 static int cgrp_css_online(struct cgroup_subsys_state *css)
 {
struct cgroup_subsys_state *parent_css = css->parent;
+   struct ve_struct *ve;
struct net_device *dev;
int ret = 0;
 
@@ -166,6 +168,38 @@ static int cgrp_css_online(struct cgroup_subsys_state *css)
if (ret)
break;
}
+
+   /* get_exec_env is safe under cgroup_mutex */
+   ve = get_exec_env();
+   /*
+* Inherit prios from the parent cgroup in scope of ve init netns.
+*/
+   if (!ve_is_super(ve)) {
+   struct nsproxy *ve_ns;
+   struct net *net = NULL;
+
+   /*
+* Take rcu read lock to check that ve's net is not freed under
+* us after we release rcu read lock we still have rtnl lock to
+* insure net remains non-freed, pairs with rtnl lock in
+* cleanup_net().
+*/
+   rcu_read_lock();
+   ve_ns = rcu_dereference(ve->ve_ns);
+   if (ve_ns)
+   net = ve_ns->net_ns;
+   rcu_read_unlock();
+
+   if (net && net != &init_net) {
+   for_each_netdev(net, dev) {
+   u32 prio = netprio_prio(parent_css, dev);
+
+   ret = netprio_set_prio(css, dev, prio);
+   if (ret)
+   break;
+   }
+   }
+   }
rtnl_unlock();
return ret;
 }
@@ -182,19 +216,38 @@ static u64 read_prioidx(struct cgroup_subsys_state *css, 
struct cftype *cft)
 
 static int read_priomap(struct seq_file *sf, void *v)
 {
+   struct ve_struct *ve;
+   struct net *net, *_net = NULL;
struct net_device *dev;
 
+   ve = get_curr_ve();
+   if (!ve_is_super(ve)) {
+   struct nsproxy *ve_ns;
+
+   rcu_read_lock();
+   ve_ns = rcu_dereference(ve->ve_ns);
+   if (ve_ns)
+   _net = get_net(ve_ns->net_ns);
+   rcu_read_unlock();
+   }
+   put_ve(ve);
+
+   net = _net ? : &init_net;
rcu_read_lock();
-   for_each_netdev_rcu(&init_net, dev)
+   for_each_netdev_rcu(net, dev)
seq_printf(sf, "%s %u\n", dev->name,
   netprio_prio(seq_css(sf), dev));
rcu_read_unlock();
+   if (_net)
+   put_net(_net);
return 0;
 }
 
 static ssize_t write_priomap(struct kernfs_open_file *of,
 char *buf, size_t nbytes, loff_t off)
 {
+   struct ve_struct *ve;
+   struct net *net, *_net = NULL;
char devname[IFNAMSIZ + 1];
struct net_device *dev;
u32 prio;
@@ -203,7 +256,22 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)

[Devel] [PATCH RH9 3/5] ve: get_curr_ve: first try getting ve with rcu lock

2021-10-12 Thread Kirill Tkhai

From: Pavel Tikhomirov 

By holding the rcu lock we can have a valid ve pointer. Next, using css_tryget
we can get a reference on the ve cgroup if its destruction has not started yet.
In case the cgroup is already being destroyed, retry with cgroup_mutex.

https://jira.sw.ru/browse/PSBM-123766

Signed-off-by: Pavel Tikhomirov 


cgroup: ifpriomap virtualization

I've also added get_curr_ve() helper as it looks like in many places we
rely that get_exec_env() gives us ve which would not free under us, but
all processes can be moved easily from this ve in parallel and ve can be
freed AFAICS.

https://jira.sw.ru/browse/PSBM-123766

Signed-off-by: Kirill Tkhai 
---
 kernel/cgroup/cgroup.c |   25 +++--
 kernel/ve/ve.c |2 +-
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 04a5e1effbaf..05fe9436a9a3 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1929,8 +1929,29 @@ struct ve_struct *get_curr_ve(void)
struct ve_struct *ve;
 
/*
-* Under cgroup_mutex both current tasks ve cgroup and ->task_ve
-* pointer can't change. Corresponding cgroup_mutex around
+* If first thread loads current->task_ve pointer, and if just after
+* that current is moved by other thread from this ve cgroup to some
+* other and this ve cgroup gets destroyed, ve pointer gets freed, so
+* first thread can't use such ve pointer safely.
+*/
+
+   /*
+* Fast path: Let's make it safe with rcu lock, though current can be
+* moved to other ve cgroup and our ve cgroup can start destroying, ve
+* pointer would be still valid. As it is freed in ve_destroy. And
+* ve_destroy is called from rcu callback after task_ve had changed.
+*/
+   rcu_read_lock();
+   ve = rcu_dereference(current->task_ve);
+   if (css_tryget(&ve->css)) {
+   rcu_read_unlock();
+   return ve;
+   }
+   rcu_read_unlock();
+
+   /*
+* Slow path: Under cgroup_mutex both current tasks ve cgroup and
+* task_ve pointer can't change. Corresponding cgroup_mutex around
 * cgroup_attach_task() protects us from it.
 */
mutex_lock(&cgroup_mutex);
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index af46a9b597df..ba5a3a63acec 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -900,7 +900,7 @@ static void ve_attach(struct cgroup_taskset *tset)
if (cpuid_override_on())
set_tsk_thread_flag(task, TIF_CPUID_OVERRIDE);
 
-   task->task_ve = ve;
+   rcu_assign_pointer(task->task_ve, ve);
}
 }
 


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 2/5] ve: add get_curr_ve helper

2021-10-12 Thread Kirill Tkhai

From: Pavel Tikhomirov 

This helper is a safe alternative to get_exec_env(), this helper
actually gets reference on current ve so if from other thread current
would be moved from this ve, at least this ve would not be freed under
us.

https://jira.sw.ru/browse/PSBM-123766

Signed-off-by: Pavel Tikhomirov 


cgroup: ifpriomap virtualization

I've also added get_curr_ve() helper as it looks like in many places we
rely that get_exec_env() gives us ve which would not free under us, but
all processes can be moved easily from this ve in parallel and ve can be
freed AFAICS.

https://jira.sw.ru/browse/PSBM-123766
Signed-off-by: Kirill Tkhai 
---
 include/linux/cgroup.h |1 +
 kernel/cgroup/cgroup.c |   21 +
 2 files changed, 22 insertions(+)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 6e9aca26313a..99bd069a476d 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -960,6 +960,7 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
 
 #ifdef CONFIG_VE
 int ve_hide_cgroups(struct cgroup_root *root);
+struct ve_struct *get_curr_ve(void);
 #endif
 
 #endif /* _LINUX_CGROUP_H */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 217846841fda..04a5e1effbaf 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1919,6 +1919,27 @@ static int cgroup_reconfigure(struct fs_context *fc)
 }
 
 #ifdef CONFIG_VE
+/*
+ * This helper is a safe alternative to get_exec_env(), this helper actually
+ * gets reference on current ve so if in other thread we would be moved from
+ * this ve, at least this ve would not be freed under us.
+ */
+struct ve_struct *get_curr_ve(void)
+{
+   struct ve_struct *ve;
+
+   /*
+* Under cgroup_mutex both current tasks ve cgroup and ->task_ve
+* pointer can't change. Corresponding cgroup_mutex around
+* cgroup_attach_task() protects us from it.
+*/
+   mutex_lock(&cgroup_mutex);
+   ve = get_ve(current->task_ve);
+   mutex_unlock(&cgroup_mutex);
+
+   return ve;
+}
+
 void cgroup_mark_ve_root(struct ve_struct *ve)
 {
struct cgrp_cset_link *link;


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 5/5] ve/fs/inotify: do not impose limit on the number of instances by default

2021-10-12 Thread Kirill Tkhai

From: Vladimir Davydov 

In Vz7 we haven't switched to user ns yet. As a result, all containers
use the same user_struct for the same user id. This leads to hitting
fs.inotify.max_user_instances sysctl limit quickly (it equals 128 by
default) and failing to start a container. This patch sets the default
limit to INT_MAX. This is a temporary solution and should be reverted
once we start using user ns.

In PCS6 there is no such problem, because we actually create a user ns
per container there. Although its functionality is basic in comparison
to Vz7, it still results in creating a new user_struct for each user
inside a container so that the inotify limit is containerized.

https://jira.sw.ru/browse/PSBM-39048

Signed-off-by: Vladimir Davydov 

khorenko@: to be reverted once we support userns in Virtuozzo 7
(cherry picked from 78c91a02de6b6f0423e12e12128f9433934d7c61)
Signed-off-by: Valeriy Vdovin

https://jira.sw.ru/browse/PSBM-131634
Signed-off-by: Valeriy Vdovin 

khorenko@:
TODO: we have to review all places along this path of using inotifies
and make sure all allocations are accounted to Containers.
Signed-off-by: Kirill Tkhai 
---
 fs/notify/inotify/inotify_user.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 62051247f6d2..d30a459136b6 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -847,8 +847,8 @@ static int __init inotify_user_setup(void)
   SLAB_PANIC|SLAB_ACCOUNT);
 
inotify_max_queued_events = 16384;
-   init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128;
-   init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = watches_max;
+   init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = INT_MAX;
+   init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = INT_MAX;
 
return 0;
 }


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 1/5] shm: skip shm_destroy if task IPC namespace was changed

2021-10-12 Thread Kirill Tkhai

From: Alexander Mikhalitsyn 

Fixes: ab602f79915 ("shm: make exit_shm work proportional to task activity")
(ms commit)

https://jira.sw.ru/browse/PSBM-131142

Signed-off-by: Alexander Mikhalitsyn 
Signed-off-by: Kirill Tkhai 
---
 ipc/shm.c |   10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/ipc/shm.c b/ipc/shm.c
index ab749be6d8b7..fb4e58375802 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -173,6 +173,14 @@ static inline struct shmid_kernel 
*shm_obtain_object_check(struct ipc_namespace
return container_of(ipcp, struct shmid_kernel, shm_perm);
 }
 
+static inline bool is_shm_in_ns(struct ipc_namespace *ns, struct shmid_kernel 
*shp)
+{
+   int idx = ipcid_to_idx(shp->shm_perm.id);
+   struct shmid_kernel *tshp = shm_obtain_object(ns, idx);
+
+   return !IS_ERR(tshp) && tshp == shp;
+}
+
 /*
  * shm_lock_(check_) routines are called in the paths where the rwsem
  * is not necessarily held.
@@ -415,7 +423,7 @@ void exit_shm(struct task_struct *task)
list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
shp->shm_creator = NULL;
 
-   if (shm_may_destroy(ns, shp)) {
+   if (is_shm_in_ns(ns, shp) && shm_may_destroy(ns, shp)) {
shm_lock_by_ptr(shp);
shm_destroy(ns, shp);
}


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 v2 0/5] part23 part

2021-10-12 Thread Kirill Tkhai

---

Alexander Mikhalitsyn (1):
  shm: skip shm_destroy if task IPC namespace was changed

Pavel Tikhomirov (3):
  ve: add get_curr_ve helper
  ve: get_curr_ve: first try getting ve with rcu lock
  cgroup/net_prio: virtualize ifpriomap per-ve

Vladimir Davydov (1):
  ve/fs/inotify: do not impose limit on the number of instances by default


 fs/notify/inotify/inotify_user.c |4 +-
 include/linux/cgroup.h   |1 +
 ipc/shm.c|   10 +
 kernel/cgroup/cgroup.c   |   42 ++
 kernel/ve/ve.c   |2 +
 net/core/netprio_cgroup.c|   73 +-
 6 files changed, 126 insertions(+), 6 deletions(-)

--
Signed-off-by: Kirill Tkhai 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 1/3] shm: skip shm_destroy if task IPC namespace was changed

2021-10-11 Thread Kirill Tkhai

From: Alexander Mikhalitsyn 

Fixes: ab602f79915 ("shm: make exit_shm work proportional to task activity")
(ms commit)

https://jira.sw.ru/browse/PSBM-131142

Signed-off-by: Alexander Mikhalitsyn 
Signed-off-by: Kirill Tkhai 
---
 ipc/shm.c |   10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/ipc/shm.c b/ipc/shm.c
index ab749be6d8b7..fb4e58375802 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -173,6 +173,14 @@ static inline struct shmid_kernel 
*shm_obtain_object_check(struct ipc_namespace
return container_of(ipcp, struct shmid_kernel, shm_perm);
 }
 
+static inline bool is_shm_in_ns(struct ipc_namespace *ns, struct shmid_kernel 
*shp)
+{
+   int idx = ipcid_to_idx(shp->shm_perm.id);
+   struct shmid_kernel *tshp = shm_obtain_object(ns, idx);
+
+   return !IS_ERR(tshp) && tshp == shp;
+}
+
 /*
  * shm_lock_(check_) routines are called in the paths where the rwsem
  * is not necessarily held.
@@ -415,7 +423,7 @@ void exit_shm(struct task_struct *task)
list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
shp->shm_creator = NULL;
 
-   if (shm_may_destroy(ns, shp)) {
+   if (is_shm_in_ns(ns, shp) && shm_may_destroy(ns, shp)) {
shm_lock_by_ptr(shp);
shm_destroy(ns, shp);
}


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 0/3] part23 part

2021-10-11 Thread Kirill Tkhai

https://jira.sw.ru/browse/PSBM-134015

---

Alexander Mikhalitsyn (1):
  shm: skip shm_destroy if task IPC namespace was changed

Pavel Tikhomirov (1):
  cgroup/net_prio: virtualize ifpriomap per-ve

Vladimir Davydov (1):
  commit 22b5a8a84548


 fs/notify/inotify/inotify_user.c |4 +-
 ipc/shm.c|   10 +
 net/core/netprio_cgroup.c|   73 +-
 3 files changed, 82 insertions(+), 5 deletions(-)

--
Signed-off-by: Kirill Tkhai 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 2/3] cgroup/net_prio: virtualize ifpriomap per-ve

2021-10-11 Thread Kirill Tkhai

From: Pavel Tikhomirov 

Ifpriomap is a map of net_prio cgroup id to device prio. Each process is
in some netprio cgroup and all sockets of this process have prio cgroup
id of this cgroup. When packet from such socket goes through network
stack we choose priority for packet on each device we go through based
on these device+id->prio map.

Previously we were able to set map for each net_prio cgroup on the
system, but only for devices of host init network namespace. This patch
adds mapping for ve init netns devices. VE can only get/change device
map for ve init netns, Host can only get/change device map for host's
init netns.

We can have for same cgroup both mappings setup by host for host net
devices and mappings setup by ve for ve net devices.

When new cgroup is created it either copies only mappings for host
network devices if done from host, or copies also mappings for ve
network devices if done from ve.

If ve is not running (ve_ns is NULL), even while in ve we would operate
with host ifpriomap.

https://jira.sw.ru/browse/PSBM-123766

Signed-off-by: Pavel Tikhomirov 


cgroup: ifpriomap virtualization

I've also added get_curr_ve() helper as it looks like in many places we
rely that get_exec_env() gives us ve which would not free under us, but
all processes can be moved easily from this ve in parallel and ve can be
freed AFAICS.

https://jira.sw.ru/browse/PSBM-123766

Signed-off-by: Kirill Tkhai 
---
 net/core/netprio_cgroup.c |   73 -
 1 file changed, 71 insertions(+), 2 deletions(-)

diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 99a431c56f23..0ab8c37c42b8 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -145,6 +146,7 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
 static int cgrp_css_online(struct cgroup_subsys_state *css)
 {
struct cgroup_subsys_state *parent_css = css->parent;
+   struct ve_struct *ve;
struct net_device *dev;
int ret = 0;
 
@@ -166,6 +168,38 @@ static int cgrp_css_online(struct cgroup_subsys_state *css)
if (ret)
break;
}
+
+   /* get_exec_env is safe under cgroup_mutex */
+   ve = get_exec_env();
+   /*
+* Inherit prios from the parent cgroup in scope of ve init netns.
+*/
+   if (!ve_is_super(ve)) {
+   struct nsproxy *ve_ns;
+   struct net *net = NULL;
+
+   /*
+* Take rcu read lock to check that ve's net is not freed under
+* us after we release rcu read lock we still have rtnl lock to
+* insure net remains non-freed, pairs with rtnl lock in
+* cleanup_net().
+*/
+   rcu_read_lock();
+   ve_ns = rcu_dereference(ve->ve_ns);
+   if (ve_ns)
+   net = ve_ns->net_ns;
+   rcu_read_unlock();
+
+   if (net && net != &init_net) {
+   for_each_netdev(net, dev) {
+   u32 prio = netprio_prio(parent_css, dev);
+
+   ret = netprio_set_prio(css, dev, prio);
+   if (ret)
+   break;
+   }
+   }
+   }
rtnl_unlock();
return ret;
 }
@@ -182,19 +216,38 @@ static u64 read_prioidx(struct cgroup_subsys_state *css, 
struct cftype *cft)
 
 static int read_priomap(struct seq_file *sf, void *v)
 {
+   struct ve_struct *ve;
+   struct net *net, *_net = NULL;
struct net_device *dev;
 
+   ve = get_curr_ve();
+   if (!ve_is_super(ve)) {
+   struct nsproxy *ve_ns;
+
+   rcu_read_lock();
+   ve_ns = rcu_dereference(ve->ve_ns);
+   if (ve_ns)
+   _net = get_net(ve_ns->net_ns);
+   rcu_read_unlock();
+   }
+   put_ve(ve);
+
+   net = _net ? : &init_net;
rcu_read_lock();
-   for_each_netdev_rcu(&init_net, dev)
+   for_each_netdev_rcu(net, dev)
seq_printf(sf, "%s %u\n", dev->name,
   netprio_prio(seq_css(sf), dev));
rcu_read_unlock();
+   if (_net)
+   put_net(_net);
return 0;
 }
 
 static ssize_t write_priomap(struct kernfs_open_file *of,
 char *buf, size_t nbytes, loff_t off)
 {
+   struct ve_struct *ve;
+   struct net *net, *_net = NULL;
char devname[IFNAMSIZ + 1];
struct net_device *dev;
u32 prio;
@@ -203,7 +256,22 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)

[Devel] [PATCH RH9 3/3] commit 22b5a8a84548

2021-10-11 Thread Kirill Tkhai

From: Vladimir Davydov 

ve/fs/inotify: do not impose limit on the number of instances by default

In Vz7 we haven't switched to user ns yet. As a result, all containers
use the same user_struct for the same user id. This leads to hitting
fs.inotify.max_user_instances sysctl limit quickly (it equals 128 by
default) and failing to start a container. This patch sets the default
limit to INT_MAX. This is a temporary solution and should be reverted
once we start using user ns.

In PCS6 there is no such problem, because we actually create a user ns
per container there. Although its functionality is basic in comparison
to Vz7, it still results in creating a new user_struct for each user
inside a container so that the inotify limit is containerized.

https://jira.sw.ru/browse/PSBM-39048

Signed-off-by: Vladimir Davydov 

khorenko@: to be reverted once we support userns in Virtuozzo 7
(cherry picked from 78c91a02de6b6f0423e12e12128f9433934d7c61)
Signed-off-by: Valeriy Vdovin

https://jira.sw.ru/browse/PSBM-131634
Signed-off-by: Valeriy Vdovin 

khorenko@:
TODO: we have to review all places along this path of using inotifies
and make sure all allocations are accounted to Containers.
---
 fs/notify/inotify/inotify_user.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 62051247f6d2..d30a459136b6 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -847,8 +847,8 @@ static int __init inotify_user_setup(void)
   SLAB_PANIC|SLAB_ACCOUNT);
 
inotify_max_queued_events = 16384;
-   init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128;
-   init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = watches_max;
+   init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = INT_MAX;
+   init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = INT_MAX;
 
return 0;
 }


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9] ve/devtmpfs: lightweight virtualization

2021-10-11 Thread Kirill Tkhai

From: Stanislav Kinsburskiy 

Due to changes in RH8.4 we need to rework it; actually the logic
becomes much simpler: we mount/umount a single tmpfs per ve on cgroup
creation/removal, and all actual devtmpfs mount calls only increase a
refcount on the corresponding ve's mount, like with the host's devtmpfs.

Original commit message:

Previously, we implemented full-featured devtmpfs virtualization for
VE: when a device is created in a VE "namespace", we send a signal to
kdevtmpfs to create the devnode on devtmpfs mount corresponding to the
VE. This seems to be over-complicated: all this work can be done from
userspace, because we only have a hardcoded list of devices created
exclusively for VE on container start. Those are tty-related stuff and
mem devices, and we only need the latter to create devtmpfs nodes.
Moreover, it is buggy: ve_stop_ns, which destroys VE devtmpfs mount can
be called before a VE tty device is unregistered, resulting in a KP:

https://jira.sw.ru/browse/PSBM-35077

This patch therefore simplified it. It makes the kernel only provide a
single empty tmpfs mount per VE, which appears on an attempt to mount
devtmpfs from inside a VE. The content of the fs is to be filled by the
userspace on container start, which will be done in the scope of

https://jira.sw.ru/browse/PSBM-35146

All this patch does is provides each VE with its own empty single tmpfs
mount, which appears on an attempt to mount "devtmpfs". It's up to the
userspace to populate this fs on container start, all kernel requests to
create a device node inside a VE are ignored.

Signed-off-by: Vladimir Davydov 
Signed-off-by: Stanislav Kinsburskiy 

https://jira.sw.ru/browse/PSBM-131158

Signed-off-by: Pavel Tikhomirov 

v2 by khorenko@: s/FS_USERNS_MOUNT/FS_VE_MOUNT/
Signed-off-by: Kirill Tkhai 
---
 drivers/base/devtmpfs.c |   24 
 include/linux/device.h  |2 ++
 include/linux/ve.h  |2 ++
 kernel/ve/ve.c  |6 ++
 4 files changed, 34 insertions(+)

diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c
index 8be352ab4ddb..b3a3cbe65daa 100644
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "base.h"
 
 static struct task_struct *thread;
@@ -59,6 +60,12 @@ static struct dentry *public_dev_mount(struct 
file_system_type *fs_type, int fla
  const char *dev_name, void *data)
 {
struct super_block *s = mnt->mnt_sb;
+#ifdef CONFIG_VE
+   struct ve_struct *ve = get_exec_env();
+
+   if (!ve_is_super(ve))
+   s = ve->devtmpfs_mnt->mnt_sb;
+#endif
atomic_inc(&s->s_active);
down_write(&s->s_umount);
return dget(s->s_root);
@@ -79,6 +86,7 @@ static struct file_system_type internal_fs_type = {
 static struct file_system_type dev_fs_type = {
.name = "devtmpfs",
.mount = public_dev_mount,
+   .fs_flags = FS_VIRTUALIZED | FS_VE_MOUNT,
 };
 
 #ifdef CONFIG_BLOCK
@@ -438,6 +446,22 @@ static int __ref devtmpfsd(void *p)
return 0;
 }
 
+int ve_mount_devtmpfs(struct ve_struct *ve)
+{
+   char opts[] = "mode=0755";
+   struct vfsmount *mnt;
+
+   mnt = vfs_kern_mount(&internal_fs_type, 0, "devtmpfs", opts);
+   if (IS_ERR(mnt)) {
+   printk(KERN_ERR "CT#%s: devtmpfs: unable to create devtmpfs 
%ld\n",
+  ve_name(ve), PTR_ERR(mnt));
+   return PTR_ERR(mnt);
+   }
+   ve->devtmpfs_mnt = mnt;
+
+   return 0;
+}
+
 /*
  * Create devtmpfs instance, driver-core devices will add their device
  * nodes here.
diff --git a/include/linux/device.h b/include/linux/device.h
index 65d84b67b024..8b1511b1af44 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -950,8 +950,10 @@ bool kill_device(struct device *dev);
 
 #ifdef CONFIG_DEVTMPFS
 int devtmpfs_mount(void);
+extern int ve_mount_devtmpfs(struct ve_struct *ve);
 #else
 static inline int devtmpfs_mount(void) { return 0; }
+static inline int ve_mount_devtmpfs(struct ve_struct *ve) { return 0; }
 #endif
 
 /* drivers/base/power/shutdown.c */
diff --git a/include/linux/ve.h b/include/linux/ve.h
index ffe068ec5fe7..e8514c5a0afb 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -26,6 +26,7 @@ struct nsproxy;
 struct veip_struct;
 struct user_namespace;
 struct cn_private;
+struct vfsmount;
 
 struct ve_struct {
struct cgroup_subsys_state  css;
@@ -103,6 +104,7 @@ struct ve_struct {
unsigned long   aio_nr;
unsigned long   aio_max_nr;
 #endif
+   struct vfsmount *devtmpfs_mnt;
 };
 
 struct ve_devmnt {
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 38ede55d65b7..af46a9b597df 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -32,6 +32,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -7

Re: [Devel] [PATCH RH9 00/20] part 20 modules autoload

2021-10-08 Thread Kirill Tkhai

committed

On 08.10.2021 12:50, Kirill Tkhai wrote:
> ---
> 
> Andrey Ryabinin (2):
>   ve/kmod/whitelist: Allow ip6tables_raw modules autoload upon request 
> from CT
>   ve/kmod/whitelist: Allow nf_tables module autoloading on request from CT
> 
> Cyrill Gorcunov (1):
>   ve/kmod/whitelist: Add modules to whitelist for c/r sake
> 
> Kirill Tkhai (2):
>   ve/kmod/whitelist: Allow conntrack nft-helper-* modules autoloading
>   ve/kmod/whitelist: Allow ts_kmp module autoloading
> 
> Konstantin Khorenko (7):
>   ve/sysctl/kmod: Introduce tweak to allow indirect modules load from CT
>   ve/kmod/whitelist: Infrustructure for list of modules to autoload from 
> CT
>   ve/kmod: Honor modprobe blacklist on indirect modules autoload from CT
>   commit 04248b3ff00d
>   commit da8c1e2262f8
>   ve/kmod/whitelist: Allow nfnetlink_queue module autoload from CT
>   ve/kmod/whitelist: Allow "nft_compat" module autoload from inside a 
> Container
> 
> Pavel Tikhomirov (6):
>   ve/kmod/whitelist: Allow dummy module autoloading
>   ve/kmod/whitelist: Enable vxlan module autoload from inside a Container
>   ve/kmod/whitelist: Allow IPVS modules autoload in CT
>   ve/kmod/whitelist: Allow netfilter/ipset modules autoload from inside a 
> CT
>   ve/kmod/whitelist: make nfnetlink_log autoloadable upon request from a 
> CT
>   ve/kmod/whitelist: Make fib modules autoloadable from CT
> 
> Stanislav Kinsburskiy (1):
>   ve/kmod/whitelist: Allow NFS modules autoload in Containers
> 
> Vasily Averin (1):
>   ve/kmod/whitelist: Enable autoload for iptables security tables from 
> inside CT
> 
> 
>  include/linux/kmod.h   |5 +
>  include/linux/sysctl.h |2 
>  kernel/kmod.c  |  195 
> +++++---
>  kernel/sysctl.c|   16 
>  4 files changed, 207 insertions(+), 11 deletions(-)
> 
> --
> Signed-off-by: Kirill Tkhai 
> 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH RH9 0/8] part 29 vtty: vz console

2021-10-08 Thread Kirill Tkhai

On 07.10.2021 18:18, Cyrill Gorcunov wrote:
> Hi! Here is a ported vtty series, build and boot tested only obviously.
> I think we might revisit this code and rework more deeply once we manage
> to run containers inside so I would be able to manipulate vtty console
> from userspace level.
> 
> https://jira.sw.ru/browse/PSBM-134014

committed

> 
> Andrey Vagin (1):
>   ve/kbd: add file kbd_bind in sysfs, which allow unbind keyboard from
> tty (v2)
> 
> Cyrill Gorcunov (6):
>   ve/tty: Provide interface for current tty inheritance
>   ve/tty: vt -- Implement per VE support for console and terminals
>   ve/vtty: Don't zap termios fields on slave peer
>   ve/vtty: Make indices to match pcs6 scheme
>   ve/vtty: Don't close unread master peer if slave is nonzero
>   ve/vtty: Don't free console mapping until no clients left
> 
> Konstantin Khlebnikov (1):
>   ve/tty: TIOSAK Secure Attention Key ioctl
> 
>  arch/powerpc/include/uapi/asm/ioctls.h |   2 +
>  drivers/input/input.c  |   2 +-
>  drivers/tty/n_tty.c|   6 +
>  drivers/tty/pty.c  | 528 +
>  drivers/tty/tty_io.c   |  73 +++-
>  drivers/tty/vt/keyboard.c  |  74 +++-
>  include/linux/ve.h |  13 +-
>  include/uapi/asm-generic/ioctls.h  |   2 +
>  kernel/ve/ve.c |  91 +
>  kernel/ve/vecalls.c|   3 +
>  10 files changed, 781 insertions(+), 13 deletions(-)
> 
> 
> base-commit: dd9fd627ae5764d17efa1a432a7b771d65de1c71
> 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 19/20] ve/kmod/whitelist: make nfnetlink_log autoloadable upon request from a CT

2021-10-08 Thread Kirill Tkhai

From: Pavel Tikhomirov 

We see that k8s_weave-npc container fails with:

Thu Oct 29 09:19:53 2020 <5> ulogd.c:981 building new pluginstance stack: 
'log1:NFLOG,base1:BASE,pcap1:PCAP'
Thu Oct 29 09:19:53 2020 <7> ulogd_inppkt_NFLOG.c:552 unable to bind to log 
group 86
Thu Oct 29 09:19:53 2020 <7> ulogd.c:948 error starting `log1'
Thu Oct 29 09:19:53 2020 <8> ulogd.c:1597 not even a single working plugin stack
Fatal error.

It needs nfnetlink_log module to be loaded. Need this to be able to run
kubernetes in centos-8 containers where it uses nft logs.

https://jira.sw.ru/browse/PSBM-121652

Signed-off-by: Pavel Tikhomirov 

(cherry picked from vz7 commit fdec083048f8 ("ve/kmod: make
nfnetlink_log autoloadable upon request from a CT"))

Signed-off-by: Konstantin Khorenko 
---
 kernel/kmod.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index f3bd4afb81e1..1ff56543f59d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -254,6 +254,7 @@ static const char * const ve0_allowed_mod[] = {
"nfnetlink-subsys-1",   /* NFNL_SUBSYS_CTNETLINK */
"nfnetlink-subsys-2",   /* NFNL_SUBSYS_CTNETLINK_EXP */
"nfnetlink-subsys-3",   /* NFNL_SUBSYS_QUEUE */
+   "nfnetlink-subsys-4",   /* NFNL_SUBSYS_ULOG */
"nfnetlink-subsys-10",  /* nf_tables */
"nfnetlink-subsys-11",  /* nft_compat */
 


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 20/20] ve/kmod/whitelist: Make fib modules autoloadable from CT

2021-10-08 Thread Kirill Tkhai

From: Pavel Tikhomirov 

Need it for Docker nat rules c/r in an nft-based environment.

https://jira.sw.ru/browse/PSBM-125002

Signed-off-by: Pavel Tikhomirov 


ve/kmod: fix misprint in fib modules autoload allow rules

When testing criu to suspend resume "fib" rules I found out that we have
wrong names for fib module aliases, and thus can't load them on restore
if they are not yet loaded.

Perf shows when restoring centos 8 CT with docker:
probe:module_payload_iptable_allowed: module_string="nft-expr-2-fib"

https://jira.sw.ru/browse/PSBM-125002

Fixes: 84beb0e73874a ("ve/kmod: make fib modules autoloadable from CT")
Signed-off-by: Pavel Tikhomirov 

(cherry picked from vz7 commit f4eb6e8a5a78 ("ve/kmod: make fib modules
autoloadable from CT"))
Signed-off-by: Konstantin Khorenko 
---
 kernel/kmod.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 1ff56543f59d..678735dbb969 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -232,6 +232,7 @@ static const char * const ve0_allowed_mod[] = {
"nf_synproxy_core",
 
"nft-set",
+   "nft_fib",
"nf_tproxy_ipv4",
"nf_tproxy_ipv6",
 


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 18/20] ve/kmod/whitelist: Enable autoload for iptables security tables from inside CT

2021-10-08 Thread Kirill Tkhai

From: Vasily Averin 

Patch enables autoload of iptable_security and ip6table_security from
inside containers.

It decreases number of errors generated during firewalld start.

https://jira.sw.ru/browse/PSBM-98212

Signed-off-by: Vasily Averin 

(cherry picked from vz7 commit 77a471044478 ("ve/kmod: enable autoload
for iptables security tables from inside CT"))

Signed-off-by: Konstantin Khorenko 
---
 kernel/kmod.c |2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 3a445d4e2734..f3bd4afb81e1 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -206,10 +206,12 @@ static const char * const ve0_allowed_mod[] = {
"ip6_tables",
"iptable_filter",
"iptable_raw",
+   "iptable_security",
"iptable_nat",
"iptable_mangle",
"ip6table_filter",
"ip6table_raw",
+   "ip6table_security",
"ip6table_nat",
"ip6table_mangle",
 


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 15/20] ve/kmod/whitelist: Allow ts_kmp module autoloading

2021-10-08 Thread Kirill Tkhai

Otherwise rules like the one below can't be applied from inside a CT
when the module is not loaded.

$iptables -I FORWARD -m string --string "xx" --algo kmp --to 65535 -j DROP

https://jira.sw.ru/browse/PSBM-97729

Signed-off-by: Kirill Tkhai 

(cherry picked from vz7 commit
 2e3b2c332d41 ("ve/modules: Add ts_kmp to allowed modules"))

Signed-off-by: Konstantin Khorenko 
---
 kernel/kmod.c |3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index da0e72fe7de7..68aeed6587d6 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -298,6 +298,9 @@ static const char * const ve0_allowed_mod[] = {
"ip_vs_sh",
"ip_vs_lblcr",
"ip_vs_lc",
+
+   /* string */
+   "ts_kmp",
 };
 
 /*


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 10/20] ve/kmod/whitelist: Allow netfilter/ipset modules autoload from inside a CT

2021-10-08 Thread Kirill Tkhai

From: Pavel Tikhomirov 

I forgot to allow autoload of the needed modules from inside a CT, so allow:
ip_set_list_set
ip_set_hash_netiface
ip_set_hash_ipportnet
ip_set_hash_netport
ip_set_hash_net
ip_set_hash_ipportip
ip_set_hash_ipport
ip_set_hash_ip
ip_set_bitmap_port
ip_set_bitmap_ipmac
ip_set_bitmap_ip
ip_set

https://jira.sw.ru/browse/PSBM-46102

Signed-off-by: Pavel Tikhomirov 

(cherry picked from vz7 commit
 1af0b905877a ("ve/netfilter/ipset: allow modules autoload"))

Signed-off-by: Konstantin Khorenko 
---
 kernel/kmod.c |   14 ++
 1 file changed, 14 insertions(+)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index be0908452d7b..6acc4d943283 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -260,6 +260,20 @@ static const char * const ve0_allowed_mod[] = {
/* netlink_diag */
"net-pf-16-proto-4-type-16",/* PF_NETLINK, NETLINK_SOCK_DIAG, 
AF_NETLINK */
 
+   /* ip_set */
+   "nfnetlink-subsys-6",   /* NFNL_SUBSYS_IPSET */
+   "ip_set_bitmap:ip",
+   "ip_set_bitmap:ip,mac",
+   "ip_set_bitmap:port",
+   "ip_set_hash:ip",
+   "ip_set_hash:ip,port",
+   "ip_set_hash:ip,port,ip",
+   "ip_set_hash:net",
+   "ip_set_hash:net,port",
+   "ip_set_hash:ip,port,net",
+   "ip_set_hash:net,iface",
+   "ip_set_list:set",
+
"rtnl-link-dummy",
"rtnl-link-vxlan",
 


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 16/20] ve/kmod/whitelist: Allow nf_tables module autoloading on request from CT

2021-10-08 Thread Kirill Tkhai

From: Andrey Ryabinin 

Allow nf_tables.ko module autoloading from CT. Needed for iptables in CentOS 8.

https://jira.sw.ru/browse/PSBM-98211
Signed-off-by: Andrey Ryabinin 

(cherry picked from vz7 commit 18c67099330a ("ve/kmod, nf_tables: allow
nf_tables.ko autoloading on request from ve."))

Signed-off-by: Konstantin Khorenko 
---
 kernel/kmod.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 68aeed6587d6..f79970fa75e1 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -252,6 +252,7 @@ static const char * const ve0_allowed_mod[] = {
"nfnetlink-subsys-1",   /* NFNL_SUBSYS_CTNETLINK */
"nfnetlink-subsys-2",   /* NFNL_SUBSYS_CTNETLINK_EXP */
"nfnetlink-subsys-3",   /* NFNL_SUBSYS_QUEUE */
+   "nfnetlink-subsys-10",  /* nf_tables */
 
/* unix_diag */
"net-pf-16-proto-4-type-1", /* PF_NETLINK, NETLINK_SOCK_DIAG, 
AF_LOCAL */


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 17/20] ve/kmod/whitelist: Allow "nft_compat" module autoload from inside a Container

2021-10-08 Thread Kirill Tkhai

From: Konstantin Khorenko 

A Container with CentOS 8 inside uses nft by default and
iptables work in a legacy mode, for that "nft_compat" is
required, so allow its autoload.

 [CT]# iptables -A INPUT -m tos --tos Minimize-Cost -j REJECT
 iptables v1.8.2 (nf_tables): Couldn't load match
 `tos':No such file or directory

https://jira.sw.ru/browse/PSBM-98948

Signed-off-by: Konstantin Khorenko 
Acked-by: Andrey Ryabinin 

(cherry picked from vz7 commit f247ccddb3f9 ("ve/kmod: allow
"nft_compat" module autoload from inside a Container"))

Signed-off-by: Konstantin Khorenko 
---
 kernel/kmod.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index f79970fa75e1..3a445d4e2734 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -253,6 +253,7 @@ static const char * const ve0_allowed_mod[] = {
"nfnetlink-subsys-2",   /* NFNL_SUBSYS_CTNETLINK_EXP */
"nfnetlink-subsys-3",   /* NFNL_SUBSYS_QUEUE */
"nfnetlink-subsys-10",  /* nf_tables */
+   "nfnetlink-subsys-11",  /* nft_compat */
 
/* unix_diag */
"net-pf-16-proto-4-type-1", /* PF_NETLINK, NETLINK_SOCK_DIAG, 
AF_LOCAL */


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 13/20] ve/kmod/whitelist: Allow nfnetlink_queue module autoload from CT

2021-10-08 Thread Kirill Tkhai

From: Konstantin Khorenko 

+   "nfnetlink-subsys-3",   /* NFNL_SUBSYS_QUEUE */

https://jira.sw.ru/browse/PSBM-92694

Signed-off-by: Konstantin Khorenko 

(cherry picked from vz7 commit 588834a3e83f
 ("ve/netfilter/ipset: allow nfnetlink_queue module autoload"))

Signed-off-by: Konstantin Khorenko 
---
 kernel/kmod.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 36115c12b46c..b84bfdf216ff 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -251,6 +251,7 @@ static const char * const ve0_allowed_mod[] = {
"net-pf-16-proto-12",   /* PF_NETLINK, NETLINK_NETFILTER */
"nfnetlink-subsys-1",   /* NFNL_SUBSYS_CTNETLINK */
"nfnetlink-subsys-2",   /* NFNL_SUBSYS_CTNETLINK_EXP */
+   "nfnetlink-subsys-3",   /* NFNL_SUBSYS_QUEUE */
 
/* unix_diag */
"net-pf-16-proto-4-type-1", /* PF_NETLINK, NETLINK_SOCK_DIAG, 
AF_LOCAL */


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 12/20] ve/kmod/whitelist: Allow NFS modules autoload in Containers

2021-10-08 Thread Kirill Tkhai

From: Stanislav Kinsburskiy 

Otherwise Container online migration can fail.

https://jira.sw.ru/browse/PSBM-58178

Signed-off-by: Stanislav Kinsburskiy 
Reviewed-by: Dmitry Safonov 

(cherry picked from vz7 commit
 d6e47c05b868 ("ve/modules: allow NFS modules autoload in Containers"))

Signed-off-by: Konstantin Khorenko 
---
 kernel/kmod.c |4 
 1 file changed, 4 insertions(+)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6544d56a3f96..36115c12b46c 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -278,6 +278,10 @@ static const char * const ve0_allowed_mod[] = {
"rtnl-link-dummy",
"rtnl-link-vxlan",
 
+   /* NFS */
+   "nfsv3",
+   "nfsv4",
+
/* IPVS */
"ip_vs_ftp",
"ip_vs_nq",


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 14/20] ve/kmod/whitelist: Allow conntrack nft-helper-* modules autoloading

2021-10-08 Thread Kirill Tkhai

Otherwise, if the destination node does not have the
modules loaded, CT migration fails.

https://jira.sw.ru/browse/PSBM-90319

Signed-off-by: Kirill Tkhai 

(cherry picked from vz7 commit
 c92758e6ea45 ("net: Allow autoloading conntrack nft-helper-* modules"))

Signed-off-by: Konstantin Khorenko 
---
 kernel/kmod.c |4 
 1 file changed, 4 insertions(+)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index b84bfdf216ff..da0e72fe7de7 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -336,6 +336,10 @@ bool module_payload_allowed(const char *module)
!strncmp("nfct-helper-",module, 12))
return true;
 
+   /* nfct-helper-* modules */
+   if (!strncmp("nfct-helper-", module, 12))
+   return true;
+
return false;
 }
 #endif /* CONFIG_VE */


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 11/20] ve/kmod/whitelist: Allow ip6tables_raw modules autoload upon request from CT

2021-10-08 Thread Kirill Tkhai

From: Andrey Ryabinin 

Currently autoloading of the ip6table_raw module is forbidden
from container, leading to:
 ip6tables-restore v1.4.21: ip6tables-restore: unable to initialize table 
'raw'

If use of ip6tables is allowed in container, autoloading of the ip6tables_raw
has to be permitted as well.

https://jira.sw.ru/browse/PSBM-50548

Signed-off-by: Andrey Ryabinin 
Acked-by: Kirill Tkhai 

(cherry picked from vz7 commit 7bc4ff4c5928 ("ve/net/ip6tables: fix
autoloading of the ip6table_raw module from CT"))

Signed-off-by: Konstantin Khorenko 
---
 kernel/kmod.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6acc4d943283..6544d56a3f96 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -209,6 +209,7 @@ static const char * const ve0_allowed_mod[] = {
"iptable_nat",
"iptable_mangle",
"ip6table_filter",
+   "ip6table_raw",
"ip6table_nat",
"ip6table_mangle",
 


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 09/20] ve/kmod/whitelist: Allow IPVS modules autoload in CT

2021-10-08 Thread Kirill Tkhai

From: Pavel Tikhomirov 

we still need to add ip_vs module in /etc/modules-load.d/vz.conf
to be able to use ipvs in CT, all other modules are request_module'ed
from ip_vs.

https://jira.sw.ru/browse/PSBM-63883
Signed-off-by: Pavel Tikhomirov 
Reviewed-by: Andrew Vagin 

(cherry picked from vz7 commit
 8852410899b0 ("ve/net/ipvs: allow IPVS modules autoload in CT"))

Signed-off-by: Konstantin Khorenko 
---
 kernel/kmod.c |   16 
 1 file changed, 16 insertions(+)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index ddf44c79c851..be0908452d7b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -262,6 +262,22 @@ static const char * const ve0_allowed_mod[] = {
 
"rtnl-link-dummy",
"rtnl-link-vxlan",
+
+   /* IPVS */
+   "ip_vs_ftp",
+   "ip_vs_nq",
+   "ip_vs_wlc",
+   "ip6t_ipvs",
+   "ipt_ipvs",
+   "ip_vs_rr",
+   "ip_vs_pe_sip",
+   "ip_vs_lblc",
+   "ip_vs_wrr",
+   "ip_vs_sed",
+   "ip_vs_dh",
+   "ip_vs_sh",
+   "ip_vs_lblcr",
+   "ip_vs_lc",
 };
 
 /*


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 08/20] ve/kmod/whitelist: Enable vxlan module autoload from inside a Container

2021-10-08 Thread Kirill Tkhai

From: Pavel Tikhomirov 

vxlan is safe in CT as:

1) The UDP multicast socket used to connect to the outer world sits in the
creation net-namespace, and this socket can only get packets forwarded/routed
in the creation ns.

2) The vxlan device is owned by a second netns (could be the same as the
first) like any other network device, so, likewise, all packets coming to it
are from the same ns.

3) Vxlan's logic works through vxlan_net placed on the creation netns;
vxlan_fdb and vxlan_rdst are per vxlan device. Thus entries cannot
intersect with entries from the host and other CTs.

* One problem I can see now is adding fdb with ifindex(index of
device to route packets from UDP socket through) after vxlan is
moved to second namespace in vxlan_fdb_parse we use second
namespace to check ifindex by device lookup, but in
vxlan_xmit_one->ip_route_output_key->...->__ip_route_output_key
we use first(creation) namespace to lookup device and probably
will fail. So all fdb configuration should go before moving to
ns. Same is in mainstream AFAICS.

https://jira.sw.ru/browse/PSBM-53629

Signed-off-by: Pavel Tikhomirov 
Acked-by: Andrei Vagin 

khorenko@: Docker Swarm requires vxlans.

(cherry picked from vz7 commit
 d5805ee4d748 ("ve/net/vxlan: enable support and autoload in a container"))

Signed-off-by: Konstantin Khorenko 
---
 kernel/kmod.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9a24a65deecb..ddf44c79c851 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -261,6 +261,7 @@ static const char * const ve0_allowed_mod[] = {
"net-pf-16-proto-4-type-16",/* PF_NETLINK, NETLINK_SOCK_DIAG, 
AF_NETLINK */
 
"rtnl-link-dummy",
+   "rtnl-link-vxlan",
 };
 
 /*


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 07/20] ve/kmod/whitelist: Allow dummy module autoloading

2021-10-08 Thread Kirill Tkhai

From: Pavel Tikhomirov 

After allowing dummy devices in container in bug PSBM-43329 by commit
6061ed860950 ("ve/net/dummy: enable support in a container") docker-ui
testcase TestDaemonIP was unXFAILed but it still fails to create dummy
because the module is not automatically loaded in rtnl_newlink:

ip link add name dummy_test type dummy
RTNETLINK answers: Operation not supported

So allow the module.

https://jira.sw.ru/browse/PSBM-52061
Signed-off-by: Pavel Tikhomirov 
Reviewed-by: Kirill Tkhai 

(cherry picked from vz7 commit 3919de0d2585ac861ed237b9b585f2e3bb2e59bd)
Signed-off-by: Konstantin Khorenko 
---
 kernel/kmod.c |2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 36420d60cce2..9a24a65deecb 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -259,6 +259,8 @@ static const char * const ve0_allowed_mod[] = {
 
/* netlink_diag */
"net-pf-16-proto-4-type-16",/* PF_NETLINK, NETLINK_SOCK_DIAG, 
AF_NETLINK */
+
+   "rtnl-link-dummy",
 };
 
 /*


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 06/20] ve/kmod/whitelist: Add modules to whitelist for c/r sake

2021-10-08 Thread Kirill Tkhai

From: Cyrill Gorcunov 

When doing checkpoint/restore during migration we use netlink
sockets with diag functionality to fetch various information
we need. In particular when restoring on the machine where
say netfilter modules are not loaded we fail with

 | [root@s175 ~]# less /vz/dump/rst-iVS9OC-16.05.04-22.32/criu_restore.11.log
 | (00.151066)  1: Running ip addr restore
 | RTNETLINK answers: File exists
 | RTNETLINK answers: File exists
 | (00.152641)  1: Running ip route restore
 | (00.175144)  1: Running ip route restore
 | (00.184676)  1: Running ip rule delete
 | (00.186448)  1: Running ip rule delete
 | (00.188191)  1: Running ip rule delete
 | (00.190054)  1: Running ip rule restore
 | (00.191964)  1: Running iptables-restore for iptables-restore
 | (00.200958)  1: Running ip6tables-restore for ip6tables-restore
 | >(00.203833)  1: Error (net.c:466): Can't open rtnl sock for net dump: 
Protocol not supported
 | (00.229107) Error (cr-restore.c:1407): 15091 killed by signal 9: Killed
 | (00.229192) Switching to new ns to clean ghosts
 | (00.241142) uns: calling exit_usernsd (-1, 1)
 | (00.241173) uns: daemon calls 0x454950 (15085, -1, 1)
 | (00.241188) uns: `- daemon exits w/ 0
 | (00.241570) uns: daemon stopped
 | (00.241584) Error (cr-restore.c:2248): Restoring FAILED

which stands for the following criu code

 |  sk = socket(AF_NETLINK, SOCK_RAW, NETLINK_NETFILTER);
 |  if (sk < 0) {
 |  pr_perror("Can't open rtnl sock for net dump");
 |  goto out_img;
 |  }

because the nfnetlink module is not loaded on the destination machine
we're failing. If we would have been running on node the module would
be uploaded automatically but restore happens in veX context where modules
can't be uploaded.

Thus add modules needed for c/r into whitelist, so the criu will
upload them automatically.

https://jira.sw.ru/browse/PSBM-46789

CC: Vladimir Davydov 
CC: Konstantin Khorenko 
CC: Andrey Vagin 
CC: Pavel Emelyanov 
Signed-off-by: Cyrill Gorcunov 

+++
ve/kmod: Change modules whitelist to fit their aliases

When we do call for specified sockets such as netlink netfilter,
dialog sockets and such we imply that the kernel will autoload
them. But previously (e0914131eeb08e6b1953c682be05b9fbcf185f1f
"ve/kmod: Add modules to whitelist for c/r sake")
I put module names instead of their aliases used in net subsystem
to determine which module to load on socket/protocol types.

Fix it putting proper names here.

Thanks to Vladimir for pointing out the problem.

https://jira.sw.ru/browse/PSBM-46789

CC: Konstantin Khorenko 
CC: Andrey Vagin 
CC: Pavel Emelyanov 
Signed-off-by: Cyrill Gorcunov 
Reviewed-by: Vladimir Davydov 

+++
ve/kmod: Allow netfilter conntrack inside VE

Netfilter conntrack module is used during checkpoint (which
is done on node) so the modules get autoloaded but in case
of migration the restore starts inside veX so we need to allow
the conntrack to be requested from ve context. Thus add them
into whitelist.

Initially missed them in ebc70d73717f592c89ad992f77587d9e118bbee6.

https://jira.sw.ru/browse/PSBM-47359

CC: Vladimir Davydov 
CC: Konstantin Khorenko 
CC: Andrey Vagin 
CC: Pavel Emelyanov 
Signed-off-by: Cyrill Gorcunov 

https://jira.sw.ru/browse/PSBM-127787

(cherry picked from vz7 commit 7d9c655b08b4397fc04430540fdbc763e56beacb)
Signed-off-by: Konstantin Khorenko 
---
 kernel/kmod.c |   25 +
 1 file changed, 25 insertions(+)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index b8ca90bec921..36420d60cce2 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -234,6 +234,31 @@ static const char * const ve0_allowed_mod[] = {
 
"fs-binfmt_misc",
"fs-overlay",
+
+   /* inet_diag, inet6_diag  */
+   "net-pf-16-proto-4-type-2", /* PF_NETLINK, NETLINK_SOCK_DIAG, 
AF_INET */
+   "net-pf-16-proto-4-type-10",/* PF_NETLINK, NETLINK_SOCK_DIAG, 
AF_INET6 */
+
+   /* tcp_diag */
+   "net-pf-16-proto-4-type-2-6",   /* PF_NETLINK, NETLINK_SOCK_DIAG, 
AF_INET - IPPROTO_TCP */
+
+   /* udp_diag */
+   "net-pf-16-proto-4-type-2-17",  /* PF_NETLINK, NETLINK_SOCK_DIAG, 
AF_INET - IPPROTO_UDP */
+   "net-pf-16-proto-4-type-2-136", /* PF_NETLINK, NETLINK_SOCK_DIAG, 
AF_INET - IPPROTO_UDPLITE */
+
+   /* nfnetlink  */
+   "net-pf-16-proto-12",   /* PF_NETLINK, NETLINK_NETFILTER */
+   "nfnetlink-subsys-1",   /* NFNL_SUBSYS_CTNETLINK */
+   "nfnetlink-subsys-2",   /* NFNL_SUBSYS_CTNETLINK_EXP */
+
+   /* unix_diag */
+   "net-pf-16-proto-4-type-1", /* PF_NETLINK, NETLINK_SOCK_DIAG, 
AF_LOCAL */
+
+   /* af_packet_diag */
+   "net-pf-16-proto-4-type-17",/* PF_NETLINK, NETLINK_SOCK_DIAG, 
AF_PACKET */
+
+   /* netlink_diag */
+   "net-pf-16-proto-4-type-16",/* PF_NETLINK, NETLINK_SOCK_DIAG, 
AF_NETLINK */
 };
 
 /*


___
Devel

[Devel] [PATCH RH9 02/20] ve/kmod/whitelist: Infrustructure for list of modules to autoload from CT

2021-10-08 Thread Kirill Tkhai

From: Konstantin Khorenko 

https://jira.sw.ru/browse/PSBM-127787

It's a port of following vz7 commits:
 * 3a4142e  ("ve/kmod: Port autoloading from CT") (partially)
 * 8af13e7c ("ve/kmod: list of allowed to autoload in CT modules")
(partially)
Signed-off-by: Konstantin Khorenko 
Signed-off-by: Kirill Tkhai 
---
 include/linux/kmod.h |5 +
 kernel/kmod.c|   33 +
 2 files changed, 38 insertions(+)

diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index 68f69362d427..d9b8dd81f595 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -32,4 +32,9 @@ static inline int request_module_nowait(const char *name, 
...) { return -ENOSYS;
 #define try_then_request_module(x, mod...) (x)
 #endif
 
+#ifdef CONFIG_VE
+extern bool module_payload_allowed(const char *module);
+#else
+static inline bool module_payload_allowed(const char *module) { return true; }
+#endif
 #endif /* __LINUX_KMOD_H__ */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2f9afc601d20..c8506fd92017 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -151,6 +151,10 @@ int __request_module(bool wait, const char *fmt, ...)
!ve_allow_module_load)
return -EPERM;
 
+   /* Check that module functionality is permitted */
+   if (!module_payload_allowed(module_name))
+   return -EPERM;
+
ret = security_kernel_module_request(module_name);
if (ret)
return ret;
@@ -182,3 +186,32 @@ int __request_module(bool wait, const char *fmt, ...)
return ret;
 }
 EXPORT_SYMBOL(__request_module);
+
+#ifdef CONFIG_VE
+
+/* ve0 allowed modules */
+static const char * const ve0_allowed_mod[] = {
+};
+
+/*
+ * module_payload_allowed - check if module functionality is allowed
+ * to be used inside current virtual environment.
+ *
+ * Returns true if it is allowed or we're in ve0, false otherwise.
+ */
+bool module_payload_allowed(const char *module)
+{
+   int i;
+
+   if (ve_is_super(get_exec_env()))
+   return true;
+
+   /* Look for full module name in ve0_allowed_mod table */
+   for (i = 0; i < ARRAY_SIZE(ve0_allowed_mod); i++) {
+   if (!strcmp(ve0_allowed_mod[i], module))
+   return true;
+   }
+
+   return false;
+}
+#endif /* CONFIG_VE */


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 05/20] commit da8c1e2262f8

2021-10-08 Thread Kirill Tkhai

From: Konstantin Khorenko 

ve/kmod/whitelist: List of allowed to autoload in CT modules 
(non-netfilters)

Following non-netfilter modules are allowed to be autoloaded
from inside a CT:
 * binfmt_misc
 * fs-overlay

It's port of vz7 commits:
* 8af13e7c ("ve/kmod: list of allowed to autoload in CT modules")
   (partially)
* 264ef13  ("ve/kmod/whitelist: allow overlay fs module autoloading")

https://jira.sw.ru/browse/PSBM-127787

Signed-off-by: Konstantin Khorenko 

to merge
---
 kernel/kmod.c |3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 7472184200f2..b8ca90bec921 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -231,6 +231,9 @@ static const char * const ve0_allowed_mod[] = {
"nft-set",
"nf_tproxy_ipv4",
"nf_tproxy_ipv6",
+
+   "fs-binfmt_misc",
+   "fs-overlay",
 };
 
 /*


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 04/20] commit 04248b3ff00d

2021-10-08 Thread Kirill Tkhai

From: Konstantin Khorenko 

ve/kmod/whitelist: Allow iptables/netfilter modules for autoload from CT

For now following modules are allowed by default to be autoloaded
upon indirect request from inside a Container:

* iptables/ip6tables core modules
* netfilters core modules (including nf_tables_inet)
  https://jira.sw.ru/browse/PSBM-99406

* xt_*, ipt_*, ip6t_*, arpt_*,
  nft-chain-*, nft-expr-*, nf-logger-* modules

* ebt* modules: previously we allowed to autoload ebt_* modules only
  upon request from inside a Container but there are several ebtables_*
  modules to be allowed as well, thus allow all ebt* modules for that.
  (Default CentOS7.3 firewalld service inside a CT complains on that)
  https://jira.sw.ru/browse/PSBM-66435

* all nf_* and nft_* modules
  https://jira.sw.ru/browse/PSBM-99536

https://jira.sw.ru/browse/PSBM-127787

Signed-off-by: Konstantin Khorenko 

It's a port of following vz7 commits:
 * 3a4142e  ("ve/kmod: Port autoloading from CT") (partially)
 * f9422b8  ("ve/kmod: Add rules for autoloading (new) nf_tables")
 * ccd1a1d  ("ve/kmod: Add rules for new {ip, ip6, x}table modules")
 * fe6a9073 ("ve/kmod: allow to autoload nf_log_ipv[46]")
 * b221ce6  ("ve/kmod/ebtable: allow to autoload ebtable_* modules
 from inside a CT")
 * 24f61ddc955f ("ve/kmod: enable autoload for nf_tables_inet module
 from inside a CT")
 * 0995da4719da ("ve/kmod: make all nf_* and nft_* autoloadable upon
 request from a CT"))

Signed-off-by: Konstantin Khorenko 
---
 kernel/kmod.c |   46 ++
 1 file changed, 46 insertions(+)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 7915397fcf46..7472184200f2 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -202,6 +202,35 @@ EXPORT_SYMBOL(__request_module);
 
 /* ve0 allowed modules */
 static const char * const ve0_allowed_mod[] = {
+   "ip_tables",
+   "ip6_tables",
+   "iptable_filter",
+   "iptable_raw",
+   "iptable_nat",
+   "iptable_mangle",
+   "ip6table_filter",
+   "ip6table_nat",
+   "ip6table_mangle",
+
+   "nf-nat",
+   "nf_conncount",
+   "nf_defrag_ipv4",
+   "nf_defrag_ipv6",
+   "nf_dup_ipv4",
+   "nf_dup_ipv6",
+   "nf_dup_netdev",
+   "nf_flow_table",
+   "nf-flowtable-1",
+   "nf_flow_table_inet",
+   "nf_osf",
+   "nf_reject_ipv6",
+   "nf_socket_ipv4",
+   "nf_socket_ipv6",
+   "nf_synproxy_core",
+
+   "nft-set",
+   "nf_tproxy_ipv4",
+   "nf_tproxy_ipv6",
 };
 
 /*
@@ -223,6 +252,23 @@ bool module_payload_allowed(const char *module)
return true;
}
 
+   /* modules allowed by name/alias masks */
+   if (!strncmp("xt_", module,  3) ||
+   !strncmp("ip_conntrack",module, 12) ||
+   !strncmp("ip_nat_", module,  7) ||
+   !strncmp("ipt_",module,  4) ||
+   !strncmp("ip6t_",   module,  5) ||
+   !strncmp("arpt_",   module,  5) ||
+   !strncmp("ebt", module,  4) ||
+   !strncmp("nft-chain-",  module, 10) ||
+   !strncmp("nft-expr-",   module,  9) ||
+   !strncmp("nf_nat",  module,  6) ||
+   !strncmp("nf_log_", module,  7) ||
+   !strncmp("nf-logger-",  module, 10) ||
+   !strncmp("nf_conntrack",module, 12) ||
+   !strncmp("nfct-helper-",module, 12))
+   return true;
+
return false;
 }
 #endif /* CONFIG_VE */


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 01/20] ve/sysctl/kmod: Introduce tweak to allow indirect modules load from CT

2021-10-08 Thread Kirill Tkhai

From: Konstantin Khorenko 

Introduce "kernel.ve_allow_module_load" sysctl
to allow (1) / deny (0) indirect kernel module loads upon requests
from inside Containers.

Indirect modules "autoload" set enabled by default.

https://jira.sw.ru/browse/PSBM-127787

Signed-off-by: Konstantin Khorenko 
---
 include/linux/sysctl.h |2 ++
 kernel/kmod.c  |   11 +++
 kernel/sysctl.c|   16 
 3 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 3c59f962f3f6..83ac52e15c73 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -187,6 +187,8 @@ struct ctl_path {
 
 extern int trusted_exec;
 
+extern int ve_allow_module_load;
+
 #ifdef CONFIG_SYSCTL
 
 void proc_sys_poll_notify(struct ctl_table_poll *poll);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a5959c0ecdc2..2f9afc601d20 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -25,6 +25,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 
@@ -127,10 +129,6 @@ int __request_module(bool wait, const char *fmt, ...)
char module_name[MODULE_NAME_LEN];
int ret;
 
-   /* Don't allow request_module() inside VE. */
-   if (!ve_is_super(get_exec_env()))
-   return -EPERM;
-
/*
 * We don't allow synchronous module loading from async.  Module
 * init may invoke async_synchronize_full() which will end up
@@ -148,6 +146,11 @@ int __request_module(bool wait, const char *fmt, ...)
if (ret >= MODULE_NAME_LEN)
return -ENAMETOOLONG;
 
+   /* Check that autoload is not prohibited using /proc interface */
+   if (!ve_is_super(get_exec_env()) &&
+   !ve_allow_module_load)
+   return -EPERM;
+
ret = security_kernel_module_request(module_name);
if (ret)
return ret;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 49656fd84639..53090d656dec 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -117,12 +117,17 @@ static int __init set_trusted_exec(char *str)
 }
 __setup("trusted_exec", set_trusted_exec);
 
+int ve_allow_module_load = 1;
+EXPORT_SYMBOL(ve_allow_module_load);
+
 /* Constants used for minimum and  maximum */
 #ifdef CONFIG_LOCKUP_DETECTOR
 static int sixty = 60;
 #endif
 
 static int __maybe_unused neg_one = -1;
+static int __maybe_unused zero = 0;
+static int __maybe_unused one = 1;
 static int __maybe_unused two = 2;
 static int __maybe_unused four = 4;
 static unsigned long zero_ul;
@@ -2362,6 +2367,17 @@ static struct ctl_table kern_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = &two,
},
+#endif
+#ifdef CONFIG_VE
+{
+   .procname   = "ve_allow_module_load",
+   .data   = &ve_allow_module_load,
+   .maxlen = sizeof(int),
+   .mode   = 0644,
+   .proc_handler   = &proc_dointvec_minmax,
+   .extra1 = &zero,
+   .extra2 = &one,
+   },
 #endif
{
.procname   = "ngroups_max",


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 03/20] ve/kmod: Honor modprobe blacklist on indirect modules autoload from CT

2021-10-08 Thread Kirill Tkhai

From: Konstantin Khorenko 

If a kernel module is requested indirectly from inside a Container,
first check whether this module is blacklisted on the Node.

https://jira.sw.ru/browse/PSBM-127787

Signed-off-by: Konstantin Khorenko 
Signed-off-by: Kirill Tkhai 
---
 kernel/kmod.c |   25 ++---
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index c8506fd92017..7915397fcf46 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -64,11 +64,11 @@ char modprobe_path[KMOD_PATH_LEN] = CONFIG_MODPROBE_PATH;
 
 static void free_modprobe_argv(struct subprocess_info *info)
 {
-   kfree(info->argv[3]); /* check call_modprobe() */
+   kfree(info->argv[4]); /* check call_modprobe() */
kfree(info->argv);
 }
 
-static int call_modprobe(char *module_name, int wait)
+static int call_modprobe(char *module_name, int wait, int blacklist)
 {
struct subprocess_info *info;
static char *envp[] = {
@@ -78,7 +78,7 @@ static int call_modprobe(char *module_name, int wait)
NULL
};
 
-   char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
+   char **argv = kmalloc(sizeof(char *[6]), GFP_KERNEL);
if (!argv)
goto out;
 
@@ -88,9 +88,13 @@ static int call_modprobe(char *module_name, int wait)
 
argv[0] = modprobe_path;
argv[1] = "-q";
-   argv[2] = "--";
-   argv[3] = module_name;  /* check free_modprobe_argv() */
-   argv[4] = NULL;
+   if (blacklist)
+   argv[2] = "-b";
+   else
+   argv[2] = "-q"; /* just repeat argv[1] */
+   argv[3] = "--";
+   argv[4] = module_name;  /* check free_modprobe_argv() */
+   argv[5] = NULL;
 
info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL,
 NULL, free_modprobe_argv, NULL);
@@ -127,6 +131,7 @@ int __request_module(bool wait, const char *fmt, ...)
 {
va_list args;
char module_name[MODULE_NAME_LEN];
+   bool blacklist;
int ret;
 
/*
@@ -154,6 +159,12 @@ int __request_module(bool wait, const char *fmt, ...)
/* Check that module functionality is permitted */
if (!module_payload_allowed(module_name))
return -EPERM;
+   /*
+* This function may be called from ve0, where standard behaviour
+* is not to use blacklist. So, we request blacklist reading only
+* if we're inside CT.
+*/
+   blacklist = !ve_is_super(get_exec_env());
 
ret = security_kernel_module_request(module_name);
if (ret)
@@ -178,7 +189,7 @@ int __request_module(bool wait, const char *fmt, ...)
 
trace_module_request(module_name, wait, _RET_IP_);
 
-   ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
+   ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC, 
blacklist);
 
atomic_inc(&kmod_concurrent_max);
wake_up(&kmod_wq);


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 00/20] part 20 modules autoload

2021-10-08 Thread Kirill Tkhai

---

Andrey Ryabinin (2):
  ve/kmod/whitelist: Allow ip6tables_raw modules autoload upon request from 
CT
  ve/kmod/whitelist: Allow nf_tables module autoloading on request from CT

Cyrill Gorcunov (1):
  ve/kmod/whitelist: Add modules to whitelist for c/r sake

Kirill Tkhai (2):
  ve/kmod/whitelist: Allow conntrack nft-helper-* modules autoloading
  ve/kmod/whitelist: Allow ts_kmp module autoloading

Konstantin Khorenko (7):
  ve/sysctl/kmod: Introduce tweak to allow indirect modules load from CT
  ve/kmod/whitelist: Infrustructure for list of modules to autoload from CT
  ve/kmod: Honor modprobe blacklist on indirect modules autoload from CT
  commit 04248b3ff00d
  commit da8c1e2262f8
  ve/kmod/whitelist: Allow nfnetlink_queue module autoload from CT
  ve/kmod/whitelist: Allow "nft_compat" module autoload from inside a 
Container

Pavel Tikhomirov (6):
  ve/kmod/whitelist: Allow dummy module autoloading
  ve/kmod/whitelist: Enable vxlan module autoload from inside a Container
  ve/kmod/whitelist: Allow IPVS modules autoload in CT
  ve/kmod/whitelist: Allow netfilter/ipset modules autoload from inside a CT
  ve/kmod/whitelist: make nfnetlink_log autoloadable upon request from a CT
  ve/kmod/whitelist: Make fib modules autoloadable from CT

Stanislav Kinsburskiy (1):
  ve/kmod/whitelist: Allow NFS modules autoload in Containers

Vasily Averin (1):
  ve/kmod/whitelist: Enable autoload for iptables security tables from 
inside CT


 include/linux/kmod.h   |5 +
 include/linux/sysctl.h |2 
 kernel/kmod.c  |  195 +---
 kernel/sysctl.c|   16 
 4 files changed, 207 insertions(+), 11 deletions(-)

--
Signed-off-by: Kirill Tkhai 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH RH9 00/26] part 10: connector

2021-10-07 Thread Kirill Tkhai

Committed

On 07.10.2021 15:53, Pavel Tikhomirov wrote:
> https://jira.sw.ru/browse/PSBM-133993
> 
> Stanislav Kinsburskiy (26):
>   connector: store all private data on VE structure
>   connector: introduce VE-aware get_cdev() helper
>   connector: per-ve init and fini helpers introduced
>   connector: use device stored in VE
>   connector: per-ve helpers intoruduced
>   connector: take cn_already_initialized from VE
>   proc connector: generic proc_event_connector() helper introduced
>   proc connector: use generic event helper for fork event
>   proc connector: use generic event helper for exec event
>   proc connector: use generic event helper for id event
>   proc connector: use generic event helper for sid event
>   proc connector: use generic event helper for ptrace event
>   proc connector: use generic event helper for comm event
>   proc connector: use generic event helper for coredump event
>   proc connector: use generic event helper for exit event
>   proc connector: add pid namespace awareness
>   proc connector: add per-ve init and fini foutines
>   proc connector: call proc-related init and fini routines explicitly
>   proc connector: take number of listeners and per-cpu conters from VE
>   proc connector: pass VE to event fillers
>   proc connector: take namespaces from VE
>   proc connector: use per-ve netlink sender helper
>   proc connector: send events to both VEs if not in VE#0
>   connector: containerize "connector" proc entry
>   connector: take VE from socket upon callback
>   connector: add VE SS hook
> 
>  drivers/connector/cn_proc.c   | 399 ++
>  drivers/connector/connector.c | 161 +++---
>  include/linux/connector.h |  21 ++
>  include/linux/ve.h|   4 +
>  4 files changed, 373 insertions(+), 212 deletions(-)
> 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH RH9 00/22] port part 21

2021-10-07 Thread Kirill Tkhai

Committed

On 07.10.2021 13:57, Andrey Zhadchenko wrote:
> Alexander Mikhalitsyn (1):
>   ms/fs: Revert "Make super_blocks and sb_lock static"
> 
> Andrey Ryabinin (1):
>   drivers/bnx2x: Limit setting of the max mtu
> 
> Cyrill Gorcunov (3):
>   ve/fs: Export fs.aio-max-nr via ve cgroup
>   ve/fs: namespace -- Ignore device permissions during restore
>   ve/fs: namespace -- Don't fail on permissions if @ve->devmnt_list is
> empty
> 
> Kirill Tkhai (4):
>   fs: Lower ioprio in case of ioprio_set() called from CT
>   ve/fs/files: Add new argument to expand_files()
>   ve/fs/files: Add fdtable_align() helper
>   ve/fs/files: Shrink big fdtable on close in is_pseudosuper mode
> 
> Konstantin Khlebnikov (1):
>   pidns: add proc mount option 'hidepidns=0|1'
> 
> Konstantin Khorenko (4):
>   ve/fs/sync: Per containter sync and syncfs and fs.fsync-enable sysctl
>   ve/fs: Allow mount fs in init userns if it's mounted in another userns
>   drivers/igb: increase link detection timeout up to 5 sec
>   net/teql: disable "True" (or "trivial") link equalizer inside a CT
> 
> Maxim Patlasov (1):
>   fs: FIEMAP should sync only required range with FIEMAP_FLAG_SYNC
> 
> Pavel Tikhomirov (1):
>   ve/cgroup: Hide ve cgroup in Containers
> 
> Valeriy Vdovin (2):
>   x86/cpuid_fault: Increase max count of cpuid overrides
>   x86/cpuid_fault: Log table updates
> 
> Vasily Averin (2):
>   ve/net/ipv6 tunnels: Enable GRE netdevice register inside container
>   ve/net/sit: Enable SIT devices in Containers
> 
> Vladimir Davydov (2):
>   fs: Allow to remove swapfile hardlinks (for ploop images protection)
>   ve/fs/ioprio: Confine ioprio_{set, get}(IOPRIO_WHO_USER) to current ve
> 
>  Documentation/filesystems/proc.rst  |   4 +
>  arch/x86/kernel/cpuid_fault.c   |  36 +++-
>  block/ioprio.c  |  24 +++
>  drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c |   8 +
>  drivers/net/ethernet/intel/igb/igb_main.c   |   5 +-
>  fs/fcntl.c  |   2 +
>  fs/file.c   |  92 +++---
>  fs/ioctl.c  |   3 +-
>  fs/mount.h  |   2 +
>  fs/namei.c  |   3 +-
>  fs/namespace.c  |  33 +++-
>  fs/open.c   |   3 +
>  fs/proc/base.c  |  11 +-
>  fs/proc/inode.c |   2 +
>  fs/proc/root.c  |  12 ++
>  fs/super.c  |  11 +-
>  fs/sync.c   | 213 
> +++-
>  include/linux/cgroup.h  |   4 +
>  include/linux/cpuid_override.h  |   2 +-
>  include/linux/fs.h  |  15 ++
>  include/linux/proc_fs.h |   1 +
>  include/linux/ve.h  |   2 +
>  kernel/cgroup/cgroup-v1.c   |   8 +-
>  kernel/cgroup/cgroup.c  |  20 +++
>  kernel/ve/ve.c  |  40 +
>  kernel/ve/veowner.c |   8 +
>  mm/msync.c  |   2 +
>  net/ipv6/ip6_gre.c  |   1 +
>  net/ipv6/ip6_tunnel.c   |   2 +-
>  net/ipv6/sit.c  |   1 +
>  net/sched/sch_teql.c|   3 +
>  31 files changed, 523 insertions(+), 50 deletions(-)
> 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH RH9 00/12] part19 ext4

2021-10-07 Thread Kirill Tkhai

Committed

On 07.10.2021 13:19, Kirill Tkhai wrote:
> ---
> 
> Dmitry Monakhov (6):
>   ext4: Fix error handling after filesystem abort
>   jbd2: make shure that we do not miss aborted state
>   jbd2: raid amnesia protection for the journal
>   ext4: add mfsync support
>   ext4: add generic uevent infrastructure
>   ext4: send abort uevent on ext4 journal abort
> 
> Kirill Tkhai (2):
>   ve/ext3: treat panic_on_errors as remount-ro_on_errors in CTs
>   ext4: make data=writeback mode safe
> 
> Konstantin Khorenko (4):
>   ve/fs/namespace: allow submounts in non-init userns
>   Kconfig.openvz: force CGROUP_PERF if compiling VZ Containers code
>   ext4: don't iterate over sbi->s_es_list more than the number of elements
>   ms/Revert "ext4: simplify kobject usage"
> 
> 
>  fs/ext4/ext4.h  |   20 ++
>  fs/ext4/extents_status.c|8 ++
>  fs/ext4/fsync.c |  108 ++
>  fs/ext4/inode.c |   11 ++-
>  fs/ext4/ioctl.c |   60 +
>  fs/ext4/super.c |  155 
> ++-
>  fs/ext4/sysfs.c |   43 +---
>  fs/jbd2/journal.c   |3 -
>  fs/jbd2/recovery.c  |   77 +
>  fs/namespace.c  |   25 +++
>  include/trace/events/ext4.h |   54 +++
>  kernel/Kconfig.openvz   |1 
>  12 files changed, 544 insertions(+), 21 deletions(-)
> 
> --
> Signed-off-by: Kirill Tkhai 
> 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 11/12] ext4: add generic uevent infrastructure

2021-10-07 Thread Kirill Tkhai

From: Dmitry Monakhov 

*Purpose:
It is reasonable to announce fs related events via uevent infrastructure.
This patch implements only the ext4 part, but IMHO this should be useful for
any generic filesystem.

Example: a runtime fs error is a purely asynchronous event. Currently there is
no good way to handle this situation and inform user space about it.

*Implementation:
 Add uevent infrastructure similar to dm uevent
 FS_ACTION = {MOUNT|UMOUNT|REMOUNT|ERROR|FREEZE|UNFREEZE}
 FS_UUID
 FS_NAME
 FS_TYPE

Signed-off-by: Dmitry Monakhov 

[aryabinin: add error event, rh8 rebase]
Signed-off-by: Andrey Ryabinin 
Signed-off-by: Kirill Tkhai 
---
 fs/ext4/ext4.h  |   11 +
 fs/ext4/super.c |  129 +++
 2 files changed, 139 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5f6fdd5514b2..70b3038fa0d1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1625,6 +1625,8 @@ struct ext4_sb_info {
/* Precomputed FS UUID checksum for seeding other checksums */
__u32 s_csum_seed;
 
+   bool s_err_event_sent;
+
/* Reclaim extents from extent status tree */
struct shrinker s_es_shrinker;
struct list_head s_es_list; /* List of inodes with reclaimable 
extents */
@@ -3655,6 +3657,15 @@ extern int ext4_check_blockref(const char *, unsigned 
int,
 struct ext4_ext_path;
 struct ext4_extent;
 
+enum ext4_event_type {
+   EXT4_UA_MOUNT,
+   EXT4_UA_UMOUNT,
+   EXT4_UA_REMOUNT,
+   EXT4_UA_ERROR,
+   EXT4_UA_FREEZE,
+   EXT4_UA_UNFREEZE,
+};
+
 /*
  * Maximum number of logical blocks in a file; ext4_extent's ee_block is
  * __le32.
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6cf2d3e0ed8f..597768497c42 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -424,6 +424,118 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
 #define ext4_get_tstamp(es, tstamp) \
__ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
 
+static int ext4_uuid_valid(const u8 *uuid)
+{
+   int i;
+
+   for (i = 0; i < 16; i++) {
+   if (uuid[i])
+   return 1;
+   }
+   return 0;
+}
+
+struct ext4_uevent {
+   struct super_block *sb;
+   enum ext4_event_type action;
+   struct work_struct work;
+};
+
+/**
+ * ext4_send_uevent - prepare and send uevent
+ *
+ * @sb:super_block
+ * @action:action type
+ *
+ */
+static void ext4_send_uevent_work(struct work_struct *w)
+{
+   struct ext4_uevent *e = container_of(w, struct ext4_uevent, work);
+   struct super_block *sb = e->sb;
+   struct kobj_uevent_env *env;
+   const u8 *uuid = EXT4_SB(sb)->s_es->s_uuid;
+   enum kobject_action kaction = KOBJ_CHANGE;
+   int ret;
+
+   env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
+   if (!env){
+   kfree(e);
+   return;
+   }
+   ret = add_uevent_var(env, "FS_TYPE=%s", sb->s_type->name);
+   if (ret)
+   goto out;
+   ret = add_uevent_var(env, "FS_NAME=%s", sb->s_id);
+   if (ret)
+   goto out;
+
+   if (ext4_uuid_valid(uuid)) {
+   ret = add_uevent_var(env, "UUID=%pUB", uuid);
+   if (ret)
+   goto out;
+   }
+
+   switch (e->action) {
+   case EXT4_UA_MOUNT:
+   kaction = KOBJ_ONLINE;
+   ret = add_uevent_var(env, "FS_ACTION=%s", "MOUNT");
+   break;
+   case EXT4_UA_UMOUNT:
+   kaction = KOBJ_OFFLINE;
+   ret = add_uevent_var(env, "FS_ACTION=%s", "UMOUNT");
+   break;
+   case EXT4_UA_REMOUNT:
+   ret = add_uevent_var(env, "FS_ACTION=%s", "REMOUNT");
+   break;
+   case EXT4_UA_ERROR:
+   ret = add_uevent_var(env, "FS_ACTION=%s", "ERROR");
+   break;
+   case EXT4_UA_FREEZE:
+   ret = add_uevent_var(env, "FS_ACTION=%s", "FREEZE");
+   break;
+   case EXT4_UA_UNFREEZE:
+   ret = add_uevent_var(env, "FS_ACTION=%s", "UNFREEZE");
+   break;
+   default:
+   ret = -EINVAL;
+   }
+   if (ret)
+   goto out;
+   ret = kobject_uevent_env(&(EXT4_SB(sb)->s_kobj), kaction, env->envp);
+out:
+   kfree(env);
+   kfree(e);
+}
+
+/**
+ * ext4_send_uevent - prepare and schedule event submission
+ *
+ * @sb:super_block
+ * @action:action type
+ *
+ */
+void ext4_send_uevent(struct super_block *sb, enum ext4_event_type action)
+{
+   struct ext4_uevent *e;
+
+   /*
+* May happen if called from ext4_put_super() -> __ext4_abort()
+* -> ext4_send_uevent()
+*/
+   if

[Devel] [PATCH RH9 12/12] ext4: send abort uevent on ext4 journal abort

2021-10-07 Thread Kirill Tkhai

From: Dmitry Monakhov 

Currently an error from the device results in ext4_abort(), but no uevent is
generated because the context of ext4_abort()'s caller does not allow
GFP_KERNEL memory allocation.
Let's relax the submission context requirement and defer the actual uevent
submission to a workqueue. It could be any workqueue; I've picked
rsv_conversion_wq because it already exists.

khorenko@: "system_wq" does not fit here because at the moment of
work execution sb can be already destroyed.
"EXT4_SB(sb)->rsv_conversion_wq" is flushed before sb is destroyed.

Signed-off-by: Dmitry Monakhov 

[aryabinin rh8 rebase]
Signed-off-by: Andrey Ryabinin 
---
 fs/ext4/ext4.h  |2 ++
 fs/ext4/super.c |6 ++
 2 files changed, 8 insertions(+)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 70b3038fa0d1..5ea1ca7c57c3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1626,6 +1626,7 @@ struct ext4_sb_info {
__u32 s_csum_seed;
 
bool s_err_event_sent;
+   bool s_abrt_event_sent;
 
/* Reclaim extents from extent status tree */
struct shrinker s_es_shrinker;
@@ -3662,6 +3663,7 @@ enum ext4_event_type {
EXT4_UA_UMOUNT,
EXT4_UA_REMOUNT,
EXT4_UA_ERROR,
+   EXT4_UA_ABORT,
EXT4_UA_FREEZE,
EXT4_UA_UNFREEZE,
 };
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 597768497c42..9119dc05850f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -490,6 +490,9 @@ static void ext4_send_uevent_work(struct work_struct *w)
case EXT4_UA_ERROR:
ret = add_uevent_var(env, "FS_ACTION=%s", "ERROR");
break;
+   case EXT4_UA_ABORT:
+   ret = add_uevent_var(env, "FS_ACTION=%s", "ABORT");
+   break;
case EXT4_UA_FREEZE:
ret = add_uevent_var(env, "FS_ACTION=%s", "FREEZE");
break;
@@ -764,6 +767,9 @@ static void ext4_handle_error(struct super_block *sb, bool 
force_ro, int error,
WARN_ON_ONCE(1);
 
if (!continue_fs && !sb_rdonly(sb)) {
+   if (!xchg(&EXT4_SB(sb)->s_abrt_event_sent, 1))
+   ext4_send_uevent(sb, EXT4_UA_ABORT);
+
ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED);
if (journal)
jbd2_journal_abort(journal, -EIO);


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 10/12] ms/Revert "ext4: simplify kobject usage"

2021-10-07 Thread Kirill Tkhai

From: Konstantin Khorenko 

This reverts ms commit bc1420ae56266fa2c5a8e452d55f744ca98fe42f.

* we want ext4 to send udev events
* kobject_uevent_env() requires kobject->kset to be defined
  => let ext4 define kobject->kset

https://jira.sw.ru/browse/PSBM-127422

Signed-off-by: Konstantin Khorenko 
Signed-off-by: Kirill Tkhai 
---
 fs/ext4/sysfs.c |   43 ---
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 2314f7446592..7af925442a61 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -489,6 +489,13 @@ static void ext4_sb_release(struct kobject *kobj)
complete(&sbi->s_kobj_unregister);
 }
 
+static void ext4_kset_release(struct kobject *kobj)
+{
+   struct kset *kset = container_of(kobj, struct kset, kobj);
+
+   kfree(kset);
+}
+
 static const struct sysfs_ops ext4_attr_ops = {
.show   = ext4_attr_show,
.store  = ext4_attr_store,
@@ -511,7 +518,12 @@ void ext4_notify_error_sysfs(struct ext4_sb_info *sbi)
sysfs_notify(&sbi->s_kobj, NULL, "errors_count");
 }
 
-static struct kobject *ext4_root;
+static struct kobj_type ext4_ktype = {
+   .sysfs_ops  = &ext4_attr_ops,
+   .release= ext4_kset_release,
+};
+
+static struct kset *ext4_kset;
 
 static struct kobject *ext4_feat;
 
@@ -520,8 +532,9 @@ int ext4_register_sysfs(struct super_block *sb)
struct ext4_sb_info *sbi = EXT4_SB(sb);
int err;
 
+   sbi->s_kobj.kset = ext4_kset;
init_completion(&sbi->s_kobj_unregister);
-   err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, ext4_root,
+   err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, NULL,
   "%s", sb->s_id);
if (err) {
kobject_put(&sbi->s_kobj);
@@ -562,18 +575,26 @@ int __init ext4_init_sysfs(void)
 {
int ret;
 
-   ext4_root = kobject_create_and_add("ext4", fs_kobj);
-   if (!ext4_root)
+   ext4_kset = kzalloc(sizeof(*ext4_kset), GFP_KERNEL);
+   if (!ext4_kset)
return -ENOMEM;
 
+   kobject_set_name(&ext4_kset->kobj, "ext4");
+   ext4_kset->kobj.parent = fs_kobj;
+   ext4_kset->kobj.ktype = &ext4_ktype;
+   ret = kset_register(ext4_kset);
+   if (ret)
+   goto kset_err;
+
ext4_feat = kzalloc(sizeof(*ext4_feat), GFP_KERNEL);
if (!ext4_feat) {
ret = -ENOMEM;
-   goto root_err;
+   goto kset_err;
}
 
+   ext4_feat->kset = ext4_kset;
ret = kobject_init_and_add(ext4_feat, &ext4_feat_ktype,
-  ext4_root, "features");
+  NULL, "features");
if (ret)
goto feat_err;
 
@@ -583,9 +604,9 @@ int __init ext4_init_sysfs(void)
 feat_err:
kobject_put(ext4_feat);
ext4_feat = NULL;
-root_err:
-   kobject_put(ext4_root);
-   ext4_root = NULL;
+kset_err:
+   kset_unregister(ext4_kset);
+   ext4_kset = NULL;
return ret;
 }
 
@@ -593,8 +614,8 @@ void ext4_exit_sysfs(void)
 {
kobject_put(ext4_feat);
ext4_feat = NULL;
-   kobject_put(ext4_root);
-   ext4_root = NULL;
+   kset_unregister(ext4_kset);
+   ext4_kset = NULL;
remove_proc_entry(proc_dirname, NULL);
ext4_proc_root = NULL;
 }


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 04/12] ve/ext3: treat panic_on_errors as remount-ro_on_errors in CTs

2021-10-07 Thread Kirill Tkhai

This is a port from 2.6.32-x of:

* diff-ext4-in-containers-treat-panic_on_errors-as-remount-ro_on_errors

ext4: in containers treat errors=panic as errors=remount-ro

A Container can bring down the whole node if it remounts its ploop
with the 'errors=panic' option and then triggers an abort.

Signed-off-by: Konstantin Khlebnikov 
Acked-by: Maxim V. Patlasov 

Signed-off-by: Dmitry Monakhov 

khorenko@: currently we have devmnt->allowed_options options which are
configured via userspace, and currently vzctl provides an empty list.
This is an additional check - just in case someone gets a secondary
ploop image with the 'errors=panic' mount option saved in the image
and mounts it from inside a CT.

Signed-off-by: Andrey Ryabinin 
Signed-off-by: Kirill Tkhai 
---
 fs/ext4/super.c |   14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f09a2432a20e..685686f5b849 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1906,6 +1906,7 @@ static int clear_qf_name(struct super_block *sb, int 
qtype)
 #define MOPT_STRING0x0400
 #define MOPT_SKIP  0x0800
 #defineMOPT_2  0x1000
+#define MOPT_WANT_SYS_ADMIN0x4000
 
 static const struct mount_opts {
int token;
@@ -1938,7 +1939,7 @@ static const struct mount_opts {
EXT4_MOUNT_JOURNAL_CHECKSUM),
 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
-   {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
+   {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | 
MOPT_CLEAR_ERR|MOPT_WANT_SYS_ADMIN},
{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
{Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
{Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
@@ -2182,6 +2183,9 @@ static int handle_mount_opt(struct super_block *sb, char 
*opt, int token,
}
if (m->flags & MOPT_CLEAR_ERR)
clear_opt(sb, ERRORS_MASK);
+   if (m->flags & MOPT_WANT_SYS_ADMIN && !capable(CAP_SYS_ADMIN))
+   return 1;
+
if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
ext4_msg(sb, KERN_ERR, "Cannot change quota "
 "options when quota turned on");
@@ -4226,8 +4230,12 @@ static int ext4_fill_super(struct super_block *sb, void 
*data, int silent)
else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
set_opt(sb, WRITEBACK_DATA);
 
-   if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
-   set_opt(sb, ERRORS_PANIC);
+   if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) {
+   if (capable(CAP_SYS_ADMIN))
+   set_opt(sb, ERRORS_PANIC);
+   else
+   set_opt(sb, ERRORS_RO);
+   }
else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
set_opt(sb, ERRORS_CONT);
else


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 05/12] ext4: Fix error handling after filesystem abort

2021-10-07 Thread Kirill Tkhai

From: Dmitry Monakhov 

If the filesystem was aborted after an inode's writeback completed
but before its metadata was updated, we may return success,
which results in data loss.
In order to handle fs abort correctly we have to check
the fs state once we discover that it is in the MS_RDONLY state.

Signed-off-by: Dmitry Monakhov 

+++
ext4: fix broken fsync for dirs/symlink

Fixes commit 6a63db16da84fe
("ext4: Fix error handling after filesystem abort").

xfstests: generic/321 generic/335 generic/348
Signed-off-by: Dmitry Monakhov 

(cherry picked from vz7 commit 00399757c828ee82941123f6c67e7c96d906ce2b)
Signed-off-by: Konstantin Khorenko 
Signed-off-by: Kirill Tkhai 
---
 fs/ext4/super.c |6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 685686f5b849..6cf2d3e0ed8f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5784,8 +5784,12 @@ int ext4_force_commit(struct super_block *sb)
 {
journal_t *journal;
 
-   if (sb_rdonly(sb))
+   if (sb_rdonly(sb)) {
+   smp_rmb();
+   if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+   return -EROFS;
return 0;
+   }
 
journal = EXT4_SB(sb)->s_journal;
return ext4_journal_force_commit(journal);


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 03/12] ext4: don't iterate over sbi->s_es_list more than the number of elements

2021-10-07 Thread Kirill Tkhai

From: Konstantin Khorenko 

If there are several shrinkers working on a single sbi, there can easily be a
situation where neighboring shrinkers have reclaimed a bunch of extents, and
thus removed a bunch of inodes from the s_es_list, but we don't honor this and
iterate over sbi->s_es_list a number of times equal to the initial number of
inodes in s_es_list.

Before each iteration, check if we are going to iterate more than the
number of inodes in the list and adjust nr_to_walk accordingly.

https://jira.sw.ru/browse/PSBM-83335

Signed-off-by: Konstantin Khorenko 
Acked-by: Dmitry Monakhov 

(cherry picked from vz7 commit 17a5132158a4 ("ext4: don't iterate over
sbi->s_es_list more than the number of elements"))

VZ 8 rebase part https://jira.sw.ru/browse/PSBM-127798

Signed-off-by: Alexander Mikhalitsyn 
Signed-off-by: Kirill Tkhai 
---
 fs/ext4/extents_status.c |8 
 1 file changed, 8 insertions(+)

diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 9a3a8996aacf..92aa9265a117 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -1485,6 +1485,14 @@ static int __es_shrink(struct ext4_sb_info *sbi, int 
nr_to_scan,
spin_unlock(&sbi->s_es_lock);
goto out;
}
+   /*
+* Another shrinker can remove a bunch of extents in parallel,
+* we don't have to iterate more than the current number of
+* inodes in the list.
+*/
+   if (nr_to_walk > sbi->s_es_nr_inode)
+   nr_to_walk = sbi->s_es_nr_inode;
+
ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
  i_es_list);
/* Move the inode to the tail */


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 07/12] ext4: make data=writeback mode safe

2021-10-07 Thread Kirill Tkhai

From: Kirill Tkhai 

It is not obvious, but delalloc makes data=writeback mode safer.
This is because actual data allocation happens inside writepages,
so stale blocks after an unclean umount are no longer an issue.

So in order to make data=writeback mode reliable we cannot
temporarily disable delalloc in case of low disk space. It must
be enabled permanently. Original discussion:
  http://thread.gmane.org/gmane.comp.file-systems.ext4/19527

https://jira.sw.ru:9443/browse/PCLIN-299

diff-ms-ext4-safe-writeback

Signed-off-by: Dmitry Monakhov 
Signed-off-by: Kirill Tkhai 

(cherry picked from vz7 commit 025b3611cf3eba7f1a83bf34c05ea439c4ade410)
Signed-off-by: Konstantin Khorenko 
Signed-off-by: Kirill Tkhai 
---
 fs/ext4/inode.c |   11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d8de607849df..0d2268ead3e7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2915,8 +2915,15 @@ static int ext4_nonda_switch(struct super_block *sb)
if (dirty_clusters && (free_clusters < 2 * dirty_clusters))
try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
 
-   if (2 * free_clusters < 3 * dirty_clusters ||
-   free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) {
+   /*
+* NOTE: Delalloc make data=writeback mode safer, similar to ordered
+* mode, so stale blocks after power failure no longer an issue Do not
+* disable delalloc to guarantee data security on data=writeback mode.
+*  -dmon
+*/
+   if (test_opt(sb, DATA_FLAGS) != EXT4_MOUNT_WRITEBACK_DATA &&
+   (2 * free_clusters < 3 * dirty_clusters ||
+free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK))) {
/*
 * free block count is less than 150% of dirty blocks
 * or free blocks is less than watermark


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 00/12] part19 ext4

2021-10-07 Thread Kirill Tkhai

---

Dmitry Monakhov (6):
  ext4: Fix error handling after filesystem abort
  jbd2: make shure that we do not miss aborted state
  jbd2: raid amnesia protection for the journal
  ext4: add mfsync support
  ext4: add generic uevent infrastructure
  ext4: send abort uevent on ext4 journal abort

Kirill Tkhai (2):
  ve/ext3: treat panic_on_errors as remount-ro_on_errors in CTs
  ext4: make data=writeback mode safe

Konstantin Khorenko (4):
  ve/fs/namespace: allow submounts in non-init userns
  Kconfig.openvz: force CGROUP_PERF if compiling VZ Containers code
  ext4: don't iterate over sbi->s_es_list more than the number of elements
  ms/Revert "ext4: simplify kobject usage"


 fs/ext4/ext4.h  |   20 ++
 fs/ext4/extents_status.c|8 ++
 fs/ext4/fsync.c |  108 ++
 fs/ext4/inode.c |   11 ++-
 fs/ext4/ioctl.c |   60 +
 fs/ext4/super.c |  155 ++-
 fs/ext4/sysfs.c |   43 +---
 fs/jbd2/journal.c   |3 -
 fs/jbd2/recovery.c  |   77 +
 fs/namespace.c  |   25 +++
 include/trace/events/ext4.h |   54 +++
 kernel/Kconfig.openvz   |1 
 12 files changed, 544 insertions(+), 21 deletions(-)

--
Signed-off-by: Kirill Tkhai 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 08/12] jbd2: raid amnesia protection for the journal

2021-10-07 Thread Kirill Tkhai

From: Dmitry Monakhov 

https://jira.sw.ru/browse/PSBM-15484

Some block devices can return different data on read requests from the same
block after a power failure (for example, a mirrored raid is out of sync and a
resync is in progress). In that case the following situation is possible:

A power failure happens after the transaction commit block was issued for
transaction 'D'; on the next boot the first disk will have the commit block,
but the second one will not.
mirror1: journal={Ac-Bc-Cc-Dc }
mirror2: journal={Ac-Bc-Cc-D  }
Now let's assume that we read from mirror1 and find that 'D' has a
valid commit block, so journal_replay will replay that transaction, but a
second power failure may happen before journal_reset(), so the next
journal_replay() may read from mirror2 and find that 'C' is the last valid
transaction. This results in corruption because we have already replayed
transaction 'D'.
In order to avoid such ambiguity we should perform a 'stabilize write':
1) Read and rewrite the latest commit id block
2) Invalidate the next block
in order to guarantee that the journal head becomes stable.

Signed-off-by: Dmitry Monakhov 
Signed-off-by: Andrey Ryabinin 
Signed-off-by: Kirill Tkhai 
---
 fs/jbd2/recovery.c |   77 +++-
 1 file changed, 76 insertions(+), 1 deletion(-)

diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index d47a0d96bf30..01b937aa0a81 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -33,6 +33,9 @@ struct recovery_info
int nr_replays;
int nr_revokes;
int nr_revoke_hits;
+
+   unsigned intlast_log_block;
+   struct buffer_head  *last_commit_bh;
 };
 
 static int do_one_pass(journal_t *journal,
@@ -268,6 +271,71 @@ static int fc_do_one_pass(journal_t *journal,
return err;
 }
 
+/*
+ * The 'Raid amnesia' effect protection: https://jira.sw.ru/browse/PSBM-15484
+ *
+ * Some blockdevices can return different data on read requests from same block
+ * after power failure (for example mirrored raid is out of sync, and resync is
+ * in progress) In that case following sutuation is possible:
+ *
+ * Power failure happen after transaction commit log was issued for
+ * transaction 'D', next boot first dist will have commit block, but
+ * second one will not.
+ * mirror1: journal={Ac-Bc-Cc-Dc }
+ * mirror2: journal={Ac-Bc-Cc-D  }
+ * Now let's let assumes that we read from mirror1 and found that 'D' has
+ * valid commit block, so journal_replay will replay that transaction, but
+ * second power failure may happen before journal_reset() so next
+ * journal_replay() may read from mirror2 and found that 'C' is last valid
+ * transaction. This result in corruption because we already replayed
+ * trandaction 'D'.
+ * In order to avoid such ambiguity we should pefrorm 'stabilize write'.
+ * 1) Read and rewrite latest commit id block
+ * 2) Invalidate next block in
+ * order to guarantee that journal head becomes stable.
+ * Yes i know that 'stabilize write' approach is ugly but this is the only
+ * way to run filesystem on blkdevices with 'raid amnesia' effect
+ */
+static int stabilize_journal_head(journal_t *journal, struct recovery_info 
*info)
+{
+   struct buffer_head *bh[2] = {NULL, NULL};
+   int err, err2, i;
+
+   if (!info->last_commit_bh)
+   return 0;
+
+   bh[0] = info->last_commit_bh;
+   info->last_commit_bh = NULL;
+
+   err = jread(&bh[1], journal, info->last_log_block);
+   if (err)
+   goto out;
+
+   for (i = 0; i < 2; i++) {
+   lock_buffer(bh[i]);
+   /* Explicitly invalidate block beyond last commit block */
+   if (i == 1)
+   memset(bh[i]->b_data, 0, journal->j_blocksize);
+
+   BUFFER_TRACE(bh[i], "marking dirty");
+   set_buffer_uptodate(bh[i]);
+   mark_buffer_dirty(bh[i]);
+   BUFFER_TRACE(bh[i], "marking uptodate");
+   unlock_buffer(bh[i]);
+   }
+   err = sync_blockdev(journal->j_dev);
+   /* Make sure data is on permanent storage */
+   if (journal->j_flags & JBD2_BARRIER) {
+   err2 = blkdev_issue_flush(journal->j_dev);
+   if (!err)
+   err = err2;
+   }
+out:
+   brelse(bh[0]);
+   brelse(bh[1]);
+   return err;
+}
+
 /**
  * jbd2_journal_recover - recovers a on-disk journal
  * @journal: the journal to recover
@@ -304,6 +372,8 @@ int jbd2_journal_recover(journal_t *journal)
}
 
err = do_one_pass(journal, &info, PASS_SCAN);
+   if (!err)
+   err = stabilize_journal_head(journal, &info);
if (!err)
err = do_one_pass(journal, &info, PASS_REVOKE);
if (!err)
@@ -354,6 +4

[Devel] [PATCH RH9 06/12] jbd2: make shure that we do not miss aborted state

2021-10-07 Thread Kirill Tkhai

From: Dmitry Monakhov 

Signed-off-by: Dmitry Monakhov 

(cherry picked from vz7 commit 2398d7694d2afe5cf83e379ad4ea6e2ddc191675)
Signed-off-by: Konstantin Khorenko 
Signed-off-by: Kirill Tkhai 
---
 fs/jbd2/journal.c |3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 35302bc192eb..4a879e04f4b1 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -713,10 +713,9 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
!tid_gt(tid, journal->j_commit_sequence));
read_lock(&journal->j_state_lock);
}
-   read_unlock(&journal->j_state_lock);
-
if (unlikely(is_journal_aborted(journal)))
err = -EIO;
+   read_unlock(&journal->j_state_lock);
return err;
 }
 


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 09/12] ext4: add mfsync support

2021-10-07 Thread Kirill Tkhai

From: Dmitry Monakhov 

Add the EXT4_IOC_MFSYNC ioctl, which allows performing a sync on a given set
of files in an optimized way (only 1 barrier is required in the best scenario)

https://jira.sw.ru/browse/PSBM-18567

Signed-off-by: Dmitry Monakhov 

+++
Comment on rebasing to rh7 kernel-3.10.0-229.7.2.el7:

1) compile fix for ext4-add-mfsync-support

   ext4_flush_unwritten_io was removed in rh7-3.10.0-229.7.2

   https://jira.sw.ru/browse/PSBM-34909

2) compile fix for ext4-add-mfsync-support part2

   __sync_inode was removed in rh7-3.10.0-229.7.2
   It is honest to simply disable mfsync in  nojournal mode since we
   do not test nojournal mode at all.

   https://jira.sw.ru/browse/PSBM-34910

Signed-off-by: Dmitry Monakhov 

Rebase to vz8 kernel note:
  mutex_unlock(&inode->i_mutex) -> inode_lock_shared(inode)

Signed-off-by: Konstantin Khorenko 
Signed-off-by: Kirill Tkhai 
---
 fs/ext4/ext4.h  |7 +++
 fs/ext4/fsync.c |  108 +++
 fs/ext4/ioctl.c |   60 
 include/trace/events/ext4.h |   54 ++
 4 files changed, 229 insertions(+)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index df46d5586ca1..5f6fdd5514b2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -615,6 +615,11 @@ struct compat_ext4_new_group_input {
 };
 #endif
 
+struct ext4_ioc_mfsync_info {
+   __u32 size;
+   __u32 fd[0];
+};
+
 /* The struct ext4_new_group_input in kernel space, with free_blocks_count */
 struct ext4_new_group_data {
__u32 group;
@@ -722,6 +727,7 @@ enum {
 #define EXT4_IOC_GET_ES_CACHE  _IOWR('f', 42, struct fiemap)
 #define EXT4_IOC_OPEN_BALLOON  _IO('f', 42)
 #define EXT4_IOC_CHECKPOINT_IOW('f', 43, __u32)
+#define EXT4_IOC_MFSYNC_IO('f', 43)
 
 #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
 
@@ -2814,6 +2820,7 @@ extern int ext4_check_all_de(struct inode *dir, struct 
buffer_head *bh,
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
+extern int ext4_sync_files(struct file **, unsigned int *, unsigned int);
 
 /* hash.c */
 extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 027a7d7037a0..8179066765bd 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -185,3 +185,111 @@ int ext4_sync_file(struct file *file, loff_t start, 
loff_t end, int datasync)
trace_ext4_sync_file_exit(inode, ret);
return ret;
 }
+
+int ext4_sync_files(struct file **files, unsigned int *flags, unsigned int 
nr_files)
+{
+   struct super_block *sb;
+   journal_t *journal;
+   int err = 0, err2 = 0, i = 0, j = 0;
+   int force_commit = 0, datawriteback = 0;
+   tid_t commit_tid = 0;
+   int need_barrier = 0;
+
+   J_ASSERT(ext4_journal_current_handle() == NULL);
+   if (!nr_files)
+   return 0;
+
+   sb = files[0]->f_mapping->host->i_sb;
+   journal = EXT4_SB(sb)->s_journal;
+   if (sb->s_flags & SB_RDONLY) {
+   /* Make shure that we read updated s_mount_flags value */
+   smp_rmb();
+   if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+   return -EROFS;
+   return 0;
+   }
+   for (i = 0; i < nr_files; i++) {
+   struct address_space * mapping = files[i]->f_mapping;
+   struct inode *inode = mapping->host;
+
+   BUG_ON(sb != inode->i_sb);
+   if (!mapping->nrpages)
+   continue;
+
+   err = filemap_fdatawrite(mapping);
+   if (err)
+   break;
+
+   }
+   /*
+* Even if the above returned error, the pages may be
+* written partially (e.g. -ENOSPC), so we wait for it.
+* But the -EIO is special case, it may indicate the worst
+* thing (e.g. bug) happened, so we avoid waiting for it.
+*/
+   if (err == -EIO)
+   goto out;
+
+   for (j = 0; j < i; j++) {
+   struct address_space * mapping = files[j]->f_mapping;
+   struct inode *inode = mapping->host;
+   struct ext4_inode_info *ei = EXT4_I(inode);
+   unsigned int datasync = flags[j];
+   tid_t tid;
+
+   if (mapping->nrpages) {
+   err2 = filemap_fdatawait(mapping);
+   if (!err || err2 == -EIO)
+   err = err2;
+   }
+
+   inode_lock_shared(inode);
+   force_commit  |= ext4_should_journal_data(inode);
+   datawriteback |= ext4_should_writeback_data(inode);
+   tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+   inode_unlock_shared(inode);
+

[Devel] [PATCH RH9 02/12] Kconfig.openvz: force CGROUP_PERF if compiling VZ Containers code

2021-10-07 Thread Kirill Tkhai

From: Konstantin Khorenko 

The perf_event_open() syscall is available from Containers.  The
CONFIG_CGROUP_PERF option is set in current OpenVZ kernel configs, but let's
force-enable it if CONFIG_VE is enabled to prevent possible non-secure kernel
config if someone rebuilds the kernel with own config.

https://jira.sw.ru/browse/PSBM-51360

Signed-off-by: Konstantin Khorenko 

(cherry picked from vz7 commit a35598ba04acf80424fd8f997686a2edd3c3dcb8)
Signed-off-by: Konstantin Khorenko 
---
 kernel/Kconfig.openvz |1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/Kconfig.openvz b/kernel/Kconfig.openvz
index 9489342596ab..6ea4f707df61 100644
--- a/kernel/Kconfig.openvz
+++ b/kernel/Kconfig.openvz
@@ -19,6 +19,7 @@ config VE
select CGROUPS
select CGROUP_DEVICE
select CGROUP_FREEZER
+   select CGROUP_PERF
help
  This option adds support of virtual Linux running on the original box
  with fully supported virtual network driver, tty subsystem and


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 01/12] ve/fs/namespace: allow submounts in non-init userns

2021-10-07 Thread Kirill Tkhai

From: Konstantin Khorenko 

Simple NFS mount inside a Container brings us to vfs_submount(), so if
we want to enable NFS inside a Container (read - in CT root userns), we
have to soften the check for init userns.

SyS_mount
 do_mount
  vfs_kern_mount
   mount_fs
nfs_fs_mount
 nfs4_try_mount
  nfs_follow_remote_path
   mount_subtree
vfs_path_lookup
 do_path_lookup
  filename_lookup
   path_lookupat
lookup_slow
 follow_managed
  nfs_d_automount
   nfs4_submount
nfs_do_submount
 vfs_submount

https://jira.sw.ru/browse/PSBM-86277
Signed-off-by: Konstantin Khorenko 

https://jira.sw.ru/browse/PSBM-127234
(cherry picked from vz7 commit bc060d46276144f91a139b7d0acf384dcd0a4dde)

vz7->vz8 port note: in vz7 the check has been dropped at all
in vz8 we leave the check, but allow submounts only for root CT userns.

Signed-off-by: Konstantin Khorenko 
Reviewed-by: Pavel Tikhomirov 

+++
ve/fs/namespace: fix allowing submounts in non-init userns

When mounting nfs4 mount inside container with something like:

  mount -t nfs4 $NODEIP:/root/build/criu /mnt

we can see that because the source "root" path is several directories
long we do create several submounts.

Adding perf probes to list mountpoint->d_sb->s_user_ns and
mountpoint->d_iname from vfs_submount we see:

crash > p &init_user_ns
$2 = (struct user_namespace *) 0x9644efc0

1) First submount created has mountpoint dentry "root" and ve userns:
mount.nfs4 ...: probe:vfs_submount: (95a970e0)
user_ns=0x8b6d6e86a000 dentry="root"

2) Second submount created has mountpoint dentry "build" from first
submount and init userns of host:
mount.nfs4 ...: probe:vfs_submount: (95a970e0)
user_ns=0x9644efc0 dentry="build"

So on the first step we have the ve userns and on the second the init userns.
Comparing it to either the init userns or the ve userns would not work because
we can have both of them. So the easy solution here is to disable the check
completely, like we do in vz7.

Note: this patch allows nfs4 mounts in containers, thus we overcome
nfs3 rpcbind non-dumpable socket migration problems, as now nfs mounts
in v4 mode by default.

https://jira.sw.ru/browse/PSBM-102629
mFixes: 81a2b734416d ("ve/fs/namespace: allow submounts in non-init
userns")
Signed-off-by: Pavel Tikhomirov 
Signed-off-by: Kirill Tkhai 
---
 fs/namespace.c |   25 +
 1 file changed, 25 insertions(+)

diff --git a/fs/namespace.c b/fs/namespace.c
index c10614908e7e..85a451861e14 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1051,12 +1051,37 @@ struct vfsmount *
 vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
 const char *name, void *data)
 {
+#if 0
/* Until it is worked out how to pass the user namespace
 * through from the parent mount to the submount don't support
 * unprivileged mounts with submounts.
 */
+   /* Simple NFS mount inside a Container brings us here, so if we want to
+* enable NFS inside a Container (read - in non-init userns), we have
+* to omit the check. Below is how is was in VZ8:
+*
+*  SyS_mount
+*   do_mount
+*vfs_kern_mount
+* mount_fs
+*  nfs_fs_mount
+*   nfs4_try_mount
+*nfs_follow_remote_path
+* mount_subtree
+*  vfs_path_lookup
+*   do_path_lookup
+*filename_lookup
+* path_lookupat
+*  lookup_slow
+*   follow_managed
+*nfs_d_automount
+* nfs4_submount
+*  nfs_do_submount
+*   vfs_submount
+*/
if (mountpoint->d_sb->s_user_ns != &init_user_ns)
return ERR_PTR(-EPERM);
+#endif
 
return vfs_kern_mount(type, SB_SUBMOUNT, name, data);
 }


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH RH9 0/5] part 12 userfaultfd

2021-10-06 Thread Kirill Tkhai

Committed

On 05.10.2021 20:54, Cyrill Gorcunov wrote:
> Hi! The series addresses patches left from userfaultfd handling,
> which were mostly merged already so only netlink and a few puckups
> left. I had to tune up netlink series since it didn't apply smoothly:
> datagram sending proto has been lifted up and netlink socket errors
> has been depending on repair mode so the patches were not build-able
> step by step.
> 
> Andrey Vagin (1):
>   netlink: add an ability to restore messages in a receive queue
> 
> Andrey Zhadchenko (1):
>   netlink: add an option to set sk->err from userspace
> 
> Angelo Ruocco (2):
>   ms/cgroup: let a symlink too be created with a cftype file
>   ms/block, bfq: add weight symlink to the bfq.weight cgroup parameter
> 
> Stanislav Kinsburskiy (1):
>   netlink: allow to set peeking offset for sockets
> 
>  block/bfq-cgroup.c   |  6 ++-
>  include/linux/cgroup-defs.h  |  3 ++
>  include/uapi/linux/netlink.h |  2 +
>  kernel/cgroup/cgroup.c   | 33 +++--
>  net/netlink/af_netlink.c | 92 
>  net/netlink/af_netlink.h |  2 +
>  6 files changed, 112 insertions(+), 26 deletions(-)
> 
> 
> base-commit: 26c73ba86152babe66810a7c153a0dfc1f1edc49
> 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH vz9 00/27] part17

2021-10-06 Thread Kirill Tkhai

Committed

On 06.10.2021 11:57, Nikita Yushchenko wrote:
> Andrey Ryabinin (3):
>   x86: make ARCH_[SET|GET]_CPUID friends with /proc/vz/cpuid_override
>   x86, cpuinfo: Fix race on parallel /proc/cpuinfo read #PSBM-121823
>   x86: don't enable cpuid faults if /proc/vz/cpuid_override unused
> #PSBM-121823
> 
> Evgenii Shatokhin (1):
>   sched: show CPU stats for a cgroup in cpu.proc.stat file
> 
> Kirill Tkhai (3):
>   sched: Count loadavg under rq::lock in calc_load_nohz_start()
>   sched/ve: Do not show loadavg in child VE cpu cgroups
>   x86: Show vcpu cpuflags in cpuinfo
> 
> Konstantin Khorenko (12):
>   kernel/stat: Introduce kernel_cpustat operation wrappers
>   ve/sched/stat: Add basic infrastructure for vcpu statistics
>   ve/sched/stat: Introduce functions to calculate vcpustat data
>   ve/proc/stat: Wire virtualized /proc/stat handler
>   sched: Fix task_group "iowait_sum" statistic accounting
>   ve/sched/stat: Introduce handler for getting CT cpu statistics
>   ve/time/stat: idle time virtualization in /proc/loadavg
>   ve/proc/stat: Introduce CPUTIME_USED field in cpustat statistic
>   ve/vestat: Introduce /proc/vz/vestat
>   ve/net/core: allow to call setsockopt(SO_SNDBUFFORCE) from Containers
>   ve/net/core: allow to call setsockopt(SO_RCVBUFFORCE) from Containers
>   vecalls: Introduce VZCTL_GET_CPU_STAT ioctl
> 
> Nikita Yushchenko (1):
>   ve: uninline ve_get_monotonic() and ve_get_uptime()
> 
> Pavel Tikhomirov (2):
>   ve/proc/net/nr_cpus: Cut lines in /proc/net/softnet_stat to number of
> vcpus in CT
>   ve: allow writing to features in pseudosuper state
> 
> Stanislav Kinsburskiy (2):
>   ve/fs/aio: aio_nr & aio_max_nr variables virtualization
>   ve/aio: Add a handle to checkpoint/restore AIO context
> 
> Vladimir Davydov (3):
>   sched/stat: account ctxsw per task group
>   sched/stat: account forks per task group
>   arch/x86: introduce cpuid override
> 
>  arch/x86/include/asm/msr-index.h   |   1 +
>  arch/x86/include/asm/thread_info.h |   4 +-
>  arch/x86/include/asm/traps.h   |   2 +
>  arch/x86/kernel/Makefile   |   1 +
>  arch/x86/kernel/cpu/proc.c |  80 +-
>  arch/x86/kernel/cpuid_fault.c  | 249 
>  arch/x86/kernel/process.c  |  13 +-
>  arch/x86/kernel/traps.c|  27 ++
>  fs/aio.c   | 137 +++--
>  fs/proc/base.c |  27 ++
>  fs/proc/stat.c |  10 +
>  fs/proc/uptime.c   |  30 +-
>  include/linux/aio.h|  19 +-
>  include/linux/cpuid_override.h |  38 +++
>  include/linux/kernel_stat.h|  37 +++
>  include/linux/ve.h |  54 ++--
>  kernel/sched/core.c|  28 +-
>  kernel/sched/cpuacct.c | 441 +
>  kernel/sched/fair.c|  21 +-
>  kernel/sched/loadavg.c |   6 +-
>  kernel/sched/sched.h   |   9 +
>  kernel/sysctl.c|  16 +-
>  kernel/time/time.c |   1 +
>  kernel/ve/ve.c | 107 ++-
>  kernel/ve/vecalls.c| 159 +++
>  net/core/net-procfs.c  |   3 +-
>  net/core/sock.c|  14 +-
>  27 files changed, 1438 insertions(+), 96 deletions(-)
>  create mode 100644 arch/x86/kernel/cpuid_fault.c
>  create mode 100644 include/linux/cpuid_override.h
> 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH RH9 1/5] ms/cgroup: let a symlink too be created with a cftype file

2021-10-06 Thread Kirill Tkhai

On 05.10.2021 20:54, Cyrill Gorcunov wrote:
> From: Angelo Ruocco 
> 
> This commit enables a cftype to have a symlink (of any name) that
> points to the file associated with the cftype.
> 
> Signed-off-by: Angelo Ruocco 
> Signed-off-by: Paolo Valente 
> Signed-off-by: Jens Axboe 
> 
> https://jira.sw.ru/browse/PSBM-101019
> (cherry-picked from 54b7b868e826b294687c439b68ec55fe20cafe5b)
> Signed-off-by: Andrey Ryabinin 
> Signed-off-by: Cyrill Gorcunov 
> ---
>  include/linux/cgroup-defs.h |  3 +++
>  kernel/cgroup/cgroup.c  | 33 +
>  2 files changed, 32 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
> index 583ce2bce98c..0bb884ce 100644
> --- a/include/linux/cgroup-defs.h
> +++ b/include/linux/cgroup-defs.h
> @@ -125,6 +125,8 @@ enum {
>*/
>   CFTYPE_VE_WRITABLE  = (1 << 15),
>  
> + CFTYPE_SYMLINKED= (1 << 6), /* pointed to by symlink too */

We already have:

CFTYPE_PRESSURE = (1 << 6), /* only if pressure feature is 
enabled */  


> +
>   /* internal flags, do not use outside cgroup core proper */
>   __CFTYPE_ONLY_ON_DFL= (1 << 16),/* only on default hierarchy */
>   __CFTYPE_NOT_ON_DFL = (1 << 17),/* not on default hierarchy */
> @@ -552,6 +554,7 @@ struct cftype {
>* end of cftype array.
>*/
>   char name[MAX_CFTYPE_NAME];
> + char link_name[MAX_CFTYPE_NAME];
>   unsigned long private;
>  
>   /*
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 08b7cff7a1c3..74d0b503e696 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -1471,8 +1471,8 @@ struct cgroup *task_cgroup_from_root(struct task_struct 
> *task,
>  
>  static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
>  
> -static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
> -   char *buf)
> +static char *cgroup_fill_name(struct cgroup *cgrp, const struct cftype *cft,
> +   char *buf, bool write_link_name)
>  {
>   struct cgroup_subsys *ss = cft->ss;
>  
> @@ -1482,13 +1482,26 @@ static char *cgroup_file_name(struct cgroup *cgrp, 
> const struct cftype *cft,
>  
>   snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
>dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
> -  cft->name);
> +  write_link_name ? cft->link_name : cft->name);
>   } else {
> - strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
> + strscpy(buf, write_link_name ? cft->link_name : cft->name,
> + CGROUP_FILE_NAME_MAX);
>   }
>   return buf;
>  }
>  
> +static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
> +   char *buf)
> +{
> + return cgroup_fill_name(cgrp, cft, buf, false);
> +}
> +
> +static char *cgroup_link_name(struct cgroup *cgrp, const struct cftype *cft,
> +   char *buf)
> +{
> + return cgroup_fill_name(cgrp, cft, buf, true);
> +}
> +
>  /**
>   * cgroup_file_mode - deduce file mode of a control file
>   * @cft: the control file in question
> @@ -1647,6 +1660,9 @@ static void cgroup_rm_file(struct cgroup *cgrp, const 
> struct cftype *cft)
>   }
>  
>   kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
> + if (cft->flags & CFTYPE_SYMLINKED)
> + kernfs_remove_by_name(cgrp->kn,
> +   cgroup_link_name(cgrp, cft, name));
>  }
>  
>  /**
> @@ -4012,6 +4028,7 @@ static int cgroup_add_file(struct cgroup_subsys_state 
> *css, struct cgroup *cgrp,
>  {
>   char name[CGROUP_FILE_NAME_MAX];
>   struct kernfs_node *kn;
> + struct kernfs_node *kn_link;
>   struct lock_class_key *key = NULL;
>   int ret;
>  
> @@ -4042,6 +4059,14 @@ static int cgroup_add_file(struct cgroup_subsys_state 
> *css, struct cgroup *cgrp,
>   spin_unlock_irq(&cgroup_file_kn_lock);
>   }
>  
> + if (cft->flags & CFTYPE_SYMLINKED) {
> + kn_link = kernfs_create_link(cgrp->kn,
> +  cgroup_link_name(cgrp, cft, name),
> +  kn);
> + if (IS_ERR(kn_link))
> + return PTR_ERR(kn_link);
> + }
> +
>   return 0;
>  }
>  
> 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH RH9 0/6] ext4: Balloon patches

2021-10-05 Thread Kirill Tkhai

Committed

On 05.10.2021 18:42, Kirill Tkhai wrote:
> https://jira.sw.ru/browse/PSBM-134003
> 
> ---
> 
> Kirill Tkhai (2):
>   Date:   Wed Oct 7 14:47:07 2015 +0400
>   fs: Revert ee1904ba44bd "make alloc_file() static"
> 
> Konstantin Khorenko (1):
>   ext4: Provide a balloon nipple for management
> 
> Maxim V. Patlasov (3):
>   ext4: Teach the fs where the balloon inode is
>   ext4: Teach statfs to report reduced disk usage
>   ext4: Don't show the active balloon to user
> 
> 
>  fs/ext4/dir.c|   15 ++-
>  fs/ext4/ext4.h   |3 +
>  fs/ext4/ioctl.c  |   59 +++
>  fs/ext4/namei.c  |9 
>  fs/ext4/super.c  |  111 
> +++---
>  fs/file_table.c  |3 +
>  fs/inode.c   |1 
>  include/linux/file.h |    2 +
>  8 files changed, 194 insertions(+), 9 deletions(-)
> 
> --
> Signed-off-by: Kirill Tkhai 
> 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH RH9 0/7] part-10 non-connector patches

2021-10-05 Thread Kirill Tkhai

Committed into branch-rh9-5.14.vz9.1.x-ovz

On 05.10.2021 15:55, Pavel Tikhomirov wrote:
> These are patches from part-10 not directly connected with proc
> connector.
> 
> These two are renamed: "prctl: reduce requirements to exe link change",
> "ve/prctl_set_mm: allow to change mm content in ve" to
> "ve/prctl_set_mm: allow setting exe link while unprivileged for spfs".
> 
> And "ve/net: allow to rename devices in non-ve namespaces" got a crash
> fix.
> 
> Andrey Ryabinin (1):
>   ve/module: hide module refcounts from container
> 
> Kirill Tkhai (1):
>   ve/net: allow to rename devices in non-ve namespaces
> 
> Pavel Tikhomirov (2):
>   ve/prctl_set_mm: allow setting exe link while unprivileged for spfs
>   ve/coredump: virtualize kernel.core_pattern sysctl
> 
> Stanislav Kinsburskiy (3):
>   ve/kernfs: export kernfs_perms_set() helper
>   ve/sysfs: generic sysfs_set_def_perms() helper introduced
>   ve/module: export sysfs dentries in containers
> 
>  fs/coredump.c | 12 +
>  fs/kernfs/ve.c|  4 +--
>  fs/sysfs/ve.c | 11 
>  include/linux/coredump.h  |  1 -
>  include/linux/kernfs-ve.h |  4 +++
>  include/linux/sysfs-ve.h  | 20 ++
>  include/linux/ve.h|  7 +
>  kernel/module.c   | 57 ---
>  kernel/sys.c  |  6 ++---
>  kernel/sysctl.c   | 13 ++---
>  kernel/ve/ve.c| 18 +
>  net/core/dev.c| 22 +++
>  12 files changed, 157 insertions(+), 18 deletions(-)
>  create mode 100644 include/linux/sysfs-ve.h
> 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 6/6] ext4: Provide a balloon nipple for management

2021-10-05 Thread Kirill Tkhai

From: Konstantin Khorenko 

When the fs is mounted with active balloon someone will have to
inflate/blow off one. To make it possible there will be a special
ioctl for obtaining the fd.

Not very elegant solution maybe, but it's OK for PVC containers.

+++
ext4: fix file allocation check in ext4_open_balloon

Function alloc_file() doesn't return NULL (unlike in 2.6.32-x).
It returns error pointer. File structure allocation may fail before
file->f_ep_links is initialized, which may lead to crash in
eventpoll_release_file().

https://jira.sw.ru/browse/PSBM-41222

mFixes: 9cea7449aa589f325fff378e7256a3c2fc8f048d
"ext4: Provide a balloon nipple for management"

Signed-off-by: Stanislav Kinsburskiy 

(cherry picked from vz7 commit 100feb098ab22c6b8b25861c3b2dfaa9c5db0b03)
Signed-off-by: Konstantin Khorenko 

+++
ext4/balloon: Use proper O_ mode flags in balloon opening code

alloc_file() expects O_* mode flags, so provide them, not
internal FMODE_* ones.

mFixes: bee340a206d7 ("ext4: Provide a balloon nipple for management")
https://jira.sw.ru/browse/PSBM-129392

Signed-off-by: Konstantin Khorenko 
Reviewed-by: Kirill Tkhai 
Signed-off-by: Kirill Tkhai 
---
 fs/ext4/ext4.h  |1 +
 fs/ext4/ioctl.c |   59 +++
 fs/inode.c  |1 +
 3 files changed, 61 insertions(+)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9b655a94eb16..df46d5586ca1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -720,6 +720,7 @@ enum {
 #define EXT4_IOC_CLEAR_ES_CACHE_IO('f', 40)
 #define EXT4_IOC_GETSTATE  _IOW('f', 41, __u32)
 #define EXT4_IOC_GET_ES_CACHE  _IOWR('f', 42, struct fiemap)
+#define EXT4_IOC_OPEN_BALLOON  _IO('f', 42)
 #define EXT4_IOC_CHECKPOINT_IOW('f', 43, __u32)
 
 #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 6eed6170aded..6e2be4859571 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -850,6 +850,59 @@ static int ext4_ioctl_checkpoint(struct file *filp, 
unsigned long arg)
return err;
 }
 
+static int ext4_open_balloon(struct super_block *sb, struct vfsmount *mnt)
+{
+   struct inode *balloon_ino;
+   int err, fd;
+   struct file *filp;
+   struct dentry *de;
+   struct path path;
+   fmode_t mode;
+
+   balloon_ino = EXT4_SB(sb)->s_balloon_ino;
+   err = -ENOENT;
+   if (balloon_ino == NULL)
+   goto err;
+
+   err = fd = get_unused_fd_flags(0);
+   if (err < 0)
+   goto err_fd;
+
+   __iget(balloon_ino);
+   de = d_obtain_alias(balloon_ino);
+   err = PTR_ERR(de);
+   if (IS_ERR(de))
+   goto err_de;
+
+   path.dentry = de;
+   path.mnt = mntget(mnt);
+   err = mnt_want_write(path.mnt);
+   if (err)
+   mode = O_RDONLY;
+   else
+   mode = O_RDWR;
+   filp = alloc_file(&path, mode, &ext4_file_operations);
+   if (filp->f_mode & FMODE_WRITE)
+   mnt_drop_write(path.mnt);
+   if (IS_ERR(filp)) {
+   err = PTR_ERR(filp);
+   goto err_filp;
+   }
+
+   filp->f_flags |= O_LARGEFILE;
+   fd_install(fd, filp);
+   return fd;
+
+err_filp:
+   path_put(&path);
+err_de:
+   put_unused_fd(fd);
+err_fd:
+   /* nothing */
+err:
+   return err;
+}
+
 static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long 
arg)
 {
struct inode *inode = file_inode(filp);
@@ -1264,6 +1317,12 @@ static long __ext4_ioctl(struct file *filp, unsigned int 
cmd, unsigned long arg)
case EXT4_IOC_CHECKPOINT:
return ext4_ioctl_checkpoint(filp, arg);
 
+   case EXT4_IOC_OPEN_BALLOON:
+   if (!capable(CAP_SYS_ADMIN))
+   return -EACCES;
+
+   return ext4_open_balloon(inode->i_sb, filp->f_path.mnt);
+
default:
return -ENOTTY;
}
diff --git a/fs/inode.c b/fs/inode.c
index c93500d84264..55498b31f088 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -414,6 +414,7 @@ void __iget(struct inode *inode)
 {
atomic_inc(&inode->i_count);
 }
+EXPORT_SYMBOL(__iget);
 
 /*
  * get additional reference to inode; caller must already hold one.


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 4/6] ext4: Don't show the active balloon to user

2021-10-05 Thread Kirill Tkhai

From: Maxim V. Patlasov 

This is a port of
e123b6d ext4: Don't show the active balloon to user

Fix the readdir and lookup. The former one pretends the inode doesn't
exist, the latter one denies access to it. Reporting a negative dentry
in lookup is pointless, as in that case something would have to be done in
the ext4_create callback :\

[VvS RH79 rebase vz7.170.x]: minor context changes

(cherry picked from vz7 commit c231c40a93927f3080067e5d880ef11841de278c)
Signed-off-by: Konstantin Khorenko 
Signed-off-by: Kirill Tkhai 
---
 fs/ext4/dir.c   |   15 ++-
 fs/ext4/namei.c |9 +
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ffb295aa891c..8ed108299fbb 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -123,6 +123,14 @@ int __ext4_check_dir_entry(const char *function, unsigned 
int line,
return 1;
 }
 
+static inline int ext4_balloon(struct super_block *sb, unsigned ino)
+{
+   struct ext4_sb_info *sbi;
+
+   sbi = EXT4_SB(sb);
+   return sbi->s_balloon_ino && (sbi->s_balloon_ino->i_ino == ino);
+}
+
 static int ext4_readdir(struct file *file, struct dir_context *ctx)
 {
unsigned int offset;
@@ -267,7 +275,8 @@ static int ext4_readdir(struct file *file, struct 
dir_context *ctx)
}
offset += ext4_rec_len_from_disk(de->rec_len,
sb->s_blocksize);
-   if (le32_to_cpu(de->inode)) {
+   if (le32_to_cpu(de->inode) &&
+   !ext4_balloon(sb, le32_to_cpu(de->inode))) {
if (!IS_ENCRYPTED(inode)) {
if (!dir_emit(ctx, de->name,
de->name_len,
@@ -534,6 +543,9 @@ static int call_filldir(struct file *file, struct 
dir_context *ctx,
}
ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
while (fname) {
+   if (ext4_balloon(sb, fname->inode))
+   goto skip;
+
if (!dir_emit(ctx, fname->name,
fname->name_len,
fname->inode,
@@ -541,6 +553,7 @@ static int call_filldir(struct file *file, struct 
dir_context *ctx,
info->extra_fname = fname;
return 1;
}
+skip:
fname = fname->next;
}
return 0;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index f3bbcd4efb56..4a71df8bf8d8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1797,6 +1797,11 @@ static struct dentry *ext4_lookup(struct inode *dir, 
struct dentry *dentry, unsi
iput(inode);
return ERR_PTR(-EPERM);
}
+   if (!IS_ERR(inode) &&
+   inode == EXT4_SB(inode->i_sb)->s_balloon_ino) {
+   iput(inode);
+   return ERR_PTR(-EPERM);
+   }
}
 
 #ifdef CONFIG_UNICODE
@@ -3392,6 +3397,10 @@ static int ext4_unlink(struct inode *dir, struct dentry 
*dentry)
retval = dquot_initialize(d_inode(dentry));
if (retval)
goto out_trace;
+if (d_inode(dentry) == EXT4_SB(dir->i_sb)->s_balloon_ino) {
+   retval = -EPERM;
+goto out_trace;
+   }
 
handle = ext4_journal_start(dir, EXT4_HT_DIR,
EXT4_DATA_TRANS_BLOCKS(dir->i_sb));


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 5/6] fs: Revert ee1904ba44bd "make alloc_file() static"

2021-10-05 Thread Kirill Tkhai

Signed-off-by: Kirill Tkhai 
---
 fs/file_table.c  |3 ++-
 include/linux/file.h |2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index 45437f8e1003..f624f1a069e8 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -184,7 +184,7 @@ struct file *alloc_empty_file_noaccount(int flags, const 
struct cred *cred)
  * @flags: O_... flags with which the new file will be opened
  * @fop: the 'struct file_operations' for the new file
  */
-static struct file *alloc_file(const struct path *path, int flags,
+struct file *alloc_file(const struct path *path, int flags,
const struct file_operations *fop)
 {
struct file *file;
@@ -210,6 +210,7 @@ static struct file *alloc_file(const struct path *path, int 
flags,
i_readcount_inc(path->dentry->d_inode);
return file;
 }
+EXPORT_SYMBOL(alloc_file);
 
 struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
const char *name, int flags,
diff --git a/include/linux/file.h b/include/linux/file.h
index 2de2e4613d7b..bdaefc80bb28 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -22,6 +22,8 @@ struct vfsmount;
 struct dentry;
 struct inode;
 struct path;
+extern struct file *alloc_file(const struct path *path, int flags,
+   const struct file_operations *fop);
 extern struct file *alloc_file_pseudo(struct inode *, struct vfsmount *,
const char *, int flags, const struct file_operations *);
 extern struct file *alloc_file_clone(struct file *, int flags,


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 1/6] ext4: Teach the fs where the balloon inode is

2021-10-05 Thread Kirill Tkhai

From: Maxim V. Patlasov 

This is a port of
da0fae4 ext4: Teach the fs where the balloon inode is

This adds the balloon_ino mount option and stores the inode
pointer on the in-memory super block object.

This is not a good solution - in a perfect world the balloon
inode should be hidden (like the journalling one), but this
requires
a) reserve its number in the mainline sources;)
b) teach e2fsprogs not to treat one as orphaned

Until (if) we do this it's better to keep this as a regular
file on the disk.

(cherry picked from vz7 commit 54ac06cf671c68a3778e9f939ba3794fd6a51470)
Signed-off-by: Konstantin Khorenko 
Signed-off-by: Kirill Tkhai 
---
 fs/ext4/ext4.h  |2 +
 fs/ext4/super.c |   91 +++
 2 files changed, 87 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3c51e243450d..9b655a94eb16 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1579,6 +1579,8 @@ struct ext4_sb_info {
atomic_t s_mb_discarded;
atomic_t s_lock_busy;
 
+   struct inode *s_balloon_ino;
+
/* locality groups */
struct ext4_locality_group __percpu *s_locality_groups;
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index befbb0892fdd..3bc2cfb04518 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1682,6 +1682,7 @@ enum {
 #ifdef CONFIG_EXT4_DEBUG
Opt_fc_debug_max_replay, Opt_fc_debug_force
 #endif
+   Opt_balloon_ino,
 };
 
 static const match_table_t tokens = {
@@ -1786,6 +1787,7 @@ static const match_table_t tokens = {
{Opt_removed, "reservation"},   /* mount option from ext2/3 */
{Opt_removed, "noreservation"}, /* mount option from ext2/3 */
{Opt_removed, "journal=%u"},/* mount option from ext2/3 */
+   {Opt_balloon_ino, "balloon_ino=%u"},
{Opt_err, NULL},
 };
 
@@ -2009,6 +2011,7 @@ static const struct mount_opts {
 MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
{Opt_fc_debug_max_replay, 0, MOPT_GTE0},
 #endif
+   {Opt_balloon_ino, 0, 0},
{Opt_err, 0, 0}
 };
 
@@ -2093,7 +2096,8 @@ struct ext4_parsed_options {
 
 static int handle_mount_opt(struct super_block *sb, char *opt, int token,
substring_t *args, struct ext4_parsed_options 
*parsed_opts,
-   int is_remount)
+
+   unsigned long *balloon_ino, int is_remount)
 {
struct ext4_sb_info *sbi = EXT4_SB(sb);
const struct mount_opts *m;
@@ -2300,6 +2304,8 @@ static int handle_mount_opt(struct super_block *sb, char 
*opt, int token,
} else if (token == Opt_test_dummy_encryption) {
return ext4_set_test_dummy_encryption(sb, opt, &args[0],
  is_remount);
+   } else if (token == Opt_balloon_ino) {
+   *balloon_ino = arg;
} else if (m->flags & MOPT_DATAJ) {
if (is_remount) {
if (!sbi->s_journal)
@@ -2420,6 +2426,7 @@ static int handle_mount_opt(struct super_block *sb, char 
*opt, int token,
 
 static int parse_options(char *options, struct super_block *sb,
 struct ext4_parsed_options *ret_opts,
+unsigned long *balloon_ino,
 int is_remount)
 {
struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb);
@@ -2440,7 +2447,7 @@ static int parse_options(char *options, struct 
super_block *sb,
args[0].to = args[0].from = NULL;
token = match_token(p, tokens, args);
if (handle_mount_opt(sb, p, token, args, ret_opts,
-is_remount) < 0)
+balloon_ino, is_remount) < 0)
return 0;
}
 #ifdef CONFIG_QUOTA
@@ -2628,6 +2635,10 @@ static int _ext4_show_options(struct seq_file *seq, 
struct super_block *sb,
} else if (test_opt2(sb, DAX_INODE)) {
SEQ_OPTS_PUTS("dax=inode");
}
+
+   if (sbi->s_balloon_ino)
+   SEQ_OPTS_PRINT("balloon_ino=%ld", sbi->s_balloon_ino->i_ino);
+
ext4_show_quota_options(seq, sb);
return 0;
 }
@@ -4014,6 +4025,54 @@ static const char *ext4_quota_mode(struct super_block 
*sb)
 #endif
 }
 
+static void ext4_load_balloon(struct super_block *sb, unsigned long ino)
+{
+   struct inode *inode;
+   struct ext4_sb_info *sbi;
+
+   sbi = EXT4_SB(sb);
+
+   if (!ino) {
+   /* FIXME locking */
+   if (sbi->s_balloon_ino) {
+   iput(sbi->s_balloon_ino);
+   sbi->s_balloon_ino = NULL;
+   }
+
+   return;
+   }
+
+   if (ino < EXT4_FIRST_INO(sb)) {
+   ext4_msg(sb, KERN_WARNING, "bad balloon inode specified");
+   return;
+   }
+
+

[Devel] [PATCH RH9 3/6] ext4: Teach statfs to report reduced disk usage

2021-10-05 Thread Kirill Tkhai

From: Maxim V. Patlasov 

The magic 9 in there came from 512 bytes - the i_blocks is accounted
in these units in any case.

(cherry picked from vz7 commit 4b10f27018d330d9e03e932a98a41a3e55da81fb)
Signed-off-by: Konstantin Khorenko 
Signed-off-by: Kirill Tkhai 
---
 fs/ext4/super.c |   16 
 1 file changed, 16 insertions(+)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 03ade65fbe51..f09a2432a20e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -6290,6 +6290,22 @@ static int ext4_statfs(struct dentry *dentry, struct 
kstatfs *buf)
sb_has_quota_limits_enabled(sb, PRJQUOTA))
ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf);
 #endif
+
+   if (sbi->s_balloon_ino) {
+   struct ext4_inode_info *ei;
+   blkcnt_t balloon_blocks;
+
+   balloon_blocks = sbi->s_balloon_ino->i_blocks;
+   ei = EXT4_I(sbi->s_balloon_ino);
+   spin_lock(&ei->i_block_reservation_lock);
+   balloon_blocks += ei->i_reserved_data_blocks;
+   spin_unlock(&ei->i_block_reservation_lock);
+
+   BUG_ON(sbi->s_balloon_ino->i_blkbits < 9);
+   buf->f_blocks -= balloon_blocks >>
+(sbi->s_balloon_ino->i_blkbits - 9);
+   }
+
return 0;
 }
 


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 0/6] ext4: Balloon patches

2021-10-05 Thread Kirill Tkhai

https://jira.sw.ru/browse/PSBM-134003

---

Kirill Tkhai (2):
  Date:   Wed Oct 7 14:47:07 2015 +0400
  fs: Revert ee1904ba44bd "make alloc_file() static"

Konstantin Khorenko (1):
  ext4: Provide a balloon nipple for management

Maxim V. Patlasov (3):
  ext4: Teach the fs where the balloon inode is
  ext4: Teach statfs to report reduced disk usage
  ext4: Don't show the active balloon to user


 fs/ext4/dir.c|   15 ++-
 fs/ext4/ext4.h   |3 +
 fs/ext4/ioctl.c  |   59 +++
 fs/ext4/namei.c  |9 
 fs/ext4/super.c  |  111 +++---
 fs/file_table.c  |3 +
 fs/inode.c   |1 
 include/linux/file.h |2 +
 8 files changed, 194 insertions(+), 9 deletions(-)

--
Signed-off-by: Kirill Tkhai 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 2/6] Date: Wed Oct 7 14:47:07 2015 +0400

2021-10-05 Thread Kirill Tkhai

ve/fs: Allow to mount ext4 in top CT userns

https://jira.sw.ru/browse/PSBM-40100

v2: Check that user_ns is initial for the ve.
v3: Be sure ve->init_cred is set.

Signed-off-by: Kirill Tkhai 
Acked-by: Vladimir Davydov 

khorenko@: in fact we allowed to do those mounts in top CT user ns only.

(cherry picked from vz7 commit d8aabe8924283e12ef30dee49253f91f33d3e9bc
("ve/fs: Allow to mount ext4 and binfmt_misc under non-root ns"))
Signed-off-by: Konstantin Khorenko 

+++
ve/fs: Allow to mount ext4 in top CT userns - cleanup

After commit d5c3320347bb ("fs/ve: add new FS_VE_MOUNT flag to allow mount
in container init userns") it's wise to use FS_VE_MOUNT flag
instead of generic FS_USERNS_MOUNT + additional per-fs check.

This patch does not change the behavior.

Signed-off-by: Konstantin Khorenko 
Reviewed-by: Pavel Tikhomirov 
Signed-off-by: Kirill Tkhai 
---
 fs/ext4/super.c |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3bc2cfb04518..03ade65fbe51 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -40,6 +40,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -6783,7 +6784,8 @@ static struct file_system_type ext4_fs_type = {
.name   = "ext4",
.mount  = ext4_mount,
.kill_sb= ext4_kill_sb,
-   .fs_flags   = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_VIRTUALIZED,
+   .fs_flags   = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_VIRTUALIZED |
+ FS_VE_MOUNT,
 };
 MODULE_ALIAS_FS("ext4");
 


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 0/2] Port part 32 SAK

2021-10-04 Thread Kirill Tkhai

https://jira.sw.ru/browse/PSBM-134449

---

Kirill Tkhai (2):
  tty: Avoid threads files iterations in __do_SAK()
  tty: Use RCU read lock to iterate tasks and threads in __do_SAK()


 drivers/tty/tty_io.c |   41 -
 1 file changed, 28 insertions(+), 13 deletions(-)

--
Signed-off-by: Kirill Tkhai 

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 2/2] tty: Use RCU read lock to iterate tasks and threads in __do_SAK()

2021-10-04 Thread Kirill Tkhai

Several efforts were made long ago to make __do_SAK()
work in process context, but that does not solve
the problem completely. Since __do_SAK()
may take tasklist_lock for a long time, concurrent
processes waiting for the write lock with interrupts
disabled (e.g., forking) get into the same situation
as if __do_SAK() had been executed in interrupt
context. I've observed several hard lockups on a 3.10
kernel running 200 containers, caused by the long duration
of copy_process()->write_lock_irq() after SAK was sent
to a tty. The current mainline kernel has the same problem.

The solution is to use RCU to iterate processes and threads.
Task list integrity is the only reason we took tasklist_lock
before, as tty subsys primitives mostly take it for reading
as well (e.g., __proc_set_tty). The RCU read lock is enough for that.
This patch solves the problem and makes __do_SAK()
not greedy for tasklist_lock. That should prevent the hard lockups
I've pointed out above.

https://jira.sw.ru/browse/PSBM-80340

Signed-off-by: Kirill Tkhai 
Reviewed-by: Pavel Tikhomirov 

(cherry picked from vz7 commit 6aecb63c35a5 ("tty: Use RCU read lock to iterate
tasks and threads in __do_SAK()"))
Signed-off-by: Andrey Zhadchenko 
Signed-off-by: Kirill Tkhai 
---
 drivers/tty/tty_io.c |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 535f40164c2b..e7268372edb6 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -3055,8 +3055,10 @@ void __do_SAK(struct tty_struct *tty)
   task_pid_nr(p), p->comm);
group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID);
} while_each_pid_task(session, PIDTYPE_SID, p);
+   read_unlock(&tasklist_lock);
 
/* Now kill any processes that happen to have the tty open */
+   rcu_read_lock();
for_each_process(p) {
if (p->signal->tty == tty) {
tty_notice(tty, "SAK: killed process %d (%s): by 
controlling tty\n",
@@ -3085,7 +3087,7 @@ void __do_SAK(struct tty_struct *tty)
 kill:
group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID);
}
-   read_unlock(&tasklist_lock);
+   rcu_read_unlock();
put_pid(session);
 #endif
 }


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9 1/2] tty: Avoid threads files iterations in __do_SAK()

2021-10-04 Thread Kirill Tkhai

The patch makes __do_SAK() iterate the next thread's files
only if that thread's files differ
from the previous thread's. I.e., if all threads point to the same
files_struct, the files will be iterated only once.

Since all threads sharing the same files_struct is
the common case for most Linux systems, this
improvement should clearly speed up __do_SAK()
execution.

Also, for_each_process()/for_each_thread() are
used instead of do_each_thread()/while_each_thread().
This prepares __do_SAK() to become tasklist_lock
free, which will be done in the next patch.

https://jira.sw.ru/browse/PSBM-80340

Suggested-by: Oleg Nesterov 
Signed-off-by: Kirill Tkhai 

Reviewed-by: Pavel Tikhomirov 

Rebase to vz8:
 - Change send_sig to group_send_sig_info to respect ms commit
 a8ebd17160ce ("tty_io: Use group_send_sig_info in __do_SACK to note it is a
 session being killed")

(cherry picked from vz7 commit d61ca741c3ae ("tty: Avoid threads files
iterations in __do_SAK()"))
Signed-off-by: Andrey Zhadchenko 
Signed-off-by: Kirill Tkhai 
---
 drivers/tty/tty_io.c |   37 +
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index a6230b25fbe5..535f40164c2b 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -3031,7 +3031,8 @@ void __do_SAK(struct tty_struct *tty)
 #ifdef TTY_SOFT_SAK
tty_hangup(tty);
 #else
-   struct task_struct *g, *p;
+   struct task_struct *p, *t;
+   struct files_struct *files;
struct pid *session;
int i;
unsigned long flags;
@@ -3056,22 +3057,34 @@ void __do_SAK(struct tty_struct *tty)
} while_each_pid_task(session, PIDTYPE_SID, p);
 
/* Now kill any processes that happen to have the tty open */
-   do_each_thread(g, p) {
+   for_each_process(p) {
if (p->signal->tty == tty) {
tty_notice(tty, "SAK: killed process %d (%s): by 
controlling tty\n",
   task_pid_nr(p), p->comm);
-   group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, 
PIDTYPE_SID);
-   continue;
+   goto kill;
}
-   task_lock(p);
-   i = iterate_fd(p->files, 0, this_tty, tty);
-   if (i != 0) {
-   tty_notice(tty, "SAK: killed process %d (%s): by 
fd#%d\n",
-  task_pid_nr(p), p->comm, i - 1);
-   group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, 
PIDTYPE_SID);
+
+   files = NULL;
+   for_each_thread(p, t) {
+   if (t->files == files) /* racy but we do not care */
+   continue;
+
+   task_lock(t);
+   files = t->files;
+   i = iterate_fd(files, 0, this_tty, tty);
+   task_unlock(t);
+
+   if (i != 0) {
+   dev_notice(tty->dev, "SAK: killed process %d 
(%s): by fd#%d\n",
+  task_pid_nr(p), p->comm, i - 1);
+   goto kill;
+   }
}
-   task_unlock(p);
-   } while_each_thread(g, p);
+
+   continue;
+kill:
+   group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID);
+   }
read_unlock(&tasklist_lock);
put_pid(session);
 #endif


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH7] writeback: Write dirty times for WB_SYNC_ALL writeback

2021-10-01 Thread Kirill Tkhai

From: Jan Kara 

ms commit dc5ff2b1d66f

Currently we take care to handle I_DIRTY_TIME in vfs_fsync() and
queue_io() so that inodes which have only dirty timestamps are properly
written on fsync(2) and sync(2). However there are other call sites -
most notably going through write_inode_now() - which expect inode to be
clean after WB_SYNC_ALL writeback. This is not currently true as we do
not clear I_DIRTY_TIME in __writeback_single_inode() even for
WB_SYNC_ALL writeback in all the cases. This then resulted in the
following oops because bdev_write_inode() did not clean the inode and
writeback code later stumbled over a dirty inode with detached wb.

  general protection fault:  [#1] SMP DEBUG_PAGEALLOC KASAN
  Modules linked in:
  CPU: 3 PID: 32 Comm: kworker/u10:1 Not tainted 4.6.0-rc3+ #349
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
  Workqueue: writeback wb_workfn (flush-11:0)
  task: 88006ccf1840 ti: 88006cda8000 task.ti: 88006cda8000
  RIP: 0010:[]  []
  locked_inode_to_wb_and_lock_list+0xa2/0x750
  RSP: 0018:88006cdaf7d0  EFLAGS: 00010246
  RAX:  RBX:  RCX: 88006ccf2050
  RDX:  RSI: 00114c8a8484 RDI: 0286
  RBP: 88006cdaf820 R08: 88006ccf1840 R09: 
  R10: 000229915090805f R11: 0001 R12: 88006a72f5e0
  R13: dc00 R14: ed000d4e5eed R15: 8830cf40
  FS:  () GS:88006d50() knlGS:
  CS:  0010 DS:  ES:  CR0: 80050033
  CR2: 03301bf8 CR3: 6368f000 CR4: 06e0
  DR0: 1ec9 DR1:  DR2: 
  DR3:  DR6: 0ff0 DR7: 0600
  Stack:
   88006a72f680 88006a72f768 8800671230d8 03ff88006cdaf948
   88006a72f668 88006a72f5e0 8800671230d8 88006cdaf948
   880065b90cc8 880067123100 88006cdaf970 8188e12e
  Call Trace:
   [< inline >] inode_to_wb_and_lock_list fs/fs-writeback.c:309
   [] writeback_sb_inodes+0x4de/0x1250 fs/fs-writeback.c:1554
   [] __writeback_inodes_wb+0x104/0x1e0 fs/fs-writeback.c:1600
   [] wb_writeback+0x7ce/0xc90 fs/fs-writeback.c:1709
   [< inline >] wb_do_writeback fs/fs-writeback.c:1844
   [] wb_workfn+0x2f9/0x1000 fs/fs-writeback.c:1884
   [] process_one_work+0x78e/0x15c0 kernel/workqueue.c:2094
   [] worker_thread+0xdb/0xfc0 kernel/workqueue.c:2228
   [] kthread+0x23f/0x2d0 drivers/block/aoe/aoecmd.c:1303
   [] ret_from_fork+0x22/0x50 arch/x86/entry/entry_64.S:392
  Code: 05 94 4a a8 06 85 c0 0f 85 03 03 00 00 e8 07 15 d0 ff 41 80 3e
  00 0f 85 64 06 00 00 49 8b 9c 24 88 01 00 00 48 89 d8 48 c1 e8 03 <42>
  80 3c 28 00 0f 85 17 06 00 00 48 8b 03 48 83 c0 50 48 39 c3
  RIP  [< inline >] wb_get include/linux/backing-dev-defs.h:212
  RIP  [] locked_inode_to_wb_and_lock_list+0xa2/0x750
  fs/fs-writeback.c:281
   RSP 
  ---[ end trace 986a4d314dcb2694 ]---

Fix the problem by making sure __writeback_single_inode() writes inode
only with dirty times in WB_SYNC_ALL mode.

Reported-by: Dmitry Vyukov 
Tested-by: Laurent Dufour 
Signed-off-by: Jan Kara 
Signed-off-by: Jens Axboe 

Without this patch, an inode with only dirty timestamps is not written
out when writeback is called from freeze_bdev(). So, backup loses mtime.

In scope of #PSBM-134225 (but not a final fix)

Signed-off-by: Kirill Tkhai 
---
 fs/fs-writeback.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index c16a39f4f724..1c8c27188361 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -553,6 +553,7 @@ __do_writeback_single_inode(struct inode *inode, struct 
writeback_control *wbc)
dirty = inode->i_state & I_DIRTY;
if (inode->i_state & I_DIRTY_TIME) {
if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
+   wbc->sync_mode == WB_SYNC_ALL ||
unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
unlikely(time_after(jiffies,
(inode->dirtied_time_when +


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9] dm-tracking: Add tracking_clear cmd

2021-09-30 Thread Kirill Tkhai

Command to ACK ordered copied cluster.

Signed-off-by: Kirill Tkhai 
---
 drivers/md/dm-tracking.c |   31 +++
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm-tracking.c b/drivers/md/dm-tracking.c
index e9cf0a4ae298..d723596fee44 100644
--- a/drivers/md/dm-tracking.c
+++ b/drivers/md/dm-tracking.c
@@ -168,6 +168,14 @@ static void dmt_dtr(struct dm_target *ti)
dmt_destroy(ti->private);
 }
 
+static int tracking_clear(struct dm_tracking *dmt, u64 clu)
+{
+   spin_lock_irq(&dmt->lock);
+   clear_bit(clu, dmt->bitmap);
+   spin_unlock_irq(&dmt->lock);
+   return 0;
+}
+
 static int tracking_get_next(struct dm_tracking *dmt, char *result,
 unsigned int maxlen)
 {
@@ -197,10 +205,24 @@ static int tracking_get_next(struct dm_tracking *dmt, 
char *result,
 }
 
 static int dmt_cmd(struct dm_tracking *dmt, const char *suffix,
+  int argc, char *argv[],
   char *result, unsigned int maxlen)
 {
unsigned int nr_clus, size;
void *bitmap = NULL;
+   u64 val;
+
+   if (!strcmp(suffix, "clear")) {
+   if (argc != 1 || kstrtou64(argv[0], 10, &val) < 0 ||
+   val >= dmt->nr_clus)
+   return -EINVAL;
+   if (!dmt->bitmap)
+   return -ENOENT;
+   return tracking_clear(dmt, val);
+   }
+
+   if (argc != 0)
+   return -EINVAL;
 
if (!strcmp(suffix, "get_next")) {
if (!dmt->bitmap)
@@ -248,13 +270,14 @@ static int dmt_message(struct dm_target *ti, unsigned int 
argc, char **argv,
return -EPERM;
 
mutex_lock(&dmt->ctl_mutex);
+   ret = -EINVAL;
+   if (argc < 1)
+   goto unlock;
ret = -ENOTSUPP;
if (strncmp(argv[0], "tracking_", 9))
goto unlock;
-   ret = -EINVAL;
-   if (argc != 1)
-   goto unlock;
-   ret = dmt_cmd(dmt, argv[0] + 9, result, maxlen);
+   ret = dmt_cmd(dmt, argv[0] + 9, argc - 1,
+ &argv[1], result, maxlen);
 unlock:
mutex_unlock(&dmt->ctl_mutex);
 


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH8] dm-tracking: Add tracking_clear cmd

2021-09-30 Thread Kirill Tkhai

Command to ACK ordered copied cluster.

Signed-off-by: Kirill Tkhai 
---
 drivers/md/dm-tracking.c |   31 +++
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm-tracking.c b/drivers/md/dm-tracking.c
index e9cf0a4ae298..d723596fee44 100644
--- a/drivers/md/dm-tracking.c
+++ b/drivers/md/dm-tracking.c
@@ -168,6 +168,14 @@ static void dmt_dtr(struct dm_target *ti)
dmt_destroy(ti->private);
 }
 
+static int tracking_clear(struct dm_tracking *dmt, u64 clu)
+{
+   spin_lock_irq(&dmt->lock);
+   clear_bit(clu, dmt->bitmap);
+   spin_unlock_irq(&dmt->lock);
+   return 0;
+}
+
 static int tracking_get_next(struct dm_tracking *dmt, char *result,
 unsigned int maxlen)
 {
@@ -197,10 +205,24 @@ static int tracking_get_next(struct dm_tracking *dmt, 
char *result,
 }
 
 static int dmt_cmd(struct dm_tracking *dmt, const char *suffix,
+  int argc, char *argv[],
   char *result, unsigned int maxlen)
 {
unsigned int nr_clus, size;
void *bitmap = NULL;
+   u64 val;
+
+   if (!strcmp(suffix, "clear")) {
+   if (argc != 1 || kstrtou64(argv[0], 10, &val) < 0 ||
+   val >= dmt->nr_clus)
+   return -EINVAL;
+   if (!dmt->bitmap)
+   return -ENOENT;
+   return tracking_clear(dmt, val);
+   }
+
+   if (argc != 0)
+   return -EINVAL;
 
if (!strcmp(suffix, "get_next")) {
if (!dmt->bitmap)
@@ -248,13 +270,14 @@ static int dmt_message(struct dm_target *ti, unsigned int 
argc, char **argv,
return -EPERM;
 
mutex_lock(&dmt->ctl_mutex);
+   ret = -EINVAL;
+   if (argc < 1)
+   goto unlock;
ret = -ENOTSUPP;
if (strncmp(argv[0], "tracking_", 9))
goto unlock;
-   ret = -EINVAL;
-   if (argc != 1)
-   goto unlock;
-   ret = dmt_cmd(dmt, argv[0] + 9, result, maxlen);
+   ret = dmt_cmd(dmt, argv[0] + 9, argc - 1,
+ &argv[1], result, maxlen);
 unlock:
mutex_unlock(&dmt->ctl_mutex);
 


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Re: [Devel] [PATCH rh9 03/11] ve/printk: Virtualize syslog_*

2021-09-30 Thread Kirill Tkhai

On 29.09.2021 22:24, Konstantin Khorenko wrote:
> From: Vladimir Davydov 
> 
> https://jira.sw.ru/browse/PSBM-17899
> 
> Signed-off-by: Vladimir Davydov 
> Signed-off-by: Stanislav Kinsburskiy 
> 
> +++
> ve/printk: Fix printk virtualization
> 
> ve_printk() corrupts host's dmesg:
>   # dmesg|wc -l
>   599
>   # vzctl create 101
>   # vzctl set 101 --netif_add eth0 --save
>   # vzctl start 101
>   # vzctl exec 101 'tcpdump -w tcpdump.out -U -n -i eth0 esp'
>   # dmesg|wc -l
>   2
> 
> Add missing parts of printk virtualization to fix this.
> 
> https://jira.sw.ru/browse/PSBM-17899
> https://jira.sw.ru/browse/PSBM-105442
> 
> Signed-off-by: Andrey Ryabinin 
> 
> Rebasing to vz9: part of vz8 commit:
>  d63aeb311a64 ("ve/printk: printk virtualization")
> 
> https://jira.sw.ru/browse/PSBM-133985
> 
> Signed-off-by: Konstantin Khorenko 
> ---
>  kernel/printk/printk.c | 60 +-
>  1 file changed, 30 insertions(+), 30 deletions(-)
> 
> diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
> index a1dedbc88426..70dbf204c052 100644
> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -360,13 +360,6 @@ enum log_flags {
>  static DEFINE_RAW_SPINLOCK(syslog_lock);
>  
>  #ifdef CONFIG_PRINTK
> -DECLARE_WAIT_QUEUE_HEAD(log_wait);

log_wait is still used in code, so we should move this to another patch.

> -/* All 3 protected by @syslog_lock. */
> -/* the next printk record to read by syslog(READ) or /proc/kmsg */
> -static u64 syslog_seq;
> -static size_t syslog_partial;
> -static bool syslog_time;
> -
>  /* All 3 protected by @console_sem. */
>  /* the next printk record to write to the console */
>  static u64 console_seq;
> @@ -418,6 +411,12 @@ static struct log_state {
>   char *buf;
>   u32 buf_len;
>  
> + /* All 3 protected by @syslog_lock. */
> + /* the next printk record to read by syslog(READ) or /proc/kmsg */
> + u64 syslog_seq;
> + size_t syslog_partial;
> + bool syslog_time;
> +
>   /*
>* The next printk record to read after the last 'clear' command. There 
> are
>* two copies (updated with seqcount_latch) so that reads can locklessly
> @@ -1546,35 +1545,35 @@ static int syslog_print(struct log_state *log,
>  
>   printk_safe_enter_irq();
>   raw_spin_lock(&syslog_lock);
> - if (!prb_read_valid(log->prb, syslog_seq, &r)) {
> + if (!prb_read_valid(log->prb, log->syslog_seq, &r)) {
>   raw_spin_unlock(&syslog_lock);
>   printk_safe_exit_irq();
>   break;
>   }
> - if (r.info->seq != syslog_seq) {
> + if (r.info->seq != log->syslog_seq) {
>   /* message is gone, move to next valid one */
> - syslog_seq = r.info->seq;
> - syslog_partial = 0;
> + log->syslog_seq = r.info->seq;
> + log->syslog_partial = 0;
>   }
>  
>   /*
>* To keep reading/counting partial line consistent,
>* use printk_time value as of the beginning of a line.
>*/
> - if (!syslog_partial)
> - syslog_time = printk_time;
> + if (!log->syslog_partial)
> + log->syslog_time = printk_time;
>  
> - skip = syslog_partial;
> - n = record_print_text(&r, true, syslog_time);
> - if (n - syslog_partial <= size) {
> + skip = log->syslog_partial;
> + n = record_print_text(&r, true, log->syslog_time);
> + if (n - log->syslog_partial <= size) {
>   /* message fits into buffer, move forward */
> - syslog_seq = r.info->seq + 1;
> - n -= syslog_partial;
> - syslog_partial = 0;
> + log->syslog_seq = r.info->seq + 1;
> + n -= log->syslog_partial;
> + log->syslog_partial = 0;
>   } else if (!len){
>   /* partial read(), remember position */
>   n = size;
> - syslog_partial += n;
> + log->syslog_partial += n;
>   } else
>   n = 0;
>   raw_spin_unlock(&syslog_lock);
> @@ -1669,10 +1668,11 @@ static void syslog_clear(struct log_state *log)
>  /* Return a consistent copy of @syslog_seq. */
>  static u64 read_syslog_seq_irq(void)
>  {
> + struct log_state *log = ve_log_state();
>   u64 seq;
>  
>   raw_spin_lock_irq(&syslog_lock);
> - seq = syslog_seq;
> + seq = log->syslog_seq;
>   raw_spin_unlock_irq(&syslog_lock);
>  
>   return seq;
> @@ -1707,7 +1707,7 @@ int do_syslog(int type, char __user *buf, int len, int 
> source)
>   prb_read_valid(log->prb, read_syslo

Re: [Devel] [PATCH rh9 01/11] ve/printk: Introduce struct "log_state" and virtualize log_buf/log_buf_len

2021-09-30 Thread Kirill Tkhai

On 29.09.2021 22:24, Konstantin Khorenko wrote:
> From: Vladimir Davydov 
> 
> https://jira.sw.ru/browse/PSBM-17899
> 
> Signed-off-by: Vladimir Davydov 
> Signed-off-by: Stanislav Kinsburskiy 
> 
> +++
> ve/printk: Fix printk virtualization
> 
> ve_printk() corrupts host's dmesg:
>   # dmesg|wc -l
>   599
>   # vzctl create 101
>   # vzctl set 101 --netif_add eth0 --save
>   # vzctl start 101
>   # vzctl exec 101 'tcpdump -w tcpdump.out -U -n -i eth0 esp'
>   # dmesg|wc -l
>   2
> 
> Add missing parts of printk virtualization to fix this.
> 
> https://jira.sw.ru/browse/PSBM-17899
> https://jira.sw.ru/browse/PSBM-105442
> 
> Signed-off-by: Andrey Ryabinin 
> 
> Rebasing to vz9: part of vz8 commit:
>  d63aeb311a64 ("ve/printk: printk virtualization")
> 
> https://jira.sw.ru/browse/PSBM-133985
> 
> Signed-off-by: Konstantin Khorenko 
> ---
>  include/linux/printk.h |  13 
>  include/linux/ve.h |   3 +
>  kernel/printk/printk.c | 165 ++---
>  kernel/ve/ve.c |   8 ++
>  4 files changed, 164 insertions(+), 25 deletions(-)
> 
> diff --git a/include/linux/printk.h b/include/linux/printk.h
> index e834d78f0478..f178e2e5d7f5 100644
> --- a/include/linux/printk.h
> +++ b/include/linux/printk.h
> @@ -176,6 +176,10 @@ int vprintk(const char *fmt, va_list args);
>  asmlinkage __printf(1, 2) __cold
>  int printk(const char *fmt, ...);
>  
> +struct ve_struct;
> +int ve_log_init(struct ve_struct *ve);
> +void ve_log_destroy(struct ve_struct *ve);
> +
>  /*
>   * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ !
>   */
> @@ -222,6 +226,15 @@ int printk(const char *s, ...)
>  {
>   return 0;
>  }
> +static inline
> +int ve_log_init(struct ve_struct *ve)
> +{
> + return 0;
> +}
> +static inline
> +void ve_log_destroy(struct ve_struct *ve)
> +{
> +}
>  static inline __printf(1, 2) __cold
>  int printk_deferred(const char *s, ...)
>  {
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index 248cdeb0a2e4..552fa577e2f9 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -50,6 +50,9 @@ struct ve_struct {
>   /* see vzcalluser.h for VE_FEATURE_XXX definitions */
>   __u64   features;
>  
> + void*log_state;
> +#define VE_LOG_BUF_LEN   4096
> +
>   struct kstat_lat_pcpu_structsched_lat_ve;
>  
>   struct kmapset_key  sysfs_perms_key;
> diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
> index 142a58d124d9..77e6787c752e 100644
> --- a/kernel/printk/printk.c
> +++ b/kernel/printk/printk.c
> @@ -44,6 +44,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -408,8 +409,6 @@ static struct latched_seq clear_seq = {
>  #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
>  #define LOG_BUF_LEN_MAX (u32)(1 << 31)
>  static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
> -static char *log_buf = __log_buf;
> -static u32 log_buf_len = __LOG_BUF_LEN;
>  
>  /*
>   * Define the average message size. This only affects the number of
> @@ -427,6 +426,34 @@ _DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT 
> - PRB_AVGBITS,
>  static struct printk_ringbuffer printk_rb_dynamic;
>  
>  static struct printk_ringbuffer *prb = &printk_rb_static;
> +static struct log_state {
> + char *buf;
> + u32 buf_len;
> +} init_log_state = {
> + .buf = __log_buf,
> + .buf_len = __LOG_BUF_LEN,
> +};

Maybe we should move this hunk up, right next to the previous "-" hunk?

-static char *log_buf = __log_buf;
-static u32 log_buf_len = __LOG_BUF_LEN;
+static struct log_state {
+   char *buf;
...
+   .buf = __log_buf,
+   .buf_len = __LOG_BUF_LEN,
...

So it's clearer that we have just reassigned __log_buf etc. here?

> +
> +/* kdump relies on some log_* symbols, let's make it happy */
> +#define DEFINE_STRUCT_MEMBER_ALIAS(name, inst, memb) \
> +static void  ## name ## _definition(void) __attribute__((used)); \
> +static void  ## name ## _definition(void)
> \
> +{\
> + asm (".globl " #name "\n\t.set " #name ", " #inst "+%c0"\
> +  : : "g" (offsetof(typeof(inst), memb)));   \
> +}\
> +extern typeof(inst.memb) name;
> +#undef DEFINE_STRUCT_MEMBER_ALIAS

Maybe we should move this define to the patch where it's used?

> +
> +static inline struct log_state *ve_log_state(void)
> +{
> + struct log_state *log = &init_log_state;
> +#ifdef CONFIG_VE
> + if (get_exec_env()->log_state)
> + log = get_exec_env()->log_state;
> +#endif
> + return log;
> +}
>  
>  /*
>   * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before
> @@ -468,13 +495,13 @@ static u64 latched_seq_read_nolock(struct latched_seq 
> *ls)
>  /* Return log buffer address */
>

[Devel] [PATCH RH9] cbt: Add config

2021-09-29 Thread Kirill Tkhai

Signed-off-by: Kirill Tkhai 
---
 .../custom-overrides/generic/CONFIG_BLK_DEV_CBT|1 +
 1 file changed, 1 insertion(+)
 create mode 100644 redhat/configs/custom-overrides/generic/CONFIG_BLK_DEV_CBT

diff --git a/redhat/configs/custom-overrides/generic/CONFIG_BLK_DEV_CBT 
b/redhat/configs/custom-overrides/generic/CONFIG_BLK_DEV_CBT
new file mode 100644
index ..03d9fa0ae0cf
--- /dev/null
+++ b/redhat/configs/custom-overrides/generic/CONFIG_BLK_DEV_CBT
@@ -0,0 +1 @@
+CONFIG_BLK_DEV_CBT=y


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9] cbt: introduce changed block tracking

2021-09-29 Thread Kirill Tkhai

Combined patch including original patch and fixes:

93f326594516 block/blk-cbt.c: copyright update
20ff882b3eb0 cbt: selfdeadlock in __blk_cbt_set()
c22ab989e4f9 cbt: bitmap corruption caused by ipi
4bb4b6b568d9 cbt: license: put correct copyrights into file headers
d991ef45b2bc cbt: don't leak ce_reserved64 in cbt_ito userspace
c91a55b3c347 cbt: blk_cbt_update_size() must return if cbt->block_max not 
changed
8c3db04ecfd5 cbt: blk_cbt_update_size() should not copy uninitialized data
92039d6d0a9a cbt: fix possible race on alloc_page()
c55fac6426e7 cbt: new api: blk_cbt_map_merge()
d54e079764bd cbt: fix panic in blk_cbt_map_copy_once()
c2c61c3eb41c cbt: fix cbt->block_max calculation
737d22a7d677 cbt: add uuid arg to blk_cbt_map_copy_once()
3040064c1c69 cbt: add blk_cbt_map_copy_once() helper
981be8f5e23d cbt: fix page allocation
99f0cacd3cba cbt: make __blk_cbt_set() smarter
7324f0cc6139 cbt: introduce CBT_PAGE_MISSED
7588bf9d56c9 cbt: factor out alloc_page
d9511b56e79e cbt: introduce changed block tracking

@ktkhai: Backport changes were made for bvec_iter dereferencing.

Signed-off-by: Kirill Tkhai 

+++
cbt: Update CBT size from check_disk_size_change()

There is a customer node where the CBT size differs
from the ploop size. Searching the kernel code shows
this is the only place where we skip the CBT size update after
a bd_inode size change.

https://jira.sw.ru/browse/PSBM-123819
Signed-off-by: Kirill Tkhai 

(cherry picked from vz7 commit 042072dc3899 ("cbt: Update CBT size from
check_disk_size_change()"))
Signed-off-by: Vasily Averin 

+++
60576729f55c cbt: New interface to save current mask snapshot in cbt

During the backup, we want to save current changed mask
and to start tracking from clean mask again.

Previously, the mask was saved in another driver:
ploop used to call cbt primitives and saved it in ploop
device structures. This looks better than saving the mask
in userspace, because the mask remains alive even in case
of userspace death. The only thing needed after a backup process dies
is to merge the saved mask back from the ploop driver
to the cbt driver. Thus, all changed (from previous successful
backup) blocks are still available, and it's possible
to create partial backup even after segfaulted userspace.

This patchset continues the practice of saving mask in kernel,
but it makes possible to save CBT snapshot in cbt driver
without distributing CBT structures over the kernel. Here is
a new BLKCBTMISC ioctl, which allows to create, drop and
merge back a snapshot. The ioctl has 3 switches:

* CMI_SNP_CREATE: create a new mask snapshot and move changed
blocks mask there (changed blocks mask becomes empty after
that).

* CMI_SNP_DROP: drops created snapshot (should be called after
successful backup).

* CMI_SNP_MERGE_BACK: moves snapshot bits into changing blocks
mask and kills snapshot (should be called after failed backup).

+++
cbt: Change errno values for new ioctl
cbt: Actually show errors on return
cbt: Rename misc commands names
cbt: Add size to CBT_SNAP_CREATE
cbt: Rename also blk_user_cbt_snp_create
cbt: Move cbt_flush_cache() before size calculation
cbt: Fix off-by-one in map_required_size()
cbt: endless loop on rollback in blk_cbt_snap_create()
cbt: Fix off-by-one in map_required_size()

Signed-off-by: Kirill Tkhai 
---
 block/Makefile   |1 
 block/blk-cbt.c  | 1035 ++
 block/blk-core.c |1 
 block/blk-sysfs.c|1 
 block/genhd.c|   13 -
 block/ioctl.c|6 
 block/partitions/Kconfig |8 
 block/partitions/core.c  |4 
 include/linux/blkdev.h   |   25 +
 include/uapi/linux/fs.h  |   53 ++
 10 files changed, 1141 insertions(+), 6 deletions(-)
 create mode 100644 block/blk-cbt.c

diff --git a/block/Makefile b/block/Makefile
index 1e1afa10f869..5c5f703bcbd2 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -40,3 +40,4 @@ obj-$(CONFIG_BLK_SED_OPAL)+= sed-opal.o
 obj-$(CONFIG_BLK_PM)   += blk-pm.o
 obj-$(CONFIG_BLK_INLINE_ENCRYPTION)+= keyslot-manager.o blk-crypto.o
 obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK)   += blk-crypto-fallback.o
+obj-$(CONFIG_BLK_DEV_CBT)   += blk-cbt.o
diff --git a/block/blk-cbt.c b/block/blk-cbt.c
new file mode 100644
index ..e8eee11a87ba
--- /dev/null
+++ b/block/blk-cbt.c
@@ -0,0 +1,1035 @@
+/*
+ *  block/blk-cbt.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *  Copyright (c) 2017-2021 Virtuozzo International GmbH. All rights reserved.
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define CBT_MAX_EXTENTS512
+#define NR_PAGES(bits) (((bits) + PAGE_SIZE*8 - 1) / (PAGE_SIZE*8))
+#define BITS_PER_PAGE  (1UL << (PAGE_SHIFT + 3))
+
+#define CBT_PAGE_MISSED (struct page *)(0x1)
+#define CBT_PAGE(cbt, idx) (cbt->map[idx] == CBT_PAGE_MISSED ? \
+

[Devel] [PATCH RH8] cbt: Fix off-by-one in map_required_size()

2021-09-28 Thread Kirill Tkhai

Instead of:

return DIV_ROUND_UP(bit, 8) + page * PAGE_SIZE;

we have to have:

return DIV_ROUND_UP(bit, 8) + (page - 1) * PAGE_SIZE;

But rather than changing the expression, we make @page
zero-based, in the standard C way.

Signed-off-by: Kirill Tkhai 
---
 block/blk-cbt.c |9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/block/blk-cbt.c b/block/blk-cbt.c
index 1ccc393f1419..e8eee11a87ba 100644
--- a/block/blk-cbt.c
+++ b/block/blk-cbt.c
@@ -315,15 +315,14 @@ static unsigned long map_required_size(struct page **map, 
unsigned long block_ma
 {
unsigned long bit, page, npages = NR_PAGES(block_max);
 
-   for (page = npages; page > 0; page--) {
-   if (map[page-1])
+   for (page = npages - 1; page != ULONG_MAX; page--) {
+   if (map[page])
break;
}
-
-   if (page == 0)
+   if (page == ULONG_MAX)
return 0;
 
-   bit = find_last_bit(page_address(map[page - 1]), PAGE_SIZE);
+   bit = find_last_bit(page_address(map[page]), PAGE_SIZE);
if (bit >= PAGE_SIZE)
bit = 0; /* Not found */
else


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9] push_backup: Do not take write lock on ENOTTY

2021-09-28 Thread Kirill Tkhai

Userspace may pass a wrong (unknown) command. Do not take the write lock in that case.

Signed-off-by: Kirill Tkhai 
---
 drivers/md/dm-push-backup.c |   23 +++
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/drivers/md/dm-push-backup.c b/drivers/md/dm-push-backup.c
index 6d7b1859298a..1500d0681cee 100644
--- a/drivers/md/dm-push-backup.c
+++ b/drivers/md/dm-push-backup.c
@@ -329,11 +329,10 @@ static void pb_release_clone(struct request *clone,
blk_put_request(clone);
 }
 
-static bool msg_wants_down_read(const char *cmd)
+static bool msg_wants_down_write(const char *cmd)
 {
-   if (!strcmp(cmd, "push_backup_read") ||
-   !strcmp(cmd, "push_backup_write") ||
-   !strcmp(cmd, "push_backup_statistics"))
+   if (!strcmp(cmd, "push_backup_start") ||
+   !strcmp(cmd, "push_backup_stop"))
return true;
 
return false;
@@ -567,7 +566,7 @@ static int pb_message(struct dm_target *ti, unsigned int 
argc, char **argv,
struct push_backup *pb = ti->private;
int ret = -EPERM;
u64 val, val2;
-   bool read;
+   bool write;
 
if (!capable(CAP_SYS_ADMIN))
goto out;
@@ -576,11 +575,11 @@ static int pb_message(struct dm_target *ti, unsigned int 
argc, char **argv,
if (argc < 1)
goto out;
 
-   read = msg_wants_down_read(argv[0]);
-   if (read)
-   ret = down_read_killable(&pb->ctl_rwsem);
-   else
+   write = msg_wants_down_write(argv[0]);
+   if (write)
ret = down_write_killable(&pb->ctl_rwsem);
+   else
+   ret = down_read_killable(&pb->ctl_rwsem);
if (unlikely(ret))
goto out;
 
@@ -612,10 +611,10 @@ static int pb_message(struct dm_target *ti, unsigned int 
argc, char **argv,
}
 
 unlock:
-   if (read)
-   up_read(&pb->ctl_rwsem);
-   else
+   if (write)
up_write(&pb->ctl_rwsem);
+   else
+   up_read(&pb->ctl_rwsem);
 out:
return ret;
 }


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9] push_backup: Take rwsem killable

2021-09-28 Thread Kirill Tkhai

... so that a process waiting for the lock can be killed.

Signed-off-by: Kirill Tkhai 
---
 drivers/md/dm-push-backup.c |6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-push-backup.c b/drivers/md/dm-push-backup.c
index 75f080fe34cf..6d7b1859298a 100644
--- a/drivers/md/dm-push-backup.c
+++ b/drivers/md/dm-push-backup.c
@@ -578,9 +578,11 @@ static int pb_message(struct dm_target *ti, unsigned int 
argc, char **argv,
 
read = msg_wants_down_read(argv[0]);
if (read)
-   down_read(&pb->ctl_rwsem);
+   ret = down_read_killable(&pb->ctl_rwsem);
else
-   down_write(&pb->ctl_rwsem);
+   ret = down_write_killable(&pb->ctl_rwsem);
+   if (unlikely(ret))
+   goto out;
 
if (!strcmp(argv[0], "push_backup_start")) {
if (argc < 2 || argc > 3)


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

[Devel] [PATCH RH9] push_backup: Do not take write lock on statistics

2021-09-27 Thread Kirill Tkhai

Taking the write lock for "push_backup_statistics" is overkill; the read lock is enough.

Signed-off-by: Kirill Tkhai 
---
 drivers/md/dm-push-backup.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-push-backup.c b/drivers/md/dm-push-backup.c
index 16e8cf27211f..75f080fe34cf 100644
--- a/drivers/md/dm-push-backup.c
+++ b/drivers/md/dm-push-backup.c
@@ -332,7 +332,8 @@ static void pb_release_clone(struct request *clone,
 static bool msg_wants_down_read(const char *cmd)
 {
if (!strcmp(cmd, "push_backup_read") ||
-   !strcmp(cmd, "push_backup_write"))
+   !strcmp(cmd, "push_backup_write") ||
+   !strcmp(cmd, "push_backup_statistics"))
return true;
 
return false;


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

1 2 3 4 5 6 7 8 9 10 >

1 - 100 of 1557 matches

Mail list logo