Re: [Devel] [PATCH rh9 v2] dm-zero-req: Introduce zero request based target
On 05.04.2022 16:52, Konstantin Khorenko wrote: > This driver is like "dm-zero", but request based rather than bio based > like original "dm-zero". > > This driver will be used on a block device for Container configuration > stage: we need to construct a block device which honors CBT mask (stored > in ploop image), for that first we need to create a dummy block device > (pure technical issue, otherwise CBT mask is dropped). > > dm-ploop/dm-qcow2 are request based, thus we need zero target > also to be request based. > > https://jira.sw.ru/browse/PSBM-134130 > > Signed-off-by: Konstantin Khorenko > Feature: cbt: changed block tracking (for backup) Reviewed-by: Kirill Tkhai > --- > v2: * dropped extra kernel config option, put new modules under CONFIG_DM_ZERO > * dropped "readahead of null bytes" optimization > --- > drivers/md/Makefile | 1 + > drivers/md/dm-zero-req.c | 91 > 2 files changed, 92 insertions(+) > create mode 100644 drivers/md/dm-zero-req.c > > diff --git a/drivers/md/Makefile b/drivers/md/Makefile > index 94134440cf70..3197d24c0e75 100644 > --- a/drivers/md/Makefile > +++ b/drivers/md/Makefile > @@ -76,6 +76,7 @@ obj-$(CONFIG_DM_PERSISTENT_DATA) += persistent-data/ > obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o > obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o > obj-$(CONFIG_DM_ZERO)+= dm-zero.o > +obj-$(CONFIG_DM_ZERO)+= dm-zero-req.o > obj-$(CONFIG_DM_RAID)+= dm-raid.o > obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o > obj-$(CONFIG_DM_VERITY) += dm-verity.o > diff --git a/drivers/md/dm-zero-req.c b/drivers/md/dm-zero-req.c > new file mode 100644 > index ..9e44de15dcd6 > --- /dev/null > +++ b/drivers/md/dm-zero-req.c > @@ -0,0 +1,91 @@ > +/* > + * Copyright (C) 2003 Jana Saout > + * > + * This file is released under the GPL. 
> + */ > + > +#include > + > +#include > +#include > +#include > +#include > +#include "dm-rq.h" > + > +#define DM_MSG_PREFIX "zero" > + > +/* > + * Construct a dummy mapping that only returns zeros > + */ > +static int zero_ctr(struct dm_target *ti, unsigned int argc, char **argv) > +{ > + if (argc != 0) { > + ti->error = "No arguments required"; > + return -EINVAL; > + } > + > + /* > + * Silently drop discards, avoiding -EOPNOTSUPP. > + */ > + ti->num_discard_bios = 1; > + > + return 0; > +} > + > +static int zero_clone_and_map_rq(struct dm_target *ti, struct request *rq, > + union map_info *map_context, > + struct request **clone) > +{ > + struct bio *bio = rq->bio; > + > + switch (bio_op(bio)) { > + case REQ_OP_READ: > + while (bio) { > + zero_fill_bio(bio); > + bio = bio->bi_next; > + } > + > + break; > + case REQ_OP_WRITE: > + /* writes get silently dropped */ > + break; > + default: > + return DM_MAPIO_KILL; > + } > + > + dm_complete_request(rq, BLK_STS_OK); > + > + /* accepted rq, don't make new request */ > + return DM_MAPIO_SUBMITTED; > +} > + > +static struct target_type zero_target = { > + .name = "zero-rq", > + .version = {1, 1, 0}, > + .features = DM_TARGET_NOWAIT, > + .module = THIS_MODULE, > + .ctr= zero_ctr, > + .clone_and_map_rq = zero_clone_and_map_rq, > +}; > + > +static int __init dm_zero_init(void) > +{ > + int r = dm_register_target(&zero_target); > + > + if (r < 0) > + DMERR("register failed %d", r); > + > + return r; > +} > + > +static void __exit dm_zero_exit(void) > +{ > + dm_unregister_target(&zero_target); > +} > + > +module_init(dm_zero_init) > +module_exit(dm_zero_exit) > + > +MODULE_AUTHOR("Jana Saout "); > +MODULE_DESCRIPTION(DM_NAME " dummy request based target returning zeros"); > +MODULE_LICENSE("GPL"); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 4/9] dm-qcow2: Prepare handle_md_page() for calling not only from main kwork
Parallel handle_md_page() may fail because of a page has just been added. Teach it to repeat the search. Signed-off-by: Kirill Tkhai --- drivers/md/dm-qcow2-map.c|5 ++--- drivers/md/dm-qcow2-target.c | 14 ++ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-qcow2-map.c b/drivers/md/dm-qcow2-map.c index 4e04505810fc..4edd63e47a3d 100644 --- a/drivers/md/dm-qcow2-map.c +++ b/drivers/md/dm-qcow2-map.c @@ -1526,10 +1526,8 @@ static int submit_read_md_page(struct qcow2 *qcow2, struct qio **qio, int ret; ret = alloc_and_insert_md_page(qcow2, page_id, &md); - if (ret < 0) { - pr_err("Can't alloc: ret=%d, page_id=%llu\n", ret, page_id); + if (ret < 0) return ret; - } spin_lock_irq(&qcow2->md_pages_lock); list_add_tail(&(*qio)->link, &md->wait_list); @@ -1543,6 +1541,7 @@ static int submit_read_md_page(struct qcow2 *qcow2, struct qio **qio, /* * This may be called with @qio == NULL, in case of we are * interesting in searching cached in memory md only. + * This is aimed to be called not only from main kwork. 
*/ static int handle_md_page(struct qcow2 *qcow2, u64 page_id, struct qio **qio, struct md_page **ret_md) diff --git a/drivers/md/dm-qcow2-target.c b/drivers/md/dm-qcow2-target.c index 6c550cbe2579..795d64516507 100644 --- a/drivers/md/dm-qcow2-target.c +++ b/drivers/md/dm-qcow2-target.c @@ -318,7 +318,7 @@ struct md_page *md_page_find_or_postpone(struct qcow2 *qcow2, unsigned int id, return md; } -static void md_page_insert(struct qcow2 *qcow2, struct md_page *new_md) +static int md_page_try_insert(struct qcow2 *qcow2, struct md_page *new_md) { struct rb_root *root = &qcow2->md_pages; unsigned int new_id = new_md->id; @@ -337,11 +337,12 @@ static void md_page_insert(struct qcow2 *qcow2, struct md_page *new_md) else if (new_id > md->id) node = &parent->rb_right; else - BUG(); + return -EEXIST; } rb_link_node(&new_md->node, parent, node); rb_insert_color(&new_md->node, root); + return 0; } void md_page_erase(struct qcow2 *qcow2, struct md_page *md) @@ -361,7 +362,8 @@ struct md_page *md_page_renumber(struct qcow2 *qcow2, unsigned int id, WARN_ON_ONCE(!list_empty(&md->wait_list)); md_page_erase(qcow2, md); md->id = new_id; - md_page_insert(qcow2, md); + if (WARN_ON(md_page_try_insert(qcow2, md) < 0)) + md = NULL; } return md; } @@ -396,10 +398,14 @@ int alloc_and_insert_md_page(struct qcow2 *qcow2, u64 index, struct md_page **md INIT_LIST_HEAD(&(*md)->wb_link); spin_lock_irq(&qcow2->md_pages_lock); - md_page_insert(qcow2, *md); + ret = md_page_try_insert(qcow2, *md); spin_unlock_irq(&qcow2->md_pages_lock); + if (ret) + goto err_putpage; return 0; +err_putpage: + put_page((*md)->page); err_kfree: kfree(*md); return ret; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9] dm-ploop: Remove tracking code
We use generic driver instead. Signed-off-by: Kirill Tkhai --- drivers/md/dm-ploop-cmd.c| 165 -- drivers/md/dm-ploop-map.c| 38 -- drivers/md/dm-ploop-target.c |3 - drivers/md/dm-ploop.h| 13 --- 4 files changed, 219 deletions(-) diff --git a/drivers/md/dm-ploop-cmd.c b/drivers/md/dm-ploop-cmd.c index 3ba866cb0ec0..ed46da98b8d7 100644 --- a/drivers/md/dm-ploop-cmd.c +++ b/drivers/md/dm-ploop-cmd.c @@ -249,7 +249,6 @@ static int ploop_write_cluster_sync(struct ploop *ploop, struct pio *pio, if (pio->bi_status) return blk_status_to_errno(pio->bi_status); - /* track_bio(ploop, bio); */ return vfs_fsync(file, 0); } @@ -982,166 +981,6 @@ static int ploop_set_falloc_new_clu(struct ploop *ploop, u64 val) return 0; } -static int process_tracking_start(struct ploop *ploop, void *tracking_bitmap, - u32 tb_nr) -{ - u32 i, nr_pages, end, *bat_entries, dst_clu, nr; - struct rb_node *node; - struct md_page *md; - int ret = 0; - - write_lock_irq(&ploop->bat_rwlock); - ploop->tracking_bitmap = tracking_bitmap; - ploop->tb_nr = tb_nr; - - for_each_clear_bit(i, ploop->holes_bitmap, ploop->hb_nr) - set_bit(i, tracking_bitmap); - nr_pages = bat_clu_to_page_nr(ploop->nr_bat_entries - 1) + 1; - nr = 0; - - ploop_for_each_md_page(ploop, md, node) { - ploop_init_be_iter(ploop, md->id, &i, &end); - bat_entries = kmap_atomic(md->page); - for (; i <= end; i++) { - dst_clu = bat_entries[i]; - if (dst_clu == BAT_ENTRY_NONE || - md->bat_levels[i] != top_level(ploop)) - continue; - if (WARN_ON(dst_clu >= tb_nr)) { - ret = -EIO; - break; - } - set_bit(dst_clu, tracking_bitmap); - } - kunmap_atomic(bat_entries); - if (ret) - break; - nr++; - } - write_unlock_irq(&ploop->bat_rwlock); - - BUG_ON(ret == 0 && nr != nr_pages); - return ret; -} - -static int tracking_get_next(struct ploop *ploop, char *result, -unsigned int maxlen) -{ - unsigned int i, sz = 0, tb_nr = ploop->tb_nr, prev = ploop->tb_cursor; - void *tracking_bitmap = ploop->tracking_bitmap; - int ret = -EAGAIN; - - if (WARN_ON_ONCE(prev > 
tb_nr - 1)) - prev = 0; - - write_lock_irq(&ploop->bat_rwlock); - i = find_next_bit(tracking_bitmap, tb_nr, prev + 1); - if (i < tb_nr) - goto found; - i = find_first_bit(tracking_bitmap, prev + 1); - if (i >= prev + 1) - goto unlock; -found: - ret = (DMEMIT("%u\n", i)) ? 1 : 0; - if (ret) - clear_bit(i, tracking_bitmap); -unlock: - write_unlock_irq(&ploop->bat_rwlock); - if (ret > 0) - ploop->tb_cursor = i; - return ret; -} - -static u32 max_dst_clu_in_top_delta(struct ploop *ploop) -{ - u32 i, nr_pages, nr = 0, end, *bat_entries, dst_clu = 0; - struct rb_node *node; - struct md_page *md; - - nr_pages = bat_clu_to_page_nr(ploop->nr_bat_entries - 1) + 1; - - read_lock_irq(&ploop->bat_rwlock); - ploop_for_each_md_page(ploop, md, node) { - ploop_init_be_iter(ploop, md->id, &i, &end); - bat_entries = kmap_atomic(md->page); - for (; i <= end; i++) { - if (dst_clu < bat_entries[i] && - md->bat_levels[i] == top_level(ploop)) - dst_clu = bat_entries[i]; - } - kunmap_atomic(bat_entries); - nr++; - } - read_unlock_irq(&ploop->bat_rwlock); - - BUG_ON(nr != nr_pages); - return dst_clu; -} - -static int ploop_tracking_cmd(struct ploop *ploop, const char *suffix, - char *result, unsigned int maxlen) -{ - void *tracking_bitmap = NULL; - unsigned int tb_nr, size; - int ret = 0; - - if (ploop_is_ro(ploop)) - return -EROFS; - - if (!strcmp(suffix, "get_next")) { - if (!ploop->tracking_bitmap) - return -ENOENT; - return tracking_get_next(ploop, result, maxlen); - } - - if (!strcmp(suffix, "start")) { - if (ploop->tracking_bitmap) - return -EEXIST; - if (ploop->maintaince) -
[Devel] [PATCH RH9 2/2] dm-tracking: Do not return EAGAIN in case of there is no changed clu
Return nothing (we do not call DMEMIT() in this case) instead. EAGAIN may confuse a user. Signed-off-by: Kirill Tkhai --- drivers/md/dm-tracking.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-tracking.c b/drivers/md/dm-tracking.c index a8880a83d270..e66060246acc 100644 --- a/drivers/md/dm-tracking.c +++ b/drivers/md/dm-tracking.c @@ -213,7 +213,7 @@ static int tracking_get_next(struct dm_tracking *dmt, char *result, { unsigned int i, sz = 0, nr_clus = dmt->nr_clus, prev = dmt->cursor; void *bitmap = dmt->bitmap; - int ret = -EAGAIN; + int ret = 0; if (WARN_ON_ONCE(prev > nr_clus - 1)) prev = 0; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 1/2] dm-tracking: Track request after it completed
Otherwise there is a race in case of userspace calls "tracking_get_next" and dumps cluster before the request is written completely. Signed-off-by: Kirill Tkhai --- drivers/md/dm-tracking.c | 47 +++--- 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-tracking.c b/drivers/md/dm-tracking.c index d723596fee44..a8880a83d270 100644 --- a/drivers/md/dm-tracking.c +++ b/drivers/md/dm-tracking.c @@ -34,18 +34,23 @@ struct dm_tracking { struct mutex ctl_mutex; }; +struct treq { + sector_t pos; + u32 bytes; +}; + static sector_t get_dev_size(struct dm_dev *dev) { return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT; } -static void track_rq_clus(struct dm_tracking *dmt, struct request *rq) +static void track_rq_clus(struct dm_tracking *dmt, struct treq *treq) { - loff_t off = to_bytes(blk_rq_pos(rq)); + loff_t off = to_bytes(treq->pos); u64 start_clu, end_clu, clu; start_clu = off / dmt->clu_size; - end_clu = (off + blk_rq_bytes(rq) - 1) / dmt->clu_size; + end_clu = (off + treq->bytes - 1) / dmt->clu_size; for (clu = start_clu; clu <= end_clu; clu++) { set_bit(clu, dmt->bitmap); @@ -61,20 +66,25 @@ static int dmt_clone_and_map(struct dm_target *ti, struct request *rq, { struct dm_tracking *dmt = ti->private; struct block_device *bdev = dmt->origin_dev->bdev; + struct treq *treq = NULL; struct request_queue *q; struct request *clone; + map_context->ptr = NULL; if (blk_rq_bytes(rq) && op_is_write(req_op(rq))) { - spin_lock_irq(&dmt->lock); - if (dmt->bitmap) - track_rq_clus(dmt, rq); - spin_unlock_irq(&dmt->lock); + treq = kmalloc(sizeof(*treq), GFP_ATOMIC); + if (!treq) + return DM_MAPIO_REQUEUE; + treq->pos = blk_rq_pos(rq); + treq->bytes = blk_rq_bytes(rq); + map_context->ptr = treq; } q = bdev_get_queue(bdev); clone = blk_get_request(q, rq->cmd_flags | REQ_NOMERGE, BLK_MQ_REQ_NOWAIT); if (IS_ERR(clone)) { + kfree(treq); /* EBUSY, ENODEV or EWOULDBLOCK: requeue */ if (blk_queue_dying(q)) return DM_MAPIO_DELAY_REQUEUE; @@ -91,9 +101,31 @@ static 
int dmt_clone_and_map(struct dm_target *ti, struct request *rq, static void dmt_release_clone(struct request *clone, union map_info *map_context) { + if (unlikely(map_context)) { + struct treq *treq = map_context->ptr; + kfree(treq); + } + blk_put_request(clone); } +static int dmt_end_io(struct dm_target *ti, struct request *clone, + blk_status_t error, union map_info *map_context) +{ + struct treq *treq = map_context->ptr; + struct dm_tracking *dmt = ti->private; + + if (treq) { + spin_lock_irq(&dmt->lock); + if (dmt->bitmap) + track_rq_clus(dmt, treq); + spin_unlock_irq(&dmt->lock); + kfree(treq); + } + + return DM_ENDIO_DONE; +} + static void dmt_destroy(struct dm_tracking *dmt) { if (dmt->origin_dev) @@ -320,6 +352,7 @@ static struct target_type dmt_target = { .dtr = dmt_dtr, .clone_and_map_rq = dmt_clone_and_map, .release_clone_rq = dmt_release_clone, + .rq_end_io = dmt_end_io, .message = dmt_message, .iterate_devices = dmt_iterate_devices, .status = dmt_status, ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9] dm-ploop: Fix usage of bio_vec on stack
Previously, writing BAT page was synchronous, so we could use on-stack bio_vec for that. But after it became asynchronous, we can't do that. Strange, this has not fired earlier. https://jira.sw.ru/browse/PSBM-135137 Fixes: bfc5eaaba897 "ploop: Async md writeback" Signed-off-by: Kirill Tkhai --- drivers/md/dm-ploop-map.c | 12 ++-- drivers/md/dm-ploop.h |1 + 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c index 4cadf6e45b4d..a558445a1bec 100644 --- a/drivers/md/dm-ploop-map.c +++ b/drivers/md/dm-ploop-map.c @@ -1595,17 +1595,17 @@ void ploop_index_wb_submit(struct ploop *ploop, struct ploop_index_wb *piwb) { loff_t pos = (loff_t)piwb->page_id << PAGE_SHIFT; struct pio *pio = piwb->pio; - struct bio_vec bvec = { - .bv_page = piwb->bat_page, - .bv_len = PAGE_SIZE, - .bv_offset = 0, - }; + struct bio_vec *bvec = &piwb->aux_bvec; + + bvec->bv_page = piwb->bat_page; + bvec->bv_len = PAGE_SIZE; + bvec->bv_offset = 0; pio->bi_iter.bi_sector = to_sector(pos); pio->bi_iter.bi_size = PAGE_SIZE; pio->bi_iter.bi_idx = 0; pio->bi_iter.bi_bvec_done = 0; - pio->bi_io_vec = &bvec; + pio->bi_io_vec = bvec; pio->level = top_level(ploop); pio->endio_cb = md_write_endio; pio->endio_cb_data = piwb; diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h index a7ca942c4670..0a4c6b78e20e 100644 --- a/drivers/md/dm-ploop.h +++ b/drivers/md/dm-ploop.h @@ -106,6 +106,7 @@ struct ploop_index_wb { bool completed; blk_status_t bi_status; u32 page_id; + struct bio_vec aux_bvec; }; /* Metadata page */ ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 4/4] xfs: Provide a balloon nipple for management
A new ioctl() to open balloon file. Signed-off-by: Kirill Tkhai --- fs/xfs/libxfs/xfs_fs.h |1 + fs/xfs/xfs_ioctl.c | 63 2 files changed, 64 insertions(+) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index bde2b4c64dbe..2293e1b757b3 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -839,6 +839,7 @@ struct xfs_scrub_metadata { #define XFS_IOC_INUMBERS_IOR ('X', 128, struct xfs_inumbers_req) /* XFS_IOC_GETFSUUID -- deprecated 140 */ +#define XFS_IOC_OPEN_BALLOON _IO('X', 255) #ifndef HAVE_BBMACROS /* diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 16039ea10ac9..1282d9412f92 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1935,6 +1935,63 @@ xfs_fs_eofblocks_from_user( return 0; } +static int xfs_open_balloon(struct xfs_mount *mp, struct vfsmount *mnt) +{ + u64 balloon_ino = READ_ONCE(mp->m_balloon_ino); + struct xfs_inode *ip; + struct inode *inode; + int err, fd; + struct file *filp; + struct dentry *de; + struct path path; + fmode_t mode; + + if (!balloon_ino) + return -ENOENT; + ip = xfs_balloon_get(mp, balloon_ino, 0); + if (IS_ERR(ip)) + return PTR_ERR(ip); + inode = VFS_I(ip); + + err = fd = get_unused_fd_flags(0); + if (err < 0) + goto err_put_ip; + + __iget(inode); + de = d_obtain_alias(inode); + err = PTR_ERR(de); + if (IS_ERR(de)) + goto err_put_fd; + + path.dentry = de; + path.mnt = mntget(mnt); + err = mnt_want_write(path.mnt); + if (err) + mode = O_RDONLY; + else + mode = O_RDWR; + filp = alloc_file(&path, mode, &xfs_file_operations); + if (filp->f_mode & FMODE_WRITE) + mnt_drop_write(path.mnt); + if (IS_ERR(filp)) { + err = PTR_ERR(filp); + goto err_put_path; + } + + filp->f_flags |= O_LARGEFILE; + fd_install(fd, filp); + xfs_irele(ip); + return fd; + +err_put_path: + path_put(&path); +err_put_fd: + put_unused_fd(fd); +err_put_ip: + xfs_irele(ip); + return err; +} + /* * Note: some of the ioctl's return positive numbers as a * byte count indicating success, such as readlink_by_handle. 
@@ -2216,6 +2273,12 @@ xfs_file_ioctl( return error; } +case XFS_IOC_OPEN_BALLOON: +if (!capable(CAP_SYS_ADMIN)) +return -EACCES; + +return xfs_open_balloon(mp, filp->f_path.mnt); + default: return -ENOTTY; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 3/4] xfs: Don't show the active balloon to user
Prohibit an unprivileged user from reaching the balloon. Prohibit everyone from unlinking it. Signed-off-by: Kirill Tkhai --- fs/xfs/xfs_inode.c |4 fs/xfs/xfs_iops.c |4 2 files changed, 8 insertions(+) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 990b72ae3635..32f99876dc19 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -670,6 +670,10 @@ xfs_lookup( if (error) goto out_unlock; + error = -EPERM; + if (unlikely(inum == READ_ONCE(dp->i_mount->m_balloon_ino))) + goto out_free_name; + error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp); if (error) goto out_free_name; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 93c082db04b7..09211e1d08ad 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -382,6 +382,10 @@ xfs_vn_unlink( struct xfs_name name; int error; + if (unlikely(d_inode(dentry)->i_ino == + READ_ONCE(XFS_I(dir)->i_mount->m_balloon_ino))) + return -EPERM; + xfs_dentry_to_name(&name, dentry); error = xfs_remove(XFS_I(dir), &name, XFS_I(d_inode(dentry))); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 0/4] xfs: Add balloon support
https://jira.sw.ru/browse/PSBM-133811 --- Kirill Tkhai (4): xfs: Teach the fs where the balloon inode is xfs: Never show balloon in readdir results xfs: Don't show the active balloon to user xfs: Provide a balloon nipple for management fs/xfs/libxfs/xfs_da_btree.h |1 + fs/xfs/libxfs/xfs_dir2_priv.h |1 + fs/xfs/libxfs/xfs_fs.h|1 + fs/xfs/xfs_dir2_readdir.c | 23 +++- fs/xfs/xfs_file.c |2 + fs/xfs/xfs_inode.c|4 ++ fs/xfs/xfs_ioctl.c| 63 + fs/xfs/xfs_iops.c |4 ++ fs/xfs/xfs_mount.h|2 + fs/xfs/xfs_super.c| 79 + fs/xfs/xfs_super.h|2 + 11 files changed, 180 insertions(+), 2 deletions(-) -- Signed-off-by: Kirill Tkhai ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 2/4] xfs: Never show balloon in readdir results
Note, that xfs_readdir() may be called from many places. To underline the case, when it's called from normal readdir syscalls (not from xfs service functionality), and to avoid to add a new argument to xfs_readdir(), we introduce a special value: XFS_FAKE_TRANS_IGNORE_BALLOON. Signed-off-by: Kirill Tkhai --- fs/xfs/libxfs/xfs_da_btree.h |1 + fs/xfs/libxfs/xfs_dir2_priv.h |1 + fs/xfs/xfs_dir2_readdir.c | 23 ++- fs/xfs/xfs_file.c |2 +- 4 files changed, 25 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index ad5dd324631a..3aa2dfd533ed 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -55,6 +55,7 @@ enum xfs_dacmp { typedef struct xfs_da_args { struct xfs_da_geometry *geo;/* da block geometry */ const uint8_t *name; /* string (maybe not NULL terminated) */ + uint8_t ignore_balloon:1; int namelen;/* length of string (maybe no NULL) */ uint8_t filetype; /* filetype of inode for directories */ void*value; /* set of bytes (maybe contain NULLs) */ diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 94943ce49cab..e78fc1667836 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -184,6 +184,7 @@ void xfs_dir2_sf_put_ftype(struct xfs_mount *mp, struct xfs_dir2_sf_entry *sfep, uint8_t ftype); /* xfs_dir2_readdir.c */ +#define XFS_FAKE_TRANS_IGNORE_BALLOON ((void *)1) extern int xfs_readdir(struct xfs_trans *tp, struct xfs_inode *dp, struct dir_context *ctx, size_t bufsize); diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index da1cc683560c..0dfba9054e3d 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -121,9 +121,13 @@ xfs_dir2_sf_getdents( !xfs_dir2_namecheck(sfep->name, sfep->namelen))) return -EFSCORRUPTED; + if (unlikely(ino == READ_ONCE(dp->i_mount->m_balloon_ino) && +args->ignore_balloon)) + goto next; if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino, xfs_dir3_get_dtype(mp, 
filetype))) return 0; +next: sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep); } @@ -214,6 +218,12 @@ xfs_dir2_block_getdents( error = -EFSCORRUPTED; goto out_rele; } + + if (unlikely(be64_to_cpu(dep->inumber) == + READ_ONCE(dp->i_mount->m_balloon_ino) && +args->ignore_balloon)) + continue; + if (!dir_emit(ctx, (char *)dep->name, dep->namelen, be64_to_cpu(dep->inumber), xfs_dir3_get_dtype(dp->i_mount, filetype))) @@ -465,11 +475,17 @@ xfs_dir2_leaf_getdents( error = -EFSCORRUPTED; break; } + + if (unlikely(be64_to_cpu(dep->inumber) == + READ_ONCE(dp->i_mount->m_balloon_ino) && +args->ignore_balloon)) + goto next; + if (!dir_emit(ctx, (char *)dep->name, dep->namelen, be64_to_cpu(dep->inumber), xfs_dir3_get_dtype(dp->i_mount, filetype))) break; - +next: /* * Advance to next entry in the block. */ @@ -510,6 +526,11 @@ xfs_readdir( int rval; int v; + if (tp == XFS_FAKE_TRANS_IGNORE_BALLOON) { + args.ignore_balloon = true; + tp = NULL; + } + trace_xfs_readdir(dp); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index cc3cfb12df53..1164184cd1b0 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1266,7 +1266,7 @@ xfs_file_readdir( */ bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size); - return xfs_readdir(NULL, ip, ctx, bufsize); + return xfs_readdir(XFS_FAKE_TRANS_IGNORE_BALLOON, ip, ctx, bufsize); } STATIC loff_t ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 1/4] xfs: Teach the fs where the balloon inode is
This adds balloon_ino=XXX mount option for xfs. Signed-off-by: Kirill Tkhai --- fs/xfs/xfs_mount.h |2 + fs/xfs/xfs_super.c | 79 fs/xfs/xfs_super.h |2 + 3 files changed, 83 insertions(+) diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index c78b63fe779a..4eb318bb44ac 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -154,6 +154,8 @@ typedef struct xfs_mount { uint8_t m_rt_checked; uint8_t m_rt_sick; + uint64_tm_balloon_ino; + /* * End of read-mostly variables. Frequently written variables and locks * should be placed below this comment from now on. The first variable diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 304875c0d3cc..aba14f5adc4e 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -95,6 +95,7 @@ enum { Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, + Opt_balloon_ino, }; static const struct fs_parameter_spec xfs_fs_parameters[] = { @@ -139,6 +140,7 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = { fsparam_flag("nodiscard", Opt_nodiscard), fsparam_flag("dax", Opt_dax), fsparam_enum("dax", Opt_dax_enum, dax_param_enums), + fsparam_u64("balloon_ino", Opt_balloon_ino), {} }; @@ -171,6 +173,7 @@ xfs_fs_show_options( }; struct xfs_mount*mp = XFS_M(root->d_sb); struct proc_xfs_info*xfs_infop; + u64 balloon_ino; for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) { if (mp->m_flags & xfs_infop->flag) @@ -224,6 +227,9 @@ xfs_fs_show_options( if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) seq_puts(m, ",noquota"); + if ((balloon_ino = READ_ONCE(mp->m_balloon_ino)) != 0) + seq_printf(m, ",balloon_ino=%llu", + balloon_ino); return 0; } @@ -776,6 +782,41 @@ xfs_fs_sync_fs( return 0; } +struct xfs_inode * +xfs_balloon_get(struct xfs_mount *mp, u64 balloon_ino, uint flags) +{ + struct xfs_inode *ip; + struct inode *inode; + int error; + + if (!xfs_verify_dir_ino(mp, balloon_ino)) + return 
ERR_PTR(-EINVAL); + + error = xfs_iget(mp, NULL, balloon_ino, flags, 0, &ip); + if (error) + return ERR_PTR(error); + inode = VFS_I(ip); + if (!S_ISREG(inode->i_mode) || IS_IMMUTABLE(inode)) + return ERR_PTR(-EINVAL); + + return ip; +} + +STATIC int +xfs_balloon_check(struct xfs_mount *mp, u64 balloon_ino) +{ + struct xfs_inode *ip; + + if (!balloon_ino) + return 0; + + ip = xfs_balloon_get(mp, balloon_ino, XFS_IGET_UNTRUSTED); + if (IS_ERR(ip)) + return PTR_ERR(ip); + xfs_irele(ip); + return 0; +} + STATIC int xfs_fs_statfs( struct dentry *dentry, @@ -790,6 +831,7 @@ xfs_fs_statfs( uint64_tfdblocks; xfs_extlen_tlsize; int64_t ffree; + u64 balloon_ino; statp->f_type = XFS_SUPER_MAGIC; statp->f_namelen = MAXNAMELEN - 1; @@ -840,6 +882,17 @@ xfs_fs_statfs( sbp->sb_frextents * sbp->sb_rextsize; } + if ((balloon_ino = READ_ONCE(mp->m_balloon_ino)) != 0) { + struct xfs_inode *ip; + + ip = xfs_balloon_get(mp, balloon_ino, 0); + if (ip) { + /* Note, i_nblocks also contains metadata blocks */ + statp->f_blocks -= ip->i_nblocks + ip->i_delayed_blks; + xfs_irele(ip); + } + } + return 0; } @@ -1273,6 +1326,9 @@ xfs_fs_parse_param( xfs_mount_set_dax_mode(parsing_mp, result.uint_32); return 0; #endif + case Opt_balloon_ino: + parsing_mp->m_balloon_ino = result.uint_64; + return 0; /* Following mount options will be removed in September 2025 */ case Opt_ikeep: xfs_fs_warn_deprecated(fc, param, XFS_MOUNT_IKEEP, true); @@ -1603,6 +1659,10 @@ xfs_fs_fill_super( if (error) goto out_filestream_unmount; + error = xfs_balloon_check(mp, mp->m_balloon_ino); + if (error) + goto out_unmount; + root = igrab(VFS_I(mp->m_rootip)); if (!root) { error = -ENOENT; @@ -1809,6 +1869,25 @@ xfs_fs_reconfigure( return error; }
Re: [Devel] [PATCH RH9] ploop: simplify ploop_status
On 21.10.2021 21:32, Cyrill Gorcunov wrote: > From: Cyrill Gorcunov > > We can get rid of sprintf usage when encoding the status. > Just fill the string directly. > > Cc: Kirill Tkhai > Signed-off-by: Cyrill Gorcunov Acked-by: Kirill TKhai > --- > drivers/md/dm-ploop-target.c | 13 +++-- > 1 file changed, 7 insertions(+), 6 deletions(-) > > --- vzkernel.orig/drivers/md/dm-ploop-target.c > +++ vzkernel/drivers/md/dm-ploop-target.c > @@ -435,20 +435,21 @@ static void ploop_status(struct dm_targe >unsigned int maxlen) > { > struct ploop *ploop = ti->private; > - char stat[16] = { 0 }, *p = stat; > + char stat[16], *p = stat; > ssize_t sz = 0; > > down_read(&ploop->ctl_rwsem); > if (ploop->falloc_new_clu) > - p += sprintf(p, "f"); > + *p++ = 'f'; > if (ploop->tracking_bitmap) > - p += sprintf(p, "t"); > + *p++ = 't'; > if (READ_ONCE(ploop->noresume)) > - p += sprintf(p, "n"); > + *p++ = 'n'; > if (READ_ONCE(ploop->event_enospc)) > - p += sprintf(p, "s"); > + *p++ = 's'; > if (p == stat) > - p += sprintf(p, "o"); > + *p++ = 'o'; > + *p++ = '\0'; > up_read(&ploop->ctl_rwsem); > > BUG_ON(p - stat >= sizeof(stat)); > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH RH8] ploop: Provide more info about ENOSPC
On 20.10.2021 22:22, Cyrill Gorcunov wrote: > On Wed, Oct 20, 2021 at 06:13:01PM +0300, Kirill Tkhai wrote: > ... >> diff --git a/drivers/md/dm-ploop-target.c b/drivers/md/dm-ploop-target.c >> index 327095f75359..bd68d5fb272b 100644 >> --- a/drivers/md/dm-ploop-target.c >> +++ b/drivers/md/dm-ploop-target.c >> @@ -455,6 +455,8 @@ static void ploop_status(struct dm_target *ti, >> status_type_t type, >> p += sprintf(p, "t"); >> if (READ_ONCE(ploop->noresume)) >> p += sprintf(p, "n"); >> +if (READ_ONCE(ploop->event_enospc)) >> +p += sprintf(p, "s"); >> if (p == stat) >> p += sprintf(p, "o"); >> if (ploop->skip_off) > > While I've no clue what is going on here with this status I wonder why > we use sprintf here at all? The sprintf is _very_ heavy function which > consumes too much cycles for nothing, we don't even need any formatting > here. Why not some simple > > static void ploop_status(struct dm_target *ti, status_type_t type, >unsigned int status_flags, char *result, >unsigned int maxlen) > { > struct ploop *ploop = ti->private; > char stat[16], *p = stat; > ssize_t sz = 0; > > down_read(&ploop->ctl_rwsem); > if (ploop->falloc_new_clu) > *p++ = 'f'; > if (ploop->tracking_bitmap) > *p++ = 't'; > if (READ_ONCE(ploop->noresume)) > *p++ = 'n'; > if (p == stat) > *p++ = 'o'; > *p = '\0'; > up_read(&ploop->ctl_rwsem); > > BUG_ON(p - stat >= sizeof(stat)); > DMEMIT("%u v2 %u %s", ploop->nr_deltas, (u32)CLU_TO_SEC(ploop, 1), > stat); > } > > or I miss something obvious? Good idea. Could you please provide a proper patch reworking this function on top of my patch? ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH8] ploop: Provide more info about ENOSPC
Add info to status and print to dmesg once. https://jira.sw.ru/browse/PSBM-135007 Signed-off-by: Kirill Tkhai --- drivers/md/dm-ploop-map.c|1 + drivers/md/dm-ploop-target.c |2 ++ 2 files changed, 3 insertions(+) diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c index 79142acddecc..4cadf6e45b4d 100644 --- a/drivers/md/dm-ploop-map.c +++ b/drivers/md/dm-ploop-map.c @@ -169,6 +169,7 @@ static bool ploop_try_delay_enospc(struct ploop_rq *prq, struct pio *pio) init_prq_and_embedded_pio(ploop, prq->rq, prq, pio); + pr_err_once("ploop: underlying disk is almost full\n"); ploop->event_enospc = true; list_add_tail(&pio->list, &ploop->enospc_pios); unlock: diff --git a/drivers/md/dm-ploop-target.c b/drivers/md/dm-ploop-target.c index 327095f75359..bd68d5fb272b 100644 --- a/drivers/md/dm-ploop-target.c +++ b/drivers/md/dm-ploop-target.c @@ -455,6 +455,8 @@ static void ploop_status(struct dm_target *ti, status_type_t type, p += sprintf(p, "t"); if (READ_ONCE(ploop->noresume)) p += sprintf(p, "n"); + if (READ_ONCE(ploop->event_enospc)) + p += sprintf(p, "s"); if (p == stat) p += sprintf(p, "o"); if (ploop->skip_off) ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9] ploop: Provide more info about ENOSPC
Add info to status and print to dmesg once. https://jira.sw.ru/browse/PSBM-135007 Signed-off-by: Kirill Tkhai --- drivers/md/dm-ploop-map.c|1 + drivers/md/dm-ploop-target.c |2 ++ 2 files changed, 3 insertions(+) diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c index 79142acddecc..4cadf6e45b4d 100644 --- a/drivers/md/dm-ploop-map.c +++ b/drivers/md/dm-ploop-map.c @@ -169,6 +169,7 @@ static bool ploop_try_delay_enospc(struct ploop_rq *prq, struct pio *pio) init_prq_and_embedded_pio(ploop, prq->rq, prq, pio); + pr_err_once("ploop: underlying disk is almost full\n"); ploop->event_enospc = true; list_add_tail(&pio->list, &ploop->enospc_pios); unlock: diff --git a/drivers/md/dm-ploop-target.c b/drivers/md/dm-ploop-target.c index 327095f75359..bd68d5fb272b 100644 --- a/drivers/md/dm-ploop-target.c +++ b/drivers/md/dm-ploop-target.c @@ -455,6 +455,8 @@ static void ploop_status(struct dm_target *ti, status_type_t type, p += sprintf(p, "t"); if (READ_ONCE(ploop->noresume)) p += sprintf(p, "n"); + if (READ_ONCE(ploop->event_enospc)) + p += sprintf(p, "s"); if (p == stat) p += sprintf(p, "o"); if (ploop->skip_off) ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH RH9 0/6] part 18: port release agent virtualization
On 18.10.2021 15:50, Pavel Tikhomirov wrote: > Patches are massively reworked, see inpatch comments. > > https://jira.sw.ru/browse/PSBM-134002 > > Pavel Tikhomirov (1): > ve/cgroup: fix cgroup_mark_ve_roots naming > > Valeriy Vdovin (5): > cgroup/cfs: added 'activate' option to cgroup_add_file > ve/cgroup: Implement per-ve workqueue > ve/cgroup: Move release_agent from system_wq to per-ve workqueues > ve/cgroup: Private per-cgroup-root data container > ve/cgroup: Set release_agent_path for root cgroups separately > > include/linux/cgroup-defs.h | 11 +- > include/linux/cgroup.h | 4 +- > include/linux/ve.h | 27 > kernel/cgroup/cgroup-internal.h | 2 + > kernel/cgroup/cgroup-v1.c | 165 +++--- > kernel/cgroup/cgroup.c | 53 ++- > kernel/ve/ve.c | 242 +++- > 7 files changed, 442 insertions(+), 62 deletions(-) Looks OK for me. Reviewed-by: Kirill Tkhai ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH7] pfcache: Fix uninitialized s_csum_partial
Since percpu_counter::counters is not allocated, add and sub operations write to percpu memory with 0 offset. In scope of https://jira.sw.ru/browse/PSBM-134639 Fixes: 1204e364ca05 "pfcache: add hashed peers for ext4" Signed-off-by: Kirill Tkhai --- fs/ext4/super.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 810556737675..2d7c1d7c4190 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4540,7 +4540,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_err_report.data = (unsigned long) sb; err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL); - if (err) + if (!err) err = percpu_counter_init(&sbi->s_csum_partial, 0, GFP_KERNEL); if (!err) err = percpu_counter_init(&sbi->s_csum_complete, 0, GFP_KERNEL); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH RH9] x86/cpu: init_cpu_flags -- use raw spinlock
On 14.10.2021 11:20, Cyrill Gorcunov wrote: > The @cpu_flags_lock spinlock guards manipulations with > per-cpu @cpu_flags which is used to hide some features > in cpuinfo output inside VE. Still the init_cpu_flags > is called from irq context leading to the following > > | [ 13.827635] = > | [ 13.827636] [ BUG: Invalid wait context ] > | [ 13.827637] 5.14.0.ovz9.10.1+ #41 Tainted: G C X - > --- > | [ 13.827638] - > | [ 13.827638] systemd/1 is trying to lock: > | [ 13.827639] a4c9d258 (cpu_flags_lock){}-{3:3}, at: > init_cpu_flags+0xc8/0x220 > | [ 13.827649] other info that might help us debug this: > | [ 13.827651] context-{2:2} > | [ 13.827651] 3 locks held by systemd/1: > | [ 13.827652] #0: a56e8c60 (dup_mmap_sem){.+.+}-{0:0}, at: > dup_mm+0x83/0x5f0 > | [ 13.827660] #1: 97574a37d138 (&mm->mmap_lock#2){}-{4:4}, at: > dup_mm+0x9c/0x5f0 > | [ 13.827664] #2: 97574489c138 (&mm->mmap_lock/1){+.+.}-{4:4}, at: > dup_mm+0xd5/0x5f0 > | [ 13.827667] stack backtrace: > | [ 13.827668] CPU: 0 PID: 1 Comm: systemd ve: / Tainted: G C > X - --- > | [ 13.827670] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS > 1.14.0-4.fc34 04/01/2014 > | [ 13.827672] Call Trace: > | [ 13.827673] > | [ 13.827675] dump_stack_lvl+0x57/0x7d > | [ 13.827686] __lock_acquire.cold+0x28b/0x2cd > | [ 13.827694] lock_acquire+0xca/0x300 > | [ 13.827700] ? init_cpu_flags+0xc8/0x220 > | [ 13.827703] _raw_spin_lock+0x34/0x80 > | [ 13.827708] ? init_cpu_flags+0xc8/0x220 > | [ 13.827710] init_cpu_flags+0xc8/0x220 > | [ 13.827713] flush_smp_call_function_queue+0x13f/0x1e0 > | [ 13.827717] __sysvec_call_function_single+0x43/0x1c0 > | [ 13.827722] sysvec_call_function_single+0x9d/0xd0 > | [ 13.827724] > | [ 13.827724] asm_sysvec_call_function_single+0x12/0x20 > | [ 13.827728] RIP: 0010:lock_release+0x178/0x460 > | ... 
> | [ 13.827741] up_write+0x2f/0x1c0 > | [ 13.827743] anon_vma_clone+0x158/0x1f0 > | [ 13.827749] anon_vma_fork+0x33/0x180 > | [ 13.827751] dup_mm+0x45b/0x5f0 > | [ 13.827755] copy_process+0x1e5a/0x2050 > | [ 13.827758] kernel_clone+0x9b/0x3f0 > | [ 13.827760] ? vfs_statx+0x74/0x130 > | [ 13.827766] __do_sys_clone+0x60/0x80 > | [ 13.827769] do_syscall_64+0x3b/0x90 > | [ 13.827771] entry_SYSCALL_64_after_hwframe+0x44/0xae > > The problem is rather coming from rt camp where splinlocks > become sleepable thus can't be used in irq context (and for our kernel > it requires the CONFIG_PROVE_RAW_LOCK_NESTING to be set), thus since > we know that we're operating in irq context lets use raw spinlocks > instead. > > https://jira.sw.ru/browse/PSBM-134761 > > CC: Kirill Tkhai > Signed-off-by: Cyrill Gorcunov Reviewed-by: Kirill Tkhai > --- > arch/x86/kernel/cpu/proc.c | 10 +- > 1 file changed, 5 insertions(+), 5 deletions(-) > > --- vzkernel.orig/arch/x86/kernel/cpu/proc.c > +++ vzkernel/arch/x86/kernel/cpu/proc.c > @@ -69,7 +69,7 @@ struct cpu_flags { > }; > > static DEFINE_PER_CPU(struct cpu_flags, cpu_flags); > -static DEFINE_SPINLOCK(cpu_flags_lock); > +static DEFINE_RAW_SPINLOCK(cpu_flags_lock); > > static void init_cpu_flags(void *dummy) > { > @@ -107,9 +107,9 @@ static void init_cpu_flags(void *dummy) > flags.val[10] &= eax; > } > > - spin_lock(&cpu_flags_lock); > + raw_spin_lock(&cpu_flags_lock); > memcpy(&per_cpu(cpu_flags, cpu), &flags, sizeof(flags)); > - spin_unlock(&cpu_flags_lock); > + raw_spin_unlock(&cpu_flags_lock); > } > > static int show_cpuinfo(struct seq_file *m, void *v) > @@ -158,9 +158,9 @@ static int show_cpuinfo(struct seq_file > show_cpuinfo_misc(m, c); > > if (!is_super) { > - spin_lock_irq(&cpu_flags_lock); > + raw_spin_lock_irq(&cpu_flags_lock); > memcpy(&ve_flags, &per_cpu(cpu_flags, cpu), sizeof(ve_flags)); > - spin_unlock_irq(&cpu_flags_lock); > + raw_spin_unlock_irq(&cpu_flags_lock); > } > > > ___ Devel mailing list Devel@openvz.org 
https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH RH9] sched/ve: calc_load_ve -- use raw spinlock
On 14.10.2021 11:20, Cyrill Gorcunov wrote: > The @load_ve_lock spinlock guards manipulations of @ve_root_list, > same time the calc_load_ve() is executed from irq context which > triggers "invalid context wait" bug > > | [5.195868] = > | [5.195877] [ BUG: Invalid wait context ] > | [5.195887] 5.14.0.ovz9.10.1 #37 Tainted: G C X - > --- > | [5.195902] - > | [5.195911] swapper/0/0 is trying to lock: > | [5.196327] 872d8438 (load_ve_lock){}-{3:3}, at: > calc_load_ve+0x15/0x1c0 > | [5.196742] other info that might help us debug this: > | [5.196807] context-{2:2} > | [5.196807] no locks held by swapper/0/0. > | [5.196807] stack backtrace: > | [5.196807] CPU: 0 PID: 0 Comm: swapper/0 ve: / Tainted: G C > X - --- 5.14.0.ovz9.10.1 #37 10.1 > | [5.196807] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS > 1.14.0-4.fc34 04/01/2014 > | [5.196807] Call Trace: > | [5.196807] > | [5.196807] dump_stack_lvl+0x57/0x7d > | [5.196807] __lock_acquire.cold+0x28b/0x2cd > | [5.196807] ? __lock_acquire+0x3b1/0x1f20 > | [5.196807] lock_acquire+0xca/0x300 > | [5.196807] ? calc_load_ve+0x15/0x1c0 > | [5.196807] ? kvm_sched_clock_read+0x14/0x40 > | [5.196807] ? sched_clock_local+0xe/0x80 > | [5.196807] ? sched_clock_cpu+0xa5/0xc0 > | [5.196807] _raw_spin_lock+0x34/0x80 > | [5.196807] ? calc_load_ve+0x15/0x1c0 > | [5.196807] calc_load_ve+0x15/0x1c0 > | [5.196807] tick_do_update_jiffies64+0x115/0x150 > | [5.196807] tick_irq_enter+0x6c/0xe0 > | [5.196807] irq_enter_rcu+0x79/0x80 > | [5.196807] sysvec_apic_timer_interrupt+0x95/0xd0 > | [5.196807] > | [5.196807] asm_sysvec_apic_timer_interrupt+0x12/0x20 > | [5.196807] RIP: 0010:default_idle+0x10/0x20 > | [5.196807] RSP: 0018:87203ea8 EFLAGS: 0202 > | [5.196807] RAX: 86380df0 RBX: RCX: > 0001 > | [5.196807] RDX: RSI: 86ee3980 RDI: > 86e1050e > | [5.196807] RBP: 87260a00 R08: 0001 R09: > 0001 > | [5.196807] R10: 0001 R11: R12: > > | [5.196807] R13: R14: 87260120 R15: > > | [5.196807] ? mwait_idle+0x70/0x70 > | [5.196807] ? 
mwait_idle+0x70/0x70 > | [5.196807] default_idle_call+0x59/0x90 > | [5.196807] do_idle+0x217/0x2b0 > | [5.196807] cpu_startup_entry+0x19/0x20 > | [5.196807] start_kernel+0x997/0x9bc > | [5.196807] ? copy_bootdata+0x18/0x55 > | [5.196807] secondary_startup_64_no_verify+0xc2/0xcb > > Note that the problem is rather coming from rt camp where splinlocks > become sleepable thus can't be used in irq context (and for our kernel > it requires the CONFIG_PROVE_RAW_LOCK_NESTING to be set), thus since > we know that we're operating in irq context lets use raw spinlocks > instead. > > Also I make unlock to happen earlier because there is no need to > keep it once we've finished traversing the @ve_root_list list. > > https://jira.sw.ru/browse/PSBM-134756 > > CC: Kirill Tkhai > Signed-off-by: Cyrill Gorcunov Acked-by: Kirill Tkhai > --- > kernel/sched/core.c| 10 +- > kernel/sched/loadavg.c |6 +++--- > 2 files changed, 8 insertions(+), 8 deletions(-) > > --- vzkernel.orig/kernel/sched/core.c > +++ vzkernel/kernel/sched/core.c > @@ -10036,18 +10036,18 @@ static u64 cpu_shares_read_u64(struct cg > > #ifdef CONFIG_VE > LIST_HEAD(ve_root_list); > -DEFINE_SPINLOCK(load_ve_lock); > +DEFINE_RAW_SPINLOCK(load_ve_lock); > > void link_ve_root_cpu_cgroup(struct cgroup_subsys_state *css) > { > struct task_group *tg = css_tg(css); > unsigned long flags; > > - spin_lock_irqsave(&load_ve_lock, flags); > + raw_spin_lock_irqsave(&load_ve_lock, flags); > BUG_ON(!(css->flags & CSS_ONLINE)); > if (list_empty(&tg->ve_root_list)) > list_add(&tg->ve_root_list, &ve_root_list); > - spin_unlock_irqrestore(&load_ve_lock, flags); > + raw_spin_unlock_irqrestore(&load_ve_lock, flags); > } > > void unlink_ve_root_cpu_cgroup(struct cgroup_subsys_state *css) > @@ -10055,9 +10055,9 @@ void unlink_ve_root_cpu_cgroup(struct cg > struct task_group *tg = css_tg(css); > unsigned long flags; > > - spin_lock_irqsave(&load_ve_lock, flags); &g
[Devel] [PATCH RH9 2/2] mm/backing-dev: associate writeback with correct blkcg
From: Andrey Zhadchenko Use cgroup_get_e_ve_css to get correct blkcg_css for writeback instances. https://jira.sw.ru/browse/PSBM-131253 Signed-off-by: Andrey Zhadchenko Reviewed-by: Kirill Tkhai v2: khorenko@: introduce a wrapper for getting blkcg_css from memcg_css. == mm/writeback: Adopt cgroup-v2 writeback (limit per-memcg dirty memory) In cgroup-v1 all writeback IO is accounted to root blkcg by design. With cgroup-v2 it became possible to link memcg and blkcg, so writeback code was enhanced to 1) consider balancing dirty pages per memory cgroup 2) account writeback generated IO to blkcg In vz7 writeback was balancing by means of beancounter cgroup. However we dropped it. In vz8 @aryabinin tried to enable cgroup-v2 writeback with 5cc286c98ee20 ("mm, cgroup, writeback: Enable per-cgroup writeback for v1 cgroup."), but cgroup_get_e_css(), which is used to find blkcg based on memcg, does not work well with cgroup-v1 and always returns root blkcg. However we can implement a new function to associate blkcg with memcg via ve css_set. Test results with 256M container without patch: === # echo "253:22358 1" > /sys/fs/cgroup/blkio/machine.slice/1/blkio.throttle.write_bps_device # vzctl exec 1 dd if=/dev/zero of=/test bs=1M count=1000 # 1048576000 bytes (1.0 GB, 1000 MiB) copied, 1.35522 s, 774 MB/s Since dirty balancing is global, Container can dirty more than its RAM and blkio limits are not respected. With patch: === # echo "253:22765 1" > /sys/fs/cgroup/blkio/machine.slice/1/blkio.throttle.write_bps_device # vzctl exec 1 dd if=/dev/zero of=/test bs=1M count=1000 # 1048576000 bytes (1.0 GB, 1000 MiB) copied, 10.2267 s, 103 MB/s Per-ve dirty balancing and throttling work as expected. v2: Since ve->ve_ns is pointing to task nsproxy, it can be changed during ve lifetime. We already have a helper ve_get_init_css() that handles this case, so I decided to reuse its code in new cgroup_get_e_ve_css(). 
Additionally I have added two patches that improve current code: 1) drop 'get' from css_get_local_root() name since get with css functions usually results in taking reference 2) drop duplicate code and reuse css_local_root() helper in ve_get_init_css() Andrey Zhadchenko (4): kernel/cgroup: rename css_get_local_root kernel/ve: simplify ve_get_init_css kernel/cgroup: implement cgroup_get_e_ve_css mm/backing-dev: associate writeback with correct blkcg --- mm/backing-dev.c | 22 -- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f5561ea7d90a..9c1a128199e6 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -434,6 +434,22 @@ static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) spin_unlock_irq(&cgwb_lock); } +static inline struct cgroup_subsys_state * +cgroup_get_e_css_virtialized(struct cgroup *cgroup, +struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + +#ifdef CONFIG_VE + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + css = cgroup_get_e_ve_css(cgroup, ss); + else +#endif + css = cgroup_get_e_css(cgroup, ss); + + return css; +} + static int cgwb_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) { @@ -446,7 +462,8 @@ static int cgwb_create(struct backing_dev_info *bdi, int ret = 0; memcg = mem_cgroup_from_css(memcg_css); - blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); + blkcg_css = cgroup_get_e_css_virtialized(memcg_css->cgroup, +&io_cgrp_subsys); blkcg = css_to_blkcg(blkcg_css); memcg_cgwb_list = &memcg->cgwb_list; blkcg_cgwb_list = &blkcg->cgwb_list; @@ -566,7 +583,8 @@ struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, struct cgroup_subsys_state *blkcg_css; /* see whether the blkcg association has changed */ - blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); + blkcg_css = cgroup_get_e_css_virtialized(memcg_css->cgroup, +&io_cgrp_subsys); if (unlikely(wb->blkcg_css != blkcg_css || 
!wb_tryget(wb))) wb = NULL; css_put(blkcg_css); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 1/2] kernel/cgroup: implement cgroup_get_e_ve_css
From: Andrey Zhadchenko Existing cgroup_get_e_css() is not suited for cgroup-v1 and will always return root cgroup css. Implement new cgroup_get_e_ve_css to return ve css. https://jira.sw.ru/browse/PSBM-131253 Signed-off-by: Andrey Zhadchenko Reviewed-by: Kirill Tkhai == mm/writeback: Adopt cgroup-v2 writeback (limit per-memcg dirty memory) In cgroup-v1 all writeback IO is accounted to root blkcg by design. With cgroup-v2 it became possible to link memcg and blkcg, so writeback code was enhanced to 1) consider balancing dirty pages per memory cgroup 2) account writeback generated IO to blkcg In vz7 writeback was balancing by means of beancounter cgroup. However we dropped it. In vz8 @aryabinin tried to enable cgroup-v2 writeback with 5cc286c98ee20 ("mm, cgroup, writeback: Enable per-cgroup writeback for v1 cgroup."), but cgroup_get_e_css(), which is used to find blkcg based on memcg, does not work well with cgroup-v1 and always returns root blkcg. However we can implement a new function to associate blkcg with memcg via ve css_set. Test results with 256M container without patch: === # echo "253:22358 1" > /sys/fs/cgroup/blkio/machine.slice/1/blkio.throttle.write_bps_device # vzctl exec 1 dd if=/dev/zero of=/test bs=1M count=1000 # 1048576000 bytes (1.0 GB, 1000 MiB) copied, 1.35522 s, 774 MB/s Since dirty balancing is global, Container can dirty more than its RAM and blkio limits are not respected. With patch: === # echo "253:22765 1" > /sys/fs/cgroup/blkio/machine.slice/1/blkio.throttle.write_bps_device # vzctl exec 1 dd if=/dev/zero of=/test bs=1M count=1000 # 1048576000 bytes (1.0 GB, 1000 MiB) copied, 10.2267 s, 103 MB/s Per-ve dirty balancing and throttling work as expected. v2: Since ve->ve_ns is pointing to task nsproxy, it can be changed during ve lifetime. We already have a helper ve_get_init_css() that handles this case, so I decided to reuse its code in new cgroup_get_e_ve_css(). 
Additionally I have added two patches that improve current code: 1) drop 'get' from css_get_local_root() name since get with css functions usually results in taking reference 2) drop duplicate code and reuse css_local_root() helper in ve_get_init_css() Andrey Zhadchenko (4): kernel/cgroup: rename css_get_local_root kernel/ve: simplify ve_get_init_css kernel/cgroup: implement cgroup_get_e_ve_css mm/backing-dev: associate writeback with correct blkcg Signed-off-by: Kirill Tkhai --- include/linux/cgroup.h |2 ++ kernel/cgroup/cgroup.c | 19 +++ 2 files changed, 21 insertions(+) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 05b4688cf949..892362bde6b1 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -962,6 +962,8 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {} #ifdef CONFIG_VE int ve_hide_cgroups(struct cgroup_root *root); struct ve_struct *get_curr_ve(void); +struct cgroup_subsys_state *cgroup_get_e_ve_css(struct cgroup *cgrp, + struct cgroup_subsys *ss); #endif #endif /* _LINUX_CGROUP_H */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index bb9bce3de45a..c08497c7eb5d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -610,6 +610,25 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, } EXPORT_SYMBOL_GPL(cgroup_get_e_css); +#ifdef CONFIG_VE +struct cgroup_subsys_state *cgroup_get_e_ve_css(struct cgroup *cgrp, + struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + struct ve_struct *ve; + + rcu_read_lock(); + + ve = cgroup_ve_owner(cgrp); + if (!ve) + ve = get_ve0(); + css = ve_get_init_css(ve, ss->id); + + rcu_read_unlock(); + return css; +} +#endif + static void cgroup_get_live(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 0/2] part23 part2
--- Andrey Zhadchenko (2): kernel/cgroup: implement cgroup_get_e_ve_css mm/backing-dev: associate writeback with correct blkcg include/linux/cgroup.h |2 ++ kernel/cgroup/cgroup.c | 19 +++ mm/backing-dev.c | 22 -- 3 files changed, 41 insertions(+), 2 deletions(-) -- Signed-off-by: Kirill Tkhai ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9] ploop: Introduce option "off=" to cut beginning of device
This is like the functionality of dm-linear. https://jira.sw.ru/browse/PSBM-132445 Signed-off-by: Kirill Tkhai --- drivers/md/dm-ploop-map.c|9 +++-- drivers/md/dm-ploop-target.c | 28 drivers/md/dm-ploop.h|1 + 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c index e3cf0ab73c98..79142acddecc 100644 --- a/drivers/md/dm-ploop-map.c +++ b/drivers/md/dm-ploop-map.c @@ -42,6 +42,11 @@ static unsigned int pio_nr_segs(struct pio *pio) return nr_segs; } +static sector_t ploop_rq_pos(struct ploop *ploop, struct request *rq) +{ + return blk_rq_pos(rq) + ploop->skip_off; +} + void ploop_index_wb_init(struct ploop_index_wb *piwb, struct ploop *ploop) { piwb->ploop = ploop; @@ -85,7 +90,7 @@ void init_pio(struct ploop *ploop, unsigned int bi_op, struct pio *pio) /* Get clu related to pio sectors */ static int ploop_rq_valid(struct ploop *ploop, struct request *rq) { - sector_t sector = blk_rq_pos(rq); + sector_t sector = ploop_rq_pos(ploop, rq); loff_t end_byte; u32 end_clu; @@ -1651,7 +1656,6 @@ static void prepare_one_embedded_pio(struct ploop *ploop, struct pio *pio, goto err_nomem; prq->bvec = bvec; skip_bvec: - pio->bi_iter.bi_sector = blk_rq_pos(rq); pio->bi_iter.bi_size = blk_rq_bytes(rq); pio->bi_iter.bi_idx = 0; pio->bi_iter.bi_bvec_done = 0; @@ -1661,6 +1665,7 @@ static void prepare_one_embedded_pio(struct ploop *ploop, struct pio *pio, pio->bi_iter = rq->bio->bi_iter; } + pio->bi_iter.bi_sector = ploop_rq_pos(ploop, rq); pio->bi_io_vec = bvec; pio->queue_list_id = PLOOP_LIST_DEFERRED; diff --git a/drivers/md/dm-ploop-target.c b/drivers/md/dm-ploop-target.c index ec0efddef2ac..327095f75359 100644 --- a/drivers/md/dm-ploop-target.c +++ b/drivers/md/dm-ploop-target.c @@ -389,16 +389,26 @@ static int ploop_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto err; } - /* Optional parameter */ - if (strcmp(argv[0], "falloc_new_clu") == 0) { - if (argc < 2) { - ret = -EINVAL; - goto err; + ret = 
-EINVAL; + /* Optional parameters */ + while (argc > 0) { + if (strcmp(argv[0], "falloc_new_clu") == 0) { + ploop->falloc_new_clu = true; + EAT_ARG(argc, argv); + continue; + } + if (strncmp(argv[0], "off=", 4) == 0) { + if (kstrtou64(argv[0] + 4, 10, &ploop->skip_off) < 0) + goto err; + EAT_ARG(argc, argv); + continue; } - ploop->falloc_new_clu = true; - EAT_ARG(argc, argv); + break; } + if (argc <= 0) + goto err; + ret = ploop_add_deltas_stack(ploop, &argv[0], argc); if (ret) goto err; @@ -435,7 +445,7 @@ static void ploop_status(struct dm_target *ti, status_type_t type, unsigned int maxlen) { struct ploop *ploop = ti->private; - char stat[16] = { 0 }, *p = stat; + char stat[32] = { 0 }, *p = stat; ssize_t sz = 0; down_read(&ploop->ctl_rwsem); @@ -447,6 +457,8 @@ static void ploop_status(struct dm_target *ti, status_type_t type, p += sprintf(p, "n"); if (p == stat) p += sprintf(p, "o"); + if (ploop->skip_off) + p += sprintf(p, " off=%llu", ploop->skip_off); up_read(&ploop->ctl_rwsem); BUG_ON(p - stat >= sizeof(stat)); diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h index 8de2a28b2dec..a7ca942c4670 100644 --- a/drivers/md/dm-ploop.h +++ b/drivers/md/dm-ploop.h @@ -148,6 +148,7 @@ struct ploop { bool falloc_new_clu; /* fallocate() instead of truncate() */ u32 nr_bat_entries; unsigned int cluster_log; /* In sectors */ + sector_t skip_off; /* To cut beginning of ploop device */ u8 m_Sig[16]; /* Signature */ u32 m_Type; /* Disk type */ ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH8] ploop: Introduce option "off=" to cut beginning of device
This is like the functionality of dm-linear. https://jira.sw.ru/browse/PSBM-132445 Signed-off-by: Kirill Tkhai --- drivers/md/dm-ploop-map.c|9 +++-- drivers/md/dm-ploop-target.c | 28 drivers/md/dm-ploop.h|1 + 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c index e3cf0ab73c98..79142acddecc 100644 --- a/drivers/md/dm-ploop-map.c +++ b/drivers/md/dm-ploop-map.c @@ -42,6 +42,11 @@ static unsigned int pio_nr_segs(struct pio *pio) return nr_segs; } +static sector_t ploop_rq_pos(struct ploop *ploop, struct request *rq) +{ + return blk_rq_pos(rq) + ploop->skip_off; +} + void ploop_index_wb_init(struct ploop_index_wb *piwb, struct ploop *ploop) { piwb->ploop = ploop; @@ -85,7 +90,7 @@ void init_pio(struct ploop *ploop, unsigned int bi_op, struct pio *pio) /* Get clu related to pio sectors */ static int ploop_rq_valid(struct ploop *ploop, struct request *rq) { - sector_t sector = blk_rq_pos(rq); + sector_t sector = ploop_rq_pos(ploop, rq); loff_t end_byte; u32 end_clu; @@ -1651,7 +1656,6 @@ static void prepare_one_embedded_pio(struct ploop *ploop, struct pio *pio, goto err_nomem; prq->bvec = bvec; skip_bvec: - pio->bi_iter.bi_sector = blk_rq_pos(rq); pio->bi_iter.bi_size = blk_rq_bytes(rq); pio->bi_iter.bi_idx = 0; pio->bi_iter.bi_bvec_done = 0; @@ -1661,6 +1665,7 @@ static void prepare_one_embedded_pio(struct ploop *ploop, struct pio *pio, pio->bi_iter = rq->bio->bi_iter; } + pio->bi_iter.bi_sector = ploop_rq_pos(ploop, rq); pio->bi_io_vec = bvec; pio->queue_list_id = PLOOP_LIST_DEFERRED; diff --git a/drivers/md/dm-ploop-target.c b/drivers/md/dm-ploop-target.c index ec0efddef2ac..327095f75359 100644 --- a/drivers/md/dm-ploop-target.c +++ b/drivers/md/dm-ploop-target.c @@ -389,16 +389,26 @@ static int ploop_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto err; } - /* Optional parameter */ - if (strcmp(argv[0], "falloc_new_clu") == 0) { - if (argc < 2) { - ret = -EINVAL; - goto err; + ret = 
-EINVAL; + /* Optional parameters */ + while (argc > 0) { + if (strcmp(argv[0], "falloc_new_clu") == 0) { + ploop->falloc_new_clu = true; + EAT_ARG(argc, argv); + continue; + } + if (strncmp(argv[0], "off=", 4) == 0) { + if (kstrtou64(argv[0] + 4, 10, &ploop->skip_off) < 0) + goto err; + EAT_ARG(argc, argv); + continue; } - ploop->falloc_new_clu = true; - EAT_ARG(argc, argv); + break; } + if (argc <= 0) + goto err; + ret = ploop_add_deltas_stack(ploop, &argv[0], argc); if (ret) goto err; @@ -435,7 +445,7 @@ static void ploop_status(struct dm_target *ti, status_type_t type, unsigned int maxlen) { struct ploop *ploop = ti->private; - char stat[16] = { 0 }, *p = stat; + char stat[32] = { 0 }, *p = stat; ssize_t sz = 0; down_read(&ploop->ctl_rwsem); @@ -447,6 +457,8 @@ static void ploop_status(struct dm_target *ti, status_type_t type, p += sprintf(p, "n"); if (p == stat) p += sprintf(p, "o"); + if (ploop->skip_off) + p += sprintf(p, " off=%llu", ploop->skip_off); up_read(&ploop->ctl_rwsem); BUG_ON(p - stat >= sizeof(stat)); diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h index 8de2a28b2dec..a7ca942c4670 100644 --- a/drivers/md/dm-ploop.h +++ b/drivers/md/dm-ploop.h @@ -148,6 +148,7 @@ struct ploop { bool falloc_new_clu; /* fallocate() instead of truncate() */ u32 nr_bat_entries; unsigned int cluster_log; /* In sectors */ + sector_t skip_off; /* To cut beginning of ploop device */ u8 m_Sig[16]; /* Signature */ u32 m_Type; /* Disk type */ ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 4/5] cgroup/net_prio: virtualize ifpriomap per-ve
From: Pavel Tikhomirov Ifpriomap is a map of net_prio cgroup id to device prio. Each process is in some netprio cgroup and all sockets of this process have prio cgroup id of this cgroup. When packet from such socket goes through network stack we choose priority for packet on each device we go through based on these device+id->prio map. Previously we were able to set map for each net_prio cgroup on the system, but only for devices of host init network namespace. This patch adds mapping for ve init netns devices. VE can only get/change device map for ve init netns, Host can only get/change device map for host's init netns. We can have for same cgroup both mappings setup by host for host net devices and mappings setup by ve for ve net devices. When new cgroup is created it either copies only mappings for host network devices if done from host, or copies also mappings for ve network devices if done from ve. If ve is not running (ve_ns is NULL), even while in ve we would operate with host ifpriomap. https://jira.sw.ru/browse/PSBM-123766 Signed-off-by: Pavel Tikhomirov cgroup: ifpriomap virtualization I've also added get_curr_ve() helper as it looks like in many places we rely that get_exec_env() gives us ve which would not free under us, but all processes can be moved easily from this ve in parallel and ve can be freed AFAICS. 
https://jira.sw.ru/browse/PSBM-123766 Signed-off-by: Kirill Tkhai --- net/core/netprio_cgroup.c | 73 - 1 file changed, 71 insertions(+), 2 deletions(-) diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 99a431c56f23..0ab8c37c42b8 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -145,6 +146,7 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css) static int cgrp_css_online(struct cgroup_subsys_state *css) { struct cgroup_subsys_state *parent_css = css->parent; + struct ve_struct *ve; struct net_device *dev; int ret = 0; @@ -166,6 +168,38 @@ static int cgrp_css_online(struct cgroup_subsys_state *css) if (ret) break; } + + /* get_exec_env is safe under cgroup_mutex */ + ve = get_exec_env(); + /* +* Inherit prios from the parent cgroup in scope of ve init netns. +*/ + if (!ve_is_super(ve)) { + struct nsproxy *ve_ns; + struct net *net = NULL; + + /* +* Take rcu read lock to check that ve's net is not freed under +* us after we release rcu read lock we still have rtnl lock to +* insure net remains non-freed, pairs with rtnl lock in +* cleanup_net(). +*/ + rcu_read_lock(); + ve_ns = rcu_dereference(ve->ve_ns); + if (ve_ns) + net = ve_ns->net_ns; + rcu_read_unlock(); + + if (net && net != &init_net) { + for_each_netdev(net, dev) { + u32 prio = netprio_prio(parent_css, dev); + + ret = netprio_set_prio(css, dev, prio); + if (ret) + break; + } + } + } rtnl_unlock(); return ret; } @@ -182,19 +216,38 @@ static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) static int read_priomap(struct seq_file *sf, void *v) { + struct ve_struct *ve; + struct net *net, *_net = NULL; struct net_device *dev; + ve = get_curr_ve(); + if (!ve_is_super(ve)) { + struct nsproxy *ve_ns; + + rcu_read_lock(); + ve_ns = rcu_dereference(ve->ve_ns); + if (ve_ns) + _net = get_net(ve_ns->net_ns); + rcu_read_unlock(); + } + put_ve(ve); + + net = _net ? 
: &init_net; rcu_read_lock(); - for_each_netdev_rcu(&init_net, dev) + for_each_netdev_rcu(net, dev) seq_printf(sf, "%s %u\n", dev->name, netprio_prio(seq_css(sf), dev)); rcu_read_unlock(); + if (_net) + put_net(_net); return 0; } static ssize_t write_priomap(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { + struct ve_struct *ve; + struct net *net, *_net = NULL; char devname[IFNAMSIZ + 1]; struct net_device *dev; u32 prio; @@ -203,7 +256,22 @@ static ssize_t write_priomap(struct kernfs_open_file *of, if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)
[Devel] [PATCH RH9 3/5] ve: get_curr_ve: first try getting ve with rcu lock
From: Pavel Tikhomirov By holding rcu lock we can have valid ve pointer. Next using css_tryget we can get reference on ve cgroup if it is not yet started to destroy. In case cgroup is destroying retry with cgroup_mutex. https://jira.sw.ru/browse/PSBM-123766 Signed-off-by: Pavel Tikhomirov cgroup: ifpriomap virtualization I've also added get_curr_ve() helper as it looks like in many places we rely that get_exec_env() gives us ve which would not free under us, but all processes can be moved easily from this ve in parallel and ve can be freed AFAICS. https://jira.sw.ru/browse/PSBM-123766 Signed-off-by: Kirill Tkhai --- kernel/cgroup/cgroup.c | 25 +++-- kernel/ve/ve.c |2 +- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 04a5e1effbaf..05fe9436a9a3 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1929,8 +1929,29 @@ struct ve_struct *get_curr_ve(void) struct ve_struct *ve; /* -* Under cgroup_mutex both current tasks ve cgroup and ->task_ve -* pointer can't change. Corresponding cgroup_mutex around +* If first thread loads current->task_ve pointer, and if just after +* that current is moved by other thread from this ve cgroup to some +* other and this ve cgroup gets destroyed, ve pointer gets freed, so +* first thread can't use such ve pointer safely. +*/ + + /* +* Fast path: Let's make it safe with rcu lock, though current can be +* moved to other ve cgroup and our ve cgroup can start destroying, ve +* pointer would be still valid. As it is freed in ve_destroy. And +* ve_destroy is called from rcu callback after task_ve had changed. +*/ + rcu_read_lock(); + ve = rcu_dereference(current->task_ve); + if (css_tryget(&ve->css)) { + rcu_read_unlock(); + return ve; + } + rcu_read_unlock(); + + /* +* Slow path: Under cgroup_mutex both current tasks ve cgroup and +* task_ve pointer can't change. Corresponding cgroup_mutex around * cgroup_attach_task() protects us from it. 
*/ mutex_lock(&cgroup_mutex); diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index af46a9b597df..ba5a3a63acec 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -900,7 +900,7 @@ static void ve_attach(struct cgroup_taskset *tset) if (cpuid_override_on()) set_tsk_thread_flag(task, TIF_CPUID_OVERRIDE); - task->task_ve = ve; + rcu_assign_pointer(task->task_ve, ve); } } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 2/5] ve: add get_curr_ve helper
From: Pavel Tikhomirov This helper is a safe alternative to get_exec_env(), this helper actually gets reference on current ve so if from other thread current would be moved from this ve, at least this ve would not be freed under us. https://jira.sw.ru/browse/PSBM-123766 Signed-off-by: Pavel Tikhomirov cgroup: ifpriomap virtualization I've also added get_curr_ve() helper as it looks like in many places we rely that get_exec_env() gives us ve which would not free under us, but all processes can be moved easily from this ve in parallel and ve can be freed AFAICS. https://jira.sw.ru/browse/PSBM-123766 Signed-off-by: Kirill Tkhai --- include/linux/cgroup.h |1 + kernel/cgroup/cgroup.c | 21 + 2 files changed, 22 insertions(+) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 6e9aca26313a..99bd069a476d 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -960,6 +960,7 @@ static inline void cgroup_bpf_put(struct cgroup *cgrp) {} #ifdef CONFIG_VE int ve_hide_cgroups(struct cgroup_root *root); +struct ve_struct *get_curr_ve(void); #endif #endif /* _LINUX_CGROUP_H */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 217846841fda..04a5e1effbaf 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1919,6 +1919,27 @@ static int cgroup_reconfigure(struct fs_context *fc) } #ifdef CONFIG_VE +/* + * This helper is a safe alternative to get_exec_env(), this helper actually + * gets reference on current ve so if in other thread we would be moved from + * this ve, at least this ve would not be freed under us. + */ +struct ve_struct *get_curr_ve(void) +{ + struct ve_struct *ve; + + /* +* Under cgroup_mutex both current tasks ve cgroup and ->task_ve +* pointer can't change. Corresponding cgroup_mutex around +* cgroup_attach_task() protects us from it. 
+*/ + mutex_lock(&cgroup_mutex); + ve = get_ve(current->task_ve); + mutex_unlock(&cgroup_mutex); + + return ve; +} + void cgroup_mark_ve_root(struct ve_struct *ve) { struct cgrp_cset_link *link; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 5/5] ve/fs/inotify: do not impose limit on the number of instances by default
From: Vladimir Davydov In Vz7 we haven't switched to user ns yet. As a result, all containers use the same user_struct for the same user id. This leads to hitting fs.inotify.max_user_instances sysctl limit quickly (it equals 128 by default) and failing to start a container. This patch sets the default limit to INT_MAX. This is a temporary solution and should be reverted once we start using user ns. In PCS6 there is no such problem, because we actually create a user ns per container there. Although its functionality is basic in comparison to Vz7, it still results in creating a new user_struct for each user inside a container so that the inotify limit is containerized. https://jira.sw.ru/browse/PSBM-39048 Signed-off-by: Vladimir Davydov khorenko@: to be reverted once we support userns in Virtuozzo 7 (cherry picked from 78c91a02de6b6f0423e12e12128f9433934d7c61) Signed-off-by: Valeriy Vdovin https://jira.sw.ru/browse/PSBM-131634 Signed-off-by: Valeriy Vdovin khorenko@: TODO: we have to review all places along this path of using inotifies and make sure all allocations are accounted to Containers. Signed-off-by: Kirill Tkhai --- fs/notify/inotify/inotify_user.c |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 62051247f6d2..d30a459136b6 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -847,8 +847,8 @@ static int __init inotify_user_setup(void) SLAB_PANIC|SLAB_ACCOUNT); inotify_max_queued_events = 16384; - init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128; - init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = watches_max; + init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = INT_MAX; + init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = INT_MAX; return 0; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 1/5] shm: skip shm_destroy if task IPC namespace was changed
From: Alexander Mikhalitsyn Fixes: ab602f79915 ("shm: make exit_shm work proportional to task activity") (ms commit) https://jira.sw.ru/browse/PSBM-131142 Signed-off-by: Alexander Mikhalitsyn Signed-off-by: Kirill Tkhai --- ipc/shm.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ipc/shm.c b/ipc/shm.c index ab749be6d8b7..fb4e58375802 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -173,6 +173,14 @@ static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace return container_of(ipcp, struct shmid_kernel, shm_perm); } +static inline bool is_shm_in_ns(struct ipc_namespace *ns, struct shmid_kernel *shp) +{ + int idx = ipcid_to_idx(shp->shm_perm.id); + struct shmid_kernel *tshp = shm_obtain_object(ns, idx); + + return !IS_ERR(tshp) && tshp == shp; +} + /* * shm_lock_(check_) routines are called in the paths where the rwsem * is not necessarily held. @@ -415,7 +423,7 @@ void exit_shm(struct task_struct *task) list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) { shp->shm_creator = NULL; - if (shm_may_destroy(ns, shp)) { + if (is_shm_in_ns(ns, shp) && shm_may_destroy(ns, shp)) { shm_lock_by_ptr(shp); shm_destroy(ns, shp); } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 v2 0/5] part23 part
--- Alexander Mikhalitsyn (1): shm: skip shm_destroy if task IPC namespace was changed Pavel Tikhomirov (3): ve: add get_curr_ve helper ve: get_curr_ve: first try getting ve with rcu lock cgroup/net_prio: virtualize ifpriomap per-ve Vladimir Davydov (1): ve/fs/inotify: do not impose limit on the number of instances by default fs/notify/inotify/inotify_user.c |4 +- include/linux/cgroup.h |1 + ipc/shm.c| 10 + kernel/cgroup/cgroup.c | 42 ++ kernel/ve/ve.c |2 + net/core/netprio_cgroup.c| 73 +- 6 files changed, 126 insertions(+), 6 deletions(-) -- Signed-off-by: Kirill Tkhai ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 1/3] shm: skip shm_destroy if task IPC namespace was changed
From: Alexander Mikhalitsyn Fixes: ab602f79915 ("shm: make exit_shm work proportional to task activity") (ms commit) https://jira.sw.ru/browse/PSBM-131142 Signed-off-by: Alexander Mikhalitsyn Signed-off-by: Kirill Tkhai --- ipc/shm.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ipc/shm.c b/ipc/shm.c index ab749be6d8b7..fb4e58375802 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -173,6 +173,14 @@ static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace return container_of(ipcp, struct shmid_kernel, shm_perm); } +static inline bool is_shm_in_ns(struct ipc_namespace *ns, struct shmid_kernel *shp) +{ + int idx = ipcid_to_idx(shp->shm_perm.id); + struct shmid_kernel *tshp = shm_obtain_object(ns, idx); + + return !IS_ERR(tshp) && tshp == shp; +} + /* * shm_lock_(check_) routines are called in the paths where the rwsem * is not necessarily held. @@ -415,7 +423,7 @@ void exit_shm(struct task_struct *task) list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) { shp->shm_creator = NULL; - if (shm_may_destroy(ns, shp)) { + if (is_shm_in_ns(ns, shp) && shm_may_destroy(ns, shp)) { shm_lock_by_ptr(shp); shm_destroy(ns, shp); } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 0/3] part23 part
https://jira.sw.ru/browse/PSBM-134015 --- Alexander Mikhalitsyn (1): shm: skip shm_destroy if task IPC namespace was changed Pavel Tikhomirov (1): cgroup/net_prio: virtualize ifpriomap per-ve Vladimir Davydov (1): commit 22b5a8a84548 fs/notify/inotify/inotify_user.c |4 +- ipc/shm.c| 10 + net/core/netprio_cgroup.c| 73 +- 3 files changed, 82 insertions(+), 5 deletions(-) -- Signed-off-by: Kirill Tkhai ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 2/3] cgroup/net_prio: virtualize ifpriomap per-ve
From: Pavel Tikhomirov Ifpriomap is a map of net_prio cgroup id to device prio. Each process is in some netprio cgroup and all sockets of this process have prio cgroup id of this cgroup. When packet from such socket goes through network stack we choose priority for packet on each device we go through based on these device+id->prio map. Previously we were able to set map for each net_prio cgroup on the system, but only for devices of host init network namespace. This patch adds mapping for ve init netns devices. VE can only get/change device map for ve init netns, Host can only get/change device map for host's init netns. We can have for same cgroup both mappings setup by host for host net devices and mappings setup by ve for ve net devices. When new cgroup is created it either copies only mappings for host network devices if done from host, or copies also mappings for ve network devices if done from ve. If ve is not running (ve_ns is NULL), even while in ve we would operate with host ifpriomap. https://jira.sw.ru/browse/PSBM-123766 Signed-off-by: Pavel Tikhomirov cgroup: ifpriomap virtualization I've also added get_curr_ve() helper as it looks like in many places we rely that get_exec_env() gives us ve which would not free under us, but all processes can be moved easily from this ve in parallel and ve can be freed AFAICS. 
https://jira.sw.ru/browse/PSBM-123766 Signed-off-by: Kirill Tkhai --- net/core/netprio_cgroup.c | 73 - 1 file changed, 71 insertions(+), 2 deletions(-) diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 99a431c56f23..0ab8c37c42b8 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -145,6 +146,7 @@ cgrp_css_alloc(struct cgroup_subsys_state *parent_css) static int cgrp_css_online(struct cgroup_subsys_state *css) { struct cgroup_subsys_state *parent_css = css->parent; + struct ve_struct *ve; struct net_device *dev; int ret = 0; @@ -166,6 +168,38 @@ static int cgrp_css_online(struct cgroup_subsys_state *css) if (ret) break; } + + /* get_exec_env is safe under cgroup_mutex */ + ve = get_exec_env(); + /* +* Inherit prios from the parent cgroup in scope of ve init netns. +*/ + if (!ve_is_super(ve)) { + struct nsproxy *ve_ns; + struct net *net = NULL; + + /* +* Take rcu read lock to check that ve's net is not freed under +* us after we release rcu read lock we still have rtnl lock to +* insure net remains non-freed, pairs with rtnl lock in +* cleanup_net(). +*/ + rcu_read_lock(); + ve_ns = rcu_dereference(ve->ve_ns); + if (ve_ns) + net = ve_ns->net_ns; + rcu_read_unlock(); + + if (net && net != &init_net) { + for_each_netdev(net, dev) { + u32 prio = netprio_prio(parent_css, dev); + + ret = netprio_set_prio(css, dev, prio); + if (ret) + break; + } + } + } rtnl_unlock(); return ret; } @@ -182,19 +216,38 @@ static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) static int read_priomap(struct seq_file *sf, void *v) { + struct ve_struct *ve; + struct net *net, *_net = NULL; struct net_device *dev; + ve = get_curr_ve(); + if (!ve_is_super(ve)) { + struct nsproxy *ve_ns; + + rcu_read_lock(); + ve_ns = rcu_dereference(ve->ve_ns); + if (ve_ns) + _net = get_net(ve_ns->net_ns); + rcu_read_unlock(); + } + put_ve(ve); + + net = _net ? 
: &init_net; rcu_read_lock(); - for_each_netdev_rcu(&init_net, dev) + for_each_netdev_rcu(net, dev) seq_printf(sf, "%s %u\n", dev->name, netprio_prio(seq_css(sf), dev)); rcu_read_unlock(); + if (_net) + put_net(_net); return 0; } static ssize_t write_priomap(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { + struct ve_struct *ve; + struct net *net, *_net = NULL; char devname[IFNAMSIZ + 1]; struct net_device *dev; u32 prio; @@ -203,7 +256,22 @@ static ssize_t write_priomap(struct kernfs_open_file *of, if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio) != 2)
[Devel] [PATCH RH9 3/3] commit 22b5a8a84548
From: Vladimir Davydov ve/fs/inotify: do not impose limit on the number of instances by default In Vz7 we haven't switched to user ns yet. As a result, all containers use the same user_struct for the same user id. This leads to hitting fs.inotify.max_user_instances sysctl limit quickly (it equals 128 by default) and failing to start a container. This patch sets the default limit to INT_MAX. This is a temporary solution and should be reverted once we start using user ns. In PCS6 there is no such problem, because we actually create a user ns per container there. Although its functionality is basic in comparison to Vz7, it still results in creating a new user_struct for each user inside a container so that the inotify limit is containerized. https://jira.sw.ru/browse/PSBM-39048 Signed-off-by: Vladimir Davydov khorenko@: to be reverted once we support userns in Virtuozzo 7 (cherry picked from 78c91a02de6b6f0423e12e12128f9433934d7c61) Signed-off-by: Valeriy Vdovin https://jira.sw.ru/browse/PSBM-131634 Signed-off-by: Valeriy Vdovin khorenko@: TODO: we have to review all places along this path of using inotifies and make sure all allocations are accounted to Containers. --- fs/notify/inotify/inotify_user.c |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 62051247f6d2..d30a459136b6 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -847,8 +847,8 @@ static int __init inotify_user_setup(void) SLAB_PANIC|SLAB_ACCOUNT); inotify_max_queued_events = 16384; - init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128; - init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = watches_max; + init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = INT_MAX; + init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = INT_MAX; return 0; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9] ve/devtmpfs: lightweight virtualization
From: Stanislav Kinsburskiy Due to changes in RH8.4 we need to rework it; actually the logic becomes much simpler: we mount/umount a single tmpfs per ve on cgroup creation/removal, and all actual devtmpfs mount calls only increase a refcount on the corresponding ve's mount, like with the host's devtmpfs. Original commit message: Previously, we implemented full-featured devtmpfs virtualization for VE: when a device is created in a VE "namespace", we send a signal to kdevtmpfs to create the devnode on devtmpfs mount corresponding to the VE. This seems to be over-complicated: all this work can be done from userspace, because we only have a hardcoded list of devices created exclusively for VE on container start. Those are tty-related stuff and mem devices, and we only need the latter to create devtmpfs nodes. Moreover, it is buggy: ve_stop_ns, which destroys VE devtmpfs mount can be called before a VE tty device is unregistered, resulting in a KP: https://jira.sw.ru/browse/PSBM-35077 This patch therefore simplifies it. It makes the kernel only provide a single empty tmpfs mount per VE, which appears on an attempt to mount devtmpfs from inside a VE. The content of the fs is to be filled by the userspace on container start, which will be done in the scope of https://jira.sw.ru/browse/PSBM-35146 All this patch does is provide each VE with its own empty single tmpfs mount, which appears on an attempt to mount "devtmpfs". It's up to the userspace to populate this fs on container start, all kernel requests to create a device node inside a VE are ignored. 
Signed-off-by: Vladimir Davydov Signed-off-by: Stanislav Kinsburskiy https://jira.sw.ru/browse/PSBM-131158 Signed-off-by: Pavel Tikhomirov v2 by khorenko@: s/FS_USERNS_MOUNT/FS_VE_MOUNT/ Signed-off-by: Kirill Tkhai --- drivers/base/devtmpfs.c | 24 include/linux/device.h |2 ++ include/linux/ve.h |2 ++ kernel/ve/ve.c |6 ++ 4 files changed, 34 insertions(+) diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index 8be352ab4ddb..b3a3cbe65daa 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "base.h" static struct task_struct *thread; @@ -59,6 +60,12 @@ static struct dentry *public_dev_mount(struct file_system_type *fs_type, int fla const char *dev_name, void *data) { struct super_block *s = mnt->mnt_sb; +#ifdef CONFIG_VE + struct ve_struct *ve = get_exec_env(); + + if (!ve_is_super(ve)) + s = ve->devtmpfs_mnt->mnt_sb; +#endif atomic_inc(&s->s_active); down_write(&s->s_umount); return dget(s->s_root); @@ -79,6 +86,7 @@ static struct file_system_type internal_fs_type = { static struct file_system_type dev_fs_type = { .name = "devtmpfs", .mount = public_dev_mount, + .fs_flags = FS_VIRTUALIZED | FS_VE_MOUNT, }; #ifdef CONFIG_BLOCK @@ -438,6 +446,22 @@ static int __ref devtmpfsd(void *p) return 0; } +int ve_mount_devtmpfs(struct ve_struct *ve) +{ + char opts[] = "mode=0755"; + struct vfsmount *mnt; + + mnt = vfs_kern_mount(&internal_fs_type, 0, "devtmpfs", opts); + if (IS_ERR(mnt)) { + printk(KERN_ERR "CT#%s: devtmpfs: unable to create devtmpfs %ld\n", + ve_name(ve), PTR_ERR(mnt)); + return PTR_ERR(mnt); + } + ve->devtmpfs_mnt = mnt; + + return 0; +} + /* * Create devtmpfs instance, driver-core devices will add their device * nodes here. 
diff --git a/include/linux/device.h b/include/linux/device.h index 65d84b67b024..8b1511b1af44 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -950,8 +950,10 @@ bool kill_device(struct device *dev); #ifdef CONFIG_DEVTMPFS int devtmpfs_mount(void); +extern int ve_mount_devtmpfs(struct ve_struct *ve); #else static inline int devtmpfs_mount(void) { return 0; } +static inline int ve_mount_devtmpfs(struct ve_struct *ve) { return 0; } #endif /* drivers/base/power/shutdown.c */ diff --git a/include/linux/ve.h b/include/linux/ve.h index ffe068ec5fe7..e8514c5a0afb 100644 --- a/include/linux/ve.h +++ b/include/linux/ve.h @@ -26,6 +26,7 @@ struct nsproxy; struct veip_struct; struct user_namespace; struct cn_private; +struct vfsmount; struct ve_struct { struct cgroup_subsys_state css; @@ -103,6 +104,7 @@ struct ve_struct { unsigned long aio_nr; unsigned long aio_max_nr; #endif + struct vfsmount *devtmpfs_mnt; }; struct ve_devmnt { diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index 38ede55d65b7..af46a9b597df 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -7
Re: [Devel] [PATCH RH9 00/20] part 20 modules autoload
commited On 08.10.2021 12:50, Kirill Tkhai wrote: > --- > > Andrey Ryabinin (2): > ve/kmod/whitelist: Allow ip6tables_raw modules autoload upon request > from CT > ve/kmod/whitelist: Allow nf_tables module autoloading on request from CT > > Cyrill Gorcunov (1): > ve/kmod/whitelist: Add modules to whitelist for c/r sake > > Kirill Tkhai (2): > ve/kmod/whitelist: Allow conntrack nft-helper-* modules autoloading > ve/kmod/whitelist: Allow ts_kmp module autoloading > > Konstantin Khorenko (7): > ve/sysctl/kmod: Introduce tweak to allow indirect modules load from CT > ve/kmod/whitelist: Infrustructure for list of modules to autoload from > CT > ve/kmod: Honor modprobe blacklist on indirect modules autoload from CT > commit 04248b3ff00d > commit da8c1e2262f8 > ve/kmod/whitelist: Allow nfnetlink_queue module autoload from CT > ve/kmod/whitelist: Allow "nft_compat" module autoload from inside a > Container > > Pavel Tikhomirov (6): > ve/kmod/whitelist: Allow dummy module autoloading > ve/kmod/whitelist: Enable vxlan module autoload from inside a Container > ve/kmod/whitelist: Allow IPVS modules autoload in CT > ve/kmod/whitelist: Allow netfilter/ipset modules autoload from inside a > CT > ve/kmod/whitelist: make nfnetlink_log autoloadable upon request from a > CT > ve/kmod/whitelist: Make fib modules autoloadable from CT > > Stanislav Kinsburskiy (1): > ve/kmod/whitelist: Allow NFS modules autoload in Containers > > Vasily Averin (1): > ve/kmod/whitelist: Enable autoload for iptables security tables from > inside CT > > > include/linux/kmod.h |5 + > include/linux/sysctl.h |2 > kernel/kmod.c | 195 > +++++--- > kernel/sysctl.c| 16 > 4 files changed, 207 insertions(+), 11 deletions(-) > > -- > Signed-off-by: Kirill Tkhai > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH RH9 0/8] part 29 vtty: vz console
On 07.10.2021 18:18, Cyrill Gorcunov wrote: > Hi! Here is a ported vtty series, build and boot tested only obviously. > I think we might revisit this code and rework more deeply once we manage > to run containers inside so I would be able to manipulate vtty console > from userspace level. > > https://jira.sw.ru/browse/PSBM-134014 commited > > Andrey Vagin (1): > ve/kbd: add file kbd_bind in sysfs, which allow unbind keyboard from > tty (v2) > > Cyrill Gorcunov (6): > ve/tty: Provide interface for current tty inheritance > ve/tty: vt -- Implement per VE support for console and terminals > ve/vtty: Don't zap termios fields on slave peer > ve/vtty: Make indices to match pcs6 scheme > ve/vtty: Don't close unread master peer if slave is nonzero > ve/vtty: Don't free console mapping until no clients left > > Konstantin Khlebnikov (1): > ve/tty: TIOSAK Secure Attention Key ioctl > > arch/powerpc/include/uapi/asm/ioctls.h | 2 + > drivers/input/input.c | 2 +- > drivers/tty/n_tty.c| 6 + > drivers/tty/pty.c | 528 + > drivers/tty/tty_io.c | 73 +++- > drivers/tty/vt/keyboard.c | 74 +++- > include/linux/ve.h | 13 +- > include/uapi/asm-generic/ioctls.h | 2 + > kernel/ve/ve.c | 91 + > kernel/ve/vecalls.c| 3 + > 10 files changed, 781 insertions(+), 13 deletions(-) > > > base-commit: dd9fd627ae5764d17efa1a432a7b771d65de1c71 > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 19/20] ve/kmod/whitelist: make nfnetlink_log autoloadable upon request from a CT
From: Pavel Tikhomirov We see that k8s_weave-npc container fails with: Thu Oct 29 09:19:53 2020 <5> ulogd.c:981 building new pluginstance stack: 'log1:NFLOG,base1:BASE,pcap1:PCAP' Thu Oct 29 09:19:53 2020 <7> ulogd_inppkt_NFLOG.c:552 unable to bind to log group 86 Thu Oct 29 09:19:53 2020 <7> ulogd.c:948 error starting `log1' Thu Oct 29 09:19:53 2020 <8> ulogd.c:1597 not even a single working plugin stack Fatal error. It needs nfnetlink_log module to be loaded. Need this to be able to run kubernetes in centos-8 containers where it uses nft logs. https://jira.sw.ru/browse/PSBM-121652 Signed-off-by: Pavel Tikhomirov (cherry picked from vz7 commit fdec083048f8 ("ve/kmod: make nfnetlink_log autoloadable upon request from a CT") Signed-off-by: Konstantin Khorenko --- kernel/kmod.c |1 + 1 file changed, 1 insertion(+) diff --git a/kernel/kmod.c b/kernel/kmod.c index f3bd4afb81e1..1ff56543f59d 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -254,6 +254,7 @@ static const char * const ve0_allowed_mod[] = { "nfnetlink-subsys-1", /* NFNL_SUBSYS_CTNETLINK */ "nfnetlink-subsys-2", /* NFNL_SUBSYS_CTNETLINK_EXP */ "nfnetlink-subsys-3", /* NFNL_SUBSYS_QUEUE */ + "nfnetlink-subsys-4", /* NFNL_SUBSYS_ULOG */ "nfnetlink-subsys-10", /* nf_tables */ "nfnetlink-subsys-11", /* nft_compat */ ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 20/20] ve/kmod/whitelist: Make fib modules autoloadable from CT
From: Pavel Tikhomirov Need it for Docker nat rules c/r in nft based environment. https://jira.sw.ru/browse/PSBM-125002 Signed-off-by: Pavel Tikhomirov ve/kmod: fix misprint in fib modules autoload allow rules When testing criu to suspend/resume "fib" rules I found out that we have wrong names for fib module aliases, and thus can't load them on restore if they are not yet loaded. Perf shows when restoring centos 8 CT with docker: probe:module_payload_iptable_allowed: module_string="nft-expr-2-fib" https://jira.sw.ru/browse/PSBM-125002 Fixes: 84beb0e73874a ("ve/kmod: make fib modules autoloadable from CT") Signed-off-by: Pavel Tikhomirov (cherry picked from vz7 commit f4eb6e8a5a78 ("ve/kmod: make fib modules autoloadable from CT")) Signed-off-by: Konstantin Khorenko --- kernel/kmod.c |1 + 1 file changed, 1 insertion(+) diff --git a/kernel/kmod.c b/kernel/kmod.c index 1ff56543f59d..678735dbb969 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -232,6 +232,7 @@ static const char * const ve0_allowed_mod[] = { "nf_synproxy_core", "nft-set", + "nft_fib", "nf_tproxy_ipv4", "nf_tproxy_ipv6", ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 18/20] ve/kmod/whitelist: Enable autoload for iptables security tables from inside CT
From: Vasily Averin This patch enables autoload of iptable_security and ip6table_security from inside containers. It decreases the number of errors generated during firewalld start. https://jira.sw.ru/browse/PSBM-98212 Signed-off-by: Vasily Averin (cherry picked from vz7 commit 77a471044478 ("ve/kmod: enable autoload for iptables security tables from inside CT")) Signed-off-by: Konstantin Khorenko --- kernel/kmod.c |2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/kmod.c b/kernel/kmod.c index 3a445d4e2734..f3bd4afb81e1 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -206,10 +206,12 @@ static const char * const ve0_allowed_mod[] = { "ip6_tables", "iptable_filter", "iptable_raw", + "iptable_security", "iptable_nat", "iptable_mangle", "ip6table_filter", "ip6table_raw", + "ip6table_security", "ip6table_nat", "ip6table_mangle", ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 15/20] ve/kmod/whitelist: Allow ts_kmp module autoloading
Otherwise rules like the one below can't be applied from inside a CT when the module is not loaded. $iptables -I FORWARD -m string --string "xx" --algo kmp --to 65535 -j DROP https://jira.sw.ru/browse/PSBM-97729 Signed-off-by: Kirill Tkhai (cherry picked from vz7 commit 2e3b2c332d41 ("ve/modules: Add ts_kmp to allowed modules")) Signed-off-by: Konstantin Khorenko --- kernel/kmod.c |3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/kmod.c b/kernel/kmod.c index da0e72fe7de7..68aeed6587d6 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -298,6 +298,9 @@ static const char * const ve0_allowed_mod[] = { "ip_vs_sh", "ip_vs_lblcr", "ip_vs_lc", + + /* string */ + "ts_kmp", }; /* ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 10/20] ve/kmod/whitelist: Allow netfilter/ipset modules autoload from inside a CT
From: Pavel Tikhomirov I forgot to allow in CT autoload of needed modules, so do: ip_set_list_set ip_set_hash_netiface ip_set_hash_ipportnet ip_set_hash_netport ip_set_hash_net ip_set_hash_ipportip ip_set_hash_ipport ip_set_hash_ip ip_set_bitmap_port ip_set_bitmap_ipmac ip_set_bitmap_ip ip_set https://jira.sw.ru/browse/PSBM-46102 Signed-off-by: Pavel Tikhomirov (cherry picked from vz7 commit 1af0b905877a ("ve/netfilter/ipset: allow modules autoload")) Signed-off-by: Konstantin Khorenko --- kernel/kmod.c | 14 ++ 1 file changed, 14 insertions(+) diff --git a/kernel/kmod.c b/kernel/kmod.c index be0908452d7b..6acc4d943283 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -260,6 +260,20 @@ static const char * const ve0_allowed_mod[] = { /* netlink_diag */ "net-pf-16-proto-4-type-16",/* PF_NETLINK, NETLINK_SOCK_DIAG, AF_NETLINK */ + /* ip_set */ + "nfnetlink-subsys-6", /* NFNL_SUBSYS_IPSET */ + "ip_set_bitmap:ip", + "ip_set_bitmap:ip,mac", + "ip_set_bitmap:port", + "ip_set_hash:ip", + "ip_set_hash:ip,port", + "ip_set_hash:ip,port,ip", + "ip_set_hash:net", + "ip_set_hash:net,port", + "ip_set_hash:ip,port,net", + "ip_set_hash:net,iface", + "ip_set_list:set", + "rtnl-link-dummy", "rtnl-link-vxlan", ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 16/20] ve/kmod/whitelist: Allow nf_tables module autoloading on request from CT
From: Andrey Ryabinin Allow nf_tables.ko module autoloading from CT. Needed for iptables in CentOS 8. https://jira.sw.ru/browse/PSBM-98211 Signed-off-by: Andrey Ryabinin (cherry picked from vz7 commit 18c67099330a ("ve/kmod, nf_tables: allow nf_tables.ko autoloading on request from ve.")) Signed-off-by: Konstantin Khorenko --- kernel/kmod.c |1 + 1 file changed, 1 insertion(+) diff --git a/kernel/kmod.c b/kernel/kmod.c index 68aeed6587d6..f79970fa75e1 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -252,6 +252,7 @@ static const char * const ve0_allowed_mod[] = { "nfnetlink-subsys-1", /* NFNL_SUBSYS_CTNETLINK */ "nfnetlink-subsys-2", /* NFNL_SUBSYS_CTNETLINK_EXP */ "nfnetlink-subsys-3", /* NFNL_SUBSYS_QUEUE */ + "nfnetlink-subsys-10", /* nf_tables */ /* unix_diag */ "net-pf-16-proto-4-type-1", /* PF_NETLINK, NETLINK_SOCK_DIAG, AF_LOCAL */ ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 17/20] ve/kmod/whitelist: Allow "nft_compat" module autoload from inside a Container
From: Konstantin Khorenko A Container with CentOS 8 inside uses nft by default and iptables work in a legacy mode, for that "nft_compat" is required, so allow its autoload. [CT]# iptables -A INPUT -m tos --tos Minimize-Cost -j REJECT iptables v1.8.2 (nf_tables): Couldn't load match `tos':No such file or directory https://jira.sw.ru/browse/PSBM-98948 Signed-off-by: Konstantin Khorenko Acked-by: Andrey Ryabinin (cherry picked from vz7 commit f247ccddb3f9 ("ve/kmod: allow "nft_compat" module autoload from inside a Container")) Signed-off-by: Konstantin Khorenko --- kernel/kmod.c |1 + 1 file changed, 1 insertion(+) diff --git a/kernel/kmod.c b/kernel/kmod.c index f79970fa75e1..3a445d4e2734 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -253,6 +253,7 @@ static const char * const ve0_allowed_mod[] = { "nfnetlink-subsys-2", /* NFNL_SUBSYS_CTNETLINK_EXP */ "nfnetlink-subsys-3", /* NFNL_SUBSYS_QUEUE */ "nfnetlink-subsys-10", /* nf_tables */ + "nfnetlink-subsys-11", /* nft_compat */ /* unix_diag */ "net-pf-16-proto-4-type-1", /* PF_NETLINK, NETLINK_SOCK_DIAG, AF_LOCAL */ ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 13/20] ve/kmod/whitelist: Allow nfnetlink_queue module autoload from CT
From: Konstantin Khorenko + "nfnetlink-subsys-3", /* NFNL_SUBSYS_QUEUE */ https://jira.sw.ru/browse/PSBM-92694 Signed-off-by: Konstantin Khorenko (cherry picked from vz7 commit 588834a3e83f ("ve/netfilter/ipset: allow nfnetlink_queue module autoload")) Signed-off-by: Konstantin Khorenko --- kernel/kmod.c |1 + 1 file changed, 1 insertion(+) diff --git a/kernel/kmod.c b/kernel/kmod.c index 36115c12b46c..b84bfdf216ff 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -251,6 +251,7 @@ static const char * const ve0_allowed_mod[] = { "net-pf-16-proto-12", /* PF_NETLINK, NETLINK_NETFILTER */ "nfnetlink-subsys-1", /* NFNL_SUBSYS_CTNETLINK */ "nfnetlink-subsys-2", /* NFNL_SUBSYS_CTNETLINK_EXP */ + "nfnetlink-subsys-3", /* NFNL_SUBSYS_QUEUE */ /* unix_diag */ "net-pf-16-proto-4-type-1", /* PF_NETLINK, NETLINK_SOCK_DIAG, AF_LOCAL */ ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 12/20] ve/kmod/whitelist: Allow NFS modules autoload in Containers
From: Stanislav Kinsburskiy Otherwise Container online migration can fail. https://jira.sw.ru/browse/PSBM-58178 Signed-off-by: Stanislav Kinsburskiy Reviewed-by: Dmitry Safonov (cherry picked from vz7 commit d6e47c05b868 ("ve/modules: allow NFS modules autoload in Containers")) Signed-off-by: Konstantin Khorenko --- kernel/kmod.c |4 1 file changed, 4 insertions(+) diff --git a/kernel/kmod.c b/kernel/kmod.c index 6544d56a3f96..36115c12b46c 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -278,6 +278,10 @@ static const char * const ve0_allowed_mod[] = { "rtnl-link-dummy", "rtnl-link-vxlan", + /* NFS */ + "nfsv3", + "nfsv4", + /* IPVS */ "ip_vs_ftp", "ip_vs_nq", ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 14/20] ve/kmod/whitelist: Allow conntrack nft-helper-* modules autoloading
Otherwise, in case of destination node does not have modules loaded, CT migration fails. https://jira.sw.ru/browse/PSBM-90319 Signed-off-by: Kirill Tkhai (cherry picked from vz7 commit c92758e6ea45 ("net: Allow autoloading conntrack nft-helper-* modules")) Signed-off-by: Konstantin Khorenko --- kernel/kmod.c |4 1 file changed, 4 insertions(+) diff --git a/kernel/kmod.c b/kernel/kmod.c index b84bfdf216ff..da0e72fe7de7 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -336,6 +336,10 @@ bool module_payload_allowed(const char *module) !strncmp("nfct-helper-",module, 12)) return true; + /* nfct-helper-* modules */ + if (!strncmp("nfct-helper-", module, 12)) + return true; + return false; } #endif /* CONFIG_VE */ ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 11/20] ve/kmod/whitelist: Allow ip6tables_raw modules autoload upon request from CT
From: Andrey Ryabinin Currently autoloading of the ip6table_raw module is forbidden from container, leading to: ip6tables-restore v1.4.21: ip6tables-restore: unable to initialize table 'raw' If use of ip6tables is allowed in container, autoloading of the ip6tables_raw has to be permitted as well. https://jira.sw.ru/browse/PSBM-50548 Signed-off-by: Andrey Ryabinin Acked-by: Kirill Tkhai (cherry picked from vz7 commit 7bc4ff4c5928 ("ve/net/ip6tables: fix autoloading of the ip6table_raw module from CT")) Signed-off-by: Konstantin Khorenko --- kernel/kmod.c |1 + 1 file changed, 1 insertion(+) diff --git a/kernel/kmod.c b/kernel/kmod.c index 6acc4d943283..6544d56a3f96 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -209,6 +209,7 @@ static const char * const ve0_allowed_mod[] = { "iptable_nat", "iptable_mangle", "ip6table_filter", + "ip6table_raw", "ip6table_nat", "ip6table_mangle", ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 09/20] ve/kmod/whitelist: Allow IPVS modules autoload in CT
From: Pavel Tikhomirov we still need to add ip_vs module in /etc/modules-load.d/vz.conf to be able to use ipvs in CT, all other modules are request_module'ed from ip_vs. https://jira.sw.ru/browse/PSBM-63883 Signed-off-by: Pavel Tikhomirov Reviewed-by: Andrew Vagin (cherry picked from vz7 commit 8852410899b0 ("ve/net/ipvs: allow IPVS modules autoload in CT")) Signed-off-by: Konstantin Khorenko --- kernel/kmod.c | 16 1 file changed, 16 insertions(+) diff --git a/kernel/kmod.c b/kernel/kmod.c index ddf44c79c851..be0908452d7b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -262,6 +262,22 @@ static const char * const ve0_allowed_mod[] = { "rtnl-link-dummy", "rtnl-link-vxlan", + + /* IPVS */ + "ip_vs_ftp", + "ip_vs_nq", + "ip_vs_wlc", + "ip6t_ipvs", + "ipt_ipvs", + "ip_vs_rr", + "ip_vs_pe_sip", + "ip_vs_lblc", + "ip_vs_wrr", + "ip_vs_sed", + "ip_vs_dh", + "ip_vs_sh", + "ip_vs_lblcr", + "ip_vs_lc", }; /* ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 08/20] ve/kmod/whitelist: Enable vxlan module autoload from inside a Container
From: Pavel Tikhomirov vxlan is safe in CT as: 1) Udp multicast socket to connect to outer word sits in creation net- namespace, and these socket can get packets only forwarded/routed in creation ns. 2) Vxlan device is owned by second netns(could be same as first) as any other network device, so same all packets come to it are from the same ns. 3) Vxlans logic works through vxlan_net placed on creation netns, vxlan_fdb and vxlan_rdst are per vxlan device. Thus entries can not intersec with entries from host and other CTs. * One problem I can see now is adding fdb with ifindex(index of device to route packets from UDP socket through) after vxlan is moved to second namespace in vxlan_fdb_parse we use second namespace to check ifindex by device lookup, but in vxlan_xmit_one->ip_route_output_key->...->__ip_route_output_key we use first(creation) namespace to lookup device and probably will fail. So all fdb configuration should go before moving to ns. Same is in mainstream AFAICS. https://jira.sw.ru/browse/PSBM-53629 Signed-off-by: Pavel Tikhomirov Acked-by: Andrei Vagin khorenko@: Docker Swarm requires vxlans. (cherry picked from vz7 commit d5805ee4d748 ("ve/net/vxlan: enable support and autoload in a container")) Signed-off-by: Konstantin Khorenko --- kernel/kmod.c |1 + 1 file changed, 1 insertion(+) diff --git a/kernel/kmod.c b/kernel/kmod.c index 9a24a65deecb..ddf44c79c851 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -261,6 +261,7 @@ static const char * const ve0_allowed_mod[] = { "net-pf-16-proto-4-type-16",/* PF_NETLINK, NETLINK_SOCK_DIAG, AF_NETLINK */ "rtnl-link-dummy", + "rtnl-link-vxlan", }; /* ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 07/20] ve/kmod/whitelist: Allow dummy module autoloading
From: Pavel Tikhomirov After allowing dummy devices in a container in bug PSBM-43329 by commit 6061ed860950 ("ve/net/dummy: enable support in a container"), the docker-ui testcase TestDaemonIP was un-XFAILed, but it still fails to create a dummy device because the module is not automatically loaded in rtnl_newlink: ip link add name dummy_test type dummy RTNETLINK answers: Operation not supported So allow the module. https://jira.sw.ru/browse/PSBM-52061 Signed-off-by: Pavel Tikhomirov Reviewed-by: Kirill Tkhai (cherry picked from vz7 commit 3919de0d2585ac861ed237b9b585f2e3bb2e59bd) Signed-off-by: Konstantin Khorenko --- kernel/kmod.c |2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/kmod.c b/kernel/kmod.c index 36420d60cce2..9a24a65deecb 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -259,6 +259,8 @@ static const char * const ve0_allowed_mod[] = { /* netlink_diag */ "net-pf-16-proto-4-type-16",/* PF_NETLINK, NETLINK_SOCK_DIAG, AF_NETLINK */ + + "rtnl-link-dummy", }; /* ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 06/20] ve/kmod/whitelist: Add modules to whitelist for c/r sake
From: Cyrill Gorcunov When doing checpoint/restore during migration we use netlink sockets with diag functionality to fetch various information we need. In particular when restoring on the machine where say netfilter modules are not loaded we fail with | [root@s175 ~]# less /vz/dump/rst-iVS9OC-16.05.04-22.32/criu_restore.11.log | (00.151066) 1: Running ip addr restore | RTNETLINK answers: File exists | RTNETLINK answers: File exists | (00.152641) 1: Running ip route restore | (00.175144) 1: Running ip route restore | (00.184676) 1: Running ip rule delete | (00.186448) 1: Running ip rule delete | (00.188191) 1: Running ip rule delete | (00.190054) 1: Running ip rule restore | (00.191964) 1: Running iptables-restore for iptables-restore | (00.200958) 1: Running ip6tables-restore for ip6tables-restore | >(00.203833) 1: Error (net.c:466): Can't open rtnl sock for net dump: Protocol not supported | (00.229107) Error (cr-restore.c:1407): 15091 killed by signal 9: Killed | (00.229192) Switching to new ns to clean ghosts | (00.241142) uns: calling exit_usernsd (-1, 1) | (00.241173) uns: daemon calls 0x454950 (15085, -1, 1) | (00.241188) uns: `- daemon exits w/ 0 | (00.241570) uns: daemon stopped | (00.241584) Error (cr-restore.c:2248): Restoring FAILED which stands for the following criu code | sk = socket(AF_NETLINK, SOCK_RAW, NETLINK_NETFILTER); | if (sk < 0) { | pr_perror("Can't open rtnl sock for net dump"); | goto out_img; | } because the nfnetlink module is not loaded on the destination machine we're failing. If we would have been running on node the module would be uploaded automatically but restore happens in veX context where modules can't be uploaded. Thus add modules needed for c/r into whitelist, so the criu will upload them automatically. 
https://jira.sw.ru/browse/PSBM-46789 CC: Vladimir Davydov CC: Konstantin Khorenko CC: Andrey Vagin CC: Pavel Emelyanov Signed-off-by: Cyrill Gorcunov +++ ve/kmod: Change modules whitelist to fit their aliases When we do call for specifed sockets such as netlink netfilter, dialog sockets and such we imply that the kernel will autoload them. But previously (e0914131eeb08e6b1953c682be05b9fbcf185f1f "ve/kmod: Add modules to whitelist for c/r sake") I put module names instead of their alises used in net subsystem to determinate which module to load on socket/protocol types. Fix it putting proper names here. Thanks to Vladimir to point the problem. https://jira.sw.ru/browse/PSBM-46789 CC: Konstantin Khorenko CC: Andrey Vagin CC: Pavel Emelyanov Signed-off-by: Cyrill Gorcunov Reviewed-by: Vladimir Davydov +++ ve/kmod: Allow netfilter conntrack inside VE Netfilter conntrack module is used during checkpoint (which is done on node) so the modules get autoloaded but in case of migration the restore starts inside veX so we need to allow the conntrack to be requested from ve context. Thus add them into whitelist. Initially missed them in ebc70d73717f592c89ad992f77587d9e118bbee6. 
https://jira.sw.ru/browse/PSBM-47359 CC: Vladimir Davydov CC: Konstantin Khorenko CC: Andrey Vagin CC: Pavel Emelyanov Signed-off-by: Cyrill Gorcunov https://jira.sw.ru/browse/PSBM-127787 (cherry picked from vz7 commit 7d9c655b08b4397fc04430540fdbc763e56beacb) Signed-off-by: Konstantin Khorenko --- kernel/kmod.c | 25 + 1 file changed, 25 insertions(+) diff --git a/kernel/kmod.c b/kernel/kmod.c index b8ca90bec921..36420d60cce2 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -234,6 +234,31 @@ static const char * const ve0_allowed_mod[] = { "fs-binfmt_misc", "fs-overlay", + + /* inet_diag, inet6_diag */ + "net-pf-16-proto-4-type-2", /* PF_NETLINK, NETLINK_SOCK_DIAG, AF_INET */ + "net-pf-16-proto-4-type-10",/* PF_NETLINK, NETLINK_SOCK_DIAG, AF_INET6 */ + + /* tcp_diag */ + "net-pf-16-proto-4-type-2-6", /* PF_NETLINK, NETLINK_SOCK_DIAG, AF_INET - IPPROTO_TCP */ + + /* udp_diag */ + "net-pf-16-proto-4-type-2-17", /* PF_NETLINK, NETLINK_SOCK_DIAG, AF_INET - IPPROTO_UDP */ + "net-pf-16-proto-4-type-2-136", /* PF_NETLINK, NETLINK_SOCK_DIAG, AF_INET - IPPROTO_UDPLITE */ + + /* nfnetlink */ + "net-pf-16-proto-12", /* PF_NETLINK, NETLINK_NETFILTER */ + "nfnetlink-subsys-1", /* NFNL_SUBSYS_CTNETLINK */ + "nfnetlink-subsys-2", /* NFNL_SUBSYS_CTNETLINK_EXP */ + + /* unix_diag */ + "net-pf-16-proto-4-type-1", /* PF_NETLINK, NETLINK_SOCK_DIAG, AF_LOCAL */ + + /* af_packet_diag */ + "net-pf-16-proto-4-type-17",/* PF_NETLINK, NETLINK_SOCK_DIAG, AF_PACKET */ + + /* netlink_diag */ + "net-pf-16-proto-4-type-16",/* PF_NETLINK, NETLINK_SOCK_DIAG, AF_NETLINK */ }; /* ___ Devel
[Devel] [PATCH RH9 02/20] ve/kmod/whitelist: Infrastructure for list of modules to autoload from CT
From: Konstantin Khorenko https://jira.sw.ru/browse/PSBM-127787 It's a port of following vz7 commits: * 3a4142e ("ve/kmod: Port autoloading from CT") (partially) * 8af13e7c ("ve/kmod: list of allowed to autoload in CT modules") (partially) Signed-off-by: Konstantin Khorenko Signed-off-by: Kirill Tkhai --- include/linux/kmod.h |5 + kernel/kmod.c| 33 + 2 files changed, 38 insertions(+) diff --git a/include/linux/kmod.h b/include/linux/kmod.h index 68f69362d427..d9b8dd81f595 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -32,4 +32,9 @@ static inline int request_module_nowait(const char *name, ...) { return -ENOSYS; #define try_then_request_module(x, mod...) (x) #endif +#ifdef CONFIG_VE +extern bool module_payload_allowed(const char *module); +#else +static inline bool module_payload_allowed(const char *module) { return true; } +#endif #endif /* __LINUX_KMOD_H__ */ diff --git a/kernel/kmod.c b/kernel/kmod.c index 2f9afc601d20..c8506fd92017 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -151,6 +151,10 @@ int __request_module(bool wait, const char *fmt, ...) !ve_allow_module_load) return -EPERM; + /* Check that module functionality is permitted */ + if (!module_payload_allowed(module_name)) + return -EPERM; + ret = security_kernel_module_request(module_name); if (ret) return ret; @@ -182,3 +186,32 @@ int __request_module(bool wait, const char *fmt, ...) return ret; } EXPORT_SYMBOL(__request_module); + +#ifdef CONFIG_VE + +/* ve0 allowed modules */ +static const char * const ve0_allowed_mod[] = { +}; + +/* + * module_payload_allowed - check if module functionality is allowed + * to be used inside current virtual environment. + * + * Returns true if it is allowed or we're in ve0, false otherwise. 
+ */ +bool module_payload_allowed(const char *module) +{ + int i; + + if (ve_is_super(get_exec_env())) + return true; + + /* Look for full module name in ve0_allowed_mod table */ + for (i = 0; i < ARRAY_SIZE(ve0_allowed_mod); i++) { + if (!strcmp(ve0_allowed_mod[i], module)) + return true; + } + + return false; +} +#endif /* CONFIG_VE */ ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 05/20] commit da8c1e2262f8
From: Konstantin Khorenko ve/kmod/whitelist: List of allowed to autoload in CT modules (non-netfilters) The following non-netfilter modules are allowed to be autoloaded from inside a CT: * binfmt_misc * fs-overlay It's a port of vz7 commits: * 8af13e7c ("ve/kmod: list of allowed to autoload in CT modules") (partially) * 264ef13 ("ve/kmod/whitelist: allow overlay fs module autoloading") https://jira.sw.ru/browse/PSBM-127787 Signed-off-by: Konstantin Khorenko to merge --- kernel/kmod.c |3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/kmod.c b/kernel/kmod.c index 7472184200f2..b8ca90bec921 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -231,6 +231,9 @@ static const char * const ve0_allowed_mod[] = { "nft-set", "nf_tproxy_ipv4", "nf_tproxy_ipv6", + + "fs-binfmt_misc", + "fs-overlay", }; /* ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 04/20] commit 04248b3ff00d
From: Konstantin Khorenko ve/kmod/whitelist: Allow iptables/netfilter modules for autoload from CT For now following modules are allowed by default to be autoloaded upon indirect request from inside a Container: * iptables/ip6tables core modules * netfilters core modules (including nf_tables_inet) https://jira.sw.ru/browse/PSBM-99406 * xt_*, ipt_*, ip6t_*, arpt_*, nft-chain-*, nft-expr-*, nf-logger-* modules * ebt* modules: previously we allowed to autoload ebt_* modules only upon request from inside a Container but there are several ebtables_* modules to be allowed as well, thus allow all ebt* modules for that. (Default CentOS7.3 firewalld service inside a CT complains on that) https://jira.sw.ru/browse/PSBM-66435 * all nf_* and nft_* modules https://jira.sw.ru/browse/PSBM-99536 https://jira.sw.ru/browse/PSBM-127787 Signed-off-by: Konstantin Khorenko It's a port of following vz7 commits: * 3a4142e ("ve/kmod: Port autoloading from CT") (partially) * f9422b8 ("ve/kmod: Add rules for autoloading (new) nf_tables") * ccd1a1d ("ve/kmod: Add rules for new {ip, ip6, x}table modules") * fe6a9073 ("ve/kmod: allow to autoload nf_log_ipv[46]") * b221ce6 ("ve/kmod/ebtable: allow to autoload ebtable_* modules from inside a CT") * 24f61ddc955f ("ve/kmod: enable autoload for nf_tables_inet module from inside a CT") * 0995da4719da ("ve/kmod: make all nf_* and nft_* autoloadable upon request from a CT")) Signed-off-by: Konstantin Khorenko --- kernel/kmod.c | 46 ++ 1 file changed, 46 insertions(+) diff --git a/kernel/kmod.c b/kernel/kmod.c index 7915397fcf46..7472184200f2 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -202,6 +202,35 @@ EXPORT_SYMBOL(__request_module); /* ve0 allowed modules */ static const char * const ve0_allowed_mod[] = { + "ip_tables", + "ip6_tables", + "iptable_filter", + "iptable_raw", + "iptable_nat", + "iptable_mangle", + "ip6table_filter", + "ip6table_nat", + "ip6table_mangle", + + "nf-nat", + "nf_conncount", + "nf_defrag_ipv4", + "nf_defrag_ipv6", + 
"nf_dup_ipv4", + "nf_dup_ipv6", + "nf_dup_netdev", + "nf_flow_table", + "nf-flowtable-1", + "nf_flow_table_inet", + "nf_osf", + "nf_reject_ipv6", + "nf_socket_ipv4", + "nf_socket_ipv6", + "nf_synproxy_core", + + "nft-set", + "nf_tproxy_ipv4", + "nf_tproxy_ipv6", }; /* @@ -223,6 +252,23 @@ bool module_payload_allowed(const char *module) return true; } + /* modules allowed by name/alias masks */ + if (!strncmp("xt_", module, 3) || + !strncmp("ip_conntrack",module, 12) || + !strncmp("ip_nat_", module, 7) || + !strncmp("ipt_",module, 4) || + !strncmp("ip6t_", module, 5) || + !strncmp("arpt_", module, 5) || + !strncmp("ebt", module, 4) || + !strncmp("nft-chain-", module, 10) || + !strncmp("nft-expr-", module, 9) || + !strncmp("nf_nat", module, 6) || + !strncmp("nf_log_", module, 7) || + !strncmp("nf-logger-", module, 10) || + !strncmp("nf_conntrack",module, 12) || + !strncmp("nfct-helper-",module, 12)) + return true; + return false; } #endif /* CONFIG_VE */ ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 01/20] ve/sysctl/kmod: Introduce tweak to allow indirect modules load from CT
From: Konstantin Khorenko Introduce "kernel.ve_allow_module_load" sysctl to allow (1) / deny (0) indorect kernel modules load upon requests from inside Containers. Indirect modules "autoload" set enabled by default. https://jira.sw.ru/browse/PSBM-127787 Signed-off-by: Konstantin Khorenko --- include/linux/sysctl.h |2 ++ kernel/kmod.c | 11 +++ kernel/sysctl.c| 16 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 3c59f962f3f6..83ac52e15c73 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -187,6 +187,8 @@ struct ctl_path { extern int trusted_exec; +extern int ve_allow_module_load; + #ifdef CONFIG_SYSCTL void proc_sys_poll_notify(struct ctl_table_poll *poll); diff --git a/kernel/kmod.c b/kernel/kmod.c index a5959c0ecdc2..2f9afc601d20 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include @@ -127,10 +129,6 @@ int __request_module(bool wait, const char *fmt, ...) char module_name[MODULE_NAME_LEN]; int ret; - /* Don't allow request_module() inside VE. */ - if (!ve_is_super(get_exec_env())) - return -EPERM; - /* * We don't allow synchronous module loading from async. Module * init may invoke async_synchronize_full() which will end up @@ -148,6 +146,11 @@ int __request_module(bool wait, const char *fmt, ...) 
if (ret >= MODULE_NAME_LEN) return -ENAMETOOLONG; + /* Check that autoload is not prohibited using /proc interface */ + if (!ve_is_super(get_exec_env()) && + !ve_allow_module_load) + return -EPERM; + ret = security_kernel_module_request(module_name); if (ret) return ret; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 49656fd84639..53090d656dec 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -117,12 +117,17 @@ static int __init set_trusted_exec(char *str) } __setup("trusted_exec", set_trusted_exec); +int ve_allow_module_load = 1; +EXPORT_SYMBOL(ve_allow_module_load); + /* Constants used for minimum and maximum */ #ifdef CONFIG_LOCKUP_DETECTOR static int sixty = 60; #endif static int __maybe_unused neg_one = -1; +static int __maybe_unused zero = 0; +static int __maybe_unused one = 1; static int __maybe_unused two = 2; static int __maybe_unused four = 4; static unsigned long zero_ul; @@ -2362,6 +2367,17 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &two, }, +#endif +#ifdef CONFIG_VE +{ + .procname = "ve_allow_module_load", + .data = &ve_allow_module_load, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif { .procname = "ngroups_max", ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 03/20] ve/kmod: Honor modprobe blacklist on indirect modules autoload from CT
From: Konstantin Khorenko If a kernel modules is requested indirectly from inside a Container, check is this modules is blacklisted on the Node first. https://jira.sw.ru/browse/PSBM-127787 Signed-off-by: Konstantin Khorenko Signed-off-by: Kirill Tkhai --- kernel/kmod.c | 25 ++--- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/kernel/kmod.c b/kernel/kmod.c index c8506fd92017..7915397fcf46 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -64,11 +64,11 @@ char modprobe_path[KMOD_PATH_LEN] = CONFIG_MODPROBE_PATH; static void free_modprobe_argv(struct subprocess_info *info) { - kfree(info->argv[3]); /* check call_modprobe() */ + kfree(info->argv[4]); /* check call_modprobe() */ kfree(info->argv); } -static int call_modprobe(char *module_name, int wait) +static int call_modprobe(char *module_name, int wait, int blacklist) { struct subprocess_info *info; static char *envp[] = { @@ -78,7 +78,7 @@ static int call_modprobe(char *module_name, int wait) NULL }; - char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL); + char **argv = kmalloc(sizeof(char *[6]), GFP_KERNEL); if (!argv) goto out; @@ -88,9 +88,13 @@ static int call_modprobe(char *module_name, int wait) argv[0] = modprobe_path; argv[1] = "-q"; - argv[2] = "--"; - argv[3] = module_name; /* check free_modprobe_argv() */ - argv[4] = NULL; + if (blacklist) + argv[2] = "-b"; + else + argv[2] = "-q"; /* just repeat argv[1] */ + argv[3] = "--"; + argv[4] = module_name; /* check free_modprobe_argv() */ + argv[5] = NULL; info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL, NULL, free_modprobe_argv, NULL); @@ -127,6 +131,7 @@ int __request_module(bool wait, const char *fmt, ...) { va_list args; char module_name[MODULE_NAME_LEN]; + bool blacklist; int ret; /* @@ -154,6 +159,12 @@ int __request_module(bool wait, const char *fmt, ...) 
/* Check that module functionality is permitted */ if (!module_payload_allowed(module_name)) return -EPERM; + /* +* This function may be called from ve0, where standard behaviour +* is not to use blacklist. So, we request blacklist reading only +* if we're inside CT. +*/ + blacklist = !ve_is_super(get_exec_env()); ret = security_kernel_module_request(module_name); if (ret) @@ -178,7 +189,7 @@ int __request_module(bool wait, const char *fmt, ...) trace_module_request(module_name, wait, _RET_IP_); - ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); + ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC, blacklist); atomic_inc(&kmod_concurrent_max); wake_up(&kmod_wq); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 00/20] part 20 modules autoload
--- Andrey Ryabinin (2): ve/kmod/whitelist: Allow ip6tables_raw modules autoload upon request from CT ve/kmod/whitelist: Allow nf_tables module autoloading on request from CT Cyrill Gorcunov (1): ve/kmod/whitelist: Add modules to whitelist for c/r sake Kirill Tkhai (2): ve/kmod/whitelist: Allow conntrack nft-helper-* modules autoloading ve/kmod/whitelist: Allow ts_kmp module autoloading Konstantin Khorenko (7): ve/sysctl/kmod: Introduce tweak to allow indirect modules load from CT ve/kmod/whitelist: Infrustructure for list of modules to autoload from CT ve/kmod: Honor modprobe blacklist on indirect modules autoload from CT commit 04248b3ff00d commit da8c1e2262f8 ve/kmod/whitelist: Allow nfnetlink_queue module autoload from CT ve/kmod/whitelist: Allow "nft_compat" module autoload from inside a Container Pavel Tikhomirov (6): ve/kmod/whitelist: Allow dummy module autoloading ve/kmod/whitelist: Enable vxlan module autoload from inside a Container ve/kmod/whitelist: Allow IPVS modules autoload in CT ve/kmod/whitelist: Allow netfilter/ipset modules autoload from inside a CT ve/kmod/whitelist: make nfnetlink_log autoloadable upon request from a CT ve/kmod/whitelist: Make fib modules autoloadable from CT Stanislav Kinsburskiy (1): ve/kmod/whitelist: Allow NFS modules autoload in Containers Vasily Averin (1): ve/kmod/whitelist: Enable autoload for iptables security tables from inside CT include/linux/kmod.h |5 + include/linux/sysctl.h |2 kernel/kmod.c | 195 +--- kernel/sysctl.c| 16 4 files changed, 207 insertions(+), 11 deletions(-) -- Signed-off-by: Kirill Tkhai ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH RH9 00/26] part 10: connector
Committed On 07.10.2021 15:53, Pavel Tikhomirov wrote: > https://jira.sw.ru/browse/PSBM-133993 > > Stanislav Kinsburskiy (26): > connector: store all private data on VE structure > connector: introduce VE-aware get_cdev() helper > connector: per-ve init and fini helpers introduced > connector: use device stored in VE > connector: per-ve helpers intoruduced > connector: take cn_already_initialized from VE > proc connector: generic proc_event_connector() helper introduced > proc connector: use generic event helper for fork event > proc connector: use generic event helper for exec event > proc connector: use generic event helper for id event > proc connector: use generic event helper for sid event > proc connector: use generic event helper for ptrace event > proc connector: use generic event helper for comm event > proc connector: use generic event helper for coredump event > proc connector: use generic event helper for exit event > proc connector: add pid namespace awareness > proc connector: add per-ve init and fini foutines > proc connector: call proc-related init and fini routines explicitly > proc connector: take number of listeners and per-cpu conters from VE > proc connector: pass VE to event fillers > proc connector: take namespaces from VE > proc connector: use per-ve netlink sender helper > proc connector: send events to both VEs if not in VE#0 > connector: containerize "connector" proc entry > connector: take VE from socket upon callback > connector: add VE SS hook > > drivers/connector/cn_proc.c | 399 ++ > drivers/connector/connector.c | 161 +++--- > include/linux/connector.h | 21 ++ > include/linux/ve.h| 4 + > 4 files changed, 373 insertions(+), 212 deletions(-) > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH RH9 00/22] port part 21
Commited On 07.10.2021 13:57, Andrey Zhadchenko wrote: > Alexander Mikhalitsyn (1): > ms/fs: Revert "Make super_blocks and sb_lock static" > > Andrey Ryabinin (1): > drivers/bnx2x: Limit setting of the max mtu > > Cyrill Gorcunov (3): > ve/fs: Export fs.aio-max-nr via ve cgroup > ve/fs: namespace -- Ignore device permissions during restore > ve/fs: namespace -- Don't fail on permissions if @ve->devmnt_list is > empty > > Kirill Tkhai (4): > fs: Lower ioprio in case of ioprio_set() called from CT > ve/fs/files: Add new argument to expand_files() > ve/fs/files: Add fdtable_align() helper > ve/fs/files: Shrink big fdtable on close in is_pseudosuper mode > > Konstantin Khlebnikov (1): > pidns: add proc mount option 'hidepidns=0|1' > > Konstantin Khorenko (4): > ve/fs/sync: Per containter sync and syncfs and fs.fsync-enable sysctl > ve/fs: Allow mount fs in init userns if it's mounted in another userns > drivers/igb: increase link detection timeout up to 5 sec > net/teql: disable "True" (or "trivial") link equalizer inside a CT > > Maxim Patlasov (1): > fs: FIEMAP should sync only required range with FIEMAP_FLAG_SYNC > > Pavel Tikhomirov (1): > ve/cgroup: Hide ve cgroup in Containers > > Valeriy Vdovin (2): > x86/cpuid_fault: Increase max count of cpuid overrides > x86/cpuid_fault: Log table updates > > Vasily Averin (2): > ve/net/ipv6 tunnels: Enable GRE netdevice register inside container > ve/net/sit: Enable SIT devices in Containers > > Vladimir Davydov (2): > fs: Allow to remove swapfile hardlinks (for ploop images protection) > ve/fs/ioprio: Confine ioprio_{set, get}(IOPRIO_WHO_USER) to current ve > > Documentation/filesystems/proc.rst | 4 + > arch/x86/kernel/cpuid_fault.c | 36 +++- > block/ioprio.c | 24 +++ > drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 8 + > drivers/net/ethernet/intel/igb/igb_main.c | 5 +- > fs/fcntl.c | 2 + > fs/file.c | 92 +++--- > fs/ioctl.c | 3 +- > fs/mount.h | 2 + > fs/namei.c | 3 +- > fs/namespace.c | 33 +++- > fs/open.c | 3 + > 
fs/proc/base.c | 11 +- > fs/proc/inode.c | 2 + > fs/proc/root.c | 12 ++ > fs/super.c | 11 +- > fs/sync.c | 213 > +++- > include/linux/cgroup.h | 4 + > include/linux/cpuid_override.h | 2 +- > include/linux/fs.h | 15 ++ > include/linux/proc_fs.h | 1 + > include/linux/ve.h | 2 + > kernel/cgroup/cgroup-v1.c | 8 +- > kernel/cgroup/cgroup.c | 20 +++ > kernel/ve/ve.c | 40 + > kernel/ve/veowner.c | 8 + > mm/msync.c | 2 + > net/ipv6/ip6_gre.c | 1 + > net/ipv6/ip6_tunnel.c | 2 +- > net/ipv6/sit.c | 1 + > net/sched/sch_teql.c| 3 + > 31 files changed, 523 insertions(+), 50 deletions(-) > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH RH9 00/12] part19 ext4
Committed On 07.10.2021 13:19, Kirill Tkhai wrote: > --- > > Dmitry Monakhov (6): > ext4: Fix error handling after filesystem abort > jbd2: make shure that we do not miss aborted state > jbd2: raid amnesia protection for the journal > ext4: add mfsync support > ext4: add generic uevent infrastructure > ext4: send abort uevent on ext4 journal abort > > Kirill Tkhai (2): > ve/ext3: treat panic_on_errors as remount-ro_on_errors in CTs > ext4: make data=writeback mode safe > > Konstantin Khorenko (4): > ve/fs/namespace: allow submounts in non-init userns > Kconfig.openvz: force CGROUP_PERF if compiling VZ Containers code > ext4: don't iterate over sbi->s_es_list more than the number of elements > ms/Revert "ext4: simplify kobject usage" > > > fs/ext4/ext4.h | 20 ++ > fs/ext4/extents_status.c|8 ++ > fs/ext4/fsync.c | 108 ++ > fs/ext4/inode.c | 11 ++- > fs/ext4/ioctl.c | 60 + > fs/ext4/super.c | 155 > ++- > fs/ext4/sysfs.c | 43 +--- > fs/jbd2/journal.c |3 - > fs/jbd2/recovery.c | 77 + > fs/namespace.c | 25 +++ > include/trace/events/ext4.h | 54 +++ > kernel/Kconfig.openvz |1 > 12 files changed, 544 insertions(+), 21 deletions(-) > > -- > Signed-off-by: Kirill Tkhai > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 11/12] ext4: add generic uevent infrastructure
From: Dmitry Monakhov *Purpose: It is reasonable to announce fs related events via uevent infrastructure. This patch implement only ext4'th part, but IMHO this should be usefull for any generic filesystem. Example: Runtime fs-error is pure async event. Currently there is no good way to handle this situation and inform user-space about this. *Implementation: Add uevent infrastructure similar to dm uevent FS_ACTION = {MOUNT|UMOUNT|REMOUNT|ERROR|FREEZE|UNFREEZE} FS_UUID FS_NAME FS_TYPE Signed-off-by: Dmitry Monakhov [aryabinin: add error event, rh8 rebase] Signed-off-by: Andrey Ryabinin Signed-off-by: Kirill Tkhai --- fs/ext4/ext4.h | 11 + fs/ext4/super.c | 129 +++ 2 files changed, 139 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5f6fdd5514b2..70b3038fa0d1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1625,6 +1625,8 @@ struct ext4_sb_info { /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_csum_seed; + bool s_err_event_sent; + /* Reclaim extents from extent status tree */ struct shrinker s_es_shrinker; struct list_head s_es_list; /* List of inodes with reclaimable extents */ @@ -3655,6 +3657,15 @@ extern int ext4_check_blockref(const char *, unsigned int, struct ext4_ext_path; struct ext4_extent; +enum ext4_event_type { + EXT4_UA_MOUNT, + EXT4_UA_UMOUNT, + EXT4_UA_REMOUNT, + EXT4_UA_ERROR, + EXT4_UA_FREEZE, + EXT4_UA_UNFREEZE, +}; + /* * Maximum number of logical blocks in a file; ext4_extent's ee_block is * __le32. 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6cf2d3e0ed8f..597768497c42 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -424,6 +424,118 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) +static int ext4_uuid_valid(const u8 *uuid) +{ + int i; + + for (i = 0; i < 16; i++) { + if (uuid[i]) + return 1; + } + return 0; +} + +struct ext4_uevent { + struct super_block *sb; + enum ext4_event_type action; + struct work_struct work; +}; + +/** + * ext4_send_uevent - prepare and send uevent + * + * @sb:super_block + * @action:action type + * + */ +static void ext4_send_uevent_work(struct work_struct *w) +{ + struct ext4_uevent *e = container_of(w, struct ext4_uevent, work); + struct super_block *sb = e->sb; + struct kobj_uevent_env *env; + const u8 *uuid = EXT4_SB(sb)->s_es->s_uuid; + enum kobject_action kaction = KOBJ_CHANGE; + int ret; + + env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL); + if (!env){ + kfree(e); + return; + } + ret = add_uevent_var(env, "FS_TYPE=%s", sb->s_type->name); + if (ret) + goto out; + ret = add_uevent_var(env, "FS_NAME=%s", sb->s_id); + if (ret) + goto out; + + if (ext4_uuid_valid(uuid)) { + ret = add_uevent_var(env, "UUID=%pUB", uuid); + if (ret) + goto out; + } + + switch (e->action) { + case EXT4_UA_MOUNT: + kaction = KOBJ_ONLINE; + ret = add_uevent_var(env, "FS_ACTION=%s", "MOUNT"); + break; + case EXT4_UA_UMOUNT: + kaction = KOBJ_OFFLINE; + ret = add_uevent_var(env, "FS_ACTION=%s", "UMOUNT"); + break; + case EXT4_UA_REMOUNT: + ret = add_uevent_var(env, "FS_ACTION=%s", "REMOUNT"); + break; + case EXT4_UA_ERROR: + ret = add_uevent_var(env, "FS_ACTION=%s", "ERROR"); + break; + case EXT4_UA_FREEZE: + ret = add_uevent_var(env, "FS_ACTION=%s", "FREEZE"); + break; + case EXT4_UA_UNFREEZE: + ret = add_uevent_var(env, "FS_ACTION=%s", "UNFREEZE"); + break; + default: + ret = -EINVAL; + } + if (ret) + goto out; + ret = 
kobject_uevent_env(&(EXT4_SB(sb)->s_kobj), kaction, env->envp); +out: + kfree(env); + kfree(e); +} + +/** + * ext4_send_uevent - prepare and schedule event submission + * + * @sb:super_block + * @action:action type + * + */ +void ext4_send_uevent(struct super_block *sb, enum ext4_event_type action) +{ + struct ext4_uevent *e; + + /* +* May happen if called from ext4_put_super() -> __ext4_abort() +* -> ext4_send_uevent() +*/ + if
[Devel] [PATCH RH9 12/12] ext4: send abort uevent on ext4 journal abort
From: Dmitry Monakhov Currently an error from the device results in ext4_abort(), but no uevent is generated because the context of ext4_abort()'s caller does not allow GFP_KERNEL memory allocation. Let's relax the submission context requirement and defer the actual uevent submission to a workqueue. It can be any workqueue; I've picked rsv_conversion_wq because it already exists. khorenko@: "system_wq" does not fit here because at the moment of work execution sb can be already destroyed. "EXT4_SB(sb)->rsv_conversion_wq" is flushed before sb is destroyed. Signed-off-by: Dmitry Monakhov [aryabinin rh8 rebase] Signed-off-by: Andrey Ryabinin --- fs/ext4/ext4.h |2 ++ fs/ext4/super.c |6 ++ 2 files changed, 8 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 70b3038fa0d1..5ea1ca7c57c3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1626,6 +1626,7 @@ struct ext4_sb_info { __u32 s_csum_seed; bool s_err_event_sent; + bool s_abrt_event_sent; /* Reclaim extents from extent status tree */ struct shrinker s_es_shrinker; @@ -3662,6 +3663,7 @@ enum ext4_event_type { EXT4_UA_MOUNT, EXT4_UA_UMOUNT, EXT4_UA_REMOUNT, EXT4_UA_ERROR, + EXT4_UA_ABORT, EXT4_UA_FREEZE, EXT4_UA_UNFREEZE, }; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 597768497c42..9119dc05850f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -490,6 +490,9 @@ static void ext4_send_uevent_work(struct work_struct *w) case EXT4_UA_ERROR: ret = add_uevent_var(env, "FS_ACTION=%s", "ERROR"); break; + case EXT4_UA_ABORT: + ret = add_uevent_var(env, "FS_ACTION=%s", "ABORT"); + break; case EXT4_UA_FREEZE: ret = add_uevent_var(env, "FS_ACTION=%s", "FREEZE"); break; @@ -764,6 +767,9 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, WARN_ON_ONCE(1); if (!continue_fs && !sb_rdonly(sb)) { + if (!xchg(&EXT4_SB(sb)->s_abrt_event_sent, 1)) + ext4_send_uevent(sb, EXT4_UA_ABORT); + ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); if (journal) jbd2_journal_abort(journal, -EIO); ___ Devel mailing list Devel@openvz.org 
https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 10/12] ms/Revert "ext4: simplify kobject usage"
From: Konstantin Khorenko This reverts ms commit bc1420ae56266fa2c5a8e452d55f744ca98fe42f. * we want ext4 to send udev events * kobject_uevent_env() kobject->kset is defined => let's ext4 defines kobject->kset https://jira.sw.ru/browse/PSBM-127422 Signed-off-by: Konstantin Khorenko Signed-off-by: Kirill Tkhai --- fs/ext4/sysfs.c | 43 --- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 2314f7446592..7af925442a61 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -489,6 +489,13 @@ static void ext4_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } +static void ext4_kset_release(struct kobject *kobj) +{ + struct kset *kset = container_of(kobj, struct kset, kobj); + + kfree(kset); +} + static const struct sysfs_ops ext4_attr_ops = { .show = ext4_attr_show, .store = ext4_attr_store, @@ -511,7 +518,12 @@ void ext4_notify_error_sysfs(struct ext4_sb_info *sbi) sysfs_notify(&sbi->s_kobj, NULL, "errors_count"); } -static struct kobject *ext4_root; +static struct kobj_type ext4_ktype = { + .sysfs_ops = &ext4_attr_ops, + .release= ext4_kset_release, +}; + +static struct kset *ext4_kset; static struct kobject *ext4_feat; @@ -520,8 +532,9 @@ int ext4_register_sysfs(struct super_block *sb) struct ext4_sb_info *sbi = EXT4_SB(sb); int err; + sbi->s_kobj.kset = ext4_kset; init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, ext4_root, + err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, NULL, "%s", sb->s_id); if (err) { kobject_put(&sbi->s_kobj); @@ -562,18 +575,26 @@ int __init ext4_init_sysfs(void) { int ret; - ext4_root = kobject_create_and_add("ext4", fs_kobj); - if (!ext4_root) + ext4_kset = kzalloc(sizeof(*ext4_kset), GFP_KERNEL); + if (!ext4_kset) return -ENOMEM; + kobject_set_name(&ext4_kset->kobj, "ext4"); + ext4_kset->kobj.parent = fs_kobj; + ext4_kset->kobj.ktype = &ext4_ktype; + ret = kset_register(ext4_kset); + if (ret) + goto kset_err; + 
ext4_feat = kzalloc(sizeof(*ext4_feat), GFP_KERNEL); if (!ext4_feat) { ret = -ENOMEM; - goto root_err; + goto kset_err; } + ext4_feat->kset = ext4_kset; ret = kobject_init_and_add(ext4_feat, &ext4_feat_ktype, - ext4_root, "features"); + NULL, "features"); if (ret) goto feat_err; @@ -583,9 +604,9 @@ int __init ext4_init_sysfs(void) feat_err: kobject_put(ext4_feat); ext4_feat = NULL; -root_err: - kobject_put(ext4_root); - ext4_root = NULL; +kset_err: + kset_unregister(ext4_kset); + ext4_kset = NULL; return ret; } @@ -593,8 +614,8 @@ void ext4_exit_sysfs(void) { kobject_put(ext4_feat); ext4_feat = NULL; - kobject_put(ext4_root); - ext4_root = NULL; + kset_unregister(ext4_kset); + ext4_kset = NULL; remove_proc_entry(proc_dirname, NULL); ext4_proc_root = NULL; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 04/12] ve/ext3: treat panic_on_errors as remount-ro_on_errors in CTs
This is a port from 2.6.32-x of: * diff-ext4-in-containers-treat-panic_on_errors-as-remount-ro_on_errors ext4: in containers treat errors=panic as Container can explode whole node if it remounts its ploop with option 'errors=panic' and triggers abort after that. Signed-off-by: Konstantin Khlebnikov Acked-by: Maxim V. Patlasov Signed-off-by: Dmitry Monakhov khorenko@: currently we have devmnt->allowed_options options which are configured via userspace and currently vzctl provides empty list. This is an additional check - just in case someone get secondary ploop image with 'errors=panic' mount option saved in the image and mounts it from inside a CT. Signed-off-by: Andrey Ryabinin Signed-off-by: Kirill Tkhai --- fs/ext4/super.c | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f09a2432a20e..685686f5b849 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1906,6 +1906,7 @@ static int clear_qf_name(struct super_block *sb, int qtype) #define MOPT_STRING0x0400 #define MOPT_SKIP 0x0800 #defineMOPT_2 0x1000 +#define MOPT_WANT_SYS_ADMIN0x4000 static const struct mount_opts { int token; @@ -1938,7 +1939,7 @@ static const struct mount_opts { EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET}, - {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR}, + {Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR|MOPT_WANT_SYS_ADMIN}, {Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR}, {Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR}, {Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, @@ -2182,6 +2183,9 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } if (m->flags & MOPT_CLEAR_ERR) clear_opt(sb, ERRORS_MASK); + if (m->flags & MOPT_WANT_SYS_ADMIN && !capable(CAP_SYS_ADMIN)) + return 1; + if (token == Opt_noquota && sb_any_quota_loaded(sb)) { ext4_msg(sb, 
KERN_ERR, "Cannot change quota " "options when quota turned on"); @@ -4226,8 +4230,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK) set_opt(sb, WRITEBACK_DATA); - if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) - set_opt(sb, ERRORS_PANIC); + if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) { + if (capable(CAP_SYS_ADMIN)) + set_opt(sb, ERRORS_PANIC); + else + set_opt(sb, ERRORS_RO); + } else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE) set_opt(sb, ERRORS_CONT); else ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 05/12] ext4: Fix error handling after filesystem abort
From: Dmitry Monakhov If the filesystem was aborted after an inode's writeback completed but before its metadata was updated, we may return success, which results in data loss. In order to handle fs abort correctly we have to check the fs state once we discover that it is in MS_RDONLY state. Signed-off-by: Dmitry Monakhov +++ ext4: fix broken fsync for dirs/symlink mFixes commit 6a63db16da84fe ("ext4: Fix error handling after filesystem abort"). xfstests: generic/321 generic/335 generic/348 Signed-off-by: Dmitry Monakhov (cherry picked from vz7 commit 00399757c828ee82941123f6c67e7c96d906ce2b) Signed-off-by: Konstantin Khorenko Signed-off-by: Kirill Tkhai --- fs/ext4/super.c |6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 685686f5b849..6cf2d3e0ed8f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5784,8 +5784,12 @@ int ext4_force_commit(struct super_block *sb) { journal_t *journal; - if (sb_rdonly(sb)) + if (sb_rdonly(sb)) { + smp_rmb(); + if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + return -EROFS; return 0; + } journal = EXT4_SB(sb)->s_journal; return ext4_journal_force_commit(journal); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 03/12] ext4: don't iterate over sbi->s_es_list more than the number of elements
From: Konstantin Khorenko If there are several shrinkers working on a single sbi there can be easily a situation when a neighbor shrinkers reclaimed a bunch of extents and thus a bunch inodes from the s_es_list but we don't honor this and iterate over sbi->s_es_list the number of times equal to the initial number of inodes in s_es_list. Before each iteration, check if we are going to iterate more than the number of inodes in the list and adjust nr_to_walk accordingly. https://jira.sw.ru/browse/PSBM-83335 Signed-off-by: Konstantin Khorenko Acked-by: Dmitry Monakhov (cherry picked from vz7 commit 17a5132158a4 ("ext4: don't iterate over sbi->s_es_list more than the number of elements")) VZ 8 rebase part https://jira.sw.ru/browse/PSBM-127798 Signed-off-by: Alexander Mikhalitsyn Signed-off-by: Kirill Tkhai --- fs/ext4/extents_status.c |8 1 file changed, 8 insertions(+) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 9a3a8996aacf..92aa9265a117 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1485,6 +1485,14 @@ static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, spin_unlock(&sbi->s_es_lock); goto out; } + /* +* Another shrinker can remove a bunch of extents in parallel, +* we don't have to iterate more than the current number of +* inodes in the list. +*/ + if (nr_to_walk > sbi->s_es_nr_inode) + nr_to_walk = sbi->s_es_nr_inode; + ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info, i_es_list); /* Move the inode to the tail */ ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 07/12] ext4: make data=writeback mode safe
From: Kirill Tkhai It is not obvious, but delalloc makes data=writeback mode safer. This is because actual data allocation happens inside writepages, So stale blocks after unclean umount no longer an issue. So in order to make data=writeback mode reliable we can not temporarily disable delalloc in case of low diskspace. It must be enabled permanently. Original discussion: http://thread.gmane.org/gmane.comp.file-systems.ext4/19527 https://jira.sw.ru:9443/browse/PCLIN-299 diff-ms-ext4-safe-writeback Signed-off-by: Dmitry Monakhov Signed-off-by: Kirill Tkhai (cherry picked from vz7 commit 025b3611cf3eba7f1a83bf34c05ea439c4ade410) Signed-off-by: Konstantin Khorenko Signed-off-by: Kirill Tkhai --- fs/ext4/inode.c | 11 +-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d8de607849df..0d2268ead3e7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2915,8 +2915,15 @@ static int ext4_nonda_switch(struct super_block *sb) if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); - if (2 * free_clusters < 3 * dirty_clusters || - free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { + /* +* NOTE: Delalloc make data=writeback mode safer, similar to ordered +* mode, so stale blocks after power failure no longer an issue Do not +* disable delalloc to guarantee data security on data=writeback mode. +* -dmon +*/ + if (test_opt(sb, DATA_FLAGS) != EXT4_MOUNT_WRITEBACK_DATA && + (2 * free_clusters < 3 * dirty_clusters || +free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK))) { /* * free block count is less than 150% of dirty blocks * or free blocks is less than watermark ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 00/12] part19 ext4
--- Dmitry Monakhov (6): ext4: Fix error handling after filesystem abort jbd2: make shure that we do not miss aborted state jbd2: raid amnesia protection for the journal ext4: add mfsync support ext4: add generic uevent infrastructure ext4: send abort uevent on ext4 journal abort Kirill Tkhai (2): ve/ext3: treat panic_on_errors as remount-ro_on_errors in CTs ext4: make data=writeback mode safe Konstantin Khorenko (4): ve/fs/namespace: allow submounts in non-init userns Kconfig.openvz: force CGROUP_PERF if compiling VZ Containers code ext4: don't iterate over sbi->s_es_list more than the number of elements ms/Revert "ext4: simplify kobject usage" fs/ext4/ext4.h | 20 ++ fs/ext4/extents_status.c|8 ++ fs/ext4/fsync.c | 108 ++ fs/ext4/inode.c | 11 ++- fs/ext4/ioctl.c | 60 + fs/ext4/super.c | 155 ++- fs/ext4/sysfs.c | 43 +--- fs/jbd2/journal.c |3 - fs/jbd2/recovery.c | 77 + fs/namespace.c | 25 +++ include/trace/events/ext4.h | 54 +++ kernel/Kconfig.openvz |1 12 files changed, 544 insertions(+), 21 deletions(-) -- Signed-off-by: Kirill Tkhai ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 08/12] jbd2: raid amnesia protection for the journal
From: Dmitry Monakhov https://jira.sw.ru/browse/PSBM-15484 Some block devices can return different data on read requests from the same block after a power failure (for example, a mirrored raid is out of sync and a resync is in progress). In that case the following situation is possible: A power failure happens after the transaction commit log was issued for transaction 'D'; on the next boot the first disk will have the commit block, but the second one will not. mirror1: journal={Ac-Bc-Cc-Dc } mirror2: journal={Ac-Bc-Cc-D } Now let's assume that we read from mirror1 and find that 'D' has a valid commit block, so journal_replay will replay that transaction, but a second power failure may happen before journal_reset(), so the next journal_replay() may read from mirror2 and find that 'C' is the last valid transaction. This results in corruption because we have already replayed transaction 'D'. In order to avoid such ambiguity we should perform a 'stabilize write': 1) Read and rewrite the latest commit id block 2) Invalidate the next block in order to guarantee that the journal head becomes stable. 
Signed-off-by: Dmitry Monakhov Signed-off-by: Andrey Ryabinin Signed-off-by: Kirill Tkhai --- fs/jbd2/recovery.c | 77 +++- 1 file changed, 76 insertions(+), 1 deletion(-) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index d47a0d96bf30..01b937aa0a81 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -33,6 +33,9 @@ struct recovery_info int nr_replays; int nr_revokes; int nr_revoke_hits; + + unsigned intlast_log_block; + struct buffer_head *last_commit_bh; }; static int do_one_pass(journal_t *journal, @@ -268,6 +271,71 @@ static int fc_do_one_pass(journal_t *journal, return err; } +/* + * The 'Raid amnesia' effect protection: https://jira.sw.ru/browse/PSBM-15484 + * + * Some blockdevices can return different data on read requests from same block + * after power failure (for example mirrored raid is out of sync, and resync is + * in progress) In that case following sutuation is possible: + * + * Power failure happen after transaction commit log was issued for + * transaction 'D', next boot first dist will have commit block, but + * second one will not. + * mirror1: journal={Ac-Bc-Cc-Dc } + * mirror2: journal={Ac-Bc-Cc-D } + * Now let's let assumes that we read from mirror1 and found that 'D' has + * valid commit block, so journal_replay will replay that transaction, but + * second power failure may happen before journal_reset() so next + * journal_replay() may read from mirror2 and found that 'C' is last valid + * transaction. This result in corruption because we already replayed + * trandaction 'D'. + * In order to avoid such ambiguity we should pefrorm 'stabilize write'. + * 1) Read and rewrite latest commit id block + * 2) Invalidate next block in + * order to guarantee that journal head becomes stable. 
+ * Yes i know that 'stabilize write' approach is ugly but this is the only + * way to run filesystem on blkdevices with 'raid amnesia' effect + */ +static int stabilize_journal_head(journal_t *journal, struct recovery_info *info) +{ + struct buffer_head *bh[2] = {NULL, NULL}; + int err, err2, i; + + if (!info->last_commit_bh) + return 0; + + bh[0] = info->last_commit_bh; + info->last_commit_bh = NULL; + + err = jread(&bh[1], journal, info->last_log_block); + if (err) + goto out; + + for (i = 0; i < 2; i++) { + lock_buffer(bh[i]); + /* Explicitly invalidate block beyond last commit block */ + if (i == 1) + memset(bh[i]->b_data, 0, journal->j_blocksize); + + BUFFER_TRACE(bh[i], "marking dirty"); + set_buffer_uptodate(bh[i]); + mark_buffer_dirty(bh[i]); + BUFFER_TRACE(bh[i], "marking uptodate"); + unlock_buffer(bh[i]); + } + err = sync_blockdev(journal->j_dev); + /* Make sure data is on permanent storage */ + if (journal->j_flags & JBD2_BARRIER) { + err2 = blkdev_issue_flush(journal->j_dev); + if (!err) + err = err2; + } +out: + brelse(bh[0]); + brelse(bh[1]); + return err; +} + /** * jbd2_journal_recover - recovers a on-disk journal * @journal: the journal to recover @@ -304,6 +372,8 @@ int jbd2_journal_recover(journal_t *journal) } err = do_one_pass(journal, &info, PASS_SCAN); + if (!err) + err = stabilize_journal_head(journal, &info); if (!err) err = do_one_pass(journal, &info, PASS_REVOKE); if (!err) @@ -354,6 +4
[Devel] [PATCH RH9 06/12] jbd2: make shure that we do not miss aborted state
From: Dmitry Monakhov Signed-off-by: Dmitry Monakhov (cherry picked from vz7 commit 2398d7694d2afe5cf83e379ad4ea6e2ddc191675) Signed-off-by: Konstantin Khorenko Signed-off-by: Kirill Tkhai --- fs/jbd2/journal.c |3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 35302bc192eb..4a879e04f4b1 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -713,10 +713,9 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) !tid_gt(tid, journal->j_commit_sequence)); read_lock(&journal->j_state_lock); } - read_unlock(&journal->j_state_lock); - if (unlikely(is_journal_aborted(journal))) err = -EIO; + read_unlock(&journal->j_state_lock); return err; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 09/12] ext4: add mfsync support
From: Dmitry Monakhov Add EXT4_IOC_MFSYNC ioctl which allow to perform sync on given set of files in optimized way (only 1 barrier will be required in best scenario) https://jira.sw.ru/browse/PSBM-18567 Signed-off-by: Dmitry Monakhov +++ Comment on rebasing to rh7 kernel-3.10.0-229.7.2.el7: 1) compile fix for ext4-add-mfsync-support ext4_flush_unwritten_io was removed in rh7-3.10.0-229.7.2 https://jira.sw.ru/browse/PSBM-34909 2) compile fix for ext4-add-mfsync-support part2 __sync_inode was removed in rh7-3.10.0-229.7.2 It is honest to simply disable mfsync in nojournal mode since we so not test nojournal mode at all. https://jira.sw.ru/browse/PSBM-34910 Signed-off-by: Dmitry Monakhov Rebase to vz8 kernel note: mutex_unlock(&inode->i_mutex) -> inode_lock_shared(inode) Signed-off-by: Konstantin Khorenko Signed-off-by: Kirill Tkhai --- fs/ext4/ext4.h |7 +++ fs/ext4/fsync.c | 108 +++ fs/ext4/ioctl.c | 60 include/trace/events/ext4.h | 54 ++ 4 files changed, 229 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index df46d5586ca1..5f6fdd5514b2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -615,6 +615,11 @@ struct compat_ext4_new_group_input { }; #endif +struct ext4_ioc_mfsync_info { + __u32 size; + __u32 fd[0]; +}; + /* The struct ext4_new_group_input in kernel space, with free_blocks_count */ struct ext4_new_group_data { __u32 group; @@ -722,6 +727,7 @@ enum { #define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) #define EXT4_IOC_OPEN_BALLOON _IO('f', 42) #define EXT4_IOC_CHECKPOINT_IOW('f', 43, __u32) +#define EXT4_IOC_MFSYNC_IO('f', 43) #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32) @@ -2814,6 +2820,7 @@ extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); +extern int ext4_sync_files(struct file **, unsigned int *, unsigned int); /* hash.c */ extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len, diff --git a/fs/ext4/fsync.c 
b/fs/ext4/fsync.c index 027a7d7037a0..8179066765bd 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -185,3 +185,111 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_ext4_sync_file_exit(inode, ret); return ret; } + +int ext4_sync_files(struct file **files, unsigned int *flags, unsigned int nr_files) +{ + struct super_block *sb; + journal_t *journal; + int err = 0, err2 = 0, i = 0, j = 0; + int force_commit = 0, datawriteback = 0; + tid_t commit_tid = 0; + int need_barrier = 0; + + J_ASSERT(ext4_journal_current_handle() == NULL); + if (!nr_files) + return 0; + + sb = files[0]->f_mapping->host->i_sb; + journal = EXT4_SB(sb)->s_journal; + if (sb->s_flags & SB_RDONLY) { + /* Make shure that we read updated s_mount_flags value */ + smp_rmb(); + if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + return -EROFS; + return 0; + } + for (i = 0; i < nr_files; i++) { + struct address_space * mapping = files[i]->f_mapping; + struct inode *inode = mapping->host; + + BUG_ON(sb != inode->i_sb); + if (!mapping->nrpages) + continue; + + err = filemap_fdatawrite(mapping); + if (err) + break; + + } + /* +* Even if the above returned error, the pages may be +* written partially (e.g. -ENOSPC), so we wait for it. +* But the -EIO is special case, it may indicate the worst +* thing (e.g. bug) happened, so we avoid waiting for it. +*/ + if (err == -EIO) + goto out; + + for (j = 0; j < i; j++) { + struct address_space * mapping = files[j]->f_mapping; + struct inode *inode = mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned int datasync = flags[j]; + tid_t tid; + + if (mapping->nrpages) { + err2 = filemap_fdatawait(mapping); + if (!err || err2 == -EIO) + err = err2; + } + + inode_lock_shared(inode); + force_commit |= ext4_should_journal_data(inode); + datawriteback |= ext4_should_writeback_data(inode); + tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; + inode_unlock_shared(inode); +
[Devel] [PATCH RH9 02/12] Kconfig.openvz: force CGROUP_PERF if compiling VZ Containers code
From: Konstantin Khorenko The perf_event_open() syscall is available from Containers. The CONFIG_CGROUP_PERF option is set in current OpenVZ kernel configs, but let's force-enable it if CONFIG_VE is enabled to prevent possible non-secure kernel config if someone rebuilds the kernel with own config. https://jira.sw.ru/browse/PSBM-51360 Signed-off-by: Konstantin Khorenko (cherry picked from vz7 commit a35598ba04acf80424fd8f997686a2edd3c3dcb8) Signed-off-by: Konstantin Khorenko --- kernel/Kconfig.openvz |1 + 1 file changed, 1 insertion(+) diff --git a/kernel/Kconfig.openvz b/kernel/Kconfig.openvz index 9489342596ab..6ea4f707df61 100644 --- a/kernel/Kconfig.openvz +++ b/kernel/Kconfig.openvz @@ -19,6 +19,7 @@ config VE select CGROUPS select CGROUP_DEVICE select CGROUP_FREEZER + select CGROUP_PERF help This option adds support of virtual Linux running on the original box with fully supported virtual network driver, tty subsystem and ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 01/12] ve/fs/namespace: allow submounts in non-init userns
From: Konstantin Khorenko Simple NFS mount inside a Container brings us to vfs_submount(), so if we want to enable NFS inside a Container (read - in CT root userns), we have to soften the check for init userns. SyS_mount do_mount vfs_kern_mount mount_fs nfs_fs_mount nfs4_try_mount nfs_follow_remote_path mount_subtree vfs_path_lookup do_path_lookup filename_lookup path_lookupat lookup_slow follow_managed nfs_d_automount nfs4_submount nfs_do_submount vfs_submount https://jira.sw.ru/browse/PSBM-86277 Signed-off-by: Konstantin Khorenko https://jira.sw.ru/browse/PSBM-127234 (cherry picked from vz7 commit bc060d46276144f91a139b7d0acf384dcd0a4dde) vz7->vz8 port note: in vz7 the check has been dropped completely; in vz8 we leave the check, but allow submounts only for root CT userns. Signed-off-by: Konstantin Khorenko Reviewed-by: Pavel Tikhomirov +++ ve/fs/namespace: fix allowing submounts in non-init userns When mounting an nfs4 mount inside a container with something like: mount -t nfs4 $NODEIP:/root/build/criu /mnt we can see that, because the source "root" path is several directories long, we create several submounts. Adding perf probes to list mountpoint->d_sb->s_user_ns and mountpoint->d_iname from vfs_submount we see: crash > p &init_user_ns $2 = (struct user_namespace *) 0x9644efc0 1) First submount created has mountpoint dentry "root" and ve userns: mount.nfs4 ...: probe:vfs_submount: (95a970e0) user_ns=0x8b6d6e86a000 dentry="root" 2) Second submount created has mountpoint dentry "build" from first submount and init userns of host: mount.nfs4 ...: probe:vfs_submount: (95a970e0) user_ns=0x9644efc0 dentry="build" So on the first step we have the ve userns and on the second the init userns. Comparing it against either the init userns or the ve userns would not work because we can have both of them. So the easy solution here is to disable the check completely, like we do in vz7. 
Note: this patch allows nfs4 mounts in containers, thus we overcome nfs3 rpcbind non-dumpable socket migration problems, as now nfs mounts in v4 mode by default. https://jira.sw.ru/browse/PSBM-102629 mFixes: 81a2b734416d ("ve/fs/namespace: allow submounts in non-init userns") Signed-off-by: Pavel Tikhomirov Signed-off-by: Kirill Tkhai --- fs/namespace.c | 25 + 1 file changed, 25 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index c10614908e7e..85a451861e14 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1051,12 +1051,37 @@ struct vfsmount * vfs_submount(const struct dentry *mountpoint, struct file_system_type *type, const char *name, void *data) { +#if 0 /* Until it is worked out how to pass the user namespace * through from the parent mount to the submount don't support * unprivileged mounts with submounts. */ + /* Simple NFS mount inside a Container brings us here, so if we want to +* enable NFS inside a Container (read - in non-init userns), we have +* to omit the check. Below is how is was in VZ8: +* +* SyS_mount +* do_mount +*vfs_kern_mount +* mount_fs +* nfs_fs_mount +* nfs4_try_mount +*nfs_follow_remote_path +* mount_subtree +* vfs_path_lookup +* do_path_lookup +*filename_lookup +* path_lookupat +* lookup_slow +* follow_managed +*nfs_d_automount +* nfs4_submount +* nfs_do_submount +* vfs_submount +*/ if (mountpoint->d_sb->s_user_ns != &init_user_ns) return ERR_PTR(-EPERM); +#endif return vfs_kern_mount(type, SB_SUBMOUNT, name, data); } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH RH9 0/5] part 12 userfaultfd
Committed On 05.10.2021 20:54, Cyrill Gorcunov wrote: > Hi! The series addresses patches left from userfaultfd handling, > which were mostly merged already so only netlink and a few puckups > left. I had to tune up netlink series since it didn't apply smoothly: > datagram seding proto has been lifted up and netlink socket errors > has been depending on repair mode so the patches were not build-able > step by step. > > Andrey Vagin (1): > netlink: add an ability to restore messages in a receive queue > > Andrey Zhadchenko (1): > netlink: add an option to set sk->err from userspace > > Angelo Ruocco (2): > ms/cgroup: let a symlink too be created with a cftype file > ms/block, bfq: add weight symlink to the bfq.weight cgroup parameter > > Stanislav Kinsburskiy (1): > netlink: allow to set peeking offset for sockets > > block/bfq-cgroup.c | 6 ++- > include/linux/cgroup-defs.h | 3 ++ > include/uapi/linux/netlink.h | 2 + > kernel/cgroup/cgroup.c | 33 +++-- > net/netlink/af_netlink.c | 92 > net/netlink/af_netlink.h | 2 + > 6 files changed, 112 insertions(+), 26 deletions(-) > > > base-commit: 26c73ba86152babe66810a7c153a0dfc1f1edc49 > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH vz9 00/27] part17
Committed On 06.10.2021 11:57, Nikita Yushchenko wrote: > Andrey Ryabinin (3): > x86: make ARCH_[SET|GET]_CPUID friends with /proc/vz/cpuid_override > x86, cpuinfo: Fix race on parallel /proc/cpuinfo read #PSBM-121823 > x86: don't enable cpuid faults if /proc/vz/cpuid_override unused > #PSBM-121823 > > Evgenii Shatokhin (1): > sched: show CPU stats for a cgroup in cpu.proc.stat file > > Kirill Tkhai (3): > sched: Count loadavg under rq::lock in calc_load_nohz_start() > sched/ve: Do not show loadavg in child VE cpu cgroups > x86: Show vcpu cpuflags in cpuinfo > > Konstantin Khorenko (12): > kernel/stat: Introduce kernel_cpustat operation wrappers > ve/sched/stat: Add basic infrastructure for vcpu statistics > ve/sched/stat: Introduce functions to calculate vcpustat data > ve/proc/stat: Wire virtualized /proc/stat handler > sched: Fix task_group "iowait_sum" statistic accounting > ve/sched/stat: Introduce handler for getting CT cpu statistics > ve/time/stat: idle time virtualization in /proc/loadavg > ve/proc/stat: Introduce CPUTIME_USED field in cpustat statistic > ve/vestat: Introduce /proc/vz/vestat > ve/net/core: allow to call setsockopt(SO_SNDBUFFORCE) from Containers > ve/net/core: allow to call setsockopt(SO_RCVBUFFORCE) from Containers > vecalls: Introduce VZCTL_GET_CPU_STAT ioctl > > Nikita Yushchenko (1): > ve: uninline ve_get_monotonic() and ve_get_uptime() > > Pavel Tikhomirov (2): > ve/proc/net/nr_cpus: Cut lines in /proc/net/softnet_stat to number of > vcpus in CT > ve: allow writing to features in pseudosuper state > > Stanislav Kinsburskiy (2): > ve/fs/aio: aio_nr & aio_max_nr variables virtualization > ve/aio: Add a handle to checkpoint/restore AIO context > > Vladimir Davydov (3): > sched/stat: account ctxsw per task group > sched/stat: account forks per task group > arch/x86: introduce cpuid override > > arch/x86/include/asm/msr-index.h | 1 + > arch/x86/include/asm/thread_info.h | 4 +- > arch/x86/include/asm/traps.h | 2 + > arch/x86/kernel/Makefile 
| 1 + > arch/x86/kernel/cpu/proc.c | 80 +- > arch/x86/kernel/cpuid_fault.c | 249 > arch/x86/kernel/process.c | 13 +- > arch/x86/kernel/traps.c| 27 ++ > fs/aio.c | 137 +++-- > fs/proc/base.c | 27 ++ > fs/proc/stat.c | 10 + > fs/proc/uptime.c | 30 +- > include/linux/aio.h| 19 +- > include/linux/cpuid_override.h | 38 +++ > include/linux/kernel_stat.h| 37 +++ > include/linux/ve.h | 54 ++-- > kernel/sched/core.c| 28 +- > kernel/sched/cpuacct.c | 441 + > kernel/sched/fair.c| 21 +- > kernel/sched/loadavg.c | 6 +- > kernel/sched/sched.h | 9 + > kernel/sysctl.c| 16 +- > kernel/time/time.c | 1 + > kernel/ve/ve.c | 107 ++- > kernel/ve/vecalls.c| 159 +++ > net/core/net-procfs.c | 3 +- > net/core/sock.c| 14 +- > 27 files changed, 1438 insertions(+), 96 deletions(-) > create mode 100644 arch/x86/kernel/cpuid_fault.c > create mode 100644 include/linux/cpuid_override.h > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH RH9 1/5] ms/cgroup: let a symlink too be created with a cftype file
On 05.10.2021 20:54, Cyrill Gorcunov wrote: > From: Angelo Ruocco > > This commit enables a cftype to have a symlink (of any name) that > points to the file associated with the cftype. > > Signed-off-by: Angelo Ruocco > Signed-off-by: Paolo Valente > Signed-off-by: Jens Axboe > > https://jira.sw.ru/browse/PSBM-101019 > (cherry-picked from 54b7b868e826b294687c439b68ec55fe20cafe5b) > Signed-off-by: Andrey Ryabinin > Signed-off-by: Cyrill Gorcunov > --- > include/linux/cgroup-defs.h | 3 +++ > kernel/cgroup/cgroup.c | 33 + > 2 files changed, 32 insertions(+), 4 deletions(-) > > diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h > index 583ce2bce98c..0bb884ce 100644 > --- a/include/linux/cgroup-defs.h > +++ b/include/linux/cgroup-defs.h > @@ -125,6 +125,8 @@ enum { >*/ > CFTYPE_VE_WRITABLE = (1 << 15), > > + CFTYPE_SYMLINKED= (1 << 6), /* pointed to by symlink too */ We already have: CFTYPE_PRESSURE = (1 << 6), /* only if pressure feature is enabled */ > + > /* internal flags, do not use outside cgroup core proper */ > __CFTYPE_ONLY_ON_DFL= (1 << 16),/* only on default hierarchy */ > __CFTYPE_NOT_ON_DFL = (1 << 17),/* not on default hierarchy */ > @@ -552,6 +554,7 @@ struct cftype { >* end of cftype array. 
>*/ > char name[MAX_CFTYPE_NAME]; > + char link_name[MAX_CFTYPE_NAME]; > unsigned long private; > > /* > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c > index 08b7cff7a1c3..74d0b503e696 100644 > --- a/kernel/cgroup/cgroup.c > +++ b/kernel/cgroup/cgroup.c > @@ -1471,8 +1471,8 @@ struct cgroup *task_cgroup_from_root(struct task_struct > *task, > > static struct kernfs_syscall_ops cgroup_kf_syscall_ops; > > -static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, > - char *buf) > +static char *cgroup_fill_name(struct cgroup *cgrp, const struct cftype *cft, > + char *buf, bool write_link_name) > { > struct cgroup_subsys *ss = cft->ss; > > @@ -1482,13 +1482,26 @@ static char *cgroup_file_name(struct cgroup *cgrp, > const struct cftype *cft, > > snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s", >dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, > - cft->name); > + write_link_name ? cft->link_name : cft->name); > } else { > - strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); > + strscpy(buf, write_link_name ? 
cft->link_name : cft->name, > + CGROUP_FILE_NAME_MAX); > } > return buf; > } > > +static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, > + char *buf) > +{ > + return cgroup_fill_name(cgrp, cft, buf, false); > +} > + > +static char *cgroup_link_name(struct cgroup *cgrp, const struct cftype *cft, > + char *buf) > +{ > + return cgroup_fill_name(cgrp, cft, buf, true); > +} > + > /** > * cgroup_file_mode - deduce file mode of a control file > * @cft: the control file in question > @@ -1647,6 +1660,9 @@ static void cgroup_rm_file(struct cgroup *cgrp, const > struct cftype *cft) > } > > kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); > + if (cft->flags & CFTYPE_SYMLINKED) > + kernfs_remove_by_name(cgrp->kn, > + cgroup_link_name(cgrp, cft, name)); > } > > /** > @@ -4012,6 +4028,7 @@ static int cgroup_add_file(struct cgroup_subsys_state > *css, struct cgroup *cgrp, > { > char name[CGROUP_FILE_NAME_MAX]; > struct kernfs_node *kn; > + struct kernfs_node *kn_link; > struct lock_class_key *key = NULL; > int ret; > > @@ -4042,6 +4059,14 @@ static int cgroup_add_file(struct cgroup_subsys_state > *css, struct cgroup *cgrp, > spin_unlock_irq(&cgroup_file_kn_lock); > } > > + if (cft->flags & CFTYPE_SYMLINKED) { > + kn_link = kernfs_create_link(cgrp->kn, > + cgroup_link_name(cgrp, cft, name), > + kn); > + if (IS_ERR(kn_link)) > + return PTR_ERR(kn_link); > + } > + > return 0; > } > > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH RH9 0/6] ext4: Balloon patches
Committed On 05.10.2021 18:42, Kirill Tkhai wrote: > https://jira.sw.ru/browse/PSBM-134003 > > --- > > Kirill Tkhai (2): > Date: Wed Oct 7 14:47:07 2015 +0400 > fs: Revert ee1904ba44bd "make alloc_file() static" > > Konstantin Khorenko (1): > ext4: Provide a balloon nipple for management > > Maxim V. Patlasov (3): > ext4: Teach the fs where the balloon inode is > ext4: Teach statfs to report reduced disk usage > ext4: Don't show the active balloon to user > > > fs/ext4/dir.c| 15 ++- > fs/ext4/ext4.h |3 + > fs/ext4/ioctl.c | 59 +++ > fs/ext4/namei.c |9 > fs/ext4/super.c | 111 > +++--- > fs/file_table.c |3 + > fs/inode.c |1 > include/linux/file.h | 2 + > 8 files changed, 194 insertions(+), 9 deletions(-) > > -- > Signed-off-by: Kirill Tkhai > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH RH9 0/7] part-10 non-connector patches
Committed into branch-rh9-5.14.vz9.1.x-ovz On 05.10.2021 15:55, Pavel Tikhomirov wrote: > These are patches from part-10 not directly connected with proc > connector. > > These two are renamed: "prctl: reduce requirements to exe link change", > "ve/prctl_set_mm: allow to change mm content in ve" to > "ve/prctl_set_mm: allow setting exe link while unprivileged for spfs". > > And "ve/net: allow to rename devices in non-ve namespaces" got a crash > fix. > > Andrey Ryabinin (1): > ve/module: hide module refcounts from container > > Kirill Tkhai (1): > ve/net: allow to rename devices in non-ve namespaces > > Pavel Tikhomirov (2): > ve/prctl_set_mm: allow setting exe link while unprivileged for spfs > ve/coredump: virtualize kernel.core_pattern sysctl > > Stanislav Kinsburskiy (3): > ve/kernfs: export kernfs_perms_set() helper > ve/sysfs: generic sysfs_set_def_perms() helper introduced > ve/module: export sysfs dentries in containers > > fs/coredump.c | 12 + > fs/kernfs/ve.c| 4 +-- > fs/sysfs/ve.c | 11 > include/linux/coredump.h | 1 - > include/linux/kernfs-ve.h | 4 +++ > include/linux/sysfs-ve.h | 20 ++ > include/linux/ve.h| 7 + > kernel/module.c | 57 --- > kernel/sys.c | 6 ++--- > kernel/sysctl.c | 13 ++--- > kernel/ve/ve.c| 18 + > net/core/dev.c| 22 +++ > 12 files changed, 157 insertions(+), 18 deletions(-) > create mode 100644 include/linux/sysfs-ve.h > ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 6/6] ext4: Provide a balloon nipple for management
From: Konstantin Khorenko When the fs is mounted with active balloon someone will have to inflate/blow off one. To make it possible there will be a special ioctl for obtaining the fd. Not very elegant solution maybe, but it's OK for PVC containers. +++ ext4: fix file allocation check in ext4_open_balloon Function alloc_file() doesn't return NULL (unlike in 2.6.32-x). It returns error pointer. File structure allocation may fail before file->f_ep_links is initialized, which may lead to crash in eventpoll_release_file(). https://jira.sw.ru/browse/PSBM-41222 mFixes: 9cea7449aa589f325fff378e7256a3c2fc8f048d "ext4: Provide a balloon nipple for management" Signed-off-by: Stanislav Kinsburskiy (cherry picked from vz7 commit 100feb098ab22c6b8b25861c3b2dfaa9c5db0b03) Signed-off-by: Konstantin Khorenko +++ ext4/balloon: Use proper O_ mode flags in balloon opening code alloc_file() expects O_* mode flags, so provide them, not internal FMODE_* ones. mFixes: bee340a206d7 ("ext4: Provide a balloon nipple for management") https://jira.sw.ru/browse/PSBM-129392 Signed-off-by: Konstantin Khorenko Reviewed-by: Kirill Tkhai Signed-off-by: Kirill Tkhai --- fs/ext4/ext4.h |1 + fs/ext4/ioctl.c | 59 +++ fs/inode.c |1 + 3 files changed, 61 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9b655a94eb16..df46d5586ca1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -720,6 +720,7 @@ enum { #define EXT4_IOC_CLEAR_ES_CACHE_IO('f', 40) #define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) #define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) +#define EXT4_IOC_OPEN_BALLOON _IO('f', 42) #define EXT4_IOC_CHECKPOINT_IOW('f', 43, __u32) #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 6eed6170aded..6e2be4859571 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -850,6 +850,59 @@ static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg) return err; } +static int ext4_open_balloon(struct super_block *sb, struct 
vfsmount *mnt) +{ + struct inode *balloon_ino; + int err, fd; + struct file *filp; + struct dentry *de; + struct path path; + fmode_t mode; + + balloon_ino = EXT4_SB(sb)->s_balloon_ino; + err = -ENOENT; + if (balloon_ino == NULL) + goto err; + + err = fd = get_unused_fd_flags(0); + if (err < 0) + goto err_fd; + + __iget(balloon_ino); + de = d_obtain_alias(balloon_ino); + err = PTR_ERR(de); + if (IS_ERR(de)) + goto err_de; + + path.dentry = de; + path.mnt = mntget(mnt); + err = mnt_want_write(path.mnt); + if (err) + mode = O_RDONLY; + else + mode = O_RDWR; + filp = alloc_file(&path, mode, &ext4_file_operations); + if (filp->f_mode & FMODE_WRITE) + mnt_drop_write(path.mnt); + if (IS_ERR(filp)) { + err = PTR_ERR(filp); + goto err_filp; + } + + filp->f_flags |= O_LARGEFILE; + fd_install(fd, filp); + return fd; + +err_filp: + path_put(&path); +err_de: + put_unused_fd(fd); +err_fd: + /* nothing */ +err: + return err; +} + static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1264,6 +1317,12 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case EXT4_IOC_CHECKPOINT: return ext4_ioctl_checkpoint(filp, arg); + case EXT4_IOC_OPEN_BALLOON: + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + return ext4_open_balloon(inode->i_sb, filp->f_path.mnt); + default: return -ENOTTY; } diff --git a/fs/inode.c b/fs/inode.c index c93500d84264..55498b31f088 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -414,6 +414,7 @@ void __iget(struct inode *inode) { atomic_inc(&inode->i_count); } +EXPORT_SYMBOL(__iget); /* * get additional reference to inode; caller must already hold one. ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 4/6] ext4: Don't show the active balloon to user
From: Maxim V. Patlasov This is a port of e123b6d ext4: Don't show the active balloon to user Fix the readdir and lookup. The former pretends the inode doesn't exist, the latter denies access to it. Reporting a negative dentry in lookup is pointless, as in that case something will have to be done in the ext4_create callback :\ [VvS RH79 rebase vz7.170.x]: minor context changes (cherry picked from vz7 commit c231c40a93927f3080067e5d880ef11841de278c) Signed-off-by: Konstantin Khorenko Signed-off-by: Kirill Tkhai --- fs/ext4/dir.c | 15 ++- fs/ext4/namei.c |9 + 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ffb295aa891c..8ed108299fbb 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -123,6 +123,14 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, return 1; } +static inline int ext4_balloon(struct super_block *sb, unsigned ino) +{ + struct ext4_sb_info *sbi; + + sbi = EXT4_SB(sb); + return sbi->s_balloon_ino && (sbi->s_balloon_ino->i_ino == ino); +} + static int ext4_readdir(struct file *file, struct dir_context *ctx) { unsigned int offset; @@ -267,7 +275,8 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) } offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); - if (le32_to_cpu(de->inode)) { + if (le32_to_cpu(de->inode) && + !ext4_balloon(sb, le32_to_cpu(de->inode))) { if (!IS_ENCRYPTED(inode)) { if (!dir_emit(ctx, de->name, de->name_len, @@ -534,6 +543,9 @@ static int call_filldir(struct file *file, struct dir_context *ctx, } ctx->pos = hash2pos(file, fname->hash, fname->minor_hash); while (fname) { + if (ext4_balloon(sb, fname->inode)) + goto skip; + if (!dir_emit(ctx, fname->name, fname->name_len, fname->inode, @@ -541,6 +553,7 @@ static int call_filldir(struct file *file, struct dir_context *ctx, info->extra_fname = fname; return 1; } +skip: fname = fname->next; } return 0; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f3bbcd4efb56..4a71df8bf8d8 100644 ---
a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1797,6 +1797,11 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi iput(inode); return ERR_PTR(-EPERM); } + if (!IS_ERR(inode) && + inode == EXT4_SB(inode->i_sb)->s_balloon_ino) { + iput(inode); + return ERR_PTR(-EPERM); + } } #ifdef CONFIG_UNICODE @@ -3392,6 +3397,10 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) retval = dquot_initialize(d_inode(dentry)); if (retval) goto out_trace; +if (d_inode(dentry) == EXT4_SB(dir->i_sb)->s_balloon_ino) { + retval = -EPERM; +goto out_trace; + } handle = ext4_journal_start(dir, EXT4_HT_DIR, EXT4_DATA_TRANS_BLOCKS(dir->i_sb)); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 5/6] fs: Revert ee1904ba44bd "make alloc_file() static"
Signed-off-by: Kirill Tkhai --- fs/file_table.c |3 ++- include/linux/file.h |2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/file_table.c b/fs/file_table.c index 45437f8e1003..f624f1a069e8 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -184,7 +184,7 @@ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred) * @flags: O_... flags with which the new file will be opened * @fop: the 'struct file_operations' for the new file */ -static struct file *alloc_file(const struct path *path, int flags, +struct file *alloc_file(const struct path *path, int flags, const struct file_operations *fop) { struct file *file; @@ -210,6 +210,7 @@ static struct file *alloc_file(const struct path *path, int flags, i_readcount_inc(path->dentry->d_inode); return file; } +EXPORT_SYMBOL(alloc_file); struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt, const char *name, int flags, diff --git a/include/linux/file.h b/include/linux/file.h index 2de2e4613d7b..bdaefc80bb28 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -22,6 +22,8 @@ struct vfsmount; struct dentry; struct inode; struct path; +extern struct file *alloc_file(const struct path *path, int flags, + const struct file_operations *fop); extern struct file *alloc_file_pseudo(struct inode *, struct vfsmount *, const char *, int flags, const struct file_operations *); extern struct file *alloc_file_clone(struct file *, int flags, ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 1/6] ext4: Teach the fs where the balloon inode is
From: Maxim V. Patlasov This is a port of da0fae4 ext4: Teach the fs where the balloon inode is This adds the balloon_ino mount option and stores the inode pointer on the in-memory super block object. This is not good solution - in a perfect world the balloon inode should be hidden (like the journalling one), but this requires a) reserve its number in the mainline sources;) b) teach e2fsprogs not to treat one as orphaned Until (if) we do this it's better to keep this as a regular file on the disk. (cherry picked from vz7 commit 54ac06cf671c68a3778e9f939ba3794fd6a51470) Signed-off-by: Konstantin Khorenko Signed-off-by: Kirill Tkhai --- fs/ext4/ext4.h |2 + fs/ext4/super.c | 91 +++ 2 files changed, 87 insertions(+), 6 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3c51e243450d..9b655a94eb16 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1579,6 +1579,8 @@ struct ext4_sb_info { atomic_t s_mb_discarded; atomic_t s_lock_busy; + struct inode *s_balloon_ino; + /* locality groups */ struct ext4_locality_group __percpu *s_locality_groups; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index befbb0892fdd..3bc2cfb04518 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1682,6 +1682,7 @@ enum { #ifdef CONFIG_EXT4_DEBUG Opt_fc_debug_max_replay, Opt_fc_debug_force #endif + Opt_balloon_ino, }; static const match_table_t tokens = { @@ -1786,6 +1787,7 @@ static const match_table_t tokens = { {Opt_removed, "reservation"}, /* mount option from ext2/3 */ {Opt_removed, "noreservation"}, /* mount option from ext2/3 */ {Opt_removed, "journal=%u"},/* mount option from ext2/3 */ + {Opt_balloon_ino, "balloon_ino=%u"}, {Opt_err, NULL}, }; @@ -2009,6 +2011,7 @@ static const struct mount_opts { MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, {Opt_fc_debug_max_replay, 0, MOPT_GTE0}, #endif + {Opt_balloon_ino, 0, 0}, {Opt_err, 0, 0} }; @@ -2093,7 +2096,8 @@ struct ext4_parsed_options { static int handle_mount_opt(struct super_block *sb, char *opt, int token, substring_t *args, 
struct ext4_parsed_options *parsed_opts, - int is_remount) + + unsigned long *balloon_ino, int is_remount) { struct ext4_sb_info *sbi = EXT4_SB(sb); const struct mount_opts *m; @@ -2300,6 +2304,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } else if (token == Opt_test_dummy_encryption) { return ext4_set_test_dummy_encryption(sb, opt, &args[0], is_remount); + } else if (token == Opt_balloon_ino) { + *balloon_ino = arg; } else if (m->flags & MOPT_DATAJ) { if (is_remount) { if (!sbi->s_journal) @@ -2420,6 +2426,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, static int parse_options(char *options, struct super_block *sb, struct ext4_parsed_options *ret_opts, +unsigned long *balloon_ino, int is_remount) { struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb); @@ -2440,7 +2447,7 @@ static int parse_options(char *options, struct super_block *sb, args[0].to = args[0].from = NULL; token = match_token(p, tokens, args); if (handle_mount_opt(sb, p, token, args, ret_opts, -is_remount) < 0) +balloon_ino, is_remount) < 0) return 0; } #ifdef CONFIG_QUOTA @@ -2628,6 +2635,10 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, } else if (test_opt2(sb, DAX_INODE)) { SEQ_OPTS_PUTS("dax=inode"); } + + if (sbi->s_balloon_ino) + SEQ_OPTS_PRINT("balloon_ino=%ld", sbi->s_balloon_ino->i_ino); + ext4_show_quota_options(seq, sb); return 0; } @@ -4014,6 +4025,54 @@ static const char *ext4_quota_mode(struct super_block *sb) #endif } +static void ext4_load_balloon(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; + struct ext4_sb_info *sbi; + + sbi = EXT4_SB(sb); + + if (!ino) { + /* FIXME locking */ + if (sbi->s_balloon_ino) { + iput(sbi->s_balloon_ino); + sbi->s_balloon_ino = NULL; + } + + return; + } + + if (ino < EXT4_FIRST_INO(sb)) { + ext4_msg(sb, KERN_WARNING, "bad balloon inode specified"); + return; + } + +
[Devel] [PATCH RH9 3/6] ext4: Teach statfs to report reduced disk usage
From: Maxim V. Patlasov The magic 9 in there came from 512 bytes - the i_blocks is accounted in these units in any case. (cherry picked from vz7 commit 4b10f27018d330d9e03e932a98a41a3e55da81fb) Signed-off-by: Konstantin Khorenko Signed-off-by: Kirill Tkhai --- fs/ext4/super.c | 16 1 file changed, 16 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 03ade65fbe51..f09a2432a20e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6290,6 +6290,22 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) sb_has_quota_limits_enabled(sb, PRJQUOTA)) ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf); #endif + + if (sbi->s_balloon_ino) { + struct ext4_inode_info *ei; + blkcnt_t balloon_blocks; + + balloon_blocks = sbi->s_balloon_ino->i_blocks; + ei = EXT4_I(sbi->s_balloon_ino); + spin_lock(&ei->i_block_reservation_lock); + balloon_blocks += ei->i_reserved_data_blocks; + spin_unlock(&ei->i_block_reservation_lock); + + BUG_ON(sbi->s_balloon_ino->i_blkbits < 9); + buf->f_blocks -= balloon_blocks >> +(sbi->s_balloon_ino->i_blkbits - 9); + } + return 0; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 0/6] ext4: Balloon patches
https://jira.sw.ru/browse/PSBM-134003 --- Kirill Tkhai (2): Date: Wed Oct 7 14:47:07 2015 +0400 fs: Revert ee1904ba44bd "make alloc_file() static" Konstantin Khorenko (1): ext4: Provide a balloon nipple for management Maxim V. Patlasov (3): ext4: Teach the fs where the balloon inode is ext4: Teach statfs to report reduced disk usage ext4: Don't show the active balloon to user fs/ext4/dir.c| 15 ++- fs/ext4/ext4.h |3 + fs/ext4/ioctl.c | 59 +++ fs/ext4/namei.c |9 fs/ext4/super.c | 111 +++--- fs/file_table.c |3 + fs/inode.c |1 include/linux/file.h |2 + 8 files changed, 194 insertions(+), 9 deletions(-) -- Signed-off-by: Kirill Tkhai ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 2/6] Date: Wed Oct 7 14:47:07 2015 +0400
ve/fs: Allow to mount ext4 in top CT userns https://jira.sw.ru/browse/PSBM-40100 v2: Check that user_ns is initial for the ve. v3: Be sure ve->init_cred is set. Signed-off-by: Kirill Tkhai Acked-by: Vladimir Davydov khorenko@: in fact we allowed to do those mounts in top CT user ns only. (cherry picked from vz7 commit d8aabe8924283e12ef30dee49253f91f33d3e9bc ("ve/fs: Allow to mount ext4 and binfmt_misc under non-root ns")) Signed-off-by: Konstantin Khorenko +++ ve/fs: Allow to mount ext4 in top CT userns - cleanup After commit d5c3320347bb ("fs/ve: add new FS_VE_MOUNT flag to allow mount in container init userns") it's wise to use FS_VE_MOUNT flag instead of generic FS_USERNS_MOUNT + additional per-fs check. This patch does not change the behavior. Signed-off-by: Konstantin Khorenko Reviewed-by: Pavel Tikhomirov Signed-off-by: Kirill Tkhai --- fs/ext4/super.c |4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3bc2cfb04518..03ade65fbe51 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -6783,7 +6784,8 @@ static struct file_system_type ext4_fs_type = { .name = "ext4", .mount = ext4_mount, .kill_sb= ext4_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_VIRTUALIZED, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_VIRTUALIZED | + FS_VE_MOUNT, }; MODULE_ALIAS_FS("ext4"); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 0/2] Port part 32 SAK
https://jira.sw.ru/browse/PSBM-134449 --- Kirill Tkhai (2): tty: Avoid threads files iterations in __do_SAK() tty: Use RCU read lock to iterate tasks and threads in __do_SAK() drivers/tty/tty_io.c | 41 - 1 file changed, 28 insertions(+), 13 deletions(-) -- Signed-off-by: Kirill Tkhai ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 2/2] tty: Use RCU read lock to iterate tasks and threads in __do_SAK()
Several efforts were made long ago to make __do_SAK() work in process context, but that does not solve the problem completely. Since __do_SAK() may hold tasklist_lock for a long time, concurrent processes waiting for the write lock with interrupts disabled (e.g., forking) end up in the same situation as if __do_SAK() were executed in interrupt context. I've observed several hard lockups on a 3.10 kernel running 200 containers, caused by the long duration of copy_process()->write_lock_irq() after SAK was sent to a tty. The current mainline kernel has the same problem. The solution is to use RCU to iterate processes and threads. Task list integrity is the only reason we took tasklist_lock before, as tty subsys primitives mostly take it for reading as well (e.g., __proc_set_tty). RCU read lock is enough for that. This patch solves the problem and makes __do_SAK() no longer greedy for tasklist_lock. That should prevent the hard lockups described above. https://jira.sw.ru/browse/PSBM-80340 Signed-off-by: Kirill Tkhai Reviewed-by: Pavel Tikhomirov (cherry picked from vz7 commit 6aecb63c35a5 ("tty: Use RCU read lock to iterate tasks and threads in __do_SAK()")) Signed-off-by: Andrey Zhadchenko Signed-off-by: Kirill Tkhai --- drivers/tty/tty_io.c |4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index 535f40164c2b..e7268372edb6 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -3055,8 +3055,10 @@ void __do_SAK(struct tty_struct *tty) task_pid_nr(p), p->comm); group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID); } while_each_pid_task(session, PIDTYPE_SID, p); + read_unlock(&tasklist_lock); /* Now kill any processes that happen to have the tty open */ + rcu_read_lock(); for_each_process(p) { if (p->signal->tty == tty) { tty_notice(tty, "SAK: killed process %d (%s): by controlling tty\n", @@ -3085,7 +3087,7 @@ void __do_SAK(struct tty_struct *tty) kill: group_send_sig_info(SIGKILL,
SEND_SIG_PRIV, p, PIDTYPE_SID); } - read_unlock(&tasklist_lock); + rcu_read_unlock(); put_pid(session); #endif } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9 1/2] tty: Avoid threads files iterations in __do_SAK()
The patch makes __do_SAK() iterate a next thread files only in case of the thread's files are different to previous. I.e., if all threads points the same files_struct, the files will be iterated only once. Since all threads have the same files_struct is the generic case for most Linux systems, this improvement should clearly speed up __do_SAK() execution. Also, for_each_process()/for_each_thread() are used instead of do_each_thread()/while_each_thread(). This prepares __do_SAK() to become tasklist_lock free, and will be made in next patch. https://jira.sw.ru/browse/PSBM-80340 Suggested-by: Oleg Nesterov Signed-off-by: Kirill Tkhai Reviewed-by: Pavel Tikhomirov Rebase to vz8: - Change send_sig to group_send_sig_info to respect ms commit a8ebd17160ce ("tty_io: Use group_send_sig_info in __do_SACK to note it is a session being killed") (cherry picked from vz7 commit d61ca741c3ae ("tty: Avoid threads files iterations in __do_SAK()")) Signed-off-by: Andrey Zhadchenko Signed-off-by: Kirill Tkhai --- drivers/tty/tty_io.c | 37 + 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index a6230b25fbe5..535f40164c2b 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -3031,7 +3031,8 @@ void __do_SAK(struct tty_struct *tty) #ifdef TTY_SOFT_SAK tty_hangup(tty); #else - struct task_struct *g, *p; + struct task_struct *p, *t; + struct files_struct *files; struct pid *session; int i; unsigned long flags; @@ -3056,22 +3057,34 @@ void __do_SAK(struct tty_struct *tty) } while_each_pid_task(session, PIDTYPE_SID, p); /* Now kill any processes that happen to have the tty open */ - do_each_thread(g, p) { + for_each_process(p) { if (p->signal->tty == tty) { tty_notice(tty, "SAK: killed process %d (%s): by controlling tty\n", task_pid_nr(p), p->comm); - group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID); - continue; + goto kill; } - task_lock(p); - i = iterate_fd(p->files, 0, this_tty, tty); - if (i != 0) { - 
tty_notice(tty, "SAK: killed process %d (%s): by fd#%d\n", - task_pid_nr(p), p->comm, i - 1); - group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID); + + files = NULL; + for_each_thread(p, t) { + if (t->files == files) /* racy but we do not care */ + continue; + + task_lock(t); + files = t->files; + i = iterate_fd(files, 0, this_tty, tty); + task_unlock(t); + + if (i != 0) { + dev_notice(tty->dev, "SAK: killed process %d (%s): by fd#%d\n", + task_pid_nr(p), p->comm, i - 1); + goto kill; + } } - task_unlock(p); - } while_each_thread(g, p); + + continue; +kill: + group_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_SID); + } read_unlock(&tasklist_lock); put_pid(session); #endif ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH7] writeback: Write dirty times for WB_SYNC_ALL writeback
From: Jan Kara ms commit dc5ff2b1d66f Currently we take care to handle I_DIRTY_TIME in vfs_fsync() and queue_io() so that inodes which have only dirty timestamps are properly written on fsync(2) and sync(2). However there are other call sites - most notably going through write_inode_now() - which expect inode to be clean after WB_SYNC_ALL writeback. This is not currently true as we do not clear I_DIRTY_TIME in __writeback_single_inode() even for WB_SYNC_ALL writeback in all the cases. This then resulted in the following oops because bdev_write_inode() did not clean the inode and writeback code later stumbled over a dirty inode with detached wb. general protection fault: [#1] SMP DEBUG_PAGEALLOC KASAN Modules linked in: CPU: 3 PID: 32 Comm: kworker/u10:1 Not tainted 4.6.0-rc3+ #349 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Workqueue: writeback wb_workfn (flush-11:0) task: 88006ccf1840 ti: 88006cda8000 task.ti: 88006cda8000 RIP: 0010:[] [] locked_inode_to_wb_and_lock_list+0xa2/0x750 RSP: 0018:88006cdaf7d0 EFLAGS: 00010246 RAX: RBX: RCX: 88006ccf2050 RDX: RSI: 00114c8a8484 RDI: 0286 RBP: 88006cdaf820 R08: 88006ccf1840 R09: R10: 000229915090805f R11: 0001 R12: 88006a72f5e0 R13: dc00 R14: ed000d4e5eed R15: 8830cf40 FS: () GS:88006d50() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: 03301bf8 CR3: 6368f000 CR4: 06e0 DR0: 1ec9 DR1: DR2: DR3: DR6: 0ff0 DR7: 0600 Stack: 88006a72f680 88006a72f768 8800671230d8 03ff88006cdaf948 88006a72f668 88006a72f5e0 8800671230d8 88006cdaf948 880065b90cc8 880067123100 88006cdaf970 8188e12e Call Trace: [< inline >] inode_to_wb_and_lock_list fs/fs-writeback.c:309 [] writeback_sb_inodes+0x4de/0x1250 fs/fs-writeback.c:1554 [] __writeback_inodes_wb+0x104/0x1e0 fs/fs-writeback.c:1600 [] wb_writeback+0x7ce/0xc90 fs/fs-writeback.c:1709 [< inline >] wb_do_writeback fs/fs-writeback.c:1844 [] wb_workfn+0x2f9/0x1000 fs/fs-writeback.c:1884 [] process_one_work+0x78e/0x15c0 kernel/workqueue.c:2094 [] worker_thread+0xdb/0xfc0 
kernel/workqueue.c:2228 [] kthread+0x23f/0x2d0 drivers/block/aoe/aoecmd.c:1303 [] ret_from_fork+0x22/0x50 arch/x86/entry/entry_64.S:392 Code: 05 94 4a a8 06 85 c0 0f 85 03 03 00 00 e8 07 15 d0 ff 41 80 3e 00 0f 85 64 06 00 00 49 8b 9c 24 88 01 00 00 48 89 d8 48 c1 e8 03 <42> 80 3c 28 00 0f 85 17 06 00 00 48 8b 03 48 83 c0 50 48 39 c3 RIP [< inline >] wb_get include/linux/backing-dev-defs.h:212 RIP [] locked_inode_to_wb_and_lock_list+0xa2/0x750 fs/fs-writeback.c:281 RSP ---[ end trace 986a4d314dcb2694 ]--- Fix the problem by making sure __writeback_single_inode() writes inode only with dirty times in WB_SYNC_ALL mode. Reported-by: Dmitry Vyukov Tested-by: Laurent Dufour Signed-off-by: Jan Kara Signed-off-by: Jens Axboe This loses to dirty inode, when it's called from freeze_bdev(). So, backup loses mtime. In scope of #PSBM-134225 (but not a not final fix) Signed-off-by: Kirill Tkhai --- fs/fs-writeback.c |1 + 1 file changed, 1 insertion(+) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c16a39f4f724..1c8c27188361 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -553,6 +553,7 @@ __do_writeback_single_inode(struct inode *inode, struct writeback_control *wbc) dirty = inode->i_state & I_DIRTY; if (inode->i_state & I_DIRTY_TIME) { if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) || + wbc->sync_mode == WB_SYNC_ALL || unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) || unlikely(time_after(jiffies, (inode->dirtied_time_when + ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9] dm-tracking: Add tracking_clear cmd
Command to ACK ordered copied cluster. Signed-off-by: Kirill Tkhai --- drivers/md/dm-tracking.c | 31 +++ 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-tracking.c b/drivers/md/dm-tracking.c index e9cf0a4ae298..d723596fee44 100644 --- a/drivers/md/dm-tracking.c +++ b/drivers/md/dm-tracking.c @@ -168,6 +168,14 @@ static void dmt_dtr(struct dm_target *ti) dmt_destroy(ti->private); } +static int tracking_clear(struct dm_tracking *dmt, u64 clu) +{ + spin_lock_irq(&dmt->lock); + clear_bit(clu, dmt->bitmap); + spin_unlock_irq(&dmt->lock); + return 0; +} + static int tracking_get_next(struct dm_tracking *dmt, char *result, unsigned int maxlen) { @@ -197,10 +205,24 @@ static int tracking_get_next(struct dm_tracking *dmt, char *result, } static int dmt_cmd(struct dm_tracking *dmt, const char *suffix, + int argc, char *argv[], char *result, unsigned int maxlen) { unsigned int nr_clus, size; void *bitmap = NULL; + u64 val; + + if (!strcmp(suffix, "clear")) { + if (argc != 1 || kstrtou64(argv[0], 10, &val) < 0 || + val >= dmt->nr_clus) + return -EINVAL; + if (!dmt->bitmap) + return -ENOENT; + return tracking_clear(dmt, val); + } + + if (argc != 0) + return -EINVAL; if (!strcmp(suffix, "get_next")) { if (!dmt->bitmap) @@ -248,13 +270,14 @@ static int dmt_message(struct dm_target *ti, unsigned int argc, char **argv, return -EPERM; mutex_lock(&dmt->ctl_mutex); + ret = -EINVAL; + if (argc < 1) + goto unlock; ret = -ENOTSUPP; if (strncmp(argv[0], "tracking_", 9)) goto unlock; - ret = -EINVAL; - if (argc != 1) - goto unlock; - ret = dmt_cmd(dmt, argv[0] + 9, result, maxlen); + ret = dmt_cmd(dmt, argv[0] + 9, argc - 1, + &argv[1], result, maxlen); unlock: mutex_unlock(&dmt->ctl_mutex); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH8] dm-tracking: Add tracking_clear cmd
Command to ACK ordered copied cluster. Signed-off-by: Kirill Tkhai --- drivers/md/dm-tracking.c | 31 +++ 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-tracking.c b/drivers/md/dm-tracking.c index e9cf0a4ae298..d723596fee44 100644 --- a/drivers/md/dm-tracking.c +++ b/drivers/md/dm-tracking.c @@ -168,6 +168,14 @@ static void dmt_dtr(struct dm_target *ti) dmt_destroy(ti->private); } +static int tracking_clear(struct dm_tracking *dmt, u64 clu) +{ + spin_lock_irq(&dmt->lock); + clear_bit(clu, dmt->bitmap); + spin_unlock_irq(&dmt->lock); + return 0; +} + static int tracking_get_next(struct dm_tracking *dmt, char *result, unsigned int maxlen) { @@ -197,10 +205,24 @@ static int tracking_get_next(struct dm_tracking *dmt, char *result, } static int dmt_cmd(struct dm_tracking *dmt, const char *suffix, + int argc, char *argv[], char *result, unsigned int maxlen) { unsigned int nr_clus, size; void *bitmap = NULL; + u64 val; + + if (!strcmp(suffix, "clear")) { + if (argc != 1 || kstrtou64(argv[0], 10, &val) < 0 || + val >= dmt->nr_clus) + return -EINVAL; + if (!dmt->bitmap) + return -ENOENT; + return tracking_clear(dmt, val); + } + + if (argc != 0) + return -EINVAL; if (!strcmp(suffix, "get_next")) { if (!dmt->bitmap) @@ -248,13 +270,14 @@ static int dmt_message(struct dm_target *ti, unsigned int argc, char **argv, return -EPERM; mutex_lock(&dmt->ctl_mutex); + ret = -EINVAL; + if (argc < 1) + goto unlock; ret = -ENOTSUPP; if (strncmp(argv[0], "tracking_", 9)) goto unlock; - ret = -EINVAL; - if (argc != 1) - goto unlock; - ret = dmt_cmd(dmt, argv[0] + 9, result, maxlen); + ret = dmt_cmd(dmt, argv[0] + 9, argc - 1, + &argv[1], result, maxlen); unlock: mutex_unlock(&dmt->ctl_mutex); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
Re: [Devel] [PATCH rh9 03/11] ve/printk: Virtualize syslog_*
On 29.09.2021 22:24, Konstantin Khorenko wrote: > From: Vladimir Davydov > > https://jira.sw.ru/browse/PSBM-17899 > > Signed-off-by: Vladimir Davydov > Signed-off-by: Stanislav Kinsburskiy > > +++ > ve/printk: Fix printk virtualization > > ve_printk() corrupts host's dmesg: > # dmesg|wc -l > 599 > # vzctl create 101 > # vzctl set 101 --netif_add eth0 --save > # vzctl start 101 > # vzctl exec 101 'tcpdump -w tcpdump.out -U -n -i eth0 esp' > # dmesg|wc -l > 2 > > Add missing parts of prinkt virtualization to fix this. > > https://jira.sw.ru/browse/PSBM-17899 > https://jira.sw.ru/browse/PSBM-105442 > > Signed-off-by: Andrey Ryabinin > > Rebasing to vz9: part of vz8 commit: > d63aeb311a64 ("ve/printk: printk virtualization") > > https://jira.sw.ru/browse/PSBM-133985 > > Signed-off-by: Konstantin Khorenko > --- > kernel/printk/printk.c | 60 +- > 1 file changed, 30 insertions(+), 30 deletions(-) > > diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c > index a1dedbc88426..70dbf204c052 100644 > --- a/kernel/printk/printk.c > +++ b/kernel/printk/printk.c > @@ -360,13 +360,6 @@ enum log_flags { > static DEFINE_RAW_SPINLOCK(syslog_lock); > > #ifdef CONFIG_PRINTK > -DECLARE_WAIT_QUEUE_HEAD(log_wait); log_wait is still used in code, so we should move this to another patch. > -/* All 3 protected by @syslog_lock. */ > -/* the next printk record to read by syslog(READ) or /proc/kmsg */ > -static u64 syslog_seq; > -static size_t syslog_partial; > -static bool syslog_time; > - > /* All 3 protected by @console_sem. */ > /* the next printk record to write to the console */ > static u64 console_seq; > @@ -418,6 +411,12 @@ static struct log_state { > char *buf; > u32 buf_len; > > + /* All 3 protected by @syslog_lock. */ > + /* the next printk record to read by syslog(READ) or /proc/kmsg */ > + u64 syslog_seq; > + size_t syslog_partial; > + bool syslog_time; > + > /* >* The next printk record to read after the last 'clear' command. 
There > are >* two copies (updated with seqcount_latch) so that reads can locklessly > @@ -1546,35 +1545,35 @@ static int syslog_print(struct log_state *log, > > printk_safe_enter_irq(); > raw_spin_lock(&syslog_lock); > - if (!prb_read_valid(log->prb, syslog_seq, &r)) { > + if (!prb_read_valid(log->prb, log->syslog_seq, &r)) { > raw_spin_unlock(&syslog_lock); > printk_safe_exit_irq(); > break; > } > - if (r.info->seq != syslog_seq) { > + if (r.info->seq != log->syslog_seq) { > /* message is gone, move to next valid one */ > - syslog_seq = r.info->seq; > - syslog_partial = 0; > + log->syslog_seq = r.info->seq; > + log->syslog_partial = 0; > } > > /* >* To keep reading/counting partial line consistent, >* use printk_time value as of the beginning of a line. >*/ > - if (!syslog_partial) > - syslog_time = printk_time; > + if (!log->syslog_partial) > + log->syslog_time = printk_time; > > - skip = syslog_partial; > - n = record_print_text(&r, true, syslog_time); > - if (n - syslog_partial <= size) { > + skip = log->syslog_partial; > + n = record_print_text(&r, true, log->syslog_time); > + if (n - log->syslog_partial <= size) { > /* message fits into buffer, move forward */ > - syslog_seq = r.info->seq + 1; > - n -= syslog_partial; > - syslog_partial = 0; > + log->syslog_seq = r.info->seq + 1; > + n -= log->syslog_partial; > + log->syslog_partial = 0; > } else if (!len){ > /* partial read(), remember position */ > n = size; > - syslog_partial += n; > + log->syslog_partial += n; > } else > n = 0; > raw_spin_unlock(&syslog_lock); > @@ -1669,10 +1668,11 @@ static void syslog_clear(struct log_state *log) > /* Return a consistent copy of @syslog_seq. 
*/ > static u64 read_syslog_seq_irq(void) > { > + struct log_state *log = ve_log_state(); > u64 seq; > > raw_spin_lock_irq(&syslog_lock); > - seq = syslog_seq; > + seq = log->syslog_seq; > raw_spin_unlock_irq(&syslog_lock); > > return seq; > @@ -1707,7 +1707,7 @@ int do_syslog(int type, char __user *buf, int len, int > source) > prb_read_valid(log->prb, read_syslo
Re: [Devel] [PATCH rh9 01/11] ve/printk: Introduce struct "log_state" and virtualize log_buf/log_buf_len
On 29.09.2021 22:24, Konstantin Khorenko wrote: > From: Vladimir Davydov > > https://jira.sw.ru/browse/PSBM-17899 > > Signed-off-by: Vladimir Davydov > Signed-off-by: Stanislav Kinsburskiy > > +++ > ve/printk: Fix printk virtualization > > ve_printk() corrupts host's dmesg: > # dmesg|wc -l > 599 > # vzctl create 101 > # vzctl set 101 --netif_add eth0 --save > # vzctl start 101 > # vzctl exec 101 'tcpdump -w tcpdump.out -U -n -i eth0 esp' > # dmesg|wc -l > 2 > > Add missing parts of prinkt virtualization to fix this. > > https://jira.sw.ru/browse/PSBM-17899 > https://jira.sw.ru/browse/PSBM-105442 > > Signed-off-by: Andrey Ryabinin > > Rebasing to vz9: part of vz8 commit: > d63aeb311a64 ("ve/printk: printk virtualization") > > https://jira.sw.ru/browse/PSBM-133985 > > Signed-off-by: Konstantin Khorenko > --- > include/linux/printk.h | 13 > include/linux/ve.h | 3 + > kernel/printk/printk.c | 165 ++--- > kernel/ve/ve.c | 8 ++ > 4 files changed, 164 insertions(+), 25 deletions(-) > > diff --git a/include/linux/printk.h b/include/linux/printk.h > index e834d78f0478..f178e2e5d7f5 100644 > --- a/include/linux/printk.h > +++ b/include/linux/printk.h > @@ -176,6 +176,10 @@ int vprintk(const char *fmt, va_list args); > asmlinkage __printf(1, 2) __cold > int printk(const char *fmt, ...); > > +struct ve_struct; > +int ve_log_init(struct ve_struct *ve); > +void ve_log_destroy(struct ve_struct *ve); > + > /* > * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ ! > */ > @@ -222,6 +226,15 @@ int printk(const char *s, ...) > { > return 0; > } > +static inline > +int ve_log_init(struct ve_struct *ve) > +{ > + return 0; > +} > +static inline > +void ve_log_destroy(struct ve_struct *ve) > +{ > +} > static inline __printf(1, 2) __cold > int printk_deferred(const char *s, ...) 
> { > diff --git a/include/linux/ve.h b/include/linux/ve.h > index 248cdeb0a2e4..552fa577e2f9 100644 > --- a/include/linux/ve.h > +++ b/include/linux/ve.h > @@ -50,6 +50,9 @@ struct ve_struct { > /* see vzcalluser.h for VE_FEATURE_XXX definitions */ > __u64 features; > > + void*log_state; > +#define VE_LOG_BUF_LEN 4096 > + > struct kstat_lat_pcpu_structsched_lat_ve; > > struct kmapset_key sysfs_perms_key; > diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c > index 142a58d124d9..77e6787c752e 100644 > --- a/kernel/printk/printk.c > +++ b/kernel/printk/printk.c > @@ -44,6 +44,7 @@ > #include > #include > #include > +#include > #include > #include > #include > @@ -408,8 +409,6 @@ static struct latched_seq clear_seq = { > #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) > #define LOG_BUF_LEN_MAX (u32)(1 << 31) > static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); > -static char *log_buf = __log_buf; > -static u32 log_buf_len = __LOG_BUF_LEN; > > /* > * Define the average message size. This only affects the number of > @@ -427,6 +426,34 @@ _DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT > - PRB_AVGBITS, > static struct printk_ringbuffer printk_rb_dynamic; > > static struct printk_ringbuffer *prb = &printk_rb_static; > +static struct log_state { > + char *buf; > + u32 buf_len; > +} init_log_state = { > + .buf = __log_buf, > + .buf_len = __LOG_BUF_LEN, > +}; Maybe, we move this hunk up to previous hunk "-"? -static char *log_buf = __log_buf; -static u32 log_buf_len = __LOG_BUF_LEN; +static struct log_state { + char *buf; ... + .buf = __log_buf, + .buf_len = __LOG_BUF_LEN, ... So, it clearer to understand we just reassigned __log_buf etc here? 
> + > +/* kdump relies on some log_* symbols, let's make it happy */ > +#define DEFINE_STRUCT_MEMBER_ALIAS(name, inst, memb) \ > +static void ## name ## _definition(void) __attribute__((used)); \ > +static void ## name ## _definition(void) > \ > +{\ > + asm (".globl " #name "\n\t.set " #name ", " #inst "+%c0"\ > + : : "g" (offsetof(typeof(inst), memb))); \ > +}\ > +extern typeof(inst.memb) name; > +#undef DEFINE_STRUCT_MEMBER_ALIAS Maybe we should move this define to patch where it's used? > + > +static inline struct log_state *ve_log_state(void) > +{ > + struct log_state *log = &init_log_state; > +#ifdef CONFIG_VE > + if (get_exec_env()->log_state) > + log = get_exec_env()->log_state; > +#endif > + return log; > +} > > /* > * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before > @@ -468,13 +495,13 @@ static u64 latched_seq_read_nolock(struct latched_seq > *ls) > /* Return log buffer address */ >
[Devel] [PATCH RH9] cbt: Add config
Signed-off-by: Kirill Tkhai --- .../custom-overrides/generic/CONFIG_BLK_DEV_CBT|1 + 1 file changed, 1 insertion(+) create mode 100644 redhat/configs/custom-overrides/generic/CONFIG_BLK_DEV_CBT diff --git a/redhat/configs/custom-overrides/generic/CONFIG_BLK_DEV_CBT b/redhat/configs/custom-overrides/generic/CONFIG_BLK_DEV_CBT new file mode 100644 index ..03d9fa0ae0cf --- /dev/null +++ b/redhat/configs/custom-overrides/generic/CONFIG_BLK_DEV_CBT @@ -0,0 +1 @@ +CONFIG_BLK_DEV_CBT=y ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9] cbt: introduce changed block tracking
Combined patch including original patch and fixes: 93f326594516 block/blk-cbt.c: copyright update 20ff882b3eb0 cbt: selfdeadlock in __blk_cbt_set() c22ab989e4f9 cbt: bitmap corruption caused by ipi 4bb4b6b568d9 cbt: license: put correct copyrights into file headers d991ef45b2bc cbt: don't leak ce_reserved64 in cbt_ito userspace c91a55b3c347 cbt: blk_cbt_update_size() must return if cbt->block_max not changed 8c3db04ecfd5 cbt: blk_cbt_update_size() should not copy uninitialized data 92039d6d0a9a cbt: fix possible race on alloc_page() c55fac6426e7 cbt: new api: blk_cbt_map_merge() d54e079764bd cbt: fix panic in blk_cbt_map_copy_once() c2c61c3eb41c cbt: fix cbt->block_max calculation 737d22a7d677 cbt: add uuid arg to blk_cbt_map_copy_once() 3040064c1c69 cbt: add blk_cbt_map_copy_once() helper 981be8f5e23d cbt: fix page allocation 99f0cacd3cba cbt: make __blk_cbt_set() smarter 7324f0cc6139 cbt: introduce CBT_PAGE_MISSED 7588bf9d56c9 cbt: factor out alloc_page d9511b56e79e cbt: introduce changed block tracking @ktkhai: Backport changes were made for bvec_iter dereferencing. Signed-off-by: Kirill Tkhai +++ cbt: Update CBT size from check_disk_size_change() Here is customer node, where is CBT size is different to ploop size. Searching against kernel code shows, this is the only place we skip CBT size update after bd_inode size change. https://jira.sw.ru/browse/PSBM-123819 Signed-off-by: Kirill Tkhai (cherry picked from vz7 commit 042072dc3899 ("cbt: Update CBT size from check_disk_size_change()")) Signed-off-by: Vasily Averin +++ 60576729f55c cbt: New interface to save current mask snapshot in cbt During the backup, we want to save current changed mask and to start tracking from clean mask again. Previously, the mask was saved in another driver: ploop used to call cbt primitives and saved it in ploop device structures. This looks better than saving the mask in userspace, because the mask remains alive even in case of userspace death. 
The only thing needed after died backup is to merge the saved mask back from ploop driver to cbt driver. Thus, all changed (from previous successful backup) blocks are still available, and it's possible to create partial backup even after segfaulted userspace. This patchset continues the practice of saving mask in kernel, but it makes possible to save CBT snapshot in cbt driver without distributing CBT structures over the kernel. Here is a new BLKCBTMISC ioctl, which allows to create, drop and merge back a snapshot. The ioctl has 3 switches: * CMI_SNP_CREATE: create a new mask snapshot and move changed blocks mask there (changed blocks mask becomes empty after that). * CMI_SNP_DROP: drops created snapshot (should be called after successful backup). * CMI_SNP_MERGE_BACK: moves snapshot bits into changing blocks mask and kills snapshot (should be called after failed backup). +++ cbt: Change errno values for new ioctl cbt: Actually show errors on return cbt: Rename misc commands names cbt: Add size to CBT_SNAP_CREATE cbt: Rename also blk_user_cbt_snp_create cbt: Move cbt_flush_cache() before size calculation cbt: Fix off-by-one in map_required_size() cbt: endless loop on rollback in blk_cbt_snap_create() cbt: Fix off-by-one in map_required_size() Signed-off-by: Kirill Tkhai --- block/Makefile |1 block/blk-cbt.c | 1035 ++ block/blk-core.c |1 block/blk-sysfs.c|1 block/genhd.c| 13 - block/ioctl.c|6 block/partitions/Kconfig |8 block/partitions/core.c |4 include/linux/blkdev.h | 25 + include/uapi/linux/fs.h | 53 ++ 10 files changed, 1141 insertions(+), 6 deletions(-) create mode 100644 block/blk-cbt.c diff --git a/block/Makefile b/block/Makefile index 1e1afa10f869..5c5f703bcbd2 100644 --- a/block/Makefile +++ b/block/Makefile @@ -40,3 +40,4 @@ obj-$(CONFIG_BLK_SED_OPAL)+= sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION)+= keyslot-manager.o blk-crypto.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o 
+obj-$(CONFIG_BLK_DEV_CBT) += blk-cbt.o diff --git a/block/blk-cbt.c b/block/blk-cbt.c new file mode 100644 index ..e8eee11a87ba --- /dev/null +++ b/block/blk-cbt.c @@ -0,0 +1,1035 @@ +/* + * block/blk-cbt.c + * + * Copyright (c) 2010-2015 Parallels IP Holdings GmbH + * Copyright (c) 2017-2021 Virtuozzo International GmbH. All rights reserved. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CBT_MAX_EXTENTS512 +#define NR_PAGES(bits) (((bits) + PAGE_SIZE*8 - 1) / (PAGE_SIZE*8)) +#define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3)) + +#define CBT_PAGE_MISSED (struct page *)(0x1) +#define CBT_PAGE(cbt, idx) (cbt->map[idx] == CBT_PAGE_MISSED ? \ +
[Devel] [PATCH RH8] cbt: Fix off-by-one in map_required_size()
Instead of: return DIV_ROUND_UP(bit, 8) + page * PAGE_SIZE; we have to have: return DIV_ROUND_UP(bit, 8) + (page - 1) * PAGE_SIZE; But rather than adjusting the formula, we make @page zero-based, in the standard C way. Signed-off-by: Kirill Tkhai --- block/blk-cbt.c |9 - 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/block/blk-cbt.c b/block/blk-cbt.c index 1ccc393f1419..e8eee11a87ba 100644 --- a/block/blk-cbt.c +++ b/block/blk-cbt.c @@ -315,15 +315,14 @@ static unsigned long map_required_size(struct page **map, unsigned long block_ma { unsigned long bit, page, npages = NR_PAGES(block_max); - for (page = npages; page > 0; page--) { - if (map[page-1]) + for (page = npages - 1; page != ULONG_MAX; page--) { + if (map[page]) break; } - - if (page == 0) + if (page == ULONG_MAX) return 0; - bit = find_last_bit(page_address(map[page - 1]), PAGE_SIZE); + bit = find_last_bit(page_address(map[page]), PAGE_SIZE); if (bit >= PAGE_SIZE) bit = 0; /* Not found */ else ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9] push_backup: Do not take write lock on ENOTTY
Userspace may pass an unknown command. Do not take the write lock in that case; take the read lock instead. Signed-off-by: Kirill Tkhai --- drivers/md/dm-push-backup.c | 23 +++ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-push-backup.c b/drivers/md/dm-push-backup.c index 6d7b1859298a..1500d0681cee 100644 --- a/drivers/md/dm-push-backup.c +++ b/drivers/md/dm-push-backup.c @@ -329,11 +329,10 @@ static void pb_release_clone(struct request *clone, blk_put_request(clone); } -static bool msg_wants_down_read(const char *cmd) +static bool msg_wants_down_write(const char *cmd) { - if (!strcmp(cmd, "push_backup_read") || - !strcmp(cmd, "push_backup_write") || - !strcmp(cmd, "push_backup_statistics")) + if (!strcmp(cmd, "push_backup_start") || + !strcmp(cmd, "push_backup_stop")) return true; return false; @@ -567,7 +566,7 @@ static int pb_message(struct dm_target *ti, unsigned int argc, char **argv, struct push_backup *pb = ti->private; int ret = -EPERM; u64 val, val2; - bool read; + bool write; if (!capable(CAP_SYS_ADMIN)) goto out; @@ -576,11 +575,11 @@ static int pb_message(struct dm_target *ti, unsigned int argc, char **argv, if (argc < 1) goto out; - read = msg_wants_down_read(argv[0]); - if (read) - ret = down_read_killable(&pb->ctl_rwsem); - else + write = msg_wants_down_write(argv[0]); + if (write) ret = down_write_killable(&pb->ctl_rwsem); + else + ret = down_read_killable(&pb->ctl_rwsem); if (unlikely(ret)) goto out; @@ -612,10 +611,10 @@ static int pb_message(struct dm_target *ti, unsigned int argc, char **argv, } unlock: - if (read) - up_read(&pb->ctl_rwsem); - else + if (write) up_write(&pb->ctl_rwsem); + else + up_read(&pb->ctl_rwsem); out: return ret; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9] push_backup: Take rwsem killable
Take ctl_rwsem with the killable variants, so that a process waiting for the lock can be killed. Signed-off-by: Kirill Tkhai --- drivers/md/dm-push-backup.c |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-push-backup.c b/drivers/md/dm-push-backup.c index 75f080fe34cf..6d7b1859298a 100644 --- a/drivers/md/dm-push-backup.c +++ b/drivers/md/dm-push-backup.c @@ -578,9 +578,11 @@ static int pb_message(struct dm_target *ti, unsigned int argc, char **argv, read = msg_wants_down_read(argv[0]); if (read) - down_read(&pb->ctl_rwsem); + ret = down_read_killable(&pb->ctl_rwsem); else - down_write(&pb->ctl_rwsem); + ret = down_write_killable(&pb->ctl_rwsem); + if (unlikely(ret)) + goto out; if (!strcmp(argv[0], "push_backup_start")) { if (argc < 2 || argc > 3) ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH9] push_backup: Do not take write lock on statistics
Taking the write lock for the "push_backup_statistics" command is overkill; take the read lock instead. Signed-off-by: Kirill Tkhai --- drivers/md/dm-push-backup.c |3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-push-backup.c b/drivers/md/dm-push-backup.c index 16e8cf27211f..75f080fe34cf 100644 --- a/drivers/md/dm-push-backup.c +++ b/drivers/md/dm-push-backup.c @@ -332,7 +332,8 @@ static void pb_release_clone(struct request *clone, static bool msg_wants_down_read(const char *cmd) { if (!strcmp(cmd, "push_backup_read") || - !strcmp(cmd, "push_backup_write")) + !strcmp(cmd, "push_backup_write") || + !strcmp(cmd, "push_backup_statistics")) return true; return false; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel