[Devel] [PATCH rh8 v3 2/2] ve/proc: Added separate start time field to task_struct to show in container

2020-12-21 Thread Konstantin Khorenko
From: Valeriy Vdovin 

Introduced 'real_start_time_ct' field in task_struct.

The value is READ:
1. When the process lives inside of a ve group and any process
inside of the same ve group wants to know its start time by reading
its /proc/[pid]/stat file.
2. At container suspend operation to store this value to a dump image.

The value is WRITTEN:
1. At creation time (copy_process function)
1.1. If a process is being created outside of a ve group / on the host,
this value is initialized to 0.
1.2. If a process is being created by a process already living in a ve
group, this value is calculated as host_uptime - ve_uptime.

2. During attach to a ve (ve_attach function). A process can be created on
the host and later attached to a ve. Its container start_time value has
already been initialized to 0 at creation time. After the process enters
the domain of a ve, the value should be initialized.
Note that the process can be attached to a non-running container, in which
case its start_time value is not calculated and is left initialized to 0.

3. At container restore via prctl (prctl_set_task_ct_fields function).
In this case the value is only settable from outside of a container.
During restore the processes are created from the dump image.
At the restore step each process executes prctl to set its start_time
value read from the dump. This is only permitted while the ve is in
pseudosuper mode. The value is set as is (as read from the dump), without
any calculations.
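
For illustration (this is not a hunk from this patch): given the
ve_set_task_start_time() helper added below in include/linux/ve.h, the
copy_process() part of case 1 above is expected to boil down to roughly:

	/* sketch of the fork-time init described in (1); 'p' is the new
	 * child task, get_exec_env() returns the ve of the creator */
	#ifdef CONFIG_VE
		ve_set_task_start_time(get_exec_env(), p);
	#endif

so a task forked inside a running ve records the ve uptime at the moment
of fork, and in all other cases the field stays 0.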

https://jira.sw.ru/browse/PSBM-64123

Signed-off-by: Valeriy Vdovin 

(cherry picked from vz7 commit eca790eaed527bae7029b4ae1cd557ce847ac6c0)
Signed-off-by: Konstantin Khorenko 

v2: rebased to branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz branch
v3: added missing ve.h include
---
 fs/proc/array.c| 12 +++-
 include/linux/sched.h  |  7 ++-
 include/linux/ve.h | 16 
 include/uapi/linux/prctl.h |  6 ++
 kernel/fork.c  | 12 
 kernel/sys.c   | 23 +++
 kernel/ve/ve.c |  2 ++
 7 files changed, 68 insertions(+), 10 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index ba712f18e5ff..735876a51a18 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -555,16 +555,10 @@ static int do_task_stat(struct seq_file *m, struct 
pid_namespace *ns,
start_time = task->real_start_time;
 
 #ifdef CONFIG_VE
-   if (!is_super) {
-   u64 offset = get_exec_env()->real_start_time;
-   start_time -= (unsigned long long)offset;
-   }
-   /* tasks inside a CT can have negative start time e.g. if the CT was
-* migrated from another hw node, in which case we will report 0 in
-* order not to confuse userspace */
-   if ((s64)start_time < 0)
-   start_time = 0;
+   if (!is_super)
+   start_time = task->real_start_time_ct;
 #endif
+
/* convert nsec -> ticks */
start_time = nsec_to_clock_t(start_time);
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 19ca9cc0f3b9..9846553f7039 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -839,7 +839,6 @@ struct task_struct {
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
struct vtime		vtime;
 #endif
-
 #ifdef CONFIG_NO_HZ_FULL
atomic_t		tick_dep_mask;
 #endif
@@ -853,6 +852,12 @@ struct task_struct {
/* Boot based time in nsecs: */
u64 real_start_time;
 
+   /*
+* This is a Container-side copy of 'real_start_time' field
+* shown from inside of a Container and modified by host.
+*/
+   u64 real_start_time_ct;
+
/* MM fault and swap info: this can arguably be seen as either 
mm-specific or thread-specific: */
unsigned long   min_flt;
unsigned long   maj_flt;
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 3aa0ea0b1bab..ab8da4dceec1 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -148,6 +148,22 @@ static u64 ve_get_uptime(struct ve_struct *ve)
return ktime_get_boot_ns() - ve->real_start_time;
 }
 
+static inline void ve_set_task_start_time(struct ve_struct *ve,
+ struct task_struct *t)
+{
+   /*
+* Mitigate memory access reordering risks by doing double check,
+* 'is_running' could be read as 1 before we see
+* 'real_start_time' updated here. If it's still 0,
+* we know 'is_running' is being modified right NOW in
+* parallel so it's safe to say that start time is also 0.
+*/
+   if (!ve->is_running || !ve->real_start_time)
+   t->real_start_time_ct = 0;
+   else
+   t->real_start_time_ct = ve_get_uptime(ve);
+}

[Devel] [PATCH RHEL8 COMMIT] config.OpenVZ.minimal: Enable infiniband for fast-path for vStorage

2020-12-21 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.3
-->
commit b1517a7192febd59da249c6391bbff56094ceaaf
Author: Konstantin Khorenko 
Date:   Mon Dec 21 14:58:35 2020 +0300

config.OpenVZ.minimal: Enable infiniband for fast-path for vStorage

+CONFIG_INFINIBAND=y
+CONFIG_INFINIBAND_ADDR_TRANS=y
+CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS=y

Infiniband is required for fast-path for vStorage

Signed-off-by: Konstantin Khorenko 
---
 configs/kernel-4.18.0-x86_64-KVM-minimal.config | 28 -
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/configs/kernel-4.18.0-x86_64-KVM-minimal.config 
b/configs/kernel-4.18.0-x86_64-KVM-minimal.config
index c8cb0796ed12..83d8142847c8 100644
--- a/configs/kernel-4.18.0-x86_64-KVM-minimal.config
+++ b/configs/kernel-4.18.0-x86_64-KVM-minimal.config
@@ -433,6 +433,7 @@ CONFIG_BLK_DEV_CBT=y
 CONFIG_BLOCK_COMPAT=y
 CONFIG_BLK_MQ_PCI=y
 CONFIG_BLK_MQ_VIRTIO=y
+CONFIG_BLK_MQ_RDMA=y
 CONFIG_BLK_PM=y
 
 #
@@ -953,6 +954,7 @@ CONFIG_XFRM_SUB_POLICY=y
 CONFIG_XFRM_MIGRATE=y
 CONFIG_XFRM_STATISTICS=y
 # CONFIG_NET_KEY is not set
+# CONFIG_SMC is not set
 CONFIG_XDP_SOCKETS=y
 CONFIG_XDP_SOCKETS_DIAG=y
 CONFIG_INET=y
@@ -1586,6 +1588,7 @@ CONFIG_BLK_DEV_LOOP_MIN_COUNT=8
 # NVME Support
 #
 # CONFIG_BLK_DEV_NVME is not set
+# CONFIG_NVME_RDMA is not set
 # CONFIG_NVME_FC is not set
 # CONFIG_NVME_TARGET is not set
 
@@ -3614,7 +3617,27 @@ CONFIG_LEDS_TRIGGER_DISK=y
 # CONFIG_LEDS_TRIGGER_PATTERN is not set
 # CONFIG_LEDS_TRIGGER_AUDIO is not set
 # CONFIG_ACCESSIBILITY is not set
-# CONFIG_INFINIBAND is not set
+CONFIG_INFINIBAND=y
+CONFIG_INFINIBAND_USER_MAD=y
+CONFIG_INFINIBAND_USER_ACCESS=y
+# CONFIG_INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI is not set
+CONFIG_INFINIBAND_USER_MEM=y
+CONFIG_INFINIBAND_ON_DEMAND_PAGING=y
+CONFIG_INFINIBAND_ADDR_TRANS=y
+CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS=y
+# CONFIG_INFINIBAND_MTHCA is not set
+# CONFIG_INFINIBAND_EFA is not set
+# CONFIG_MLX4_INFINIBAND is not set
+# CONFIG_INFINIBAND_OCRDMA is not set
+# CONFIG_INFINIBAND_USNIC is not set
+# CONFIG_INFINIBAND_BNXT_RE is not set
+# CONFIG_INFINIBAND_RDMAVT is not set
+# CONFIG_RDMA_RXE is not set
+# CONFIG_RDMA_SIW is not set
+# CONFIG_INFINIBAND_IPOIB is not set
+# CONFIG_INFINIBAND_SRP is not set
+# CONFIG_INFINIBAND_ISER is not set
+# CONFIG_INFINIBAND_OPA_VNIC is not set
 CONFIG_EDAC_ATOMIC_SCRUB=y
 CONFIG_EDAC_SUPPORT=y
 CONFIG_EDAC=y
@@ -4102,6 +4125,7 @@ CONFIG_CUSE=y
 # CONFIG_VIRTIO_FS is not set
 # CONFIG_FUSE_KIO_NOOP is not set
 # CONFIG_FUSE_KIO_NULLIO is not set
+# CONFIG_FUSE_KIO_PCS is not set
 CONFIG_OVERLAY_FS=y
 # CONFIG_OVERLAY_FS_REDIRECT_DIR is not set
 CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW=y
@@ -4221,6 +4245,7 @@ CONFIG_SUNRPC_BACKCHANNEL=y
 CONFIG_RPCSEC_GSS_KRB5=y
 # CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES is not set
 CONFIG_SUNRPC_DEBUG=y
+CONFIG_SUNRPC_XPRT_RDMA=y
 # CONFIG_CEPH_FS is not set
 # CONFIG_CIFS is not set
 # CONFIG_CODA_FS is not set
@@ -4569,6 +4594,7 @@ CONFIG_SECURITY_WRITABLE_HOOKS=y
 CONFIG_SECURITYFS=y
 CONFIG_SECURITY_NETWORK=y
 CONFIG_PAGE_TABLE_ISOLATION=y
+# CONFIG_SECURITY_INFINIBAND is not set
 CONFIG_SECURITY_NETWORK_XFRM=y
 # CONFIG_SECURITY_PATH is not set
 CONFIG_INTEL_TXT=y


[Devel] [PATCH RHEL8 COMMIT] config.minimal: Align minimal config options for RHEL8.3 based kernel

2020-12-21 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.3
-->
commit 573a50c973f989aecb6e0beeace679a8463b900c
Author: Konstantin Khorenko 
Date:   Mon Dec 21 14:50:44 2020 +0300

config.minimal: Align minimal config options for RHEL8.3 based kernel

Signed-off-by: Konstantin Khorenko 
---
 configs/kernel-4.18.0-x86_64-KVM-minimal.config | 231 +---
 1 file changed, 166 insertions(+), 65 deletions(-)

diff --git a/configs/kernel-4.18.0-x86_64-KVM-minimal.config 
b/configs/kernel-4.18.0-x86_64-KVM-minimal.config
index 29f4de20def4..c8cb0796ed12 100644
--- a/configs/kernel-4.18.0-x86_64-KVM-minimal.config
+++ b/configs/kernel-4.18.0-x86_64-KVM-minimal.config
@@ -1,10 +1,10 @@
 #
 # Automatically generated file; DO NOT EDIT.
-# Linux/x86 4.18.0.ovz.3.11 Kernel Configuration
+# Linux/x86 4.18.0.ovz.5.3 Kernel Configuration
 #
 
 #
-# Compiler: gcc (GCC) 8.2.1 20180905 (Red Hat 8.2.1-3)
+# Compiler: gcc (GCC) 8.3.1 20191121 (Red Hat 8.3.1-5)
 #
 CONFIG_64BIT=y
 CONFIG_X86_64=y
@@ -24,7 +24,6 @@ CONFIG_GENERIC_BUG=y
 CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y
 CONFIG_GENERIC_HWEIGHT=y
 CONFIG_ARCH_MAY_HAVE_PC_FDC=y
-CONFIG_RWSEM_XCHGADD_ALGORITHM=y
 CONFIG_GENERIC_CALIBRATE_DELAY=y
 CONFIG_ARCH_HAS_CPU_RELAX=y
 CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
@@ -47,8 +46,9 @@ CONFIG_FIX_EARLYCON_MEM=y
 CONFIG_DYNAMIC_PHYSICAL_MASK=y
 CONFIG_PGTABLE_LEVELS=5
 CONFIG_CC_IS_GCC=y
-CONFIG_GCC_VERSION=80201
+CONFIG_GCC_VERSION=80301
 CONFIG_CLANG_VERSION=0
+CONFIG_CC_CAN_LINK=y
 CONFIG_IRQ_WORK=y
 CONFIG_BUILDTIME_EXTABLE_SORT=y
 CONFIG_THREAD_INFO_IN_TASK=y
@@ -83,8 +83,6 @@ CONFIG_CROSS_MEMORY_ATTACH=y
 CONFIG_AUDIT=y
 CONFIG_HAVE_ARCH_AUDITSYSCALL=y
 CONFIG_AUDITSYSCALL=y
-CONFIG_AUDIT_WATCH=y
-CONFIG_AUDIT_TREE=y
 
 #
 # IRQ subsystem
@@ -94,10 +92,12 @@ CONFIG_GENERIC_IRQ_SHOW=y
 CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y
 CONFIG_GENERIC_PENDING_IRQ=y
 CONFIG_GENERIC_IRQ_MIGRATION=y
+CONFIG_HARDIRQS_SW_RESEND=y
 CONFIG_IRQ_DOMAIN=y
 CONFIG_IRQ_DOMAIN_HIERARCHY=y
 CONFIG_GENERIC_MSI_IRQ=y
 CONFIG_GENERIC_MSI_IRQ_DOMAIN=y
+CONFIG_IRQ_MSI_IOMMU=y
 CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR=y
 CONFIG_GENERIC_IRQ_RESERVATION_MODE=y
 CONFIG_IRQ_FORCED_THREADING=y
@@ -129,12 +129,15 @@ CONFIG_HIGH_RES_TIMERS=y
 CONFIG_VIRT_CPU_ACCOUNTING=y
 CONFIG_VIRT_CPU_ACCOUNTING_GEN=y
 CONFIG_IRQ_TIME_ACCOUNTING=y
+CONFIG_HAVE_SCHED_AVG_IRQ=y
 CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_BSD_PROCESS_ACCT_V3=y
 CONFIG_TASKSTATS=y
 CONFIG_TASK_DELAY_ACCT=y
 CONFIG_TASK_XACCT=y
 CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_PSI=y
+CONFIG_PSI_DEFAULT_DISABLED=y
 CONFIG_CPU_ISOLATION=y
 
 #
@@ -168,7 +171,6 @@ CONFIG_MEMCG_SWAP=y
 CONFIG_MEMCG_SWAP_ENABLED=y
 CONFIG_MEMCG_KMEM=y
 CONFIG_BLK_CGROUP=y
-# CONFIG_DEBUG_BLK_CGROUP is not set
 CONFIG_CGROUP_WRITEBACK=y
 CONFIG_CGROUP_SCHED=y
 CONFIG_FAIR_GROUP_SCHED=y
@@ -241,8 +243,11 @@ CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y
 CONFIG_KALLSYMS_BASE_RELATIVE=y
+# CONFIG_BPF_LSM is not set
 CONFIG_BPF_SYSCALL=y
+CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y
 CONFIG_BPF_JIT_ALWAYS_ON=y
+CONFIG_BPF_JIT_DEFAULT_ON=y
 CONFIG_USERFAULTFD=y
 CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y
 CONFIG_RSEQ=y
@@ -262,6 +267,7 @@ CONFIG_SLUB=y
 CONFIG_SLAB_MERGE_DEFAULT=y
 CONFIG_SLAB_FREELIST_RANDOM=y
 # CONFIG_SLAB_FREELIST_HARDENED is not set
+CONFIG_SHUFFLE_PAGE_ALLOCATOR=y
 CONFIG_SLUB_CPU_PARTIAL=y
 CONFIG_SYSTEM_DATA_VERIFICATION=y
 CONFIG_PROFILING=y
@@ -293,6 +299,7 @@ CONFIG_HAVE_DMA_CONTIGUOUS=y
 CONFIG_GENERIC_SMP_IDLE_THREAD=y
 CONFIG_ARCH_HAS_FORTIFY_SOURCE=y
 CONFIG_ARCH_HAS_SET_MEMORY=y
+CONFIG_ARCH_HAS_SET_DIRECT_MAP=y
 CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y
 CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT=y
 CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y
@@ -354,6 +361,8 @@ CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y
 CONFIG_STRICT_MODULE_RWX=y
 CONFIG_ARCH_HAS_REFCOUNT=y
 # CONFIG_REFCOUNT_FULL is not set
+# CONFIG_LOCK_EVENT_COUNTS is not set
+CONFIG_ARCH_HAS_MEM_ENCRYPT=y
 
 #
 # GCOV-based kernel profiling
@@ -362,6 +371,7 @@ CONFIG_ARCH_HAS_REFCOUNT=y
 CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y
 CONFIG_RT_MUTEXES=y
 CONFIG_BASE_SMALL=0
+CONFIG_MODULE_SIG_FORMAT=y
 CONFIG_MODULES=y
 CONFIG_MODULE_FORCE_LOAD=y
 CONFIG_MODULE_UNLOAD=y
@@ -391,6 +401,7 @@ CONFIG_BLK_DEV_THROTTLING=y
 # CONFIG_BLK_CMDLINE_PARSER is not set
 CONFIG_BLK_WBT=y
 # CONFIG_BLK_CGROUP_IOLATENCY is not set
+# CONFIG_BLK_CGROUP_IOCOST is not set
 CONFIG_BLK_WBT_MQ=y
 CONFIG_BLK_DEBUG_FS=y
 # CONFIG_BLK_SED_OPAL is not set
@@ -431,6 +442,8 @@ CONFIG_MQ_IOSCHED_DEADLINE=y
 CONFIG_MQ_IOSCHED_KYBER=y
 CONFIG_IOSCHED_BFQ=y
 CONFIG_BFQ_GROUP_IOSCHED=y
+# CONFIG_BFQ_CGROUP_DEBUG is not set
+CONFIG_PADATA=y
 CONFIG_ASN1=y
 CONFIG_INLINE_SPIN_UNLOCK_IRQ=y
 CONFIG_INLINE_READ_UNLOCK=y
@@ -460,7 +473,7 @@ CONFIG_X86_X2APIC=y
 CONFIG_X86_MPPARSE=y
 # CONFIG_GOLDFISH is not set
 CONFIG_RETPOLINE=y
-CONFIG_INTEL_RDT=y
+CONFIG_X86_

[Devel] [PATCH RHEL8 COMMIT] netlink: add an option to set sk->err from userspace

2020-12-21 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.3
-->
commit 1855f8f5de8bcdeaa58557cdcbfc52e11f970936
Author: Andrey Zhadchenko 
Date:   Mon Dec 21 18:51:31 2020 +0300

netlink: add an option to set sk->err from userspace

Sometimes during dump criu can encounter sockets with overflown kernel
buffer, which results in ENOBUFS error during next read.
We need a reliable way to restore sk->sk_err.

https://jira.sw.ru/browse/PSBM-120976
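
A minimal userspace sketch of the intended use (the option values are the
ones added by the hunk below; NETLINK_SETERR is honoured only while the
socket is in repair mode and only for 0 < val <= MAX_ERRNO):

	#include <errno.h>
	#include <sys/socket.h>
	#include <linux/netlink.h>

	#define NETLINK_REPAIR	127	/* from this patch */
	#define NETLINK_SETERR	128	/* from this patch */

	/* re-inject a pending error, e.g. ENOBUFS, into a netlink socket */
	static int restore_sk_err(int nlsk, int err)
	{
		int one = 1;

		if (setsockopt(nlsk, SOL_NETLINK, NETLINK_REPAIR, &one, sizeof(one)))
			return -1;
		return setsockopt(nlsk, SOL_NETLINK, NETLINK_SETERR, &err, sizeof(err));
	}

Note that on vz kernels entering repair mode itself is restricted to the
host / pseudosuper ve, see the "netlink: protect NETLINK_REPAIR" patch.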

Signed-off-by: Andrey Zhadchenko 
---
 include/uapi/linux/netlink.h |  1 +
 net/netlink/af_netlink.c | 10 ++
 2 files changed, 11 insertions(+)

diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index 67ea114f19a4..4360186183d4 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -157,6 +157,7 @@ enum nlmsgerr_attrs {
 #define NETLINK_EXT_ACK		11
 #define NETLINK_GET_STRICT_CHK 12
 #define NETLINK_REPAIR 127
+#define NETLINK_SETERR 128
 
 struct nl_pktinfo {
__u32   group;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 9889446b9653..4f06946c9923 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1685,6 +1685,16 @@ static int netlink_setsockopt(struct socket *sock, int 
level, int optname,
nlk->flags &= ~NETLINK_F_REPAIR;
err = 0;
break;
+   case NETLINK_SETERR:
+   err = -ENOPROTOOPT;
+   if (nlk->flags & NETLINK_F_REPAIR) {
+   if (!val || val > MAX_ERRNO)
+   return -EINVAL;
+   sk->sk_err = val;
+   sk->sk_error_report(sk);
+   err = 0;
+   }
+   break;
case NETLINK_PKTINFO:
if (val)
nlk->flags |= NETLINK_F_RECV_PKTINFO;


[Devel] [PATCH RHEL8 COMMIT] netlink: protect NETLINK_REPAIR

2020-12-21 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.3
-->
commit 0ba63537137941826825b89fd7d6540b00c85357
Author: Andrey Zhadchenko 
Date:   Mon Dec 21 18:51:30 2020 +0300

netlink: protect NETLINK_REPAIR

Prevent using netlink repair mode from containers.

Signed-off-by: Andrey Zhadchenko 
---
 net/netlink/af_netlink.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 3cfef64bb28c..9889446b9653 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1672,6 +1672,13 @@ static int netlink_setsockopt(struct socket *sock, int 
level, int optname,
 
switch (optname) {
case NETLINK_REPAIR:
+#ifdef CONFIG_VE
+   {
+   struct ve_struct *ve = get_exec_env();
+   if (!ve_is_super(ve) && !ve->is_pseudosuper)
+   return -ENOPROTOOPT;
+   }
+#endif
if (val)
nlk->flags |= NETLINK_F_REPAIR;
else


[Devel] [PATCH RHEL8 COMMIT] proc, memcg: use memcg limits for showing oom_score inside CT

2020-12-21 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.3
-->
commit 58b2cd949131965c57ddd7f584144645cf97fafe
Author: Andrey Ryabinin 
Date:   Mon Dec 21 19:42:59 2020 +0300

proc,memcg: use memcg limits for showing oom_score inside CT

Use the memcg limits of the task to show /proc/<pid>/oom_score.

Note: in vz7 we had different behavior. It showed 'oom_score'
based on the 've->memcg' limits of the process reading oom_score.

Now we look at the memcg of the target process and don't care about the
current one. This seems to be more correct behaviour.
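
A worked example with made-up numbers: on a host with totalram_pages =
4194304 and total_swap_pages = 1048576, take a CT limited to memory.max =
131072 pages and memsw.max = 262144 pages. Then ram = min(4194304, 131072) =
131072 and totalpages = min(262144, 131072 + 1048576) = 262144, so a task
whose badness corresponds to 26214 pages gets oom_score ~= 26214 * 1000 /
262144 = 100, instead of ~= 5 when computed against the host-wide 5242880
pages.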

Signed-off-by: Andrey Ryabinin 
---
 fs/proc/base.c |  8 +++-
 include/linux/memcontrol.h | 11 +++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 7639d1e45842..7c0fd93ba7d1 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -526,8 +526,14 @@ static const struct file_operations proc_lstats_operations 
= {
 static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
  struct pid *pid, struct task_struct *task)
 {
-   unsigned long totalpages = totalram_pages + total_swap_pages;
+   unsigned long totalpages;
unsigned long points = 0;
+   struct mem_cgroup *memcg;
+
+   rcu_read_lock();
+   memcg = mem_cgroup_from_task(task);
+   totalpages = mem_cgroup_total_pages(memcg);
+   rcu_read_unlock();
 
points = oom_badness(task, NULL, NULL, totalpages, NULL) *
1000 / totalpages;
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8f85eb2dc9db..917e6ab9b1ab 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -596,6 +596,17 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec 
*lruvec,
return mz->lru_zone_size[zone_idx][lru];
 }
 
+static inline unsigned long mem_cgroup_total_pages(struct mem_cgroup *memcg)
+{
+   unsigned long ram, ram_swap;
+   extern long total_swap_pages;
+
+   ram = min_t(unsigned long, totalram_pages, memcg->memory.max);
+   ram_swap = min_t(unsigned long, memcg->memsw.max, ram + 
total_swap_pages);
+
+   return ram_swap;
+}
+
 void mem_cgroup_handle_over_high(void);
 
 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);


[Devel] [PATCH RHEL8 COMMIT] oom: make berserker more aggressive

2020-12-21 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.3
-->
commit 89c9c96179c7c59ca937010420c590e38fa1bf30
Author: Vladimir Davydov 
Date:   Mon Dec 21 19:49:33 2020 +0300

oom: make berserker more aggressive

In the berserker mode we kill a bunch of tasks that are as bad as the
selected victim. We assume two tasks to be equally bad if they consume
the same permille of memory. With such a strict check, it might turn out
that oom berserker won't kill any tasks in case a fork bomb is running
inside a container while the effect of killing a task eating <=1/1000th
of memory won't be enough to cope with memory shortage. Let's loosen
this check and use percentage instead of permille. In this case, it
might still happen that berserker won't kill anyone, but in this case
the regular oom should free at least 1/100th of memory, which should be
enough even for small containers.

Also, check berserker mode even if the victim has already exited by the
time we are about to send SIGKILL to it. Rationale: when the berserker
is in rage, it might kill hundreds of tasks so that the next oom kill is
likely to select an exiting task. Not triggering berserker in this case
will result in oom stalls.
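
An illustration of the loosened check: with oc->totalpages = 500000, a
victim with chosen_points = 9500 and another task with tsk_points = 8000
differ in permille (16 < 19, so the task used to be skipped) but are equal
in percent (1 == 1), so the berserker now kills that task as well.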

Signed-off-by: Vladimir Davydov 

[aryabinin: rh8 rebase]
Signed-off-by: Andrey Ryabinin 
---
 mm/oom_kill.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 353fb22da98c..366ec5be9107 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1024,11 +1024,11 @@ static void oom_berserker(struct oom_control *oc)
continue;
 
/*
-* Consider tasks as equally bad if they have equal
-* normalized scores.
+* Consider tasks as equally bad if they occupy equal
+* percentage of available memory.
 */
-   if (tsk_points * 1000 / oc->totalpages <
-   oc->chosen_points * 1000 / oc->totalpages)
+   if (tsk_points * 100 / oc->totalpages <
+   oc->chosen_points * 100 / oc->totalpages)
continue;
 
if (__ratelimit(&berserker_rs)) {
@@ -1069,6 +1069,7 @@ static void oom_kill_process(struct oom_control *oc, 
const char *message)
wake_oom_reaper(victim);
task_unlock(victim);
put_task_struct(victim);
+   oom_berserker(oc);
return;
}
task_unlock(victim);


[Devel] [PATCH RHEL8 COMMIT] oom: resurrect berserker mode

2020-12-21 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.3
-->
commit fd0c0eddf619ad335ed60170bdb7024e6df818d6
Author: Vladimir Davydov 
Date:   Mon Dec 21 19:49:32 2020 +0300

oom: resurrect berserker mode

The logic behind the OOM berserker is the same as in PCS6: if processes
are killed by oom killer too often (< sysctl vm.oom_relaxation, 1 sec by
default), we increase "rage" (min -10, max 20) and kill 1 << "rage"
youngest worst processes if "rage" >= 0.
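
Example of the arithmetic: rage starts at OOM_BASE_RAGE = -10 and every oom
that happens within vm.oom_relaxation of the previous one increments it, so
the 11th back-to-back oom reaches rage 0 and kills 1 << 0 = 1 extra task,
the next one 2, then 4, and so on up to 1 << 20 at OOM_MAX_RAGE; one quiet
interval resets rage back to -10.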

https://jira.sw.ru/browse/PSBM-17930

Signed-off-by: Vladimir Davydov 

[aryabinin: vz8 rebase]
Signed-off-by: Andrey Ryabinin 
---
 include/linux/memcontrol.h |  6 +++
 include/linux/oom.h|  5 +++
 mm/oom_kill.c  | 97 ++
 3 files changed, 108 insertions(+)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 917e6ab9b1ab..d4d49160ee40 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -258,6 +258,12 @@ struct mem_cgroup {
/* OOM-Killer disable */
int oom_kill_disable;
 
+   int oom_rage;
+   spinlock_t  oom_rage_lock;
+   unsigned long   prev_oom_time;
+   unsigned long   oom_time;
+
+
/* memory.events */
struct cgroup_file events_file;
 
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 0dc94a5bad9e..8ae3aaa00a0f 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -22,6 +22,10 @@ enum oom_constraint {
CONSTRAINT_MEMCG,
 };
 
+
+#define OOM_BASE_RAGE  -10
+#define OOM_MAX_RAGE   20
+
 /*
  * Details of the page allocation that triggered the oom killer that are used 
to
  * determine what should be killed.
@@ -51,6 +55,7 @@ struct oom_control {
unsigned long totalpages;
struct task_struct *chosen;
unsigned long chosen_points;
+   unsigned long overdraft;
 
/* Used to print the constraint info. */
enum oom_constraint constraint;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index fe34e85f62ec..353fb22da98c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -53,6 +53,7 @@
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks;
+int sysctl_oom_relaxation = HZ;
 
 DEFINE_MUTEX(oom_lock);
 
@@ -955,6 +956,101 @@ static int oom_kill_memcg_member(struct task_struct 
*task, void *message)
return 0;
 }
 
+/*
+ * Kill more processes if oom happens too often in this context.
+ */
+static void oom_berserker(struct oom_control *oc)
+{
+   static DEFINE_RATELIMIT_STATE(berserker_rs,
+   DEFAULT_RATELIMIT_INTERVAL,
+   DEFAULT_RATELIMIT_BURST);
+   struct task_struct *p;
+   struct mem_cgroup *memcg;
+   unsigned long now = jiffies;
+   int rage;
+   int killed = 0;
+
+   memcg = oc->memcg ?: root_mem_cgroup;
+
+   spin_lock(&memcg->oom_rage_lock);
+   memcg->prev_oom_time = memcg->oom_time;
+   memcg->oom_time = now;
+   /*
+* Increase rage if oom happened recently in this context, reset
+* rage otherwise.
+*
+* previous oom                    this oom (unfinished)
+* ---+-----------------------------------+---> time
+*    ^                                   ^
+*  prev_oom_time  <-- relaxation -->  oom_time
+*/
+   if (time_after(now, memcg->prev_oom_time + sysctl_oom_relaxation))
+   memcg->oom_rage = OOM_BASE_RAGE;
+   else if (memcg->oom_rage < OOM_MAX_RAGE)
+   memcg->oom_rage++;
+   rage = memcg->oom_rage;
+   spin_unlock(&memcg->oom_rage_lock);
+
+   if (rage < 0)
+   return;
+
+   /*
+* So, we are in rage. Kill (1 << rage) youngest tasks that are
+* as bad as the victim.
+*/
+   read_lock(&tasklist_lock);
+   list_for_each_entry_reverse(p, &init_task.tasks, tasks) {
+   unsigned long tsk_points;
+   unsigned long tsk_overdraft;
+
+   if (!p->mm || test_tsk_thread_flag(p, TIF_MEMDIE) ||
+   fatal_signal_pending(p) || p->flags & PF_EXITING ||
+   oom_unkillable_task(p, oc->memcg, oc->nodemask))
+   continue;
+
+   tsk_points = oom_badness(p, oc->memcg, oc->nodemask,
+   oc->totalpages, &tsk_overdraft);
+   if (tsk_overdraft < oc->overdraft)
+   continue;
+
+   /*
+* oom_badness never returns a negative value, even if
+* oom_score_adj would make badness so, instead it
+* returns 1. So we do not kill task with badness 1 if
+* the victim has badness > 1 so as not to risk killing
+* protected tasks.
+*/

[Devel] [PATCH RHEL8 COMMIT] ve/proc: Added separate start time field to task_struct to show in container

2020-12-22 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.3
-->
commit bd69a02332b3f8f9e286370399b1e5e1babd9dd1
Author: Valeriy Vdovin 
Date:   Thu Jan 30 22:15:34 2020 +0300

ve/proc: Added separate start time field to task_struct to show in container

Introduced 'real_start_time_ct' field in task_struct.

The value is READ:
1. When the process lives inside of a ve group and any process
inside of the same ve group wants to know its start time by reading
its /proc/[pid]/stat file.
2. At container suspend operation to store this value to a dump image.

The value is WRITTEN:
1. At creation time (copy_process function)
1.1. If a process is being created outside of a ve group / on the host,
this value is initialized to 0.
1.2. If a process is being created by a process already living in a ve
group, this value is calculated as host_uptime - ve_uptime.

2. During attach to a ve (ve_attach function). A process can be created on
the host and later attached to a ve. Its container start_time value has
already been initialized to 0 at creation time. After the process enters
the domain of a ve, the value should be initialized.
Note that the process can be attached to a non-running container, in which
case its start_time value is not calculated and is left initialized to 0.

3. At container restore via prctl (prctl_set_task_ct_fields function).
In this case the value is only settable from outside of a container.
During restore the processes are created from the dump image.
At the restore step each process executes prctl to set its start_time
value read from the dump. This is only permitted while the ve is in
pseudosuper mode. The value is set as is (as read from the dump), without
any calculations.

https://jira.sw.ru/browse/PSBM-64123

Signed-off-by: Valeriy Vdovin 

(cherry picked from vz7 commit eca790eaed527bae7029b4ae1cd557ce847ac6c0)
Signed-off-by: Konstantin Khorenko 
Reviewed-by: Valeriy Vdovin 
---
 fs/proc/array.c| 12 +++-
 include/linux/sched.h  |  7 ++-
 include/linux/ve.h | 16 
 include/uapi/linux/prctl.h |  6 ++
 kernel/fork.c  | 12 
 kernel/sys.c   | 23 +++
 kernel/ve/ve.c |  2 ++
 7 files changed, 68 insertions(+), 10 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index ba712f18e5ff..735876a51a18 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -555,16 +555,10 @@ static int do_task_stat(struct seq_file *m, struct 
pid_namespace *ns,
start_time = task->real_start_time;
 
 #ifdef CONFIG_VE
-   if (!is_super) {
-   u64 offset = get_exec_env()->real_start_time;
-   start_time -= (unsigned long long)offset;
-   }
-   /* tasks inside a CT can have negative start time e.g. if the CT was
-* migrated from another hw node, in which case we will report 0 in
-* order not to confuse userspace */
-   if ((s64)start_time < 0)
-   start_time = 0;
+   if (!is_super)
+   start_time = task->real_start_time_ct;
 #endif
+
/* convert nsec -> ticks */
start_time = nsec_to_clock_t(start_time);
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 19ca9cc0f3b9..9846553f7039 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -839,7 +839,6 @@ struct task_struct {
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
struct vtime		vtime;
 #endif
-
 #ifdef CONFIG_NO_HZ_FULL
atomic_t		tick_dep_mask;
 #endif
@@ -853,6 +852,12 @@ struct task_struct {
/* Boot based time in nsecs: */
u64 real_start_time;
 
+   /*
+* This is a Container-side copy of 'real_start_time' field
+* shown from inside of a Container and modified by host.
+*/
+   u64 real_start_time_ct;
+
/* MM fault and swap info: this can arguably be seen as either 
mm-specific or thread-specific: */
unsigned long   min_flt;
unsigned long   maj_flt;
diff --git a/include/linux/ve.h b/include/linux/ve.h
index 3aa0ea0b1bab..ab8da4dceec1 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -148,6 +148,22 @@ static u64 ve_get_uptime(struct ve_struct *ve)
return ktime_get_boot_ns() - ve->real_start_time;
 }
 
+static inline void ve_set_task_start_time(struct ve_struct *ve,
+ struct task_struct *t)
+{
+   /*
+* Mitigate memory access reordering risks by doing double check,
+* 'is_running' could be read as 1 before we see
+* 'real_start_time' updated here. If it's still 0,
+* we know 'is_running' is being modified right NOW in
+* parallel so it's safe to say that start time is also 0.
+*/
+   if (!ve->is_running || !ve->real_start_time)
+   t->real_start_time_ct = 0;
+   else
+   t->real_start_time_ct = ve_get_uptime(ve);
+}

[Devel] [PATCH RHEL8 COMMIT] ve/time: Move ve_get_uptime() to header

2020-12-22 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.3
-->
commit d99d1df4d0a762c8eb674cd764b08a7e0735c632
Author: Konstantin Khorenko 
Date:   Tue Nov 10 19:03:14 2020 +0300

ve/time: Move ve_get_uptime() to header

Will be used in ve.h in another function.

To_merge: 9644a237d401 ("ve/vestat: Introduce /proc/vz/vestat")
    
Signed-off-by: Konstantin Khorenko 
Reviewed-by: Valeriy Vdovin 
---
 include/linux/ve.h  | 5 +
 kernel/ve/vecalls.c | 5 -
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/ve.h b/include/linux/ve.h
index 7cb416f342e7..3aa0ea0b1bab 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -143,6 +143,11 @@ static inline struct ve_struct *css_to_ve(struct 
cgroup_subsys_state *css)
 
 extern struct cgroup_subsys_state *ve_get_init_css(struct ve_struct *ve, int 
subsys_id);
 
+static u64 ve_get_uptime(struct ve_struct *ve)
+{
+   return ktime_get_boot_ns() - ve->real_start_time;
+}
+
 extern void monotonic_abs_to_ve(clockid_t which_clock, struct timespec64 *tp);
 extern void monotonic_ve_to_abs(clockid_t which_clock, struct timespec64 *tp);
 
diff --git a/kernel/ve/vecalls.c b/kernel/ve/vecalls.c
index 786a743faa1a..f1cc04ee82da 100644
--- a/kernel/ve/vecalls.c
+++ b/kernel/ve/vecalls.c
@@ -32,11 +32,6 @@
 #include 
 #include 
 
-static u64 ve_get_uptime(struct ve_struct *ve)
-{
-   return ktime_get_boot_ns() - ve->real_start_time;
-}
-
 static int fill_cpu_stat(envid_t veid, struct vz_cpu_stat __user *buf)
 {
struct ve_struct *ve;


Re: [Devel] [PATCH RH8] loop: fix no-unmap write-zeroes request behavior

2020-12-22 Thread Konstantin Khorenko

Dropping this as already applied by RedHat:

* Thu Apr 16 2020 Frantisek Hrbata  [4.18.0-193.9.el8]
- [block] loop: fix no-unmap write-zeroes request behavior (Ming Lei) [1798919]

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

On 11/17/2020 01:21 PM, Kirill Tkhai wrote:

From: Darrick J. Wong 

ms commit efcfec579f61

Currently, if the loop device receives a WRITE_ZEROES request, it asks
the underlying filesystem to punch out the range.  This behavior is
correct if unmapping is allowed.  However, a NOUNMAP request means that
the caller doesn't want us to free the storage backing the range, so
punching out the range is incorrect behavior.

To satisfy a NOUNMAP | WRITE_ZEROES request, loop should ask the
underlying filesystem to FALLOC_FL_ZERO_RANGE, which is (according to
the fallocate documentation) required to ensure that the entire range is
backed by real storage, which suffices for our purposes.

Fixes: 19372e2769179dd ("loop: implement REQ_OP_WRITE_ZEROES")
Signed-off-by: Darrick J. Wong 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Jens Axboe 
Signed-off-by: Kirill Tkhai 
---
 drivers/block/loop.c |   26 ++
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 5b124d57b9c0..6308dabc253f 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -417,18 +417,20 @@ static int lo_read_transfer(struct loop_device *lo, 
struct request *rq,
return ret;
 }

-static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos)
+static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos,
+   int mode)
 {
/*
-* We use punch hole to reclaim the free space used by the
-* image a.k.a. discard. However we do not support discard if
-* encryption is enabled, because it may give an attacker
-* useful information.
+* We use fallocate to manipulate the space mappings used by the image
+* a.k.a. discard/zerorange. However we do not support this if
+* encryption is enabled, because it may give an attacker useful
+* information.
 */
struct file *file = lo->lo_backing_file;
-   int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
int ret;

+   mode |= FALLOC_FL_KEEP_SIZE;
+
if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
ret = -EOPNOTSUPP;
goto out;
@@ -596,9 +598,17 @@ static int do_req_filebacked(struct loop_device *lo, 
struct request *rq)
switch (req_op(rq)) {
case REQ_OP_FLUSH:
return lo_req_flush(lo, rq);
-   case REQ_OP_DISCARD:
case REQ_OP_WRITE_ZEROES:
-   return lo_discard(lo, rq, pos);
+   /*
+* If the caller doesn't want deallocation, call zeroout to
+* write zeroes the range.  Otherwise, punch them out.
+*/
+   return lo_fallocate(lo, rq, pos,
+   (rq->cmd_flags & REQ_NOUNMAP) ?
+   FALLOC_FL_ZERO_RANGE :
+   FALLOC_FL_PUNCH_HOLE);
+   case REQ_OP_DISCARD:
+   return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE);
case REQ_OP_WRITE:
if (lo->transfer)
return lo_write_transfer(lo, rq, pos);






[Devel] [PATCH RHEL8 COMMIT] dm-ploop: Skip zero writes to unallocated clusters

2020-12-22 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.3
-->
commit 8d9b6a4975972a522c1790ad5cc93c221e68604a
Author: Kirill Tkhai 
Date:   Tue Dec 22 14:21:13 2020 +0300

dm-ploop: Skip zero writes to unallocated clusters

Sometimes this may save some space...

https://jira.sw.ru/browse/PSBM-123748

Signed-off-by: Kirill Tkhai 
---
 drivers/md/dm-ploop-map.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index 4b12b5fc082a..f193b25cbd28 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -397,6 +397,27 @@ static void maybe_unlink_completed_bio(struct ploop 
*ploop, struct bio *bio)
queue_work(ploop->wq, &ploop->worker);
 }
 
+static bool bio_endio_if_all_zeros(struct bio *bio)
+{
+   struct bvec_iter bi = {
+   .bi_size = bio->bi_iter.bi_size,
+   };
+   struct bio_vec bv;
+   void *data, *ret;
+
+   for_each_bvec(bv, bio->bi_io_vec, bi, bi) {
+   data = kmap(bv.bv_page);
+   ret = memchr_inv(data + bv.bv_offset, 0, bv.bv_len);
+   kunmap(bv.bv_page);
+   if (ret)
+   return false;
+   }
+
+   bio->bi_status = BLK_STS_OK;
+   bio_endio(bio);
+   return true;
+}
+
 static void handle_discard_bio(struct ploop *ploop, struct bio *bio,
 unsigned int cluster, unsigned int dst_cluster)
 {
@@ -1326,6 +1347,9 @@ static int process_one_deferred_bio(struct ploop *ploop, 
struct bio *bio,
goto out;
}
 
+   if (unlikely(bio_endio_if_all_zeros(bio)))
+   goto out;
+
/* Cluster exists nowhere. Allocate it and setup bio as outrunning */
ret = locate_new_cluster_and_attach_bio(ploop, piwb, cluster,
&dst_cluster, bio);


[Devel] [PATCH RHEL8 COMMIT] ms/tracing: remove WARN_ON in start_thread()

2020-12-22 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.3
-->
commit df45cae70ccd9baddd35f22abcd43dc01f78e3c2
Author: Vasily Averin 
Date:   Fri Nov 20 09:27:18 2020 +0300

ms/tracing: remove WARN_ON in start_thread()

This patch reverts upstream commit 978defee11a5 ("tracing: Do a WARN_ON()
 if start_thread() in hwlat is called when thread exists")

The .start hook can legally be called several times if the corresponding
tracer is stopped.

screen window 1
[root@localhost ~]# echo 1 > /sys/kernel/tracing/events/kmem/kfree/enable
[root@localhost ~]# echo 1 > /sys/kernel/tracing/options/pause-on-trace
[root@localhost ~]# less -F /sys/kernel/tracing/trace

screen window 2
[root@localhost ~]# cat /sys/kernel/debug/tracing/tracing_on
0
[root@localhost ~]# echo hwlat >  /sys/kernel/debug/tracing/current_tracer
[root@localhost ~]# echo 1 > /sys/kernel/debug/tracing/tracing_on
[root@localhost ~]# cat /sys/kernel/debug/tracing/tracing_on
0
[root@localhost ~]# echo 2 > /sys/kernel/debug/tracing/tracing_on

triggers warning in dmesg:
WARNING: CPU: 3 PID: 1403 at kernel/trace/trace_hwlat.c:371 
hwlat_tracer_start+0xc9/0xd0

mFixes: 978defee11a5 ("tracing: Do a WARN_ON() if start_thread() in hwlat is
called when thread exists")

ms commit: 310e3a4b5a4f ("tracing: Remove WARN_ON in start_thread()")

https://jira.sw.ru/browse/PSBM-120940
Signed-off-by: Vasily Averin 
---
 kernel/trace/trace_hwlat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index bccff460cd35..af8c26d5642a 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -354,7 +354,7 @@ static int start_kthread(struct trace_array *tr)
struct task_struct *kthread;
int next_cpu;
 
-   if (WARN_ON(hwlat_kthread))
+   if (hwlat_kthread)
return 0;
 
/* Just pick the first CPU on first iteration */


[Devel] [PATCH RHEL8 COMMIT] venetdev: check ve_ns is not null before dereferencing

2020-12-22 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.3
-->
commit 64b9ec9f3dd51c53b048508ea1936dec57b146e6
Author: Pavel Tikhomirov 
Date:   Tue Dec 22 16:48:03 2020 +0300

venetdev: check ve_ns is not null before dereferencing

When testing criu on vz8 I got a crash in venet_newlink on dereferencing
a NULL ve_ns:

  BUG: unable to handle kernel NULL pointer dereference at 0028
  PGD 800136ceb067 P4D 800136ceb067 PUD 137624067 PMD 0
  Oops:  [#1] SMP PTI
  CPU: 0 PID: 6853 Comm: criu ve: ad7d77df-8614-42b4-8bf4-c9313fcb2a17
  Kdump: loaded Not tainted 4.18.0-193.6.3.vz8.4.18 #1 4.18
  Hardware name: Virtuozzo KVM, BIOS 1.11.0-2.vz7.2 04/01/2014
  RIP: 0010:venet_newlink+0x18/0xc0 [vznetdev]

Small reproducer:

  # term 1
  echo $$
  # 407283

  # term 2
  mkdir /sys/fs/cgroup/ve/my_new_ve
  echo 666 > /sys/fs/cgroup/ve/my_new_ve/ve.veid
  echo 407283 > /sys/fs/cgroup/ve/my_new_ve/tasks

  # term 1
  unshare -n
  ip link add venet0 type venet

If we create a venet device in a network namespace owned by a ve which has
not started yet, we crash.

Note on vz7 we are safe as there is no ve_ns.

https://jira.sw.ru/browse/PSBM-123077

Signed-off-by: Pavel Tikhomirov 
---
 drivers/net/venetdev.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/venetdev.c b/drivers/net/venetdev.c
index b5b3f7e16c58..cdf56b9e7ec1 100644
--- a/drivers/net/venetdev.c
+++ b/drivers/net/venetdev.c
@@ -7,6 +7,7 @@
 
 #include 
 #include 
+#include <linux/nsproxy.h>
 #include 
 
 #include 
@@ -733,6 +734,7 @@ static int venet_newlink(struct net *src_net,
 struct netlink_ext_ack *extack)
 {
struct ve_struct *ve = src_net->owner_ve;
+   struct nsproxy *ve_ns;
struct net *net;
int err = 0;
 
@@ -741,7 +743,10 @@ static int venet_newlink(struct net *src_net,
 * also referenced on assignment => ve won't die =>
 * rcu_read_lock()/unlock not needed here.
 */
-   net = rcu_dereference_check(ve->ve_ns, 1)->net_ns;
+   ve_ns = rcu_dereference_check(ve->ve_ns, 1);
+   if (!ve_ns)
+   return -EBUSY;
+   net = ve_ns->net_ns;
if (!net)
return -EBUSY;
 


[Devel] [PATCH RHEL8 COMMIT] ve: allow writing to features and iptables_mask in pseudosuper state

2020-12-22 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.3
-->
commit 347eb23fe824e903516f1c80fa6bcbe01b6d28e0
Author: Pavel Tikhomirov 
Date:   Tue Dec 22 16:48:03 2020 +0300

ve: allow writing to features and iptables_mask in pseudosuper state

This is needed by criu to be able to restore these options from the
vz-rst-action script run at the setup-namespaces stage.
This is effectively a port from the vz7 kernel.

While at it, let's also fix the misprint in the ve_features_write name.

https://jira.sw.ru/browse/PSBM-120728

Signed-off-by: Pavel Tikhomirov 
---
 kernel/ve/ve.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 3f53641455ad..65a1ea27b738 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -930,11 +930,12 @@ static u64 ve_reatures_read(struct cgroup_subsys_state 
*css, struct cftype *cft)
return css_to_ve(css)->features;
 }
 
-static int ve_reatures_write(struct cgroup_subsys_state *css, struct cftype 
*cft, u64 val)
+static int ve_features_write(struct cgroup_subsys_state *css, struct cftype 
*cft, u64 val)
 {
struct ve_struct *ve = css_to_ve(css);
 
-   if (!ve_is_super(get_exec_env()))
+   if (!ve_is_super(get_exec_env()) &&
+   !ve->is_pseudosuper)
return -EPERM;
 
down_write(&ve->op_sem);
@@ -957,7 +958,8 @@ static int ve_iptables_mask_write(struct 
cgroup_subsys_state *css, struct cftype
 {
struct ve_struct *ve = css_to_ve(css);
 
-   if (!ve_is_super(get_exec_env()))
+   if (!ve_is_super(get_exec_env()) &&
+   !ve->is_pseudosuper)
return -EPERM;
 
down_write(&ve->op_sem);
@@ -1285,7 +1287,7 @@ static struct cftype ve_cftypes[] = {
.name   = "features",
.flags  = CFTYPE_NOT_ON_ROOT,
.read_u64   = ve_reatures_read,
-   .write_u64  = ve_reatures_write,
+   .write_u64  = ve_features_write,
},
{
.name   = "os_release",


[Devel] [PATCH RHEL8 COMMIT] ms/netfilter: ipset: Fix "INFO: rcu detected stall in hash_xxx" reports

2020-12-22 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.3
-->
commit f266f5b08aa571a7c8e77940580cacbd09d8b08b
Author: Jozsef Kadlecsik 
Date:   Tue Dec 22 17:02:07 2020 +0300

ms/netfilter: ipset: Fix "INFO: rcu detected stall in hash_xxx" reports

In the case of huge hash:* types of sets, due to the single spinlock of
a set the processing of the whole set under spinlock protection could take
too long.

There were four places where the whole hash table of the set was processed
from bucket to bucket under holding the spinlock:

- During resizing a set, the original set was locked to exclude kernel side
  add/del element operations (userspace add/del is excluded by the
  nfnetlink mutex). The original set is actually just read during the
  resize, so the spinlocking is replaced with rcu locking of regions.
  However, thus there can be parallel kernel side add/del of entries.
  In order not to lose those operations a backlog is added and replayed
  after the successful resize.
- Garbage collection of timed out entries was also protected by the 
spinlock.
  In order not to lock too long, region locking is introduced and a single
  region is processed in one gc go. Also, the simple timer based gc running
  is replaced with a workqueue based solution. The internal book-keeping
  (number of elements, size of extensions) is moved to region level due to
  the region locking.
- Adding elements: when the max number of the elements is reached, the gc
  was called to evict the timed out entries. The new approach is that the gc
  is called just for the matching region, assuming that if the region
  (proportionally) seems to be full, then the whole set does. We could scan
  the other regions to check every entry under rcu locking, but for huge
  sets it'd mean a slowdown at adding elements.
- Listing the set header data: when the set was defined with timeout
  support, the garbage collector was called to clean up timed out entries
  to get the correct element numbers and set size values. Now the set is
  scanned to check non-timed out entries, without actually calling the gc
  for the whole set.

Thanks to Florian Westphal for helping me to solve the SOFTIRQ-safe ->
SOFTIRQ-unsafe lock order issues during working on the patch.

Reported-by: syzbot+4b0e9d4ff3cf11783...@syzkaller.appspotmail.com
Reported-by: syzbot+c27b8d5010f45c666...@syzkaller.appspotmail.com
Reported-by: syzbot+68a806795ac89df3a...@syzkaller.appspotmail.com
mFixes: 23c42a403a9c ("netfilter: ipset: Introduction of new commands and 
protocol version 7")
Signed-off-by: Jozsef Kadlecsik 

https://jira.sw.ru/browse/PSBM-123524
(cherry picked from commit f66ee0410b1c3481ee75e5db9b34547b4d582465)
Signed-off-by: Andrey Ryabinin 
---
 include/linux/netfilter/ipset/ip_set.h |  11 +-
 net/netfilter/ipset/ip_set_core.c  |  34 +-
 net/netfilter/ipset/ip_set_hash_gen.h  | 633 +++--
 3 files changed, 472 insertions(+), 206 deletions(-)

diff --git a/include/linux/netfilter/ipset/ip_set.h 
b/include/linux/netfilter/ipset/ip_set.h
index e499d170f12d..3c49b540c701 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -124,6 +124,7 @@ struct ip_set_ext {
u32 timeout;
u8 packets_op;
u8 bytes_op;
+   bool target;
 };
 
 struct ip_set;
@@ -190,6 +191,14 @@ struct ip_set_type_variant {
/* Return true if "b" set is the same as "a"
 * according to the create set parameters */
bool (*same_set)(const struct ip_set *a, const struct ip_set *b);
+   /* Region-locking is used */
+   bool region_lock;
+};
+
+struct ip_set_region {
+   spinlock_t lock;	/* Region lock */
+   size_t ext_size;	/* Size of the dynamic extensions */
+   u32 elements;   /* Number of elements vs timeout */
 };
 
 /* The core set type structure */
@@ -461,7 +470,7 @@ bitmap_bytes(u32 a, u32 b)
 #include 
 
 #define IP_SET_INIT_KEXT(skb, opt, set)\
-   { .bytes = (skb)->len, .packets = 1,\
+   { .bytes = (skb)->len, .packets = 1, .target = true,\
  .timeout = ip_set_adt_opt_timeout(opt, set) }
 
 #define IP_SET_INIT_UEXT(set)  \
diff --git a/net/netfilter/ipset/ip_set_core.c 
b/net/netfilter/ipset/ip_set_core.c
index 56b59904feca..615b5791edf2 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -558,6 +558,20 @@ ip_set_rcu_get(struct net *net, ip_set_id_t index)
return set;
 }
 
+static inline void
+ip_set_lock(struct ip_set *set)
+{
+   if (!set->variant->region_lock)
+   spin_lock_bh(&set->lock);
+}
+
+static inline void
+ip_set_unlock(struct ip_set *set)
+{
+   if (!set->variant->region_lock)
+   spin_unlock_bh(&set->lock);
+}

[Devel] [PATCH RHEL8 COMMIT] ms/ptrace: fix task_join_group_stop() for the case when current is traced

2020-12-22 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.3
-->
commit 375b9696ec2b0eb6af3a22647ade8e9881d03133
Author: Oleg Nesterov 
Date:   Tue Dec 22 17:02:07 2020 +0300

ms/ptrace: fix task_join_group_stop() for the case when current is traced

This testcase

#include <stdio.h>
#include <unistd.h>
#include <signal.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <pthread.h>
#include <assert.h>

void *tf(void *arg)
{
return NULL;
}

int main(void)
{
int pid = fork();
if (!pid) {
kill(getpid(), SIGSTOP);

pthread_t th;
pthread_create(&th, NULL, tf, NULL);

return 0;
}

waitpid(pid, NULL, WSTOPPED);

ptrace(PTRACE_SEIZE, pid, 0, PTRACE_O_TRACECLONE);
waitpid(pid, NULL, 0);

ptrace(PTRACE_CONT, pid, 0,0);
waitpid(pid, NULL, 0);

int status;
int thread = waitpid(-1, &status, 0);
assert(thread > 0 && thread != pid);
assert(status == 0x80137f);

return 0;
}

fails and triggers WARN_ON_ONCE(!signr) in do_jobctl_trap().

This is because task_join_group_stop() has 2 problems when current is 
traced:

1. We can't rely on the "JOBCTL_STOP_PENDING" check, a stopped 
tracee
   can be woken up by debugger and it can clone another thread which
   should join the group-stop.

   We need to check group_stop_count || SIGNAL_STOP_STOPPED.

2. If SIGNAL_STOP_STOPPED is already set, we should not increment
   sig->group_stop_count and add JOBCTL_STOP_CONSUME. The new thread
   should stop without another do_notify_parent_cldstop() report.

To clarify, the problem is very old and we should blame
ptrace_init_task().  But now that we have task_join_group_stop() it makes
more sense to fix this helper to avoid the code duplication.

Reported-by: syzbot+3485e3773f7da290e...@syzkaller.appspotmail.com
Signed-off-by: Oleg Nesterov 
Signed-off-by: Andrew Morton 
Cc: Jens Axboe 
Cc: Christian Brauner 
Cc: "Eric W . Biederman" 
Cc: Zhiqiang Liu 
Cc: Tejun Heo 
Cc: 
Link: https://lkml.kernel.org/r/20201019134237.ga18...@redhat.com
Signed-off-by: Linus Torvalds 

https://jira.sw.ru/browse/PSBM-123525
(cherry picked from commit 7b3c36fc4c231ca532120bbc0df67a12f09c1d96)
Signed-off-by: Andrey Ryabinin 
---
 kernel/signal.c | 19 ++-
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 177cd7f04acb..171f7496f811 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -388,16 +388,17 @@ static bool task_participate_group_stop(struct 
task_struct *task)
 
 void task_join_group_stop(struct task_struct *task)
 {
+   unsigned long mask = current->jobctl & JOBCTL_STOP_SIGMASK;
+   struct signal_struct *sig = current->signal;
+
+   if (sig->group_stop_count) {
+   sig->group_stop_count++;
+   mask |= JOBCTL_STOP_CONSUME;
+   } else if (!(sig->flags & SIGNAL_STOP_STOPPED))
+   return;
+
/* Have the new thread join an on-going signal group stop */
-   unsigned long jobctl = current->jobctl;
-   if (jobctl & JOBCTL_STOP_PENDING) {
-   struct signal_struct *sig = current->signal;
-   unsigned long signr = jobctl & JOBCTL_STOP_SIGMASK;
-   unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
-   if (task_set_jobctl_pending(task, signr | gstop)) {
-   sig->group_stop_count++;
-   }
-   }
+   task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING);
 }
 
 /*


[Devel] [PATCH RHEL8 COMMIT] ve/fs/aio: aio_nr & aio_max_nr variables virtualization

2020-12-22 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.3
-->
commit f33a6bd814d31dd056682c49cf8baa3490eae5c0
Author: Stanislav Kinsburskiy 
Date:   Tue Dec 22 17:49:02 2020 +0300

ve/fs/aio: aio_nr & aio_max_nr variables virtualization

Virtualization of kernel global aio_nr & aio_max_nr variables is required
to isolate containers and ve0 when allocating aio request/events resources.

Each ve, including ve0, has its own aio_nr and aio_max_nr values. The
ioctx_alloc() function tries to charge the appropriate aio_nr counter,
selected by ve context.

It is not possible to exhaust the aio events resources of one ve from
another ve.

The default per-CT aio_max_nr value is 0x10000, including CT0.
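
The charge itself is not visible in the truncated hunk below; modelled on
the mainstream aio_nr accounting in ioctx_alloc(), it is expected to look
roughly like this (a sketch only, the error label is hypothetical):

	/* charge this context against its ve instead of the global aio_nr;
	 * fail with EAGAIN when the per-CT limit is hit */
	spin_lock(&ve->aio_nr_lock);
	if (ve->aio_nr + ctx->nr_events > ve->aio_max_nr ||
	    ve->aio_nr + ctx->nr_events < ve->aio_nr) {
		spin_unlock(&ve->aio_nr_lock);
		err = -EAGAIN;
		goto err_cleanup;	/* hypothetical label */
	}
	ve->aio_nr += ctx->nr_events;
	spin_unlock(&ve->aio_nr_lock);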

https://jira.sw.ru/browse/PSBM-29017

Signed-off-by: Andrey Ryabinin 

Reviewed-by: Vladimir Davydov 

==

fs-aio-show-real-number-of-aio

fs/aio: show real number of aio events in fs.aio-nr sysctl

fs.aio-nr accounts the number of aio events requested by the user via the
io_setup() syscall. The kernel usually creates more events than was requested.
CRIU doesn't care about the number of requested events, it cares only
about created events. So while restoring the process, CRIU requests
in io_setup() the number of actually created events. This leads
to an inconsistent value of fs.aio-nr after the restore.

Let's show in fs.aio-nr a number of created events, not requested.

https://jira.sw.ru/browse/PSBM-47209

Signed-off-by: Andrey Ryabinin 

Acked-by: Kirill Tkhai 

+++
fs/aio-nr: fix decrement of aio-nr

Commit 280363c ("fs/aio: show real number of aio events in fs.aio-nr 
sysctl")
changed only the incrementing of the fs.aio-nr counter. It failed to update
the decrement path, which leads to constant growth of the fs.aio-nr value.

mFixes commit 280363c ("fs/aio: show real number of aio events in fs.aio-nr
sysctl").

https://jira.sw.ru/browse/PSBM-47209

Signed-off-by: Andrey Ryabinin 

Acked-by: Kirill Tkhai 

+++
Ported to VZ8:
ve->aio_nr is now incremented by ctx->nr_events (the really allocated
io events), as in the ms kernel
https://jira.sw.ru/browse/PSBM-123159

Signed-off-by: Alexander Mikhalitsyn 
---
 fs/aio.c| 45 -
 include/linux/aio.h |  6 ++
 include/linux/ve.h  |  6 ++
 kernel/sysctl.c | 16 
 kernel/ve/ve.c  |  7 +++
 5 files changed, 47 insertions(+), 33 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index e81c8583e055..492f1a8b7661 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include <linux/ve.h>
 #include 
 #include 
 #include 
@@ -155,6 +156,7 @@ struct kioctx {
 
struct page *internal_pages[AIO_RING_PAGES];
struct file *aio_ring_file;
+   struct ve_struct	*ve;
 
unsigned		id;
 };
@@ -187,12 +189,6 @@ struct aio_kiocb {
struct eventfd_ctx  *ki_eventfd;
 };
 
-/*-- sysctl variables*/
-static DEFINE_SPINLOCK(aio_nr_lock);
-unsigned long aio_nr;  /* current system wide number of aio requests */
-unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio 
requests */
-/*end sysctl variables---*/
-
 static struct kmem_cache   *kiocb_cachep;
 static struct kmem_cache   *kioctx_cachep;
 
@@ -555,12 +551,14 @@ static void free_ioctx(struct work_struct *work)
 {
struct kioctx *ctx = container_of(to_rcu_work(work), struct kioctx,
  free_rwork);
+   struct ve_struct *ve = ctx->ve;
pr_debug("freeing %p\n", ctx);
 
aio_free_ring(ctx);
free_percpu(ctx->cpu);
percpu_ref_exit(&ctx->reqs);
percpu_ref_exit(&ctx->users);
+   put_ve(ve);
kmem_cache_free(kioctx_cachep, ctx);
 }
 
@@ -657,14 +655,16 @@ static int ioctx_add_table(struct kioctx *ctx, struct 
mm_struct *mm)
}
 }
 
-static void aio_nr_sub(unsigned nr)
+static void aio_nr_sub(struct kioctx *ctx, unsigned nr)
 {
-   spin_lock(&aio_nr_lock);
-   if (WARN_ON(aio_nr - nr > aio_nr))
-   aio_nr = 0;
+   struct ve_struct *ve = ctx->ve;
+
+   spin_lock(&ve->aio_nr_lock);
+   if (WARN_ON(ve->aio_nr - nr > ve->aio_nr))
+   ve->aio_nr = 0;
else
-   aio_nr -= nr;
-   spin_unlock(&aio_nr_lock);
+   ve->aio_nr -= nr;
+   spin_unlock(&ve->aio_nr_lock);
 }
 
 /* ioctx_alloc
@@ -674,6 +674,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 {
struct mm_struct *mm = current->mm;
struct kioctx *ctx;
+   struct ve_struct *ve = get_exec_env();
int err = -ENOMEM;
 
  

[Devel] [PATCH RHEL8 COMMIT] ve/aio: Add a handle to checkpoint/restore AIO context

2020-12-22 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.3
-->
commit f5d12793d7fcea5477c0e5736adb1d7129a0e762
Author: Stanislav Kinsburskiy 
Date:   Tue Dec 22 17:56:03 2020 +0300

ve/aio: Add a handle to checkpoint/restore AIO context

This adds an ioctl which allows setting the ring buffer tail
and waiting till aio requests are finished.

v2: Add pseudosuper check

https://jira.sw.ru/browse/PSBM-42488

Signed-off-by: Kirill Tkhai 

Reviewed-by: Cyrill Gorcunov 

khorenko@: we don't support migration of incomplete aio requests
   https://jira.sw.ru/browse/PSBM-41425

so using the added instruments we wait till all AIO requests are completed
and migrate the results (AIO req contexts with status).
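
Userspace view, as a sketch: before dumping, criu waits until all in-flight
requests of the target task have completed. Only VE_AIO_IOC_WAIT_ACTIVE
comes from this patch; the proc file name below is an assumption, since the
fs/proc/base.c hunk is truncated here:

	#include <fcntl.h>
	#include <sys/ioctl.h>

	/* "/proc/<pid>/aio" is an assumed name, not taken from the hunks */
	int fd = open("/proc/1234/aio", O_RDONLY);
	if (fd >= 0 && ioctl(fd, VE_AIO_IOC_WAIT_ACTIVE, 0) == 0)
		;	/* all AIO requests of pid 1234 have completed */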

==

ve/aio: Enumerate ioctl numbers right

Do not use common used numbers, use custom.
Also, make error codes different.

https://jira.sw.ru/browse/PSBM-42488

Signed-off-by: Kirill Tkhai 

Acked-by: Cyrill Gorcunov 

==

ve/aio: Kill ve_aio_set_tail()

Since the tail is restored by submitting write requests to /dev/null,
we do not need this interface anymore.

https://jira.sw.ru/browse/PSBM-42488

Signed-off-by: Kirill Tkhai 

Acked-by: Cyrill Gorcunov 

==

ve/aio: Wait for all inflight AIO reqs of a task

Make it wait for all of the task's AIO contexts instead of a single AIO
request. This minimizes the number of syscalls we do to dump AIOs.

https://jira.sw.ru/browse/PSBM-42488

Signed-off-by: Kirill Tkhai 

Acked-by: Cyrill Gorcunov 
==

Ported with respect to ms commits:
34e83fc ("aio: reqs_active -> reqs_available")
723be6e ("aio: percpu ioctx refcount")
db446a0 ("aio: convert the ioctx list to table lookup v3")

https://jira.sw.ru/browse/PSBM-123159

Signed-off-by: Alexander Mikhalitsyn 
---
 fs/aio.c| 82 +
 fs/proc/base.c  | 27 ++
 include/linux/aio.h | 13 +
 3 files changed, 122 insertions(+)

diff --git a/fs/aio.c b/fs/aio.c
index 492f1a8b7661..7c547247b056 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 
+#include 
 #include 
 #include 
 #include 
@@ -1892,6 +1893,87 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
return ret;
 }
 
+#ifdef CONFIG_VE
+static bool has_reqs_active(struct kioctx *ctx)
+{
+   unsigned long flags;
+   unsigned nr;
+
+   spin_lock_irqsave(&ctx->completion_lock, flags);
+   nr = (ctx->nr_events - 1) - atomic_read(&ctx->reqs_available);
+   nr -= ctx->completed_events;
+   spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+   return !!nr;
+}
+
+static int ve_aio_wait_inflight_reqs(struct task_struct *p)
+{
+   struct mm_struct *mm;
+   struct kioctx_table *table;
+   int ret, i;
+
+   if (p->flags & PF_KTHREAD)
+   return -EINVAL;
+
+   task_lock(p);
+   mm = p->mm;
+   if (mm)
+   atomic_inc(&mm->mm_count);
+   task_unlock(p);
+   if (!mm)
+   return -ESRCH;
+
+again:
+   spin_lock_irq(&mm->ioctx_lock);
+   rcu_read_lock();
+   table = rcu_dereference(mm->ioctx_table);
+   for (i = 0; i < table->nr; i++) {
+   struct kioctx *ctx;
+
+   ctx = rcu_dereference(table->table[i]);
+   if (!ctx)
+   continue;
+
+   if (!has_reqs_active(ctx))
+   continue;
+
+   percpu_ref_get(&ctx->users);
+   rcu_read_unlock();
+   spin_unlock_irq(&mm->ioctx_lock);
+
+   ret = wait_event_interruptible(ctx->wait, !has_reqs_active(ctx));
+   percpu_ref_put(&ctx->users);
+
+   if (ret)
+   goto mmdrop;
+   goto again;
+   }
+
+   rcu_read_unlock();
+   spin_unlock_irq(&mm->ioctx_lock);
+   ret = 0;
+mmdrop:
+   mmdrop(mm);
+   return ret;
+}
+
+int ve_aio_ioctl(struct task_struct *task, unsigned int cmd, unsigned long arg)
+{
+   int ret;
+
+   switch (cmd) {
+   case VE_AIO_IOC_WAIT_ACTIVE:
+   ret = ve_aio_wait_inflight_reqs(task);
+   break;
+   default:
+   ret = -EINVAL;
+   }
+
+   return ret;
+}
+#endif
+
 struct __aio_sigset {
const sigset_t __user   *sigmask;
size_t  sigsetsize;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 7c0fd93ba7d1..38268a980989 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -

[Devel] [PATCH rh7] fs: Drop "pos" argument from {read, write}_iter() callbacks

2020-12-23 Thread Konstantin Khorenko
This syncs the file_operations::read_iter()/write_iter() declarations with
the mainstream versions.

We brought in the _iter() interface before it appeared in mainstream, so
the callback declarations are not in sync.

Usually this does not hurt, but zfs is a bit upset that we have
address_space_operations::direct_IO() which uses iters, while
file_operations::read_iter()/write_iter() appear unimplemented (they are
implemented in the vz kernel, but have a different declaration and thus are
not detected correctly).

So, let's make zfs happy.
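
For reference, these are the mainstream-style declarations this patch
converges to; the file position now travels in iocb->ki_pos instead of a
third argument:

	/* struct file_operations members, mainstream form */
	ssize_t (*read_iter)(struct kiocb *iocb, struct iov_iter *iter);
	ssize_t (*write_iter)(struct kiocb *iocb, struct iov_iter *iter);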

To_merge: eddb0e72476d ("fs/aio: kernel direct aio")

Signed-off-by: Konstantin Khorenko 
---
 fs/aio.c   |  4 ++--
 fs/ext4/file.c | 29 +++--
 include/linux/fs.h |  8 
 mm/filemap.c   | 24 
 4 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index f1b27fc5defb..f0a9e1613529 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1343,7 +1343,7 @@ static ssize_t aio_read_iter(struct kiocb *iocb)
if (!file->f_op->read_iter)
return -EINVAL;
 
-   return file->f_op->read_iter(iocb, iocb->ki_iter, iocb->ki_pos);
+   return file->f_op->read_iter(iocb, iocb->ki_iter);
 }
 
 static ssize_t aio_write_iter(struct kiocb *iocb)
@@ -1365,7 +1365,7 @@ static ssize_t aio_write_iter(struct kiocb *iocb)
return -EINVAL;
 
file_start_write(file);
-   ret = file->f_op->write_iter(iocb, iocb->ki_iter, iocb->ki_pos);
+   ret = file->f_op->write_iter(iocb, iocb->ki_iter);
file_end_write(file);
return ret;
 }
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 67a385e9f716..2b3a73f90163 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -318,26 +318,26 @@ ext4_file_dax_write(
 #endif
 
 static ssize_t
-ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
+ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 {
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
int overwrite = 0;
 
-   ret = ext4_write_checks(iocb, iter, &pos);
+   ret = ext4_write_checks(iocb, iter, &iocb->ki_pos);
if (ret <= 0)
return ret;
 
 #ifdef CONFIG_FS_DAX
if (IS_DAX(inode))
-   return ext4_file_dax_write(iocb, iter, pos);
+   return ext4_file_dax_write(iocb, iter, iocb->ki_pos);
 #endif
 
iocb->private = &overwrite; /* RHEL7 only - prevent DIO race */
if (unlikely(io_is_direct(iocb->ki_filp)))
-   ret = ext4_file_dio_write(iocb, iter, pos);
+   ret = ext4_file_dio_write(iocb, iter, iocb->ki_pos);
else
-   ret = generic_file_write_iter(iocb, iter, pos);
+   ret = generic_file_write_iter(iocb, iter);
 
return ret;
 }
@@ -350,7 +350,8 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 
iov_iter_init(&iter, iov, nr_segs, iov_length(iov, nr_segs), 0);
 
-   return ext4_file_write_iter(iocb, &iter, pos);
+   BUG_ON(iocb->ki_pos != pos);
+   return ext4_file_write_iter(iocb, &iter);
 }
 
 #ifdef CONFIG_FS_DAX
@@ -585,8 +586,7 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
 static ssize_t
 ext4_file_dax_read_iter(
struct kiocb*iocb,
-   struct iov_iter *iter,
-   loff_t  pos)
+   struct iov_iter *iter)
 {
size_t  size = iov_iter_count(iter);
ssize_t ret = 0;
@@ -603,10 +603,10 @@ ext4_file_dax_read_iter(
if (!IS_DAX(inode)) {
inode_unlock(inode);
/* Fallback to buffered IO in case we cannot support DAX */
-   return generic_file_read_iter(iocb, iter, pos);
+   return generic_file_read_iter(iocb, iter);
}
 
-   ret = dax_iomap_rw(READ, iocb, iter, pos,
+   ret = dax_iomap_rw(READ, iocb, iter, iocb->ki_pos,
size, &ext4_iomap_ops);
inode_unlock(inode);
 
@@ -616,13 +616,13 @@ ext4_file_dax_read_iter(
 #endif
 
 ssize_t
-ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
+ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 {
 #ifdef CONFIG_FS_DAX
if (IS_DAX(file_inode(iocb->ki_filp)))
-   return ext4_file_dax_read_iter(iocb, iter, pos);
+   return ext4_file_dax_read_iter(iocb, iter);
 #endif
-   return generic_file_read_iter(iocb, iter, pos);
+   return generic_file_read_iter(iocb, iter);
 }
 
 static ssize_t
@@ -636,7 +636,8 @@ ext4_file_read(
struct iov_iter iter;
 
iov_iter_init(&iter, iovp, nr_segs, size, 0);
-   return ext4_file_read_iter(iocb, &iter, pos);
+   BUG_ON(iocb->ki_pos != pos);
+   return ext4_file_read_iter(iocb, &iter);
 }
 
 const struc

[Devel] [PATCH rh7 v2 0/4] fs/iov_iter: Fix ZFS kernel module compilation

2020-12-25 Thread Konstantin Khorenko
Commit 9fdccb71a24b ("fs: Pass iov_iter to ->direct_IO") in the VZ kernel
has changed the interface of the .direct_IO() callback, and zfs now assumes
the VZ kernel has iov_iters like in mainstream, but this was not completely
true.

As we added iters long ago, the interface differs from the mainstream
version, which breaks zfs compilation.

This patchset makes our iter interface closer to the mainstream version,
which should be enough for zfs.

https://bugs.openvz.org/browse/OVZ-7243

Konstantin Khorenko (4):
  fs: Drop "pos" argument from {read,write}_iter() callbacks
  fs/iov_iter: Introduce and use iov_iter.type instead of "ops"
verification
  fs/iov_iter: Introduce "iov" member in struct iov_iter for ITER_IOVEC
  fs/iov_iter: Drop ITER_PAGE iov_iter primitive - no users

 fs/aio.c   |   4 +-
 fs/ceph/file.c |   4 +-
 fs/ext4/file.c |  29 +++
 include/linux/fs.h | 190 +
 mm/filemap.c   |  24 +++---
 mm/iov-iter.c  | 141 -
 6 files changed, 137 insertions(+), 255 deletions(-)

-- 
2.24.3



[Devel] [PATCH rh7 v2 1/4] fs: Drop "pos" argument from {read, write}_iter() callbacks

2020-12-25 Thread Konstantin Khorenko
This syncs the file_operations::read_iter()/write_iter() declarations with
the mainstream versions.

We brought in the _iter() interface before it appeared in mainstream, so
the callback declarations are not in sync.

Usually this does not hurt, but zfs is a bit upset that we have
address_space_operations::direct_IO() which uses iters, while
file_operations::read_iter()/write_iter() appear unimplemented
(they are implemented in the vz kernel, but have a different declaration
and thus are not detected correctly).

So, let's make zfs happy.

To_merge: eddb0e72476d ("fs/aio: kernel direct aio")

https://bugs.openvz.org/browse/OVZ-7243

Signed-off-by: Konstantin Khorenko 
---
 fs/aio.c   |  4 ++--
 fs/ext4/file.c | 29 +++--
 include/linux/fs.h |  8 
 mm/filemap.c   | 24 
 4 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index f1b27fc5defb..f0a9e1613529 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1343,7 +1343,7 @@ static ssize_t aio_read_iter(struct kiocb *iocb)
if (!file->f_op->read_iter)
return -EINVAL;
 
-   return file->f_op->read_iter(iocb, iocb->ki_iter, iocb->ki_pos);
+   return file->f_op->read_iter(iocb, iocb->ki_iter);
 }
 
 static ssize_t aio_write_iter(struct kiocb *iocb)
@@ -1365,7 +1365,7 @@ static ssize_t aio_write_iter(struct kiocb *iocb)
return -EINVAL;
 
file_start_write(file);
-   ret = file->f_op->write_iter(iocb, iocb->ki_iter, iocb->ki_pos);
+   ret = file->f_op->write_iter(iocb, iocb->ki_iter);
file_end_write(file);
return ret;
 }
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 67a385e9f716..2b3a73f90163 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -318,26 +318,26 @@ ext4_file_dax_write(
 #endif
 
 static ssize_t
-ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
+ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 {
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
int overwrite = 0;
 
-   ret = ext4_write_checks(iocb, iter, &pos);
+   ret = ext4_write_checks(iocb, iter, &iocb->ki_pos);
if (ret <= 0)
return ret;
 
 #ifdef CONFIG_FS_DAX
if (IS_DAX(inode))
-   return ext4_file_dax_write(iocb, iter, pos);
+   return ext4_file_dax_write(iocb, iter, iocb->ki_pos);
 #endif
 
iocb->private = &overwrite; /* RHEL7 only - prevent DIO race */
if (unlikely(io_is_direct(iocb->ki_filp)))
-   ret = ext4_file_dio_write(iocb, iter, pos);
+   ret = ext4_file_dio_write(iocb, iter, iocb->ki_pos);
else
-   ret = generic_file_write_iter(iocb, iter, pos);
+   ret = generic_file_write_iter(iocb, iter);
 
return ret;
 }
@@ -350,7 +350,8 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 
iov_iter_init(&iter, iov, nr_segs, iov_length(iov, nr_segs), 0);
 
-   return ext4_file_write_iter(iocb, &iter, pos);
+   BUG_ON(iocb->ki_pos != pos);
+   return ext4_file_write_iter(iocb, &iter);
 }
 
 #ifdef CONFIG_FS_DAX
@@ -585,8 +586,7 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
 static ssize_t
 ext4_file_dax_read_iter(
struct kiocb*iocb,
-   struct iov_iter *iter,
-   loff_t  pos)
+   struct iov_iter *iter)
 {
size_t  size = iov_iter_count(iter);
ssize_t ret = 0;
@@ -603,10 +603,10 @@ ext4_file_dax_read_iter(
if (!IS_DAX(inode)) {
inode_unlock(inode);
/* Fallback to buffered IO in case we cannot support DAX */
-   return generic_file_read_iter(iocb, iter, pos);
+   return generic_file_read_iter(iocb, iter);
}
 
-   ret = dax_iomap_rw(READ, iocb, iter, pos,
+   ret = dax_iomap_rw(READ, iocb, iter, iocb->ki_pos,
size, &ext4_iomap_ops);
inode_unlock(inode);
 
@@ -616,13 +616,13 @@ ext4_file_dax_read_iter(
 #endif
 
 ssize_t
-ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
+ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 {
 #ifdef CONFIG_FS_DAX
if (IS_DAX(file_inode(iocb->ki_filp)))
-   return ext4_file_dax_read_iter(iocb, iter, pos);
+   return ext4_file_dax_read_iter(iocb, iter);
 #endif
-   return generic_file_read_iter(iocb, iter, pos);
+   return generic_file_read_iter(iocb, iter);
 }
 
 static ssize_t
@@ -636,7 +636,8 @@ ext4_file_read(
struct iov_iter iter;
 
iov_iter_init(&iter, iovp, nr_segs, size, 0);
-   return ext4_file_read_iter(iocb, &iter, pos);
+   BUG_ON(iocb->ki_pos != pos);
+   return ext4_file_read_it

[Devel] [PATCH rh7 v2 2/4] fs/iov_iter: Introduce and use iov_iter.type instead of "ops" verification

2020-12-25 Thread Konstantin Khorenko
The zfs code checks the iov_iter type via the "type" struct field, so
we have to
  1) have it
  2) put proper types there (see the sketch below)
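
A minimal sketch of the kind of check external code can now do, assuming it
compares the field against the enum values directly (in this vz scheme "type"
holds exactly one enum value, not a direction bit plus a mask):

	/* sketch: what a module like zfs can now test */
	static inline bool vz_iter_is_iovec(const struct iov_iter *i)
	{
		return i->type == ITER_IOVEC;
	}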

https://bugs.openvz.org/browse/OVZ-7243

Signed-off-by: Konstantin Khorenko 
---
 include/linux/fs.h | 164 +
 1 file changed, 105 insertions(+), 59 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2f498238202a..61dcc20052c3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -330,14 +330,69 @@ struct page;
 struct address_space;
 struct writeback_control;
 
+enum iter_type {
+   /* iter types */
+   ITER_IOVEC  = 4,/* ii_iovec_ops */
+   ITER_KVEC   = 8,/* ms type, not used in vzkernel */
+   ITER_BVEC   = 16,   /* ii_bvec_ops  */
+   ITER_PIPE   = 32,   /* ms type, not used in vzkernel */
+   ITER_DISCARD= 64,   /* ms type, not used in vzkernel */
+   ITER_PAGE   = 128,  /* ii_page_ops, currently not used */
+   ITER_PLAIN  = 256,  /* ii_plain_ops */
+   ITER_BAD= 512,  /* ii_bad_ops   */
+};
+
 struct iov_iter {
-   struct iov_iter_ops *ops;
+   /*
+* Bit 0 is the read/write bit, set if we're writing.
+* !!! VZ kernel does not set/use bit 0 for direction !!!
+*
+* Bit 1 is the BVEC_FLAG_NO_REF bit, set if type is a bvec and
+* the caller isn't expecting to drop a page reference when done.
+*/
+   unsigned int type;
+
unsigned long data;
unsigned long nr_segs;
size_t iov_offset;
size_t count;
 };
 
+extern struct iov_iter_ops ii_bvec_ops;
+extern struct iov_iter_ops ii_page_ops;
+extern struct iov_iter_ops ii_iovec_ops;
+extern struct iov_iter_ops ii_bad_ops;
+extern struct iov_iter_ops ii_plain_ops;
+
+static inline struct iov_iter_ops *iov_iter_get_ops(const struct iov_iter *iter)
+{
+   switch (iter->type) {
+   case ITER_IOVEC:
+   return &ii_iovec_ops;
+   /*
+   case ITER_KVEC:
+   return lalala;
+   */
+   case ITER_BVEC:
+   return &ii_bvec_ops;
+   /*
+   case ITER_PIPE:
+   return lalala;
+   case ITER_DISCARD:
+   return lalala;
+   */
+   case ITER_PAGE:
+   return &ii_page_ops;
+   case ITER_PLAIN:
+   return &ii_plain_ops;
+   case ITER_BAD:
+   return &ii_bad_ops;
+   default:
+   /* we must be aware about unexpected types */
+   BUG();
+   }
+}
+
 struct iov_iter_ops {
size_t (*ii_copy_to_user_atomic)(struct page *, struct iov_iter *,
 unsigned long, size_t);
@@ -359,59 +414,81 @@ struct iov_iter_ops {
 static inline size_t iov_iter_copy_to_user_atomic(struct page *page,
 struct iov_iter *i, unsigned long offset, size_t bytes)
 {
-   return i->ops->ii_copy_to_user_atomic(page, i, offset, bytes);
+   return iov_iter_get_ops(i)->ii_copy_to_user_atomic(page, i, offset,
+  bytes);
 }
 static inline size_t iov_iter_copy_to_user(struct page *page,
struct iov_iter *i, unsigned long offset, size_t bytes)
 {
-   return i->ops->ii_copy_to_user(page, i, offset, bytes);
+   return iov_iter_get_ops(i)->ii_copy_to_user(page, i, offset, bytes);
 }
 static inline size_t iov_iter_copy_from_user_atomic(struct page *page,
 struct iov_iter *i, unsigned long offset, size_t bytes)
 {
-   return i->ops->ii_copy_from_user_atomic(page, i, offset, bytes);
+   return iov_iter_get_ops(i)->ii_copy_from_user_atomic(page, i, offset,
+bytes);
 }
 static inline size_t iov_iter_copy_from_user(struct page *page,
struct iov_iter *i, unsigned long offset, size_t bytes)
 {
-   return i->ops->ii_copy_from_user(page, i, offset, bytes);
+   return iov_iter_get_ops(i)->ii_copy_from_user(page, i, offset, bytes);
 }
 static inline void iov_iter_advance(struct iov_iter *i, size_t bytes)
 {
-   return i->ops->ii_advance(i, bytes);
+   return iov_iter_get_ops(i)->ii_advance(i, bytes);
 }
 static inline int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
 {
-   return i->ops->ii_fault_in_readable(i, bytes);
+   return iov_iter_get_ops(i)->ii_fault_in_readable(i, bytes);
 }
 static inline size_t iov_iter_single_seg_count(const struct iov_iter *i)
 {
-   return i->ops->ii_single_seg_count(i);
+   return iov_iter_get_ops(i)->ii_single_seg_count(i);
 }
 static inline int iov_iter_shorten(struct iov_iter *i, size_t count)
 {
-   return i->ops->ii_shorten(i, count);
+   return iov_iter_get_ops(i)->ii_shorten(i, count);
 }
 
 static inline void *iov_iter_kmap_atomic(const struct iov_iter *i, 

[Devel] [PATCH rh7 v2 3/4] fs/iov_iter: Introduce "iov" member in struct iov_iter for ITER_IOVEC

2020-12-25 Thread Konstantin Khorenko
The zfs code expects the data of an ITER_IOVEC iov_iter to be accessed
via the .iov member, so add the field to struct iov_iter
and fill it with the data correctly; a small access sketch follows.
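
A small access sketch mirroring the ceph hunks below: locate the first
unconsumed byte of the current segment of an ITER_IOVEC iter.

	/* sketch: current user-space position inside an ITER_IOVEC iter */
	static inline void __user *iter_cur_base(const struct iov_iter *i)
	{
		const struct iovec *iov = iov_iter_iovec(i);

		return iov->iov_base + i->iov_offset;
	}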

https://bugs.openvz.org/browse/OVZ-7243

Signed-off-by: Konstantin Khorenko 
---
 fs/ceph/file.c |  4 ++--
 include/linux/fs.h | 14 --
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 4302940a8a6c..1d06a474fa68 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -115,7 +115,7 @@ dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
return ERR_PTR(-ENOMEM);
 
for (idx = 0; idx < npages; ) {
-   struct iovec *tmp_iov = iov_iter_iovec(&tmp_it);
+   const struct iovec *tmp_iov = iov_iter_iovec(&tmp_it);
void __user *data = tmp_iov->iov_base + tmp_it.iov_offset;
size_t off = (unsigned long)data & (PAGE_SIZE - 1);
size_t len = min_t(size_t, nbytes,
@@ -1216,7 +1216,7 @@ static ssize_t inline_to_iov(struct kiocb *iocb, struct iov_iter *i,
zero_user_segment(inline_page, inline_len, end);
 
while (left) {
-   struct iovec *iov = iov_iter_iovec(i);
+   const struct iovec *iov = iov_iter_iovec(i);
void __user *udata = iov->iov_base;
size_t n = min(iov->iov_len - i->iov_offset, left);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 61dcc20052c3..d4c143fa7212 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -351,11 +351,13 @@ struct iov_iter {
 * the caller isn't expecting to drop a page reference when done.
 */
unsigned int type;
-
-   unsigned long data;
-   unsigned long nr_segs;
size_t iov_offset;
size_t count;
+   union {
+   const struct iovec *iov;
+   unsigned long data;
+   };
+   unsigned long nr_segs;
 };
 
 extern struct iov_iter_ops ii_bvec_ops;
@@ -473,7 +475,7 @@ static inline void iov_iter_init(struct iov_iter *i,
size_t count, size_t written)
 {
i->type = ITER_IOVEC;
-   i->data = (unsigned long)iov;
+   i->iov = iov;
i->nr_segs = nr_segs;
i->iov_offset = 0;
i->count = count + written;
@@ -484,10 +486,10 @@ static inline int iov_iter_has_iovec(const struct iov_iter *i)
 {
return i->type == ITER_IOVEC;
 }
-static inline struct iovec *iov_iter_iovec(const struct iov_iter *i)
+static inline const struct iovec *iov_iter_iovec(const struct iov_iter *i)
 {
BUG_ON(!iov_iter_has_iovec(i));
-   return (struct iovec *)i->data;
+   return i->iov;
 }
 
 struct bio_vec;
-- 
2.24.3



[Devel] [PATCH rh7 v2 4/4] fs/iov_iter: Drop ITER_PAGE iov_iter primitive - no users

2020-12-25 Thread Konstantin Khorenko
There is no such iterator in the mainstream kernel either =>
a good reason to drop it.

To_merge: 3f4f3e9c9d8e ("mm: extend generic iov iterator API")

Signed-off-by: Konstantin Khorenko 
---
 include/linux/fs.h |  26 -
 mm/iov-iter.c  | 141 -
 2 files changed, 167 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index d4c143fa7212..f30f19fa0b33 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -337,7 +337,6 @@ enum iter_type {
ITER_BVEC   = 16,   /* ii_bvec_ops  */
ITER_PIPE   = 32,   /* ms type, not used in vzkernel */
ITER_DISCARD= 64,   /* ms type, not used in vzkernel */
-   ITER_PAGE   = 128,  /* ii_page_ops, currently not used */
ITER_PLAIN  = 256,  /* ii_plain_ops */
ITER_BAD= 512,  /* ii_bad_ops   */
 };
@@ -361,7 +360,6 @@ struct iov_iter {
 };
 
 extern struct iov_iter_ops ii_bvec_ops;
-extern struct iov_iter_ops ii_page_ops;
 extern struct iov_iter_ops ii_iovec_ops;
 extern struct iov_iter_ops ii_bad_ops;
 extern struct iov_iter_ops ii_plain_ops;
@@ -383,8 +381,6 @@ static inline struct iov_iter_ops *iov_iter_get_ops(const struct iov_iter *iter)
case ITER_DISCARD:
return lalala;
*/
-   case ITER_PAGE:
-   return &ii_page_ops;
case ITER_PLAIN:
return &ii_plain_ops;
case ITER_BAD:
@@ -516,28 +512,6 @@ static inline struct bio_vec *iov_iter_bvec(struct iov_iter *i)
return (struct bio_vec *)i->data;
 }
 
-static inline void iov_iter_init_page(struct iov_iter *i,
- struct page *page,
- size_t count, size_t written)
-{
-   i->type = ITER_PAGE;
-   i->data = (unsigned long)page;
-   i->nr_segs = 1;
-   i->iov_offset = 0;
-   i->count = count + written;
-
-   iov_iter_advance(i, written);
-}
-static inline int iov_iter_has_page(struct iov_iter *i)
-{
-   return i->type == ITER_PAGE;
-}
-static inline struct page *iov_iter_page(struct iov_iter *i)
-{
-   BUG_ON(!iov_iter_has_page(i));
-   return (struct page *)i->data;
-}
-
 static inline void iov_iter_init_plain(struct iov_iter *i, void *data,
   size_t count, size_t written)
 {
diff --git a/mm/iov-iter.c b/mm/iov-iter.c
index 04d92fb9bfcf..99ededb7f2cc 100644
--- a/mm/iov-iter.c
+++ b/mm/iov-iter.c
@@ -626,147 +626,6 @@ struct iov_iter_ops ii_bvec_ops = {
 };
 EXPORT_SYMBOL(ii_bvec_ops);
 
-/* Functions to get on with single page */
-
-static void *ii_page_kmap_atomic(const struct iov_iter *iter, void **bp,
-size_t *len)
-{
-   struct page *page = (struct page *)iter->data;
-   void *map;
-
-   BUG_ON(iter->iov_offset >= PAGE_SIZE);
-   map = kmap_atomic(page);
-   *bp = map + iter->iov_offset;
-   *len = iter->count;
-   return map;
-}
-
-static struct page *ii_page_kmap(const struct iov_iter *iter, void **bp,
-size_t *len)
-{
-   struct page *page = (struct page *)iter->data;
-   void *map;
-
-   BUG_ON(iter->iov_offset >= PAGE_SIZE);
-   map = kmap(page);
-   *bp = map + iter->iov_offset;
-   *len = iter->count;
-   return page;
-}
-
-static struct page *ii_page_get_page(const struct iov_iter *iter, size_t *off,
-size_t *len)
-{
-   struct page *page = (struct page *)iter->data;
-
-   if (!get_page_is_safe(page))
-   return NULL;
-
-   *off = iter->iov_offset;
-   *len = iter->count;
-   get_page(page);
-
-   return page;
-}
-
-static size_t page_copy_tofrom_page(struct iov_iter *iter, struct page *page,
-   unsigned long page_offset, size_t bytes,
-   int topage)
-{
-   struct page *ipage = (struct page *)iter->data;
-   size_t ipage_offset = iter->iov_offset;
-   void *ipage_map;
-   void *page_map;
-
-   BUG_ON(bytes > iter->count);
-   BUG_ON(bytes > PAGE_SIZE - ipage_offset);
-   BUG_ON(ipage_offset >= PAGE_SIZE);
-
-   page_map = kmap_atomic(page);
-   ipage_map = kmap_atomic(ipage);
-
-   if (topage)
-   memcpy(page_map + page_offset,
-  ipage_map + ipage_offset,
-  bytes);
-   else
-   memcpy(ipage_map + ipage_offset,
-  page_map + page_offset,
-  bytes);
-
-   kunmap_atomic(ipage_map);
-   kunmap_atomic(page_map);
-
-   return bytes;
-}
-
-size_t ii_page_copy_to_user_atomic(struct page *page, struct iov_iter *i,
-  unsigned long offset, size_t bytes)
-{
-   return page_copy_tofrom_

Re: [Devel] [PATCH RH7] nat: allow nft NAT and iptables NAT work on the same node

2020-12-28 Thread Konstantin Khorenko

On 12/28/2020 09:46 AM, Vasily Averin wrote:

The netfilter NAT core cannot deal with more than one NAT hook
per hook location (prerouting, input ...), because the NAT hooks install
a NAT null binding in case the iptables nat table (iptable_nat hooks)
or the corresponding nftables chain (nft nat hooks) doesn't specify a nat
transformation.

Currently the iptables NAT hook is called in all netns, even if the
corresponding iptables NAT tables (ipv4 and ipv6 have separate tables) are
empty and it does nothing. This blocks execution of the corresponding nft
NAT hook, even if one is present. The same is true in the reverse direction:
if the nft NAT hook was called first, it blocks execution of the iptables
NAT hook, because the corresponding conntrack already has a NAT null binding.

This patch lets the nft NAT hook check whether the corresponding NAT kind
is enabled and in use in the current net namespace.

The patch does not allow nft to add new NAT chains if the netns already has
another one: either iptables NAT or another nft NAT chain with the same
priority.

The patch does not block loading of the iptables NAT if nft NAT is already
present, because that is quite a rare case.

In general this patch allows NAT to work both on the host, inside
iptables containers, and inside centos8 containers where only nftables NAT
is used, without any additional configuration.

https://jira.sw.ru/browse/PSBM-123345
Signed-off-by: Vasily Averin 


Reviewed-by: Konstantin Khorenko 


---
 include/net/netfilter/nf_nat.h   | 24 +---
 net/ipv4/netfilter/nf_nat_l3proto_ipv4.c |  4 ++--
 net/ipv6/netfilter/nf_nat_l3proto_ipv6.c |  4 ++--
 net/netfilter/core.c | 13 -
 4 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index c9ca6ed..4308180 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -79,23 +79,33 @@ static inline bool nf_nat_oif_changed(unsigned int hooknum,
 }

 /*
- * Check if nft chain's netns fits conntrack netns.
+ * Check if the netns has NAT enabled.
  * Uses ops->is_nft_ops flag to detect nft ops.
- * If ops is not nft-related, the check is considered passed.
+ * If ops is not nft-related we need to be sure
+ * that the corresponding ipt nat table is not empty and can be in use.
  */
-#define is_valid_netns(ops, ct) ({ \
+#define netns_nat_check(ops, pf, __net) ({ \
const struct nft_chain *__chain;\
const struct net *__chain_net;  \
-   const struct net *__net;\
bool __ret; \
\
if (ops->is_nft_ops) {   \
__chain = ops->priv; \
__chain_net = read_pnet(&nft_base_chain(__chain)->pnet);\
-   __net = nf_ct_net(ct);  \
__ret = net_eq(__net, __chain_net); \
-   } else  \
-   __ret = true;   \
+   } else {\
+   struct xt_table_info *__priv = NULL;\
+   if (pf == NFPROTO_IPV4 &&   \
+   !IS_ERR_OR_NULL(__net->ipv4.nat_table))  \
+   __priv = __net->ipv4.nat_table->private   \

   ^

Add ";".
Just for code beauty. :)


+   else if (pf == NFPROTO_IPV6 &&  \
+!IS_ERR_OR_NULL(__net->ipv6.ip6table_nat))  \
+   __priv = __net->ipv6.ip6table_nat->private;   \
+   if (__priv && __priv->number > __priv->initial_entries)\
+   __ret = true;   \
+   else\
+   __ret = false;  \
+   }   \
__ret;  \
 })
 #endif
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index 3a261e2..a202287 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -292,8 +292,8 @@ nf_nat_ipv4_fn(const struct nf_hook_ops *ops, struct sk_buff *skb,
if (!nf_nat_initialized(ct, maniptype)) {
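
[The hunk is cut off here in the archive; a hedged sketch of the assumed
continuation, consulting the new check before a null binding would be
installed:]

		/* assumed continuation of the truncated hunk above */
		if (!netns_nat_check(ops, NFPROTO_IPV4, nf_ct_net(ct)))
			return NF_ACCEPT;	/* this NAT kind is not in use here */
		/* ... otherwise proceed to the existing null-binding setup ... */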
 

[Devel] [PATCH rh7] fs: Resurrect generic_segment_checks()

2020-12-29 Thread Konstantin Khorenko
This is a partial revert of a9ca9d715754 ("kill generic_segment_checks()").

The patch resurrects the function, but does not restore any call for it.

Why is this function needed then?
For zfs.

The VZ kernel does not have file_operations::read_iter()/write_iter()
compatible with the mainstream version, thus the zfs external module
tries to live without iters and assumes generic_segment_checks() exists.

So keep generic_segment_checks() specially for zfs.
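
For reference, the typical pre-iter caller pattern (as in the old mainstream
aio write paths, given the caller's iov/nr_segs; VERIFY_READ because a write
reads the user buffers):

	size_t count;
	int ret;

	ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
	if (ret)
		return ret;
	/* 'count' now holds the validated total number of bytes */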

https://bugs.openvz.org/browse/OVZ-7243

Signed-off-by: Konstantin Khorenko 
---
 include/linux/fs.h |  2 ++
 mm/filemap.c   | 39 +++
 2 files changed, 41 insertions(+)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8398ec202a42..aee8adfc1252 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3255,6 +3255,8 @@ extern ssize_t generic_file_buffered_write_iter(struct kiocb *, struct iov_iter
loff_t, loff_t *, ssize_t);
 extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos);
 extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos);
+extern int generic_segment_checks(const struct iovec *iov,
+   unsigned long *nr_segs, size_t *count, int access_flags);
 
 /* fs/block_dev.c */
 extern ssize_t blkdev_aio_read(struct kiocb *, const struct iovec *,
diff --git a/mm/filemap.c b/mm/filemap.c
index 950d92b6059b..585c57ef1e0a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1981,6 +1981,45 @@ int file_read_actor(read_descriptor_t *desc, struct page *page,
return size;
 }
 
+/*
+ * Performs necessary checks before doing a write
+ * @iov:   io vector request
+ * @nr_segs:   number of segments in the iovec
+ * @count: number of bytes to write
+ * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
+ *
+ * Adjust number of segments and amount of bytes to write (nr_segs should be
+ * properly initialized first). Returns appropriate error code that caller
+ * should return or zero in case that write should be allowed.
+ */
+int generic_segment_checks(const struct iovec *iov,
+   unsigned long *nr_segs, size_t *count, int access_flags)
+{
+   unsigned long   seg;
+   size_t cnt = 0;
+   for (seg = 0; seg < *nr_segs; seg++) {
+   const struct iovec *iv = &iov[seg];
+
+   /*
+* If any segment has a negative length, or the cumulative
+* length ever wraps negative then return -EINVAL.
+*/
+   cnt += iv->iov_len;
+   if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
+   return -EINVAL;
+   if (access_ok(access_flags, iv->iov_base, iv->iov_len))
+   continue;
+   if (seg == 0)
+   return -EFAULT;
+   *nr_segs = seg;
+   cnt -= iv->iov_len; /* This segment is no good */
+   break;
+   }
+   *count = cnt;
+   return 0;
+}
+EXPORT_SYMBOL(generic_segment_checks);
+
 static int file_read_iter_actor(read_descriptor_t *desc, struct page *page,
unsigned long offset, unsigned long size)
 {
-- 
2.24.3



Re: [Devel] [PATCH rh7] fs: Resurrect generic_segment_checks()

2020-12-30 Thread Konstantin Khorenko

On 12/30/2020 08:32 AM, Vasily Averin wrote:

2 questions:
1) Is it used in our kernel?

Nope, not used, as it's said in the commit message.


if not -- I'm afraid it will not be compiled

Will be.


if so -- I doubt it should be exported.

Will be.


In other words:
are you sure the "EXPORT_SYMBOL(generic_segment_checks);" line is required?

Yes, I'm sure.

I've tested zfs compilation on the test kernel both with and without the patch.


2) Is your previous patch set "fs/iov_iter: Fix ZFS kernel module compilation"
still required too?


Nope, it's not needed anymore.

We can probably drop ITER_PAGE - it's just a cleanup.
I'll send that patch once again.


Thank you,
    Vasily Averin

On 12/29/20 4:40 PM, Konstantin Khorenko wrote:

This is a partial revert of a9ca9d715754 ("kill generic_segment_checks()").

The patch resurrects the function, but does not restore any call for it.

Why is this function needed then?
For zfs.

The VZ kernel does not have file_operations::read_iter()/write_iter()
compatible with the mainstream version, thus the zfs external module
tries to live without iters and assumes generic_segment_checks() exists.

So keep generic_segment_checks() specially for zfs.

https://bugs.openvz.org/browse/OVZ-7243

Signed-off-by: Konstantin Khorenko 
---
 include/linux/fs.h |  2 ++
 mm/filemap.c   | 39 +++
 2 files changed, 41 insertions(+)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8398ec202a42..aee8adfc1252 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3255,6 +3255,8 @@ extern ssize_t generic_file_buffered_write_iter(struct kiocb *, struct iov_iter
loff_t, loff_t *, ssize_t);
 extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos);
 extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos);
+extern int generic_segment_checks(const struct iovec *iov,
+   unsigned long *nr_segs, size_t *count, int access_flags);

 /* fs/block_dev.c */
 extern ssize_t blkdev_aio_read(struct kiocb *, const struct iovec *,
diff --git a/mm/filemap.c b/mm/filemap.c
index 950d92b6059b..585c57ef1e0a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1981,6 +1981,45 @@ int file_read_actor(read_descriptor_t *desc, struct page *page,
return size;
 }

+/*
+ * Performs necessary checks before doing a write
+ * @iov:   io vector request
+ * @nr_segs:   number of segments in the iovec
+ * @count: number of bytes to write
+ * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE
+ *
+ * Adjust number of segments and amount of bytes to write (nr_segs should be
+ * properly initialized first). Returns appropriate error code that caller
+ * should return or zero in case that write should be allowed.
+ */
+int generic_segment_checks(const struct iovec *iov,
+   unsigned long *nr_segs, size_t *count, int access_flags)
+{
+   unsigned long   seg;
+   size_t cnt = 0;
+   for (seg = 0; seg < *nr_segs; seg++) {
+   const struct iovec *iv = &iov[seg];
+
+   /*
+* If any segment has a negative length, or the cumulative
+* length ever wraps negative then return -EINVAL.
+*/
+   cnt += iv->iov_len;
+   if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
+   return -EINVAL;
+   if (access_ok(access_flags, iv->iov_base, iv->iov_len))
+   continue;
+   if (seg == 0)
+   return -EFAULT;
+   *nr_segs = seg;
+   cnt -= iv->iov_len;  /* This segment is no good */
+   break;
+   }
+   *count = cnt;
+   return 0;
+}
+EXPORT_SYMBOL(generic_segment_checks);
+
 static int file_read_iter_actor(read_descriptor_t *desc, struct page *page,
unsigned long offset, unsigned long size)
 {


.




Re: [Devel] [PATCH rh7 v2 0/4] fs/iov_iter: Fix ZFS kernel module compilation

2020-12-30 Thread Konstantin Khorenko

The patchset is obsoleted by the patch

[PATCH rh7] fs: Resurrect generic_segment_checks()

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

On 12/25/2020 07:15 PM, Konstantin Khorenko wrote:

Commit 9fdccb71a24b ("fs: Pass iov_iter to ->direct_IO") in VZ kernel
has changed the interface of .direct_IO() callback and
zfs assumes now VZ kernel has iov_iters like in mainstream,
but this was not completely true.

As we have added iters long ago, the interface is different from
mainstream's version which breaks zfs compilation.

This patchset make our iter interface close to mainstream version
which should be enough for zfs.

https://bugs.openvz.org/browse/OVZ-7243

Konstantin Khorenko (4):
  fs: Drop "pos" argument from {read,write}_iter() callbacks
  fs/iov_iter: Introduce and use iov_iter.type instead of "ops"
verification
  fs/iov_iter: Introduce "iov" member in struct iov_iter for ITER_IOVEC
  fs/iov_iter: Drop ITER_PAGE iov_iter primitive - no users

 fs/aio.c   |   4 +-
 fs/ceph/file.c |   4 +-
 fs/ext4/file.c |  29 +++
 include/linux/fs.h | 190 +
 mm/filemap.c   |  24 +++---
 mm/iov-iter.c  | 141 -
 6 files changed, 137 insertions(+), 255 deletions(-)




[Devel] [PATCH RHEL8 COMMIT] fs/overlayfs: Fix crash on overlayfs mount

2021-01-27 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.4
-->
commit 4267859a056ae596d339de2b508ca56531000ca3
Author: Andrey Ryabinin 
Date:   Wed Jan 27 16:46:47 2021 +0300

fs/overlayfs: Fix crash on overlayfs mount

Kdump kernel fails to load because of crash on mount of overlayfs:

 BUG: unable to handle kernel NULL pointer dereference at 0060

 Call Trace:
  seq_path+0x64/0xb0
  print_paths_option+0x79/0xa0
  ovl_show_options+0x3a/0x320
  show_mountinfo+0x1ee/0x290
  seq_read+0x2f8/0x400
  vfs_read+0x9d/0x150
  ksys_read+0x4f/0xb0
  do_syscall_64+0x5b/0x1a0

This is caused by an out-of-bounds access of ofs->lowerpaths:
we pass ofs->numlayer to print_paths_option() as the size of the
->lowerpaths array, but it is not.

The correct number of lowerpaths elements is ->numlower in
'struct ovl_entry', so move lowerpaths there and use oe->numlower
as the array size.

Fixes: 17fc61697f73 ("overlayfs: add dynamic path resolving in mount options")
Fixes: 2191d729083d ("overlayfs: add mnt_id paths options")

https://jira.sw.ru/browse/PSBM-123508
Signed-off-by: Andrey Ryabinin 
Reviewed-by: Alexander Mikhalitsyn 
---
 fs/overlayfs/ovl_entry.h |  2 +-
 fs/overlayfs/super.c | 37 -
 2 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index ea1906448ec5..2315089a0211 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -54,7 +54,6 @@ struct ovl_fs {
unsigned int numlayer;
/* Number of unique fs among layers including upper fs */
unsigned int numfs;
-   struct path *lowerpaths;
const struct ovl_layer *layers;
struct ovl_sb *fs;
/* workbasepath is the path at workdir= mount option */
@@ -98,6 +97,7 @@ struct ovl_entry {
struct rcu_head rcu;
};
unsigned numlower;
+   struct path *lowerpaths;
struct ovl_path lowerstack[];
 };
 
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 3755f280036f..fb419617564c 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -70,8 +70,12 @@ static void ovl_entry_stack_free(struct ovl_entry *oe)
 {
unsigned int i;
 
-   for (i = 0; i < oe->numlower; i++)
+   for (i = 0; i < oe->numlower; i++) {
dput(oe->lowerstack[i].dentry);
+   if (oe->lowerpaths)
+   path_put(&oe->lowerpaths[i]);
+   }
+   kfree(oe->lowerpaths);
 }
 
 static bool ovl_metacopy_def = IS_ENABLED(CONFIG_OVERLAY_FS_METACOPY);
@@ -241,11 +245,6 @@ static void ovl_free_fs(struct ovl_fs *ofs)
ovl_inuse_unlock(ofs->upper_mnt->mnt_root);
mntput(ofs->upper_mnt);
path_put(&ofs->upperpath);
-   if (ofs->lowerpaths) {
-   for (i = 0; i < ofs->numlayer; i++)
-   path_put(&ofs->lowerpaths[i]);
-   kfree(ofs->lowerpaths);
-   }
for (i = 1; i < ofs->numlayer; i++) {
iput(ofs->layers[i].trap);
mntput(ofs->layers[i].mnt);
@@ -359,9 +358,10 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
 {
struct super_block *sb = dentry->d_sb;
struct ovl_fs *ofs = sb->s_fs_info;
+   struct ovl_entry *oe = OVL_E(dentry);
 
if (ovl_dyn_path_opts) {
-   print_paths_option(m, "lowerdir", ofs->lowerpaths, 
ofs->numlayer);
+   print_paths_option(m, "lowerdir", oe->lowerpaths, oe->numlower);
if (ofs->config.upperdir) {
print_path_option(m, "upperdir", &ofs->upperpath);
print_path_option(m, "workdir", &ofs->workbasepath);
@@ -375,7 +375,7 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
}
 
if (ovl_mnt_id_path_opts) {
-   print_mnt_ids_option(m, "lowerdir_mnt_id", ofs->lowerpaths, 
ofs->numlayer);
+   print_mnt_ids_option(m, "lowerdir_mnt_id", oe->lowerpaths, 
oe->numlower);
/*
 * We don't need to show mnt_id for workdir because it
 * on the same mount as upperdir.
@@ -1626,6 +1626,7 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
int err;
char *lowertmp, *lower;
unsigned int stacklen, numlower = 0, i;
+   struct path *stack = NULL;
struct ovl_entry *oe;
 
err = -ENOMEM;
@@ -1649,14 +1650,14 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb,
}
 
err = -ENOMEM;
-   ofs->lowerpaths = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL);
-   if (!ofs->lowerpaths)
+   stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL);
+   if (!stack)
 

[Devel] [PATCH RHEL8 COMMIT] config.OpenVZ.minimal: Add NETFILTER_XT_NAT, IP_NF_NAT, IP_NF_TARGET_MASQUERADE

2021-01-27 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.4
-->
commit 1dbf0a9824e7b51b67562ff5b96eee74a12d873b
Author: Kirill Tkhai 
Date:   Wed Jan 27 16:51:35 2021 +0300

config.OpenVZ.minimal: Add NETFILTER_XT_NAT, IP_NF_NAT, IP_NF_TARGET_MASQUERADE

VZ services (and thus containers) fail to start without them.

Signed-off-by: Kirill Tkhai 
---
 configs/kernel-4.18.0-x86_64-KVM-minimal.config | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/configs/kernel-4.18.0-x86_64-KVM-minimal.config b/configs/kernel-4.18.0-x86_64-KVM-minimal.config
index 83d8142847c8..46e87cb94f70 100644
--- a/configs/kernel-4.18.0-x86_64-KVM-minimal.config
+++ b/configs/kernel-4.18.0-x86_64-KVM-minimal.config
@@ -1128,7 +1128,7 @@ CONFIG_NETFILTER_XT_SET=y
 # CONFIG_NETFILTER_XT_TARGET_LED is not set
 # CONFIG_NETFILTER_XT_TARGET_LOG is not set
 # CONFIG_NETFILTER_XT_TARGET_MARK is not set
-# CONFIG_NETFILTER_XT_NAT is not set
+CONFIG_NETFILTER_XT_NAT=y
 # CONFIG_NETFILTER_XT_TARGET_NETMAP is not set
 # CONFIG_NETFILTER_XT_TARGET_NFLOG is not set
 # CONFIG_NETFILTER_XT_TARGET_NFQUEUE is not set
@@ -1277,7 +1277,8 @@ CONFIG_IP_NF_IPTABLES=y
 CONFIG_IP_NF_FILTER=y
 CONFIG_IP_NF_TARGET_REJECT=y
 # CONFIG_IP_NF_TARGET_SYNPROXY is not set
-# CONFIG_IP_NF_NAT is not set
+CONFIG_IP_NF_NAT=y
+CONFIG_IP_NF_TARGET_MASQUERADE=y
 CONFIG_IP_NF_MANGLE=y
 # CONFIG_IP_NF_TARGET_CLUSTERIP is not set
 # CONFIG_IP_NF_TARGET_ECN is not set


[Devel] [PATCH RHEL8 COMMIT] dm-ploop: Fix detection of unallocated clusters on write

2021-01-27 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.4
-->
commit aac7255e284c7eace406fc77bf47c96f97d5ad9b
Author: Kirill Tkhai 
Date:   Wed Jan 27 17:00:37 2021 +0300

dm-ploop: Fix detection of unallocated clusters on write

A bio_vec may start from bi_idx != 0 and have bi_bvec_done != 0
in case it was created by bio_split().
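
A note on the shape of the fix: all three fields live in bio->bi_iter, so an
equivalent (sketch) form is to start from a full copy of the iterator rather
than copying the fields one by one:

	/* sketch: equivalent initialization via a full iterator copy */
	struct bvec_iter bi = bio->bi_iter;
	struct bio_vec bv;

	for_each_bvec(bv, bio->bi_io_vec, bi, bi)
		/* inspect bv as before */;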

https://jira.sw.ru/browse/PSBM-124701
Fixes: 8d9b6a497597 "dm-ploop: Skip zero writes to unallocated clusters"
Signed-off-by: Kirill Tkhai 
---
 drivers/md/dm-ploop-map.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index f193b25cbd28..2b9d006b3777 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -401,11 +401,15 @@ static bool bio_endio_if_all_zeros(struct bio *bio)
 {
struct bvec_iter bi = {
.bi_size = bio->bi_iter.bi_size,
+   .bi_bvec_done = bio->bi_iter.bi_bvec_done,
+   .bi_idx = bio->bi_iter.bi_idx,
};
struct bio_vec bv;
void *data, *ret;
 
for_each_bvec(bv, bio->bi_io_vec, bi, bi) {
+   if (!bv.bv_len)
+   continue;
data = kmap(bv.bv_page);
ret = memchr_inv(data + bv.bv_offset, 0, bv.bv_len);
kunmap(bv.bv_page);


[Devel] [PATCH RHEL8 COMMIT] ploop: Add GFP_NOIO brackets around call_read_iter()

2021-01-27 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.4
-->
commit fde304ae7d3b59e6feefee7889652a321198a012
Author: Kirill Tkhai 
Date:   Wed Jan 27 17:00:37 2021 +0300

ploop: Add GFP_NOIO brackets around call_read_iter()

Force all allocations not to result in IO.
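
The scope API is used instead of passing GFP_NOIO per call because the flag
marks the task: allocations made deep inside call_read_iter(), where no gfp
mask can be passed, inherit it as well. The generic pattern:

	unsigned int noio_flags;

	noio_flags = memalloc_noio_save();
	/* every allocation on this task now implicitly behaves as GFP_NOIO */
	ret = call_read_iter(file, &piocb->iocb, &iter);
	memalloc_noio_restore(noio_flags);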

Signed-off-by: Kirill Tkhai 
---
 drivers/md/dm-ploop-map.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index 2b9d006b3777..39194cb2d7fe 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -1,6 +1,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -917,10 +918,10 @@ static void ploop_read_aio_complete(struct kiocb *iocb, long ret, long ret2)
 static void submit_delta_read(struct ploop *ploop, unsigned int level,
unsigned int dst_cluster, struct bio *bio)
 {
+   unsigned int flags, offset;
struct ploop_iocb *piocb;
struct bio_vec *bvec;
struct iov_iter iter;
-   unsigned int offset;
struct file *file;
loff_t pos;
int ret;
@@ -951,7 +952,9 @@ static void submit_delta_read(struct ploop *ploop, unsigned int level,
piocb->iocb.ki_flags = IOCB_DIRECT;
piocb->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
 
+   flags = memalloc_noio_save();
ret = call_read_iter(file, &piocb->iocb, &iter);
+   memalloc_noio_restore(flags);
 
ploop_read_aio_do_completion(piocb);
 


Re: [Devel] [PATCH] scripts/checkpatch.pl: fixed regression after COPYING was renamed

2021-01-27 Thread Konstantin Khorenko




--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

On 01/20/2021 10:15 AM, Vasily Averin wrote:

According to
https://access.redhat.com/labs/rhcb/RHEL-8.3/kernel-4.18.0-240.10.1.el8/sources/
the original RHEL8 kernel uses the COPYING name.
The problem happened during our rebase.
Kostja,
could you please take a look at your build scripts?


"rpmbuild -bp" renames "COPYING" file.

RHEL8 kernel.spec:
...
chmod +x scripts/checkpatch.pl
mv COPYING COPYING-%{version}
...

I suspect the following patch is related, but have not found the content of
the patch yet:
- [rpmspec] fix conflicts with COPYING file while installing newer 4.17 kernel 
("Herton R. Krzesinski") [1579563]



Thank you,
Vasily Averin

On 1/20/21 1:51 AM, Valeriy Vdovin wrote:

fixes: 90e3b8f80b22002418d3056438a82837769c4691
checkpatch.pl first checks that the script is run from the top of the kernel
tree. It does so by calling the top_of_kernel_tree function, which has a
hardcoded set of files whose presence is enough to identify the top of the
tree. The troublesome patch renames COPYING by adding a prefix, so the check
fails.

Signed-off-by: Valeriy Vdovin 
---
 scripts/checkpatch.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 4d80526..cb9cddd 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -1069,7 +1069,7 @@ sub top_of_kernel_tree {
my ($root) = @_;

my @tree_check = (
-   "COPYING", "CREDITS", "Kbuild", "MAINTAINERS", "Makefile",
+   "COPYING-4.18.0", "CREDITS", "Kbuild", "MAINTAINERS", 
"Makefile",
"README", "Documentation", "arch", "include", "drivers",
"fs", "init", "ipc", "kernel", "lib", "scripts",
);


.




[Devel] [PATCH RHEL8 COMMIT] COPYING: Move COPYING-4.18.0 -> COPYING file

2021-01-27 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.5
-->
commit 9e33af2e36f9e0f8e021adc672e07eaf37a79e34
Author: Konstantin Khorenko 
Date:   Wed Jan 27 19:00:21 2021 +0300

COPYING: Move COPYING-4.18.0 -> COPYING file

The RHEL 8 kernel spec renames COPYING -> COPYING-4.18.0.
I do not know the reason for that.

Maybe the following RHEL8 patch is related (we do not have its content):
[rpmspec] fix conflicts with COPYING file while installing newer 4.17
  kernel ("Herton R. Krzesinski") [1579563]

The "COPYING" file must exist in the tree: checkpatch.pl needs it and
refuses to work without a "COPYING" file => rename the file back.

Note: I've updated my rebase scripts, so this patch won't be
necessary after the next rebase on a new RHEL8 kernel.

Signed-off-by: Konstantin Khorenko 
---
 COPYING-4.18.0 => COPYING | 0
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/COPYING-4.18.0 b/COPYING
similarity index 100%
rename from COPYING-4.18.0
rename to COPYING


Re: [Devel] [VZ8 PATCH 3/3] cgroup: port CGROUP_REMOVED flag from vz7

2021-01-28 Thread Konstantin Khorenko

ms commits:

commit 54766d4a1d3d6f84ff8fa475cd8f165c0aeb
Author: Tejun Heo 
Date:   Wed Jun 12 21:04:53 2013 -0700
cgroup: rename CGRP_REMOVED to CGRP_DEAD

+

commit 184faf32328c65c9d86b19577b8d8b90bdd2cd2e
Author: Tejun Heo 
Date:   Fri May 16 13:22:51 2014 -0400
cgroup: use CSS_ONLINE instead of CGRP_DEAD

=> the current patch is not needed; let's rework the places where
CGRP_REMOVED is used during the rebase.
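
Roughly, the mainstream-style check after the cited commits looks like this
(a sketch from memory, not a verbatim quote):

	static bool cgroup_is_dead(const struct cgroup *cgrp)
	{
		return !(cgrp->self.flags & CSS_ONLINE);
	}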

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

On 01/18/2021 09:08 PM, Valeriy Vdovin wrote:

The flag will be used in subsequent porting patches for the release_agent
functionality.
Signed-off-by: Valeriy Vdovin 
---
 include/linux/cgroup-defs.h | 2 ++
 include/linux/cgroup.h  | 5 +
 kernel/cgroup/cgroup.c  | 1 +
 3 files changed, 8 insertions(+)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index a3b309ab1a90..5ee5f10e3de7 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -57,6 +57,8 @@ enum {

 /* bits in struct cgroup flags field */
 enum {
+   /* Control Cgroup is dead */
+   CGRP_REMOVED,
/* Control Group requires release notifications to userspace */
CGRP_NOTIFY_ON_RELEASE,
/*
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c0a42c3d43fa..dfd9460986ee 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -922,6 +922,11 @@ static inline bool cgroup_task_frozen(struct task_struct *task)
return task->frozen;
 }

+static inline int cgroup_is_removed(const struct cgroup *cgrp)
+{
+   return test_bit(CGRP_REMOVED, &cgrp->flags);
+}
+
 #else /* !CONFIG_CGROUPS */

 static inline void cgroup_enter_frozen(void) { }
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 797a3971ab46..447c8f003496 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5562,6 +5562,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
tcgrp->freezer.nr_frozen_descendants--;
}
spin_unlock_irq(&css_set_lock);
+   set_bit(CGRP_REMOVED, &cgrp->flags);

cgroup1_check_for_release(parent);





Re: [Devel] [VZ8 PATCH 1/3] ve/cgroup: unmark ve-root cgroups at container stop

2021-01-28 Thread Konstantin Khorenko

On 01/18/2021 09:08 PM, Valeriy Vdovin wrote:

fixes: 915a1130c7ee4ffb6de3f69a5bd98c5ee42a723f

Signed-off-by: Valeriy Vdovin 
Reviewed-by: Kirill Tkhai 

Cherry-picked from 5dceccf5dd794673ebb1b0e6840d96aa654ec33e)
Signed-off-by: Valeriy Vdovin 


Ack, i will drop one redundant copy-paste line during commit, see below.


---
 include/linux/cgroup.h |  1 +
 kernel/cgroup/cgroup.c | 24 
 kernel/ve/ve.c |  3 +++
 3 files changed, 28 insertions(+)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 4f0dd51338bf..c0a42c3d43fa 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -867,6 +867,7 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,

 #ifdef CONFIG_VE
 extern void cgroup_mark_ve_root(struct ve_struct *ve);
+void cgroup_unmark_ve_roots(struct ve_struct *ve);
 #endif

 #else /* !CONFIG_CGROUPS */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 0335a07f64e6..4488df184235 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1908,6 +1908,30 @@ void cgroup_mark_ve_root(struct ve_struct *ve)
spin_unlock_irq(&css_set_lock);
 }

+void cgroup_unmark_ve_roots(struct ve_struct *ve)
+{
+   struct cgrp_cset_link *link;
+   struct css_set *cset;
+   struct cgroup *cgrp;
+
+   spin_lock_irq(&css_set_lock);
+
+   rcu_read_lock();
+   cset = rcu_dereference(ve->ve_ns)->cgroup_ns->root_cset;
+   if (WARN_ON(!cset))
+   goto unlock;
+
+   list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+   cgrp = link->cgrp;
+   clear_bit(CGRP_VE_ROOT, &cgrp->flags);
+   }
+   link_ve_root_cpu_cgroup(cset->subsys[cpu_cgrp_id]);


^^^ this line to be dropped.


+unlock:
+   rcu_read_unlock();
+
+   spin_unlock_irq(&css_set_lock);
+}
+
 struct cgroup *cgroup_get_ve_root1(struct cgroup *cgrp)
 {
struct cgroup *ve_root = NULL;
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 3f53641455ad..b922653acf49 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -519,6 +519,9 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
 */
if (!ve_ns || ve_ns->pid_ns_for_children != pid_ns)
goto unlock;
+
+   cgroup_unmark_ve_roots(ve);
+
/*
 * At this point all userspace tasks in container are dead.
 */




[Devel] [PATCH RHEL8 COMMIT] cgroup/cfs: added 'activate' option to cgroup_add_file

2021-01-28 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.5
-->
commit 4a0b1219c4ca72b0a3b79654b45c7004ba7d7571
Author: Valeriy Vdovin 
Date:   Thu Jan 28 12:04:25 2021 +0300

cgroup/cfs: added 'activate' option to cgroup_add_file

In kernfs, files get created in a 'deactivated' state, which means
they are not visible.
Add an option to activate the file immediately after creation, making it
visible in the parent directory.
This will be used in later patches.
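
A one-line usage sketch of the new flag, per the commit message (the concrete
caller is hypothetical until the later patches land):

	/* create the file and make it visible in the parent dir right away */
	ret = cgroup_add_file(css, cgrp, cft, true);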

Signed-off-by: Valeriy Vdovin 
---
 kernel/cgroup/cgroup.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index b71d4ccb2f0c..946031fcb393 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3823,7 +3823,7 @@ static void cgroup_file_notify_timer(struct timer_list 
*timer)
 }
 
 static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
-  struct cftype *cft)
+  struct cftype *cft, bool activate)
 {
char name[CGROUP_FILE_NAME_MAX];
struct kernfs_node *kn;
@@ -3865,6 +3865,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
if (IS_ERR(kn_link))
return PTR_ERR(kn_link);
}
+   if (activate)
+   kernfs_activate(kn);
 
return 0;
 }
@@ -3902,7 +3904,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
continue;
if (is_add) {
-   ret = cgroup_add_file(css, cgrp, cft);
+   ret = cgroup_add_file(css, cgrp, cft, false);
if (ret) {
pr_warn("%s: failed to add %s, err=%d\n",
__func__, cft->name, ret);


[Devel] [PATCH RHEL8 COMMIT] ve/cgroup: Unmark ve-root cgroups at Container stop

2021-01-28 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.5
-->
commit eb1382872c59700876c558bddf3c056028b384d5
Author: Valeriy Vdovin 
Date:   Thu Jan 28 12:04:25 2021 +0300

ve/cgroup: Unmark ve-root cgroups at Container stop

To_merge: 3a90ea5cf85a ("cgroup: Mark cgroup CGRP_VE_ROOT")

Signed-off-by: Valeriy Vdovin 
Reviewed-by: Kirill Tkhai 

Cherry-picked from vz7 commit
5dceccf5dd79 ("ve/cgroup: unmark ve-root cgroups at container stop")

Signed-off-by: Valeriy Vdovin 
---
 include/linux/cgroup.h |  1 +
 kernel/cgroup/cgroup.c | 23 +++
 kernel/ve/ve.c |  3 +++
 3 files changed, 27 insertions(+)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 4f0dd51338bf..c0a42c3d43fa 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -867,6 +867,7 @@ int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
 
 #ifdef CONFIG_VE
 extern void cgroup_mark_ve_root(struct ve_struct *ve);
+void cgroup_unmark_ve_roots(struct ve_struct *ve);
 #endif
 
 #else /* !CONFIG_CGROUPS */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 0335a07f64e6..b71d4ccb2f0c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1908,6 +1908,29 @@ void cgroup_mark_ve_root(struct ve_struct *ve)
spin_unlock_irq(&css_set_lock);
 }
 
+void cgroup_unmark_ve_roots(struct ve_struct *ve)
+{
+   struct cgrp_cset_link *link;
+   struct css_set *cset;
+   struct cgroup *cgrp;
+
+   spin_lock_irq(&css_set_lock);
+
+   rcu_read_lock();
+   cset = rcu_dereference(ve->ve_ns)->cgroup_ns->root_cset;
+   if (WARN_ON(!cset))
+   goto unlock;
+
+   list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+   cgrp = link->cgrp;
+   clear_bit(CGRP_VE_ROOT, &cgrp->flags);
+   }
+unlock:
+   rcu_read_unlock();
+
+   spin_unlock_irq(&css_set_lock);
+}
+
 struct cgroup *cgroup_get_ve_root1(struct cgroup *cgrp)
 {
struct cgroup *ve_root = NULL;
diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index b83b2b66a875..f7d605357d2e 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -520,6 +520,9 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
 */
if (!ve_ns || ve_ns->pid_ns_for_children != pid_ns)
goto unlock;
+
+   cgroup_unmark_ve_roots(ve);
+
/*
 * At this point all userspace tasks in container are dead.
 */
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 0/4] ext4: more error handling (ports from mainstream)

2021-02-03 Thread Konstantin Khorenko
Just potentially useful mainstream patches which can prevent fs
corruption thanks to proper error handling.

Harshad Shirwadkar (1):
  ms/ext4: handle ext4_mark_inode_dirty errors

Theodore Ts'o (3):
  ms/ext4: allow ext4_truncate() to return an error
  ms/ext4: allow ext4_ext_truncate() to return an error
  ms/ext4: propagate error values from ext4_inline_data_truncate()

 fs/ext4/acl.c   |  4 +-
 fs/ext4/ext4.h  |  8 ++--
 fs/ext4/ext4_jbd2.h |  5 ++-
 fs/ext4/extents.c   | 49 ++---
 fs/ext4/indirect.c  |  4 +-
 fs/ext4/inline.c| 46 +--
 fs/ext4/inode.c | 89 ++---
 fs/ext4/ioctl.c |  7 +++-
 fs/ext4/migrate.c   | 12 --
 fs/ext4/namei.c | 78 +--
 fs/ext4/super.c | 22 +++
 11 files changed, 204 insertions(+), 120 deletions(-)

-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 3/4] ms/ext4: propagate error values from ext4_inline_data_truncate()

2021-02-03 Thread Konstantin Khorenko
From: Theodore Ts'o 

Signed-off-by: Theodore Ts'o 

(cherry picked from commit 01daf9452569fe2e69e27fe3e617b43d2ebb1e93)
Signed-off-by: Konstantin Khorenko 
---
 fs/ext4/ext4.h   |  2 +-
 fs/ext4/inline.c | 40 +++-
 fs/ext4/inode.c  |  4 +++-
 3 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ad641685f993..7b16ea4a2930 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2766,7 +2766,7 @@ extern struct buffer_head 
*ext4_get_first_inline_block(struct inode *inode,
 extern int ext4_inline_data_fiemap(struct inode *inode,
   struct fiemap_extent_info *fieinfo,
   int *has_inline, __u64 start, __u64 len);
-extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline);
+extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline);
 
 extern int ext4_convert_inline_data(struct inode *inode);
 
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 94e8743b852e..cca1237fecae 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1889,10 +1889,10 @@ int ext4_inline_data_fiemap(struct inode *inode,
return (error < 0 ? error : 0);
 }
 
-void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
+int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
 {
handle_t *handle;
-   int inline_size, value_len, needed_blocks;
+   int inline_size, value_len, needed_blocks, err = 0;
size_t i_size;
void *value = NULL;
struct ext4_xattr_ibody_find is = {
@@ -1907,19 +1907,19 @@ void ext4_inline_data_truncate(struct inode *inode, int 
*has_inline)
needed_blocks = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks);
if (IS_ERR(handle))
-   return;
+   return PTR_ERR(handle);
 
down_write(&EXT4_I(inode)->xattr_sem);
if (!ext4_has_inline_data(inode)) {
*has_inline = 0;
ext4_journal_stop(handle);
-   return;
+   return 0;
}
 
-   if (ext4_orphan_add(handle, inode))
+   if ((err = ext4_orphan_add(handle, inode)) != 0)
goto out;
 
-   if (ext4_get_inode_loc(inode, &is.iloc))
+   if ((err = ext4_get_inode_loc(inode, &is.iloc)) != 0)
goto out;
 
down_write(&EXT4_I(inode)->i_data_sem);
@@ -1930,24 +1930,29 @@ void ext4_inline_data_truncate(struct inode *inode, int 
*has_inline)
if (i_size < inline_size) {
/* Clear the content in the xattr space. */
if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) {
-   if (ext4_xattr_ibody_find(inode, &i, &is))
+   if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0)
goto out_error;
 
BUG_ON(is.s.not_found);
 
value_len = le32_to_cpu(is.s.here->e_value_size);
value = kmalloc(value_len, GFP_NOFS);
-   if (!value)
+   if (!value) {
+   err = -ENOMEM;
goto out_error;
+   }
 
-   if (ext4_xattr_ibody_get(inode, i.name_index, i.name,
-   value, value_len))
+   err = ext4_xattr_ibody_get(inode, i.name_index,
+  i.name, value, value_len);
+   if (err <= 0)
goto out_error;
 
i.value = value;
i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ?
i_size - EXT4_MIN_INLINE_DATA_SIZE : 0;
-   if (ext4_xattr_ibody_inline_set(handle, inode, &i, &is))
+   err = ext4_xattr_ibody_inline_set(handle, inode,
+ &i, &is);
+   if (err)
goto out_error;
}
 
@@ -1972,13 +1977,14 @@ void ext4_inline_data_truncate(struct inode *inode, int 
*has_inline)
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
 
-   inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
-   ext4_mark_inode_dirty(handle, inode);
-   if (IS_SYNC(inode))
-   ext4_handle_sync(handle);
-
+   if (err == 0) {
+   inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+   err = ext4_mark_inode_dirty(handle, inode);
+   if (IS_SYNC(inode))
+   ext4_handle_sync(handle);
+   }
ext4_journal_stop(handle);
-   return;
+   return err;
 }
 

[Devel] [PATCH rh7 1/4] ms/ext4: allow ext4_truncate() to return an error

2021-02-03 Thread Konstantin Khorenko
From: Theodore Ts'o 

This allows us to properly propagate errors back up to
ext4_truncate()'s callers.  This also means we no longer have to
silently ignore some errors (e.g., when trying to add the inode to the
orphan inode list).
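
A minimal sketch of the calling convention after this change
(hypothetical call site, not a hunk from this patch):

    int err = ext4_truncate(inode);

    if (err)        /* e.g. a failed journal start or orphan-list update */
            return err;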

Signed-off-by: Theodore Ts'o 
Reviewed-by: Jan Kara 

(cherry picked from commit 2c98eb5ea249767bbc11cf4e70e91d5b0458ed13)
Signed-off-by: Konstantin Khorenko 
---
 fs/ext4/ext4.h  |  2 +-
 fs/ext4/inode.c | 41 ++---
 fs/ext4/ioctl.c |  7 +--
 fs/ext4/super.c |  6 --
 4 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9202c5e1c078..10df0e7bba7f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2237,7 +2237,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, 
int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_inode_attach_jinode(struct inode *inode);
 extern int ext4_can_truncate(struct inode *inode);
-extern void ext4_truncate(struct inode *);
+extern int ext4_truncate(struct inode *);
 extern int ext4_break_layouts(struct inode *);
 extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int 
nblocks);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index feab57428166..bdd55a9b0e41 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -262,8 +262,15 @@ void ext4_evict_inode(struct inode *inode)
 "couldn't mark inode dirty (err %d)", err);
goto stop_handle;
}
-   if (inode->i_blocks)
-   ext4_truncate(inode);
+   if (inode->i_blocks) {
+   err = ext4_truncate(inode);
+   if (err) {
+   ext4_error(inode->i_sb,
+  "couldn't truncate inode %lu (err %d)",
+  inode->i_ino, err);
+   goto stop_handle;
+   }
+   }
 
/*
 * ext4_ext_truncate() doesn't reserve any slop when it
@@ -4051,10 +4058,11 @@ int ext4_inode_attach_jinode(struct inode *inode)
  * that's fine - as long as they are linked from the inode, the post-crash
  * ext4_truncate() run will find them and release them.
  */
-void ext4_truncate(struct inode *inode)
+int ext4_truncate(struct inode *inode)
 {
struct ext4_inode_info *ei = EXT4_I(inode);
unsigned int credits;
+   int err = 0;
handle_t *handle;
struct address_space *mapping = inode->i_mapping;
 
@@ -4068,7 +4076,7 @@ void ext4_truncate(struct inode *inode)
trace_ext4_truncate_enter(inode);
 
if (!ext4_can_truncate(inode))
-   return;
+   return 0;
 
ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
 
@@ -4083,13 +4091,13 @@ void ext4_truncate(struct inode *inode)
 
ext4_inline_data_truncate(inode, &has_inline);
if (has_inline)
-   return;
+   return 0;
}
 
/* If we zero-out tail of the page, we have to create jinode for jbd2 */
if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
if (ext4_inode_attach_jinode(inode) < 0)
-   return;
+   return 0;
}
 
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -4098,10 +4106,8 @@ void ext4_truncate(struct inode *inode)
credits = ext4_blocks_for_truncate(inode);
 
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
-   if (IS_ERR(handle)) {
-   ext4_std_error(inode->i_sb, PTR_ERR(handle));
-   return;
-   }
+   if (IS_ERR(handle))
+   return PTR_ERR(handle);
 
if (inode->i_size & (inode->i_sb->s_blocksize - 1))
ext4_block_truncate_page(handle, mapping, inode->i_size);
@@ -4115,7 +4121,8 @@ void ext4_truncate(struct inode *inode)
 * Implication: the file must always be in a sane, consistent
 * truncatable state while each transaction commits.
 */
-   if (ext4_orphan_add(handle, inode))
+   err = ext4_orphan_add(handle, inode);
+   if (err)
goto out_stop;
 
down_write(&EXT4_I(inode)->i_data_sem);
@@ -4148,6 +4155,7 @@ void ext4_truncate(struct inode *inode)
ext4_journal_stop(handle);
 
trace_ext4_truncate_exit(inode);
+   return err;
 }
 
 /*
@@ -5167,12 +5175,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr 
*attr)
 * in data=journal mode to make pages freeable.
 */
truncate_pagecache(inode, inode->i_size);
-   if (shrink)
-   ext4_truncate(inode);
+   if (shrink) {
+ 

[Devel] [PATCH rh7 2/4] ms/ext4: allow ext4_ext_truncate() to return an error

2021-02-03 Thread Konstantin Khorenko
From: Theodore Ts'o 

Return errors to the caller instead of declaring the file system
corrupted.

Signed-off-by: Theodore Ts'o 
Reviewed-by: Jan Kara 

(cherry picked from commit d0abb36db44faaf8f8aa148ca206fe2404042dec)
Signed-off-by: Konstantin Khorenko 
---
 fs/ext4/ext4.h|  2 +-
 fs/ext4/extents.c | 15 +++
 fs/ext4/inode.c   |  4 +++-
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 10df0e7bba7f..ad641685f993 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2848,7 +2848,7 @@ extern int ext4_ext_writepage_trans_blocks(struct inode 
*, int);
 extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
   struct ext4_map_blocks *map, int flags);
-extern void ext4_ext_truncate(handle_t *, struct inode *);
+extern int ext4_ext_truncate(handle_t *, struct inode *);
 extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 ext4_lblk_t end);
 extern void ext4_ext_init(struct super_block *);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index cb0d9b6b7e8e..b059276121ef 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4660,7 +4660,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode 
*inode,
return err ? err : allocated;
 }
 
-void ext4_ext_truncate(handle_t *handle, struct inode *inode)
+int ext4_ext_truncate(handle_t *handle, struct inode *inode)
 {
struct super_block *sb = inode->i_sb;
ext4_lblk_t last_block;
@@ -4674,7 +4674,9 @@ void ext4_ext_truncate(handle_t *handle, struct inode 
*inode)
 
/* we have to know where to truncate from in crash case */
EXT4_I(inode)->i_disksize = inode->i_size;
-   ext4_mark_inode_dirty(handle, inode);
+   err = ext4_mark_inode_dirty(handle, inode);
+   if (err)
+   return err;
 
last_block = (inode->i_size + sb->s_blocksize - 1)
>> EXT4_BLOCK_SIZE_BITS(sb);
@@ -4686,12 +4688,9 @@ void ext4_ext_truncate(handle_t *handle, struct inode 
*inode)
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry;
}
-   if (err) {
-   ext4_std_error(inode->i_sb, err);
-   return;
-   }
-   err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
-   ext4_std_error(inode->i_sb, err);
+   if (err)
+   return err;
+   return ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
 }
 
 static int ext4_convert_unwritten(struct inode *inode, loff_t offset,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bdd55a9b0e41..6489d55036e8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4130,11 +4130,13 @@ int ext4_truncate(struct inode *inode)
ext4_discard_preallocations(inode);
 
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-   ext4_ext_truncate(handle, inode);
+   err = ext4_ext_truncate(handle, inode);
else
ext4_ind_truncate(handle, inode);
 
up_write(&ei->i_data_sem);
+   if (err)
+   goto out_stop;
 
if (IS_SYNC(inode))
ext4_handle_sync(handle);
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 4/4] ms/ext4: handle ext4_mark_inode_dirty errors

2021-02-03 Thread Konstantin Khorenko
From: Harshad Shirwadkar 

ext4_mark_inode_dirty() can fail for real reasons. Ignoring its return
value may lead ext4 to ignore real failures that would result in
corruption / crashes. Harden ext4_mark_inode_dirty error paths to fail
as soon as possible and return errors to the caller whenever
appropriate.

One of the possible scenarios where this bug could have an effect is
that while creating a new inode, its directory entry gets added
successfully, but while writing the inode itself mark_inode_dirty
returns an error which is ignored. This would result in the
inconsistency that the directory entry points to a non-existent inode.

Ran gce-xfstests smoke tests and verified that there were no
regressions.
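
A sketch of the hardened call pattern this patch applies throughout
ext4 (hypothetical call site):

    err = ext4_mark_inode_dirty(handle, inode);
    if (err)
            goto out_stop;  /* unwind: stop the handle, propagate err */

With the wrapper macro introduced in the ext4_jbd2.h hunk below,
__ext4_mark_inode_dirty() also receives the __func__ and __LINE__ of
the failing call site, so error reports point at the caller rather
than at the helper.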

Signed-off-by: Harshad Shirwadkar 
Link: 
https://lore.kernel.org/r/20200427013438.219117-1-harshadshirwad...@gmail.com
Signed-off-by: Theodore Ts'o 

(cherry picked from commit 4209ae12b12265d475bba28634184423149bd14f)
Signed-off-by: Konstantin Khorenko 

Backport notes:
* skipped hunk for ext4_handle_inode_extension(), no function in vz7
* skipped hunk for ext4_xattr_inode_write(), no function in vz7
* added an error check of ext4_mark_inode_dirty() ret code in
  ext4_iomap_end()
* added an error check of ext4_mark_inode_dirty() ret code in
  ext4_xattr_set_acl()
* in ext4_mark_inode_dirty() ext4_error_inode_err() has been substituted
  with ext4_std_error() + ext4_error_inode()
---
 fs/ext4/acl.c   |  4 +--
 fs/ext4/ext4.h  |  2 +-
 fs/ext4/ext4_jbd2.h |  5 ++-
 fs/ext4/extents.c   | 34 
 fs/ext4/indirect.c  |  4 ++-
 fs/ext4/inline.c|  6 ++--
 fs/ext4/inode.c | 42 
 fs/ext4/migrate.c   | 12 ---
 fs/ext4/namei.c | 78 +
 fs/ext4/super.c | 16 ++
 10 files changed, 131 insertions(+), 72 deletions(-)

diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index aa689e75fafd..daf6519b94c0 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -336,7 +336,7 @@ ext4_acl_chmod(struct inode *inode)
if (!error && update_mode) {
inode->i_mode = mode;
inode->i_ctime = ext4_current_time(inode);
-   ext4_mark_inode_dirty(handle, inode);
+   error = ext4_mark_inode_dirty(handle, inode);
}
 out_stop:
ext4_journal_stop(handle);
@@ -448,7 +448,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, 
const void *value,
if (!error && update_mode) {
inode->i_mode = mode;
inode->i_ctime = ext4_current_time(inode);
-   ext4_mark_inode_dirty(handle, inode);
+   error = ext4_mark_inode_dirty(handle, inode);
}
 out_stop:
ext4_journal_stop(handle);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 7b16ea4a2930..f3c7b4c7cf83 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2843,7 +2843,7 @@ enum ext4_event_type {
  */
 #define EXT_MAX_BLOCKS 0x
 
-extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
+extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
 extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 1183d091a918..04c50955 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -226,7 +226,10 @@ ext4_mark_iloc_dirty(handle_t *handle,
 int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
struct ext4_iloc *iloc);
 
-int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
+#define ext4_mark_inode_dirty(__h, __i)                                \
+   __ext4_mark_inode_dirty((__h), (__i), __func__, __LINE__)
+int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode,
+   const char *func, unsigned int line);
 
 /*
  * Wrapper functions with which ext4 calls into JBD.
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index b059276121ef..47ce9c99874f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -848,7 +848,7 @@ ext4_ext_binsearch(struct inode *inode,
 
 }
 
-int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
+void ext4_ext_tree_init(handle_t *handle, struct inode *inode)
 {
struct ext4_extent_header *eh;
 
@@ -858,7 +858,6 @@ int ext4_ext_tree_init(handle_t *handle, struct inode 
*inode)
eh->eh_magic = EXT4_EXT_MAGIC;
eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
ext4_mark_inode_dirty(handle, inode);
-   return 0;
 }
 
 struct ext4_ext_path *
@@ -1338,7 +1337,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct 
inode *inode,
  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
 
le16_add_cpu(&neh->eh_depth, 1);
-   ext4_mark_inode_dirty(handle, inode);
+  

[Devel] [PATCH rh7 1/2] ms/ext4: Avoid freeing inodes on dirty list

2021-02-03 Thread Konstantin Khorenko
From: Jan Kara 

When we are evicting an inode with journalled data, we may race with
a transaction commit in the following way:

CPU0                                    CPU1
jbd2_journal_commit_transaction()       evict(inode)
                                          inode_io_list_del()
                                          inode_wait_for_writeback()
  process BJ_Forget list
    __jbd2_journal_insert_checkpoint()
      __jbd2_journal_refile_buffer()
        __jbd2_journal_unfile_buffer()
          if (test_clear_buffer_jbddirty(bh))
            mark_buffer_dirty(bh)
              __mark_inode_dirty(inode)
                                          ext4_evict_inode(inode)
                                            frees the inode

This results in use-after-free issues in the writeback code (or
triggers the assertion added in the previous commit).

Fix the problem by removing the inode from the writeback lists once
all the page cache is evicted, so that the inode cannot be added to
the writeback lists again.

Signed-off-by: Jan Kara 
Link: https://lore.kernel.org/r/20200421085445.5731-4-j...@suse.cz
Signed-off-by: Theodore Ts'o 

(cherry picked from commit ceff86fddae8748fe00d4f2d249cb02cae62ad84)
Signed-off-by: Konstantin Khorenko 
---
 fs/ext4/inode.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f89fdb951aee..1c8e0cb64116 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -232,6 +232,16 @@ void ext4_evict_inode(struct inode *inode)
ext4_begin_ordered_truncate(inode, 0);
truncate_inode_pages_final(&inode->i_data);
 
+   /*
+* For inodes with journalled data, transaction commit could have
+* dirtied the inode. Flush worker is ignoring it because of I_FREEING
+* flag but we still need to remove the inode from the writeback lists.
+*/
+   if (!list_empty_careful(&inode->i_io_list)) {
+   WARN_ON_ONCE(!ext4_should_journal_data(inode));
+   inode_io_list_del(inode);
+   }
+
/*
 * Protect us against freezing - iput() caller didn't have to have any
 * protection against it
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 2/2] ms/ext4: don't ignore return values from ext4_ext_dirty()

2021-02-03 Thread Konstantin Khorenko
From: Harshad Shirwadkar 

Don't ignore return values from ext4_ext_dirty, since the errors
indicate valid failures below Ext4.  In all of the other instances of
ext4_ext_dirty calls, the error return value is handled in some
way. This patch makes the remaining couple of places handle
ext4_ext_dirty errors as well. In the case of ext4_split_extent_at(),
ignoring the return value is intentional. The reason is that we are
already in the error path and there isn't much we can do if
ext4_ext_dirty returns an error. This patch adds a comment for that
case explaining why we ignore the return value.

In the longer run, we probably should
make sure that errors from other mark_dirty routines are handled as
well.

Ran gce-xfstests smoke tests and verified that there were no
regressions.
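
The underlying idiom, reduced to a hypothetical fragment (the helper
names are illustrative, not from the patch):

    int ret = do_primary_work();            /* illustrative helper */

    if (ret) {
            /*
             * Best-effort rollback on the error path: a secondary
             * failure here must not overwrite the primary error code
             * in 'ret', so the return value is deliberately ignored.
             */
            (void)mark_dirty_best_effort(); /* illustrative helper */
            return ret;
    }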

Signed-off-by: Harshad Shirwadkar 
Reviewed-by: Jan Kara 
Link: 
https://lore.kernel.org/r/20200427013438.219117-2-harshadshirwad...@gmail.com
Signed-off-by: Theodore Ts'o 

(cherry picked from commit b60ca3343e78761c6ebe6ff52c628893c8297b73)
Signed-off-by: Konstantin Khorenko 
---
 fs/ext4/extents.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 47ce9c99874f..cb31585002ca 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3302,6 +3302,10 @@ static int ext4_split_extent_at(handle_t *handle,
 
 fix_extent_len:
ex->ee_len = orig_ex.ee_len;
+   /*
+* Ignore ext4_ext_dirty return value since we are already in error path
+* and err is a non-zero error code.
+*/
ext4_ext_dirty(handle, inode, path + path->p_depth);
return err;
 }
@@ -3560,7 +3564,7 @@ static int ext4_ext_convert_to_initialized(handle_t 
*handle,
}
if (allocated) {
/* Mark the block containing both extents as dirty */
-   ext4_ext_dirty(handle, inode, path + depth);
+   err = ext4_ext_dirty(handle, inode, path + depth);
 
/* Update path to point to the right extent */
path[depth].p_ext = abut_ex;
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH rh7 1/2] ms/ext4: Avoid freeing inodes on dirty list

2021-02-03 Thread Konstantin Khorenko

Oops, disregard this one,
wrong patch sent.

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 1/2] ms/writeback: Export inode_io_list_del()

2021-02-03 Thread Konstantin Khorenko
From: Jan Kara 

Ext4 needs to remove an inode from the writeback lists after it is out
of visibility of its journalling machinery (which can still dirty the
inode). Export inode_io_list_del() for it.

Signed-off-by: Jan Kara 
Link: https://lore.kernel.org/r/20200421085445.5731-3-j...@suse.cz
Signed-off-by: Theodore Ts'o 

(cherry picked from commit 4301efa4c7cca11556dd89899eee066d49b47bf7)
Signed-off-by: Konstantin Khorenko 
---
 fs/fs-writeback.c | 1 +
 fs/internal.h | 2 --
 include/linux/writeback.h | 1 +
 3 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index dd6b06ded3f4..c16a39f4f724 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -197,6 +197,7 @@ void inode_io_list_del(struct inode *inode)
list_del_init(&inode->i_io_list);
spin_unlock(&bdi->wb.list_lock);
 }
+EXPORT_SYMBOL(inode_io_list_del);
 
 /*
  * mark an inode as under writeback on the sb
diff --git a/fs/internal.h b/fs/internal.h
index 1c50f43eb786..e183eebb54af 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -133,8 +133,6 @@ extern bool atime_needs_update_rcu(const struct path *, 
struct inode *);
 /*
  * fs-writeback.c
  */
-extern void inode_io_list_del(struct inode *inode);
-
 extern long get_nr_dirty_inodes(void);
 extern void evict_inodes(struct super_block *);
 extern int invalidate_inodes(struct super_block *, bool);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 12075c8e0415..fd12ba981135 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -116,6 +116,7 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason 
reason);
 void wakeup_flusher_threads_ub(long nr_pages, struct user_beancounter *ub,
enum wb_reason reason);
 void inode_wait_for_writeback(struct inode *inode);
+void inode_io_list_del(struct inode *inode);
 
 /* writeback.h requires fs.h; it, too, is not included from here. */
 static inline void wait_on_inode(struct inode *inode)
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 2/2] ms/ext4: Avoid freeing inodes on dirty list

2021-02-03 Thread Konstantin Khorenko
From: Jan Kara 

When we are evicting an inode with journalled data, we may race with
a transaction commit in the following way:

CPU0                                    CPU1
jbd2_journal_commit_transaction()       evict(inode)
                                          inode_io_list_del()
                                          inode_wait_for_writeback()
  process BJ_Forget list
    __jbd2_journal_insert_checkpoint()
      __jbd2_journal_refile_buffer()
        __jbd2_journal_unfile_buffer()
          if (test_clear_buffer_jbddirty(bh))
            mark_buffer_dirty(bh)
              __mark_inode_dirty(inode)
                                          ext4_evict_inode(inode)
                                            frees the inode

This results in use-after-free issues in the writeback code (or
triggers the assertion added in the previous commit).

Fix the problem by removing the inode from the writeback lists once
all the page cache is evicted, so that the inode cannot be added to
the writeback lists again.

Signed-off-by: Jan Kara 
Link: https://lore.kernel.org/r/20200421085445.5731-4-j...@suse.cz
Signed-off-by: Theodore Ts'o 

(cherry picked from commit ceff86fddae8748fe00d4f2d249cb02cae62ad84)
Signed-off-by: Konstantin Khorenko 
---
 fs/ext4/inode.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f89fdb951aee..1c8e0cb64116 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -232,6 +232,16 @@ void ext4_evict_inode(struct inode *inode)
ext4_begin_ordered_truncate(inode, 0);
truncate_inode_pages_final(&inode->i_data);
 
+   /*
+* For inodes with journalled data, transaction commit could have
+* dirtied the inode. Flush worker is ignoring it because of I_FREEING
+* flag but we still need to remove the inode from the writeback lists.
+*/
+   if (!list_empty_careful(&inode->i_io_list)) {
+   WARN_ON_ONCE(!ext4_should_journal_data(inode));
+   inode_io_list_del(inode);
+   }
+
/*
 * Protect us against freezing - iput() caller didn't have to have any
 * protection against it
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh8 1/2] ms/ext4: handle ext4_mark_inode_dirty errors

2021-02-03 Thread Konstantin Khorenko
From: Harshad Shirwadkar 

ext4_mark_inode_dirty() can fail for real reasons. Ignoring its return
value may lead ext4 to ignore real failures that would result in
corruption / crashes. Harden ext4_mark_inode_dirty error paths to fail
as soon as possible and return errors to the caller whenever
appropriate.

One of the possible scenarios where this bug could have an effect is
that while creating a new inode, its directory entry gets added
successfully, but while writing the inode itself mark_inode_dirty
returns an error which is ignored. This would result in the
inconsistency that the directory entry points to a non-existent inode.

Ran gce-xfstests smoke tests and verified that there were no
regressions.

Signed-off-by: Harshad Shirwadkar 
Link: 
https://lore.kernel.org/r/20200427013438.219117-1-harshadshirwad...@gmail.com
Signed-off-by: Theodore Ts'o 

(cherry picked from commit 4209ae12b12265d475bba28634184423149bd14f)
Signed-off-by: Konstantin Khorenko 

Backport note: dropped hunk in ext4_handle_inode_extension() as vz8
kernel does not have this function yet.
---
 fs/ext4/acl.c   |  2 +-
 fs/ext4/ext4.h  |  3 +-
 fs/ext4/ext4_jbd2.h |  5 ++-
 fs/ext4/extents.c   | 34 
 fs/ext4/indirect.c  |  4 ++-
 fs/ext4/inline.c|  6 ++--
 fs/ext4/inode.c | 38 +++---
 fs/ext4/migrate.c   | 12 ---
 fs/ext4/namei.c | 78 +
 fs/ext4/super.c | 16 ++
 fs/ext4/xattr.c |  6 ++--
 11 files changed, 131 insertions(+), 73 deletions(-)

diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index b3eba92f38f5..76f634d185f1 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -255,7 +255,7 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, 
int type)
if (!error && update_mode) {
inode->i_mode = mode;
inode->i_ctime = current_time(inode);
-   ext4_mark_inode_dirty(handle, inode);
+   error = ext4_mark_inode_dirty(handle, inode);
}
 out_stop:
ext4_journal_stop(handle);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0249169b9c14..053b512397f2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3223,8 +3223,7 @@ enum ext4_event_type {
  */
 #define EXT_MAX_BLOCKS 0x
 
-extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
-extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
+extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode);
 extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
   struct ext4_map_blocks *map, int flags);
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index a6b9b66dbfad..eb81e139ce56 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -222,7 +222,10 @@ ext4_mark_iloc_dirty(handle_t *handle,
 int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
struct ext4_iloc *iloc);
 
-int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
+#define ext4_mark_inode_dirty(__h, __i)                                \
+   __ext4_mark_inode_dirty((__h), (__i), __func__, __LINE__)
+int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode,
+   const char *func, unsigned int line);
 
 int ext4_expand_extra_isize(struct inode *inode,
unsigned int new_extra_isize,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 6cabc158a59a..afa98403d602 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -871,7 +871,7 @@ ext4_ext_binsearch(struct inode *inode,
 
 }
 
-int ext4_ext_tree_init(handle_t *handle, struct inode *inode)
+void ext4_ext_tree_init(handle_t *handle, struct inode *inode)
 {
struct ext4_extent_header *eh;
 
@@ -881,7 +881,6 @@ int ext4_ext_tree_init(handle_t *handle, struct inode 
*inode)
eh->eh_magic = EXT4_EXT_MAGIC;
eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0));
ext4_mark_inode_dirty(handle, inode);
-   return 0;
 }
 
 struct ext4_ext_path *
@@ -1382,7 +1381,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct 
inode *inode,
  ext4_idx_pblock(EXT_FIRST_INDEX(neh)));
 
le16_add_cpu(&neh->eh_depth, 1);
-   ext4_mark_inode_dirty(handle, inode);
+   err = ext4_mark_inode_dirty(handle, inode);
 out:
brelse(bh);
 
@@ -4644,7 +4643,7 @@ static int ext4_alloc_file_blocks(struct file *file, 
ext4_lblk_t offset,
struct inode *inode = file_inode(file);
handle_t *handle;
int ret = 0;
-   int ret2 = 0;
+   int ret2 = 0, ret3 = 0;
int retries = 0;
int depth = 0;
struct ext4_map_blocks map;
@@ -4708,10 +4707,11 @@ static int ext4_alloc_file_blocks(struct file *file, 
ext4_lblk_t offset,
 

[Devel] [PATCH rh8 2/2] ms/ext4: don't ignore return values from ext4_ext_dirty()

2021-02-03 Thread Konstantin Khorenko
From: Harshad Shirwadkar 

Don't ignore return values from ext4_ext_dirty, since the errors
indicate valid failures below Ext4.  In all of the other instances of
ext4_ext_dirty calls, the error return value is handled in some
way. This patch makes the remaining couple of places handle
ext4_ext_dirty errors as well. In the case of ext4_split_extent_at(),
ignoring the return value is intentional. The reason is that we are
already in the error path and there isn't much we can do if
ext4_ext_dirty returns an error. This patch adds a comment for that
case explaining why we ignore the return value.

In the longer run, we probably should
make sure that errors from other mark_dirty routines are handled as
well.

Ran gce-xfstests smoke tests and verified that there were no
regressions.

Signed-off-by: Harshad Shirwadkar 
Reviewed-by: Jan Kara 
Link: 
https://lore.kernel.org/r/20200427013438.219117-2-harshadshirwad...@gmail.com
Signed-off-by: Theodore Ts'o 

(cherry picked from commit b60ca3343e78761c6ebe6ff52c628893c8297b73)
Signed-off-by: Konstantin Khorenko 
---
 fs/ext4/extents.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index afa98403d602..d22e5182d3e8 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3422,6 +3422,10 @@ static int ext4_split_extent_at(handle_t *handle,
 
 fix_extent_len:
ex->ee_len = orig_ex.ee_len;
+   /*
+* Ignore ext4_ext_dirty return value since we are already in error path
+* and err is a non-zero error code.
+*/
ext4_ext_dirty(handle, inode, path + path->p_depth);
return err;
 }
@@ -3681,7 +3685,7 @@ static int ext4_ext_convert_to_initialized(handle_t 
*handle,
}
if (allocated) {
/* Mark the block containing both extents as dirty */
-   ext4_ext_dirty(handle, inode, path + depth);
+   err = ext4_ext_dirty(handle, inode, path + depth);
 
/* Update path to point to the right extent */
path[depth].p_ext = abut_ex;
-- 
2.28.0

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7] mm/memcontrol: Avoid shrink_zone() endless looping on global reclaim

2021-02-17 Thread Konstantin Khorenko
This version of the patch is for the ReadyKernel patch only.

If the Node is highly loaded and we have a lot of mem cgroups to
scan/reclaim, it's quite possible to race with some memcg death and
restart the reclaimer loop again and again, causing kswapd to work
forever even if there is a lot of free memory already.

More precisely: kswapd may loop forever in shrink_zone() in
do {} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)))

loop if mem_cgroup_iter() restarts from the root memory cgroup again
and again.
And the latter can happen if memory cgroups are created/destroyed
faster than shrink_zone() reclaims all memory cgroups in a row.

Let's stop kswapd in case we see mem_cgroup_iter() has restarted
several times in shrink_zone(). We might prevent kswapd from doing its
work well enough, but we will definitely prevent the situation when
permanent kswapd work brings the node to swapin/swapout while it has a
lot of free RAM.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/vmscan.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 81a115313582..7695bb1d4d72 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2657,6 +2657,7 @@ static void shrink_zone(struct zone *zone, struct 
scan_control *sc,
unsigned long nr_reclaimed, nr_scanned;
bool slab_only = sc->slab_only;
bool retry;
+   int root_memcg_counter = 0;
 
do {
struct mem_cgroup *root = sc->target_mem_cgroup;
@@ -2724,6 +2725,20 @@ static void shrink_zone(struct zone *zone, struct 
scan_control *sc,
mem_cgroup_iter_break(root, memcg);
break;
}
+
+   /*
+* We can loop in this cycle forever on global reclaim
+* if our Node had a lot of mem cgroups && disk is slow
+* && mem cgroups are created/destroyed often.
+* In this case mem_cgroup_iter() will restart the loop
+* from the root memcg again and again. Forever.
+*/
+   if (global_reclaim(sc) &&
+   (memcg == root_mem_cgroup) &&
+   (root_memcg_counter++ > 5)) {
+   mem_cgroup_iter_break(root, memcg);
+   break;
+   }
} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
 
if ((!sc->has_inactive || !sc->nr_reclaimed)
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel



[Devel] [PATCH rh7 0/8] mm/mem_cgroup_iter: Reduce the number of iterator restarts upon cgroup removals

2021-02-20 Thread Konstantin Khorenko
Many thanks to Kirill Tkhai for his bright ideas and review!

Problem description from the user point of view:
  * the Node is slow
  * the Node has a lot of free RAM
  * the Node has a lot of swapin/swapout
  * kswapd is always running

Problem in a nutshell from a technical point of view:
  * kswapd is looping in shrink_zone() inside the loop
  do {} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
(and never goes through the outer loop)
  * there are quite a number of memory cgroups on the Node (~1000)
  * some cgroups are hard to reclaim (reclaim may take ~3 seconds),
this is because of a very busy disk due to permanent swapin/swapout
  * mem_cgroup_iter() does not succeed in scanning all the cgroups
in a row; it restarts from the root cgroup one time after
another (after a different number of cgroups scanned)

Q: Why does mem_cgroup_iter() restart from the root memcg?
A: Because it is invalidated once some memory cgroup is
   destroyed on the Node.
   Note: ANY memory cgroup destruction on the Node leads to an iter
   restart.

The following patchset solves this problem in the following way:
there is no need to restart the iter until we see the iter has
the position which is exactly the memory cgroup being destroyed.

The patchset ensures the iter->last_visited is NULL-ified on
invalidation and thus restarts only in the unlikely case when
the iter points to the memcg being destroyed.

https://jira.sw.ru/browse/PSBM-123655
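
For reference, a simplified sketch of the loop in question
(shrink_lruvec_for() is an illustrative placeholder for the per-memcg
reclaim work, not a real function):

    /* Each mem_cgroup_iter() call continues from iter->last_visited;
     * before this series, ANY memcg destruction on the Node
     * invalidated the iterator and the walk restarted from 'root'. */
    struct mem_cgroup *memcg = mem_cgroup_iter(root, NULL, &reclaim);

    do {
            shrink_lruvec_for(memcg);       /* illustrative placeholder */
    } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));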

Konstantin Khorenko (8):
  mm/mem_cgroup_iter: Make 'iter->last_visited' a bit more stable
  mm/mem_cgroup_iter: Always assign iter->last_visited under seqlock
  mm/mem_cgroup_iter: NULL-ify 'last_visited' for invalidated iterators
  mm/mem_cgroup_iter: Provide _iter_invalidate() the dying memcg as an
argument
  mm/mem_cgroup_iter: Invalidate iterators only if needed
  mm/mem_cgroup_iter: Don't bother checking 'dead_count' anymore
  mm/mem_cgroup_iter: Cleanup mem_cgroup_iter_load()
  mm/mem_cgroup_iter: Drop dead_count related infrastructure

 mm/memcontrol.c | 125 +++-
 1 file changed, 80 insertions(+), 45 deletions(-)

-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 1/8] mm/mem_cgroup_iter: Make 'iter->last_visited' a bit more stable

2021-02-20 Thread Konstantin Khorenko
In fact this patch does not change the logic, but after it
we can state that iter->last_visited _always_ contains a valid
pointer until the iter is "break-ed".

Why? Because 'last_visited' is always assigned in _update()
to a memcg which has passed css_tryget(), so its css won't
ever be offlined (and moreover - destroyed) until we css_put() it.

And if we now call css_put() only after iter->last_visited has been
assigned a new cgroup, the only case when 'last_visited' may
contain an invalid entry is a "break-ed" mem_cgroup_iter().

https://jira.sw.ru/browse/PSBM-123655
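
The resulting ordering can be shown as a reduced fragment of
mem_cgroup_iter_update() (a sketch of the shape after this patch; the
full hunk is in the diff below):

    struct mem_cgroup *prev = last_visited;

    iter->last_visited = new_position;      /* new_position holds a css ref */
    if (prev && prev != root)
            css_put(&prev->css);            /* put only after publishing */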

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e0a430908138..5be8fbfe0308 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1622,9 +1622,6 @@ static void mem_cgroup_iter_update(struct 
mem_cgroup_reclaim_iter *iter,
   struct mem_cgroup *root,
   int sequence)
 {
-   /* root reference counting symmetric to mem_cgroup_iter_load */
-   if (last_visited && last_visited != root)
-   css_put(&last_visited->css);
/*
 * We store the sequence count from the time @last_visited was
 * loaded successfully instead of rereading it here so that we
@@ -1635,6 +1632,10 @@ static void mem_cgroup_iter_update(struct 
mem_cgroup_reclaim_iter *iter,
iter->last_visited = new_position;
iter->last_dead_count = sequence;
write_sequnlock(&iter->last_visited_lock);
+
+   /* root reference counting symmetric to mem_cgroup_iter_load */
+   if (last_visited && last_visited != root)
+   css_put(&last_visited->css);
 }
 
 /**
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 3/8] mm/mem_cgroup_iter: NULL-ify 'last_visited' for invalidated iterators

2021-02-20 Thread Konstantin Khorenko
Our target is to invalidate only those iterators which have our
dying memcg as 'last_visited' and put NULL there instead.

As the first step, let's put NULL into 'last_visited' of all
the iterators we are invalidating.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ba107b12f314..23f37b129fc7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1568,6 +1568,30 @@ static struct mem_cgroup *__mem_cgroup_iter_next(struct 
mem_cgroup *root,
 
 static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
 {
+   struct mem_cgroup_reclaim_iter *iter;
+   struct mem_cgroup_per_zone *mz;
+   struct mem_cgroup *pos;
+   seqlock_t *lock;
+   int zone, node, i;
+   unsigned seq;
+
+   for_each_node(node) {
+   for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+   mz = mem_cgroup_zoneinfo(root, node, zone);
+
+   for (i = 0; i < ARRAY_SIZE(mz->reclaim_iter); i++) {
+   iter = &mz->reclaim_iter[i];
+   lock = &iter->last_visited_lock;
+
+   write_seqlock(lock);
+   pos = iter->last_visited;
+   iter->last_visited = NULL;
+   write_sequnlock(lock);
+   }
+   }
+   }
+   }
+
/*
 * When a group in the hierarchy below root is destroyed, the
 * hierarchy iterator can no longer be trusted since it might
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 2/8] mm/mem_cgroup_iter: Always assign iter->last_visited under seqlock

2021-02-20 Thread Konstantin Khorenko
All other accesses (both reads and assignments) are performed under
the seqlock.

https://jira.sw.ru/browse/PSBM-123655
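
For reference, the seqlock read-side pattern used for 'last_visited'
elsewhere in this series (a sketch: retry the read if a writer updated
the field in the meantime):

    struct mem_cgroup *pos;
    unsigned seq;

    do {
            seq = read_seqbegin(&iter->last_visited_lock);
            pos = READ_ONCE(iter->last_visited);
    } while (read_seqretry(&iter->last_visited_lock, seq));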

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5be8fbfe0308..ba107b12f314 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1690,7 +1690,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
*root,
mz = mem_cgroup_zoneinfo(root, nid, zid);
iter = &mz->reclaim_iter[reclaim->priority];
if (prev && reclaim->generation != iter->generation) {
+   write_seqlock(&iter->last_visited_lock);
iter->last_visited = NULL;
+   write_sequnlock(&iter->last_visited_lock);
goto out_unlock;
}
 
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 4/8] mm/mem_cgroup_iter: Provide _iter_invalidate() the dying memcg as an argument

2021-02-20 Thread Konstantin Khorenko
It will be used by the next patches, when we search for this pointer
stored in the iterators.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 23f37b129fc7..6cde4dbe3c7b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1566,7 +1566,8 @@ static struct mem_cgroup *__mem_cgroup_iter_next(struct 
mem_cgroup *root,
return NULL;
 }
 
-static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
+static void mem_cgroup_iter_invalidate(struct mem_cgroup *root,
+  struct mem_cgroup *dead_memcg)
 {
struct mem_cgroup_reclaim_iter *iter;
struct mem_cgroup_per_zone *mz;
@@ -6903,14 +6904,14 @@ static void 
mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
struct mem_cgroup *parent = memcg;
 
while ((parent = parent_mem_cgroup(parent)))
-   mem_cgroup_iter_invalidate(parent);
+   mem_cgroup_iter_invalidate(parent, memcg);
 
/*
 * if the root memcg is not hierarchical we have to check it
 * explicitely.
 */
if (!root_mem_cgroup->use_hierarchy)
-   mem_cgroup_iter_invalidate(root_mem_cgroup);
+   mem_cgroup_iter_invalidate(root_mem_cgroup, memcg);
 }
 
 static void mem_cgroup_free_all(struct mem_cgroup *memcg)
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 5/8] mm/mem_cgroup_iter: Invalidate iterators only if needed

2021-02-20 Thread Konstantin Khorenko
Previously we invalidated all iterators up to the memcg root,
but this is overkill: we can check whether the currently dying memcg
is stored in an iterator's 'last_visited' and invalidate only those
unlucky iterators.

https://jira.sw.ru/browse/PSBM-123655
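
The check-then-lock idiom used below, distilled (a sketch): a cheap
lockless read filters out the common case, and the check is repeated
under the write lock because the field may have changed in between:

    do {
            seq = read_seqbegin(lock);
            pos = READ_ONCE(iter->last_visited);
    } while (read_seqretry(lock, seq));

    if (pos == dead_memcg) {
            write_seqlock(lock);
            if (iter->last_visited == dead_memcg)
                    iter->last_visited = NULL;
            write_sequnlock(lock);
    }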

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 48 ++--
 1 file changed, 38 insertions(+), 10 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6cde4dbe3c7b..d60dc768f762 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -192,8 +192,8 @@ struct mem_cgroup_stat2_cpu {
 
 struct mem_cgroup_reclaim_iter {
/*
-* last scanned hierarchy member. Valid only if last_dead_count
-* matches memcg->dead_count of the hierarchy root group.
+* Last scanned hierarchy member.
+* If stored memcg is destroyed, the field is wiped.
 */
struct mem_cgroup *last_visited;
unsigned long last_dead_count;
@@ -1576,6 +1576,28 @@ static void mem_cgroup_iter_invalidate(struct mem_cgroup 
*root,
int zone, node, i;
unsigned seq;
 
+   /*
+* When a group in the hierarchy below root is destroyed,
+* the hierarchy iterator can no longer be trusted iif
+* iter->last_visited contains the cgroup being destroyed.
+* Check if we get this unlikely case and invalidate the iterator
+* if so.
+*
+* Note, only "break-ed" iterators can store iter->last_visited
+* == dead_memcg because normally 'last_visited' is assigned
+* under 'last_visited_lock' in mem_cgroup_iter_update() and
+* 'new_position' is under css_tryget() there (ref inc-ed in
+* __mem_cgroup_iter_next()) and thus css will not be offlined.
+*
+* mem_cgroup_iter_break() in its turn puts memcg's css but does
+* not wipe it from iter->last_visited.
+*
+* Q: Why dying memcg (dead_memcg) cannot get into
+*iter->last_visited a bit later after we wipe it here?
+* A: Because up to the moment of the current function execution
+*css_tryget() is guaranteed to fail on 'dead_memcg'.
+*/
+
for_each_node(node) {
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = mem_cgroup_zoneinfo(root, node, zone);
@@ -1584,10 +1606,17 @@ static void mem_cgroup_iter_invalidate(struct 
mem_cgroup *root,
iter = &mz->reclaim_iter[i];
lock = &iter->last_visited_lock;
 
-   write_seqlock(lock);
-   pos = iter->last_visited;
-   iter->last_visited = NULL;
-   write_sequnlock(lock);
+   do {
+   seq = read_seqbegin(lock);
+   pos = READ_ONCE(iter->last_visited);
+   } while (read_seqretry(lock, seq));
+
+   if (pos == dead_memcg) {
+   write_seqlock(lock);
+   pos = iter->last_visited;
+   if (pos == dead_memcg)
+   iter->last_visited = NULL;
+   write_sequnlock(lock);
}
}
}
@@ -1648,10 +1677,9 @@ static void mem_cgroup_iter_update(struct 
mem_cgroup_reclaim_iter *iter,
   int sequence)
 {
/*
-* We store the sequence count from the time @last_visited was
-* loaded successfully instead of rereading it here so that we
-* don't lose destruction events in between.  We could have
-* raced with the destruction of @new_position after all.
+* The position saved in 'last_visited' is always valid.
+* If the stored corresponding cgroup is destroyed,
+* 'last_visited' is NULLed.
 */
write_seqlock(&iter->last_visited_lock);
iter->last_visited = new_position;
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 6/8] mm/mem_cgroup_iter: Don't bother checking 'dead_count' anymore

2021-02-20 Thread Konstantin Khorenko
As we've enhanced mem_cgroup_iter_invalidate() to NULL-ify
'last_visited' if it stored a dying cgroup, we can be sure
iter->last_visited always contains a valid pointer to a memcg
(or NULL).

So just skip the extra check of iter->last_dead_count vs
root->dead_count; it's not needed anymore.

Note: the patch is not as small as possible - for review simplicity.
Cleanup patch will follow.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d60dc768f762..39b3350eddf9 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1646,11 +1646,8 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter 
*iter,
 * offlining.  The RCU lock ensures the object won't be
 * released, tryget will fail if we lost the race.
 */
-   *sequence = atomic_read(&root->dead_count);
 retry:
-   position = NULL;
seq = read_seqbegin(&iter->last_visited_lock);
-   if (iter->last_dead_count == *sequence) {
position = READ_ONCE(iter->last_visited);
 
if (read_seqretry(&iter->last_visited_lock, seq))
@@ -1666,7 +1663,6 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
!css_tryget(&position->css))
 
position = NULL;
-   }
return position;
 }
 
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 7/8] mm/mem_cgroup_iter: Cleanup mem_cgroup_iter_load()

2021-02-20 Thread Konstantin Khorenko
No functional changes here.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 25 +++--
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 39b3350eddf9..62bc824adb53 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1646,23 +1646,20 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter 
*iter,
 * offlining.  The RCU lock ensures the object won't be
 * released, tryget will fail if we lost the race.
 */
-retry:
-   seq = read_seqbegin(&iter->last_visited_lock);
+   do {
+   seq = read_seqbegin(&iter->last_visited_lock);
position = READ_ONCE(iter->last_visited);
+   } while (read_seqretry(&iter->last_visited_lock, seq));
 
-   if (read_seqretry(&iter->last_visited_lock, seq))
-   goto retry;
+   /*
+* We cannot take a reference to root because we might race
+* with root removal and returning NULL would end up in
+* an endless loop on the iterator user level when root
+* would be returned all the time.
+*/
+   if (position && position != root && !css_tryget(&position->css))
+   position = NULL;
 
-   /*
-* We cannot take a reference to root because we might race
-* with root removal and returning NULL would end up in
-* an endless loop on the iterator user level when root
-* would be returned all the time.
-   */
-   if (position && position != root &&
-   !css_tryget(&position->css))
-
-   position = NULL;
return position;
 }
 
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 8/8] mm/mem_cgroup_iter: Drop dead_count related infrastructure

2021-02-20 Thread Konstantin Khorenko
As we now have a stable and reliable iter->last_visited,
we no longer need to save/compare the number of destroyed cgroups.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 22 --
 1 file changed, 4 insertions(+), 18 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 62bc824adb53..73c0232e6303 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -196,7 +196,6 @@ struct mem_cgroup_reclaim_iter {
 * If stored memcg is destroyed, the field is wiped.
 */
struct mem_cgroup *last_visited;
-   unsigned long last_dead_count;
seqlock_t last_visited_lock;
 
/* scan generation, increased every round-trip */
@@ -403,7 +402,6 @@ struct mem_cgroup {
spinlock_t pcp_counter_lock;
atomic_long_t   oom;
 
-   atomic_tdead_count;
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
struct tcp_memcontrol tcp_mem;
struct udp_memcontrol udp_mem;
@@ -1621,19 +1619,11 @@ static void mem_cgroup_iter_invalidate(struct 
mem_cgroup *root,
}
}
}
-
-   /*
-* When a group in the hierarchy below root is destroyed, the
-* hierarchy iterator can no longer be trusted since it might
-* have pointed to the destroyed group.  Invalidate it.
-*/
-   atomic_inc(&root->dead_count);
 }
 
 static struct mem_cgroup *
 mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
-struct mem_cgroup *root,
-int *sequence)
+struct mem_cgroup *root)
 {
struct mem_cgroup *position;
unsigned seq;
@@ -1666,8 +1656,7 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
   struct mem_cgroup *last_visited,
   struct mem_cgroup *new_position,
-  struct mem_cgroup *root,
-  int sequence)
+  struct mem_cgroup *root)
 {
/*
 * The position saved in 'last_visited' is always valid.
@@ -1676,7 +1665,6 @@ static void mem_cgroup_iter_update(struct 
mem_cgroup_reclaim_iter *iter,
 */
write_seqlock(&iter->last_visited_lock);
iter->last_visited = new_position;
-   iter->last_dead_count = sequence;
write_sequnlock(&iter->last_visited_lock);
 
/* root reference counting symmetric to mem_cgroup_iter_load */
@@ -1726,7 +1714,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
*root,
rcu_read_lock();
while (!memcg) {
struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
-   int uninitialized_var(seq);
 
if (reclaim) {
int nid = zone_to_nid(reclaim->zone);
@@ -1742,14 +1729,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
*root,
goto out_unlock;
}
 
-   last_visited = mem_cgroup_iter_load(iter, root, &seq);
+   last_visited = mem_cgroup_iter_load(iter, root);
}
 
memcg = __mem_cgroup_iter_next(root, last_visited);
 
if (reclaim) {
-   mem_cgroup_iter_update(iter, last_visited, memcg, root,
-   seq);
+   mem_cgroup_iter_update(iter, last_visited, memcg, root);
 
if (!memcg)
iter->generation++;
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v2 00/10] mm/mem_cgroup_iter: Reduce the number of iterator restarts upon cgroup removals

2021-02-24 Thread Konstantin Khorenko
Many thanks to Kirill Tkhai for his bright ideas and review!

Problem description from the user point of view:
  * the Node is slow
  * the Node has a lot of free RAM
  * the Node has a lot of swapin/swapout
  * kswapd is always running

Problem in a nutshell from the technical point of view:
  * kswapd is looping in shrink_zone() inside the loop
  do {} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
(and never goes through the outer loop); see the sketch below
  * there are quite a number of memory cgroups on the Node (~1000)
  * some cgroups are hard to reclaim (reclaim may take ~3 seconds),
this is because of a very busy disk due to permanent swapin/swapout
  * mem_cgroup_iter() does not succeed in scanning all cgroups
in a row, it restarts from the root cgroup time after
time (after a different number of cgroups scanned)
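
For reference, the loop in question looks roughly like this (a
simplified sketch of the rh7 shrink_zone() reclaim loop; details
elided, so this is not the exact source):

    struct mem_cgroup *memcg = NULL;
    struct mem_cgroup_reclaim_cookie reclaim = {
        .zone = zone,
        .priority = sc->priority,
    };

    do {
        /* shrink the LRU lists of one memcg of the hierarchy */

        /*
         * If any memcg on the Node is destroyed meanwhile, the
         * iterator is invalidated and the walk restarts from the
         * root instead of completing the round-trip.
         */
    } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));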

Q: Why does mem_cgroup_iter() restart from the root memcg?
A: Because it is invalidated once some memory cgroup is
   destroyed on the Node.
   Note: ANY memory cgroup destruction on the Node leads to an
   iter restart.
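
The invalidation scheme being removed can be sketched as follows
(simplified from the code the patches below delete; the seqlock
added earlier and reverted by patches 1-2 is omitted):

    /* destruction side: ANY memcg death bumps the counter ... */
    static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
    {
        atomic_inc(&root->dead_count);
    }

    /* ... and the load side drops the cached position on mismatch */
    *sequence = atomic_read(&root->dead_count);
    if (iter->last_dead_count == *sequence)
        position = iter->last_visited;
    /* else 'position' stays NULL => restart from the root */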

The following patchset solves this problem in the following way:
there is no need to restart the iter until we see that the iter's
position is exactly the memory cgroup being destroyed.

The patchset ensures iter->last_visited is NULL-ified on
invalidation, and thus the iter restarts only in the unlikely case
when it points to the memcg being destroyed.

https://jira.sw.ru/browse/PSBM-123655

v2 changes:
 - reverted 2 patches in this code which were focused on synchronizing
   updates of iter->last_visited and ->last_dead_count
   (as we are getting rid of iter->last_dead_count altogether)
 - use rcu primitives to access iter->last_visited

Konstantin Khorenko (10):
  Revert "mm/memcg: fix css_tryget(),css_put() imbalance"
  Revert "mm/memcg: use seqlock to protect reclaim_iter updates"
  mm/mem_cgroup_iter: Make 'iter->last_visited' a bit more stable
  mm/mem_cgroup_iter: Always assign iter->last_visited under rcu
  mm/mem_cgroup_iter: NULL-ify 'last_visited' for invalidated iterators
  mm/mem_cgroup_iter: Provide _iter_invalidate() the dying memcg as an
argument
  mm/mem_cgroup_iter: Invalidate iterators only if needed
  mm/mem_cgroup_iter: Don't bother checking 'dead_count' anymore
  mm/mem_cgroup_iter: Cleanup mem_cgroup_iter_load()
  mm/mem_cgroup_iter: Drop dead_count related infrastructure

 mm/memcontrol.c | 133 +++-
 1 file changed, 76 insertions(+), 57 deletions(-)

-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v2 01/10] Revert "mm/memcg: fix css_tryget(), css_put() imbalance"

2021-02-24 Thread Konstantin Khorenko
This reverts commit 5f351790d598bbf014441a86e7081972086de61b.

We are going to get rid of seqlock 'iter->last_visited_lock',
so reverting the patch.

https://jira.sw.ru/browse/PSBM-123655
Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e0a430908138..e5c5f64d6bb6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1581,7 +1581,7 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 struct mem_cgroup *root,
 int *sequence)
 {
-   struct mem_cgroup *position;
+   struct mem_cgroup *position = NULL;
unsigned seq;
 
/*
@@ -1594,7 +1594,6 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 */
*sequence = atomic_read(&root->dead_count);
 retry:
-   position = NULL;
seq = read_seqbegin(&iter->last_visited_lock);
if (iter->last_dead_count == *sequence) {
position = READ_ONCE(iter->last_visited);
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v2 03/10] mm/mem_cgroup_iter: Make 'iter->last_visited' a bit more stable

2021-02-24 Thread Konstantin Khorenko
In fact this patch does not change the logic, but after it
we can state that iter->last_visited _always_ contains a valid
pointer until the iter is "break-ed".

Why? Because 'last_visited' is always assigned in _update()
to a memcg which has passed css_tryget(), so its css won't
ever be offlined (and moreover - destroyed) until we css_put() it.

And if we now call css_put() only after iter->last_visited has
been assigned the new cgroup, the only case when 'last_visited'
may contain an invalid entry is a "break-ed" mem_cgroup_iter().
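
So after the patch the update path is ordered like this (a trimmed
sketch of mem_cgroup_iter_update(), matching the diff below):

    /* publish the new position first ... */
    iter->last_visited = new_position;
    smp_wmb();
    iter->last_dead_count = sequence;

    /* ... and only then drop the reference to the old one */
    if (last_visited && last_visited != root)
        css_put(&last_visited->css);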

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d3a35a13ae4d..8040f09425bf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1614,9 +1614,6 @@ static void mem_cgroup_iter_update(struct 
mem_cgroup_reclaim_iter *iter,
   struct mem_cgroup *root,
   int sequence)
 {
-   /* root reference counting symmetric to mem_cgroup_iter_load */
-   if (last_visited && last_visited != root)
-   css_put(&last_visited->css);
/*
 * We store the sequence count from the time @last_visited was
 * loaded successfully instead of rereading it here so that we
@@ -1626,6 +1623,10 @@ static void mem_cgroup_iter_update(struct 
mem_cgroup_reclaim_iter *iter,
iter->last_visited = new_position;
smp_wmb();
iter->last_dead_count = sequence;
+
+   /* root reference counting symmetric to mem_cgroup_iter_load */
+   if (last_visited && last_visited != root)
+   css_put(&last_visited->css);
 }
 
 /**
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v2 02/10] Revert "mm/memcg: use seqlock to protect reclaim_iter updates"

2021-02-24 Thread Konstantin Khorenko
This reverts commit 5a2d13cf16faedb8a2c318d50cca71d74d2be264.

We are going to make 'iter->last_visited' always valid so that the
'iter->last_dead_count' vs 'root->dead_count' verification can be
skipped; then there is no need to update 'last_visited' and
'last_dead_count' consistently (we'll remove the 'iter->last_dead_count'
field altogether), and thus the seqlock can be dropped.
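
This returns us to the plain barrier pairing (sketch): the writer
publishes 'last_visited' before 'last_dead_count', while the reader
checks 'last_dead_count' before loading 'last_visited':

    /* writer: mem_cgroup_iter_update() */
    iter->last_visited = new_position;
    smp_wmb();        /* pairs with smp_rmb() below */
    iter->last_dead_count = sequence;

    /* reader: mem_cgroup_iter_load() */
    if (iter->last_dead_count == *sequence) {
        smp_rmb();
        position = iter->last_visited;
    }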

https://jira.sw.ru/browse/PSBM-123655
Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 18 +++---
 1 file changed, 3 insertions(+), 15 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e5c5f64d6bb6..d3a35a13ae4d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -197,7 +197,6 @@ struct mem_cgroup_reclaim_iter {
 */
struct mem_cgroup *last_visited;
unsigned long last_dead_count;
-   seqlock_t last_visited_lock;
 
/* scan generation, increased every round-trip */
unsigned int generation;
@@ -1582,8 +1581,6 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 int *sequence)
 {
struct mem_cgroup *position = NULL;
-   unsigned seq;
-
/*
 * A cgroup destruction happens in two stages: offlining and
 * release.  They are separated by a RCU grace period.
@@ -1593,13 +1590,9 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter 
*iter,
 * released, tryget will fail if we lost the race.
 */
*sequence = atomic_read(&root->dead_count);
-retry:
-   seq = read_seqbegin(&iter->last_visited_lock);
if (iter->last_dead_count == *sequence) {
-   position = READ_ONCE(iter->last_visited);
-
-   if (read_seqretry(&iter->last_visited_lock, seq))
-   goto retry;
+   smp_rmb();
+   position = iter->last_visited;
 
/*
 * We cannot take a reference to root because we might race
@@ -1630,10 +1623,9 @@ static void mem_cgroup_iter_update(struct 
mem_cgroup_reclaim_iter *iter,
 * don't lose destruction events in between.  We could have
 * raced with the destruction of @new_position after all.
 */
-   write_seqlock(&iter->last_visited_lock);
iter->last_visited = new_position;
+   smp_wmb();
iter->last_dead_count = sequence;
-   write_sequnlock(&iter->last_visited_lock);
 }
 
 /**
@@ -6589,15 +6581,11 @@ static int alloc_mem_cgroup_per_zone_info(struct 
mem_cgroup *memcg, int node)
return 1;
 
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-   int i;
-
mz = &pn->zoneinfo[zone];
lruvec_init(&mz->lruvec);
mz->usage_in_excess = 0;
mz->on_tree = false;
mz->memcg = memcg;
-   for (i = 0; i < ARRAY_SIZE(mz->reclaim_iter); i++)
-   seqlock_init(&mz->reclaim_iter[i].last_visited_lock);
}
memcg->info.nodeinfo[node] = pn;
return 0;
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v2 04/10] mm/mem_cgroup_iter: Always assign iter->last_visited under rcu

2021-02-24 Thread Konstantin Khorenko
It's quite strange to have an rcu section in mem_cgroup_iter()
but not to use rcu_dereference()/rcu_assign_pointer() for the pointers
being protected.

We plan to access/assign '->last_visited' during iterator invalidation,
so we'll need the protection there anyway.
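
The conversion is mechanical; the result is the standard
RCU-protected pointer pattern (sketch):

    /* writer side */
    rcu_assign_pointer(iter->last_visited, new_position);

    /* reader side, within the already existing rcu section */
    position = rcu_dereference(iter->last_visited);

rcu_assign_pointer() provides the publish ordering the explicit
smp_wmb() used to give, and rcu_dereference() covers the read side.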

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8040f09425bf..2f1dfb0de524 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1591,8 +1591,7 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 */
*sequence = atomic_read(&root->dead_count);
if (iter->last_dead_count == *sequence) {
-   smp_rmb();
-   position = iter->last_visited;
+   position = rcu_dereference(iter->last_visited);
 
/*
 * We cannot take a reference to root because we might race
@@ -1620,8 +1619,7 @@ static void mem_cgroup_iter_update(struct 
mem_cgroup_reclaim_iter *iter,
 * don't lose destruction events in between.  We could have
 * raced with the destruction of @new_position after all.
 */
-   iter->last_visited = new_position;
-   smp_wmb();
+   rcu_assign_pointer(iter->last_visited, new_position);
iter->last_dead_count = sequence;
 
/* root reference counting symmetric to mem_cgroup_iter_load */
@@ -1681,7 +1679,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
*root,
mz = mem_cgroup_zoneinfo(root, nid, zid);
iter = &mz->reclaim_iter[reclaim->priority];
if (prev && reclaim->generation != iter->generation) {
-   iter->last_visited = NULL;
+   rcu_assign_pointer(iter->last_visited, NULL);
goto out_unlock;
}
 
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v2 05/10] mm/mem_cgroup_iter: NULL-ify 'last_visited' for invalidated iterators

2021-02-24 Thread Konstantin Khorenko
Our target is to invalidate only those iterators which have our
dying memcg as 'last_visited' and to put NULL there instead.

As the first step, let's put NULL into 'last_visited' of all the
iterators we are invalidating.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 15 
 1 file changed, 15 insertions(+)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2f1dfb0de524..c56d68d12c24 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1567,6 +1567,21 @@ static struct mem_cgroup *__mem_cgroup_iter_next(struct 
mem_cgroup *root,
 
 static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
 {
+   struct mem_cgroup_reclaim_iter *iter;
+   struct mem_cgroup_per_zone *mz;
+   int zone, node, i;
+
+   for_each_node(node) {
+   for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+   mz = mem_cgroup_zoneinfo(root, node, zone);
+
+   for (i = 0; i < ARRAY_SIZE(mz->reclaim_iter); i++) {
+   iter = &mz->reclaim_iter[i];
+   rcu_assign_pointer(iter->last_visited, NULL);
+   }
+   }
+   }
+
/*
 * When a group in the hierarchy below root is destroyed, the
 * hierarchy iterator can no longer be trusted since it might
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v2 06/10] mm/mem_cgroup_iter: Provide _iter_invalidate() the dying memcg as an argument

2021-02-24 Thread Konstantin Khorenko
It will be used by the next patches, which search for this pointer
stored in iterators.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c56d68d12c24..eec944f712b0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1565,7 +1565,8 @@ static struct mem_cgroup *__mem_cgroup_iter_next(struct 
mem_cgroup *root,
return NULL;
 }
 
-static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
+static void mem_cgroup_iter_invalidate(struct mem_cgroup *root,
+  struct mem_cgroup *dead_memcg)
 {
struct mem_cgroup_reclaim_iter *iter;
struct mem_cgroup_per_zone *mz;
@@ -6878,14 +6879,14 @@ static void 
mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
struct mem_cgroup *parent = memcg;
 
while ((parent = parent_mem_cgroup(parent)))
-   mem_cgroup_iter_invalidate(parent);
+   mem_cgroup_iter_invalidate(parent, memcg);
 
/*
 * if the root memcg is not hierarchical we have to check it
 * explicitely.
 */
if (!root_mem_cgroup->use_hierarchy)
-   mem_cgroup_iter_invalidate(root_mem_cgroup);
+   mem_cgroup_iter_invalidate(root_mem_cgroup, memcg);
 }
 
 static void mem_cgroup_free_all(struct mem_cgroup *memcg)
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v2 07/10] mm/mem_cgroup_iter: Invalidate iterators only if needed

2021-02-24 Thread Konstantin Khorenko
Previously we invalidated all iterators up to the memcg root,
but this is overkill: we can check whether the currently dying memcg
is stored in an iterator's 'last_visited' and invalidate only those
unlucky iterators.
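
The core of the new check is just this (sketch; the full per-node,
per-zone walk over all reclaim iterators is in the diff below):

    pos = rcu_access_pointer(iter->last_visited);
    if (pos == dead_memcg)
        rcu_assign_pointer(iter->last_visited, NULL);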

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 48 +---
 1 file changed, 41 insertions(+), 7 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index eec944f712b0..b6059f46049b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -192,8 +192,8 @@ struct mem_cgroup_stat2_cpu {
 
 struct mem_cgroup_reclaim_iter {
/*
-* last scanned hierarchy member. Valid only if last_dead_count
-* matches memcg->dead_count of the hierarchy root group.
+* Last scanned hierarchy member.
+* If stored memcg is destroyed, the field is wiped.
 */
struct mem_cgroup *last_visited;
unsigned long last_dead_count;
@@ -1570,19 +1570,54 @@ static void mem_cgroup_iter_invalidate(struct 
mem_cgroup *root,
 {
struct mem_cgroup_reclaim_iter *iter;
struct mem_cgroup_per_zone *mz;
+   struct mem_cgroup *pos;
int zone, node, i;
 
+   /*
+* When a group in the hierarchy below root is destroyed,
+* the hierarchy iterator can no longer be trusted iff
+* iter->last_visited contains the cgroup being destroyed.
+* Check if we get this unlikely case and invalidate the iterator
+* if so.
+*
+* Note, only "break-ed" iterators can store iter->last_visited
+* == dead_memcg because normally 'last_visited' is assigned
+* in mem_cgroup_iter_update() and 'new_position' is just after
+* css_tryget() there (ref inc-ed in __mem_cgroup_iter_next())
+* and thus cgroup is not offlined yet.
+*
+* mem_cgroup_iter_break() in its turn puts memcg's css but does
+* not wipe it from iter->last_visited.
+*
+* Q: Why dying memcg (dead_memcg) cannot get into
+*iter->last_visited a bit later after we wipe it here?
+* A: Because up to the moment of the current function execution
+*css_tryget() is guaranteed to fail on 'dead_memcg'.
+*/
+
+   rcu_read_lock();
for_each_node(node) {
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
mz = mem_cgroup_zoneinfo(root, node, zone);
 
for (i = 0; i < ARRAY_SIZE(mz->reclaim_iter); i++) {
iter = &mz->reclaim_iter[i];
-   rcu_assign_pointer(iter->last_visited, NULL);
+
+   pos = rcu_access_pointer(iter->last_visited);
+   /*
+* It's OK to race with mem_cgroup_iter_update()
+* here because it cannot write new
+* position == dead_memcg as
+* css_tryget() for it should fail already.
+*/
+   if (pos == dead_memcg) {
+   rcu_assign_pointer(iter->last_visited,
+  NULL);
}
}
}
}
+   rcu_read_unlock();
 
/*
 * When a group in the hierarchy below root is destroyed, the
@@ -1631,10 +1666,9 @@ static void mem_cgroup_iter_update(struct 
mem_cgroup_reclaim_iter *iter,
   int sequence)
 {
/*
-* We store the sequence count from the time @last_visited was
-* loaded successfully instead of rereading it here so that we
-* don't lose destruction events in between.  We could have
-* raced with the destruction of @new_position after all.
+* The position saved in 'last_visited' is always valid.
+* If the stored corresponding cgroup is destroyed,
+* 'last_visited' is NULLed.
 */
rcu_assign_pointer(iter->last_visited, new_position);
iter->last_dead_count = sequence;
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v2 09/10] mm/mem_cgroup_iter: Cleanup mem_cgroup_iter_load()

2021-02-24 Thread Konstantin Khorenko
No functional changes here.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 21 ++---
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5fde291ecfee..22c404cc69ab 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1641,18 +1641,17 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter 
*iter,
 * offlining.  The RCU lock ensures the object won't be
 * released, tryget will fail if we lost the race.
 */
-   position = rcu_dereference(iter->last_visited);
+   position = rcu_dereference(iter->last_visited);
+
+   /*
+* We cannot take a reference to root because we might race
+* with root removal and returning NULL would end up in
+* an endless loop on the iterator user level when root
+* would be returned all the time.
+*/
+   if (position && position != root && !css_tryget(&position->css))
+   position = NULL;
 
-   /*
-* We cannot take a reference to root because we might race
-* with root removal and returning NULL would end up in
-* an endless loop on the iterator user level when root
-* would be returned all the time.
-   */
-   if (position && position != root &&
-   !css_tryget(&position->css))
-
-   position = NULL;
return position;
 }
 
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v2 08/10] mm/mem_cgroup_iter: Don't bother checking 'dead_count' anymore

2021-02-24 Thread Konstantin Khorenko
As we've enhanced mem_cgroup_iter_invalidate() to NULL-ify
'last_visited' if it stored a dying cgroup, we can be sure
iter->last_visited always contains a valid pointer to a memcg
(or NULL).

So just skip the extra check of iter->last_dead_count vs
root->dead_count; it's not needed anymore.

Note: the patch is prepared as small as possible - for review simplicity.
Cleanup patch will follow.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b6059f46049b..5fde291ecfee 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1641,8 +1641,6 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 * offlining.  The RCU lock ensures the object won't be
 * released, tryget will fail if we lost the race.
 */
-   *sequence = atomic_read(&root->dead_count);
-   if (iter->last_dead_count == *sequence) {
position = rcu_dereference(iter->last_visited);
 
/*
@@ -1655,7 +1653,6 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
!css_tryget(&position->css))
 
position = NULL;
-   }
return position;
 }
 
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v2 10/10] mm/mem_cgroup_iter: Drop dead_count related infrastructure

2021-02-24 Thread Konstantin Khorenko
As we now have a stable and reliable iter->last_visited,
we no longer need to save/compare the number of destroyed cgroups.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 22 --
 1 file changed, 4 insertions(+), 18 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 22c404cc69ab..a04fcb16008a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -196,7 +196,6 @@ struct mem_cgroup_reclaim_iter {
 * If stored memcg is destroyed, the field is wiped.
 */
struct mem_cgroup *last_visited;
-   unsigned long last_dead_count;
 
/* scan generation, increased every round-trip */
unsigned int generation;
@@ -402,7 +401,6 @@ struct mem_cgroup {
spinlock_t pcp_counter_lock;
atomic_long_t   oom;
 
-   atomic_tdead_count;
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
struct tcp_memcontrol tcp_mem;
struct udp_memcontrol udp_mem;
@@ -1618,19 +1616,11 @@ static void mem_cgroup_iter_invalidate(struct 
mem_cgroup *root,
}
}
rcu_read_unlock();
-
-   /*
-* When a group in the hierarchy below root is destroyed, the
-* hierarchy iterator can no longer be trusted since it might
-* have pointed to the destroyed group.  Invalidate it.
-*/
-   atomic_inc(&root->dead_count);
 }
 
 static struct mem_cgroup *
 mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
-struct mem_cgroup *root,
-int *sequence)
+struct mem_cgroup *root)
 {
struct mem_cgroup *position = NULL;
/*
@@ -1658,8 +1648,7 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
   struct mem_cgroup *last_visited,
   struct mem_cgroup *new_position,
-  struct mem_cgroup *root,
-  int sequence)
+  struct mem_cgroup *root)
 {
/*
 * The position saved in 'last_visited' is always valid.
@@ -1667,7 +1656,6 @@ static void mem_cgroup_iter_update(struct 
mem_cgroup_reclaim_iter *iter,
 * 'last_visited' is NULLed.
 */
rcu_assign_pointer(iter->last_visited, new_position);
-   iter->last_dead_count = sequence;
 
/* root reference counting symmetric to mem_cgroup_iter_load */
if (last_visited && last_visited != root)
@@ -1716,7 +1704,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
*root,
rcu_read_lock();
while (!memcg) {
struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
-   int uninitialized_var(seq);
 
if (reclaim) {
int nid = zone_to_nid(reclaim->zone);
@@ -1730,14 +1717,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
*root,
goto out_unlock;
}
 
-   last_visited = mem_cgroup_iter_load(iter, root, &seq);
+   last_visited = mem_cgroup_iter_load(iter, root);
}
 
memcg = __mem_cgroup_iter_next(root, last_visited);
 
if (reclaim) {
-   mem_cgroup_iter_update(iter, last_visited, memcg, root,
-   seq);
+   mem_cgroup_iter_update(iter, last_visited, memcg, root);
 
if (!memcg)
iter->generation++;
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v3 0/9] mm/mem_cgroup_iter: Reduce the number of iterator restarts upon cgroup removals

2021-02-24 Thread Konstantin Khorenko
Many thanks to Kirill Tkhai for his bright ideas and review!

Problem description from the user point of view:
  * the Node is slow
  * the Node has a lot of free RAM
  * the Node has a lot of swapin/swapout
  * kswapd is always running

Problem in a nutshell from the technical point of view:
  * kswapd is looping in shrink_zone() inside the loop
  do {} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
(and never goes through the outer loop)
  * there are quite a number of memory cgroups on the Node (~1000)
  * some cgroups are hard to reclaim (reclaim may take ~3 seconds),
this is because of a very busy disk due to permanent swapin/swapout
  * mem_cgroup_iter() does not succeed in scanning all cgroups
in a row, it restarts from the root cgroup time after
time (after a different number of cgroups scanned)

Q: Why does mem_cgroup_iter() restart from the root memcg?
A: Because it is invalidated once some memory cgroup is
   destroyed on the Node.
   Note: ANY memory cgroup destruction on the Node leads to an
   iter restart.

The following patchset solves this problem in the following way:
there is no need to restart the iter until we see that the iter's
position is exactly the memory cgroup being destroyed.

The patchset ensures iter->last_visited is NULL-ified on
invalidation, and thus the iter restarts only in the unlikely case
when it points to the memcg being destroyed.

https://jira.sw.ru/browse/PSBM-123655

v2 changes:
 - reverted 2 patches in this code which were focused on synchronizing
   updates of iter->last_visited and ->last_dead_count
   (as we are getting rid of iter->last_dead_count altogether)
 - use rcu primitives to access iter->last_visited

v3 changes:
 - more comments explaining the locking scheme
 - use rcu_read_{lock,unlock}_sched() in mem_cgroup_iter()
   for synchronization with the iterator invalidation func
   (see the sketch below)
 - do not wrap the iterator invalidation func in rcu_read_{lock/unlock}()
   as it protects nothing
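
A minimal sketch of the v3 synchronization idea (assuming, as the
comments in patch 6/9 below explain, that css_tryget() already fails
on the dying memcg by the time the invalidation func runs):

    /* mem_cgroup_iter(): loads/updates iter->last_visited */
    rcu_read_lock_sched();
    /* mem_cgroup_iter_load() / mem_cgroup_iter_update() */
    rcu_read_unlock_sched();

    /* invalidation func: no rcu_read_lock() wrap needed, it only
     * clears positions which can never be re-set to dead_memcg */
    if (rcu_access_pointer(iter->last_visited) == dead_memcg)
        rcu_assign_pointer(iter->last_visited, NULL);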

Konstantin Khorenko (9):
  Revert "mm/memcg: fix css_tryget(),css_put() imbalance"
  Revert "mm/memcg: use seqlock to protect reclaim_iter updates"
  mm/mem_cgroup_iter: Make 'iter->last_visited' a bit more stable
  mm/mem_cgroup_iter: Always assign iter->last_visited under rcu
  mm/mem_cgroup_iter: Provide _iter_invalidate() the dying memcg as an
argument
  mm/mem_cgroup_iter: NULL-ify 'last_visited' for invalidated iterators
  mm/mem_cgroup_iter: Don't bother checking 'dead_count' anymore
  mm/mem_cgroup_iter: Cleanup mem_cgroup_iter_load()
  mm/mem_cgroup_iter: Drop dead_count related infrastructure

 mm/memcontrol.c | 201 ++--
 1 file changed, 143 insertions(+), 58 deletions(-)

-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v3 1/9] Revert "mm/memcg: fix css_tryget(), css_put() imbalance"

2021-02-24 Thread Konstantin Khorenko
This reverts commit 5f351790d598bbf014441a86e7081972086de61b.

We are going to get rid of seqlock 'iter->last_visited_lock',
so reverting the patch.

https://jira.sw.ru/browse/PSBM-123655
Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e0a430908138..e5c5f64d6bb6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1581,7 +1581,7 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 struct mem_cgroup *root,
 int *sequence)
 {
-   struct mem_cgroup *position;
+   struct mem_cgroup *position = NULL;
unsigned seq;
 
/*
@@ -1594,7 +1594,6 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 */
*sequence = atomic_read(&root->dead_count);
 retry:
-   position = NULL;
seq = read_seqbegin(&iter->last_visited_lock);
if (iter->last_dead_count == *sequence) {
position = READ_ONCE(iter->last_visited);
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v3 4/9] mm/mem_cgroup_iter: Always assign iter->last_visited under rcu

2021-02-24 Thread Konstantin Khorenko
It's quite strange to have an rcu section in mem_cgroup_iter()
but not to use rcu_dereference()/rcu_assign_pointer() for the pointers
being protected.

We plan to access/assign '->last_visited' during iterator invalidation,
so we'll need the protection there anyway.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8040f09425bf..d0251d27de00 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -194,8 +194,18 @@ struct mem_cgroup_reclaim_iter {
/*
 * last scanned hierarchy member. Valid only if last_dead_count
 * matches memcg->dead_count of the hierarchy root group.
+*
+* Memory pointed by 'last_visited' is freed not earlier than
+* one rcu period after we accessed it:
+*   cgroup_offline_fn()
+*offline_css()
+*list_del_rcu()
+*dput()
+*...
+* cgroup_diput()
+*  call_rcu(&cgrp->rcu_head, cgroup_free_rcu)
 */
-   struct mem_cgroup *last_visited;
+   struct mem_cgroup __rcu *last_visited;
unsigned long last_dead_count;
 
/* scan generation, increased every round-trip */
@@ -1591,8 +1601,7 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 */
*sequence = atomic_read(&root->dead_count);
if (iter->last_dead_count == *sequence) {
-   smp_rmb();
-   position = iter->last_visited;
+   position = rcu_dereference(iter->last_visited);
 
/*
 * We cannot take a reference to root because we might race
@@ -1620,8 +1629,7 @@ static void mem_cgroup_iter_update(struct 
mem_cgroup_reclaim_iter *iter,
 * don't lose destruction events in between.  We could have
 * raced with the destruction of @new_position after all.
 */
-   iter->last_visited = new_position;
-   smp_wmb();
+   rcu_assign_pointer(iter->last_visited, new_position);
iter->last_dead_count = sequence;
 
/* root reference counting symmetric to mem_cgroup_iter_load */
@@ -1681,7 +1689,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
*root,
mz = mem_cgroup_zoneinfo(root, nid, zid);
iter = &mz->reclaim_iter[reclaim->priority];
if (prev && reclaim->generation != iter->generation) {
-   iter->last_visited = NULL;
+   rcu_assign_pointer(iter->last_visited, NULL);
goto out_unlock;
}
 
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v3 2/9] Revert "mm/memcg: use seqlock to protect reclaim_iter updates"

2021-02-24 Thread Konstantin Khorenko
This reverts commit 5a2d13cf16faedb8a2c318d50cca71d74d2be264.

We are going to make 'iter->last_visited' always valid so that the
'iter->last_dead_count' vs 'root->dead_count' verification can be
skipped; then there is no need to update 'last_visited' and
'last_dead_count' consistently (we'll remove the 'iter->last_dead_count'
field altogether), and thus the seqlock can be dropped.

https://jira.sw.ru/browse/PSBM-123655
Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 18 +++---
 1 file changed, 3 insertions(+), 15 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e5c5f64d6bb6..d3a35a13ae4d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -197,7 +197,6 @@ struct mem_cgroup_reclaim_iter {
 */
struct mem_cgroup *last_visited;
unsigned long last_dead_count;
-   seqlock_t last_visited_lock;
 
/* scan generation, increased every round-trip */
unsigned int generation;
@@ -1582,8 +1581,6 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 int *sequence)
 {
struct mem_cgroup *position = NULL;
-   unsigned seq;
-
/*
 * A cgroup destruction happens in two stages: offlining and
 * release.  They are separated by a RCU grace period.
@@ -1593,13 +1590,9 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter 
*iter,
 * released, tryget will fail if we lost the race.
 */
*sequence = atomic_read(&root->dead_count);
-retry:
-   seq = read_seqbegin(&iter->last_visited_lock);
if (iter->last_dead_count == *sequence) {
-   position = READ_ONCE(iter->last_visited);
-
-   if (read_seqretry(&iter->last_visited_lock, seq))
-   goto retry;
+   smp_rmb();
+   position = iter->last_visited;
 
/*
 * We cannot take a reference to root because we might race
@@ -1630,10 +1623,9 @@ static void mem_cgroup_iter_update(struct 
mem_cgroup_reclaim_iter *iter,
 * don't lose destruction events in between.  We could have
 * raced with the destruction of @new_position after all.
 */
-   write_seqlock(&iter->last_visited_lock);
iter->last_visited = new_position;
+   smp_wmb();
iter->last_dead_count = sequence;
-   write_sequnlock(&iter->last_visited_lock);
 }
 
 /**
@@ -6589,15 +6581,11 @@ static int alloc_mem_cgroup_per_zone_info(struct 
mem_cgroup *memcg, int node)
return 1;
 
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-   int i;
-
mz = &pn->zoneinfo[zone];
lruvec_init(&mz->lruvec);
mz->usage_in_excess = 0;
mz->on_tree = false;
mz->memcg = memcg;
-   for (i = 0; i < ARRAY_SIZE(mz->reclaim_iter); i++)
-   seqlock_init(&mz->reclaim_iter[i].last_visited_lock);
}
memcg->info.nodeinfo[node] = pn;
return 0;
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v3 3/9] mm/mem_cgroup_iter: Make 'iter->last_visited' a bit more stable

2021-02-24 Thread Konstantin Khorenko
In fact this patch does not change the logic, but after it
we can state that iter->last_visited _always_ contains a valid
pointer until the iter is "break-ed".

Why? Because 'last_visited' is always assigned in _update()
to a memcg which has passed css_tryget(), so its css won't
ever be offlined (and moreover - destroyed) until we css_put() it.

And if we now call css_put() only after iter->last_visited has
been assigned the new cgroup, the only case when 'last_visited'
may contain an invalid entry is a "break-ed" mem_cgroup_iter().

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d3a35a13ae4d..8040f09425bf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1614,9 +1614,6 @@ static void mem_cgroup_iter_update(struct 
mem_cgroup_reclaim_iter *iter,
   struct mem_cgroup *root,
   int sequence)
 {
-   /* root reference counting symmetric to mem_cgroup_iter_load */
-   if (last_visited && last_visited != root)
-   css_put(&last_visited->css);
/*
 * We store the sequence count from the time @last_visited was
 * loaded successfully instead of rereading it here so that we
@@ -1626,6 +1623,10 @@ static void mem_cgroup_iter_update(struct 
mem_cgroup_reclaim_iter *iter,
iter->last_visited = new_position;
smp_wmb();
iter->last_dead_count = sequence;
+
+   /* root reference counting symmetric to mem_cgroup_iter_load */
+   if (last_visited && last_visited != root)
+   css_put(&last_visited->css);
 }
 
 /**
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v3 6/9] mm/mem_cgroup_iter: NULL-ify 'last_visited' for invalidated iterators

2021-02-24 Thread Konstantin Khorenko
Our target is to invalidate only those iterators which have our
dying memcg as 'last_visited' and put NULL there instead.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 136 ++--
 1 file changed, 121 insertions(+), 15 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4c9db4544d0c..5b4bb633d745 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -192,18 +192,11 @@ struct mem_cgroup_stat2_cpu {
 
 struct mem_cgroup_reclaim_iter {
/*
-* last scanned hierarchy member. Valid only if last_dead_count
-* matches memcg->dead_count of the hierarchy root group.
+* Last scanned hierarchy member.
+* If stored memcg is destroyed, the field is wiped.
 *
-* Memory pointed by 'last_visited' is freed not earlier than
-* one rcu period after we accessed it:
-*   cgroup_offline_fn()
-*offline_css()
-*list_del_rcu()
-*dput()
-*...
-* cgroup_diput()
-*  call_rcu(&cgrp->rcu_head, cgroup_free_rcu)
+* Check comment in mem_cgroup_iter() for 'last_visited'
+* protection scheme.
 */
struct mem_cgroup __rcu *last_visited;
unsigned long last_dead_count;
@@ -1578,6 +1571,66 @@ static struct mem_cgroup *__mem_cgroup_iter_next(struct 
mem_cgroup *root,
 static void mem_cgroup_iter_invalidate(struct mem_cgroup *root,
   struct mem_cgroup *dead_memcg)
 {
+   struct mem_cgroup_reclaim_iter *iter;
+   struct mem_cgroup_per_zone *mz;
+   struct mem_cgroup *pos;
+   int zone, node, i;
+
+   /*
+* When a group in the hierarchy below root is destroyed,
+* the hierarchy iterator can no longer be trusted iff
+* iter->last_visited contains the cgroup being destroyed.
+* Check if we get this unlikely case and invalidate the iterator
+* if so.
+*
+* Note, only "break-ed" iterators can store iter->last_visited
+* == dead_memcg because normally 'last_visited' is assigned
+* in mem_cgroup_iter_update() and 'new_position' is just after
+* css_tryget() there (ref inc-ed in __mem_cgroup_iter_next())
+* and thus cgroup is not offlined yet.
+*
+* mem_cgroup_iter_break() in its turn puts memcg's css but does
+* not wipe it from iter->last_visited.
+*
+* Q: Why dying memcg (dead_memcg) cannot get into
+*iter->last_visited a bit later after we wipe it here?
+* A: Because up to the moment of the current function execution
+*css_tryget() is guaranteed to fail on 'dead_memcg'.
+*
+* Q: Why don't we need rcu_read_lock()/unlock() wrap for this
+*cycle?
+* A: We must invalidate iter only in case it contains
+*'dead_memcg' in '->last_visited'. While we are running
+*here css_tryget() is guaranteed to fail on 'dead_memcg',
+*so any mem_cgroup_iter() started after this function is
+*executed will not get 'dead_memcg' as a result of
+*mem_cgroup_iter_load().
+*And thus any mem_cgroup_iter_update() we might race with -
+*will never write 'dead_memcg' in '->last_visited'.
+*It might write some alive cgroup pointer - true, but not
+*'dead_memcg'.
+*/
+   for_each_node(node) {
+   for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+   mz = mem_cgroup_zoneinfo(root, node, zone);
+
+   for (i = 0; i < ARRAY_SIZE(mz->reclaim_iter); i++) {
+   iter = &mz->reclaim_iter[i];
+
+   pos = rcu_access_pointer(iter->last_visited);
+   /*
+* It's OK to race with mem_cgroup_iter_update()
+* here because it cannot write new
+* position == dead_memcg as
+* css_tryget() for it should fail already.
+*/
+   if (pos == dead_memcg)
+   rcu_assign_pointer(iter->last_visited,
+  NULL);
+   }
+   }
+   }
+
/*
 * When a group in the hierarchy below root is destroyed, the
 * hierarchy iterator can no longer be trusted since it might
@@ -1625,10 +1678,9 @@ static void mem_cgroup_iter_update(struct 
mem_cgroup_reclaim_iter *iter,
   int sequence)
 {

[Devel] [PATCH rh7 v3 5/9] mm/mem_cgroup_iter: Provide _iter_invalidate() the dying memcg as an argument

2021-02-24 Thread Konstantin Khorenko
It will be used by the next patches, which search for this pointer
stored in iterators.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d0251d27de00..4c9db4544d0c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1575,7 +1575,8 @@ static struct mem_cgroup *__mem_cgroup_iter_next(struct 
mem_cgroup *root,
return NULL;
 }
 
-static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
+static void mem_cgroup_iter_invalidate(struct mem_cgroup *root,
+  struct mem_cgroup *dead_memcg)
 {
/*
 * When a group in the hierarchy below root is destroyed, the
@@ -6872,14 +6873,14 @@ static void 
mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
struct mem_cgroup *parent = memcg;
 
while ((parent = parent_mem_cgroup(parent)))
-   mem_cgroup_iter_invalidate(parent);
+   mem_cgroup_iter_invalidate(parent, memcg);
 
/*
 * if the root memcg is not hierarchical we have to check it
 * explicitely.
 */
if (!root_mem_cgroup->use_hierarchy)
-   mem_cgroup_iter_invalidate(root_mem_cgroup);
+   mem_cgroup_iter_invalidate(root_mem_cgroup, memcg);
 }
 
 static void mem_cgroup_free_all(struct mem_cgroup *memcg)
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v3 7/9] mm/mem_cgroup_iter: Don't bother checking 'dead_count' anymore

2021-02-24 Thread Konstantin Khorenko
As we've enhanced mem_cgroup_iter_invalidate() to NULL-ify
'last_visited' if it stored a dying cgroup, we can be sure
iter->last_visited always contains a valid pointer to a memcg
(or NULL).

So just skip the extra check of iter->last_dead_count vs
root->dead_count; it's not needed anymore.

Note: the patch is prepared as small as possible - for review simplicity.
Cleanup patch will follow.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5b4bb633d745..96fdce58eacc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1653,8 +1653,6 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 * offlining.  The RCU lock ensures the object won't be
 * released, tryget will fail if we lost the race.
 */
-   *sequence = atomic_read(&root->dead_count);
-   if (iter->last_dead_count == *sequence) {
position = rcu_dereference(iter->last_visited);
 
/*
@@ -1667,7 +1665,6 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
!css_tryget(&position->css))
 
position = NULL;
-   }
return position;
 }
 
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v3 8/9] mm/mem_cgroup_iter: Cleanup mem_cgroup_iter_load()

2021-02-24 Thread Konstantin Khorenko
No functional changes here.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 21 ++---
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 96fdce58eacc..3613c01d6bd2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1653,18 +1653,17 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter 
*iter,
 * offlining.  The RCU lock ensures the object won't be
 * released, tryget will fail if we lost the race.
 */
-   position = rcu_dereference(iter->last_visited);
+   position = rcu_dereference(iter->last_visited);
+
+   /*
+* We cannot take a reference to root because we might race
+* with root removal and returning NULL would end up in
+* an endless loop on the iterator user level when root
+* would be returned all the time.
+*/
+   if (position && position != root && !css_tryget(&position->css))
+   position = NULL;
 
-   /*
-* We cannot take a reference to root because we might race
-* with root removal and returning NULL would end up in
-* an endless loop on the iterator user level when root
-* would be returned all the time.
-   */
-   if (position && position != root &&
-   !css_tryget(&position->css))
-
-   position = NULL;
return position;
 }
 
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v3 9/9] mm/mem_cgroup_iter: Drop dead_count related infrastructure

2021-02-24 Thread Konstantin Khorenko
As we now have a stable and reliable iter->last_visited,
we no longer need to save/compare the number of destroyed cgroups.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 22 --
 1 file changed, 4 insertions(+), 18 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3613c01d6bd2..c89973c82deb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -199,7 +199,6 @@ struct mem_cgroup_reclaim_iter {
 * protection scheme.
 */
struct mem_cgroup __rcu *last_visited;
-   unsigned long last_dead_count;
 
/* scan generation, increased every round-trip */
unsigned int generation;
@@ -405,7 +404,6 @@ struct mem_cgroup {
spinlock_t pcp_counter_lock;
atomic_long_t   oom;
 
-   atomic_tdead_count;
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
struct tcp_memcontrol tcp_mem;
struct udp_memcontrol udp_mem;
@@ -1630,19 +1628,11 @@ static void mem_cgroup_iter_invalidate(struct 
mem_cgroup *root,
}
}
}
-
-   /*
-* When a group in the hierarchy below root is destroyed, the
-* hierarchy iterator can no longer be trusted since it might
-* have pointed to the destroyed group.  Invalidate it.
-*/
-   atomic_inc(&root->dead_count);
 }
 
 static struct mem_cgroup *
 mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
-struct mem_cgroup *root,
-int *sequence)
+struct mem_cgroup *root)
 {
struct mem_cgroup *position = NULL;
/*
@@ -1670,8 +1660,7 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
   struct mem_cgroup *last_visited,
   struct mem_cgroup *new_position,
-  struct mem_cgroup *root,
-  int sequence)
+  struct mem_cgroup *root)
 {
/*
 * The position saved in 'last_visited' is always valid.
@@ -1679,7 +1668,6 @@ static void mem_cgroup_iter_update(struct 
mem_cgroup_reclaim_iter *iter,
 * 'last_visited' is NULLed.
 */
rcu_assign_pointer(iter->last_visited, new_position);
-   iter->last_dead_count = sequence;
 
/* root reference counting symmetric to mem_cgroup_iter_load */
if (last_visited && last_visited != root)
@@ -1781,7 +1769,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
*root,
rcu_read_lock_sched();
while (!memcg) {
struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
-   int uninitialized_var(seq);
 
if (reclaim) {
int nid = zone_to_nid(reclaim->zone);
@@ -1795,14 +1782,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup 
*root,
goto out_unlock;
}
 
-   last_visited = mem_cgroup_iter_load(iter, root, &seq);
+   last_visited = mem_cgroup_iter_load(iter, root);
}
 
memcg = __mem_cgroup_iter_next(root, last_visited);
 
if (reclaim) {
-   mem_cgroup_iter_update(iter, last_visited, memcg, root,
-   seq);
+   mem_cgroup_iter_update(iter, last_visited, memcg, root);
 
if (!memcg)
iter->generation++;
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v4 0/9] mm/mem_cgroup_iter: Reduce the number of iterator restarts upon cgroup removals

2021-02-24 Thread Konstantin Khorenko
Many thanks to Kirill Tkhai for his bright ideas and review!

Problem description from the user point of view:
  * the Node is slow
  * the Node has a lot of free RAM
  * the Node has a lot of swapin/swapout
  * kswapd is always running

Problem in a nutshell from the technical point of view:
  * kswapd is looping in shrink_zone() inside the loop
  do {} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
(and never goes through the outer loop)
  * there are quite a number of memory cgroups on the Node (~1000)
  * some cgroups are hard to reclaim (reclaim may take ~3 seconds),
this is because of a very busy disk due to permanent swapin/swapout
  * mem_cgroup_iter() does not succeed in scanning all cgroups
in a row, it restarts from the root cgroup time after
time (after a different number of cgroups scanned)

Q: Why does mem_cgroup_iter() restart from the root memcg?
A: Because it is invalidated once some memory cgroup is
   destroyed on the Node.
   Note: ANY memory cgroup destruction on the Node leads to an
   iter restart.

The following patchset solves this problem in the following way:
there is no need to restart the iter until we see that the iter's
position is exactly the memory cgroup being destroyed.

The patchset ensures iter->last_visited is NULL-ified on
invalidation, and thus the iter restarts only in the unlikely case
when it points to the memcg being destroyed.

https://jira.sw.ru/browse/PSBM-123655

v2 changes:
 - reverted 2 patches in this code which were focused on synchronizing
   updates of iter->last_visited and ->last_dead_count
   (as we are getting rid of iter->last_dead_count altogether)
 - use rcu primitives to access iter->last_visited

v3 changes:
 - more comments explaining the locking scheme
 - use rcu_read_{lock,unlock}_sched() in mem_cgroup_iter()
   for synchronization with the iterator invalidation func
 - do not wrap the iterator invalidation func in rcu_read_{lock/unlock}()
   as it protects nothing

v4 changes:
 - extended the comment explaining why the iter invalidation function
   must see all pointers to the dying memcg and why no pointer to it
   can be written later

Konstantin Khorenko (9):
  Revert "mm/memcg: fix css_tryget(),css_put() imbalance"
  Revert "mm/memcg: use seqlock to protect reclaim_iter updates"
  mm/mem_cgroup_iter: Make 'iter->last_visited' a bit more stable
  mm/mem_cgroup_iter: Always assign iter->last_visited under rcu
  mm/mem_cgroup_iter: Provide _iter_invalidate() the dying memcg as an
argument
  mm/mem_cgroup_iter: NULL-ify 'last_visited' for invalidated iterators
  mm/mem_cgroup_iter: Don't bother checking 'dead_count' anymore
  mm/mem_cgroup_iter: Cleanup mem_cgroup_iter_load()
  mm/mem_cgroup_iter: Drop dead_count related infrastructure

 mm/memcontrol.c | 208 ++--
 1 file changed, 150 insertions(+), 58 deletions(-)

-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v4 1/9] Revert "mm/memcg: fix css_tryget(), css_put() imbalance"

2021-02-24 Thread Konstantin Khorenko
This reverts commit 5f351790d598bbf014441a86e7081972086de61b.

We are going to get rid of seqlock 'iter->last_visited_lock',
so reverting the patch.

https://jira.sw.ru/browse/PSBM-123655
Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e0a430908138..e5c5f64d6bb6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1581,7 +1581,7 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 struct mem_cgroup *root,
 int *sequence)
 {
-   struct mem_cgroup *position;
+   struct mem_cgroup *position = NULL;
unsigned seq;
 
/*
@@ -1594,7 +1594,6 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 */
*sequence = atomic_read(&root->dead_count);
 retry:
-   position = NULL;
seq = read_seqbegin(&iter->last_visited_lock);
if (iter->last_dead_count == *sequence) {
position = READ_ONCE(iter->last_visited);
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v4 2/9] Revert "mm/memcg: use seqlock to protect reclaim_iter updates"

2021-02-24 Thread Konstantin Khorenko
This reverts commit 5a2d13cf16faedb8a2c318d50cca71d74d2be264.

We are going to make 'iter->last_visited' always valid so that the
'iter->last_dead_count' vs 'root->dead_count' verification can be
skipped; then there is no need to update 'last_visited' and
'last_dead_count' consistently (we'll remove the 'iter->last_dead_count'
field altogether), and thus the seqlock can be dropped.

https://jira.sw.ru/browse/PSBM-123655
Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 18 +++---
 1 file changed, 3 insertions(+), 15 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e5c5f64d6bb6..d3a35a13ae4d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -197,7 +197,6 @@ struct mem_cgroup_reclaim_iter {
 */
struct mem_cgroup *last_visited;
unsigned long last_dead_count;
-   seqlock_t last_visited_lock;
 
/* scan generation, increased every round-trip */
unsigned int generation;
@@ -1582,8 +1581,6 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 int *sequence)
 {
struct mem_cgroup *position = NULL;
-   unsigned seq;
-
/*
 * A cgroup destruction happens in two stages: offlining and
 * release.  They are separated by a RCU grace period.
@@ -1593,13 +1590,9 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter 
*iter,
 * released, tryget will fail if we lost the race.
 */
*sequence = atomic_read(&root->dead_count);
-retry:
-   seq = read_seqbegin(&iter->last_visited_lock);
if (iter->last_dead_count == *sequence) {
-   position = READ_ONCE(iter->last_visited);
-
-   if (read_seqretry(&iter->last_visited_lock, seq))
-   goto retry;
+   smp_rmb();
+   position = iter->last_visited;
 
/*
 * We cannot take a reference to root because we might race
@@ -1630,10 +1623,9 @@ static void mem_cgroup_iter_update(struct 
mem_cgroup_reclaim_iter *iter,
 * don't lose destruction events in between.  We could have
 * raced with the destruction of @new_position after all.
 */
-   write_seqlock(&iter->last_visited_lock);
iter->last_visited = new_position;
+   smp_wmb();
iter->last_dead_count = sequence;
-   write_sequnlock(&iter->last_visited_lock);
 }
 
 /**
@@ -6589,15 +6581,11 @@ static int alloc_mem_cgroup_per_zone_info(struct 
mem_cgroup *memcg, int node)
return 1;
 
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-   int i;
-
mz = &pn->zoneinfo[zone];
lruvec_init(&mz->lruvec);
mz->usage_in_excess = 0;
mz->on_tree = false;
mz->memcg = memcg;
-   for (i = 0; i < ARRAY_SIZE(mz->reclaim_iter); i++)
-   seqlock_init(&mz->reclaim_iter[i].last_visited_lock);
}
memcg->info.nodeinfo[node] = pn;
return 0;
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v4 3/9] mm/mem_cgroup_iter: Make 'iter->last_visited' a bit more stable

2021-02-24 Thread Konstantin Khorenko
In fact this patch does not change the logic, but after it
we can state that iter->last_visited _always_ contains a valid
pointer until the iter is "break-ed".

Why? Because 'last_visited' is always assigned in _update()
to a memcg which has passed css_tryget(), so its css won't
ever be offlined (and moreover - destroyed) until we css_put() it.

And if we now call css_put() only after iter->last_visited has
been assigned the new cgroup, the only case when 'last_visited'
may contain an invalid entry is a "break-ed" mem_cgroup_iter().

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d3a35a13ae4d..8040f09425bf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1614,9 +1614,6 @@ static void mem_cgroup_iter_update(struct 
mem_cgroup_reclaim_iter *iter,
   struct mem_cgroup *root,
   int sequence)
 {
-   /* root reference counting symmetric to mem_cgroup_iter_load */
-   if (last_visited && last_visited != root)
-   css_put(&last_visited->css);
/*
 * We store the sequence count from the time @last_visited was
 * loaded successfully instead of rereading it here so that we
@@ -1626,6 +1623,10 @@ static void mem_cgroup_iter_update(struct 
mem_cgroup_reclaim_iter *iter,
iter->last_visited = new_position;
smp_wmb();
iter->last_dead_count = sequence;
+
+   /* root reference counting symmetric to mem_cgroup_iter_load */
+   if (last_visited && last_visited != root)
+   css_put(&last_visited->css);
 }
 
 /**
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v4 5/9] mm/mem_cgroup_iter: Provide _iter_invalidate() the dying memcg as an argument

2021-02-24 Thread Konstantin Khorenko
It will be used by the next patches, which search for this pointer
stored in iterators.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d0251d27de00..4c9db4544d0c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1575,7 +1575,8 @@ static struct mem_cgroup *__mem_cgroup_iter_next(struct 
mem_cgroup *root,
return NULL;
 }
 
-static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
+static void mem_cgroup_iter_invalidate(struct mem_cgroup *root,
+  struct mem_cgroup *dead_memcg)
 {
/*
 * When a group in the hierarchy below root is destroyed, the
@@ -6872,14 +6873,14 @@ static void 
mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
struct mem_cgroup *parent = memcg;
 
while ((parent = parent_mem_cgroup(parent)))
-   mem_cgroup_iter_invalidate(parent);
+   mem_cgroup_iter_invalidate(parent, memcg);
 
/*
 * if the root memcg is not hierarchical we have to check it
 * explicitely.
 */
if (!root_mem_cgroup->use_hierarchy)
-   mem_cgroup_iter_invalidate(root_mem_cgroup);
+   mem_cgroup_iter_invalidate(root_mem_cgroup, memcg);
 }
 
 static void mem_cgroup_free_all(struct mem_cgroup *memcg)
-- 
2.24.3

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH rh7 v4 6/9] mm/mem_cgroup_iter: NULL-ify 'last_visited' for invalidated iterators

2021-02-24 Thread Konstantin Khorenko
Our target is to invalidate only those iterators which have our
dying memcg as 'last_visited' and put NULL there instead.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 143 +++-
 1 file changed, 128 insertions(+), 15 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4c9db4544d0c..c17fa6aaa7ad 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -192,18 +192,11 @@ struct mem_cgroup_stat2_cpu {
 
 struct mem_cgroup_reclaim_iter {
/*
-* last scanned hierarchy member. Valid only if last_dead_count
-* matches memcg->dead_count of the hierarchy root group.
+* Last scanned hierarchy member.
+* If stored memcg is destroyed, the field is wiped.
 *
-* Memory pointed by 'last_visited' is freed not earlier than
-* one rcu period after we accessed it:
-*   cgroup_offline_fn()
-*offline_css()
-*list_del_rcu()
-*dput()
-*...
-* cgroup_diput()
-*  call_rcu(&cgrp->rcu_head, cgroup_free_rcu)
+* Check comment in mem_cgroup_iter() for 'last_visited'
+* protection scheme.
 */
struct mem_cgroup __rcu *last_visited;
unsigned long last_dead_count;
@@ -1578,6 +1571,66 @@ static struct mem_cgroup *__mem_cgroup_iter_next(struct 
mem_cgroup *root,
 static void mem_cgroup_iter_invalidate(struct mem_cgroup *root,
   struct mem_cgroup *dead_memcg)
 {
+   struct mem_cgroup_reclaim_iter *iter;
+   struct mem_cgroup_per_zone *mz;
+   struct mem_cgroup *pos;
+   int zone, node, i;
+
+   /*
+* When a group in the hierarchy below root is destroyed,
+* the hierarchy iterator can no longer be trusted iff
+* iter->last_visited contains the cgroup being destroyed.
+* Check if we get this unlikely case and invalidate the iterator
+* if so.
+*
+* Note, only "break-ed" iterators can store iter->last_visited
+* == dead_memcg because normally 'last_visited' is assigned
+* in mem_cgroup_iter_update() and 'new_position' is just after
+* css_tryget() there (ref inc-ed in __mem_cgroup_iter_next())
+* and thus cgroup is not offlined yet.
+*
+* mem_cgroup_iter_break() in its turn puts memcg's css but does
+* not wipe it from iter->last_visited.
+*
+* Q: Why dying memcg (dead_memcg) cannot get into
+*iter->last_visited a bit later after we wipe it here?
+* A: Because up to the moment of the current function execution
+*css_tryget() is guaranteed to fail on 'dead_memcg'.
+*
+* Q: Why don't we need rcu_read_lock()/unlock() wrap for this
+*cycle?
+* A: We must invalidate iter only in case it contains
+*'dead_memcg' in '->last_visited'. While we are running
+*here css_tryget() is guaranteed to fail on 'dead_memcg',
+*so any mem_cgroup_iter() started after this function is
+*executed will not get 'dead_memcg' as a result of
+*mem_cgroup_iter_load().
+*And thus any mem_cgroup_iter_update() we might race with -
+*will never write 'dead_memcg' in '->last_visited'.
+*It might write some alive cgroup pointer - true, but not
+*'dead_memcg'.
+*/
+   for_each_node(node) {
+   for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+   mz = mem_cgroup_zoneinfo(root, node, zone);
+
+   for (i = 0; i < ARRAY_SIZE(mz->reclaim_iter); i++) {
+   iter = &mz->reclaim_iter[i];
+
+   pos = rcu_access_pointer(iter->last_visited);
+   /*
+* It's OK to race with mem_cgroup_iter_update()
+* here because it cannot write new
+* position == dead_memcg as
+* css_tryget() for it should fail already.
+*/
+   if (pos == dead_memcg)
+   rcu_assign_pointer(iter->last_visited,
+  NULL);
+   }
+   }
+   }
+
/*
 * When a group in the hierarchy below root is destroyed, the
 * hierarchy iterator can no longer be trusted since it might
@@ -1625,10 +1678,9 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
   int sequence)
 {

[Devel] [PATCH rh7 v4 4/9] mm/mem_cgroup_iter: Always assign iter->last_visited under rcu

2021-02-24 Thread Konstantin Khorenko
It's quite strange to have an rcu section in mem_cgroup_iter()
but not use rcu_dereference()/rcu_assign_pointer() for the pointers
being protected.

We plan to access/assign '->last_visited' during iterator invalidation,
so we'll need the protection there anyway.
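
In short, the patch switches the pointer accesses to the standard RCU
accessors. A minimal sketch of the two sides (illustrative only, the
exact hunks are below):

    /* Writer side, mem_cgroup_iter_update(): publish the new position. */
    rcu_assign_pointer(iter->last_visited, new_position);

    /* Reader side, mem_cgroup_iter_load(), runs under rcu_read_lock(). */
    position = rcu_dereference(iter->last_visited);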

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8040f09425bf..d0251d27de00 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -194,8 +194,18 @@ struct mem_cgroup_reclaim_iter {
/*
 * last scanned hierarchy member. Valid only if last_dead_count
 * matches memcg->dead_count of the hierarchy root group.
+*
+* Memory pointed by 'last_visited' is freed not earlier than
+* one rcu period after we accessed it:
+*   cgroup_offline_fn()
+*offline_css()
+*list_del_rcu()
+*dput()
+*...
+* cgroup_diput()
+*  call_rcu(&cgrp->rcu_head, cgroup_free_rcu)
 */
-   struct mem_cgroup *last_visited;
+   struct mem_cgroup __rcu *last_visited;
unsigned long last_dead_count;
 
/* scan generation, increased every round-trip */
@@ -1591,8 +1601,7 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 */
*sequence = atomic_read(&root->dead_count);
if (iter->last_dead_count == *sequence) {
-   smp_rmb();
-   position = iter->last_visited;
+   position = rcu_dereference(iter->last_visited);
 
/*
 * We cannot take a reference to root because we might race
@@ -1620,8 +1629,7 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
 * don't lose destruction events in between.  We could have
 * raced with the destruction of @new_position after all.
 */
-   iter->last_visited = new_position;
-   smp_wmb();
+   rcu_assign_pointer(iter->last_visited, new_position);
iter->last_dead_count = sequence;
 
/* root reference counting symmetric to mem_cgroup_iter_load */
@@ -1681,7 +1689,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
mz = mem_cgroup_zoneinfo(root, nid, zid);
iter = &mz->reclaim_iter[reclaim->priority];
if (prev && reclaim->generation != iter->generation) {
-   iter->last_visited = NULL;
+   rcu_assign_pointer(iter->last_visited, NULL);
goto out_unlock;
}
 
-- 
2.24.3



[Devel] [PATCH rh7 v4 7/9] mm/mem_cgroup_iter: Don't bother checking 'dead_count' anymore

2021-02-24 Thread Konstantin Khorenko
As we've enhanced mem_cgroup_iter_invalidate() to NULL-ify
'last_visited' if it stores a dying cgroup, we can be sure
iter->last_visited always contains a valid pointer to a memcg
(or NULL).

So just skip the extra check of iter->last_dead_count vs
root->dead_count; it's not needed anymore.

Note: the patch is kept as small as possible for review simplicity.
A cleanup patch will follow.
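
With the check gone, the load path reduces to the following (a condensed
sketch; the cleanup patch 8/9 finishes reshaping this function):

    position = rcu_dereference(iter->last_visited);
    if (position && position != root && !css_tryget(&position->css))
            position = NULL;    /* lost the race with offlining */
    return position;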

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c17fa6aaa7ad..cdb7f6f4c994 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1653,8 +1653,6 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 * offlining.  The RCU lock ensures the object won't be
 * released, tryget will fail if we lost the race.
 */
-   *sequence = atomic_read(&root->dead_count);
-   if (iter->last_dead_count == *sequence) {
position = rcu_dereference(iter->last_visited);
 
/*
@@ -1667,7 +1665,6 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
!css_tryget(&position->css))
 
position = NULL;
-   }
return position;
 }
 
-- 
2.24.3



[Devel] [PATCH rh7 v4 8/9] mm/mem_cgroup_iter: Cleanup mem_cgroup_iter_load()

2021-02-24 Thread Konstantin Khorenko
No functional changes here.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 21 ++---
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cdb7f6f4c994..3e8607c62035 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1653,18 +1653,17 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 * offlining.  The RCU lock ensures the object won't be
 * released, tryget will fail if we lost the race.
 */
-   position = rcu_dereference(iter->last_visited);
+   position = rcu_dereference(iter->last_visited);
+
+   /*
+* We cannot take a reference to root because we might race
+* with root removal and returning NULL would end up in
+* an endless loop on the iterator user level when root
+* would be returned all the time.
+*/
+   if (position && position != root && !css_tryget(&position->css))
+   position = NULL;
 
-   /*
-* We cannot take a reference to root because we might race
-* with root removal and returning NULL would end up in
-* an endless loop on the iterator user level when root
-* would be returned all the time.
-   */
-   if (position && position != root &&
-   !css_tryget(&position->css))
-
-   position = NULL;
return position;
 }
 
-- 
2.24.3



[Devel] [PATCH rh7 v4 9/9] mm/mem_cgroup_iter: Drop dead_count related infrastructure

2021-02-24 Thread Konstantin Khorenko
As we now have a stable and reliable iter->last_visited, we don't
need to save/compare the number of destroyed cgroups.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 22 --
 1 file changed, 4 insertions(+), 18 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3e8607c62035..ca6ffe6fcafc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -199,7 +199,6 @@ struct mem_cgroup_reclaim_iter {
 * protection scheme.
 */
struct mem_cgroup __rcu *last_visited;
-   unsigned long last_dead_count;
 
/* scan generation, increased every round-trip */
unsigned int generation;
@@ -405,7 +404,6 @@ struct mem_cgroup {
spinlock_t pcp_counter_lock;
atomic_long_t   oom;
 
-   atomic_tdead_count;
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
struct tcp_memcontrol tcp_mem;
struct udp_memcontrol udp_mem;
@@ -1630,19 +1628,11 @@ static void mem_cgroup_iter_invalidate(struct mem_cgroup *root,
}
}
}
-
-   /*
-* When a group in the hierarchy below root is destroyed, the
-* hierarchy iterator can no longer be trusted since it might
-* have pointed to the destroyed group.  Invalidate it.
-*/
-   atomic_inc(&root->dead_count);
 }
 
 static struct mem_cgroup *
 mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
-struct mem_cgroup *root,
-int *sequence)
+struct mem_cgroup *root)
 {
struct mem_cgroup *position = NULL;
/*
@@ -1670,8 +1660,7 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
   struct mem_cgroup *last_visited,
   struct mem_cgroup *new_position,
-  struct mem_cgroup *root,
-  int sequence)
+  struct mem_cgroup *root)
 {
/*
 * The position saved in 'last_visited' is always valid.
@@ -1679,7 +1668,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
 * 'last_visited' is NULLed.
 */
rcu_assign_pointer(iter->last_visited, new_position);
-   iter->last_dead_count = sequence;
 
/* root reference counting symmetric to mem_cgroup_iter_load */
if (last_visited && last_visited != root)
@@ -1788,7 +1776,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
rcu_read_lock_sched();
while (!memcg) {
struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
-   int uninitialized_var(seq);
 
if (reclaim) {
int nid = zone_to_nid(reclaim->zone);
@@ -1802,14 +1789,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
goto out_unlock;
}
 
-   last_visited = mem_cgroup_iter_load(iter, root, &seq);
+   last_visited = mem_cgroup_iter_load(iter, root);
}
 
memcg = __mem_cgroup_iter_next(root, last_visited);
 
if (reclaim) {
-   mem_cgroup_iter_update(iter, last_visited, memcg, root,
-   seq);
+   mem_cgroup_iter_update(iter, last_visited, memcg, root);
 
if (!memcg)
iter->generation++;
-- 
2.24.3



Re: [Devel] [PATCH rh7 v4 4/9] mm/mem_cgroup_iter: Always assign iter->last_visited under rcu

2021-02-26 Thread Konstantin Khorenko

On 02/26/2021 12:12 PM, Kirill Tkhai wrote:

On 24.02.2021 21:55, Konstantin Khorenko wrote:

It's quite strange to have an rcu section in mem_cgroup_iter()
but not use rcu_dereference()/rcu_assign_pointer() for the pointers
being protected.

We plan to access/assign '->last_visited' during iterator invalidation,
so we'll need the protection there anyway.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8040f09425bf..d0251d27de00 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -194,8 +194,18 @@ struct mem_cgroup_reclaim_iter {
/*
 * last scanned hierarchy member. Valid only if last_dead_count
 * matches memcg->dead_count of the hierarchy root group.
+*
+* Memory pointed by 'last_visited' is freed not earlier than
+* one rcu period after we accessed it:
+*   cgroup_offline_fn()
+*offline_css()
+*list_del_rcu()
+*dput()
+*...
+* cgroup_diput()
+*  call_rcu(&cgrp->rcu_head, cgroup_free_rcu)
 */
-   struct mem_cgroup *last_visited;
+   struct mem_cgroup __rcu *last_visited;
unsigned long last_dead_count;

/* scan generation, increased every round-trip */
@@ -1591,8 +1601,7 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 */
*sequence = atomic_read(&root->dead_count);
if (iter->last_dead_count == *sequence) {
-   smp_rmb();
-   position = iter->last_visited;
+   position = rcu_dereference(iter->last_visited);

/*
 * We cannot take a reference to root because we might race
@@ -1620,8 +1629,7 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
 * don't lose destruction events in between.  We could have
 * raced with the destruction of @new_position after all.
 */
-   iter->last_visited = new_position;
-   smp_wmb();


We can't remove the barriers in this patch; this makes the patch wrong.
We should remove the barriers somewhere in [9/9].


Ok, will resend.

Thank you very much!




+   rcu_assign_pointer(iter->last_visited, new_position);
iter->last_dead_count = sequence;

/* root reference counting symmetric to mem_cgroup_iter_load */
@@ -1681,7 +1689,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
mz = mem_cgroup_zoneinfo(root, nid, zid);
iter = &mz->reclaim_iter[reclaim->priority];
if (prev && reclaim->generation != iter->generation) {
-   iter->last_visited = NULL;
+   rcu_assign_pointer(iter->last_visited, NULL);
goto out_unlock;
}








Re: [Devel] [PATCH rh7 v4 4/9] mm/mem_cgroup_iter: Always assign iter->last_visited under rcu

2021-02-26 Thread Konstantin Khorenko




--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

On 02/26/2021 12:12 PM, Kirill Tkhai wrote:

On 24.02.2021 21:55, Konstantin Khorenko wrote:

It's quite strange to have an rcu section in mem_cgroup_iter()
but not use rcu_dereference()/rcu_assign_pointer() for the pointers
being protected.

We plan to access/assign '->last_visited' during iterator invalidation,
so we'll need the protection there anyway.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
---
 mm/memcontrol.c | 20 ++--
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8040f09425bf..d0251d27de00 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -194,8 +194,18 @@ struct mem_cgroup_reclaim_iter {
/*
 * last scanned hierarchy member. Valid only if last_dead_count
 * matches memcg->dead_count of the hierarchy root group.
+*
+* Memory pointed by 'last_visited' is freed not earlier than
+* one rcu period after we accessed it:
+*   cgroup_offline_fn()
+*offline_css()
+*list_del_rcu()
+*dput()
+*...
+* cgroup_diput()
+*  call_rcu(&cgrp->rcu_head, cgroup_free_rcu)
 */
-   struct mem_cgroup *last_visited;
+   struct mem_cgroup __rcu *last_visited;
unsigned long last_dead_count;

/* scan generation, increased every round-trip */
@@ -1591,8 +1601,7 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 */
*sequence = atomic_read(&root->dead_count);
if (iter->last_dead_count == *sequence) {
-   smp_rmb();
-   position = iter->last_visited;
+   position = rcu_dereference(iter->last_visited);

/*
 * We cannot take a reference to root because we might race
@@ -1620,8 +1629,7 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
 * don't lose destruction events in between.  We could have
 * raced with the destruction of @new_position after all.
 */
-   iter->last_visited = new_position;
-   smp_wmb();


We can't remove the barriers in this patch; this makes the patch wrong.
We should remove the barriers somewhere in [9/9].


JFYI: I've moved this not to [9/9], but to
"[PATCH rh7 v4 7/9] mm/mem_cgroup_iter: Don't bother checking 'dead_count' anymore"




+   rcu_assign_pointer(iter->last_visited, new_position);
iter->last_dead_count = sequence;

/* root reference counting symmetric to mem_cgroup_iter_load */
@@ -1681,7 +1689,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
mz = mem_cgroup_zoneinfo(root, nid, zid);
iter = &mz->reclaim_iter[reclaim->priority];
if (prev && reclaim->generation != iter->generation) {
-   iter->last_visited = NULL;
+   rcu_assign_pointer(iter->last_visited, NULL);
goto out_unlock;
}








[Devel] [PATCH rh7 v5 0/9] mm/mem_cgroup_iter: Reduce the number of iterator restarts upon cgroup removals

2021-02-26 Thread Konstantin Khorenko
Many thanks to Kirill Tkhai for his bright ideas and review!

Problem description from the user point of view:
  * the Node is slow
  * the Node has a lot of free RAM
  * the Node has a lot of swapin/swapout
  * kswapd is always running

Problem in a nutshell from the technical point of view:
  * kswapd is looping in shrink_zone() inside the loop
  do {} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
(and never goes through the outer loop; see the sketch below)
  * there are quite a number of memory cgroups on the Node (~1000)
  * some cgroups are hard to reclaim (reclaim may take ~3 seconds)
because the disk is very busy due to permanent swapin/swapout
  * mem_cgroup_iter() never succeeds in scanning all cgroups
in a row; it restarts from the root cgroup time after time
(after a different number of cgroups scanned)

Q: Why does mem_cgroup_iter() restart from the root memcg?
A: Because it is invalidated once some memory cgroup is
   destroyed on the Node.
   Note: destruction of ANY memory cgroup on the Node leads
   to an iter restart.
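
For illustration, the loop in question, condensed from the RHEL7-era
shrink_zone() in mm/vmscan.c (a sketch, details trimmed):

    static void shrink_zone(struct zone *zone, struct scan_control *sc)
    {
            struct mem_cgroup *root = sc->target_mem_cgroup;
            struct mem_cgroup_reclaim_cookie reclaim = {
                    .zone = zone,
                    .priority = sc->priority,
            };
            struct mem_cgroup *memcg;

            memcg = mem_cgroup_iter(root, NULL, &reclaim);
            do {
                    /* ... shrink the LRU lists of 'memcg' ... */
            } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
    }

With ~1000 cgroups, up to ~3 seconds per hard-to-reclaim cgroup and a
restart from the root on every cgroup destruction, kswapd effectively
never completes this inner loop.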

The following patchset solves this problem in the following way:
there is no need to restart the iter until we see that the iter's
position is exactly the memory cgroup being destroyed.

The patchset ensures that iter->last_visited is NULL-ified on
invalidation, and thus the iter restarts only in the unlikely case
when it points to the memcg being destroyed.
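
The core of the approach, condensed from patch 6/9 (the real code does
this for every reclaim iterator of every node/zone under the root):

    pos = rcu_access_pointer(iter->last_visited);
    if (pos == dead_memcg)
            rcu_assign_pointer(iter->last_visited, NULL);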

Testing: I've tested this patchset using a modified kernel which breaks
the memcg iterator in case of global reclaim with a probability of 2%.

3 kernels have been tested: "release", KASAN-only and "debug" kernels.
Each ran for 12 hours with no issues; from 12000 to 26000 races were
caught during this period (i.e. a dying memcg was found in some iterator
and wiped).

The testing scenario is documented in the jira issue.

https://jira.sw.ru/browse/PSBM-123655

v2 changes:
 - reverted 2 patches in this code which were focused on synchronizing
   updates of iter->last_visited and ->last_dead_count
   (as we are getting rid of iter->last_dead_count altogether)
 - use rcu primitives to access iter->last_visited

v3 changes:
 - more comments explaining the locking scheme
 - use rcu_read_{lock,unlock}_sched() in mem_cgroup_iter()
   for synchronization with the iterator invalidation func
 - do not use rcu_read_{lock/unlock}() wrap in iterator invalidation func
   as it protects nothing

v4 changes:
 - extended the comment on why the iter invalidation function must see
   all pointers to the dying memcg and why no pointer to it can be
   written later

v5 changes:
 - dropping of the barriers (rmb/wmb) is moved from patch [4/9] to
   "[PATCH rh7 v4 7/9] mm/mem_cgroup_iter: Don't bother checking
   'dead_count' anymore"; this keeps the code valid in between the
   4th and 7th patches.
 - no more changes; the resulting trees after v4 and v5 are applied
   are identical

Konstantin Khorenko (9):
  Revert "mm/memcg: fix css_tryget(),css_put() imbalance"
  Revert "mm/memcg: use seqlock to protect reclaim_iter updates"
  mm/mem_cgroup_iter: Make 'iter->last_visited' a bit more stable
  mm/mem_cgroup_iter: Always assign iter->last_visited under rcu
  mm/mem_cgroup_iter: Provide _iter_invalidate() the dying memcg as an
argument
  mm/mem_cgroup_iter: NULL-ify 'last_visited' for invalidated iterators
  mm/mem_cgroup_iter: Don't bother saving/checking 'dead_count' anymore
  mm/mem_cgroup_iter: Cleanup mem_cgroup_iter_load()
  mm/mem_cgroup_iter: Drop dead_count related infrastructure

 mm/memcontrol.c | 208 ++--
 1 file changed, 150 insertions(+), 58 deletions(-)

-- 
2.24.3



[Devel] [PATCH rh7 v5 1/9] Revert "mm/memcg: fix css_tryget(), css_put() imbalance"

2021-02-26 Thread Konstantin Khorenko
This reverts commit 5f351790d598bbf014441a86e7081972086de61b.

We are going to get rid of the seqlock 'iter->last_visited_lock',
so we revert this patch.

https://jira.sw.ru/browse/PSBM-123655
Signed-off-by: Konstantin Khorenko 
Reviewed-by: Kirill Tkhai 

---
 mm/memcontrol.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e0a430908138..e5c5f64d6bb6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1581,7 +1581,7 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 struct mem_cgroup *root,
 int *sequence)
 {
-   struct mem_cgroup *position;
+   struct mem_cgroup *position = NULL;
unsigned seq;
 
/*
@@ -1594,7 +1594,6 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 */
*sequence = atomic_read(&root->dead_count);
 retry:
-   position = NULL;
seq = read_seqbegin(&iter->last_visited_lock);
if (iter->last_dead_count == *sequence) {
position = READ_ONCE(iter->last_visited);
-- 
2.24.3



[Devel] [PATCH rh7 v5 2/9] Revert "mm/memcg: use seqlock to protect reclaim_iter updates"

2021-02-26 Thread Konstantin Khorenko
This reverts commit 5a2d13cf16faedb8a2c318d50cca71d74d2be264.

We are going to make 'iter->last_visited' always valid to skip
the verification of 'iter->last_dead_count' vs 'root->dead_count',
so there will be no need to update 'last_visited' and
'last_dead_count' consistently (we'll remove the 'iter->last_dead_count'
field altogether), thus dropping the seqlock.

https://jira.sw.ru/browse/PSBM-123655
Signed-off-by: Konstantin Khorenko 
Reviewed-by: Kirill Tkhai 
---
 mm/memcontrol.c | 18 +++---
 1 file changed, 3 insertions(+), 15 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e5c5f64d6bb6..d3a35a13ae4d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -197,7 +197,6 @@ struct mem_cgroup_reclaim_iter {
 */
struct mem_cgroup *last_visited;
unsigned long last_dead_count;
-   seqlock_t last_visited_lock;
 
/* scan generation, increased every round-trip */
unsigned int generation;
@@ -1582,8 +1581,6 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 int *sequence)
 {
struct mem_cgroup *position = NULL;
-   unsigned seq;
-
/*
 * A cgroup destruction happens in two stages: offlining and
 * release.  They are separated by a RCU grace period.
@@ -1593,13 +1590,9 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
 * released, tryget will fail if we lost the race.
 */
*sequence = atomic_read(&root->dead_count);
-retry:
-   seq = read_seqbegin(&iter->last_visited_lock);
if (iter->last_dead_count == *sequence) {
-   position = READ_ONCE(iter->last_visited);
-
-   if (read_seqretry(&iter->last_visited_lock, seq))
-   goto retry;
+   smp_rmb();
+   position = iter->last_visited;
 
/*
 * We cannot take a reference to root because we might race
@@ -1630,10 +1623,9 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
 * don't lose destruction events in between.  We could have
 * raced with the destruction of @new_position after all.
 */
-   write_seqlock(&iter->last_visited_lock);
iter->last_visited = new_position;
+   smp_wmb();
iter->last_dead_count = sequence;
-   write_sequnlock(&iter->last_visited_lock);
 }
 
 /**
@@ -6589,15 +6581,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
return 1;
 
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-   int i;
-
mz = &pn->zoneinfo[zone];
lruvec_init(&mz->lruvec);
mz->usage_in_excess = 0;
mz->on_tree = false;
mz->memcg = memcg;
-   for (i = 0; i < ARRAY_SIZE(mz->reclaim_iter); i++)
-   seqlock_init(&mz->reclaim_iter[i].last_visited_lock);
}
memcg->info.nodeinfo[node] = pn;
return 0;
-- 
2.24.3



[Devel] [PATCH rh7 v5 3/9] mm/mem_cgroup_iter: Make 'iter->last_visited' a bit more stable

2021-02-26 Thread Konstantin Khorenko
In fact this patch does not change the logic, but after it
we can state that iter->last_visited _always_ contains a valid
pointer until the iter is "break-ed".

Why? Because 'last_visited' is always assigned in _update()
to the memcg which has passed css_tryget(), so its css won't
ever be offlined (and moreover destroyed) until we css_put() it.

And if we now call css_put() only after iter->last_visited is
assigned a new cgroup, the only case when 'last_visited' may
contain an invalid entry is a "break-ed" mem_cgroup_iter().
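
In other words, _update() changes from put-then-publish to
publish-then-put (condensed from the diff below):

    /* Before: the old reference is dropped before the new position is
     * published, so 'last_visited' can briefly point to a memcg we have
     * already unpinned. */
    if (last_visited && last_visited != root)
            css_put(&last_visited->css);
    iter->last_visited = new_position;

    /* After: publish first, drop the old reference last. */
    iter->last_visited = new_position;
    if (last_visited && last_visited != root)
            css_put(&last_visited->css);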

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
Reviewed-by: Kirill Tkhai 
---
 mm/memcontrol.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d3a35a13ae4d..8040f09425bf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1614,9 +1614,6 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
   struct mem_cgroup *root,
   int sequence)
 {
-   /* root reference counting symmetric to mem_cgroup_iter_load */
-   if (last_visited && last_visited != root)
-   css_put(&last_visited->css);
/*
 * We store the sequence count from the time @last_visited was
 * loaded successfully instead of rereading it here so that we
@@ -1626,6 +1623,10 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
iter->last_visited = new_position;
smp_wmb();
iter->last_dead_count = sequence;
+
+   /* root reference counting symmetric to mem_cgroup_iter_load */
+   if (last_visited && last_visited != root)
+   css_put(&last_visited->css);
 }
 
 /**
-- 
2.24.3



[Devel] [PATCH rh7 v5 4/9] mm/mem_cgroup_iter: Always assign iter->last_visited under rcu

2021-02-26 Thread Konstantin Khorenko
It's quite strange to have an rcu section in mem_cgroup_iter()
but not use rcu_dereference()/rcu_assign_pointer() for the pointers
being protected.

We plan to access/assign '->last_visited' during iterator invalidation,
so we'll need the protection there anyway.

https://jira.sw.ru/browse/PSBM-123655

Signed-off-by: Konstantin Khorenko 
Reviewed-by: Kirill Tkhai 
---
 mm/memcontrol.c | 18 ++
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8040f09425bf..7921f04b3670 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -194,8 +194,18 @@ struct mem_cgroup_reclaim_iter {
/*
 * last scanned hierarchy member. Valid only if last_dead_count
 * matches memcg->dead_count of the hierarchy root group.
+*
+* Memory pointed by 'last_visited' is freed not earlier than
+* one rcu period after we accessed it:
+*   cgroup_offline_fn()
+*offline_css()
+*list_del_rcu()
+*dput()
+*...
+* cgroup_diput()
+*  call_rcu(&cgrp->rcu_head, cgroup_free_rcu)
 */
-   struct mem_cgroup *last_visited;
+   struct mem_cgroup __rcu *last_visited;
unsigned long last_dead_count;
 
/* scan generation, increased every round-trip */
@@ -1592,7 +1602,7 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
*sequence = atomic_read(&root->dead_count);
if (iter->last_dead_count == *sequence) {
smp_rmb();
-   position = iter->last_visited;
+   position = rcu_dereference(iter->last_visited);
 
/*
 * We cannot take a reference to root because we might race
@@ -1620,7 +1630,7 @@ static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
 * don't lose destruction events in between.  We could have
 * raced with the destruction of @new_position after all.
 */
-   iter->last_visited = new_position;
+   rcu_assign_pointer(iter->last_visited, new_position);
smp_wmb();
iter->last_dead_count = sequence;
 
@@ -1681,7 +1691,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
mz = mem_cgroup_zoneinfo(root, nid, zid);
iter = &mz->reclaim_iter[reclaim->priority];
if (prev && reclaim->generation != iter->generation) {
-   iter->last_visited = NULL;
+   rcu_assign_pointer(iter->last_visited, NULL);
goto out_unlock;
}
 
-- 
2.24.3


