Re: [PATCH] lib/test_lockup.c: add parameters for cond_resched inside loop

2020-08-09 Thread Konstantin Khlebnikov
On Sun, Aug 9, 2020 at 10:06 PM Dmitry Monakhov wrote:
>
> call_cond_resched_before=Y  call cond_resched with resource before wait
> call_cond_resched_after=Y   call cond_resched with resource after wait
> measure_cond_resched=Y  measure maximum cond_resched time inside loop

Do you really need all of them? It seems "call_cond_resched_after" is
enough for demonstration.
It could be shortened to just "call_cond_resched". I see no sense in
ordering before/after sleep.

Measuring the time of cond_resched() has a vague meaning too. It just says "yep,
we have an overload".

>
> This simulates a situation where a process calls cond_resched() with a lock held.
>
> The example demonstrates a priority inversion issue with an ebpf program, where
> a low priority task is scheduled out while holding cgroup_mutex for long
> periods of time, which blocks other programs with high priority.
>
> CGROUP_MUTEX=$(gawk '$3 == "cgroup_mutex" {print "0x"$1}' /proc/kallsyms)
> # Emulate ebpf-application load which can hang inside cgroup_bpf_attach()
> nice -20 modprobe lib/test_lockup.ko \
>   time_nsecs=1000 cooldown_nsecs=10 iterations=10 \
>   lock_mutex_ptr=$CGROUP_MUTEX \
>   measure_lock_wait=Y call_cond_resched_after=Y &
>
> stress-ng -c $(nproc) --timeout 10s&
> time mkdir /sys/fs/cgroup/blkio/a
>
> Signed-off-by: Dmitry Monakhov 
> ---
>  lib/test_lockup.c | 44 +++-
>  1 file changed, 43 insertions(+), 1 deletion(-)
>
> diff --git a/lib/test_lockup.c b/lib/test_lockup.c
> index 0f81252..3e05d6e 100644
> --- a/lib/test_lockup.c
> +++ b/lib/test_lockup.c
> @@ -77,6 +77,18 @@ static bool call_cond_resched;
>  module_param(call_cond_resched, bool, 0600);
>  MODULE_PARM_DESC(call_cond_resched, "call cond_resched() between 
> iterations");
>
> +static bool call_cond_resched_before;
> +module_param(call_cond_resched_before, bool, 0600);
> +MODULE_PARM_DESC(call_cond_resched_before, "call cond_resched() before 
> wait");
> +
> +static bool call_cond_resched_after;
> +module_param(call_cond_resched_after, bool, 0600);
> +MODULE_PARM_DESC(call_cond_resched_after, "call cond_resched() after wait");
> +
> +static bool measure_cond_resched;
> +module_param(measure_cond_resched, bool, 0400);
> +MODULE_PARM_DESC(measure_cond_resched, "measure cond_resched time");
> +
>  static bool measure_lock_wait;
>  module_param(measure_lock_wait, bool, 0400);
>  MODULE_PARM_DESC(measure_lock_wait, "measure lock wait time");
> @@ -162,6 +174,7 @@ MODULE_PARM_DESC(lock_sb_umount, "lock file -> sb -> 
> s_umount");
>  static atomic_t alloc_pages_failed = ATOMIC_INIT(0);
>
>  static atomic64_t max_lock_wait = ATOMIC64_INIT(0);
> +static atomic64_t max_cond_resched = ATOMIC64_INIT(0);
>
>  static struct task_struct *main_task;
>  static int master_cpu;
> @@ -346,6 +359,22 @@ static void test_wait(unsigned int secs, unsigned int 
> nsecs)
> }
>  }
>
> +static void test_cond_resched(void)
> +{
> +   s64 cur, old_max;
> +   s64 start = local_clock();
> +
> +   cond_resched();
> +
> +   cur  = local_clock() - start;
> +   old_max = atomic64_read(&max_cond_resched);
> +   do {
> +   if (cur < old_max)
> +   break;
> +   old_max = atomic64_cmpxchg(&max_cond_resched, old_max, cur);
> +   } while (old_max != cur);
> +}
> +
>  static void test_lockup(bool master)
>  {
> u64 lockup_start = local_clock();
> @@ -363,8 +392,14 @@ static void test_lockup(bool master)
> if (iowait)
> current->in_iowait = 1;
>
> +   if (call_cond_resched_before)
> +   test_cond_resched();
> +
> test_wait(time_secs, time_nsecs);
>
> +   if (call_cond_resched_after)
> +   test_cond_resched();
> +
> if (iowait)
> current->in_iowait = 0;
>
> @@ -497,6 +532,7 @@ static int __init test_lockup_init(void)
>
> if ((wait_state != TASK_RUNNING ||
>  (call_cond_resched && !reacquire_locks) ||
> +call_cond_resched_before || call_cond_resched_after ||
>  (alloc_pages_nr && gfpflags_allow_blocking(alloc_pages_gfp))) &&
> (test_disable_irq || disable_softirq || disable_preempt ||
>  lock_rcu || lock_spinlock_ptr || lock_rwlock_ptr)) {
> @@ -532,7 +568,7 @@ static int __init test_lockup_init(void)
> if (test_lock_sb_umount && test_inode)
> lock_rwsem_ptr = (unsigned long)&test_inode->i_sb->s_umount;
>
> -   pr_notice("START pid=%d time=%u +%u ns cooldown=%u +%u ns 
> iterations=%u state=%s %s%s%s%s%s%s%s%s%s%s%s\n",
> +   pr_notice("START pid=%d time=%u +%u ns cooldown=%u +%u ns 
> iterations=%u state=%s %s%s%s%s%s%s%s%s%s%s%s%s%s\n",
>   main_task->pid, time_secs, time_nsecs,
>   cooldown_secs, cooldown_nsecs, iterations, state,
>   all_cpus ? "all_cpus " : "",
> @@ -545,6 +581,8 @@ static 

[PATCH] net/ipv4: add comment about connect() to INADDR_ANY

2020-07-25 Thread Konstantin Khlebnikov
Copy comment from net/ipv6/tcp_ipv6.c to help future readers.

Signed-off-by: Konstantin Khlebnikov 
---
 net/ipv4/route.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a01efa062f6b..303fe706cbd2 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2591,6 +2591,7 @@ struct rtable *ip_route_output_key_hash_rcu(struct net 
*net, struct flowi4 *fl4,
}
}
 
+   /* connect() to INADDR_ANY means loopback (BSD'ism). */
if (!fl4->daddr) {
fl4->daddr = fl4->saddr;
if (!fl4->daddr)



Re: [PATCH v14 15/20] mm/swap: serialize memcg changes during pagevec_lru_move_fn

2020-07-03 Thread Konstantin Khlebnikov
On Fri, Jul 3, 2020 at 8:09 AM Alex Shi  wrote:
>
> Hugh Dickins found a memcg change bug in the original version:
> If we want to change the pgdat->lru_lock to memcg's lruvec lock, we have
> to serialize mem_cgroup_move_account during pagevec_lru_move_fn. The
> possible bad scenario would look like:
>
> cpu 0   cpu 1
> lruvec = mem_cgroup_page_lruvec()
> if (!isolate_lru_page())
> mem_cgroup_move_account
>
> spin_lock_irqsave(&lruvec->lru_lock <== wrong lock.
>
> So we need the ClearPageLRU to block isolate_lru_page(), then serialize
> the memcg change here.
>
> Reported-by: Hugh Dickins 
> Signed-off-by: Alex Shi 
> Cc: Andrew Morton 
> Cc: linux...@kvack.org
> Cc: linux-kernel@vger.kernel.org
> ---
>  mm/swap.c | 31 +++
>  1 file changed, 19 insertions(+), 12 deletions(-)
>
> diff --git a/mm/swap.c b/mm/swap.c
> index b24d5f69b93a..55eb2c2eed03 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -203,7 +203,7 @@ int get_kernel_page(unsigned long start, int write, 
> struct page **pages)
>  EXPORT_SYMBOL_GPL(get_kernel_page);
>
>  static void pagevec_lru_move_fn(struct pagevec *pvec,
> -   void (*move_fn)(struct page *page, struct lruvec *lruvec))
> +   void (*move_fn)(struct page *page, struct lruvec *lruvec), bool add)
>  {
> int i;
> struct pglist_data *pgdat = NULL;
> @@ -221,8 +221,15 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
> spin_lock_irqsave(&pgdat->lru_lock, flags);
> }
>
> +   /* new page add to lru or page moving between lru */
> +   if (!add && !TestClearPageLRU(page))
> +   continue;
> +
> lruvec = mem_cgroup_page_lruvec(page, pgdat);
> (*move_fn)(page, lruvec);
> +
> +   if (!add)
> +   SetPageLRU(page);
> }
> if (pgdat)
> spin_unlock_irqrestore(&pgdat->lru_lock, flags);
> @@ -259,7 +266,7 @@ void rotate_reclaimable_page(struct page *page)
> local_lock_irqsave(&lru_rotate.lock, flags);
> pvec = this_cpu_ptr(&lru_rotate.pvec);
> if (!pagevec_add(pvec, page) || PageCompound(page))
> -   pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
> +   pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, false);
> local_unlock_irqrestore(&lru_rotate.lock, flags);
> }
>  }
> @@ -328,7 +335,7 @@ static void activate_page_drain(int cpu)
> struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);
>
> if (pagevec_count(pvec))
> -   pagevec_lru_move_fn(pvec, __activate_page);
> +   pagevec_lru_move_fn(pvec, __activate_page, false);
>  }
>
>  static bool need_activate_page_drain(int cpu)
> @@ -346,7 +353,7 @@ void activate_page(struct page *page)
> pvec = this_cpu_ptr(&lru_pvecs.activate_page);
> get_page(page);
> if (!pagevec_add(pvec, page) || PageCompound(page))
> -   pagevec_lru_move_fn(pvec, __activate_page);
> +   pagevec_lru_move_fn(pvec, __activate_page, false);
> local_unlock(&lru_pvecs.lock);
> }
>  }
> @@ -621,21 +628,21 @@ void lru_add_drain_cpu(int cpu)
>
> /* No harm done if a racing interrupt already did this */
> local_lock_irqsave(&lru_rotate.lock, flags);
> -   pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
> +   pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, false);
> local_unlock_irqrestore(&lru_rotate.lock, flags);
> }
>
> pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu);
> if (pagevec_count(pvec))
> -   pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
> +   pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, false);
>
> pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu);
> if (pagevec_count(pvec))
> -   pagevec_lru_move_fn(pvec, lru_deactivate_fn);
> +   pagevec_lru_move_fn(pvec, lru_deactivate_fn, false);
>
> pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu);
> if (pagevec_count(pvec))
> -   pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
> +   pagevec_lru_move_fn(pvec, lru_lazyfree_fn, false);
>
> activate_page_drain(cpu);
>  }
> @@ -664,7 +671,7 @@ void deactivate_file_page(struct page *page)
> pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
>
> if (!pagevec_add(pvec, page) || PageCompound(page))
> -   pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
> +   pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, false);
> local_unlock(&lru_pvecs.lock);
> }
>  }
> @@ -686,7 +693,7 @@ void deactivate_page(struct page *page)
> pvec 

[PATCH] mailmap: add entry for obsolete email address

2020-07-01 Thread Konstantin Khlebnikov
Map old corporate email address @yandex-team.ru to stable private address.

Signed-off-by: Konstantin Khlebnikov 
---
 .mailmap |1 +
 1 file changed, 1 insertion(+)

diff --git a/.mailmap b/.mailmap
index c69d9c734fb5..b15c836ea7fe 100644
--- a/.mailmap
+++ b/.mailmap
@@ -146,6 +146,7 @@ Kamil Konieczny  

 Kay Sievers 
 Kenneth W Chen 
 Konstantin Khlebnikov  
+Konstantin Khlebnikov  
 Koushik 
 Krzysztof Kozlowski  
 Krzysztof Kozlowski  



[PATCH 1/4] scripts/decode_stacktrace: skip missing symbols

2020-06-22 Thread Konstantin Khlebnikov
For now the script turns missing symbols into '0' and makes a bogus decode.
Skip them instead. Also simplify parsing of the 'nm' output.

Before:

$ echo 'xxx+0x0/0x0' | ./scripts/decode_stacktrace.sh vmlinux ""
xxx (home/khlebnikov/src/linux/./arch/x86/include/asm/processor.h:398)

After:

$ echo 'xxx+0x0/0x0' | ./scripts/decode_stacktrace.sh vmlinux ""
xxx+0x0/0x0

Signed-off-by: Konstantin Khlebnikov 
---
 scripts/decode_stacktrace.sh |6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh
index 66a6d511b524..6ec8d6dff86c 100755
--- a/scripts/decode_stacktrace.sh
+++ b/scripts/decode_stacktrace.sh
@@ -56,7 +56,11 @@ parse_symbol() {
if [[ "${cache[$module,$name]+isset}" == "isset" ]]; then
local base_addr=${cache[$module,$name]}
else
-   local base_addr=$(nm "$objfile" | grep -i ' t ' | awk "/ $name\$/ {print \$1}" | head -n1)
+   local base_addr=$(nm "$objfile" | awk '$3 == "'$name'" && ($2 == "t" || $2 == "T") {print $1; exit}')
+   if [[ $base_addr == "" ]] ; then
+   # address not found
+   return
+   fi
cache[$module,$name]="$base_addr"
fi
# Let's start doing the math to get the exact address into the



[PATCH 4/4] scripts/decode_stacktrace: guess path to vmlinux by release name

2020-06-22 Thread Konstantin Khlebnikov
Add option 'decode_stacktrace -r <release>' to specify only the release name.
This is enough to guess the standard paths to vmlinux and modules:

$ echo -e 'schedule+0x0/0x0\ntap_open+0x0/0x0 [tap]' |
./scripts/decode_stacktrace.sh -r 5.4.0-37-generic
schedule (kernel/sched/core.c:4138)
tap_open (drivers/net/tap.c:502) tap

Signed-off-by: Konstantin Khlebnikov 
---
 scripts/decode_stacktrace.sh |   29 -
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh
index 7f18ac10af03..4bdcb6d8c605 100755
--- a/scripts/decode_stacktrace.sh
+++ b/scripts/decode_stacktrace.sh
@@ -5,14 +5,33 @@
 
 if [[ $# < 1 ]]; then
echo "Usage:"
-   echo "  $0 <vmlinux> [base path] [modules path]"
+   echo "  $0 -r <release> | <vmlinux> [base path] [modules path]"
exit 1
 fi
 
-vmlinux=$1
-basepath=${2-auto}
-modpath=$3
-release=""
+if [[ $1 == "-r" ]] ; then
+   vmlinux=""
+   basepath="auto"
+   modpath=""
+   release=$2
+
+   for fn in {,/usr/lib/debug}/boot/vmlinux-$release{,.debug} /lib/modules/$release{,/build}/vmlinux ; do
+   if [ -e "$fn" ] ; then
+   vmlinux=$fn
+   break
+   fi
+   done
+
+   if [[ $vmlinux == "" ]] ; then
+   echo "ERROR! vmlinux image for release $release is not found" >&2
+   exit 2
+   fi
+else
+   vmlinux=$1
+   basepath=${2-auto}
+   modpath=$3
+   release=""
+fi
 
 declare -A cache
 declare -A modcache



[PATCH 3/4] scripts/decode_stacktrace: guess path to modules

2020-06-22 Thread Konstantin Khlebnikov
Try to find the module in the directory with vmlinux (for a fresh build).
Then try the standard paths where debuginfo is usually placed.
Pick the first file which has an ELF section '.debug_line'.

Before:

$ echo 'tap_open+0x0/0x0 [tap]' |
  ./scripts/decode_stacktrace.sh /usr/lib/debug/boot/vmlinux-5.4.0-37-generic
WARNING! Modules path isn't set, but is needed to parse this symbol
tap_open+0x0/0x0 tap

After:

$ echo 'tap_open+0x0/0x0 [tap]' |
  ./scripts/decode_stacktrace.sh /usr/lib/debug/boot/vmlinux-5.4.0-37-generic
tap_open (drivers/net/tap.c:502) tap

Signed-off-by: Konstantin Khlebnikov 
---
 scripts/decode_stacktrace.sh |   36 +---
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh
index b1b85a7b2115..7f18ac10af03 100755
--- a/scripts/decode_stacktrace.sh
+++ b/scripts/decode_stacktrace.sh
@@ -12,9 +12,40 @@ fi
 vmlinux=$1
 basepath=${2-auto}
 modpath=$3
+release=""
+
 declare -A cache
 declare -A modcache
 
+find_module() {
+   if [[ "$modpath" != "" ]] ; then
+   for fn in $(find "$modpath" -name "${module//_/[-_]}.ko*") ; do
+   if readelf -WS "$fn" | grep -qwF .debug_line ; then
+   echo $fn
+   return
+   fi
+   done
+   return 1
+   fi
+
+   modpath=$(dirname "$vmlinux")
+   find_module && return
+
+   if [[ $release == "" ]] ; then
+   release=$(gdb -ex 'print init_uts_ns.name.release' -ex 'quit' -quiet -batch "$vmlinux" | sed -n 's/\$1 = "\(.*\)".*/\1/p')
+   fi
+
+   for dn in {/usr/lib/debug,}/lib/modules/$release ; do
+   if [ -e "$dn" ] ; then
+   modpath="$dn"
+   find_module && return
+   fi
+   done
+
+   modpath=""
+   return 1
+}
+
 parse_symbol() {
# The structure of symbol at this point is:
#   ([name]+[offset]/[total length])
@@ -27,12 +58,11 @@ parse_symbol() {
elif [[ "${modcache[$module]+isset}" == "isset" ]]; then
local objfile=${modcache[$module]}
else
-   if [[ $modpath == "" ]]; then
+   local objfile=$(find_module)
+   if [[ $objfile == "" ]] ; then
echo "WARNING! Modules path isn't set, but is needed to 
parse this symbol" >&2
return
fi
-   local objfile=$(find "$modpath" -name "${module//_/[-_]}.ko*" -print -quit)
-   [[ $objfile == "" ]] && return
modcache[$module]=$objfile
fi
 



[PATCH 2/4] scripts/decode_stacktrace: guess basepath if not specified

2020-06-22 Thread Konstantin Khlebnikov
Guess the path to the kernel sources using the known location of the symbol "kernel_init".
Make the basepath argument optional.

Before:

$ echo 'vfs_open+0x0/0x0' | ./scripts/decode_stacktrace.sh vmlinux ""
vfs_open (home/khlebnikov/src/linux/fs/open.c:912)

After:

$ echo 'vfs_open+0x0/0x0' | ./scripts/decode_stacktrace.sh vmlinux
vfs_open (fs/open.c:912)

Signed-off-by: Konstantin Khlebnikov 
---
 scripts/decode_stacktrace.sh |   14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh
index 6ec8d6dff86c..b1b85a7b2115 100755
--- a/scripts/decode_stacktrace.sh
+++ b/scripts/decode_stacktrace.sh
@@ -3,14 +3,14 @@
 # (c) 2014, Sasha Levin 
 #set -x
 
-if [[ $# < 2 ]]; then
+if [[ $# < 1 ]]; then
echo "Usage:"
-   echo "  $0 [vmlinux] [base path] [modules path]"
+   echo "  $0 <vmlinux> [base path] [modules path]"
exit 1
 fi
 
 vmlinux=$1
-basepath=$2
+basepath=${2-auto}
 modpath=$3
 declare -A cache
 declare -A modcache
@@ -152,6 +152,14 @@ handle_line() {
echo "${words[@]}" "$symbol $module"
 }
 
+if [[ $basepath == "auto" ]] ; then
+   module=""
+   symbol="kernel_init+0x0/0x0"
+   parse_symbol
+   basepath=${symbol#kernel_init (}
+   basepath=${basepath%/init/main.c:*)}
+fi
+
 while read line; do
# Let's see if we have an address in the line
if [[ $line =~ \[\<([^]]+)\>\] ]] ||



Re: [PATCH util-linux] dmesg: adjust timestamps according to suspended time

2020-06-04 Thread Konstantin Khlebnikov

On 04/06/2020 12.30, Karel Zak wrote:

On Mon, Jun 01, 2020 at 10:21:34PM +0300, Konstantin Khlebnikov wrote:

Timestamps in the kernel log come from the monotonic clocksource, which does not
tick while the system is suspended. Suspended time easily sums into hours and days,
rendering human readable timestamps in dmesg useless.

Adjusting timestamps according to the current delta between the boottime and
monotonic clocksources produces accurate timestamps for messages printed
since the last resume, which are supposed to be the most interesting ones.


It's definitely better than the current broken timestamps, but the real
and final solution is to have exact information about system suspends.

It would be enough to maintain in kernel memory a simple log with
  
and export this info by /proc/suspendlog, after that we can all
re-count /dev/kmsg timestamps to something useful.


Boottime or real time could simply be printed into the kernel log at
suspend and resume, so dmesg could detect the current offset while reading.

But for older kernels dmesg still needs guessing like this.
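
For reference, a minimal userspace sketch of that guessing (the same idea as
get_suspended_time() in the patch, assuming CLOCK_BOOTTIME is available): the
gap between CLOCK_BOOTTIME, which keeps ticking across suspend, and
CLOCK_MONOTONIC, which does not, approximates the total suspended time since
boot.

    #include <time.h>

    /* Estimate total time spent suspended since boot, in seconds. */
    static time_t guess_suspended_time(void)
    {
            struct timespec boot, mono;

            if (clock_gettime(CLOCK_BOOTTIME, &boot) != 0 ||
                clock_gettime(CLOCK_MONOTONIC, &mono) != 0)
                    return 0;       /* no adjustment if either clock is missing */
            return boot.tv_sec - mono.tv_sec;
    }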



   Karel




Re: [PATCH RFC 1/3] block: add flag 'nowait_requests' into queue limits

2020-06-03 Thread Konstantin Khlebnikov

On 03/06/2020 07.58, Christoph Hellwig wrote:

On Mon, Jun 01, 2020 at 03:37:09PM +0300, Konstantin Khlebnikov wrote:

Add a flag for marking bio-based queues which support REQ_NOWAIT.
Set it for all request based (mq) devices.

A stacking device should set it after blk_set_stacking_limits() if its
make_request() method doesn't delay requests itself or handles REQ_NOWAIT.


I don't think this belongs into the queue limits.  For example a
stacking driver that always defers requests to a workqueue can support
REQ_NOWAIT entirely independent of the underlying devices.  I think
this just needs to be a simple queue flag.



For O_DIRECT I/O, REQ_NOWAIT is not just about non-blocking submission.
It also provides instant feedback about contention, like ECN in networking.
This feedback is useful for rate control and for balancing load between replicas.

If the upper layer simply remaps and forwards requests below, then to forward
contention all layers of the stacked device should support this feature.
That's why I've put it as a flag into the limits - to reuse limits stacking.

If any layer defers requests, then it should at the same time somehow limit the
size of its backlog to provide sane behaviour for REQ_NOWAIT regardless of the
behaviour of lower devices. Then it could simply set that flag in its limits.

Also I want to add handling into blk-qos/throttler - never delay REQ_NOWAIT.
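
To illustrate the "instant feedback" point, here is a minimal userspace sketch
(assuming a glibc that exposes preadv2() and RWF_NOWAIT; the helper name is
made up): an O_DIRECT read submitted with RWF_NOWAIT fails with EAGAIN instead
of blocking when it would have to wait, and the caller can treat that as a
congestion signal.

    #define _GNU_SOURCE
    #include <errno.h>
    #include <sys/uio.h>

    /* Returns 1 if the device looks congested, 0 on success, -1 on other errors. */
    static int nowait_read(int fd, void *buf, size_t len, off_t off)
    {
            struct iovec iov = { .iov_base = buf, .iov_len = len };
            ssize_t ret = preadv2(fd, &iov, 1, off, RWF_NOWAIT);

            if (ret >= 0)
                    return 0;       /* completed without waiting */
            if (errno == EAGAIN)
                    return 1;       /* contention: back off or try another replica */
            return -1;
    }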


Re: kobject_init_and_add is easy to misuse

2020-06-02 Thread Konstantin Khlebnikov

On 02/06/2020 15.10, Matthew Wilcox wrote:

On Tue, Jun 02, 2020 at 07:50:33PM +0800, Wang Hai wrote:

syzkaller reports for memory leak when kobject_init_and_add()
returns an error in the function sysfs_slab_add() [1]

When this happened, the function kobject_put() is not called for the
corresponding kobject, which potentially leads to memory leak.

This patch fixes the issue by calling kobject_put() even if
kobject_init_and_add() fails.


I think this speaks to a deeper problem with kobject_init_and_add()
-- the need to call kobject_put() if it fails is not readily apparent
to most users.  This same bug appears in the first three users of
kobject_init_and_add() that I checked --
arch/ia64/kernel/topology.c
drivers/firmware/dmi-sysfs.c
drivers/firmware/efi/esrt.c
drivers/scsi/iscsi_boot_sysfs.c

Some do get it right --
arch/powerpc/kernel/cacheinfo.c
drivers/gpu/drm/ttm/ttm_bo.c
drivers/gpu/drm/ttm/ttm_memory.c
drivers/infiniband/hw/mlx4/sysfs.c

I'd argue that the current behaviour is wrong, that kobject_init_and_add()
should call kobject_put() if the add fails.  This would need a tree-wide
audit.  But somebody needs to do that anyway because based on my random
sampling, half of the users currently get it wrong.



At this point the kobject doesn't own the kmem_cache structure itself yet.

So calling kobject_put() would free the kmem_cache, and then it would be
freed a second time on the error path in create_cache().

I suppose freeing in the error case should be pushed from the common
create_cache() into the slab-specific __kmem_cache_create().
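
For reference, a minimal sketch of the generic pattern Matthew describes (with
made-up names, and not the sysfs_slab_add() case above, where ownership is more
subtle): once kobject_init_and_add() has run, the error path must drop the
reference with kobject_put() so the ktype's release() frees the object, instead
of freeing it directly.

    #include <linux/kobject.h>
    #include <linux/slab.h>

    struct foo {
            struct kobject kobj;
    };

    static void foo_release(struct kobject *kobj)
    {
            kfree(container_of(kobj, struct foo, kobj));
    }

    static struct kobj_type foo_ktype = {
            .release = foo_release,
    };

    static int foo_register(struct foo *foo, struct kobject *parent)
    {
            int ret = kobject_init_and_add(&foo->kobj, &foo_ktype, parent, "foo");

            if (ret)
                    kobject_put(&foo->kobj);   /* release() frees foo; don't kfree() it again */
            return ret;
    }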


[PATCH util-linux] dmesg: adjust timestamps according to suspended time

2020-06-01 Thread Konstantin Khlebnikov
Timestamps in the kernel log come from the monotonic clocksource, which does not
tick while the system is suspended. Suspended time easily sums into hours and days,
rendering human readable timestamps in dmesg useless.

Adjusting timestamps according to the current delta between the boottime and
monotonic clocksources produces accurate timestamps for messages printed
since the last resume, which are supposed to be the most interesting ones.

Signed-off-by: Konstantin Khlebnikov 
---
 include/monotonic.h |2 ++
 lib/monotonic.c |   12 
 sys-utils/dmesg.1   |2 ++
 sys-utils/dmesg.c   |   22 +-
 4 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/include/monotonic.h b/include/monotonic.h
index 7a69d9e4b51e..380e59c3791e 100644
--- a/include/monotonic.h
+++ b/include/monotonic.h
@@ -15,6 +15,8 @@
 
 extern int get_boot_time(struct timeval *boot_time);
 
+extern time_t get_suspended_time(void);
+
 extern int gettime_monotonic(struct timeval *tv);
 
 #endif /* UTIL_LINUX_MONOTONIC_H */
diff --git a/lib/monotonic.c b/lib/monotonic.c
index b684d8dd650a..f0aeba6828e7 100644
--- a/lib/monotonic.c
+++ b/lib/monotonic.c
@@ -48,6 +48,18 @@ int get_boot_time(struct timeval *boot_time)
 #endif
 }
 
+time_t get_suspended_time(void)
+{
+#if defined(CLOCK_BOOTTIME) && defined(CLOCK_MONOTONIC)
+   struct timespec boot, mono;
+
+   if (clock_gettime(CLOCK_BOOTTIME, &boot) == 0 &&
+   clock_gettime(CLOCK_MONOTONIC, &mono) == 0)
+   return boot.tv_sec - mono.tv_sec;
+#endif
+   return 0;
+}
+
 int gettime_monotonic(struct timeval *tv)
 {
 #ifdef CLOCK_MONOTONIC
diff --git a/sys-utils/dmesg.1 b/sys-utils/dmesg.1
index 61a6ce89465d..6c5773a6b211 100644
--- a/sys-utils/dmesg.1
+++ b/sys-utils/dmesg.1
@@ -161,6 +161,8 @@ source used for the logs is
 .B not updated after
 system
 .BR SUSPEND / RESUME .
+Timestamps are adjusted according to current delta between boottime and monotonic
+clocks, this works only for messages printed after last resume.
 .IP "\fB\-t\fR, \fB\-\-notime\fR"
 Do not print kernel's timestamps.
 .IP "\fB\-\-time\-format\fR \fIformat\fR"
diff --git a/sys-utils/dmesg.c b/sys-utils/dmesg.c
index ae1ebc74a2d9..c78f01ca8755 100644
--- a/sys-utils/dmesg.c
+++ b/sys-utils/dmesg.c
@@ -169,6 +169,7 @@ struct dmesg_control {
struct timeval  lasttime;   /* last printed timestamp */
struct tm   lasttm; /* last localtime */
struct timeval  boot_time;  /* system boot time */
+   time_t  suspended_time; /* time spent in suspended state */
 
int action; /* SYSLOG_ACTION_* */
int method; /* DMESG_METHOD_* */
@@ -824,7 +825,7 @@ static struct tm *record_localtime(struct dmesg_control 
*ctl,
   struct dmesg_record *rec,
   struct tm *tm)
 {
-   time_t t = ctl->boot_time.tv_sec + rec->tv.tv_sec;
+   time_t t = ctl->boot_time.tv_sec + ctl->suspended_time + rec->tv.tv_sec;
return localtime_r(&t, tm);
 }
 
@@ -852,7 +853,7 @@ static char *iso_8601_time(struct dmesg_control *ctl, 
struct dmesg_record *rec,
   char *buf, size_t bufsz)
 {
struct timeval tv = {
-   .tv_sec = ctl->boot_time.tv_sec + rec->tv.tv_sec,
+   .tv_sec = ctl->boot_time.tv_sec + ctl->suspended_time + rec->tv.tv_sec,
.tv_usec = rec->tv.tv_usec
};
 
@@ -1301,8 +1302,16 @@ static inline int dmesg_get_boot_time(struct timeval *tv)
 
return get_boot_time(tv);
 }
+
+static inline time_t dmesg_get_suspended_time(void)
+{
+   if (getenv("DMESG_TEST_BOOTIME"))
+   return 0;
+   return get_suspended_time();
+}
 #else
 # define dmesg_get_boot_time   get_boot_time
+# define dmesg_get_suspended_time  get_suspended_time
 #endif
 
 int main(int argc, char *argv[])
@@ -1499,9 +1508,12 @@ int main(int argc, char *argv[])
 
if ((is_timefmt(&ctl, RELTIME) ||
 is_timefmt(&ctl, CTIME)   ||
-is_timefmt(&ctl, ISO8601))
-   && dmesg_get_boot_time(&ctl.boot_time) != 0)
-   ctl.time_fmt = DMESG_TIMEFTM_NONE;
+is_timefmt(&ctl, ISO8601))) {
+   if (dmesg_get_boot_time(&ctl.boot_time) != 0)
+   ctl.time_fmt = DMESG_TIMEFTM_NONE;
+   else
+   ctl.suspended_time = dmesg_get_suspended_time();
+   }
 
if (delta)
switch (ctl.time_fmt) {



Re: [RFC PATCH] mm: swap: remove lru drain waiters

2020-06-01 Thread Konstantin Khlebnikov

On 01/06/2020 17.37, Hillf Danton wrote:


After updating the lru drain sequence, new comers avoid waiting for
the current drainer, because he is flushing works on each online CPU,
by trying to lock the mutex; the drainer OTOH tries to do works for
those who fail to acquire the lock by checking the lru drain sequence
after releasing lock.

See eef1a429f234 ("mm/swap.c: piggyback lru_add_drain_all() calls")
for reasons why we can skip waiting for the lock.


That patch says nothing about such a change in behaviour.

Callers like invalidate_bdev() really need a synchronous drain to be sure
that pages have no extra references from per-cpu vectors.



The memory barriers around the sequence and the lock come together
to remove waiters without their drain works abandoned.

Cc: Sebastian Andrzej Siewior 
Cc: Konstantin Khlebnikov 
Signed-off-by: Hillf Danton 
---
This is inspired by one of the works from Sebastian.

--- a/mm/swap.c
+++ b/mm/swap.c
@@ -714,10 +714,11 @@ static void lru_add_drain_per_cpu(struct
   */
  void lru_add_drain_all(void)
  {
-   static seqcount_t seqcount = SEQCNT_ZERO(seqcount);
+   static unsigned int lru_drain_seq;
static DEFINE_MUTEX(lock);
static struct cpumask has_work;
-   int cpu, seq;
+   int cpu;
+   unsigned int seq;
  
  	/*

 * Make sure nobody triggers this path before mm_percpu_wq is fully
@@ -726,18 +727,16 @@ void lru_add_drain_all(void)
if (WARN_ON(!mm_percpu_wq))
return;
  
-	seq = raw_read_seqcount_latch(&seqcount);

+   lru_drain_seq++;
+   smp_mb();
  
-	mutex_lock(&lock);

+more_work:
  
-	/*

-* Piggyback on drain started and finished while we waited for lock:
-* all pages pended at the time of our enter were drained from vectors.
-*/
-   if (__read_seqcount_retry(&seqcount, seq))
-   goto done;
+   if (!mutex_trylock(&lock))
+   return;
  
-	raw_write_seqcount_latch(&seqcount);

+   smp_mb();
+   seq = lru_drain_seq;
  
	cpumask_clear(&has_work);
  
@@ -759,8 +758,11 @@ void lru_add_drain_all(void)

for_each_cpu(cpu, &has_work)
flush_work(&per_cpu(lru_add_drain_work, cpu));
  
-done:

mutex_unlock(&lock);
+
+   smp_mb();
+   if (seq != lru_drain_seq)
+   goto more_work;
  }
  #else
  void lru_add_drain_all(void)
--



[PATCH RFC 2/3] md/raid0: enable REQ_NOWAIT

2020-06-01 Thread Konstantin Khlebnikov
Set limits.nowait_requests = 1 before stacking limits.
The raid0 driver itself does not delay bios in raid0_make_request().

Signed-off-by: Konstantin Khlebnikov 
---
 drivers/md/raid0.c |3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 322386ff5d22..e34292b05488 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -406,6 +406,9 @@ static int raid0_run(struct mddev *mddev)
blk_queue_io_opt(mddev->queue,
 (mddev->chunk_sectors << 9) * 
mddev->raid_disks);
 
+   /* raid0_make_request() does not delay requests */
+   mddev->queue->limits.nowait_requests = 1;
+
rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
  rdev->data_offset << 9);



[PATCH RFC 0/3] block: allow REQ_NOWAIT to some bio-based/stacked devices

2020-06-01 Thread Konstantin Khlebnikov
Here is a pretty straightforward attempt at handling REQ_NOWAIT for
bio-based and stacked devices.

They are marked with the flag queue->limits.nowait_requests, which tells that
the queue's make_request() method handles REQ_NOWAIT or doesn't delay requests,
and that all backend devices do the same.

As an example, the second and third patches add support to md-raid0 and dm-linear.

---

Konstantin Khlebnikov (3):
  block: add flag 'nowait_requests' into queue limits
  md/raid0: enable REQ_NOWAIT
  dm: add support for REQ_NOWAIT and enable for target dm-linear


 drivers/md/dm-linear.c| 5 +++--
 drivers/md/dm-table.c | 3 +++
 drivers/md/dm.c   | 4 +++-
 drivers/md/raid0.c| 3 +++
 include/linux/device-mapper.h | 6 ++
 5 files changed, 18 insertions(+), 3 deletions(-)

--
Signature


[PATCH RFC 3/3] dm: add support for REQ_NOWAIT and enable for target dm-linear

2020-06-01 Thread Konstantin Khlebnikov
Add a dm target feature flag, DM_TARGET_NOWAIT, which tells that the target
has no problem with REQ_NOWAIT.

Set limits.nowait_requests if all targets and backends handle REQ_NOWAIT.

Signed-off-by: Konstantin Khlebnikov 
---
 drivers/md/dm-linear.c|5 +++--
 drivers/md/dm-table.c |3 +++
 drivers/md/dm.c   |4 +++-
 include/linux/device-mapper.h |6 ++
 4 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index e1db43446327..00774b5d7668 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -228,10 +228,11 @@ static struct target_type linear_target = {
.name   = "linear",
.version = {1, 4, 0},
 #ifdef CONFIG_BLK_DEV_ZONED
-   .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM,
+   .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT |
+   DM_TARGET_ZONED_HM,
.report_zones = linear_report_zones,
 #else
-   .features = DM_TARGET_PASSES_INTEGRITY,
+   .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT,
 #endif
.module = THIS_MODULE,
.ctr= linear_ctr,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 0a2cc197f62b..f4610f79ebd6 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1500,12 +1500,15 @@ int dm_calculate_queue_limits(struct dm_table *table,
unsigned int zone_sectors = 0;
 
blk_set_stacking_limits(limits);
+   limits->nowait_requests = 1;
 
for (i = 0; i < dm_table_get_num_targets(table); i++) {
blk_set_stacking_limits(_limits);
 
ti = dm_table_get_target(table, i);
 
+   ti_limits.nowait_requests = dm_target_supports_nowait(ti->type);
+
if (!ti->type->iterate_devices)
goto combine_limits;
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index db9e46114653..767cd4d70341 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1794,7 +1794,9 @@ static blk_qc_t dm_make_request(struct request_queue *q, 
struct bio *bio)
if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
dm_put_live_table(md, srcu_idx);
 
-   if (!(bio->bi_opf & REQ_RAHEAD))
+   if (bio->bi_opf & REQ_NOWAIT)
+   bio_wouldblock_error(bio);
+   else if (!(bio->bi_opf & REQ_RAHEAD))
queue_io(md, bio);
else
bio_io_error(bio);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index af48d9da3916..4d4af1eeeba4 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -252,6 +252,12 @@ struct target_type {
 #define DM_TARGET_ZONED_HM 0x0040
 #define dm_target_supports_zoned_hm(type) ((type)->features & 
DM_TARGET_ZONED_HM)
 
+/*
+ * A target handles REQ_NOWAIT
+ */
+#define DM_TARGET_NOWAIT   0x0080
+#define dm_target_supports_nowait(type) (!!((type)->features & 
DM_TARGET_NOWAIT))
+
 struct dm_target {
struct dm_table *table;
struct target_type *type;



[PATCH RFC 1/3] block: add flag 'nowait_requests' into queue limits

2020-06-01 Thread Konstantin Khlebnikov
Add a flag for marking bio-based queues which support REQ_NOWAIT.
Set it for all request based (mq) devices.

A stacking device should set it after blk_set_stacking_limits() if its
make_request() method doesn't delay requests itself or handles REQ_NOWAIT.

Signed-off-by: Konstantin Khlebnikov 
---
 block/blk-core.c   |4 ++--
 block/blk-mq.c |3 +++
 block/blk-settings.c   |3 +++
 include/linux/blkdev.h |1 +
 4 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index c4b015004796..9139a316e6d4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -892,9 +892,9 @@ generic_make_request_checks(struct bio *bio)
 
/*
 * For a REQ_NOWAIT based request, return -EOPNOTSUPP
-* if queue is not a request based queue.
+* if queue does not support this flag.
 */
-   if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_mq(q))
+   if ((bio->bi_opf & REQ_NOWAIT) && !q->limits.nowait_requests)
goto not_supported;
 
if (should_fail_bio(bio))
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a7785df2c944..0c3daa0cda87 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2952,6 +2952,9 @@ struct request_queue *blk_mq_init_allocated_queue(struct 
blk_mq_tag_set *set,
 */
q->poll_nsec = BLK_MQ_POLL_CLASSIC;
 
+   /* Request based queue always supports REQ_NOWAIT */
+   q->limits.nowait_requests = 1;
+
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
blk_mq_add_queue_tag_set(set, q);
blk_mq_map_swqueue(q);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 14397b4c4b53..8f96c7324497 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -59,6 +59,7 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->io_opt = 0;
lim->misaligned = 0;
lim->zoned = BLK_ZONED_NONE;
+   lim->nowait_requests = 0;
 }
 EXPORT_SYMBOL(blk_set_default_limits);
 
@@ -486,6 +487,8 @@ int blk_stack_limits(struct queue_limits *t, struct 
queue_limits *b,
t->max_segment_size = min_not_zero(t->max_segment_size,
   b->max_segment_size);
 
+   t->nowait_requests &= b->nowait_requests;
+
t->misaligned |= b->misaligned;
 
alignment = queue_limit_alignment_offset(b, start);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 32868fbedc9e..5f612dda34c2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -346,6 +346,7 @@ struct queue_limits {
unsigned char   misaligned;
unsigned char   discard_misaligned;
unsigned char   raid_partial_stripes_expensive;
+   unsigned char   nowait_requests;
enum blk_zoned_modelzoned;
 };
 



Re: [PATCH] block: really remove REQ_NOWAIT_INLINE

2020-05-31 Thread Konstantin Khlebnikov

On 31/05/2020 19.33, Konstantin Khlebnikov wrote:

Commit 7b6620d7db56 ("block: remove REQ_NOWAIT_INLINE") removed it,
but some pieces were left behind. Probably something went wrong with a git merge.


Never mind. As I see in block/for-next, Christoph has removed REQ_NOWAIT_INLINE.

But BLK_QC_T_EAGAIN is still there, though.



Signed-off-by: Konstantin Khlebnikov 
---
  include/linux/blk_types.h |7 ++-
  1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 31eb92876be7..59b2e9bd9bd8 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -322,8 +322,7 @@ enum req_flag_bits {
__REQ_PREFLUSH, /* request for cache flush */
__REQ_RAHEAD,   /* read ahead, can fail anytime */
__REQ_BACKGROUND,   /* background IO */
-   __REQ_NOWAIT,   /* Don't wait if request will block */
-   __REQ_NOWAIT_INLINE,/* Return would-block error inline */
+   __REQ_NOWAIT,   /* Don't wait if request will block */
/*
 * When a shared kthread needs to issue a bio for a cgroup, doing
 * so synchronously can lead to priority inversions as the kthread
@@ -358,7 +357,6 @@ enum req_flag_bits {
  #define REQ_RAHEAD(1ULL << __REQ_RAHEAD)
  #define REQ_BACKGROUND(1ULL << __REQ_BACKGROUND)
  #define REQ_NOWAIT(1ULL << __REQ_NOWAIT)
-#define REQ_NOWAIT_INLINE  (1ULL << __REQ_NOWAIT_INLINE)
  #define REQ_CGROUP_PUNT   (1ULL << __REQ_CGROUP_PUNT)
  
  #define REQ_NOUNMAP		(1ULL << __REQ_NOUNMAP)

@@ -452,13 +450,12 @@ static inline int op_stat_group(unsigned int op)
  
  typedef unsigned int blk_qc_t;

  #define BLK_QC_T_NONE -1U
-#define BLK_QC_T_EAGAIN-2U
  #define BLK_QC_T_SHIFT16
  #define BLK_QC_T_INTERNAL (1U << 31)
  
  static inline bool blk_qc_t_valid(blk_qc_t cookie)

  {
-   return cookie != BLK_QC_T_NONE && cookie != BLK_QC_T_EAGAIN;
+   return cookie != BLK_QC_T_NONE;
  }
  
  static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie)




[PATCH] block: really remove REQ_NOWAIT_INLINE

2020-05-31 Thread Konstantin Khlebnikov
Commit 7b6620d7db56 ("block: remove REQ_NOWAIT_INLINE") removed it,
but some pieces were left behind. Probably something went wrong with a git merge.

Signed-off-by: Konstantin Khlebnikov 
---
 include/linux/blk_types.h |7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 31eb92876be7..59b2e9bd9bd8 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -322,8 +322,7 @@ enum req_flag_bits {
__REQ_PREFLUSH, /* request for cache flush */
__REQ_RAHEAD,   /* read ahead, can fail anytime */
__REQ_BACKGROUND,   /* background IO */
-   __REQ_NOWAIT,   /* Don't wait if request will block */
-   __REQ_NOWAIT_INLINE,/* Return would-block error inline */
+   __REQ_NOWAIT,   /* Don't wait if request will block */
/*
 * When a shared kthread needs to issue a bio for a cgroup, doing
 * so synchronously can lead to priority inversions as the kthread
@@ -358,7 +357,6 @@ enum req_flag_bits {
 #define REQ_RAHEAD (1ULL << __REQ_RAHEAD)
 #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND)
 #define REQ_NOWAIT (1ULL << __REQ_NOWAIT)
-#define REQ_NOWAIT_INLINE  (1ULL << __REQ_NOWAIT_INLINE)
 #define REQ_CGROUP_PUNT(1ULL << __REQ_CGROUP_PUNT)
 
 #define REQ_NOUNMAP(1ULL << __REQ_NOUNMAP)
@@ -452,13 +450,12 @@ static inline int op_stat_group(unsigned int op)
 
 typedef unsigned int blk_qc_t;
 #define BLK_QC_T_NONE  -1U
-#define BLK_QC_T_EAGAIN-2U
 #define BLK_QC_T_SHIFT 16
 #define BLK_QC_T_INTERNAL  (1U << 31)
 
 static inline bool blk_qc_t_valid(blk_qc_t cookie)
 {
-   return cookie != BLK_QC_T_NONE && cookie != BLK_QC_T_EAGAIN;
+   return cookie != BLK_QC_T_NONE;
 }
 
 static inline unsigned int blk_qc_t_to_queue_num(blk_qc_t cookie)



Re: [PATCH] mm: dump_page: add debugfs file for dumping page state by pfn

2020-05-25 Thread Konstantin Khlebnikov


On 25/05/2020 19.03, Konstantin Khlebnikov wrote:



On 25/05/2020 18.33, Matthew Wilcox wrote:

On Mon, May 25, 2020 at 05:19:11PM +0300, Konstantin Khlebnikov wrote:

Tool 'page-types' could list pages mapped by process or file cache pages,
but it shows only limited amount of state exported via procfs.

Let's employ existing helper dump_page() to reach remaining information:
writing pfn into /sys/kernel/debug/dump_page dumps state into kernel log.

# echo 0x37c43c > /sys/kernel/debug/dump_page
# dmesg | tail -6
  page:cb0b0df10f00 refcount:1 mapcount:0 mapping:7755d3d9 
index:0x30
  0xae4239e0 name:"libGeoIP.so.1.6.9"
  flags: 0x2020014(uptodate|lru|mappedtodisk)
  raw: 02020014 cb0b187fd288 cb0b189e6248 9528a04afe10
  raw: 0030  0001 
  page dumped because: debugfs request


This makes me deeply uncomfortable.  We're using %px, and %lx
(for the 'raw' lines) so we actually get to see kernel addresses.
We've rationalised this in the past as being acceptable because you're
already in an "assert triggered" kind of situation.  Now you're adding
a way for any process with CAP_SYS_ADMIN to get kernel addresses dumped
into the syslog.

I think we need a different function for this, or we need to re-audit
dump_page() for exposing kernel pointers, and not expose the raw data
in struct page.



It's better to add a switch for disabling paranoia when bad things are happening.
I.e. keep everything safe by default (or whatever the sysctl/config sets) and
flip the switch when needed.


Also I'm OK with sealing this interface if the kernel is in serious-paranoia mode.
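
Purely as an illustration of that idea (nothing like this exists in the patch;
all names below are hypothetical): a default-off knob could gate the
pointer-revealing full dump while keeping a sanitized summary available.

    #include <linux/module.h>
    #include <linux/mm.h>

    /* Hypothetical switch: only when set do we emit the raw, %px-style dump. */
    static bool dump_page_unsafe;
    module_param(dump_page_unsafe, bool, 0600);

    static void dump_page_guarded(struct page *page, const char *reason)
    {
            if (dump_page_unsafe)
                    dump_page(page, reason);        /* full dump, including raw words */
            else
                    pr_info("page pfn=%#lx flags=%#lx refcount=%d (%s)\n",
                            page_to_pfn(page), page->flags,
                            page_ref_count(page), reason);
    }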


Re: [PATCH] mm: dump_page: add debugfs file for dumping page state by pfn

2020-05-25 Thread Konstantin Khlebnikov




On 25/05/2020 18.33, Matthew Wilcox wrote:

On Mon, May 25, 2020 at 05:19:11PM +0300, Konstantin Khlebnikov wrote:

Tool 'page-types' could list pages mapped by process or file cache pages,
but it shows only limited amount of state exported via procfs.

Let's employ existing helper dump_page() to reach remaining information:
writing pfn into /sys/kernel/debug/dump_page dumps state into kernel log.

# echo 0x37c43c > /sys/kernel/debug/dump_page
# dmesg | tail -6
  page:cb0b0df10f00 refcount:1 mapcount:0 mapping:7755d3d9 
index:0x30
  0xae4239e0 name:"libGeoIP.so.1.6.9"
  flags: 0x2020014(uptodate|lru|mappedtodisk)
  raw: 02020014 cb0b187fd288 cb0b189e6248 9528a04afe10
  raw: 0030  0001 
  page dumped because: debugfs request


This makes me deeply uncomfortable.  We're using %px, and %lx
(for the 'raw' lines) so we actually get to see kernel addresses.
We've rationalised this in the past as being acceptable because you're
already in an "assert triggered" kind of situation.  Now you're adding
a way for any process with CAP_SYS_ADMIN to get kernel addresses dumped
into the syslog.

I think we need a different function for this, or we need to re-audit
dump_page() for exposing kernel pointers, and not expose the raw data
in struct page.



It's better to add a switch for disabling paranoia when bad things are happening.
I.e. keep everything safe by default (or whatever the sysctl/config sets) and
flip the switch when needed.


Re: [PATCH] mm: dump_page: add debugfs file for dumping page state by pfn

2020-05-25 Thread Konstantin Khlebnikov

On 25/05/2020 18.35, Vlastimil Babka wrote:

On 5/25/20 4:19 PM, Konstantin Khlebnikov wrote:

Tool 'page-types' could list pages mapped by process or file cache pages,
but it shows only limited amount of state exported via procfs.

Let's employ existing helper dump_page() to reach remaining information:
writing pfn into /sys/kernel/debug/dump_page dumps state into kernel log.


Yeah that's indeed useful, however I'm less sure if kernel log is the proper way
to extract the data. For example IIRC with the page_owner file can "seek to pfn"
to dump it, although that makes it somewhat harder to use.

Or we could write pfn to one file and read the dump from another one? But that's
not atomic.

Perhaps if we could do something like "cat /sys/kernel/debug/dump_page/"
without all the pfns being actually listed in the dump_page directory with "ls"?
Is that possible?


Too much code for me. =)

This could be a kind of ftrace tracer which iterates over pages and dumps them,
but that looks ridiculously overengineered anyway.

This one hack connects the existing 'pagemap' with the existing 'dump_page',
so it is almost free.

For complicated cases there is gdb and the special tool drgn:
https://github.com/osandov/drgn

Writing a script which parses all that stuff from the kernel log isn't a big
deal either. I have one with a 100+ line regexp for all kinds of kernel splats.
I will publish it when I find time for polishing.




# echo 0x37c43c > /sys/kernel/debug/dump_page
# dmesg | tail -6
  page:cb0b0df10f00 refcount:1 mapcount:0 mapping:7755d3d9 
index:0x30
  0xae4239e0 name:"libGeoIP.so.1.6.9"
  flags: 0x2020014(uptodate|lru|mappedtodisk)
  raw: 02020014 cb0b187fd288 cb0b189e6248 9528a04afe10
  raw: 0030  0001 
  page dumped because: debugfs request

With CONFIG_PAGE_OWNER=y shows also stacks for last page alloc and free:

  page:ea0018fff480 refcount:1 mapcount:1 mapping: 
index:0x7f9f28f62
  anon flags: 0x1080034(uptodate|lru|active|swapbacked)
  raw: 01080034 ea00184140c8 ea0018517d88 8886076ba161
  raw: 0007f9f28f62  0001 888bfc79f000
  page dumped because: debugfs request
  page->mem_cgroup:888bfc79f000
  page_owner tracks the page as allocated
  page last allocated via order 0, migratetype Movable, gfp_mask 
0x100dca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO)
   prep_new_page+0x139/0x1a0
   get_page_from_freelist+0xde9/0x14e0
   __alloc_pages_nodemask+0x18b/0x360
   alloc_pages_vma+0x7c/0x270
   __handle_mm_fault+0xd40/0x12b0
   handle_mm_fault+0xe7/0x1e0
   do_page_fault+0x2d5/0x610
   page_fault+0x2f/0x40
  page last free stack trace:
   free_pcp_prepare+0x11e/0x1c0
   free_unref_page_list+0x71/0x180
   release_pages+0x31e/0x480
   tlb_flush_mmu+0x44/0x150
   tlb_finish_mmu+0x3d/0x70
   exit_mmap+0xdd/0x1a0
   mmput+0x70/0x140
   do_exit+0x33f/0xc40
   do_group_exit+0x3a/0xa0
   __x64_sys_exit_group+0x14/0x20
   do_syscall_64+0x48/0x130
   entry_SYSCALL_64_after_hwframe+0x44/0xa9

Signed-off-by: Konstantin Khlebnikov 
---
  Documentation/admin-guide/mm/pagemap.rst |3 +++
  Documentation/vm/page_owner.rst  |   10 ++
  mm/debug.c   |   27 +++
  3 files changed, 40 insertions(+)

diff --git a/Documentation/admin-guide/mm/pagemap.rst 
b/Documentation/admin-guide/mm/pagemap.rst
index 340a5aee9b80..663ad5490d72 100644
--- a/Documentation/admin-guide/mm/pagemap.rst
+++ b/Documentation/admin-guide/mm/pagemap.rst
@@ -205,3 +205,6 @@ Before Linux 3.11 pagemap bits 55-60 were used for 
"page-shift" (which is
  always 12 at most architectures). Since Linux 3.11 their meaning changes
  after first clear of soft-dirty bits. Since Linux 4.2 they are used for
  flags unconditionally.
+
+Page state could be dumped into kernel log by writing pfn in text form
+into /sys/kernel/debug/dump_page.
diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst
index 0ed5ab8c7ab4..d4d4dc64c19d 100644
--- a/Documentation/vm/page_owner.rst
+++ b/Documentation/vm/page_owner.rst
@@ -88,3 +88,13 @@ Usage
  
 See the result about who allocated each page

 in the ``sorted_page_owner.txt``.
+
+Notes
+=
+
+To lookup pages in file cache or mapped in process you could use interface
+pagemap documented in Documentation/admin-guide/mm/pagemap.rst or tool
+page-types in the tools/vm directory.
+
+Page state could be dumped into kernel log by writing pfn in text form
+into /sys/kernel/debug/dump_page.
diff --git a/mm/debug.c b/mm/debug.c
index 2189357f0987..5803f2b63d95 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -14,6 +14,7 @@
  #include 
  #include 
  #include 
+#include 
  
  #include "internal.h"
  
@@ -147,6 +148,32 @@ void dump_page(struct page *page, const char *reason)

  }
  EXPORT_SYMBOL(dump_page);
  
+#ifdef CONFIG_DEBUG_FS

+st

[PATCH] mm: dump_page: add debugfs file for dumping page state by pfn

2020-05-25 Thread Konstantin Khlebnikov
The 'page-types' tool can list pages mapped by a process or file cache pages,
but it shows only the limited amount of state exported via procfs.

Let's employ the existing helper dump_page() to reach the remaining information:
writing a pfn into /sys/kernel/debug/dump_page dumps its state into the kernel log.

# echo 0x37c43c > /sys/kernel/debug/dump_page
# dmesg | tail -6
 page:cb0b0df10f00 refcount:1 mapcount:0 mapping:7755d3d9 index:0x30
 0xae4239e0 name:"libGeoIP.so.1.6.9"
 flags: 0x2020014(uptodate|lru|mappedtodisk)
 raw: 02020014 cb0b187fd288 cb0b189e6248 9528a04afe10
 raw: 0030  0001 
 page dumped because: debugfs request

With CONFIG_PAGE_OWNER=y shows also stacks for last page alloc and free:

 page:ea0018fff480 refcount:1 mapcount:1 mapping: 
index:0x7f9f28f62
 anon flags: 0x1080034(uptodate|lru|active|swapbacked)
 raw: 01080034 ea00184140c8 ea0018517d88 8886076ba161
 raw: 0007f9f28f62  0001 888bfc79f000
 page dumped because: debugfs request
 page->mem_cgroup:888bfc79f000
 page_owner tracks the page as allocated
 page last allocated via order 0, migratetype Movable, gfp_mask 
0x100dca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO)
  prep_new_page+0x139/0x1a0
  get_page_from_freelist+0xde9/0x14e0
  __alloc_pages_nodemask+0x18b/0x360
  alloc_pages_vma+0x7c/0x270
  __handle_mm_fault+0xd40/0x12b0
  handle_mm_fault+0xe7/0x1e0
  do_page_fault+0x2d5/0x610
  page_fault+0x2f/0x40
 page last free stack trace:
  free_pcp_prepare+0x11e/0x1c0
  free_unref_page_list+0x71/0x180
  release_pages+0x31e/0x480
  tlb_flush_mmu+0x44/0x150
  tlb_finish_mmu+0x3d/0x70
  exit_mmap+0xdd/0x1a0
  mmput+0x70/0x140
  do_exit+0x33f/0xc40
  do_group_exit+0x3a/0xa0
  __x64_sys_exit_group+0x14/0x20
  do_syscall_64+0x48/0x130
  entry_SYSCALL_64_after_hwframe+0x44/0xa9

Signed-off-by: Konstantin Khlebnikov 
---
 Documentation/admin-guide/mm/pagemap.rst |3 +++
 Documentation/vm/page_owner.rst  |   10 ++
 mm/debug.c   |   27 +++
 3 files changed, 40 insertions(+)

diff --git a/Documentation/admin-guide/mm/pagemap.rst 
b/Documentation/admin-guide/mm/pagemap.rst
index 340a5aee9b80..663ad5490d72 100644
--- a/Documentation/admin-guide/mm/pagemap.rst
+++ b/Documentation/admin-guide/mm/pagemap.rst
@@ -205,3 +205,6 @@ Before Linux 3.11 pagemap bits 55-60 were used for 
"page-shift" (which is
 always 12 at most architectures). Since Linux 3.11 their meaning changes
 after first clear of soft-dirty bits. Since Linux 4.2 they are used for
 flags unconditionally.
+
+Page state could be dumped into kernel log by writing pfn in text form
+into /sys/kernel/debug/dump_page.
diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst
index 0ed5ab8c7ab4..d4d4dc64c19d 100644
--- a/Documentation/vm/page_owner.rst
+++ b/Documentation/vm/page_owner.rst
@@ -88,3 +88,13 @@ Usage
 
See the result about who allocated each page
in the ``sorted_page_owner.txt``.
+
+Notes
+=
+
+To lookup pages in file cache or mapped in process you could use interface
+pagemap documented in Documentation/admin-guide/mm/pagemap.rst or tool
+page-types in the tools/vm directory.
+
+Page state could be dumped into kernel log by writing pfn in text form
+into /sys/kernel/debug/dump_page.
diff --git a/mm/debug.c b/mm/debug.c
index 2189357f0987..5803f2b63d95 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "internal.h"
 
@@ -147,6 +148,32 @@ void dump_page(struct page *page, const char *reason)
 }
 EXPORT_SYMBOL(dump_page);
 
+#ifdef CONFIG_DEBUG_FS
+static int dump_page_set(void *data, u64 pfn)
+{
+   struct page *page;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   page = pfn_to_online_page(pfn);
+   if (!page)
+   return -ENXIO;
+
+   dump_page(page, "debugfs request");
+   return 0;
+}
+DEFINE_DEBUGFS_ATTRIBUTE(dump_page_fops, NULL, dump_page_set, "%llx\n");
+
+static int __init dump_page_debugfs(void)
+{
+   debugfs_create_file_unsafe("dump_page", 0200, NULL, NULL,
+  &dump_page_fops);
+   return 0;
+}
+late_initcall(dump_page_debugfs);
+#endif /* CONFIG_DEBUG_FS */
+
 #ifdef CONFIG_DEBUG_VM
 
 void dump_vma(const struct vm_area_struct *vma)



Re: block I/O accounting improvements

2020-05-25 Thread Konstantin Khlebnikov

On 25/05/2020 14.29, Christoph Hellwig wrote:

Hi Jens,

this series contains various improvements for block I/O accounting.  The
first bunch of patches switches the bio based drivers to better accounting
helpers compared to the current mess.  The end contains a fix and various
performance improvements.  Most of this comes from a series Konstantin
sent a few weeks ago, rebased on changes that landed in your tree since
and my change to always use the percpu version of the disk stats.



Thanks for picking this up.

One note about a possible further improvement is in the reply to the first patch.

Reviewed-by: Konstantin Khlebnikov 


Re: [PATCH 01/16] block: add disk/bio-based accounting helpers

2020-05-25 Thread Konstantin Khlebnikov

On 25/05/2020 14.29, Christoph Hellwig wrote:

Add two new helpers to simplify I/O accounting for bio based drivers.
Currently these drivers use the generic_start_io_acct and
generic_end_io_acct helpers which have very cumbersome calling
conventions, don't actually return the time they started accounting,
and try to deal with accounting for partitions, which can't happen
for bio based drivers.  The new helpers will be used to subsequently
replace uses of the old helpers.

The main functions are the bio based wrappers in blkdev.h, but for zram,
which wants to account rw_page based I/O, lower level routines are
provided as well.

Signed-off-by: Christoph Hellwig 
---
  block/blk-core.c   | 34 ++
  include/linux/blkdev.h | 26 ++
  2 files changed, 60 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index 77e57c2e8d602..8973104f88d90 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1432,6 +1432,40 @@ void blk_account_io_start(struct request *rq, bool 
new_io)
part_stat_unlock();
  }
  
+unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,

+   unsigned int op)
+{
+   struct hd_struct *part = &disk->part0;
+   const int sgrp = op_stat_group(op);
+   unsigned long now = READ_ONCE(jiffies);
+
+   part_stat_lock();
+   update_io_ticks(part, now, false);
+   part_stat_inc(part, ios[sgrp]);
+   part_stat_add(part, sectors[sgrp], sectors);
+   part_stat_local_inc(part, in_flight[op_is_write(op)]);
+   part_stat_unlock();
+
+   return now;
+}
+EXPORT_SYMBOL(disk_start_io_acct);
+
+void disk_end_io_acct(struct gendisk *disk, unsigned int op,
+   unsigned long start_time)
+{
+   struct hd_struct *part = &disk->part0;
+   const int sgrp = op_stat_group(op);
+   unsigned long now = READ_ONCE(jiffies);
+   unsigned long duration = now - start_time;


I think it would be better to leave this jiffies legacy nonsense in the
callers and pass the request duration here in nanoseconds.

Then rewriting them to nanoseconds later wouldn't touch the generic code.


+
+   part_stat_lock();
+   update_io_ticks(part, now, true);
+   part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
+   part_stat_local_dec(part, in_flight[op_is_write(op)]);
+   part_stat_unlock();
+}
+EXPORT_SYMBOL(disk_end_io_acct);
+
  /*
   * Steal bios from a request and add them to a bio list.
   * The request must not have been partially completed before.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7d10f4e632325..76d01a8a13b80 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1892,4 +1892,30 @@ static inline void blk_wake_io_task(struct task_struct 
*waiter)
wake_up_process(waiter);
  }
  
+unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,

+   unsigned int op);
+void disk_end_io_acct(struct gendisk *disk, unsigned int op,
+   unsigned long start_time);
+
+/**
+ * bio_start_io_acct - start I/O accounting for bio based drivers
+ * @bio:   bio to start account for
+ *
+ * Returns the start time that should be passed back to bio_end_io_acct().
+ */
+static inline unsigned long bio_start_io_acct(struct bio *bio)
+{
+   return disk_start_io_acct(bio->bi_disk, bio_sectors(bio), bio_op(bio));
+}
+
+/**
+ * bio_end_io_acct - end I/O accounting for bio based drivers
+ * @bio:   bio to end account for
+ * @start: start time returned by bio_start_io_acct()
+ */
+static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time)
+{
+   return disk_end_io_acct(bio->bi_disk, bio_op(bio), start_time);
+}
+
  #endif
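
A minimal sketch of the nanosecond-duration variant suggested in the inline
comment above (purely illustrative, not part of this series;
disk_end_io_acct_ns() is a made-up name): the caller records ktime_get_ns()
when the I/O starts and passes the finished duration, so the generic helper
never touches jiffies.

    void disk_end_io_acct_ns(struct gendisk *disk, unsigned int op,
                             u64 duration_ns)
    {
            struct hd_struct *part = &disk->part0;
            const int sgrp = op_stat_group(op);

            part_stat_lock();
            update_io_ticks(part, READ_ONCE(jiffies), true);
            part_stat_add(part, nsecs[sgrp], duration_ns);
            part_stat_local_dec(part, in_flight[op_is_write(op)]);
            part_stat_unlock();
    }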



[PATCH v2] mm: remove VM_BUG_ON(PageSlab()) from page_mapcount()

2020-05-24 Thread Konstantin Khlebnikov
Replace the superfluous VM_BUG_ON() with a comment about correct usage.

Technically this reverts commit 1d148e218a0d0566b1c06f2f45f1436d53b049b2
("mm: add VM_BUG_ON_PAGE() to page_mapcount()"), but the context has changed.

Function isolate_migratepages_block() runs some checks outside of the lru_lock
when choosing pages for migration. After checking PageLRU() it checks for extra
page references by comparing page_count() and page_mapcount(). Between
these two checks the page could be removed from the LRU, freed and taken by slab.

As a result this race triggers VM_BUG_ON(PageSlab()) in page_mapcount().
The race window is tiny. For a certain workload this happens around once a year.


 page:ea0105ca9380 count:1 mapcount:0 mapping:88ff7712c180 index:0x0 
compound_mapcount: 0
 flags: 0x5008100(slab|head)
 raw: 05008100 dead0100 dead0200 88ff7712c180
 raw:  80200020 0001 
 page dumped because: VM_BUG_ON_PAGE(PageSlab(page))
 [ cut here ]
 kernel BUG at ./include/linux/mm.h:628!
 invalid opcode:  [#1] SMP NOPTI
 CPU: 77 PID: 504 Comm: kcompactd1 Tainted: GW 4.19.109-27 #1
 Hardware name: Yandex T175-N41-Y3N/MY81-EX0-Y3N, BIOS R05 06/20/2019
 RIP: 0010:isolate_migratepages_block+0x986/0x9b0


The code in isolate_migratepages_block() was added in commit 119d6d59dcc0
("mm, compaction: avoid isolating pinned pages") before the VM_BUG_ON was added
into page_mapcount().

This race was predicted in 2015 by Vlastimil Babka (see the link below).

Signed-off-by: Konstantin Khlebnikov 
Fixes: 1d148e218a0d ("mm: add VM_BUG_ON_PAGE() to page_mapcount()")
Link: https://lore.kernel.org/lkml/557710e1.6060...@suse.cz/
Link: 
https://lore.kernel.org/linux-mm/158937872515.474360.5066096871639561424.stgit@buzz/T/
 (v1)
---
 include/linux/mm.h |   14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5a323422d783..95f777f482ac 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -782,6 +782,11 @@ static inline void *kvcalloc(size_t n, size_t size, gfp_t 
flags)
 
 extern void kvfree(const void *addr);
 
+/*
+ * Mapcount of compound page as a whole, does not include mapped sub-pages.
+ *
+ * Must be called only for compound pages or any of their tail sub-pages.
+ */
 static inline int compound_mapcount(struct page *page)
 {
VM_BUG_ON_PAGE(!PageCompound(page), page);
@@ -801,10 +806,15 @@ static inline void page_mapcount_reset(struct page *page)
 
 int __page_mapcount(struct page *page);
 
+/*
+ * Mapcount of a 0-order page; for a compound sub-page it includes compound_mapcount().
+ *
+ * The result is undefined for pages which cannot be mapped into userspace,
+ * for example SLAB pages or pages with a special type (see page_has_type()):
+ * they use this field in struct page differently.
+ */
 static inline int page_mapcount(struct page *page)
 {
-   VM_BUG_ON_PAGE(PageSlab(page), page);
-
if (unlikely(PageCompound(page)))
return __page_mapcount(page);
return atomic_read(&page->_mapcount) + 1;
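
To illustrate the comment above, a hypothetical caller that needs a meaningful
mapcount could filter such pages itself (sketch only, not part of the patch;
mapcount_or_zero() is a made-up name):

static inline int mapcount_or_zero(struct page *page)
{
	if (PageSlab(page) || page_has_type(page))
		return 0;	/* _mapcount is reused for other purposes here */
	return page_mapcount(page);
}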



Re: [PATCH] mm/compaction: avoid VM_BUG_ON(PageSlab()) in page_mapcount()

2020-05-24 Thread Konstantin Khlebnikov

On 24/05/2020 04.01, Hugh Dickins wrote:

On Wed, 13 May 2020, Konstantin Khlebnikov wrote:


Function isolate_migratepages_block() runs some checks out of lru_lock
when choosing pages for migration. After checking PageLRU() it checks extra
page references by comparing page_count() and page_mapcount(). Between
these two checks the page could be removed from the lru, freed and taken by slab.

As a result this race triggers VM_BUG_ON(PageSlab()) in page_mapcount().
The race window is tiny. For certain workloads this happens around once a year.


Around once a year, that was my guess too. I have no record of us ever
hitting this, but yes it could happen when you have CONFIG_DEBUG_VM=y
(which I too like to run with, but would not recommend for users).


Yep, but on a large cluster with a pinpointed workload this happens surprisingly
often =) I only believed in this race after seeing the statistics for the
count of compactions and how it correlates with incidents.

Probably the key component is a slab allocation from network irq/bh context
which interrupts compaction exactly at this spot.






  page:ea0105ca9380 count:1 mapcount:0 mapping:88ff7712c180 index:0x0 
compound_mapcount: 0
  flags: 0x5008100(slab|head)
  raw: 05008100 dead0100 dead0200 88ff7712c180
  raw:  80200020 0001 
  page dumped because: VM_BUG_ON_PAGE(PageSlab(page))
  [ cut here ]
  kernel BUG at ./include/linux/mm.h:628!
  invalid opcode:  [#1] SMP NOPTI
  CPU: 77 PID: 504 Comm: kcompactd1 Tainted: GW 4.19.109-27 #1
  Hardware name: Yandex T175-N41-Y3N/MY81-EX0-Y3N, BIOS R05 06/20/2019
  RIP: 0010:isolate_migratepages_block+0x986/0x9b0


To fix, just open-code page_mapcount() in the racy check for the 0-order case
and recheck carefully under lru_lock, where the page cannot escape from the lru.

Also check extra references for file pages and swap cache.

Signed-off-by: Konstantin Khlebnikov 
Fixes: 119d6d59dcc0 ("mm, compaction: avoid isolating pinned pages")


Not really, that commit was correct at the time it went in.


Fixes: 1d148e218a0d ("mm: add VM_BUG_ON_PAGE() to page_mapcount()")


Exactly, that commit was well-intentioned, but did not allow for this
(admittedly very exceptional) usage.  How many developers actually
make the mistake of applying page_mapcount() to their slab pages?
None, I expect.  That VM_BUG_ON_PAGE() is there for documentation,
and could just be replaced by a comment - and Linus would be happy
with that.


Ok, I'll redo the fix in this way.




---
  mm/compaction.c |   17 +
  1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 46f0fcc93081..91bb87fd9420 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -935,12 +935,16 @@ isolate_migratepages_block(struct compact_control *cc, 
unsigned long low_pfn,
}
  
  		/*

-* Migration will fail if an anonymous page is pinned in memory,
+* Migration will fail if a page is pinned in memory,
 * so avoid taking lru_lock and isolating it unnecessarily in an
-* admittedly racy check.
+* admittedly racy check, simplest case for 0-order pages.
+*
+* Open code page_mapcount() to avoid VM_BUG_ON(PageSlab(page)).


But open coding page_mapcount() is not all that you did.  You have
(understandably) chosen to avoid calling page_mapping(page), but...


+* Page could have extra reference from mapping or swap cache.
 */
-   if (!page_mapping(page) &&
-   page_count(page) > page_mapcount(page))
+   if (!PageCompound(page) &&
+   page_count(page) > atomic_read(&page->_mapcount) + 1 +
+   (!PageAnon(page) || PageSwapCache(page)))
goto isolate_fail;


Isn't that test going to send all the file cache pages with buffer heads
in page->private, off to isolate_fail when they're actually great
candidates for migration?


Yes. What a shame. Adding page_has_private() could fix that?

Kind of

page_count(page) > page_mapcount(page) +
(PageAnon(page) ? PageSwapCache(page) : (1 + page_has_private(page)))

or probably something like this:

page_count(page) > page_mapcount(page) +
(PageAnon(page) ? PageSwapCache(page) : GUP_PIN_COUNTING_BIAS)

I.e. skip only file pages pinned by dma or something slower.
I see some movements in this direction in recent changes.

Of course, that's an independent matter.



Given that the actual bug spotted was with the VM_BUG_ON_PAGE(PageSlab),
and nobody has reported any crash from the use of page_mapping() there
(and we only need the test to be right most of the time: all of this
knowingly racy, as you explain in other mail): I'd go for just replacing
the VM_BUG_ON_PAGE in page_mapc

Re: [PATCH] mm/compaction: avoid VM_BUG_ON(PageSlab()) in page_mapcount()

2020-05-23 Thread Konstantin Khlebnikov
On Sat, May 23, 2020 at 4:34 AM Andrew Morton  wrote:
>
> On Wed, 13 May 2020 17:05:25 +0300 Konstantin Khlebnikov 
>  wrote:
>
> > Function isolate_migratepages_block() runs some checks out of lru_lock
> > when choose pages for migration. After checking PageLRU() it checks extra
> > page references by comparing page_count() and page_mapcount(). Between
> > these two checks page could be removed from lru, freed and taken by slab.
> >
> > As a result this race triggers VM_BUG_ON(PageSlab()) in page_mapcount().
> > Race window is tiny. For certain workload this happens around once a year.
> >
> >
> >  page:ea0105ca9380 count:1 mapcount:0 mapping:88ff7712c180 
> > index:0x0 compound_mapcount: 0
> >  flags: 0x5008100(slab|head)
> >  raw: 05008100 dead0100 dead0200 88ff7712c180
> >  raw:  80200020 0001 
> >  page dumped because: VM_BUG_ON_PAGE(PageSlab(page))
> >  [ cut here ]
> >  kernel BUG at ./include/linux/mm.h:628!
> >  invalid opcode:  [#1] SMP NOPTI
> >  CPU: 77 PID: 504 Comm: kcompactd1 Tainted: GW 4.19.109-27 
> > #1
> >  Hardware name: Yandex T175-N41-Y3N/MY81-EX0-Y3N, BIOS R05 06/20/2019
> >  RIP: 0010:isolate_migratepages_block+0x986/0x9b0
> >
> >
> > To fix just opencode page_mapcount() in racy check for 0-order case and
> > recheck carefully under lru_lock when page cannot escape from lru.
> >
> > Also add checking extra references for file pages and swap cache.
>
> I dunno, this code looks quite nasty.  I'm more thinking we should
> revert and rethink David's 119d6d59dcc0980dcd58 ("mm, compaction: avoid
> isolating pinned pages").
>
> > --- a/mm/compaction.c
> > +++ b/mm/compaction.c
> > @@ -935,12 +935,16 @@ isolate_migratepages_block(struct compact_control 
> > *cc, unsigned long low_pfn,
> >   }
> >
> >   /*
> > -  * Migration will fail if an anonymous page is pinned in 
> > memory,
> > +  * Migration will fail if a page is pinned in memory,
> >* so avoid taking lru_lock and isolating it unnecessarily in 
> > an
> > -  * admittedly racy check.
> > +  * admittedly racy check, simplest case for 0-order pages.
> > +  *
> > +  * Open code page_mapcount() to avoid 
> > VM_BUG_ON(PageSlab(page)).
> > +  * Page could have extra reference from mapping or swap cache.
> >*/
> > - if (!page_mapping(page) &&
> > - page_count(page) > page_mapcount(page))
> > + if (!PageCompound(page) &&
> > + page_count(page) > atomic_read(&page->_mapcount) + 1 +
> > + (!PageAnon(page) || PageSwapCache(page)))
> >   goto isolate_fail;
>
> OK, we happened to notice this because we happened to trigger a
> !PageSlab assertion.  But if this page has been freed and reused for
> slab, it could have been reused for *anything*?  Perhaps it was reused
> as a migratable page which we'll go ahead and migrate even though we no
> longer should.  There are various whacky parts of the kernel which
> (ab)use surprising struct page fields in surprising ways - how do we
> know it isn't one of those which now happens to look like a migratable
> page?

Here we just optimistically skip as many unwanted pages as possible.

This code rechecks PageLRU and other tests later, under lru_lock.
lru_lock blocks the freeing path, which must acquire it to remove a page from the lru.

>
> I also worry about the next test:
>
> /*
>  * Only allow to migrate anonymous pages in GFP_NOFS context
>  * because those do not depend on fs locks.
>  */
> if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
> goto isolate_fail;
>
> This page isn't PageLocked(), is it?  It could be a recycled page which
> will be getting its ->mapping set one nanosecond hence.

Yes, it's racy. I don't see how compaction rechecks this later.
So it could try to unmap and migrate a file page if it races here with recycling.

Probably nobody starts direct compaction without GFP_FS.

>
>
> >   /*
> > @@ -975,6 +979,11 @@ isolate_migratepages_block(struct compact_control *cc, 
> > unsigned long low_pfn,
> >   low_pfn += compound_nr(page) - 1;
> >   goto isolate_fail;
> >   }
> > +
> > + /* Recheck page extra references under lock */
> > + if (page_count(page) > page_mapcount(page) +
> > + (!PageAnon(page) || PageSwapCache(page)))
> > + goto isolate_fail;
> >   }
> >
> >   lruvec = mem_cgroup_page_lruvec(page, pgdat);
> >
>


Re: [PATCH v1 02/25] mm/swap: Don't abuse the seqcount latching API

2020-05-20 Thread Konstantin Khlebnikov

On 20/05/2020 00.45, Ahmed S. Darwish wrote:

Commit eef1a429f234 ("mm/swap.c: piggyback lru_add_drain_all() calls")
implemented an optimization mechanism to exit the to-be-started LRU
drain operation (name it A) if another drain operation *started and
finished* while (A) was blocked on the LRU draining mutex.

This was done through a seqcount latch, which is an abuse of its
semantics:

   1. Seqcount latching should be used for the purpose of switching
  between two storage places with sequence protection to allow
  interruptible, preemptible writer sections. The optimization
  mechanism has absolutely nothing to do with that.

   2. The used raw_write_seqcount_latch() has two smp write memory
  barriers to always insure one consistent storage place out of the
  two storage places available. This extra smp_wmb() is redundant for
  the optimization use case.

Beside the API abuse, the semantics of a latch sequence counter was
force fitted into the optimization. What was actually meant is to track
generations of LRU draining operations, where "current lru draining
generation = x" implies that all generations 0 < n <= x are already
*scheduled* for draining.

Remove the conceptually-inappropriate seqcount latch usage and manually
implement the optimization using a counter and SMP memory barriers.


Well, I thought it fits perfectly =)

Maybe it's worth adding helpers with appropriate semantics?
This is a pretty common pattern.
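
For illustration, a sketch of what such helpers could look like (this is not an
existing kernel API; names and placement are illustrative only). It captures the
"piggyback on work already scheduled by someone else" pattern used above:

struct work_gen {
	unsigned int gen;
};

/* Read the generation before taking the lock. */
static inline unsigned int work_gen_read(struct work_gen *wg)
{
	unsigned int g = READ_ONCE(wg->gen);

	smp_rmb();	/* pairs with smp_wmb() in work_gen_advance() */
	return g;
}

/* Under the lock: true if another generation was scheduled while we waited. */
static inline bool work_gen_stale(struct work_gen *wg, unsigned int seen)
{
	return READ_ONCE(wg->gen) != seen;
}

/* Under the lock, before doing the work: publish a new generation. */
static inline void work_gen_advance(struct work_gen *wg)
{
	WRITE_ONCE(wg->gen, wg->gen + 1);
	smp_wmb();	/* pairs with smp_rmb() in work_gen_read() */
}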



Link: 
https://lkml.kernel.org/r/calygnipsr-cxv9mx9czavh6wz_gzsv3h_8kpvgjbtgbjywu...@mail.gmail.com
Signed-off-by: Ahmed S. Darwish 
---
  mm/swap.c | 57 +--
  1 file changed, 47 insertions(+), 10 deletions(-)

diff --git a/mm/swap.c b/mm/swap.c
index bf9a79fed62d..d6910eeed43d 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -713,10 +713,20 @@ static void lru_add_drain_per_cpu(struct work_struct 
*dummy)
   */
  void lru_add_drain_all(void)
  {
-   static seqcount_t seqcount = SEQCNT_ZERO(seqcount);
-   static DEFINE_MUTEX(lock);
+   /*
+* lru_drain_gen - Current generation of pages that could be in vectors
+*
+* (A) Definition: lru_drain_gen = x implies that all generations
+* 0 < n <= x are already scheduled for draining.
+*
+* This is an optimization for the highly-contended use case where a
+* user space workload keeps constantly generating a flow of pages
+* for each CPU.
+*/
+   static unsigned int lru_drain_gen;
static struct cpumask has_work;
-   int cpu, seq;
+   static DEFINE_MUTEX(lock);
+   int cpu, this_gen;
  
  	/*

 * Make sure nobody triggers this path before mm_percpu_wq is fully
@@ -725,21 +735,48 @@ void lru_add_drain_all(void)
if (WARN_ON(!mm_percpu_wq))
return;
  
-	seq = raw_read_seqcount_latch(&seqcount);

+   /*
+* (B) Cache the LRU draining generation number
+*
+* smp_rmb() ensures that the counter is loaded before the mutex is
+* taken. It pairs with the smp_wmb() inside the mutex critical section
+* at (D).
+*/
+   this_gen = READ_ONCE(lru_drain_gen);
+   smp_rmb();
  
	mutex_lock(&lock);
  
  	/*

-* Piggyback on drain started and finished while we waited for lock:
-* all pages pended at the time of our enter were drained from vectors.
+* (C) Exit the draining operation if a newer generation, from another
+* lru_add_drain_all(), was already scheduled for draining. Check (A).
 */
-   if (__read_seqcount_retry(&seqcount, seq))
+   if (unlikely(this_gen != lru_drain_gen))
goto done;
  
-	raw_write_seqcount_latch(&seqcount);

+   /*
+* (D) Increment generation number
+*
+* Pairs with READ_ONCE() and smp_rmb() at (B), outside of the critical
+* section.
+*
+* This pairing must be done here, before the for_each_online_cpu loop
+* below which drains the page vectors.
+*
+* Let x, y, and z represent some system CPU numbers, where x < y < z.
+* Assume CPU #z is in the middle of the for_each_online_cpu loop
+* below and has already reached CPU #y's per-cpu data. CPU #x comes
+* along, adds some pages to its per-cpu vectors, then calls
+* lru_add_drain_all().
+*
+* If the paired smp_wmb() below is done at any later step, e.g. after
+* the loop, CPU #x will just exit at (C) and miss flushing out all of
+* its added pages.
+*/
+   WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1);
+   smp_wmb();
  
	cpumask_clear(&has_work);

-
for_each_online_cpu(cpu) {
	struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
  
@@ -766,7 +803,7 @@ void lru_add_drain_all(void)

  {
lru_add_drain();
  }
-#endif
+#endif /* CONFIG_SMP */
  
  /**

   * release_pages - batched put_page()



Re: [PATCH] futex: send SIGBUS if argument is not aligned on a four-byte boundary

2020-05-20 Thread Konstantin Khlebnikov

On 20/05/2020 01.53, Thomas Gleixner wrote:

Konstantin Khlebnikov  writes:


Userspace implementations of mutexes (including glibc) in some cases retry
the operation without checking the error code from the futex syscall.
This is good for performance because most errors are impossible when
the locking code trusts itself.


This argument is blantantly wrong. It's the justification for bad
programming. Code ignoring error returns is simply buggy.


I knew you'll love it. =)




Some errors which could come from outer code are handled automatically,
for example an invalid address triggers SIGSEGV on the atomic fast path.

But one case turns into a nasty busy-loop: when the address is unaligned,
futex(FUTEX_WAIT) returns EINVAL immediately and the loop goes to retry.


Why is that something the kernel has to care about? The kernel returns
EINVAl as documented and when user space decides to ignore then it goes
to retry for a full timeslice for nothing.

You have to come up with a better argument why we want to send a signal
here.

Along with an argument why SIGBUS is the right thing when a user space
fast path violation results in a SIGSEGV as you stated above.


SIGSEGV comes only if the address points to an unmapped area.
That case doesn't need special care, unlike a misaligned address.



Plus a patch which documents this change in the futex man page.


Yep, will do.




An example which loops inside the second call rather than hanging peacefully:

#include <pthread.h>

int main(int argc, char **argv)
{
char buf[sizeof(pthread_mutex_t) + 1];
pthread_mutex_t *mutex = (pthread_mutex_t *)(buf + 1);

pthread_mutex_init(mutex, NULL);
pthread_mutex_lock(mutex);
pthread_mutex_lock(mutex);
}


And this broken code is a kernel problem because?


That's just a distilled example of how the pthread primitives behave
when an application passes a misaligned pointer.




It seems there is no practical use for calling the futex syscall on an
unaligned address. This can only be a bug in user space. Let's help and
handle this gracefully without adding extra code on the fast path.


How does that help?

Are you going to stick SIGBUS in _ALL_ syscalls which might be retried
in a loop just because user space fails to evaluate the error code
properly?

You have to come up with a better argument to justify this change.


This particular case is important because the buggy code is in glibc,
definitely the most frequently used library ever.  Musl is buggy too.

The bug has been there for ages and it takes another decade for an update
to spread. In a modern containerized setup the kernel updates much more
frequently.

All this hides bugs: an application might recover from the loop
if another thread unlocks the mutex from the atomic fast path.
But the mutex then works as a spin-lock, wasting cpu time.

So, these few lines of code might prevent significant CO2 emission =)
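
For illustration, a hedged sketch of the userspace pattern being discussed
(this is not glibc or musl code; lock_slowpath() is a made-up name): the wait
loop ignores the futex() return value, so EINVAL from a misaligned address
turns the intended sleep into a busy retry loop.

#include <stdatomic.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

static void lock_slowpath(atomic_int *uaddr)
{
	while (atomic_exchange(uaddr, 2) != 0) {
		/* Return value ignored: with a misaligned uaddr this fails
		 * instantly with EINVAL and the loop just spins. */
		syscall(SYS_futex, uaddr, FUTEX_WAIT, 2, NULL, NULL, 0);
	}
}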




This patch sends a SIGBUS signal to slay the task and break the busy-loop.


I'm pretty sure that I asked you to read and follow documentation
before. If I did not:

  git grep 'This patch' Documentation/process/


Force of habit. =/
Definitely should be in checkpatch.pl



Thanks,

 tglx


[PATCH] futex: send SIGBUS if argument is not aligned on a four-byte boundary

2020-05-15 Thread Konstantin Khlebnikov
Userspace implementations of mutexes (including glibc) in some cases retry
the operation without checking the error code from the futex syscall.
This is good for performance because most errors are impossible when
the locking code trusts itself.

Some errors which could come from outer code are handled automatically,
for example an invalid address triggers SIGSEGV on the atomic fast path.

But one case turns into a nasty busy-loop: when the address is unaligned,
futex(FUTEX_WAIT) returns EINVAL immediately and the loop goes to retry.

An example which loops inside the second call rather than hanging peacefully:

#include <pthread.h>

int main(int argc, char **argv)
{
char buf[sizeof(pthread_mutex_t) + 1];
pthread_mutex_t *mutex = (pthread_mutex_t *)(buf + 1);

pthread_mutex_init(mutex, NULL);
pthread_mutex_lock(mutex);
pthread_mutex_lock(mutex);
}

It seems there is no practical use for calling the futex syscall on an
unaligned address. This can only be a bug in user space. Let's help
and handle this gracefully without adding extra code on the fast path.

This patch sends a SIGBUS signal to slay the task and break the busy-loop.

Signed-off-by: Konstantin Khlebnikov 
Reported-by: Maxim Samoylov 
---
 kernel/futex.c |   13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index b59532862bc0..8a6d35fa56bc 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -508,10 +508,21 @@ get_futex_key(u32 __user *uaddr, int fshared, union 
futex_key *key, enum futex_a
 
/*
 * The futex address must be "naturally" aligned.
+* Also send a signal to break the busy-loop if user space ignores the error.
+* The EFAULT case should trigger SIGSEGV at the access from user space.
 */
key->both.offset = address % PAGE_SIZE;
-   if (unlikely((address % sizeof(u32)) != 0))
+   if (unlikely((address % sizeof(u32)) != 0)) {
+   struct kernel_siginfo info;
+
+   clear_siginfo(&info);
+   info.si_signo = SIGBUS;
+   info.si_code  = BUS_ADRALN;
+   info.si_addr  = uaddr;
+   force_sig_info(&info);
+
return -EINVAL;
+   }
address -= key->both.offset;
 
if (unlikely(!access_ok(uaddr, sizeof(u32))))



Re: [PATCH] mm/compaction: avoid VM_BUG_ON(PageSlab()) in page_mapcount()

2020-05-13 Thread Konstantin Khlebnikov

On 13/05/2020 21.32, Andrew Morton wrote:

On Wed, 13 May 2020 17:05:25 +0300 Konstantin Khlebnikov 
 wrote:


Function isolate_migratepages_block() runs some checks out of lru_lock
when choosing pages for migration. After checking PageLRU() it checks extra
page references by comparing page_count() and page_mapcount(). Between
these two checks the page could be removed from the lru, freed and taken by slab.

As a result this race triggers VM_BUG_ON(PageSlab()) in page_mapcount().
The race window is tiny. For certain workloads this happens around once a year.


  page:ea0105ca9380 count:1 mapcount:0 mapping:88ff7712c180 index:0x0 
compound_mapcount: 0
  flags: 0x5008100(slab|head)
  raw: 05008100 dead0100 dead0200 88ff7712c180
  raw:  80200020 0001 
  page dumped because: VM_BUG_ON_PAGE(PageSlab(page))
  [ cut here ]
  kernel BUG at ./include/linux/mm.h:628!
  invalid opcode:  [#1] SMP NOPTI
  CPU: 77 PID: 504 Comm: kcompactd1 Tainted: GW 4.19.109-27 #1
  Hardware name: Yandex T175-N41-Y3N/MY81-EX0-Y3N, BIOS R05 06/20/2019
  RIP: 0010:isolate_migratepages_block+0x986/0x9b0


To fix, just open-code page_mapcount() in the racy check for the 0-order case
and recheck carefully under lru_lock, where the page cannot escape from the lru.

Also check extra references for file pages and swap cache.


It sounds like a cc:stable is appropriate?


Yep, but probably I'm missing something.

It seems the bug has been there for a long time and nobody has seen it.
Am I the only one using CONFIG_DEBUG_VM=y everywhere? =)




--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -935,12 +935,16 @@ isolate_migratepages_block(struct compact_control *cc, 
unsigned long low_pfn,
}
  
  		/*

-* Migration will fail if an anonymous page is pinned in memory,
+* Migration will fail if a page is pinned in memory,
 * so avoid taking lru_lock and isolating it unnecessarily in an
-* admittedly racy check.
+* admittedly racy check, simplest case for 0-order pages.
+*
+* Open code page_mapcount() to avoid VM_BUG_ON(PageSlab(page)).
+* Page could have extra reference from mapping or swap cache.
 */
-   if (!page_mapping(page) &&
-   page_count(page) > page_mapcount(page))
+   if (!PageCompound(page) &&
+   page_count(page) > atomic_read(&page->_mapcount) + 1 +
+   (!PageAnon(page) || PageSwapCache(page)))
goto isolate_fail;
  
  		/*

@@ -975,6 +979,11 @@ isolate_migratepages_block(struct compact_control *cc, 
unsigned long low_pfn,
low_pfn += compound_nr(page) - 1;
goto isolate_fail;
}
+
+   /* Recheck page extra references under lock */
+   if (page_count(page) > page_mapcount(page) +
+   (!PageAnon(page) || PageSwapCache(page)))
+   goto isolate_fail;
}
  
  		lruvec = mem_cgroup_page_lruvec(page, pgdat);


[PATCH] mm/compaction: avoid VM_BUG_ON(PageSlab()) in page_mapcount()

2020-05-13 Thread Konstantin Khlebnikov
Function isolate_migratepages_block() runs some checks out of lru_lock
when choosing pages for migration. After checking PageLRU() it checks extra
page references by comparing page_count() and page_mapcount(). Between
these two checks the page could be removed from the lru, freed and taken by slab.

As a result this race triggers VM_BUG_ON(PageSlab()) in page_mapcount().
The race window is tiny. For certain workloads this happens around once a year.


 page:ea0105ca9380 count:1 mapcount:0 mapping:88ff7712c180 index:0x0 
compound_mapcount: 0
 flags: 0x5008100(slab|head)
 raw: 05008100 dead0100 dead0200 88ff7712c180
 raw:  80200020 0001 
 page dumped because: VM_BUG_ON_PAGE(PageSlab(page))
 [ cut here ]
 kernel BUG at ./include/linux/mm.h:628!
 invalid opcode:  [#1] SMP NOPTI
 CPU: 77 PID: 504 Comm: kcompactd1 Tainted: GW 4.19.109-27 #1
 Hardware name: Yandex T175-N41-Y3N/MY81-EX0-Y3N, BIOS R05 06/20/2019
 RIP: 0010:isolate_migratepages_block+0x986/0x9b0


To fix, just open-code page_mapcount() in the racy check for the 0-order case
and recheck carefully under lru_lock, where the page cannot escape from the lru.

Also check extra references for file pages and swap cache.

Signed-off-by: Konstantin Khlebnikov 
Fixes: 119d6d59dcc0 ("mm, compaction: avoid isolating pinned pages")
Fixes: 1d148e218a0d ("mm: add VM_BUG_ON_PAGE() to page_mapcount()")
---
 mm/compaction.c |   17 +
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 46f0fcc93081..91bb87fd9420 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -935,12 +935,16 @@ isolate_migratepages_block(struct compact_control *cc, 
unsigned long low_pfn,
}
 
/*
-* Migration will fail if an anonymous page is pinned in memory,
+* Migration will fail if a page is pinned in memory,
 * so avoid taking lru_lock and isolating it unnecessarily in an
-* admittedly racy check.
+* admittedly racy check, simplest case for 0-order pages.
+*
+* Open code page_mapcount() to avoid VM_BUG_ON(PageSlab(page)).
+* Page could have extra reference from mapping or swap cache.
 */
-   if (!page_mapping(page) &&
-   page_count(page) > page_mapcount(page))
+   if (!PageCompound(page) &&
+   page_count(page) > atomic_read(&page->_mapcount) + 1 +
+   (!PageAnon(page) || PageSwapCache(page)))
goto isolate_fail;
 
/*
@@ -975,6 +979,11 @@ isolate_migratepages_block(struct compact_control *cc, 
unsigned long low_pfn,
low_pfn += compound_nr(page) - 1;
goto isolate_fail;
}
+
+   /* Recheck page extra references under lock */
+   if (page_count(page) > page_mapcount(page) +
+   (!PageAnon(page) || PageSwapCache(page)))
+   goto isolate_fail;
}
 
lruvec = mem_cgroup_page_lruvec(page, pgdat);



Re: [PATCH RFC tip/core/rcu] Add shrinker to shift to fast/inefficient GP mode

2020-05-12 Thread Konstantin Khlebnikov

On 13/05/2020 06.18, Paul E. McKenney wrote:

On Wed, May 13, 2020 at 11:32:38AM +1000, Dave Chinner wrote:

On Sat, May 09, 2020 at 09:09:00AM -0700, Paul E. McKenney wrote:

On Sat, May 09, 2020 at 11:54:40AM +0300, Konstantin Khlebnikov wrote:

On 08/05/2020 17.46, Paul E. McKenney wrote:

Easy for me to provide "start fast and inefficient mode" and "stop fast
and inefficient mode" APIs for MM to call!

How about rcu_mempressure_start() and rcu_mempressure_end()?  I would
expect them not to nest (as in if you need them to nest, please let
me know).  I would not expect these to be invoked all that often (as in
if you do need them to be fast and scalable, please let me know). >
RCU would then be in fast/inefficient mode if either MM told it to be
or if RCU had detected callback overload on at least one CPU.

Seem reasonable?


Not exactly nested calls, but kswapd threads are per numa node.
So, at some level nodes under pressure must be counted.


Easy enough, especially given that RCU already "counts" CPUs having
excessive numbers of callbacks.  But assuming that the transitions to/from
OOM are rare, I would start by just counting them with a global counter.
If the counter is non-zero, RCU is in fast and inefficient mode.


Also forcing rcu calls only for cpus in one numa node might be useful.


Interesting.  RCU currently evaluates a given CPU by comparing the
number of callbacks against a fixed cutoff that can be set at boot using
rcutree.qhimark, which defaults to 10,000.  When this cutoff is exceeded,
RCU becomes more aggressive about invoking callbacks on that CPU, for
example, by sacrificing some degree of real-time response.  I believe
that this heuristic would also serve the OOM use case well.


So one of the things that I'm not sure people have connected here is
that memory reclaim done by shrinkers is one of the things that
drives huge numbers of call_rcu() callbacks to free memory via rcu.
If we are reclaiming dentries and inodes, then we can be pushing
thousands to hundreds of thousands of objects into kfree_rcu()
and/or direct call_rcu() calls to free these objects in a single
reclaim pass.


Good point!


Indeed




Hence the trigger for RCU going into "excessive callback" mode
might, in fact, be kswapd running a pass over the shrinkers. i.e.
memory reclaim itself can be responsible for pushing RCU into this "OOM
pressure" situation.

So perhaps we've missed a trick here by not having the memory
reclaim routines trigger RCU callbacks at the end of a priority
scan. The shrinkers have queued the objects for freeing, but they
haven't actually been freed yet and so things like slab pages
haven't actually been returned to the free pool even though the
shrinkers have said "freed this many objects"...

i.e. perhaps the right solution here is a "rcu_run_callbacks()"
function that memory reclaim calls before backing off and/or winding
up reclaim priority.


It would not be hard to make something that put RCU into fast/inefficient
mode for a couple of grace periods.  I will also look into the possibility
of speeding up callback invocation.

It might also make sense to put RCU grace periods into fast mode while
running the shrinkers that are freeing dentries and inodes.  However,
kbuild test robot reports ugly regressions when putting RCU into
fast/inefficient mode to quickly and too often.  As in 78.5% degradation
on one of the benchmarks.


I think fast/inefficient mode here is just an optimization for freeing
memory faster. It doesn't solve the problem itself.

First we have to close the loop in the reclaimer and actually wait for or run
rcu callbacks which might free memory, before increasing priority and
invoking the OOM killer.




I wonder if direct-reclaim should at some stage simply wait for RCU QS.
I.e. call rcu_barrier() or similar somewhere before invoking OOM.


The rcu_oom_count() function in the patch starting this thread returns the
total number of outstanding callbacks queued on all CPUs.  So one approach
would be to invoke this function, and if the return value was truly
huge (taking size of memory and who knows that all else into account),
do the rcu_barrier() to wait for RCU to clear its current backlog.
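
A sketch of that approach (assumptions: rcu_oom_count() is the helper from the
patch starting this thread, and the threshold and call site are purely
illustrative, not an existing kernel heuristic):

static bool reclaim_flush_rcu_backlog(void)
{
	/* "Truly huge" backlog: an arbitrary fraction of total pages. */
	if (rcu_oom_count() > totalram_pages() / 64) {
		rcu_barrier();	/* wait for all queued callbacks to run */
		return true;	/* memory may have been freed; retry reclaim */
	}
	return false;
}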


The shrinker scan control structure has a node mask in it to
indicate what node (and hence CPUs) it should be reclaiming from.
This information comes from the main reclaim scan routine, so it
would be trivial to feed straight into the RCU code to have it
act on just the CPUs/node that we are reclaiming memory from...


For the callbacks, RCU can operate on CPUs, in theory anyway.  The
grace period itself, however, is inherently global.


On the NUMA point, it would be dead easy for me to supply a function
that returned the number of callbacks on a given CPU, which would allow
you to similarly evaluate a NUMA node, a cgroup, or whatever.


I'd think it runs the other way around - we optimistically call the
RCU layer to

Re: [PATCH] doc: cgroup: update note about conditions when oom killer is invoked

2020-05-11 Thread Konstantin Khlebnikov




On 11/05/2020 11.39, Michal Hocko wrote:

On Fri 08-05-20 17:16:29, Konstantin Khlebnikov wrote:

Starting from v4.19 commit 29ef680ae7c2 ("memcg, oom: move out_of_memory
back to the charge path") cgroup oom killer is no longer invoked only from
page faults. Now it implements the same semantics as global OOM killer:
allocation context invokes OOM killer and keeps retrying until success.

Signed-off-by: Konstantin Khlebnikov 


Acked-by: Michal Hocko 


---
  Documentation/admin-guide/cgroup-v2.rst |   17 -
  1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst 
b/Documentation/admin-guide/cgroup-v2.rst
index bcc80269bb6a..1bb9a8f6ebe1 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1172,6 +1172,13 @@ PAGE_SIZE multiple when read back.
Under certain circumstances, the usage may go over the limit
temporarily.
  
+	In the default configuration a regular 0-order allocation always
+	succeeds unless the OOM killer chooses the current task as a victim.
+
+	Some kinds of allocations don't invoke the OOM killer.
+	The caller could retry them differently, return them to userspace
+	as -ENOMEM, or silently ignore them, as in the case of disk readahead.


I would probably add -EFAULT but the less error codes we document the
better.


Yeah, EFAULT was the most obscure result of memory shortage.
Fortunately, with the new behaviour this shouldn't happen a lot.

Actually, where is it still possible? THP always falls back to 0-order.
I mean EFAULT could appear inside the kernel only if the task is killed,
so nobody would see it.




+
This is the ultimate protection mechanism.  As long as the
high limit is used and monitored properly, this limit's
utility is limited to providing the final safety net.
@@ -1228,17 +1235,9 @@ PAGE_SIZE multiple when read back.
The number of time the cgroup's memory usage was
reached the limit and allocation was about to fail.
  
-		Depending on context result could be invocation of OOM

-   killer and retrying allocation or failing allocation.
-
-   Failed allocation in its turn could be returned into
-   userspace as -ENOMEM or silently ignored in cases like
-   disk readahead.  For now OOM in memory cgroup kills
-   tasks iff shortage has happened inside page fault.
-
This event is not raised if the OOM killer is not
considered as an option, e.g. for failed high-order
-   allocations.
+   allocations or if caller asked to not retry attempts.
  
  	  oom_kill

The number of processes belonging to this cgroup




[PATCH] f2fs: report delalloc reserve as non-free in statfs for project quota

2020-05-11 Thread Konstantin Khlebnikov
This reserved space isn't committed yet, but it cannot be used for
allocations. For userspace there is no difference from used space.

See the same fix in ext4 commit f06925c73942 ("ext4: report delalloc
reserve as non-free in statfs for project quota").

Signed-off-by: Konstantin Khlebnikov 
Fixes: ddc34e328d06 ("f2fs: introduce f2fs_statfs_project")
---
 fs/f2fs/super.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index f2dfc21c6abb..c5e8cb31626f 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1292,7 +1292,8 @@ static int f2fs_statfs_project(struct super_block *sb,
limit >>= sb->s_blocksize_bits;
 
if (limit && buf->f_blocks > limit) {
-   curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits;
+   curblock = (dquot->dq_dqb.dqb_curspace +
+   dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
buf->f_blocks = limit;
buf->f_bfree = buf->f_bavail =
(buf->f_blocks > curblock) ?
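
For illustration with made-up numbers: if the project limit is 1000 blocks,
600 blocks are already committed and 200 blocks are delalloc-reserved, statfs
now reports f_bfree = 1000 - (600 + 200) = 200 instead of the previous 400,
which matches what userspace can actually allocate.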



Re: [PATCH RFC tip/core/rcu] Add shrinker to shift to fast/inefficient GP mode

2020-05-09 Thread Konstantin Khlebnikov

On 08/05/2020 17.46, Paul E. McKenney wrote:

On Fri, May 08, 2020 at 12:00:28PM +0300, Konstantin Khlebnikov wrote:

On 07/05/2020 22.09, Paul E. McKenney wrote:

On Thu, May 07, 2020 at 02:31:02PM -0400, Johannes Weiner wrote:

On Thu, May 07, 2020 at 10:09:03AM -0700, Paul E. McKenney wrote:

On Thu, May 07, 2020 at 01:00:06PM -0400, Johannes Weiner wrote:

On Wed, May 06, 2020 at 05:55:35PM -0700, Andrew Morton wrote:

On Wed, 6 May 2020 17:42:40 -0700 "Paul E. McKenney"  wrote:


This commit adds a shrinker so as to inform RCU when memory is scarce.
RCU responds by shifting into the same fast and inefficient mode that is
used in the presence of excessive numbers of RCU callbacks.  RCU remains
in this state for one-tenth of a second, though this time window can be
extended by another call to the shrinker.


We may be able to use shrinkers here, but merely being invoked does
not carry a reliable distress signal.

Shrinkers get invoked whenever vmscan runs. It's a useful indicator
for when to age an auxiliary LRU list - test references, clear and
rotate or reclaim stale entries. The urgency, and what can and cannot
be considered "stale", is encoded in the callback frequency and scan
counts, and meant to be relative to the VM's own rate of aging: "I've
tested X percent of mine for recent use, now you go and test the same
share of your pool." It doesn't translate well to other
interpretations of the callbacks, although people have tried.
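
For reference, a minimal shrinker skeleton illustrating the contract described
above: count_objects() reports the pool size, scan_objects() ages or frees the
share the VM asks for. Purely illustrative; my_pool_size() and my_evict() are
made-up helpers.

static unsigned long my_count(struct shrinker *s, struct shrink_control *sc)
{
	return my_pool_size();		/* 0 means "nothing to do this pass" */
}

static unsigned long my_scan(struct shrinker *s, struct shrink_control *sc)
{
	/* sc->nr_to_scan is the VM's share for this pass */
	return my_evict(sc->nr_to_scan);	/* number of objects actually freed */
}

static struct shrinker my_shrinker = {
	.count_objects = my_count,
	.scan_objects  = my_scan,
	.seeks = DEFAULT_SEEKS,
};
/* register_shrinker(&my_shrinker) at init time */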


Would it make sense for RCU to interpret two invocations within (say)
100ms of each other as indicating urgency?  (Hey, I had to ask!)


It's the perfect number for one combination of CPU, storage device,
and shrinker implementation :-)


Woo-hoo!!!

But is that one combination actually in use anywhere?  ;-)


If it proves feasible, a later commit might add a function call directly
indicating the end of the period of scarce memory.


(Cc David Chinner, who often has opinions on shrinkers ;))

It's a bit abusive of the intent of the slab shrinkers, but I don't
immediately see a problem with it.  Always returning 0 from
->scan_objects might cause a problem in some situations(?).

Perhaps we should have a formal "system getting low on memory, please
do something" notification API.


It's tricky to find a useful definition of what low on memory
means. In the past we've used sc->priority cutoffs, the vmpressure
interface (reclaimed/scanned - reclaim efficiency cutoffs), oom
notifiers (another reclaim efficiency cutoff). But none of these
reliably capture "distress", and they vary highly between different
hardware setups. It can be hard to trigger OOM itself on fast IO
devices, even when the machine is way past useful (where useful is
somewhat subjective to the user). Userspace OOM implementations that
consider userspace health (also subjective) are getting more common.


How significant is this?  How much memory can RCU consume?


I think if rcu can end up consuming a significant share of memory, one
way that may work would be to do proper shrinker integration and track
the age of its objects relative to the age of other allocations in the
system. I.e. toss them all on a clock list with "new" bits and shrink
them at VM velocity. If the shrinker sees objects with new bit set,
clear and rotate. If it sees objects without them, we know rcu_heads
outlive cache pages etc. and should probably cycle faster too.


It would be easy for RCU to pass back (or otherwise use) the age of the
current grace period, if that would help.

Tracking the age of individual callbacks is out of the question due to
memory overhead, but RCU could approximate this via statistical sampling.
Comparing this to grace-period durations could give information as to
whether making grace periods go faster would be helpful.


That makes sense.

So RCU knows the time and the VM knows the amount of memory. Either
RCU needs to figure out its memory component to be able to translate
shrinker input to age, or the VM needs to learn about time to be able
to say: I'm currently scanning memory older than timestamp X.

The latter would also require sampling in the VM. Nose goes. :-)


Sounds about right.  ;-)

Does reclaim have any notion of having continuously scanned for
longer than some amount of time?  Or could RCU reasonably deduce this?
For example, if RCU noticed that reclaim had been scanning for longer than
(say) five grace periods, RCU might decide to speed things up.

But on the other hand, with slow disks, reclaim might go on for tens of
seconds even without much in the way of memory pressure, mightn't it?

I suppose that another indicator would be recent NULL returns from
allocators.  But that indicator flashes a bit later than one would like,
doesn't it?  And has false positives when allocators are invoked from
atomic contexts, no doubt.  And no doubt similar for sleeping more than
a certain length of time in an allocator.


There actually is 

Re: [PATCH RFC 1/8] dcache: show count of hash buckets in sysctl fs.dentry-state

2020-05-08 Thread Konstantin Khlebnikov




On 08/05/2020 22.05, Waiman Long wrote:

On 5/8/20 12:16 PM, Konstantin Khlebnikov wrote:

On 08/05/2020 17.49, Waiman Long wrote:

On 5/8/20 8:23 AM, Konstantin Khlebnikov wrote:

The count of buckets is required for estimating the average length of hash chains.
The size of the hash table depends on memory size and is printed once at boot.

Let's expose nr_buckets as the sixth number in sysctl fs.dentry-state


The hash bucket count is a constant determined at boot time. Is there a need to use up one dentry_stat entry for that? Besides one can 
get it by looking up the kernel dmesg log like:


[    0.055212] Dentry cache hash table entries: 8388608 (order: 14, 67108864 
bytes)


Grepping logs since boot time is the worst API ever.

dentry-state shows the count of dentries in various states.
It's very convenient to show the count of buckets next to it,
because this number defines the overall scale.


I am not against using the last free entry for that. My only concern is that when we want to expose another internal dcache data point via
dentry-state, we will have to add one more number to the array, which can cause all sorts of compatibility problems. So do we want to use the
last free slot for a constant that can be retrieved from somewhere else?


I see no problem in adding more numbers to this sysctl,
especially to one so rarely used.
This interface is designed for that.

Also, the fields 'age_limit' and 'want_pages' have been unused since kernel 2.2.0.
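
For illustration (values are made up), a read of /proc/sys/fs/dentry-state
would then show six numbers, e.g. 103843 92019 45 0 86372 8388608, i.e.
nr_dentry, nr_unused, age_limit, want_pages, nr_negative and nr_buckets.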



Cheers,
Longman



Re: [PATCH RFC 8/8] dcache: prevent flooding with negative dentries

2020-05-08 Thread Konstantin Khlebnikov

On 08/05/2020 17.56, Matthew Wilcox wrote:

On Fri, May 08, 2020 at 03:23:33PM +0300, Konstantin Khlebnikov wrote:

This patch implements a heuristic which detects such scenarios and prevents
unbounded growth of completely unneeded negative dentries. It keeps up to
three of the latest negative dentries in each bucket unless they were referenced.

At the first dput of a negative dentry, when it is swept to the tail of its
siblings, we also clear its reference flag and look at the next dentries in
the chain. Then kill the third in a series of negative, unused and
unreferenced dentries.

This way each hash bucket will preserve three negative dentries to let them
get a reference and survive. Adding a positive or used dentry into a hash
chain also protects a few recent negative dentries. As a result the total
size of the dcache is asymptotically limited by the count of buckets and
the number of positive or used dentries.

This heuristic isn't bulletproof and solves only the most practical case.
It's easy to deceive: just touch the same random name twice.


I'm not sure if that's "easy to deceive" ... My concern with limiting
negative dentries is something like a kernel compilation where there
are many (11 for mm/mmap.c, 9 in general) and there will be a lot of
places where  does not exist

-isystem /usr/lib/gcc/x86_64-linux-gnu/9/include
-I../arch/x86/include
-I./arch/x86/include/generated
-I../include
-I./include
-I../arch/x86/include/uapi
-I./arch/x86/include/generated/uapi
-I../include/uapi
-I./include/generated/uapi
-I ../mm
-I ./mm

So it'd be good to know that kernel compilation times are unaffected by
this patch.



It's very unlikely that this patch changes anything for compilation,
or any other scenario with a sane amount and rate of new names appearing.

This trims only dentries which have never been accessed twice.
Keeping 3 dentries per bucket gives a high chance that all of them get
the reference bit and stay in the cache until the shrinker buries them.

To get a false positive in this heuristic, multiple newly created
negative dentries must hit one bucket in a short period of time,
i.e. at least three hash collisions are required.


Re: [PATCH RFC 1/8] dcache: show count of hash buckets in sysctl fs.dentry-state

2020-05-08 Thread Konstantin Khlebnikov

On 08/05/2020 17.49, Waiman Long wrote:

On 5/8/20 8:23 AM, Konstantin Khlebnikov wrote:

The count of buckets is required for estimating the average length of hash chains.
The size of the hash table depends on memory size and is printed once at boot.

Let's expose nr_buckets as the sixth number in sysctl fs.dentry-state


The hash bucket count is a constant determined at boot time. Is there a need to use up one dentry_stat entry for that? Besides one can get 
it by looking up the kernel dmesg log like:


[    0.055212] Dentry cache hash table entries: 8388608 (order: 14, 67108864 
bytes)


Grepping logs since boot time is the worst API ever.

dentry-state shows the count of dentries in various states.
It's very convenient to show the count of buckets next to it,
because this number defines the overall scale.



Cheers,
Longman



[PATCH] doc: cgroup: update note about conditions when oom killer is invoked

2020-05-08 Thread Konstantin Khlebnikov
Starting from v4.19 commit 29ef680ae7c2 ("memcg, oom: move out_of_memory
back to the charge path") cgroup oom killer is no longer invoked only from
page faults. Now it implements the same semantics as global OOM killer:
allocation context invokes OOM killer and keeps retrying until success.

Signed-off-by: Konstantin Khlebnikov 
---
 Documentation/admin-guide/cgroup-v2.rst |   17 -
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst 
b/Documentation/admin-guide/cgroup-v2.rst
index bcc80269bb6a..1bb9a8f6ebe1 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1172,6 +1172,13 @@ PAGE_SIZE multiple when read back.
Under certain circumstances, the usage may go over the limit
temporarily.
 
+   In the default configuration a regular 0-order allocation always
+   succeeds unless the OOM killer chooses the current task as a victim.
+
+   Some kinds of allocations don't invoke the OOM killer.
+   The caller could retry them differently, return them to userspace
+   as -ENOMEM, or silently ignore them, as in the case of disk readahead.
+
This is the ultimate protection mechanism.  As long as the
high limit is used and monitored properly, this limit's
utility is limited to providing the final safety net.
@@ -1228,17 +1235,9 @@ PAGE_SIZE multiple when read back.
The number of time the cgroup's memory usage was
reached the limit and allocation was about to fail.
 
-   Depending on context result could be invocation of OOM
-   killer and retrying allocation or failing allocation.
-
-   Failed allocation in its turn could be returned into
-   userspace as -ENOMEM or silently ignored in cases like
-   disk readahead.  For now OOM in memory cgroup kills
-   tasks iff shortage has happened inside page fault.
-
This event is not raised if the OOM killer is not
considered as an option, e.g. for failed high-order
-   allocations.
+   allocations or if caller asked to not retry attempts.
 
  oom_kill
The number of processes belonging to this cgroup



[tip: perf/core] perf tools: Simplify checking if SMT is active.

2020-05-08 Thread tip-bot2 for Konstantin Khlebnikov
The following commit has been merged into the perf/core branch of tip:

Commit-ID: bb629484d924118e3b1d8652177040115adcba01
Gitweb:
https://git.kernel.org/tip/bb629484d924118e3b1d8652177040115adcba01
Author:Konstantin Khlebnikov 
AuthorDate:Wed, 29 Apr 2020 19:23:41 +03:00
Committer: Arnaldo Carvalho de Melo 
CommitterDate: Tue, 05 May 2020 16:35:29 -03:00

perf tools: Simplify checking if SMT is active.

SMT can now be disabled via "/sys/devices/system/cpu/smt/control".

The status is shown in "/sys/devices/system/cpu/smt/active" simply as "0" / "1".

If this knob isn't present then fall back to checking the topology as before.

Signed-off-by: Konstantin Khlebnikov 
Cc: Andi Kleen 
Cc: Dmitry Monakhov 
Cc: Jiri Olsa 
Cc: Kan Liang 
Cc: Peter Zijlstra 
Link: 
http://lore.kernel.org/lkml/158817741394.748034.9273604089138009552.stgit@buzz
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/util/smt.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/tools/perf/util/smt.c b/tools/perf/util/smt.c
index 8481842..20bacd5 100644
--- a/tools/perf/util/smt.c
+++ b/tools/perf/util/smt.c
@@ -15,6 +15,9 @@ int smt_on(void)
if (cached)
return cached_result;
 
+   if (sysfs__read_int("devices/system/cpu/smt/active", &cached_result) > 0)
+   goto done;
+
ncpu = sysconf(_SC_NPROCESSORS_CONF);
for (cpu = 0; cpu < ncpu; cpu++) {
unsigned long long siblings;
@@ -42,6 +45,7 @@ int smt_on(void)
}
if (!cached) {
cached_result = 0;
+done:
cached = true;
}
return cached_result;


[tip: perf/core] perf tools: Fix reading new topology attribute "core_cpus"

2020-05-08 Thread tip-bot2 for Konstantin Khlebnikov
The following commit has been merged into the perf/core branch of tip:

Commit-ID: 846de4371fdfddfa49481e3d04884539870dc127
Gitweb:
https://git.kernel.org/tip/846de4371fdfddfa49481e3d04884539870dc127
Author:Konstantin Khlebnikov 
AuthorDate:Wed, 29 Apr 2020 19:19:47 +03:00
Committer: Arnaldo Carvalho de Melo 
CommitterDate: Tue, 05 May 2020 16:35:29 -03:00

perf tools: Fix reading new topology attribute "core_cpus"

Check if access("devices/system/cpu/cpu%d/topology/core_cpus", F_OK)
fails, which will happen unless the current directory is "/sys".

Simply try to read this file first.

Fixes: 0ccdb8407a46 ("perf tools: Apply new CPU topology sysfs attributes")
Signed-off-by: Konstantin Khlebnikov 
Cc: Andi Kleen 
Cc: Dmitry Monakhov 
Cc: Jiri Olsa 
Cc: Kan Liang 
Cc: Peter Zijlstra 
Link: 
http://lore.kernel.org/lkml/158817718710.747528.11009278875028211991.stgit@buzz
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/util/smt.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/smt.c b/tools/perf/util/smt.c
index 3b791ef..8481842 100644
--- a/tools/perf/util/smt.c
+++ b/tools/perf/util/smt.c
@@ -24,13 +24,13 @@ int smt_on(void)
 
snprintf(fn, sizeof fn,
"devices/system/cpu/cpu%d/topology/core_cpus", cpu);
-   if (access(fn, F_OK) == -1) {
+   if (sysfs__read_str(fn, &str, &strlen) < 0) {
snprintf(fn, sizeof fn,

"devices/system/cpu/cpu%d/topology/thread_siblings",
cpu);
+   if (sysfs__read_str(fn, &str, &strlen) < 0)
+   continue;
}
-   if (sysfs__read_str(fn, &str, &strlen) < 0)
-   continue;
/* Entry is hex, but does not have 0x, so need custom parser */
siblings = strtoull(str, NULL, 16);
free(str);


[PATCH RFC 5/8] dcache: add action D_WALK_SKIP_SIBLINGS to d_walk()

2020-05-08 Thread Konstantin Khlebnikov
This lets walkers skip the remaining siblings upon seeing d_is_tail_negative().

Signed-off-by: Konstantin Khlebnikov 
---
 fs/dcache.c |7 +++
 1 file changed, 7 insertions(+)

diff --git a/fs/dcache.c b/fs/dcache.c
index 743255773cc7..44c6832d21d6 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1303,12 +1303,14 @@ EXPORT_SYMBOL(shrink_dcache_sb);
  * @D_WALK_QUIT:   quit walk
  * @D_WALK_NORETRY:quit when retry is needed
  * @D_WALK_SKIP:   skip this dentry and its children
+ * @D_WALK_SKIP_SIBLINGS: skip siblings and their children
  */
 enum d_walk_ret {
D_WALK_CONTINUE,
D_WALK_QUIT,
D_WALK_NORETRY,
D_WALK_SKIP,
+   D_WALK_SKIP_SIBLINGS,
 };
 
 /**
@@ -1339,6 +1341,7 @@ static void d_walk(struct dentry *parent, void *data,
break;
case D_WALK_QUIT:
case D_WALK_SKIP:
+   case D_WALK_SKIP_SIBLINGS:
goto out_unlock;
case D_WALK_NORETRY:
retry = false;
@@ -1370,6 +1373,9 @@ static void d_walk(struct dentry *parent, void *data,
case D_WALK_SKIP:
spin_unlock(&dentry->d_lock);
continue;
+   case D_WALK_SKIP_SIBLINGS:
+   spin_unlock(&dentry->d_lock);
+   goto skip_siblings;
}
 
if (!list_empty(&dentry->d_subdirs)) {
@@ -1381,6 +1387,7 @@ static void d_walk(struct dentry *parent, void *data,
}
spin_unlock(&dentry->d_lock);
}
+skip_siblings:
/*
 * All done at this level ... ascend and resume the search.
 */



[PATCH RFC 7/8] dcache: push releasing dentry lock into sweep_negative

2020-05-08 Thread Konstantin Khlebnikov
This is preparation for the next patch.

Signed-off-by: Konstantin Khlebnikov 
---
 fs/dcache.c |   12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 0fd2e02e507b..60158065891e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -636,15 +636,17 @@ static inline struct dentry *lock_parent(struct dentry 
*dentry)
  * Move cached negative dentry to the tail of parent->d_subdirs.
 * This lets walkers skip them altogether at first sight.
  * Must be called at dput of negative dentry.
+ * dentry->d_lock must be held, returns with it unlocked.
  */
 static void sweep_negative(struct dentry *dentry)
+   __releases(dentry->d_lock)
 {
struct dentry *parent;
 
if (!d_is_tail_negative(dentry)) {
parent = lock_parent(dentry);
if (!parent)
-   return;
+   goto out;
 
if (!d_count(dentry) && d_is_negative(dentry) &&
!d_is_tail_negative(dentry)) {
@@ -654,6 +656,8 @@ static void sweep_negative(struct dentry *dentry)
 
spin_unlock(&parent->d_lock);
}
+out:
+   spin_unlock(&dentry->d_lock);
 }
 
 /*
@@ -747,7 +751,8 @@ static struct dentry *dentry_kill(struct dentry *dentry)
spin_unlock(&parent->d_lock);
if (d_is_negative(dentry))
sweep_negative(dentry);
-   spin_unlock(&dentry->d_lock);
+   else
+   spin_unlock(&dentry->d_lock);
return NULL;
 }
 
@@ -905,7 +910,8 @@ void dput(struct dentry *dentry)
if (likely(retain_dentry(dentry))) {
if (d_is_negative(dentry))
sweep_negative(dentry);
-   spin_unlock(&dentry->d_lock);
+   else
+   spin_unlock(&dentry->d_lock);
return;
}
 



[PATCH RFC 6/8] dcache: stop walking siblings if remaining dentries all negative

2020-05-08 Thread Konstantin Khlebnikov
Most walkers are interested only in positive dentries.

The changes in the simple_* libfs helpers are mostly cosmetic: libfs shouldn't
cache negative dentries unless it uses a d_delete other than always_delete_dentry().

Signed-off-by: Konstantin Khlebnikov 
---
 fs/dcache.c |   10 ++
 fs/libfs.c  |   10 +-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 44c6832d21d6..0fd2e02e507b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1442,6 +1442,9 @@ static enum d_walk_ret path_check_mount(void *data, 
struct dentry *dentry)
struct check_mount *info = data;
struct path path = { .mnt = info->mnt, .dentry = dentry };
 
+   if (d_is_tail_negative(dentry))
+   return D_WALK_SKIP_SIBLINGS;
+
if (likely(!d_mountpoint(dentry)))
return D_WALK_CONTINUE;
if (__path_is_mountpoint(&path)) {
@@ -1688,6 +1691,10 @@ void shrink_dcache_for_umount(struct super_block *sb)
 static enum d_walk_ret find_submount(void *_data, struct dentry *dentry)
 {
struct dentry **victim = _data;
+
+   if (d_is_tail_negative(dentry))
+   return D_WALK_SKIP_SIBLINGS;
+
if (d_mountpoint(dentry)) {
__dget_dlock(dentry);
*victim = dentry;
@@ -3159,6 +3166,9 @@ static enum d_walk_ret d_genocide_kill(void *data, struct 
dentry *dentry)
 {
struct dentry *root = data;
if (dentry != root) {
+   if (d_is_tail_negative(dentry))
+   return D_WALK_SKIP_SIBLINGS;
+
if (d_unhashed(dentry) || !dentry->d_inode)
return D_WALK_SKIP;
 
diff --git a/fs/libfs.c b/fs/libfs.c
index 3759fbacf522..de944c241cf0 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -106,6 +106,10 @@ static struct dentry *scan_positives(struct dentry *cursor,
spin_lock(&dentry->d_lock);
while ((p = p->next) != &dentry->d_subdirs) {
struct dentry *d = list_entry(p, struct dentry, d_child);
+
+   if (d_is_tail_negative(d))
+   break;
+
// we must at least skip cursors, to avoid livelocks
if (d->d_flags & DCACHE_DENTRY_CURSOR)
continue;
@@ -255,7 +259,8 @@ static struct dentry *find_next_child(struct dentry 
*parent, struct dentry *prev
spin_unlock(&d->d_lock);
if (likely(child))
break;
-   }
+   } else if (d_is_tail_negative(d))
+   break;
}
spin_unlock(&parent->d_lock);
dput(prev);
@@ -408,6 +413,9 @@ int simple_empty(struct dentry *dentry)
 
spin_lock(&dentry->d_lock);
list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+   if (d_is_tail_negative(child))
+   break;
+
spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
if (simple_positive(child)) {
spin_unlock(&child->d_lock);



[PATCH RFC 4/8] fsnotify: stop walking child dentries if remaining tail is negative

2020-05-08 Thread Konstantin Khlebnikov
When notification starts/stops listening for events from an inode's children
it has to update dentry->d_flags of all positive child dentries. Scanning
may take a long time if the directory has a lot of negative child dentries.

This is the main beneficiary of sweeping cached negative dentries to the end.

Before patch:

nr_dentry =   24172597  24.2M
nr_buckets =   8388608  2.9 avg
nr_unused =   24158110  99.9%
nr_negative = 24142810  99.9%

inotify time: 0.507182 seconds

After patch:

nr_dentry =   24562747  24.6M
nr_buckets =   8388608  2.9 avg
nr_unused =   24548714  99.9%
nr_negative = 24543867  99.9%

inotify time: 0.10 seconds

Negative dentries no longer slow down the inotify op on the parent directory.

Signed-off-by: Konstantin Khlebnikov 
---
 fs/notify/fsnotify.c |6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 72d332ce8e12..072974302950 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -127,8 +127,12 @@ void __fsnotify_update_child_dentry_flags(struct inode 
*inode)
 * original inode) */
spin_lock(&alias->d_lock);
list_for_each_entry(child, &alias->d_subdirs, d_child) {
-   if (!child->d_inode)
+   if (!child->d_inode) {
+   /* all remaining children are negative */
+   if (d_is_tail_negative(child))
+   break;
continue;
+   }
 
spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
if (watched)



[PATCH RFC 1/8] dcache: show count of hash buckets in sysctl fs.dentry-state

2020-05-08 Thread Konstantin Khlebnikov
The count of buckets is required for estimating the average length of hash chains.
The size of the hash table depends on memory size and is printed once at boot.

Let's expose nr_buckets as the sixth number in sysctl fs.dentry-state
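
For example, the inotify benchmark elsewhere in this series shows about 24.2M
dentries hashed into 8388608 buckets, i.e. an average chain length of roughly
24.2M / 8.4M, or about 2.9 entries.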

Signed-off-by: Konstantin Khlebnikov 
---
 Documentation/admin-guide/sysctl/fs.rst |   12 ++--
 fs/dcache.c |   12 ++--
 include/linux/dcache.h  |2 +-
 3 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/Documentation/admin-guide/sysctl/fs.rst 
b/Documentation/admin-guide/sysctl/fs.rst
index 2a45119e3331..b74df4714ddd 100644
--- a/Documentation/admin-guide/sysctl/fs.rst
+++ b/Documentation/admin-guide/sysctl/fs.rst
@@ -66,12 +66,12 @@ dentry-state
 From linux/include/linux/dcache.h::
 
   struct dentry_stat_t dentry_stat {
-int nr_dentry;
-int nr_unused;
-int age_limit; /* age in seconds */
-int want_pages;/* pages requested by system */
-int nr_negative;   /* # of unused negative dentries */
-int dummy; /* Reserved for future use */
+long nr_dentry;
+long nr_unused;
+long age_limit; /* age in seconds */
+long want_pages;/* pages requested by system */
+long nr_negative;   /* # of unused negative dentries */
+long nr_buckets;/* count of dcache hash buckets */
   };
 
 Dentries are dynamically allocated and deallocated.
diff --git a/fs/dcache.c b/fs/dcache.c
index b280e07e162b..386f97eaf2ff 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3139,6 +3139,14 @@ static int __init set_dhash_entries(char *str)
 }
 __setup("dhash_entries=", set_dhash_entries);
 
+static void __init dcache_init_hash(void)
+{
+   dentry_stat.nr_buckets = 1l << d_hash_shift;
+
+   /* shift to use higher bits of 32 bit hash value */
+   d_hash_shift = 32 - d_hash_shift;
+}
+
 static void __init dcache_init_early(void)
 {
/* If hashes are distributed across NUMA nodes, defer
@@ -3157,7 +3165,7 @@ static void __init dcache_init_early(void)
NULL,
0,
0);
-   d_hash_shift = 32 - d_hash_shift;
+   dcache_init_hash();
 }
 
 static void __init dcache_init(void)
@@ -3185,7 +3193,7 @@ static void __init dcache_init(void)
NULL,
0,
0);
-   d_hash_shift = 32 - d_hash_shift;
+   dcache_init_hash();
 }
 
 /* SLAB cache for __getname() consumers */
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index c1488cc84fd9..082b55068e4d 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -65,7 +65,7 @@ struct dentry_stat_t {
long age_limit; /* age in seconds */
long want_pages;/* pages requested by system */
long nr_negative;   /* # of unused negative dentries */
-   long dummy; /* Reserved for future use */
+   long nr_buckets;/* count of dcache hash buckets */
 };
 extern struct dentry_stat_t dentry_stat;
 



[PATCH RFC 2/8] selftests: add stress testing tool for dcache

2020-05-08 Thread Konstantin Khlebnikov
This tool fills the dcache with negative dentries. Between iterations it prints
statistics and measures the time of an inotify operation, which may degrade.
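
As background for what the tool below does at its core: every lookup of a name
that does not exist leaves a negative dentry behind. A stripped-down sketch of
that inner loop (hypothetical, the real dcache_stress.c below has many more
options) could be:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char name[64];
	long i;

	/* each failed access() caches one negative dentry */
	for (i = 0; i < 1000000; i++) {
		snprintf(name, sizeof(name), "no-such-file-%ld", i);
		access(name, F_OK);
	}
	return 0;
}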

Signed-off-by: Konstantin Khlebnikov 
---
 tools/testing/selftests/filesystems/Makefile   |1 
 .../testing/selftests/filesystems/dcache_stress.c  |  210 
 2 files changed, 211 insertions(+)
 create mode 100644 tools/testing/selftests/filesystems/dcache_stress.c

diff --git a/tools/testing/selftests/filesystems/Makefile 
b/tools/testing/selftests/filesystems/Makefile
index 129880fb42d3..6b5e08617d11 100644
--- a/tools/testing/selftests/filesystems/Makefile
+++ b/tools/testing/selftests/filesystems/Makefile
@@ -3,5 +3,6 @@
 CFLAGS += -I../../../../usr/include/
 TEST_GEN_PROGS := devpts_pts
 TEST_GEN_PROGS_EXTENDED := dnotify_test
+TEST_GEN_FILES += dcache_stress
 
 include ../lib.mk
diff --git a/tools/testing/selftests/filesystems/dcache_stress.c 
b/tools/testing/selftests/filesystems/dcache_stress.c
new file mode 100644
index ..770e8876629e
--- /dev/null
+++ b/tools/testing/selftests/filesystems/dcache_stress.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+double now(void)
+{
+   struct timespec ts;
+
+   clock_gettime(CLOCK_MONOTONIC, &ts);
+   return ts.tv_sec + ts.tv_nsec * 1e-9;
+}
+
+struct dentry_stat {
+   long nr_dentry;
+   long nr_unused;
+   long age_limit; /* age in seconds */
+   long want_pages;/* pages requested by system */
+   long nr_negative;   /* # of unused negative dentries */
+   long nr_buckets;/* count of dcache hash buckets */
+};
+
+void show_dentry_state(void)
+{
+   struct dentry_stat stat;
+   ssize_t len;
+   FILE *f;
+
+   f = fopen("/proc/sys/fs/dentry-state", "r");
+   if (!f)
+   err(2, "open fs.dentry-state");
+
+   if (fscanf(f, "%ld %ld %ld %ld %ld %ld",
+  &stat.nr_dentry,
+  &stat.nr_unused,
+  &stat.age_limit,
+  &stat.want_pages,
+  &stat.nr_negative,
+  &stat.nr_buckets) != 6)
+   err(2, "read fs.dentry-state");
+   fclose(f);
+
+   if (!stat.nr_buckets)
+   stat.nr_buckets = 1 << 20;  // for 8Gb ram
+
+   printf("nr_dentry = %ld\t%.1fM\n", stat.nr_dentry, stat.nr_dentry / 
1e6);
+   printf("nr_buckets = %ld\t%.1f avg\n", stat.nr_buckets, 
(double)stat.nr_dentry / stat.nr_buckets);
+   printf("nr_unused = %ld\t%.1f%%\n", stat.nr_unused, stat.nr_unused * 
100. / stat.nr_dentry);
+   printf("nr_negative = %ld\t%.1f%%\n\n", stat.nr_negative, 
stat.nr_negative * 100. / stat.nr_dentry);
+}
+
+void test_inotify(const char *path)
+{
+   double tm;
+   int fd;
+
+   fd = inotify_init1(0);
+
+   tm = now();
+   inotify_add_watch(fd, path, IN_OPEN);
+   tm = now() - tm;
+
+   printf("inotify time: %f seconds\n\n", tm);
+
+   close(fd);
+}
+
+int main(int argc, char **argv)
+{
+   char dir_name[] = "dcache_stress.XX";
+   char name[4096];
+   char *suffix = name;
+   int nr_iterations = 10;
+   int nr_names = 1 << 20;
+   int iteration, index;
+   int other_dir = -1;
+   int mknod_unlink = 0;
+   int mkdir_chdir = 0;
+   int second_access = 0;
+   long long total_names = 0;
+   double tm;
+   int opt;
+
+   while ((opt = getopt(argc, argv, "i:n:p:o:usdh")) != -1) {
+   switch (opt) {
+   case 'i':
+   nr_iterations = atoi(optarg);
+   break;
+   case 'n':
+   nr_names = atoi(optarg);
+   break;
+   case 'p':
+   strcpy(suffix, optarg);
+   suffix += strlen(suffix);
+   break;
+   case 'o':
+   other_dir = open(optarg, O_RDONLY | O_DIRECTORY);
+   if (other_dir < 0)
+   err(2, "open %s", optarg);
+   break;
+   case 'u':
+   mknod_unlink = 1;
+   break;
+   case 'd':
+   mkdir_chdir = 1;
+   break;
+   case 's':
+   second_access = 1;
+   break;
+   case '?':
+   case 'h':
+   printf("usage: %s [-i ] [-n ] [-p 
] [-o ] [-u] [-s]\n"
+  "  -i  test iterations, default %d\n"
+  "  -n  names at each iterations, default %d\n"
+  "  -p  prefix fo

[PATCH RFC 3/8] dcache: sweep cached negative dentries to the end of list of siblings

2020-05-08 Thread Konstantin Khlebnikov
For disk filesystems the result of every negative lookup is cached, and the
content of directories is usually cached too. Production of negative dentries
isn't limited by disk speed: it's really easy to generate millions of them if
the system has enough memory. Negative dentries are linked into the siblings
list along with normal positive dentries. Some operations walk the dcache tree
but look only for positive dentries; the most important is fsnotify/inotify.

This patch moves negative dentries to the end of the list at final dput() and
marks them with a flag which tells that all following dentries are negative too.
The reverse operation is required before instantiating a negative dentry.
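
The include/linux/dcache.h hunk is cut off in this excerpt; presumably the new
flag and helper look roughly like the sketch below (reconstructed from the
changelog, the exact bit value is a guess):

/* all following siblings in parent->d_subdirs are negative too */
#define DCACHE_TAIL_NEGATIVE	0x08000000	/* value is illustrative only */

static inline bool d_is_tail_negative(const struct dentry *dentry)
{
	return dentry->d_flags & DCACHE_TAIL_NEGATIVE;
}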

Signed-off-by: Konstantin Khlebnikov 
---
 fs/dcache.c|   63 ++--
 include/linux/dcache.h |6 +
 2 files changed, 66 insertions(+), 3 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 386f97eaf2ff..743255773cc7 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -632,6 +632,48 @@ static inline struct dentry *lock_parent(struct dentry 
*dentry)
return __lock_parent(dentry);
 }
 
+/*
+ * Move cached negative dentry to the tail of parent->d_subdirs.
+ * This lets walkers skip them all together at first sight.
+ * Must be called at dput of negative dentry.
+ */
+static void sweep_negative(struct dentry *dentry)
+{
+   struct dentry *parent;
+
+   if (!d_is_tail_negative(dentry)) {
+   parent = lock_parent(dentry);
+   if (!parent)
+   return;
+
+   if (!d_count(dentry) && d_is_negative(dentry) &&
+   !d_is_tail_negative(dentry)) {
+   dentry->d_flags |= DCACHE_TAIL_NEGATIVE;
+   list_move_tail(&dentry->d_child, &parent->d_subdirs);
+   }
+
+   spin_unlock(&parent->d_lock);
+   }
+}
+
+/*
+ * Undo sweep_negative() and move to the head of parent->d_subdirs.
+ * Must be called before converting negative dentry into positive.
+ */
+static void recycle_negative(struct dentry *dentry)
+{
+   struct dentry *parent;
+
+   spin_lock(&dentry->d_lock);
+   parent = lock_parent(dentry);
+   if (parent) {
+   list_move(&dentry->d_child, &parent->d_subdirs);
+   spin_unlock(&parent->d_lock);
+   }
+   dentry->d_flags &= ~DCACHE_TAIL_NEGATIVE;
+   spin_unlock(&dentry->d_lock);
+}
+
 static inline bool retain_dentry(struct dentry *dentry)
 {
WARN_ON(d_in_lookup(dentry));
@@ -703,6 +745,8 @@ static struct dentry *dentry_kill(struct dentry *dentry)
spin_unlock(&inode->i_lock);
if (parent)
spin_unlock(&parent->d_lock);
+   if (d_is_negative(dentry))
+   sweep_negative(dentry);
spin_unlock(&dentry->d_lock);
return NULL;
 }
@@ -718,7 +762,7 @@ static struct dentry *dentry_kill(struct dentry *dentry)
 static inline bool fast_dput(struct dentry *dentry)
 {
int ret;
-   unsigned int d_flags;
+   unsigned int d_flags, required;
 
/*
 * If we have a d_op->d_delete() operation, we sould not
@@ -766,6 +810,8 @@ static inline bool fast_dput(struct dentry *dentry)
 * a 'delete' op, and it's referenced and already on
 * the LRU list.
 *
+* Cached negative dentry must be swept to the tail.
+*
 * NOTE! Since we aren't locked, these values are
 * not "stable". However, it is sufficient that at
 * some point after we dropped the reference the
@@ -777,10 +823,15 @@ static inline bool fast_dput(struct dentry *dentry)
 */
smp_rmb();
d_flags = READ_ONCE(dentry->d_flags);
-   d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_DISCONNECTED;
+
+   required = DCACHE_REFERENCED | DCACHE_LRU_LIST |
+  (d_flags_negative(d_flags) ? DCACHE_TAIL_NEGATIVE : 0);
+
+   d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST |
+  DCACHE_DISCONNECTED | DCACHE_TAIL_NEGATIVE;
 
/* Nothing to do? Dropping the reference was all we needed? */
-   if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && 
!d_unhashed(dentry))
+   if (d_flags == required && !d_unhashed(dentry))
return true;
 
/*
@@ -852,6 +903,8 @@ void dput(struct dentry *dentry)
rcu_read_unlock();
 
if (likely(retain_dentry(dentry))) {
+   if (d_is_negative(dentry))
+   sweep_negative(dentry);
spin_unlock(&dentry->d_lock);
return;
}
@@ -1951,6 +2004,8 @@ void d_instantiate(struct dentry *entry, struct inode * 
inode)
 {
BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
if (inode) {
+   if (d_is_tail_negative(entry))
+   recycle_negative(entry);
security_d_instantiate(entry, inode);

[PATCH RFC 0/8] dcache: increase poison resistance

2020-05-08 Thread Konstantin Khlebnikov
For most filesystems the result of every negative lookup is cached, and the
content of directories is usually cached too. Production of negative dentries
isn't limited by disk speed: it's really easy to generate millions of them if
the system has enough memory.

Getting this memory back isn't that easy, because slab frees pages only when
all related objects are gone, while the dcache shrinker works in LRU order.

Typical scenario is an idle system where some process periodically creates
temporary files and removes them. After some time, memory will be filled
with negative dentries for these random file names.

Simple lookup of random names also generates negative dentries very fast.
A constant flow of such negative dentries drains all other inactive caches.

Negative dentries are linked into the siblings list along with normal positive
dentries. Some operations walk the dcache tree but look only for positive
dentries: the most important is fsnotify/inotify. Hordes of negative dentries
slow down these operations significantly.

Time of dentry lookup is usually unaffected because the hash table grows along
with the size of memory, unless somebody especially crafts hash collisions.

This patch set solves all of these problems:

Move negative dentries to the end of the siblings list, so walkers can
skip them at first sight (patches 3-6).

Keep in the dcache at most three unreferenced negative dentries in a row in each
hash bucket (patches 7-8).

---

Konstantin Khlebnikov (8):
  dcache: show count of hash buckets in sysctl fs.dentry-state
  selftests: add stress testing tool for dcache
  dcache: sweep cached negative dentries to the end of list of siblings
  fsnotify: stop walking child dentries if remaining tail is negative
  dcache: add action D_WALK_SKIP_SIBLINGS to d_walk()
  dcache: stop walking siblings if remaining dentries all negative
  dcache: push releasing dentry lock into sweep_negative
  dcache: prevent flooding with negative dentries


 fs/dcache.c   | 144 +++-
 fs/libfs.c|  10 +-
 fs/notify/fsnotify.c  |   6 +-
 include/linux/dcache.h|   6 +
 tools/testing/selftests/filesystems/Makefile  |   1 +
 .../selftests/filesystems/dcache_stress.c | 210 ++
 6 files changed, 370 insertions(+), 7 deletions(-)
 create mode 100644 tools/testing/selftests/filesystems/dcache_stress.c

--
Signature


[PATCH RFC 8/8] dcache: prevent flooding with negative dentries

2020-05-08 Thread Konstantin Khlebnikov
Without memory pressure the count of negative dentries isn't bounded.
They could consume all memory and drain all other inactive caches.

Typical scenario is an idle system where some process periodically creates
temporary files and removes them. After some time, memory will be filled
with negative dentries for these random file names. Reclaiming them takes
some time because slab frees pages only when all related objects are gone.
Time of dentry lookup is usually unaffected because the hash table grows along
with the size of memory, unless somebody especially crafts hash collisions.
Simple lookup of random names also generates negative dentries very fast.

This patch implements a heuristic which detects such scenarios and prevents
unbounded growth of completely unneeded negative dentries. It keeps up to
three of the latest negative dentries in each bucket unless they were referenced.

At the first dput of a negative dentry, when it is swept to the tail of its
siblings, we'll also clear its reference flag and look at the next dentries in
the hash chain. Then kill the third in a series of negative, unused and
unreferenced dentries.

This way each hash bucket will preserve three negative dentries to let them
get referenced and survive. Adding a positive or used dentry into the hash chain
also protects a few recent negative dentries. As a result the total size of the
dcache is asymptotically limited by the count of buckets and of positive or
used dentries.

Before patch: tool 'dcache_stress' could fill entire memory with dentries.

nr_dentry = 104913261   104.9M
nr_buckets = 8388608    12.5 avg
nr_unused = 104898729   100.0%
nr_negative = 104883218 100.0%

After this patch count of dentries saturates at around 3 per bucket:

nr_dentry = 24619259    24.6M
nr_buckets = 8388608    2.9 avg
nr_unused = 24605226    99.9%
nr_negative = 24600351  99.9%
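
(Rough arithmetic to illustrate the bound: about three negative dentries per
bucket gives 3 * 8388608 ~= 25.2M, which is in line with the ~24.6M saturation
point above once positive and in-use dentries are taken into account.)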

This heuristic isn't bulletproof and solves only the most practical case.
It's easy to deceive: just touch the same random name twice.

Signed-off-by: Konstantin Khlebnikov 
---
 fs/dcache.c |   54 ++
 1 file changed, 54 insertions(+)

diff --git a/fs/dcache.c b/fs/dcache.c
index 60158065891e..9f3d331b4978 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -632,6 +632,58 @@ static inline struct dentry *lock_parent(struct dentry 
*dentry)
return __lock_parent(dentry);
 }
 
+/*
+ * Called at first dput of each negative dentry.
+ * Prevents filling cache with never reused negative dentries.
+ *
+ * This clears reference and then looks at following dentries in hash chain.
+ * If they are negative, unused and unreferenced then keep two and kill third.
+ */
+static void trim_negative(struct dentry *dentry)
+   __releases(dentry->d_lock)
+{
+   struct dentry *victim, *parent;
+   struct hlist_bl_node *next;
+   int keep = 2;
+
+   rcu_read_lock();
+
+   dentry->d_flags &= ~DCACHE_REFERENCED;
+   spin_unlock(&dentry->d_lock);
+
+   next = rcu_dereference_raw(dentry->d_hash.next);
+   while (1) {
+   victim = hlist_bl_entry(next, struct dentry, d_hash);
+
+   if (!next || d_count(victim) || !d_is_negative(victim) ||
+   (victim->d_flags & DCACHE_REFERENCED)) {
+   rcu_read_unlock();
+   return;
+   }
+
+   if (!keep--)
+   break;
+
+   next = rcu_dereference_raw(next->next);
+   }
+
+   spin_lock(&victim->d_lock);
+   parent = lock_parent(victim);
+
+   rcu_read_unlock();
+
+   if (d_count(victim) || !d_is_negative(victim) ||
+   (victim->d_flags & DCACHE_REFERENCED)) {
+   if (parent)
+   spin_unlock(&parent->d_lock);
+   spin_unlock(&victim->d_lock);
+   return;
+   }
+
+   __dentry_kill(victim);
+   dput(parent);
+}
+
 /*
  * Move cached negative dentry to the tail of parent->d_subdirs.
  * This lets walkers skip them all together at first sight.
@@ -655,6 +707,8 @@ static void sweep_negative(struct dentry *dentry)
}
 
spin_unlock(&parent->d_lock);
+
+   return trim_negative(dentry);
}
 out:
spin_unlock(&dentry->d_lock);



Re: [PATCH RFC tip/core/rcu] Add shrinker to shift to fast/inefficient GP mode

2020-05-08 Thread Konstantin Khlebnikov

On 07/05/2020 22.09, Paul E. McKenney wrote:

On Thu, May 07, 2020 at 02:31:02PM -0400, Johannes Weiner wrote:

On Thu, May 07, 2020 at 10:09:03AM -0700, Paul E. McKenney wrote:

On Thu, May 07, 2020 at 01:00:06PM -0400, Johannes Weiner wrote:

On Wed, May 06, 2020 at 05:55:35PM -0700, Andrew Morton wrote:

On Wed, 6 May 2020 17:42:40 -0700 "Paul E. McKenney"  wrote:


This commit adds a shrinker so as to inform RCU when memory is scarce.
RCU responds by shifting into the same fast and inefficient mode that is
used in the presence of excessive numbers of RCU callbacks.  RCU remains
in this state for one-tenth of a second, though this time window can be
extended by another call to the shrinker.


We may be able to use shrinkers here, but merely being invoked does
not carry a reliable distress signal.

Shrinkers get invoked whenever vmscan runs. It's a useful indicator
for when to age an auxiliary LRU list - test references, clear and
rotate or reclaim stale entries. The urgency, and what can and cannot
be considered "stale", is encoded in the callback frequency and scan
counts, and meant to be relative to the VM's own rate of aging: "I've
tested X percent of mine for recent use, now you go and test the same
share of your pool." It doesn't translate well to other
interpretations of the callbacks, although people have tried.


Would it make sense for RCU to interpret two invocations within (say)
100ms of each other as indicating urgency?  (Hey, I had to ask!)


It's the perfect number for one combination of CPU, storage device,
and shrinker implementation :-)


Woo-hoo!!!

But is that one combination actually in use anywhere?  ;-)


If it proves feasible, a later commit might add a function call directly
indicating the end of the period of scarce memory.


(Cc David Chinner, who often has opinions on shrinkers ;))

It's a bit abusive of the intent of the slab shrinkers, but I don't
immediately see a problem with it.  Always returning 0 from
->scan_objects might cause a problem in some situations(?).

Perhaps we should have a formal "system getting low on memory, please
do something" notification API.
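
For reference, a minimal sketch of the shrinker interface being discussed here
(not Paul's actual patch): the callbacks are used purely as a "reclaim is
running" hint. Returning 0 from ->count_objects means the core never calls
->scan_objects at all, which is close to the "always returning 0" concern
raised above.

#include <linux/shrinker.h>
#include <linux/jiffies.h>

static unsigned long pressure_seen_jiffies;

static unsigned long hint_count(struct shrinker *s, struct shrink_control *sc)
{
	pressure_seen_jiffies = jiffies;	/* note that reclaim is active */
	return 0;				/* nothing for the core to scan */
}

static unsigned long hint_scan(struct shrinker *s, struct shrink_control *sc)
{
	return SHRINK_STOP;
}

static struct shrinker hint_shrinker = {
	.count_objects	= hint_count,
	.scan_objects	= hint_scan,
	.seeks		= DEFAULT_SEEKS,
};

/* registered once at init time: register_shrinker(&hint_shrinker); */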


It's tricky to find a useful definition of what low on memory
means. In the past we've used sc->priority cutoffs, the vmpressure
interface (reclaimed/scanned - reclaim efficiency cutoffs), oom
notifiers (another reclaim efficiency cutoff). But none of these
reliably capture "distress", and they vary highly between different
hardware setups. It can be hard to trigger OOM itself on fast IO
devices, even when the machine is way past useful (where useful is
somewhat subjective to the user). Userspace OOM implementations that
consider userspace health (also subjective) are getting more common.


How significant is this?  How much memory can RCU consume?


I think if rcu can end up consuming a significant share of memory, one
way that may work would be to do proper shrinker integration and track
the age of its objects relative to the age of other allocations in the
system. I.e. toss them all on a clock list with "new" bits and shrink
them at VM velocity. If the shrinker sees objects with new bit set,
clear and rotate. If it sees objects without them, we know rcu_heads
outlive cache pages etc. and should probably cycle faster too.


It would be easy for RCU to pass back (or otherwise use) the age of the
current grace period, if that would help.

Tracking the age of individual callbacks is out of the question due to
memory overhead, but RCU could approximate this via statistical sampling.
Comparing this to grace-period durations could give information as to
whether making grace periods go faster would be helpful.


That makes sense.

So RCU knows the time and the VM knows the amount of memory. Either
RCU needs to figure out its memory component to be able to translate
shrinker input to age, or the VM needs to learn about time to be able
to say: I'm currently scanning memory older than timestamp X.

The latter would also require sampling in the VM. Nose goes. :-)


Sounds about right.  ;-)

Does reclaim have any notion of having continuously scanned for
longer than some amount of time?  Or could RCU reasonably deduce this?
For example, if RCU noticed that reclaim had been scanning for longer than
(say) five grace periods, RCU might decide to speed things up.

But on the other hand, with slow disks, reclaim might go on for tens of
seconds even without much in the way of memory pressure, mightn't it?

I suppose that another indicator would be recent NULL returns from
allocators.  But that indicator flashes a bit later than one would like,
doesn't it?  And has false positives when allocators are invoked from
atomic contexts, no doubt.  And no doubt similar for sleeping more than
a certain length of time in an allocator.


There actually is prior art for teaching reclaim about time:
https://lore.kernel.org/linux-mm/20130430110214.22179.26139.stgit@zurg/

CCing Konstantin. I'm curious how widely this ended up being used 

Re: [PATCH] slub: limit count of partial slabs scanned to gather statistics

2020-05-06 Thread Konstantin Khlebnikov

On 06/05/2020 14.56, Vlastimil Babka wrote:

On 5/4/20 6:07 PM, Konstantin Khlebnikov wrote:

To get exact count of free and used objects slub have to scan list of
partial slabs. This may take at long time. Scanning holds spinlock and
blocks allocations which move partial slabs to per-cpu lists and back.

Example found in the wild:

# cat /sys/kernel/slab/dentry/partial
14478538 N0=7329569 N1=7148969
# time cat /sys/kernel/slab/dentry/objects
286225471 N0=136967768 N1=149257703

real0m1.722s
user0m0.001s
sys 0m1.721s

The same problem in slab was addressed in commit f728b0a5d72a ("mm, slab:
faster active and free stats") by adding more kmem cache statistics.
For slub same approach requires atomic op on fast path when object frees.


In general yeah, but are you sure about this one? AFAICS this is about pages in
the n->partial list, where manipulations happen under n->list_lock and shouldn't
be fast path. It should be feasible to add a counter under the same lock, so it
wouldn't even need to be atomic?


SLUB allocates objects from prepared per-cpu slabs; they could be subtracted from
the count of free objects under this lock in advance, when a slab moves out of
this list.

But at the freeing path an object might belong to any slab, including global partials.
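
For context, a rough sketch of the counter Vlastimil suggests (the field name
is made up, this is not actual SLUB code): the partial-list manipulation
already runs under n->list_lock, so an object counter could be kept there; the
hard part, as noted above, is the free path, which does not take this lock for
slabs that stay on the list.

/* hypothetical new field in struct kmem_cache_node: unsigned long nr_partial_objs; */

static void add_partial_counted(struct kmem_cache_node *n, struct page *page)
{
	lockdep_assert_held(&n->list_lock);
	list_add_tail(&page->slab_list, &n->partial);
	n->nr_partial++;
	n->nr_partial_objs += page->objects;	/* object slots on this slab */
	/*
	 * A count of *free* objects would additionally have to be updated on
	 * every free that hits a slab already sitting on this list.
	 */
}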




Let's simply limit count of scanned slabs and print warning.
Limit set in /sys/module/slub/parameters/max_partial_to_count.
Default is 1 which should be enough for most sane cases.

Return linear approximation if list of partials is longer than limit.
Nobody should notice difference.

Signed-off-by: Konstantin Khlebnikov 


BTW there was a different patch in that area proposed recently [1] for slabinfo.
Christopher argued that we can do that for slabinfo but leave /sys stats
precise. Guess not then?

[1]
https://lore.kernel.org/linux-mm/20200222092428.99488-1-weny...@linux.alibaba.com/


---
  mm/slub.c |   15 ++-
  1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/mm/slub.c b/mm/slub.c
index 9bf44955c4f1..86a366f7acb6 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2407,16 +2407,29 @@ static inline unsigned long node_nr_objs(struct 
kmem_cache_node *n)
  #endif /* CONFIG_SLUB_DEBUG */
  
  #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)

+
+static unsigned long max_partial_to_count __read_mostly = 1;
+module_param(max_partial_to_count, ulong, 0644);
+
  static unsigned long count_partial(struct kmem_cache_node *n,
int (*get_count)(struct page *))
  {
+   unsigned long counted = 0;
unsigned long flags;
unsigned long x = 0;
struct page *page;
  
  	spin_lock_irqsave(>list_lock, flags);

-   list_for_each_entry(page, >partial, slab_list)
+   list_for_each_entry(page, >partial, slab_list) {
x += get_count(page);
+
+   if (++counted > max_partial_to_count) {
+   pr_warn_once("SLUB: too much partial slabs to count all 
objects, increase max_partial_to_count.\n");
+   /* Approximate total count of objects */
+   x = mult_frac(x, n->nr_partial, counted);
+   break;
+   }
+   }
spin_unlock_irqrestore(>list_lock, flags);
return x;
  }






Re: [PATCH] slub: limit count of partial slabs scanned to gather statistics

2020-05-06 Thread Konstantin Khlebnikov

On 07/05/2020 06.01, Qian Cai wrote:




On May 6, 2020, at 3:06 PM, Qian Cai  wrote:




On May 4, 2020, at 12:07 PM, Konstantin Khlebnikov  
wrote:

To get exact count of free and used objects slub have to scan list of
partial slabs. This may take at long time. Scanning holds spinlock and
blocks allocations which move partial slabs to per-cpu lists and back.

Example found in the wild:

# cat /sys/kernel/slab/dentry/partial
14478538 N0=7329569 N1=7148969
# time cat /sys/kernel/slab/dentry/objects
286225471 N0=136967768 N1=149257703

real0m1.722s
user0m0.001s
sys 0m1.721s

The same problem in slab was addressed in commit f728b0a5d72a ("mm, slab:
faster active and free stats") by adding more kmem cache statistics.
For slub same approach requires atomic op on fast path when object frees.

Let's simply limit count of scanned slabs and print warning.
Limit set in /sys/module/slub/parameters/max_partial_to_count.
Default is 1 which should be enough for most sane cases.

Return linear approximation if list of partials is longer than limit.
Nobody should notice difference.

Signed-off-by: Konstantin Khlebnikov 


This patch will trigger the warning under memory pressure, and then makes
lockdep unhappy. Also, it is almost impossible to tell how large
max_partial_to_count needs to be from a user perspective.


Oops, my bad. Printing under this lock is indeed a bad idea.

Probably it's better to simply remove this message.
I cannot imagine a situation where a precise count of objects matters at such scale.



Andrew, Stephen, can you remove this patch from linux-next?

Even reading some procfs files would trigger the warning and lockdep on a debug
kernel, probably because kmemleak and debugobjects require more partial slab
objects. Thus, it would be problematic to break testing bots on
linux-next like this.



[ 6371.600511] SLUB: too much partial slabs to count all objects, increase 
max_partial_to_count.
[ 6371.601399] irq event stamp: 8132599

[ 6371.611415] ==
[ 6371.611417] WARNING: possible circular locking dependency detected
[ 6371.611419] 5.7.0-rc4-mm1+ #1 Not tainted
[ 6371.611421] --
[ 6371.611423] oom02/43515 is trying to acquire lock:
[ 6371.611425] 893b8980 (console_owner){-.-.}-{0:0}, at: 
console_unlock+0x240/0x750

[ 6371.611433] but task is already holding lock:
[ 6371.611434] 8886456fcb98 (>list_lock){-.-.}-{2:2}, at: 
count_partial+0x29/0xe0

[ 6371.611441] which lock already depends on the new lock.


[ 6371.611445] the existing dependency chain (in reverse order) is:

[ 6371.611446] -> #3 (>list_lock){-.-.}-{2:2}:
[ 6371.611452]_raw_spin_lock+0x2f/0x40
[ 6371.611453]deactivate_slab+0x37a/0x690
[ 6371.611455]___slab_alloc+0x65d/0x810
[ 6371.611456]__slab_alloc+0x43/0x70
[ 6371.611457]__kmalloc+0x2b2/0x430
[ 6371.611459]__tty_buffer_request_room+0x100/0x250
[ 6371.611460]tty_insert_flip_string_fixed_flag+0x67/0x130
[ 6371.611462]pty_write+0xa2/0xf0
[ 6371.611463]n_tty_write+0x36b/0x7c0
[ 6371.611464]tty_write+0x275/0x500
[ 6371.611466]__vfs_write+0x50/0xa0
[ 6371.611467]vfs_write+0x10b/0x290
[ 6371.611468]redirected_tty_write+0x6a/0xc0
[ 6371.611470]do_iter_write+0x253/0x2b0
[ 6371.611471]vfs_writev+0x152/0x1f0
[ 6371.611472]do_writev+0xda/0x180
[ 6371.611474]__x64_sys_writev+0x45/0x50
[ 6371.611475]do_syscall_64+0xcc/0xaf0
[ 6371.611477]entry_SYSCALL_64_after_hwframe+0x49/0xb3

[ 6371.611478] -> #2 (>lock#2){-.-.}-{2:2}:
[ 6371.611484]_raw_spin_lock_irqsave+0x3a/0x50
[ 6371.611486]tty_port_tty_get+0x22/0xa0
[ 6371.611487]tty_port_default_wakeup+0xf/0x30
[ 6371.611489]tty_port_tty_wakeup+0x39/0x40
[ 6371.611490]uart_write_wakeup+0x2a/0x40
[ 6371.611492]serial8250_tx_chars+0x22e/0x410
[ 6371.611493]serial8250_handle_irq.part.21+0x17c/0x180
[ 6371.611495]serial8250_default_handle_irq+0x5c/0x90
[ 6371.611496]serial8250_interrupt+0xa6/0x130
[ 6371.611498]__handle_irq_event_percpu+0x81/0x550
[ 6371.611499]handle_irq_event_percpu+0x70/0x100
[ 6371.611501]handle_irq_event+0x5a/0x8b
[ 6371.611502]handle_edge_irq+0x10c/0x370
[ 6371.611503]do_IRQ+0x9e/0x1d0
[ 6371.611505]ret_from_intr+0x0/0x37
[ 6371.611506]cpuidle_enter_state+0x148/0x910
[ 6371.611507]cpuidle_enter+0x41/0x70
[ 6371.611509]do_idle+0x3cf/0x440
[ 6371.611510]cpu_startup_entry+0x1d/0x1f
[ 6371.611511]start_secondary+0x29a/0x340
[ 6371.611513]secondary_startup_64+0xb6/0xc0

[ 6371.611516] -> #1 (>lock){-.-.}-{2:2}:
[ 6371.611522]_raw_spin_lock_irqsave+0x3a/0x50
[ 6371.611525]serial8250_console_write+0x113/0x560
[ 6371.611527]univ8250_consol

Re: [PATCH] slub: limit count of partial slabs scanned to gather statistics

2020-05-05 Thread Konstantin Khlebnikov

On 05/05/2020 00.19, David Rientjes wrote:

On Mon, 4 May 2020, Konstantin Khlebnikov wrote:


To get exact count of free and used objects slub have to scan list of
partial slabs. This may take at long time. Scanning holds spinlock and
blocks allocations which move partial slabs to per-cpu lists and back.

Example found in the wild:

# cat /sys/kernel/slab/dentry/partial
14478538 N0=7329569 N1=7148969
# time cat /sys/kernel/slab/dentry/objects
286225471 N0=136967768 N1=149257703

real0m1.722s
user0m0.001s
sys 0m1.721s

The same problem in slab was addressed in commit f728b0a5d72a ("mm, slab:
faster active and free stats") by adding more kmem cache statistics.
For slub same approach requires atomic op on fast path when object frees.

Let's simply limit count of scanned slabs and print warning.
Limit set in /sys/module/slub/parameters/max_partial_to_count.
Default is 1 which should be enough for most sane cases.

Return linear approximation if list of partials is longer than limit.
Nobody should notice difference.



Hi Konstantin,

Do you only exhibit this on slub for SO_ALL|SO_OBJECTS?  I notice the
timing in the changelog is only looking at "objects" and not "partial".


"partial" is a count of partial slabs which simply sums per-numa counters.
Affected are only "objects" and "objects_partial", which walk the list.



If so, it seems this is also a problem for get_slabinfo() since it also
uses the count_free() callback for count_partial().


Yep, /proc/slabinfo is also affected.

Actually it's more affected than sysfs - it walks all cgroups while sysfs shows
only the root cache.



Concern would be that the kernel has now drastically changed a statistic
that it exports to userspace.  There was some discussion about this back
in 2016[*] and one idea was that slabinfo would truncate its scanning and
append a '+' to the end of the value to indicate it exceeds the max, i.e.
1+.  I think that '+' actually caused the problem itself for userspace
processes.


Yep, "+" will break everything for sure.
I thought about returning "-1" or INT_MAX without counting,
but the approximation gives an almost correct result without breaking anything.

Each partial slab has at least one used and one free object, thus the approximated
result will be somewhere between nr_partial_slabs and nr_partial_objects.



I think the patch is too far reaching, however, since it impacts all
count_partial() counting and not only for the case cited in the changelog.
Are there examples for things other than the count_free() callback?


Nope, this is just statistics for used/free objects.
The total counts of objects and slabs are tracked precisely.



  [*] https://lore.kernel.org/patchwork/patch/708427/


Signed-off-by: Konstantin Khlebnikov 
---
  mm/slub.c |   15 ++-
  1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/mm/slub.c b/mm/slub.c
index 9bf44955c4f1..86a366f7acb6 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2407,16 +2407,29 @@ static inline unsigned long node_nr_objs(struct 
kmem_cache_node *n)
  #endif /* CONFIG_SLUB_DEBUG */
  
  #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)

+
+static unsigned long max_partial_to_count __read_mostly = 1;
+module_param(max_partial_to_count, ulong, 0644);
+
  static unsigned long count_partial(struct kmem_cache_node *n,
int (*get_count)(struct page *))
  {
+   unsigned long counted = 0;
unsigned long flags;
unsigned long x = 0;
struct page *page;
  
  	spin_lock_irqsave(>list_lock, flags);

-   list_for_each_entry(page, >partial, slab_list)
+   list_for_each_entry(page, >partial, slab_list) {
x += get_count(page);
+
+   if (++counted > max_partial_to_count) {
+   pr_warn_once("SLUB: too much partial slabs to count all 
objects, increase max_partial_to_count.\n");
+   /* Approximate total count of objects */
+   x = mult_frac(x, n->nr_partial, counted);
+   break;
+   }
+   }
spin_unlock_irqrestore(>list_lock, flags);
return x;
  }




Re: [PATCH] slub: limit count of partial slabs scanned to gather statistics

2020-05-04 Thread Konstantin Khlebnikov

On 04/05/2020 22.56, Andrew Morton wrote:

On Mon, 04 May 2020 19:07:39 +0300 Konstantin Khlebnikov 
 wrote:


To get exact count of free and used objects slub have to scan list of
partial slabs. This may take at long time. Scanning holds spinlock and
blocks allocations which move partial slabs to per-cpu lists and back.

Example found in the wild:

# cat /sys/kernel/slab/dentry/partial
14478538 N0=7329569 N1=7148969
# time cat /sys/kernel/slab/dentry/objects
286225471 N0=136967768 N1=149257703

real0m1.722s
user0m0.001s
sys 0m1.721s


I assume this could trigger the softlockup detector or even NMI
watchdog in some situations?


Yes, irqs are disabled here. But the loop itself is pretty fast.
It requires terabytes of RAM to reach common thresholds for watchdogs.




The same problem in slab was addressed in commit f728b0a5d72a ("mm, slab:
faster active and free stats") by adding more kmem cache statistics.
For slub same approach requires atomic op on fast path when object frees.

Let's simply limit count of scanned slabs and print warning.
Limit set in /sys/module/slub/parameters/max_partial_to_count.
Default is 1 which should be enough for most sane cases.

Return linear approximation if list of partials is longer than limit.
Nobody should notice difference.


That's a pretty sad "solution" :(

But I guess it's better than nothing at all, unless there are
alternative ideas?


Running this loop till the end adds more problems than it gives information.
Adding new percpu or atomic counters to fast paths seems redundant even for
debugging.

Actually there is not much sense in accurate statistics for the count of objects
when there are millions of them.

Memory consumption here is defined by the count and size of slabs.




--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2407,16 +2407,29 @@ static inline unsigned long node_nr_objs(struct 
kmem_cache_node *n)
  #endif /* CONFIG_SLUB_DEBUG */
  
  #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)

+
+static unsigned long max_partial_to_count __read_mostly = 1;
+module_param(max_partial_to_count, ulong, 0644);
+
  static unsigned long count_partial(struct kmem_cache_node *n,
int (*get_count)(struct page *))
  {
+   unsigned long counted = 0;
unsigned long flags;
unsigned long x = 0;
struct page *page;
  
  	spin_lock_irqsave(>list_lock, flags);

-   list_for_each_entry(page, >partial, slab_list)
+   list_for_each_entry(page, >partial, slab_list) {
x += get_count(page);
+
+   if (++counted > max_partial_to_count) {
+   pr_warn_once("SLUB: too much partial slabs to count all 
objects, increase max_partial_to_count.\n");
+   /* Approximate total count of objects */
+   x = mult_frac(x, n->nr_partial, counted);
+   break;
+   }
+   }
spin_unlock_irqrestore(>list_lock, flags);
return x;
  }


Re: [PATCH RFC 1/2] fs/iomap/direct-io: pass NOWAIT to bio flags

2020-05-04 Thread Konstantin Khlebnikov

On 04/05/2020 19.00, Christoph Hellwig wrote:

On Mon, May 04, 2020 at 06:54:53PM +0300, Konstantin Khlebnikov wrote:

This is required to avoid waiting in lower layers.

Signed-off-by: Konstantin Khlebnikov 


This looks sensible.  Did you run this through xfstests?



Nope. It seems xfstests has one trivial test for NOWAIT - generic/471
It tests only write with/without extent, nothing about contention.

I've added nowait into fio and played with it a little.
https://github.com/axboe/fio/pull/972

With these patches I see EAGAINs when queue is flooded.


[PATCH] slub: limit count of partial slabs scanned to gather statistics

2020-05-04 Thread Konstantin Khlebnikov
To get an exact count of free and used objects slub has to scan the list of
partial slabs. This may take a long time. Scanning holds a spinlock and
blocks allocations which move partial slabs to per-cpu lists and back.

Example found in the wild:

# cat /sys/kernel/slab/dentry/partial
14478538 N0=7329569 N1=7148969
# time cat /sys/kernel/slab/dentry/objects
286225471 N0=136967768 N1=149257703

real    0m1.722s
user    0m0.001s
sys     0m1.721s

The same problem in slab was addressed in commit f728b0a5d72a ("mm, slab:
faster active and free stats") by adding more kmem cache statistics.
For slub same approach requires atomic op on fast path when object frees.

Let's simply limit the count of scanned slabs and print a warning.
The limit is set in /sys/module/slub/parameters/max_partial_to_count.
Default is 1 which should be enough for most sane cases.

Return a linear approximation if the list of partials is longer than the limit.
Nobody should notice the difference.
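
(For illustration with made-up numbers: if the scan stops after 10,000 partial
slabs that held 150,000 free objects and nr_partial is 14,478,538, mult_frac()
reports 150000 * 14478538 / 10000 ~= 217 million free objects instead of
walking the whole list.)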

Signed-off-by: Konstantin Khlebnikov 
---
 mm/slub.c |   15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/mm/slub.c b/mm/slub.c
index 9bf44955c4f1..86a366f7acb6 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2407,16 +2407,29 @@ static inline unsigned long node_nr_objs(struct 
kmem_cache_node *n)
 #endif /* CONFIG_SLUB_DEBUG */
 
 #if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS)
+
+static unsigned long max_partial_to_count __read_mostly = 1;
+module_param(max_partial_to_count, ulong, 0644);
+
 static unsigned long count_partial(struct kmem_cache_node *n,
int (*get_count)(struct page *))
 {
+   unsigned long counted = 0;
unsigned long flags;
unsigned long x = 0;
struct page *page;
 
	spin_lock_irqsave(&n->list_lock, flags);
-   list_for_each_entry(page, &n->partial, slab_list)
+   list_for_each_entry(page, &n->partial, slab_list) {
x += get_count(page);
+
+   if (++counted > max_partial_to_count) {
+   pr_warn_once("SLUB: too much partial slabs to count all 
objects, increase max_partial_to_count.\n");
+   /* Approximate total count of objects */
+   x = mult_frac(x, n->nr_partial, counted);
+   break;
+   }
+   }
	spin_unlock_irqrestore(&n->list_lock, flags);
return x;
 }



[PATCH RFC 1/2] fs/iomap/direct-io: pass NOWAIT to bio flags

2020-05-04 Thread Konstantin Khlebnikov
This is required to avoid waiting in lower layers.
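
For reference, a minimal userspace sketch (not part of the patch) of how this
path gets exercised end-to-end: an O_DIRECT write issued with RWF_NOWAIT is
expected to fail with EAGAIN instead of blocking once the queue is congested.

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef RWF_NOWAIT
#define RWF_NOWAIT	0x00000008
#endif

int main(int argc, char **argv)
{
	struct iovec iov;
	void *buf;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_WRONLY | O_DIRECT);
	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		return 1;
	memset(buf, 0, 4096);
	iov.iov_base = buf;
	iov.iov_len = 4096;

	/* with a congested queue this should return -1/EAGAIN, not sleep */
	if (pwritev2(fd, &iov, 1, 0, RWF_NOWAIT) < 0)
		perror("pwritev2(RWF_NOWAIT)");

	close(fd);
	return 0;
}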

Signed-off-by: Konstantin Khlebnikov 
---
 fs/iomap/direct-io.c |2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 20dde5aadcdd..9b53fa7651e3 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -63,6 +63,8 @@ static void iomap_dio_submit_bio(struct iomap_dio *dio, 
struct iomap *iomap,
 {
atomic_inc(&dio->ref);
 
+   if (dio->iocb->ki_flags & IOCB_NOWAIT)
+   bio->bi_opf |= REQ_NOWAIT;
if (dio->iocb->ki_flags & IOCB_HIPRI)
bio_set_polled(bio, dio->iocb);
 



[PATCH RFC 2/2] fs/direct-io: pass NOWAIT to also for read requests

2020-05-04 Thread Konstantin Khlebnikov
For some reason NOWAIT is currently passed only for writes.

Signed-off-by: Konstantin Khlebnikov 
Fixes: 03a07c92a9ed ("block: return on congested block device")
---
 fs/direct-io.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 00b4d15bb811..dbb6afef6be9 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1234,11 +1234,11 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode 
*inode,
if (iov_iter_rw(iter) == WRITE) {
dio->op = REQ_OP_WRITE;
dio->op_flags = REQ_SYNC | REQ_IDLE;
-   if (iocb->ki_flags & IOCB_NOWAIT)
-   dio->op_flags |= REQ_NOWAIT;
} else {
dio->op = REQ_OP_READ;
}
+   if (iocb->ki_flags & IOCB_NOWAIT)
+   dio->op_flags |= REQ_NOWAIT;
if (iocb->ki_flags & IOCB_HIPRI)
dio->op_flags |= REQ_HIPRI;
 



[PATCH RFC 2/2] tracing/block: add request operation and flags into trace events

2020-05-04 Thread Konstantin Khlebnikov
It's hard to fit all flags into the 'rwbs' field and even harder to remove
something without breaking compatibility. Let's expose all request flags
using the common trace event methods __print_symbolic and __print_flags.

This adds a 32-bit 'req' field printed as req=OP,FLAGS,... by all events.
The exact constants are not part of the ABI and thus can easily be added or removed.

Keep 'rwbs' for backward compatibility.

Signed-off-by: Konstantin Khlebnikov 
---
 include/trace/events/block.h |  178 +-
 1 file changed, 156 insertions(+), 22 deletions(-)

diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index aac9a5c0e2cc..25fff917a07f 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -11,6 +11,96 @@
 #include 
 
 
+/* Request operations, see enum req_opf */
+
+TRACE_DEFINE_ENUM(REQ_OP_READ);
+TRACE_DEFINE_ENUM(REQ_OP_WRITE);
+TRACE_DEFINE_ENUM(REQ_OP_FLUSH);
+TRACE_DEFINE_ENUM(REQ_OP_DISCARD);
+TRACE_DEFINE_ENUM(REQ_OP_SECURE_ERASE);
+TRACE_DEFINE_ENUM(REQ_OP_ZONE_RESET);
+TRACE_DEFINE_ENUM(REQ_OP_WRITE_SAME);
+TRACE_DEFINE_ENUM(REQ_OP_ZONE_RESET_ALL);
+TRACE_DEFINE_ENUM(REQ_OP_WRITE_ZEROES);
+TRACE_DEFINE_ENUM(REQ_OP_ZONE_OPEN);
+TRACE_DEFINE_ENUM(REQ_OP_ZONE_CLOSE);
+TRACE_DEFINE_ENUM(REQ_OP_ZONE_FINISH);
+TRACE_DEFINE_ENUM(REQ_OP_SCSI_IN);
+TRACE_DEFINE_ENUM(REQ_OP_SCSI_OUT);
+TRACE_DEFINE_ENUM(REQ_OP_DRV_IN);
+TRACE_DEFINE_ENUM(REQ_OP_DRV_OUT);
+
+#define BLOCK_REQ_OP_STRINGS   \
+   { REQ_OP_READ,  "READ" },   \
+   { REQ_OP_WRITE, "WRITE" },  \
+   { REQ_OP_FLUSH, "FLUSH" },  \
+   { REQ_OP_DISCARD,   "DISCARD" },\
+   { REQ_OP_SECURE_ERASE,  "SECURE_ERASE" },   \
+   { REQ_OP_ZONE_RESET,"ZONE_RESET" }, \
+   { REQ_OP_WRITE_SAME,"WRITE_SAME" }, \
+   { REQ_OP_ZONE_RESET_ALL,"ZONE_RESET_ALL" }, \
+   { REQ_OP_WRITE_ZEROES,  "WRITE_ZEROES" },   \
+   { REQ_OP_ZONE_OPEN, "ZONE_OPEN" },  \
+   { REQ_OP_ZONE_CLOSE,"ZONE_CLOSE" }, \
+   { REQ_OP_ZONE_FINISH,   "ZONE_FINISH" },\
+   { REQ_OP_SCSI_IN,   "SCSI_IN" },\
+   { REQ_OP_SCSI_OUT,  "SCSI_OUT" },   \
+   { REQ_OP_DRV_IN,"DRV_IN" }, \
+   { REQ_OP_DRV_OUT,   "DRV_OUT" }
+
+#define show_block_req_op(req) \
+   __print_symbolic((req) & REQ_OP_MASK, BLOCK_REQ_OP_STRINGS)
+
+
+/* Request operation flags, see enum req_flag_bits */
+
+TRACE_DEFINE_ENUM(__REQ_FAILFAST_DEV);
+TRACE_DEFINE_ENUM(__REQ_FAILFAST_TRANSPORT);
+TRACE_DEFINE_ENUM(__REQ_FAILFAST_DRIVER);
+TRACE_DEFINE_ENUM(__REQ_SYNC);
+TRACE_DEFINE_ENUM(__REQ_META);
+TRACE_DEFINE_ENUM(__REQ_PRIO);
+TRACE_DEFINE_ENUM(__REQ_NOMERGE);
+TRACE_DEFINE_ENUM(__REQ_IDLE);
+TRACE_DEFINE_ENUM(__REQ_INTEGRITY);
+TRACE_DEFINE_ENUM(__REQ_FUA);
+TRACE_DEFINE_ENUM(__REQ_PREFLUSH);
+TRACE_DEFINE_ENUM(__REQ_RAHEAD);
+TRACE_DEFINE_ENUM(__REQ_BACKGROUND);
+TRACE_DEFINE_ENUM(__REQ_NOWAIT);
+TRACE_DEFINE_ENUM(__REQ_NOWAIT_INLINE);
+TRACE_DEFINE_ENUM(__REQ_CGROUP_PUNT);
+TRACE_DEFINE_ENUM(__REQ_NOUNMAP);
+TRACE_DEFINE_ENUM(__REQ_HIPRI);
+TRACE_DEFINE_ENUM(__REQ_DRV);
+TRACE_DEFINE_ENUM(__REQ_SWAP);
+
+#define BLOCK_REQ_FLAG_STRINGS \
+   { REQ_FAILFAST_DEV, "FAILFAST_DEV" },   \
+   { REQ_FAILFAST_TRANSPORT,"FAILFAST_TRANSPORT" },\
+   { REQ_FAILFAST_DRIVER,  "FAILFAST_DRIVER" },\
+   { REQ_SYNC, "SYNC" },   \
+   { REQ_META, "META" },   \
+   { REQ_PRIO, "PRIO" },   \
+   { REQ_NOMERGE,  "NOMERGE" },\
+   { REQ_IDLE, "IDLE" },   \
+   { REQ_INTEGRITY,"INTEGRITY" },  \
+   { REQ_FUA,  "FUA" },\
+   { REQ_PREFLUSH, "PREFLUSH" },   \
+   { REQ_RAHEAD,   "RAHEAD" }, \
+   { REQ_BACKGROUND,   "BACKGROUND" }, \
+   { REQ_NOWAIT,   "NOWAIT" }, \
+   { REQ_NOWAIT_INLINE,"NOWAIT_INLINE" },  \
+   { REQ_CGROUP_PUNT,  "CGROUP_PUNT" },\
+   { REQ_NOUNMAP,  "NOUNMAP" },\
+   { REQ_HIP

[PATCH RFC 1/2] tracing/block: cleanup rwbs filling in trace events

2020-05-04 Thread Konstantin Khlebnikov
Define BLK_RWBS_LEN in blktrace_api.h.
Bcache events use a shorter 6-char buffer, which could overflow. (The rwbs
string can be up to seven characters plus the terminating NUL in the worst
case, hence the 8-byte BLK_RWBS_LEN.)

Also remove the unused "bytes" argument.

Signed-off-by: Konstantin Khlebnikov 
---
 include/linux/blktrace_api.h  |4 +-
 include/trace/events/bcache.h |   20 +-
 include/trace/events/block.h  |   84 -
 kernel/trace/blktrace.c   |2 -
 4 files changed, 55 insertions(+), 55 deletions(-)

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 3b6ff5902edc..ea9da15d32d9 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -120,7 +120,9 @@ struct compat_blk_user_trace_setup {
 
 #endif
 
-extern void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes);
+#define BLK_RWBS_LEN   8
+
+extern void blk_fill_rwbs(char *rwbs, unsigned int op);
 
 static inline sector_t blk_rq_trace_sector(struct request *rq)
 {
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index 0bddea663b3b..7440d704c200 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -18,7 +18,7 @@ DECLARE_EVENT_CLASS(bcache_request,
__field(sector_t,   sector  )
__field(dev_t,  orig_sector )
__field(unsigned int,   nr_sector   )
-   __array(char,   rwbs,   6   )
+   __array(char,   rwbs,   BLK_RWBS_LEN)
),
 
TP_fast_assign(
@@ -28,7 +28,7 @@ DECLARE_EVENT_CLASS(bcache_request,
__entry->sector = bio->bi_iter.bi_sector;
__entry->orig_sector= bio->bi_iter.bi_sector - 16;
__entry->nr_sector  = bio->bi_iter.bi_size >> 9;
-   blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+   blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
),
 
TP_printk("%d,%d %s %llu + %u (from %d,%d @ %llu)",
@@ -95,14 +95,14 @@ DECLARE_EVENT_CLASS(bcache_bio,
__field(dev_t,  dev )
__field(sector_t,   sector  )
__field(unsigned int,   nr_sector   )
-   __array(char,   rwbs,   6   )
+   __array(char,   rwbs,   BLK_RWBS_LEN)
),
 
TP_fast_assign(
__entry->dev= bio_dev(bio);
__entry->sector = bio->bi_iter.bi_sector;
__entry->nr_sector  = bio->bi_iter.bi_size >> 9;
-   blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+   blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
),
 
TP_printk("%d,%d  %s %llu + %u",
@@ -128,7 +128,7 @@ TRACE_EVENT(bcache_read,
__field(dev_t,  dev )
__field(sector_t,   sector  )
__field(unsigned int,   nr_sector   )
-   __array(char,   rwbs,   6   )
+   __array(char,   rwbs,   BLK_RWBS_LEN)
__field(bool,   cache_hit   )
__field(bool,   bypass  )
),
@@ -137,7 +137,7 @@ TRACE_EVENT(bcache_read,
__entry->dev= bio_dev(bio);
__entry->sector = bio->bi_iter.bi_sector;
__entry->nr_sector  = bio->bi_iter.bi_size >> 9;
-   blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+   blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
__entry->cache_hit = hit;
__entry->bypass = bypass;
),
@@ -158,7 +158,7 @@ TRACE_EVENT(bcache_write,
__field(u64,inode   )
__field(sector_t,   sector  )
__field(unsigned int,   nr_sector   )
-   __array(char,   rwbs,   6   )
+   __array(char,   rwbs,   BLK_RWBS_LEN)
__field(bool,   writeback   )
__field(bool,   bypass  )
),
@@ -168,7 +168,7 @@ TRACE_EVENT(bcache_write,
__entry->inode  = inode;
__entry->sector = bio->bi_iter.bi_sector;
__entry->nr_sector  = bio->bi_iter.bi_size >> 9;
-   blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+   blk_fill_rwbs(__entry->rwbs, bio->bi_opf);
__entry->writeback = writeback;
__entry->bypass = bypass;
),
@@ -229,7 +229,7 @@ 

Re: [PATCH 2/4] block/part_stat: use __this_cpu_add() instead of access by smp_processor_id()

2020-05-04 Thread Konstantin Khlebnikov

On 04/05/2020 17.03, Christoph Hellwig wrote:

+#define __part_stat_add(part, field, addnd)\
+   (part_stat_get(part, field) += (addnd))


Just open coding part_stat_get for the UP side would seems a little
easier to read.


If we rewrite the field definition as

#ifdef  CONFIG_SMP
struct disk_stats __percpu *dkstats;
#else
struct disk_stats __percpu dkstats[1];
#endif

then all per-cpu macros should work as-is for the UP case too.
Surprisingly the arrow operator (struct->field) works for arrays too =)


Inlining per-cpu structures should be good for tiny UP systems,
especially if this could be done with a few macros in only three places:
definition, allocation and freeing.


But sparse still complains.

./include/linux/part_stat.h:45:24: warning: incorrect type in initializer 
(different address spaces)
./include/linux/part_stat.h:45:24:expected void const [noderef]  
*__vpp_verify
./include/linux/part_stat.h:45:24:got struct disk_stats [noderef] *

Looks like it cannot assign a different address space to the field.




Otherwise this looks good:

Reviewed-by: Christoph Hellwig 



[PATCH 3/4] block/part_stat: account merge of two requests

2020-05-04 Thread Konstantin Khlebnikov
Also rename blk_account_io_merge() into blk_account_io_merge_request() to
distinguish it from merging a request and a bio.

Signed-off-by: Konstantin Khlebnikov 
---
 block/blk-merge.c |7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index a04e991b5ded..37bced39bae8 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -662,20 +662,23 @@ void blk_rq_set_mixed_merge(struct request *rq)
rq->rq_flags |= RQF_MIXED_MERGE;
 }
 
-static void blk_account_io_merge(struct request *req)
+static void blk_account_io_merge_request(struct request *req)
 {
if (blk_do_io_stat(req)) {
+   const int sgrp = op_stat_group(req_op(req));
struct hd_struct *part;
 
part_stat_lock();
part = req->part;
 
+   part_stat_inc(part, merges[sgrp]);
part_dec_in_flight(req->q, part, rq_data_dir(req));
 
hd_struct_put(part);
part_stat_unlock();
}
 }
+
 /*
  * Two cases of handling DISCARD merge:
  * If max_discard_segments > 1, the driver takes every bio
@@ -787,7 +790,7 @@ static struct request *attempt_merge(struct request_queue 
*q,
/*
 * 'next' is going away, so update stats accordingly
 */
-   blk_account_io_merge(next);
+   blk_account_io_merge_request(next);
 
/*
 * ownership of bio passed from next to req, return 'next' for



[PATCH 4/4] block/part_stat: add helper blk_account_io_merge_bio()

2020-05-04 Thread Konstantin Khlebnikov
Move the non-"new_io" branch of blk_account_io_start() into a separate function.
Fix merge accounting for discards (they were counted as write merges).

Also, blk_account_io_merge_bio() doesn't call update_io_ticks(), unlike
blk_account_io_start(); there is no reason for it there.

Signed-off-by: Konstantin Khlebnikov 
---
 block/blk-core.c |   39 ++-
 block/blk-exec.c |2 +-
 block/blk-mq.c   |2 +-
 block/blk.h  |2 +-
 4 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 45ddf7238c06..18fb42eb2f18 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -622,6 +622,17 @@ void blk_put_request(struct request *req)
 }
 EXPORT_SYMBOL(blk_put_request);
 
+static void blk_account_io_merge_bio(struct request *req)
+{
+   if (blk_do_io_stat(req)) {
+   const int sgrp = op_stat_group(req_op(req));
+
+   part_stat_lock();
+   part_stat_inc(req->part, merges[sgrp]);
+   part_stat_unlock();
+   }
+}
+
 bool bio_attempt_back_merge(struct request *req, struct bio *bio,
unsigned int nr_segs)
 {
@@ -640,7 +651,7 @@ bool bio_attempt_back_merge(struct request *req, struct bio 
*bio,
req->biotail = bio;
req->__data_len += bio->bi_iter.bi_size;
 
-   blk_account_io_start(req, false);
+   blk_account_io_merge_bio(req);
return true;
 }
 
@@ -664,7 +675,7 @@ bool bio_attempt_front_merge(struct request *req, struct 
bio *bio,
req->__sector = bio->bi_iter.bi_sector;
req->__data_len += bio->bi_iter.bi_size;
 
-   blk_account_io_start(req, false);
+   blk_account_io_merge_bio(req);
return true;
 }
 
@@ -686,7 +697,7 @@ bool bio_attempt_discard_merge(struct request_queue *q, 
struct request *req,
req->__data_len += bio->bi_iter.bi_size;
req->nr_phys_segments = segments + 1;
 
-   blk_account_io_start(req, false);
+   blk_account_io_merge_bio(req);
return true;
 no_merge:
req_set_nomerge(q, req);
@@ -1258,7 +1269,7 @@ blk_status_t blk_insert_cloned_request(struct 
request_queue *q, struct request *
return BLK_STS_IOERR;
 
if (blk_queue_io_stat(q))
-   blk_account_io_start(rq, true);
+   blk_account_io_start(rq);
 
/*
 * Since we have a scheduler attached on the top device,
@@ -1348,20 +1359,14 @@ void blk_account_io_done(struct request *req, u64 now)
}
 }
 
-void blk_account_io_start(struct request *rq, bool new_io)
+void blk_account_io_start(struct request *rq)
 {
struct hd_struct *part;
int rw = rq_data_dir(rq);
 
-   if (!blk_do_io_stat(rq))
-   return;
-
-   part_stat_lock();
+   if (blk_do_io_stat(rq)) {
+   part_stat_lock();
 
-   if (!new_io) {
-   part = rq->part;
-   part_stat_inc(part, merges[rw]);
-   } else {
rcu_read_lock();
part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
if (!hd_struct_try_get(part)) {
@@ -1378,13 +1383,13 @@ void blk_account_io_start(struct request *rq, bool 
new_io)
}
rcu_read_unlock();
 
-   part_inc_in_flight(rq->q, part, rw);
rq->part = part;
-   }
 
-   update_io_ticks(part, jiffies, false);
+   part_inc_in_flight(rq->q, part, rw);
+   update_io_ticks(part, jiffies, false);
 
-   part_stat_unlock();
+   part_stat_unlock();
+   }
 }
 
 /*
diff --git a/block/blk-exec.c b/block/blk-exec.c
index e20a852ae432..85324d53d072 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -55,7 +55,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct 
gendisk *bd_disk,
rq->rq_disk = bd_disk;
rq->end_io = done;
 
-   blk_account_io_start(rq, true);
+   blk_account_io_start(rq);
 
/*
 * don't check dying flag for MQ because the request won't
diff --git a/block/blk-mq.c b/block/blk-mq.c
index bcc3a2397d4a..049c4f9417c3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1794,7 +1794,7 @@ static void blk_mq_bio_to_request(struct request *rq, 
struct bio *bio,
rq->write_hint = bio->bi_write_hint;
blk_rq_bio_prep(rq, bio, nr_segs);
 
-   blk_account_io_start(rq, true);
+   blk_account_io_start(rq);
 }
 
 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
diff --git a/block/blk.h b/block/blk.h
index 73bd3b1c6938..06cd57cc10fb 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -195,7 +195,7 @@ bool bio_attempt_discard_merge(struct request_queue *q, 
struct request *req,
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **same_queue_rq);
 
-void blk_account_io_start(struct request *req

[PATCH 2/4] block/part_stat: use __this_cpu_add() instead of access by smp_processor_id()

2020-05-04 Thread Konstantin Khlebnikov
Most architectures have a fast path for accessing per-cpu data of the current CPU.
The required preempt_disable() is provided by part_stat_lock().
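
A generic illustration of the difference (plain per-cpu counter, not the block
layer code itself):

#include <linux/percpu.h>
#include <linux/smp.h>

static DEFINE_PER_CPU(unsigned long, demo_counter);

/* open-coded form: compute this cpu's address by hand */
static void demo_add_by_cpu_id(unsigned long n)
{
	preempt_disable();
	*per_cpu_ptr(&demo_counter, smp_processor_id()) += n;
	preempt_enable();
}

/*
 * __this_cpu_add(): a single instruction on most architectures; the caller
 * must already have preemption disabled, as part_stat_lock() now guarantees.
 */
static void demo_add_this_cpu(unsigned long n)
{
	__this_cpu_add(demo_counter, n);
}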

Signed-off-by: Konstantin Khlebnikov 
---
 include/linux/part_stat.h |9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index 755a01f0fd61..a0ddeff3798e 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -36,6 +36,9 @@
res;\
 })
 
+#define __part_stat_add(part, field, addnd)\
+   __this_cpu_add((part)->dkstats->field, addnd)
+
 static inline void part_stat_set_all(struct hd_struct *part, int value)
 {
int i;
@@ -64,6 +67,9 @@ static inline void free_part_stats(struct hd_struct *part)
 #define part_stat_get_cpu(part, field, cpu)part_stat_get(part, field)
 #define part_stat_read(part, field)part_stat_get(part, field)
 
+#define __part_stat_add(part, field, addnd)\
+   (part_stat_get(part, field) += (addnd))
+
 static inline void part_stat_set_all(struct hd_struct *part, int value)
 {
memset(&part->dkstats, value, sizeof(struct disk_stats));
@@ -85,9 +91,6 @@ static inline void free_part_stats(struct hd_struct *part)
 part_stat_read(part, field[STAT_WRITE]) +  \
 part_stat_read(part, field[STAT_DISCARD]))
 
-#define __part_stat_add(part, field, addnd)\
-   (part_stat_get(part, field) += (addnd))
-
 #define part_stat_add(part, field, addnd)  do {\
__part_stat_add((part), field, addnd);  \
if ((part)->partno) \



[PATCH 1/4] block/part_stat: remove rcu_read_lock() from part_stat_lock()

2020-05-04 Thread Konstantin Khlebnikov
The RCU lock is required only in blk_account_io_start() to look up the partition.
After that the request holds a reference to the related hd_struct.

Replace get_cpu() with preempt_disable() - the returned cpu index is unused.

The non-SMP case also needs preempt_disable(), otherwise the statistics update
could be non-atomic. Previously that was provided by rcu_read_lock().
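
A minimal usage sketch (for illustration only, not part of the patch) of an
accounting section after this change:

	part_stat_lock();			/* now just preempt_disable() */
	part_stat_inc(part, merges[rw]);	/* percpu counter update */
	part_stat_unlock();			/* preempt_enable() */

The hd_struct reference held by the request keeps "part" alive, so RCU is only
needed around the disk_map_sector_rcu() lookup itself.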

Signed-off-by: Konstantin Khlebnikov 
---
 block/blk-core.c  |3 +++
 include/linux/part_stat.h |7 +++
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 7f11560bfddb..45ddf7238c06 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1362,6 +1362,7 @@ void blk_account_io_start(struct request *rq, bool new_io)
part = rq->part;
part_stat_inc(part, merges[rw]);
} else {
+   rcu_read_lock();
part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
if (!hd_struct_try_get(part)) {
/*
@@ -1375,6 +1376,8 @@ void blk_account_io_start(struct request *rq, bool new_io)
part = &rq->rq_disk->part0;
hd_struct_get(part);
}
+   rcu_read_unlock();
+
part_inc_in_flight(rq->q, part, rw);
rq->part = part;
}
diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index ece607607a86..755a01f0fd61 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -16,9 +16,10 @@
  * part_stat_{add|set_all}() and {init|free}_part_stats are for
  * internal use only.
  */
+#define part_stat_lock()   preempt_disable()
+#define part_stat_unlock() preempt_enable()
+
 #ifdef CONFIG_SMP
-#define part_stat_lock()   ({ rcu_read_lock(); get_cpu(); })
-#define part_stat_unlock() do { put_cpu(); rcu_read_unlock(); } while (0)
 
 #define part_stat_get_cpu(part, field, cpu)\
(per_cpu_ptr((part)->dkstats, (cpu))->field)
@@ -58,8 +59,6 @@ static inline void free_part_stats(struct hd_struct *part)
 }
 
 #else /* !CONFIG_SMP */
-#define part_stat_lock()   ({ rcu_read_lock(); 0; })
-#define part_stat_unlock() rcu_read_unlock()
 
 #define part_stat_get(part, field) ((part)->dkstats.field)
 #define part_stat_get_cpu(part, field, cpu)part_stat_get(part, field)



[PATCH v3] perf tools: fix detecting SMT at machines with more than 32 cpus

2020-05-04 Thread Konstantin Khlebnikov
The cpu bitmap is split into 32-bit words. On a system with more than 32 cores
sibling threads always end up in different words, so the first word never has
two bits set: cpu0: ",0100,0001", cpu 79: "8000,0080,".

Instead of parsing the bitmap, read "core_cpus_list" or "thread_siblings_list"
and simply check for the presence of ',' or '-' in it.
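
A minimal stand-alone sketch of the same check (illustration only; the real
code goes through sysfs__read_str() and tries "core_cpus_list" first):

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char buf[256];
		FILE *f = fopen("/sys/devices/system/cpu/cpu0/topology/thread_siblings_list", "r");

		if (!f)
			return 1;
		if (!fgets(buf, sizeof(buf), f)) {
			fclose(f);
			return 1;
		}
		fclose(f);
		/* a list covering more than one cpu contains ',' or '-' */
		printf("smt: %s\n", (strchr(buf, ',') || strchr(buf, '-')) ? "on" : "off");
		return 0;
	}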

Signed-off-by: Konstantin Khlebnikov 
Fixes: de5077c4e38f ("perf tools: Add utility function to detect SMT status")
---
 tools/perf/util/smt.c |   39 ---
 1 file changed, 16 insertions(+), 23 deletions(-)

diff --git a/tools/perf/util/smt.c b/tools/perf/util/smt.c
index 20bacd5972ad..ef713981725a 100644
--- a/tools/perf/util/smt.c
+++ b/tools/perf/util/smt.c
@@ -1,6 +1,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include "api/fs/fs.h"
 #include "smt.h"
@@ -9,6 +10,7 @@ int smt_on(void)
 {
static bool cached;
static int cached_result;
+   char *str = NULL;
int cpu;
int ncpu;
 
@@ -20,33 +22,24 @@ int smt_on(void)
 
ncpu = sysconf(_SC_NPROCESSORS_CONF);
for (cpu = 0; cpu < ncpu; cpu++) {
-   unsigned long long siblings;
-   char *str;
-   size_t strlen;
char fn[256];
+   size_t len;
 
-   snprintf(fn, sizeof fn,
-   "devices/system/cpu/cpu%d/topology/core_cpus", cpu);
-   if (sysfs__read_str(fn, &str, &strlen) < 0) {
-   snprintf(fn, sizeof fn,
-   "devices/system/cpu/cpu%d/topology/thread_siblings",
-   cpu);
-   if (sysfs__read_str(fn, &str, &strlen) < 0)
-   continue;
-   }
-   /* Entry is hex, but does not have 0x, so need custom parser */
-   siblings = strtoull(str, NULL, 16);
-   free(str);
-   if (hweight64(siblings) > 1) {
-   cached_result = 1;
-   cached = true;
+   snprintf(fn, sizeof(fn), "devices/system/cpu/cpu%d/topology/%s",
+cpu, "core_cpus_list");
+   if (sysfs__read_str(fn, &str, &len) > 0)
+   break;
+
+   snprintf(fn, sizeof(fn), "devices/system/cpu/cpu%d/topology/%s",
+cpu, "thread_siblings_list");
+   if (sysfs__read_str(fn, &str, &len) > 0)
break;
-   }
}
-   if (!cached) {
-   cached_result = 0;
+
+   // ',' or '-' should be in string if list contains more than one cpu
+   cached_result = str && (strchr(str, ',') || strchr(str, '-'));
+   free(str);
 done:
-   cached = true;
-   }
+   cached = true;
return cached_result;
 }



[tip: x86/urgent] ftrace/x86: Fix trace event registration for syscalls without arguments

2020-05-01 Thread tip-bot2 for Konstantin Khlebnikov
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: fdc63ff0e49c54992b4b2656345a5e878b32
Gitweb:
https://git.kernel.org/tip/fdc63ff0e49c54992b4b2656345a5e878b32
Author:Konstantin Khlebnikov 
AuthorDate:Wed, 08 Apr 2020 21:13:10 +03:00
Committer: Thomas Gleixner 
CommitterDate: Fri, 01 May 2020 19:15:40 +02:00

ftrace/x86: Fix trace event registration for syscalls without arguments

The refactoring of SYSCALL_DEFINE0() macros removed the ABI stubs and
simply defines __abi_sys_$NAME as alias of __do_sys_$NAME.

As a result kallsyms_lookup() returns "__do_sys_$NAME" which does not match
with the declared trace event name.

See also commit 1c758a2202a6 ("tracing/x86: Update syscall trace events to
handle new prefixed syscall func names").

Add __do_sys_ to the valid prefixes which are checked in
arch_syscall_match_sym_name().

Fixes: d2b5de495ee9 ("x86/entry: Refactor SYSCALL_DEFINE0 macros")
Signed-off-by: Konstantin Khlebnikov 
Signed-off-by: Thomas Gleixner 
Acked-by: Steven Rostedt (VMware) 
Link: 
https://lkml.kernel.org/r/158636958997.7900.16485049455470033557.stgit@buzz

---
 arch/x86/include/asm/ftrace.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 85be2f5..70b96ca 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -61,11 +61,12 @@ static inline bool arch_syscall_match_sym_name(const char 
*sym, const char *name
 {
/*
 * Compare the symbol name with the system call name. Skip the
-* "__x64_sys", "__ia32_sys" or simple "sys" prefix.
+* "__x64_sys", "__ia32_sys", "__do_sys" or simple "sys" prefix.
 */
return !strcmp(sym + 3, name + 3) ||
(!strncmp(sym, "__x64_", 6) && !strcmp(sym + 9, name + 3)) ||
-   (!strncmp(sym, "__ia32_", 7) && !strcmp(sym + 10, name + 3));
+   (!strncmp(sym, "__ia32_", 7) && !strcmp(sym + 10, name + 3)) ||
+   (!strncmp(sym, "__do_sys", 8) && !strcmp(sym + 8, name + 3));
 }
 
 #ifndef COMPILE_OFFSETS


Re: [PATCH v2 2/3] perf tool: fix detecting smt at machines with more than 32 cpus

2020-04-29 Thread Konstantin Khlebnikov
On Wed, Apr 29, 2020 at 9:16 PM Arnaldo Carvalho de Melo
 wrote:
>
> Em Wed, Apr 29, 2020 at 07:22:43PM +0300, Konstantin Khlebnikov escreveu:
> > Cpu bitmap is split into 32 bit words. For system with more than 32 cores
> > threads are always in different words thus first word never has two bits:
> > cpu0: ",0100,0001", cpu 79: "8000,0080,".
> >
> > Instead of parsing bitmap read "core_cpus_list" or "thread_siblings_list"
> > and simply check presence of ',' or '-' in it.
> >
> > Signed-off-by: Konstantin Khlebnikov 
> > Fixes: de5077c4e38f ("perf tools: Add utility function to detect SMT 
> > status")
> > ---
> >  tools/perf/util/smt.c |   37 +
> >  1 file changed, 17 insertions(+), 20 deletions(-)
> >
> > diff --git a/tools/perf/util/smt.c b/tools/perf/util/smt.c
> > index 8481842e9edb..dc37b5abd1c3 100644
> > --- a/tools/perf/util/smt.c
> > +++ b/tools/perf/util/smt.c
> > @@ -1,6 +1,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  #include 
> >  #include "api/fs/fs.h"
> >  #include "smt.h"
> > @@ -9,39 +10,35 @@ int smt_on(void)
> >  {
> >   static bool cached;
> >   static int cached_result;
> > + int active;
> >   int cpu;
> >   int ncpu;
> > + char *str = NULL;
> > + size_t strlen;
>
> Try not to use as the name of a variable the well known name of a
> standard C library function, there are cases where some of those names
> are used as #define directives and then all hell break loose...

You mean "strlen"? Yeah, that's weird name for variable
but it was here before me thus I haven't noticed.

>
> Also doing first the change that makes the use of that new file would
> allow me to have processed that patch first, which is way simpler than
> this one, i.e. try to leave the more involved changes to the end of the
> patchkit, that helps cherry-picking the less complex/smaller parts of
> your patchkit.

Hmm. Common sense says to put cleanups and bugfixes before new features.

>
> I've applied the first one, thanks!
>
> - Arnaldo
>
> >   if (cached)
> >   return cached_result;
> >
> >   ncpu = sysconf(_SC_NPROCESSORS_CONF);
> >   for (cpu = 0; cpu < ncpu; cpu++) {
> > - unsigned long long siblings;
> > - char *str;
> > - size_t strlen;
> >   char fn[256];
> >
> > - snprintf(fn, sizeof fn,
> > - "devices/system/cpu/cpu%d/topology/core_cpus", cpu);
> > - if (sysfs__read_str(fn, , ) < 0) {
> > - snprintf(fn, sizeof fn,
> > - 
> > "devices/system/cpu/cpu%d/topology/thread_siblings",
> > - cpu);
> > - if (sysfs__read_str(fn, , ) < 0)
> > - continue;
> > - }
> > - /* Entry is hex, but does not have 0x, so need custom parser 
> > */
> > - siblings = strtoull(str, NULL, 16);
> > - free(str);
> > - if (hweight64(siblings) > 1) {
> > - cached_result = 1;
> > - cached = true;
> > + snprintf(fn, sizeof(fn), 
> > "devices/system/cpu/cpu%d/topology/%s",
> > +  cpu, "core_cpus_list");
> > + if (sysfs__read_str(fn, , ) > 0)
> > + break;
> > +
> > + snprintf(fn, sizeof(fn), 
> > "devices/system/cpu/cpu%d/topology/%s",
> > +  cpu, "thread_siblings_list");
> > + if (sysfs__read_str(fn, , ) > 0)
> >   break;
> > - }
> >   }
> > +
> > + active = str && (strchr(str, ',') != NULL || strchr(str, '-') != 
> > NULL);
> > + free(str);
> > +
> >   if (!cached) {
> > - cached_result = 0;
> > + cached_result = active;
> >   cached = true;
> >   }
> >   return cached_result;
> >
>
> --
>
> - Arnaldo


[PATCH v2 3/3] perf tool: simplify checking active smt

2020-04-29 Thread Konstantin Khlebnikov
SMT can now be disabled via "/sys/devices/system/cpu/smt/control".
The status is shown in "/sys/devices/system/cpu/smt/active" simply as "0" / "1".

If this knob isn't present, fall back to checking the topology as before.
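
A minimal stand-alone sketch of reading the knob (illustration only; perf uses
sysfs__read_int() with a path relative to the sysfs mount point):

	#include <stdio.h>

	int main(void)
	{
		int active;
		FILE *f = fopen("/sys/devices/system/cpu/smt/active", "r");

		if (!f)
			return 1;	/* knob absent: fall back to topology checks */
		if (fscanf(f, "%d", &active) != 1)
			active = -1;
		fclose(f);
		printf("smt active: %d\n", active);
		return 0;
	}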

Signed-off-by: Konstantin Khlebnikov 
---
 tools/perf/util/smt.c |4 
 1 file changed, 4 insertions(+)

diff --git a/tools/perf/util/smt.c b/tools/perf/util/smt.c
index dc37b5abd1c3..c398528d1006 100644
--- a/tools/perf/util/smt.c
+++ b/tools/perf/util/smt.c
@@ -19,6 +19,9 @@ int smt_on(void)
if (cached)
return cached_result;
 
+   if (sysfs__read_int("devices/system/cpu/smt/active", &active) > 0)
+   goto done;
+
ncpu = sysconf(_SC_NPROCESSORS_CONF);
for (cpu = 0; cpu < ncpu; cpu++) {
char fn[256];
@@ -37,6 +40,7 @@ int smt_on(void)
active = str && (strchr(str, ',') != NULL || strchr(str, '-') != NULL);
free(str);
 
+done:
if (!cached) {
cached_result = active;
cached = true;



[PATCH v2 2/3] perf tool: fix detecting smt at machines with more than 32 cpus

2020-04-29 Thread Konstantin Khlebnikov
The cpu bitmap is split into 32-bit words. On a system with more than 32 cores
sibling threads always end up in different words, so the first word never has
two bits set: cpu0: ",0100,0001", cpu 79: "8000,0080,".

Instead of parsing the bitmap, read "core_cpus_list" or "thread_siblings_list"
and simply check for the presence of ',' or '-' in it.

Signed-off-by: Konstantin Khlebnikov 
Fixes: de5077c4e38f ("perf tools: Add utility function to detect SMT status")
---
 tools/perf/util/smt.c |   37 +
 1 file changed, 17 insertions(+), 20 deletions(-)

diff --git a/tools/perf/util/smt.c b/tools/perf/util/smt.c
index 8481842e9edb..dc37b5abd1c3 100644
--- a/tools/perf/util/smt.c
+++ b/tools/perf/util/smt.c
@@ -1,6 +1,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include "api/fs/fs.h"
 #include "smt.h"
@@ -9,39 +10,35 @@ int smt_on(void)
 {
static bool cached;
static int cached_result;
+   int active;
int cpu;
int ncpu;
+   char *str = NULL;
+   size_t strlen;
 
if (cached)
return cached_result;
 
ncpu = sysconf(_SC_NPROCESSORS_CONF);
for (cpu = 0; cpu < ncpu; cpu++) {
-   unsigned long long siblings;
-   char *str;
-   size_t strlen;
char fn[256];
 
-   snprintf(fn, sizeof fn,
-   "devices/system/cpu/cpu%d/topology/core_cpus", cpu);
-   if (sysfs__read_str(fn, &str, &strlen) < 0) {
-   snprintf(fn, sizeof fn,
-   "devices/system/cpu/cpu%d/topology/thread_siblings",
-   cpu);
-   if (sysfs__read_str(fn, &str, &strlen) < 0)
-   continue;
-   }
-   /* Entry is hex, but does not have 0x, so need custom parser */
-   siblings = strtoull(str, NULL, 16);
-   free(str);
-   if (hweight64(siblings) > 1) {
-   cached_result = 1;
-   cached = true;
+   snprintf(fn, sizeof(fn), "devices/system/cpu/cpu%d/topology/%s",
+cpu, "core_cpus_list");
+   if (sysfs__read_str(fn, &str, &strlen) > 0)
+   break;
+
+   snprintf(fn, sizeof(fn), "devices/system/cpu/cpu%d/topology/%s",
+cpu, "thread_siblings_list");
+   if (sysfs__read_str(fn, &str, &strlen) > 0)
break;
-   }
}
+
+   active = str && (strchr(str, ',') != NULL || strchr(str, '-') != NULL);
+   free(str);
+
if (!cached) {
-   cached_result = 0;
+   cached_result = active;
cached = true;
}
return cached_result;



[PATCH v2 1/3] perf tool: fix reading new topology attribute "core_cpus"

2020-04-29 Thread Konstantin Khlebnikov
Check access("devices/system/cpu/cpu%d/topology/core_cpus", F_OK) fails,
unless current directory is "/sys". Simply try read this file first.
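
A stand-alone illustration of the problem (assumptions: sysfs is mounted at
/sys; the fs__ helpers in tools/lib/api resolve paths against the sysfs mount
point, while a bare access() resolves them against the current working
directory):

	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		const char *rel = "devices/system/cpu/cpu0/topology/core_cpus";

		/* relative path, resolved against the current working directory */
		printf("from cwd: %d\n", access(rel, F_OK));

		/* the same relative path only works after chdir("/sys") */
		if (chdir("/sys") == 0)
			printf("from /sys: %d\n", access(rel, F_OK));
		return 0;
	}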

Signed-off-by: Konstantin Khlebnikov 
Fixes: 0ccdb8407a46 ("perf tools: Apply new CPU topology sysfs attributes")
---
 tools/perf/util/smt.c |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/smt.c b/tools/perf/util/smt.c
index 3b791ef2cd50..8481842e9edb 100644
--- a/tools/perf/util/smt.c
+++ b/tools/perf/util/smt.c
@@ -24,13 +24,13 @@ int smt_on(void)
 
snprintf(fn, sizeof fn,
"devices/system/cpu/cpu%d/topology/core_cpus", cpu);
-   if (access(fn, F_OK) == -1) {
+   if (sysfs__read_str(fn, &str, &strlen) < 0) {
snprintf(fn, sizeof fn,
"devices/system/cpu/cpu%d/topology/thread_siblings",
cpu);
+   if (sysfs__read_str(fn, &str, &strlen) < 0)
+   continue;
}
-   if (sysfs__read_str(fn, &str, &strlen) < 0)
-   continue;
/* Entry is hex, but does not have 0x, so need custom parser */
siblings = strtoull(str, NULL, 16);
free(str);



[PATCH] perf tool: fix detection of active SMT

2020-04-29 Thread Konstantin Khlebnikov
SMT can now be disabled via "/sys/devices/system/cpu/smt/control".
The status is shown in "/sys/devices/system/cpu/smt/active" simply as "0" / "1".

If this knob isn't present, fall back to checking the topology, but fix a couple of bugs:

The check access("devices/system/cpu/cpu%d/topology/core_cpus", F_OK) fails
unless the current directory is "/sys". Simply try to read this file first.

The cpu bitmap is split into 32-bit words. On a system with more than 32 cores
sibling threads always end up in different words, so the first word never has
two bits set: cpu0: ",0100,0001", cpu 79: "8000,0080,".

Instead of parsing the bitmap, read "core_cpus_list" or "thread_siblings_list"
and simply check for the presence of ',' or '-' in it.

Signed-off-by: Konstantin Khlebnikov 
Fixes: de5077c4e38f ("perf tools: Add utility function to detect SMT status")
Fixes: 0ccdb8407a46 ("perf tools: Apply new CPU topology sysfs attributes")
---
 tools/perf/util/smt.c |   41 +
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/tools/perf/util/smt.c b/tools/perf/util/smt.c
index 3b791ef2cd50..c398528d1006 100644
--- a/tools/perf/util/smt.c
+++ b/tools/perf/util/smt.c
@@ -1,6 +1,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include "api/fs/fs.h"
 #include "smt.h"
@@ -9,39 +10,39 @@ int smt_on(void)
 {
static bool cached;
static int cached_result;
+   int active;
int cpu;
int ncpu;
+   char *str = NULL;
+   size_t strlen;
 
if (cached)
return cached_result;
 
+   if (sysfs__read_int("devices/system/cpu/smt/active", &active) > 0)
+   goto done;
+
ncpu = sysconf(_SC_NPROCESSORS_CONF);
for (cpu = 0; cpu < ncpu; cpu++) {
-   unsigned long long siblings;
-   char *str;
-   size_t strlen;
char fn[256];
 
-   snprintf(fn, sizeof fn,
-   "devices/system/cpu/cpu%d/topology/core_cpus", cpu);
-   if (access(fn, F_OK) == -1) {
-   snprintf(fn, sizeof fn,
-   "devices/system/cpu/cpu%d/topology/thread_siblings",
-   cpu);
-   }
-   if (sysfs__read_str(fn, &str, &strlen) < 0)
-   continue;
-   /* Entry is hex, but does not have 0x, so need custom parser */
-   siblings = strtoull(str, NULL, 16);
-   free(str);
-   if (hweight64(siblings) > 1) {
-   cached_result = 1;
-   cached = true;
+   snprintf(fn, sizeof(fn), "devices/system/cpu/cpu%d/topology/%s",
+cpu, "core_cpus_list");
+   if (sysfs__read_str(fn, &str, &strlen) > 0)
+   break;
+
+   snprintf(fn, sizeof(fn), "devices/system/cpu/cpu%d/topology/%s",
+cpu, "thread_siblings_list");
+   if (sysfs__read_str(fn, &str, &strlen) > 0)
break;
-   }
}
+
+   active = str && (strchr(str, ',') != NULL || strchr(str, '-') != NULL);
+   free(str);
+
+done:
if (!cached) {
-   cached_result = 0;
+   cached_result = active;
cached = true;
}
return cached_result;



Re: [PATCH 4.4 1/2] x86/vdso: Remove direct HPET mapping into userspace

2019-10-23 Thread Konstantin Khlebnikov

On 23/10/2019 15.07, Konstantin Khlebnikov wrote:

commit 1ed95e52d902035e39a715ff3a314a893a96e5b7 upstream.

Commit d96d87834d5b870402a4a5b565706a4869ebc020 in v4.4.190, which is a
backport of upstream commit 1ed95e52d902035e39a715ff3a314a893a96e5b7,
removed only the HPET access from the vdso but left the HPET mapped in "vvar".
So userspace could still read the HPET directly and confuse the hardware.

This patch removes the mapping of the HPET page into userspace.

Fixes: d96d87834d5b ("x86/vdso: Remove direct HPET access through the vDSO") # 
v4.4.190
Signed-off-by: Konstantin Khlebnikov 
Link: 
https://lore.kernel.org/lkml/6fd42b2b-e29a-1fd6-03d1-e9da9192e...@yandex-team.ru/
---
  arch/x86/entry/vdso/vma.c |   14 --
  1 file changed, 14 deletions(-)

diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 6b46648588d8..cc0a3c16a95d 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -18,7 +18,6 @@
  #include 
  #include 
  #include 
-#include 
  #include 
  #include 
  
@@ -159,19 +158,6 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr)

if (ret)
goto up_fail;
  
-#ifdef CONFIG_HPET_TIMER

-   if (hpet_address && image->sym_hpet_page) {


Probably this patch is not required.
It seems that after removing the "hpet_page" symbol from the vdso code,
image->sym_hpet_page is always NULL and this branch is never executed.


-   ret = io_remap_pfn_range(vma,
-   text_start + image->sym_hpet_page,
-   hpet_address >> PAGE_SHIFT,
-   PAGE_SIZE,
-   pgprot_noncached(PAGE_READONLY));
-
-   if (ret)
-   goto up_fail;
-   }
-#endif
-
pvti = pvclock_pvti_cpu0_va();
if (pvti && image->sym_pvclock_page) {
ret = remap_pfn_range(vma,



[PATCH 4.4 1/2] x86/vdso: Remove direct HPET mapping into userspace

2019-10-23 Thread Konstantin Khlebnikov
commit 1ed95e52d902035e39a715ff3a314a893a96e5b7 upstream.

Commit d96d87834d5b870402a4a5b565706a4869ebc020 in v4.4.190, which is a
backport of upstream commit 1ed95e52d902035e39a715ff3a314a893a96e5b7,
removed only the HPET access from the vdso but left the HPET mapped in "vvar".
So userspace could still read the HPET directly and confuse the hardware.

This patch removes the mapping of the HPET page into userspace.

Fixes: d96d87834d5b ("x86/vdso: Remove direct HPET access through the vDSO") # 
v4.4.190
Signed-off-by: Konstantin Khlebnikov 
Link: 
https://lore.kernel.org/lkml/6fd42b2b-e29a-1fd6-03d1-e9da9192e...@yandex-team.ru/
---
 arch/x86/entry/vdso/vma.c |   14 --
 1 file changed, 14 deletions(-)

diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 6b46648588d8..cc0a3c16a95d 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -18,7 +18,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 
@@ -159,19 +158,6 @@ static int map_vdso(const struct vdso_image *image, bool 
calculate_addr)
if (ret)
goto up_fail;
 
-#ifdef CONFIG_HPET_TIMER
-   if (hpet_address && image->sym_hpet_page) {
-   ret = io_remap_pfn_range(vma,
-   text_start + image->sym_hpet_page,
-   hpet_address >> PAGE_SHIFT,
-   PAGE_SIZE,
-   pgprot_noncached(PAGE_READONLY));
-
-   if (ret)
-   goto up_fail;
-   }
-#endif
-
pvti = pvclock_pvti_cpu0_va();
if (pvti && image->sym_pvclock_page) {
ret = remap_pfn_range(vma,



[PATCH 4.4 2/2] x86/vdso: Remove hpet_page from vDSO

2019-10-23 Thread Konstantin Khlebnikov
From: Jia Zhang 

Commit 81d30225bc0c246b53270eb90b23cfbb941a186d upstream.

This trivial cleanup finalizes the removal of vDSO HPET support.

Fixes: 1ed95e52d902 ("x86/vdso: Remove direct HPET access through the vDSO")
Signed-off-by: Jia Zhang 
Signed-off-by: Thomas Gleixner 
Cc: l...@kernel.org
Cc: b...@alien8.de
Link: 
https://lkml.kernel.org/r/20190401114045.7280-1-zhang@linux.alibaba.com
Signed-off-by: Ingo Molnar 
Signed-off-by: Konstantin Khlebnikov 
---
 arch/x86/entry/vdso/vdso2c.c |3 ---
 arch/x86/include/asm/vdso.h  |1 -
 2 files changed, 4 deletions(-)

diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 491020b2826d..6446ba489eb2 100644
--- a/arch/x86/entry/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -72,7 +72,6 @@ const char *outfilename;
 enum {
sym_vvar_start,
sym_vvar_page,
-   sym_hpet_page,
sym_pvclock_page,
sym_VDSO_FAKE_SECTION_TABLE_START,
sym_VDSO_FAKE_SECTION_TABLE_END,
@@ -80,7 +79,6 @@ enum {
 
 const int special_pages[] = {
sym_vvar_page,
-   sym_hpet_page,
sym_pvclock_page,
 };
 
@@ -92,7 +90,6 @@ struct vdso_sym {
 struct vdso_sym required_syms[] = {
[sym_vvar_start] = {"vvar_start", true},
[sym_vvar_page] = {"vvar_page", true},
-   [sym_hpet_page] = {"hpet_page", true},
[sym_pvclock_page] = {"pvclock_page", true},
[sym_VDSO_FAKE_SECTION_TABLE_START] = {
"VDSO_FAKE_SECTION_TABLE_START", false
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index deabaf9759b6..c2a1188cd0bf 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -21,7 +21,6 @@ struct vdso_image {
long sym_vvar_start;  /* Negative offset to the vvar area */
 
long sym_vvar_page;
-   long sym_hpet_page;
long sym_pvclock_page;
long sym_VDSO32_NOTE_MASK;
long sym___kernel_sigreturn;



Re: [PATCH RFC] time: validate watchdog clocksource using second best candidate

2019-10-23 Thread Konstantin Khlebnikov

On 20/05/2019 18.04, Konstantin Khlebnikov wrote:

On 18.05.2019 21:26, Thomas Gleixner wrote:

On Sat, 18 May 2019, Konstantin Khlebnikov wrote:


On 18.05.2019 18:17, Thomas Gleixner wrote:

On Wed, 15 May 2019, Konstantin Khlebnikov wrote:


Timekeeping watchdog verifies doubtful clocksources using more reliable
candidates. For x86 it likely verifies 'tsc' using 'hpet'. But 'hpet'
is far from perfect too. It's better to have second opinion if possible.

We're seeing sudden jumps of hpet counter to 0x:


On which kind of hardware? A particular type of CPU or random ones?


In general this is very rare event.

This exact pattern have been seen ten times or so on several servers with
Intel(R) Xeon(R) CPU E5-2660 v4 @ 2.00GHz
(this custom built platform with chipset Intel C610)

and haven't seen for previous generation
Intel(R) Xeon(R) CPU E5-2650 v2 @ 2.60GHz
(this is another custom built platform)


Same chipset? Note the HPET is part of the chipset not of the CPU.


Almost the same. Intel C600.




Link was in patch: https://lore.kernel.org/patchwork/patch/667413/


Hmm. Not really helpful either.


This patch uses second reliable clocksource as backup for validation.
For x86 this is usually 'acpi_pm'. If watchdog and backup are not consent
then other clocksources will not be marked as unstable at this iteration.


The mess you add to the watchdog code is unholy and that's broken as there
is no guarantee for acpi_pm (or any other secondary watchdog) being
available.


ACPI power management timer is a pretty standard x86 hardware.


Used to be.


But my patch should work for any platform with any second reliable
clocksource.


Which is close to zero if PM timer is not exposed.


If there is no second clocksource my patch does noting:
watchdog_backup stays NULL and backup_consent always true.


That still does not justify the extra complexity for a few custom built
systems.


 >
 > Aside of that this leaves the HPET in a half broken state. HPET is not only
 > used as a clock event device it's also exposed by HPET device. So no, if we
 > figure out that HPET is broken on some platforms we have to blacklist and
 > disable it completely and not just duct tape the place which exposes the
 > wreckage.
 >

If re-reading helps then HPET is fine.
This is temporary failure, probably bus issue.


I'll add re-reading with debug logging and try to collect more information this 
year.


Good news, everyone! We've found the conditions under which the HPET counter returns '-1'.

clocksource: timekeeping watchdog on CPU21: Marking clocksource 'tsc' as 
unstable because the skew is too large:
clocksource:   'hpet' wd_now:  wd_last: 40bc1ee3 
mask: 
clocksource:   'tsc' cs_now: 919b39935acdaa cs_last: 
919b3957c0ec24 mask: 
clocksource: Switched to clocksource hpet

This happens when user space does an inappropriate access to the directly mapped
HPET. For example, dumping the whole "vvar" area: memcpy(buf, addr("vvar"), 0x3000).

In our case the sequence was triggered by a bug in "atop", which crashed on a "\n"
in a task comm. =)

In upstream everything is fine. Direct access to the HPET was sealed in 4.7
(we saw the bug in 4.4)
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1ed95e52d902035e39a715ff3a314a893a96e5b7

Kudos to Arseny Smalyuk for (accidental) reproducing and investigation.

---

#include <stdio.h>
#include <string.h>

char tmp[0x3000];

int main() {
void* vvar = NULL;
FILE* fp = fopen("/proc/self/maps", "r");

char buf[4096];
while (fgets(buf, 4096, fp)) {
size_t slen = strlen(buf);
if (slen > 0 && buf[slen - 1] == '\n') {
--slen;
}
if (slen > 6 && !memcmp(buf + slen - 6, "[vvar]", 6)) {
sscanf(buf, "%p", );
break;
}
}

fclose(fp);

memcpy(tmp, vvar, 0x3000);
return 0;
}


[PATCH] mm/vmstat: do not use size of vmstat_text as count of /proc/vmstat items

2019-10-19 Thread Konstantin Khlebnikov
Strings from vmstat_text[] will be used for printing memory cgroup
statistics, which exist even if CONFIG_VM_EVENT_COUNTERS=n.

This should be applied before patch "mm/memcontrol: use vmstat names
for printing statistics".

Signed-off-by: Konstantin Khlebnikov 
Link: 
https://lore.kernel.org/linux-mm/cd1c42ae-281f-c8a8-70ac-1d01d417b...@infradead.org/T/#u
---
 mm/vmstat.c |   26 --
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 590aeca27cab..13e36da70f3c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1638,25 +1638,23 @@ static const struct seq_operations zoneinfo_op = {
.show   = zoneinfo_show,
 };
 
+#define NR_VMSTAT_ITEMS (NR_VM_ZONE_STAT_ITEMS + \
+NR_VM_NUMA_STAT_ITEMS + \
+NR_VM_NODE_STAT_ITEMS + \
+NR_VM_WRITEBACK_STAT_ITEMS + \
+(IS_ENABLED(CONFIG_VM_EVENT_COUNTERS) ? \
+ NR_VM_EVENT_ITEMS : 0))
+
 static void *vmstat_start(struct seq_file *m, loff_t *pos)
 {
unsigned long *v;
-   int i, stat_items_size;
+   int i;
 
-   if (*pos >= ARRAY_SIZE(vmstat_text))
+   if (*pos >= NR_VMSTAT_ITEMS)
return NULL;
-   stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
- NR_VM_NUMA_STAT_ITEMS * sizeof(unsigned long) +
- NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) +
- NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
-
-#ifdef CONFIG_VM_EVENT_COUNTERS
-   stat_items_size += sizeof(struct vm_event_state);
-#endif
 
-   BUILD_BUG_ON(stat_items_size !=
-ARRAY_SIZE(vmstat_text) * sizeof(unsigned long));
-   v = kmalloc(stat_items_size, GFP_KERNEL);
+   BUILD_BUG_ON(ARRAY_SIZE(vmstat_text) < NR_VMSTAT_ITEMS);
+   v = kmalloc_array(NR_VMSTAT_ITEMS, sizeof(unsigned long), GFP_KERNEL);
m->private = v;
if (!v)
return ERR_PTR(-ENOMEM);
@@ -1689,7 +1687,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
 static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
 {
(*pos)++;
-   if (*pos >= ARRAY_SIZE(vmstat_text))
+   if (*pos >= NR_VMSTAT_ITEMS)
return NULL;
return (unsigned long *)m->private + *pos;
 }



Re: mmotm 2019-10-18-22-40 uploaded

2019-10-19 Thread Konstantin Khlebnikov
On Sat, Oct 19, 2019 at 8:40 AM  wrote:
>
> The mm-of-the-moment snapshot 2019-10-18-22-40 has been uploaded to
>
>http://www.ozlabs.org/~akpm/mmotm/
>
> mmotm-readme.txt says
>
> README for mm-of-the-moment:
>
> http://www.ozlabs.org/~akpm/mmotm/
>
> This is a snapshot of my -mm patch queue.  Uploaded at random hopefully
> more than once a week.
>
> You will need quilt to apply these patches to the latest Linus release (5.x
> or 5.x-rcY).  The series file is in broken-out.tar.gz and is duplicated in
> http://ozlabs.org/~akpm/mmotm/series
>
> The file broken-out.tar.gz contains two datestamp files: .DATE and
> .DATE--mm-dd-hh-mm-ss.  Both contain the string -mm-dd-hh-mm-ss,
> followed by the base kernel version against which this patch series is to
> be applied.
>
> This tree is partially included in linux-next.  To see which patches are
> included in linux-next, consult the `series' file.  Only the patches
> within the #NEXT_PATCHES_START/#NEXT_PATCHES_END markers are included in
> linux-next.
>
>
> A full copy of the full kernel tree with the linux-next and mmotm patches
> already applied is available through git within an hour of the mmotm
> release.  Individual mmotm releases are tagged.  The master branch always
> points to the latest release, so it's constantly rebasing.
>
> http://git.cmpxchg.org/cgit.cgi/linux-mmotm.git/

It seems the git mirror is not updated anymore.
The latest tag is v5.3-rc7-mmots-2019-09-03-21-33


Re: [PATCH] mm/memcontrol: update lruvec counters in mem_cgroup_move_account

2019-10-16 Thread Konstantin Khlebnikov

On 15/10/2019 17.31, Johannes Weiner wrote:

On Tue, Oct 15, 2019 at 01:04:01PM +0200, Michal Hocko wrote:

On Tue 15-10-19 13:49:14, Konstantin Khlebnikov wrote:

On 15/10/2019 13.36, Michal Hocko wrote:

On Tue 15-10-19 11:44:22, Konstantin Khlebnikov wrote:

On 15/10/2019 11.20, Michal Hocko wrote:

On Tue 15-10-19 11:09:59, Konstantin Khlebnikov wrote:

Mapped, dirty and writeback pages are also counted in per-lruvec stats.
These counters needs update when page is moved between cgroups.


Please describe the user visible effect.


Surprisingly I don't see any users at this moment.
So, there is no effect in mainline kernel.


Those counters are exported right? Or do we exclude them for v1?


It seems per-lruvec statistics is not exposed anywhere.
And per-lruvec NR_FILE_MAPPED, NR_FILE_DIRTY, NR_WRITEBACK never had users.


So why do we have it in the first place? I have to say that counters
as we have them now are really clear as mud. This is really begging for
a clean up.


IMO This is going in the right direction. The goal is to have all
vmstat items accounted per lruvec - the intersection of the node and
the memcg - to further integrate memcg into the traditional VM code
and eliminate differences between them. We use the lruvec counters
quite extensively in reclaim already, since the lruvec is the primary
context for page reclaim. More consumers will follow in pending
patches. This patch cleans up some stragglers.

The only counters we can't have in the lruvec are the legacy memcg
ones that are accounted to the memcg without a node context:
MEMCG_RSS, MEMCG_CACHE etc. We should eventually replace them with
per-lruvec accounted NR_ANON_PAGES, NR_FILE_PAGES etc - tracked by
generic VM code, not inside memcg, further reducing the size of the
memory controller. But it'll require some work in the page creation
path, as that accounting happens before the memcg commit right now.

Then we can get rid of memcg_stat_item and the_memcg_page_state
API. And we should be able to do for_each_node() summing of the lruvec
counters to produce memory.stat output, and drop memcg->vmstats_local,
memcg->vmstats_percpu, memcg->vmstats and memcg->vmevents altogether.



OK, I see where this is going.
Some years ago I worked on something similar,
including linking the page directly with its lruvec and moving lru_lock into the lruvec.

Indeed, the VM code must be split per-node except for accounting matters.
But summing per-node counters might be costly for balance_dirty_pages().
Probably memcg needs its own dirty page counter with per-cpu batching.


Re: [PATCH] mm/memcontrol: update lruvec counters in mem_cgroup_move_account

2019-10-15 Thread Konstantin Khlebnikov

On 15/10/2019 16.53, Johannes Weiner wrote:

On Tue, Oct 15, 2019 at 11:09:59AM +0300, Konstantin Khlebnikov wrote:

Mapped, dirty and writeback pages are also counted in per-lruvec stats.
These counters needs update when page is moved between cgroups.

Fixes: 00f3ca2c2d66 ("mm: memcontrol: per-lruvec stats infrastructure")
Signed-off-by: Konstantin Khlebnikov 


Acked-by: Johannes Weiner 

Please mention in the changelog that currently is nobody *consuming*
the lruvec versions of these counters and that there is no
user-visible effect. Thanks



Maybe just kill all these per-lruvec counters?
I see only one user which has no alternative data source: WORKINGSET_ACTIVATE.

This will save some memory: 32 * sizeof(long) * nr_nodes * nr_cpus bytes
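
For example, assuming 8-byte longs, two nodes and 64 cpus, that is
32 * 8 * 2 * 64 = 32768 bytes, i.e. roughly 32 KiB.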


Re: [PATCH] mm/memcontrol: update lruvec counters in mem_cgroup_move_account

2019-10-15 Thread Konstantin Khlebnikov

On 15/10/2019 13.36, Michal Hocko wrote:

On Tue 15-10-19 11:44:22, Konstantin Khlebnikov wrote:

On 15/10/2019 11.20, Michal Hocko wrote:

On Tue 15-10-19 11:09:59, Konstantin Khlebnikov wrote:

Mapped, dirty and writeback pages are also counted in per-lruvec stats.
These counters needs update when page is moved between cgroups.


Please describe the user visible effect.


Surprisingly I don't see any users at this moment.
So, there is no effect in mainline kernel.


Those counters are exported right? Or do we exclude them for v1?


It seems the per-lruvec statistics are not exposed anywhere.
And the per-lruvec NR_FILE_MAPPED, NR_FILE_DIRTY and NR_WRITEBACK never had users.

I found this because I'm using mem_cgroup_move_account() for recharging pages
at mlock, and right now I'm playing with a debug patch for the memory cgroup
which validates statistics and counters when a cgroup dies.




Fixes: 00f3ca2c2d66 ("mm: memcontrol: per-lruvec stats infrastructure")
Signed-off-by: Konstantin Khlebnikov 


We want Cc: stable I suspect because broken stats might be really
misleading.

The patch looks ok to me otherwise
Acked-by: Michal Hocko 


---
   mm/memcontrol.c |   18 --
   1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bdac56009a38..363106578876 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5420,6 +5420,8 @@ static int mem_cgroup_move_account(struct page *page,
   struct mem_cgroup *from,
   struct mem_cgroup *to)
   {
+   struct lruvec *from_vec, *to_vec;
+   struct pglist_data *pgdat;
unsigned long flags;
unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret;
@@ -5443,11 +5445,15 @@ static int mem_cgroup_move_account(struct page *page,
anon = PageAnon(page);
+   pgdat = page_pgdat(page);
+   from_vec = mem_cgroup_lruvec(pgdat, from);
+   to_vec = mem_cgroup_lruvec(pgdat, to);
+
spin_lock_irqsave(>move_lock, flags);
if (!anon && page_mapped(page)) {
-   __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
-   __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
+   __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
+   __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
}
/*
@@ -5459,14 +5465,14 @@ static int mem_cgroup_move_account(struct page *page,
struct address_space *mapping = page_mapping(page);
if (mapping_cap_account_dirty(mapping)) {
-   __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
-   __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
+   __mod_lruvec_state(from_vec, NR_FILE_DIRTY, -nr_pages);
+   __mod_lruvec_state(to_vec, NR_FILE_DIRTY, nr_pages);
}
}
if (PageWriteback(page)) {
-   __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
-   __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
+   __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
+   __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
}
   #ifdef CONFIG_TRANSPARENT_HUGEPAGE






[PATCH 2/2] mm/memcontrol: use vmstat names for printing statistics

2019-10-15 Thread Konstantin Khlebnikov
Use common names from the vmstat array when possible.
This makes little difference in code size for now,
but should help keep the interfaces consistent.

add/remove: 0/2 grow/shrink: 2/0 up/down: 70/-72 (-2)
Function old new   delta
memory_stat_format   9841050 +66
memcg_stat_show  957 961  +4
memcg1_event_names32   - -32
mem_cgroup_lru_names  40   - -40
Total: Before=14485337, After=14485335, chg -0.00%

Signed-off-by: Konstantin Khlebnikov 
---
 include/linux/vmstat.h |4 ++--
 mm/memcontrol.c|   52 
 mm/vmstat.c|9 +---
 3 files changed, 29 insertions(+), 36 deletions(-)

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index b995d8b680c2..292485f3d24d 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -420,7 +420,7 @@ static inline const char *writeback_stat_name(enum 
writeback_stat_item item)
   item];
 }
 
-#ifdef CONFIG_VM_EVENT_COUNTERS
+#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
 static inline const char *vm_event_name(enum vm_event_item item)
 {
return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
@@ -429,6 +429,6 @@ static inline const char *vm_event_name(enum vm_event_item 
item)
   NR_VM_WRITEBACK_STAT_ITEMS +
   item];
 }
-#endif /* CONFIG_VM_EVENT_COUNTERS */
+#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
 
 #endif /* _LINUX_VMSTAT_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bdac56009a38..e0aff8a81067 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -98,14 +98,6 @@ static bool do_memsw_account(void)
return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
 }
 
-static const char *const mem_cgroup_lru_names[] = {
-   "inactive_anon",
-   "active_anon",
-   "inactive_file",
-   "active_file",
-   "unevictable",
-};
-
 #define THRESHOLDS_EVENTS_TARGET 128
 #define SOFTLIMIT_EVENTS_TARGET 1024
 #define NUMAINFO_EVENTS_TARGET 1024
@@ -1438,7 +1430,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
   PAGE_SIZE);
 
for (i = 0; i < NR_LRU_LISTS; i++)
-   seq_buf_printf(, "%s %llu\n", mem_cgroup_lru_names[i],
+   seq_buf_printf(, "%s %llu\n", lru_list_name(i),
   (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
   PAGE_SIZE);
 
@@ -1451,8 +1443,10 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
 
/* Accumulated memory events */
 
-   seq_buf_printf(, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
-   seq_buf_printf(, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
+   seq_buf_printf(, "%s %lu\n", vm_event_name(PGFAULT),
+  memcg_events(memcg, PGFAULT));
+   seq_buf_printf(, "%s %lu\n", vm_event_name(PGMAJFAULT),
+  memcg_events(memcg, PGMAJFAULT));
 
seq_buf_printf(, "workingset_refault %lu\n",
   memcg_page_state(memcg, WORKINGSET_REFAULT));
@@ -1461,22 +1455,27 @@ static char *memory_stat_format(struct mem_cgroup 
*memcg)
seq_buf_printf(, "workingset_nodereclaim %lu\n",
   memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
 
-   seq_buf_printf(, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
+   seq_buf_printf(, "%s %lu\n",  vm_event_name(PGREFILL),
+  memcg_events(memcg, PGREFILL));
seq_buf_printf(, "pgscan %lu\n",
   memcg_events(memcg, PGSCAN_KSWAPD) +
   memcg_events(memcg, PGSCAN_DIRECT));
seq_buf_printf(, "pgsteal %lu\n",
   memcg_events(memcg, PGSTEAL_KSWAPD) +
   memcg_events(memcg, PGSTEAL_DIRECT));
-   seq_buf_printf(, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
-   seq_buf_printf(, "pgdeactivate %lu\n", memcg_events(memcg, 
PGDEACTIVATE));
-   seq_buf_printf(, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
-   seq_buf_printf(, "pglazyfreed %lu\n", memcg_events(memcg, 
PGLAZYFREED));
+   seq_buf_printf(, "%s %lu\n", vm_event_name(PGACTIVATE),
+  memcg_events(memcg, PGACTIVATE));
+   seq_buf_printf(, "%s %lu\n", vm_event_name(PGDEACTIVATE),
+  memcg_events(memcg, PGDEACTIVATE));
+   seq_buf_printf(, "%s %lu\n", vm_event_name(PGLAZYFREE),
+  memcg_events(memcg, PGLAZYFREE));
+   seq_buf_printf(, &

[PATCH 1/2] mm/vmstat: add helpers to get vmstat item names for each enum type

2019-10-15 Thread Konstantin Khlebnikov
Statistics in vmstat are combined from counters with different structures,
but the names for them are merged into one array.

This patch adds trivial helpers to get name for each item:

const char *zone_stat_name(enum zone_stat_item item);
const char *numa_stat_name(enum numa_stat_item item);
const char *node_stat_name(enum node_stat_item item);
const char *writeback_stat_name(enum writeback_stat_item item);
const char *vm_event_name(enum vm_event_item item);


Names for enum writeback_stat_item are folded into the middle of
vmstat_text, so this patch moves the declaration into the header to allow
calculating the offset of the following items.


Also this patch reuses a piece of the node stat names for the lru list names:

const char *lru_list_name(enum lru_list lru);

This returns common lru list names: "inactive_anon", "active_anon",
"inactive_file", "active_file", "unevictable".

Signed-off-by: Konstantin Khlebnikov 
---
 drivers/base/node.c|9 +++--
 include/linux/vmstat.h |   50 
 mm/vmstat.c|   27 +-
 3 files changed, 62 insertions(+), 24 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 296546ffed6c..98a31bafc8a2 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -496,20 +496,17 @@ static ssize_t node_read_vmstat(struct device *dev,
int n = 0;
 
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-   n += sprintf(buf+n, "%s %lu\n", vmstat_text[i],
+   n += sprintf(buf+n, "%s %lu\n", zone_stat_name(i),
 sum_zone_node_page_state(nid, i));
 
 #ifdef CONFIG_NUMA
for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
-   n += sprintf(buf+n, "%s %lu\n",
-vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+   n += sprintf(buf+n, "%s %lu\n", numa_stat_name(i),
 sum_zone_numa_state(nid, i));
 #endif
 
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
-   n += sprintf(buf+n, "%s %lu\n",
-vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
-NR_VM_NUMA_STAT_ITEMS],
+   n += sprintf(buf+n, "%s %lu\n", node_stat_name(i),
 node_page_state(pgdat, i));
 
return n;
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index bdeda4b079fe..b995d8b680c2 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -31,6 +31,12 @@ struct reclaim_stat {
unsigned nr_unmap_fail;
 };
 
+enum writeback_stat_item {
+   NR_DIRTY_THRESHOLD,
+   NR_DIRTY_BG_THRESHOLD,
+   NR_VM_WRITEBACK_STAT_ITEMS,
+};
+
 #ifdef CONFIG_VM_EVENT_COUNTERS
 /*
  * Light weight per cpu counter implementation.
@@ -381,4 +387,48 @@ static inline void __mod_zone_freepage_state(struct zone 
*zone, int nr_pages,
 
 extern const char * const vmstat_text[];
 
+static inline const char *zone_stat_name(enum zone_stat_item item)
+{
+   return vmstat_text[item];
+}
+
+#ifdef CONFIG_NUMA
+static inline const char *numa_stat_name(enum numa_stat_item item)
+{
+   return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
+  item];
+}
+#endif /* CONFIG_NUMA */
+
+static inline const char *node_stat_name(enum node_stat_item item)
+{
+   return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
+  NR_VM_NUMA_STAT_ITEMS +
+  item];
+}
+
+static inline const char *lru_list_name(enum lru_list lru)
+{
+   return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
+}
+
+static inline const char *writeback_stat_name(enum writeback_stat_item item)
+{
+   return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
+  NR_VM_NUMA_STAT_ITEMS +
+  NR_VM_NODE_STAT_ITEMS +
+  item];
+}
+
+#ifdef CONFIG_VM_EVENT_COUNTERS
+static inline const char *vm_event_name(enum vm_event_item item)
+{
+   return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
+  NR_VM_NUMA_STAT_ITEMS +
+  NR_VM_NODE_STAT_ITEMS +
+  NR_VM_WRITEBACK_STAT_ITEMS +
+  item];
+}
+#endif /* CONFIG_VM_EVENT_COUNTERS */
+
 #endif /* _LINUX_VMSTAT_H */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 6afc892a148a..590aeca27cab 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1134,7 +1134,7 @@ const char * const vmstat_text[] = {
"numa_other",
 #endif
 
-   /* Node-based counters */
+   /* enum node_stat_item counters */
"nr_inactive_anon",
"nr_active_anon",
"nr_inactive_file",
@@ -1547,10 +1547,8 @@ static void zoneinfo_show_print(struct seq_file *m, 
pg_data_t *pgdat,
if (is_zone_first_populated(pgdat, zone)) {
seq_printf(m, "\n  per-node s

Re: [PATCH] mm/memcontrol: update lruvec counters in mem_cgroup_move_account

2019-10-15 Thread Konstantin Khlebnikov

On 15/10/2019 11.20, Michal Hocko wrote:

On Tue 15-10-19 11:09:59, Konstantin Khlebnikov wrote:

Mapped, dirty and writeback pages are also counted in per-lruvec stats.
These counters needs update when page is moved between cgroups.


Please describe the user visible effect.


Surprisingly, I don't see any users at this moment.
So there is no effect in the mainline kernel.


Fixes: 00f3ca2c2d66 ("mm: memcontrol: per-lruvec stats infrastructure")
Signed-off-by: Konstantin Khlebnikov 


We want Cc: stable I suspect because broken stats might be really
misleading.

The patch looks ok to me otherwise
Acked-by: Michal Hocko 


---
  mm/memcontrol.c |   18 --
  1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bdac56009a38..363106578876 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5420,6 +5420,8 @@ static int mem_cgroup_move_account(struct page *page,
   struct mem_cgroup *from,
   struct mem_cgroup *to)
  {
+   struct lruvec *from_vec, *to_vec;
+   struct pglist_data *pgdat;
unsigned long flags;
unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret;
@@ -5443,11 +5445,15 @@ static int mem_cgroup_move_account(struct page *page,
  
  	anon = PageAnon(page);
  
+	pgdat = page_pgdat(page);

+   from_vec = mem_cgroup_lruvec(pgdat, from);
+   to_vec = mem_cgroup_lruvec(pgdat, to);
+
spin_lock_irqsave(>move_lock, flags);
  
  	if (!anon && page_mapped(page)) {

-   __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
-   __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
+   __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
+   __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
}
  
  	/*

@@ -5459,14 +5465,14 @@ static int mem_cgroup_move_account(struct page *page,
struct address_space *mapping = page_mapping(page);
  
  		if (mapping_cap_account_dirty(mapping)) {

-   __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
-   __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
+   __mod_lruvec_state(from_vec, NR_FILE_DIRTY, -nr_pages);
+   __mod_lruvec_state(to_vec, NR_FILE_DIRTY, nr_pages);
}
}
  
  	if (PageWriteback(page)) {

-   __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
-   __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
+   __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
+   __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
}
  
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE




[PATCH] mm/memcontrol: update lruvec counters in mem_cgroup_move_account

2019-10-15 Thread Konstantin Khlebnikov
Mapped, dirty and writeback pages are also counted in the per-lruvec stats.
These counters need to be updated when a page is moved between cgroups.

Fixes: 00f3ca2c2d66 ("mm: memcontrol: per-lruvec stats infrastructure")
Signed-off-by: Konstantin Khlebnikov 
---
 mm/memcontrol.c |   18 --
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bdac56009a38..363106578876 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5420,6 +5420,8 @@ static int mem_cgroup_move_account(struct page *page,
   struct mem_cgroup *from,
   struct mem_cgroup *to)
 {
+   struct lruvec *from_vec, *to_vec;
+   struct pglist_data *pgdat;
unsigned long flags;
unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret;
@@ -5443,11 +5445,15 @@ static int mem_cgroup_move_account(struct page *page,
 
anon = PageAnon(page);
 
+   pgdat = page_pgdat(page);
+   from_vec = mem_cgroup_lruvec(pgdat, from);
+   to_vec = mem_cgroup_lruvec(pgdat, to);
+
spin_lock_irqsave(&from->move_lock, flags);
 
if (!anon && page_mapped(page)) {
-   __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
-   __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
+   __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
+   __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
}
 
/*
@@ -5459,14 +5465,14 @@ static int mem_cgroup_move_account(struct page *page,
struct address_space *mapping = page_mapping(page);
 
if (mapping_cap_account_dirty(mapping)) {
-   __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
-   __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
+   __mod_lruvec_state(from_vec, NR_FILE_DIRTY, -nr_pages);
+   __mod_lruvec_state(to_vec, NR_FILE_DIRTY, nr_pages);
}
}
 
if (PageWriteback(page)) {
-   __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
-   __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
+   __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
+   __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
}
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE



Re: [Patch v4 2/2] mm/rmap.c: reuse mergeable anon_vma as parent when fork

2019-10-11 Thread Konstantin Khlebnikov

On 11/10/2019 10.22, Wei Yang wrote:

In function __anon_vma_prepare(), we will try to find anon_vma if it is
possible to reuse it. While on fork, the logic is different.

Since commit 5beb49305251 ("mm: change anon_vma linking to fix
multi-process server scalability issue"), function anon_vma_clone()
tries to allocate new anon_vma for child process. But the logic here
will allocate a new anon_vma for each vma, even in parent this vma
is mergeable and share the same anon_vma with its sibling. This may do
better for scalability issue, while it is not necessary to do so
especially after interval tree is used.

Commit 7a3ef208e662 ("mm: prevent endless growth of anon_vma hierarchy")
tries to reuse some anon_vma by counting child anon_vma and attached
vmas. While for those mergeable anon_vmas, we can just reuse it and not
necessary to go through the logic.

After this change, kernel build test reduces 20% anon_vma allocation.

Do the same kernel build test, it shows run time in sys reduced 11.6%.

Origin:

real2m50.467s
user17m52.002s
sys 1m51.953s

real2m48.662s
user17m55.464s
sys 1m50.553s

real2m51.143s
user17m59.687s
sys 1m53.600s

Patched:

real2m39.933s
user17m1.835s
sys 1m38.802s

real2m39.321s
user17m1.634s
sys 1m39.206s

real2m39.575s
user17m1.420s
sys 1m38.845s

Signed-off-by: Wei Yang 


Acked-by: Konstantin Khlebnikov 


---
  mm/rmap.c | 13 +
  1 file changed, 13 insertions(+)

diff --git a/mm/rmap.c b/mm/rmap.c
index c34414567474..2c13e2bfd393 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -268,6 +268,19 @@ int anon_vma_clone(struct vm_area_struct *dst, struct 
vm_area_struct *src)
  {
struct anon_vma_chain *avc, *pavc;
struct anon_vma *root = NULL;
+   struct vm_area_struct *prev = dst->vm_prev, *pprev = src->vm_prev;
+
+   /*
+* If parent share anon_vma with its vm_prev, keep this sharing in in
+* child.
+*
+* 1. Parent has vm_prev, which implies we have vm_prev.
+* 2. Parent and its vm_prev have the same anon_vma.
+*/
+   if (!dst->anon_vma && src->anon_vma &&
+   pprev && pprev->anon_vma == src->anon_vma)
+   dst->anon_vma = prev->anon_vma;
+


I believe that in the present code "prev" cannot be NULL if !dst->anon_vma &&
src->anon_vma is true.
Still, it would be safer to check this explicitly.

  
  	list_for_each_entry_reverse(pavc, >anon_vma_chain, same_vma) {

struct anon_vma *anon_vma;



Re: [Patch v4 1/2] mm/rmap.c: don't reuse anon_vma if we just want a copy

2019-10-11 Thread Konstantin Khlebnikov

On 11/10/2019 10.22, Wei Yang wrote:

Before commit 7a3ef208e662 ("mm: prevent endless growth of anon_vma
hierarchy"), anon_vma_clone() doesn't change dst->anon_vma. While after
this commit, anon_vma_clone() will try to reuse an exist one on forking.

But this commit go a little bit further for the case not forking.
anon_vma_clone() is called from __vma_split(), __split_vma(), copy_vma()
and anon_vma_fork(). For the first three places, the purpose here is get
a copy of src and we don't expect to touch dst->anon_vma even it is
NULL. While after that commit, it is possible to reuse an anon_vma when
dst->anon_vma is NULL. This is not we intend to have.

This patch stop reuse anon_vma for non-fork cases.

Fix commit 7a3ef208e662 ("mm: prevent endless growth of anon_vma
hierarchy")

Signed-off-by: Wei Yang 


Yes, the reuse heuristic was designed for fork.
But this isn't strictly necessary - any vmas could share an anon_vma.
For example, all vmas in the system could be linked with a single anon_vma.

Acked-by: Konstantin Khlebnikov 



---
v4:
   * check dst->anon_vma in each iteration
v3:
   * use dst->anon_vma and src->anon_vma to get reuse state
 pointed by Konstantin Khlebnikov
---
  mm/rmap.c | 24 +++-
  1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index d9a23bb773bf..c34414567474 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -250,13 +250,19 @@ static inline void unlock_anon_vma_root(struct anon_vma 
*root)
   * Attach the anon_vmas from src to dst.
   * Returns 0 on success, -ENOMEM on failure.
   *
- * If dst->anon_vma is NULL this function tries to find and reuse existing
- * anon_vma which has no vmas and only one child anon_vma. This prevents
- * degradation of anon_vma hierarchy to endless linear chain in case of
- * constantly forking task. On the other hand, an anon_vma with more than one
- * child isn't reused even if there was no alive vma, thus rmap walker has a
- * good chance of avoiding scanning the whole hierarchy when it searches where
- * page is mapped.
+ * anon_vma_clone() is called by __vma_split(), __split_vma(), copy_vma() and
+ * anon_vma_fork(). The first three want an exact copy of src, while the last
+ * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
+ * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call,
+ * we can identify this case by checking (!dst->anon_vma && src->anon_vma).
+ *
+ * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
+ * and reuse existing anon_vma which has no vmas and only one child anon_vma.
+ * This prevents degradation of anon_vma hierarchy to endless linear chain in
+ * case of constantly forking task. On the other hand, an anon_vma with more
+ * than one child isn't reused even if there was no alive vma, thus rmap
+ * walker has a good chance of avoiding scanning the whole hierarchy when it
+ * searches where page is mapped.
   */
  int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
  {
@@ -286,8 +292,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct 
vm_area_struct *src)
 * will always reuse it. Root anon_vma is never reused:
 * it has self-parent reference and at least one child.
 */
-   if (!dst->anon_vma && anon_vma != src->anon_vma &&
-   anon_vma->degree < 2)
+   if (!dst->anon_vma && src->anon_vma &&
+   anon_vma != src->anon_vma && anon_vma->degree < 2)
dst->anon_vma = anon_vma;
}
if (dst->anon_vma)



Re: [Patch v2 1/2] mm/rmap.c: don't reuse anon_vma if we just want a copy

2019-10-10 Thread Konstantin Khlebnikov

On 10/10/2019 16.58, Wei Yang wrote:

Before commit 7a3ef208e662 ("mm: prevent endless growth of anon_vma
hierarchy"), anon_vma_clone() doesn't change dst->anon_vma. While after
this commit, anon_vma_clone() will try to reuse an exist one on forking.

But this commit go a little bit further for the case not forking.
anon_vma_clone() is called from __vma_split(), __split_vma(), copy_vma()
and anon_vma_fork(). For the first three places, the purpose here is get
a copy of src and we don't expect to touch dst->anon_vma even it is
NULL. While after that commit, it is possible to reuse an anon_vma when
dst->anon_vma is NULL. This is not we intend to have.


In all these cases dst->anon_vma is a copy of src->anon_vma, except in
anon_vma_fork(), where dst->anon_vma is explicitly set to NULL before the call.

So reuse == true iff (!dst->anon_vma && src->anon_vma)



This patch stop reuse anon_vma for non-fork cases.

Fix commit 7a3ef208e662 ("mm: prevent endless growth of anon_vma
hierarchy")

Signed-off-by: Wei Yang 
---
  include/linux/rmap.h | 3 ++-
  mm/mmap.c| 6 +++---
  mm/rmap.c| 7 ---
  3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 988d176472df..963e6ab09b9b 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -142,7 +142,8 @@ static inline void anon_vma_unlock_read(struct anon_vma 
*anon_vma)
  void anon_vma_init(void); /* create anon_vma_cachep */
  int  __anon_vma_prepare(struct vm_area_struct *);
  void unlink_anon_vmas(struct vm_area_struct *);
-int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
+int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
+  bool reuse);
  int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);
  
  static inline int anon_vma_prepare(struct vm_area_struct *vma)

diff --git a/mm/mmap.c b/mm/mmap.c
index 93f221785956..21e94f8ac4c7 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -791,7 +791,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
 			int error;
 
 			importer->anon_vma = exporter->anon_vma;
-			error = anon_vma_clone(importer, exporter);
+			error = anon_vma_clone(importer, exporter, false);
 			if (error)
 				return error;
 		}
@@ -2666,7 +2666,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (err)
 		goto out_free_vma;
 
-	err = anon_vma_clone(new, vma);
+	err = anon_vma_clone(new, vma, false);
 	if (err)
 		goto out_free_mpol;
 
@@ -3247,7 +3247,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 	new_vma->vm_pgoff = pgoff;
 	if (vma_dup_policy(vma, new_vma))
 		goto out_free_vma;
-	if (anon_vma_clone(new_vma, vma))
+	if (anon_vma_clone(new_vma, vma, false))
 		goto out_free_mempol;
 	if (new_vma->vm_file)
 		get_file(new_vma->vm_file);
diff --git a/mm/rmap.c b/mm/rmap.c
index d9a23bb773bf..f729e4013613 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -258,7 +258,8 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
   * good chance of avoiding scanning the whole hierarchy when it searches where
   * page is mapped.
   */
-int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
+int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src,
+  bool reuse)
  {
struct anon_vma_chain *avc, *pavc;
struct anon_vma *root = NULL;
@@ -286,7 +287,7 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 * will always reuse it. Root anon_vma is never reused:
 * it has self-parent reference and at least one child.
 */
-   if (!dst->anon_vma && anon_vma != src->anon_vma &&
+   if (reuse && !dst->anon_vma && anon_vma != src->anon_vma &&
anon_vma->degree < 2)
dst->anon_vma = anon_vma;
}
@@ -329,7 +330,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 * First, attach the new VMA to the parent VMA's anon_vmas,
 * so rmap can find non-COWed pages in child processes.
 */
-   error = anon_vma_clone(vma, pvma);
+   error = anon_vma_clone(vma, pvma, true);
if (error)
return error;
  



Re: "reuse mergeable anon_vma as parent when fork" causes a crash on s390

2019-10-10 Thread Konstantin Khlebnikov

On 10/10/2019 06.15, Wei Yang wrote:

On Thu, Oct 10, 2019 at 10:36:01AM +0800, Wei Yang wrote:

Hi, Qian, Shakeel

Thanks for testing.

Sounds like I missed a case to handle. anon_vma_clone() would now be called in
vma_adjust(), which is a different case from the one it was introduced for.



Well, I have to correct my statement. The reason is that we may have done
something more in anon_vma_clone().

Here is a quick fix, while I need to go through all the cases carefully.


Oops, I've overlooked this case too.

You have to check src->anon_vma,
otherwise in __split_vma() or copy_vma() dst could pick a completely random anon_vma.

Also checking prev will not hurt, just to be sure.

So, something like this should work:

	if (!dst->anon_vma && src->anon_vma &&
	    prev && pprev && pprev->anon_vma == src->anon_vma)
		dst->anon_vma = prev->anon_vma;



diff --git a/mm/rmap.c b/mm/rmap.c
index 12f6c3d7fd9d..2844f442208d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -271,7 +271,7 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 	 * 1. Parent has vm_prev, which implies we have vm_prev.
 	 * 2. Parent and its vm_prev have the same anon_vma.
 	 */
-	if (pprev && pprev->anon_vma == src->anon_vma)
+	if (!dst->anon_vma && pprev && pprev->anon_vma == src->anon_vma)
 		dst->anon_vma = prev->anon_vma;
 
 	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {



BTW, do you have a specific test case, so that I could verify my change? The
kernel build test doesn't trigger this.

Thanks a lot :-)

On Wed, Oct 09, 2019 at 03:21:11PM -0700, Shakeel Butt wrote:
--
Wei Yang
Help you, Help me




Re: [PATCH v2] mm/swap: piggyback lru_add_drain_all() calls

2019-10-05 Thread Konstantin Khlebnikov
On Sat, Oct 5, 2019 at 10:35 PM Andrew Morton  wrote:
>
> On Fri, 04 Oct 2019 16:09:22 +0300 Konstantin Khlebnikov 
>  wrote:
>
> > This is very slow operation. There is no reason to do it again if somebody
> > else already drained all per-cpu vectors while we waited for lock.
> >
> > Piggyback on drain started and finished while we waited for lock:
> > all pages pended at the time of our enter were drained from vectors.
> >
> > Callers like POSIX_FADV_DONTNEED retry their operations once after
> > draining per-cpu vectors when pages have unexpected references.
> >
> > ...
> >
> > --- a/mm/swap.c
> > +++ b/mm/swap.c
> > @@ -708,9 +708,10 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
> >   */
> >  void lru_add_drain_all(void)
> >  {
> > +	static seqcount_t seqcount = SEQCNT_ZERO(seqcount);
> >  	static DEFINE_MUTEX(lock);
> >  	static struct cpumask has_work;
> > -	int cpu;
> > +	int cpu, seq;
> >
> >  	/*
> >  	 * Make sure nobody triggers this path before mm_percpu_wq is fully
> > @@ -719,7 +720,19 @@ void lru_add_drain_all(void)
> >  	if (WARN_ON(!mm_percpu_wq))
> >  		return;
> >
> > +	seq = raw_read_seqcount_latch(&seqcount);
> > +
> >  	mutex_lock(&lock);
> > +
> > +	/*
> > +	 * Piggyback on drain started and finished while we waited for lock:
> > +	 * all pages pended at the time of our enter were drained from vectors.
> > +	 */
> > +	if (__read_seqcount_retry(&seqcount, seq))
> > +		goto done;
> > +
> > +	raw_write_seqcount_latch(&seqcount);
> > +
> >  	cpumask_clear(&has_work);
> >
> >  	for_each_online_cpu(cpu) {
> > @@ -740,6 +753,7 @@ void lru_add_drain_all(void)
> >  	for_each_cpu(cpu, &has_work)
> >  		flush_work(&per_cpu(lru_add_drain_work, cpu));
> >
> > +done:
> >  	mutex_unlock(&lock);
> >  }
>
> I'm not sure this works as intended.
>
> Suppose CPU #30 is presently executing the for_each_online_cpu() loop
> and has reached CPU #15's per-cpu data.
>
> Now CPU #2 comes along, adds some pages to its per-cpu vectors then
> calls lru_add_drain_all().  AFAICT the code will assume that CPU #30
> has flushed out all of the pages which CPU #2 just added, but that
> isn't the case.
>
> Moving the raw_write_seqcount_latch() to the point where all processing
> has completed might fix?
>
>

No, raw_write_seqcount_latch() should be exactly before draining.

Here the seqcount works as a generation counter for pages that could be in the
vectors, and all steps are serialized by the mutex: only after taking the lock
can we be sure that all previous generations are gone.

Here CPU #2 will see the same generation at entry and after taking the lock,
so it will drain its own pages.
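
For illustration only, the same piggyback idea can be sketched in plain
userspace C, with an atomic counter standing in for the kernel seqcount and a
pthread mutex for the drain lock (the names and scaffolding are invented for
the example; this is not the kernel code):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t drain_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_ulong drain_gen;		/* bumped right before every full drain */

static void drain_all_cpus(void)
{
	/* stands in for queueing and flushing the per-cpu work items */
	puts("draining all per-cpu vectors");
}

static void drain_all(void)
{
	/* sample the generation at entry, before waiting for the lock */
	unsigned long seen = atomic_load(&drain_gen);

	pthread_mutex_lock(&drain_lock);

	/*
	 * If the generation moved while we waited, a drain started after our
	 * sample and, because drains are serialized by the mutex, it has
	 * already finished: everything pending at our entry is gone, so our
	 * own pass can be skipped.
	 */
	if (atomic_load(&drain_gen) != seen)
		goto done;

	atomic_fetch_add(&drain_gen, 1);	/* announce the drain before doing it */
	drain_all_cpus();
done:
	pthread_mutex_unlock(&drain_lock);
}

int main(void)
{
	drain_all();	/* generation matches its sample, so it drains */
	drain_all();	/* same here; skipping only happens under contention */
	return 0;
}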


Re: [PATCH] mm/rmap.c: reuse mergeable anon_vma as parent when fork

2019-10-04 Thread Konstantin Khlebnikov

On 04/10/2019 19.06, Wei Yang wrote:

In __anon_vma_prepare() we try to find an existing anon_vma and reuse it if
possible. On fork, the logic is different.

Since commit 5beb49305251 ("mm: change anon_vma linking to fix
multi-process server scalability issue"), anon_vma_clone() allocates a new
anon_vma for the child process. But it does so for every vma, even when the
vma is mergeable in the parent and shares the same anon_vma with its sibling.
That may help the scalability issue, but it is not necessary, especially now
that an interval tree is used.

Commit 7a3ef208e662 ("mm: prevent endless growth of anon_vma hierarchy")
reuses some anon_vmas by counting child anon_vmas and attached vmas. For
mergeable anon_vmas we can simply reuse the sibling's anon_vma and skip that
logic entirely.

After this change, a kernel build test shows about 20% fewer anon_vma
allocations.



Makes sense. This might have a much bigger effect for scenarios where a task
unmaps holes in a huge vma as red zones between allocations and then forks.

Acked-by: Konstantin Khlebnikov 


Signed-off-by: Wei Yang 
---
  mm/rmap.c | 11 +++
  1 file changed, 11 insertions(+)

diff --git a/mm/rmap.c b/mm/rmap.c
index d9a23bb773bf..12f6c3d7fd9d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -262,6 +262,17 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 {
 	struct anon_vma_chain *avc, *pavc;
 	struct anon_vma *root = NULL;
+	struct vm_area_struct *prev = dst->vm_prev, *pprev = src->vm_prev;
+
+	/*
+	 * If the parent shares an anon_vma with its vm_prev, keep this
+	 * sharing in the child.
+	 *
+	 * 1. Parent has vm_prev, which implies we have vm_prev.
+	 * 2. Parent and its vm_prev have the same anon_vma.
+	 */
+	if (pprev && pprev->anon_vma == src->anon_vma)
+		dst->anon_vma = prev->anon_vma;
 
 	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
 		struct anon_vma *anon_vma;



Re: [PATCH v2] mm/swap: piggyback lru_add_drain_all() calls

2019-10-04 Thread Konstantin Khlebnikov

On 04/10/2019 16.39, Michal Hocko wrote:

On Fri 04-10-19 16:32:39, Konstantin Khlebnikov wrote:

On 04/10/2019 16.12, Michal Hocko wrote:

On Fri 04-10-19 16:09:22, Konstantin Khlebnikov wrote:

This is very slow operation. There is no reason to do it again if somebody
else already drained all per-cpu vectors while we waited for lock.

Piggyback on drain started and finished while we waited for lock:
all pages pended at the time of our enter were drained from vectors.

Callers like POSIX_FADV_DONTNEED retry their operations once after
draining per-cpu vectors when pages have unexpected references.


This describes why we need to wait for preexisted pages on the pvecs but
the changelog doesn't say anything about improvements this leads to.
In other words what kind of workloads benefit from it?


Right now POSIX_FADV_DONTNEED is the top user, because it has to freeze page
references when removing them from the cache. invalidate_bdev() calls it for
the same reason. Both are triggered from userspace, so it's easy to generate a
storm.

mlock/mlockall no longer call lru_add_drain_all() - I've seen serious
slowdowns from it on older kernels.

There are some less obvious paths in memory migration/CMA/offlining
which shouldn't be called frequently.


Can you back those claims by any numbers?



Well, the worst case requires a non-trivial workload, because
lru_add_drain_all() skips cpus whose vectors are empty. Something must
constantly generate a flow of pages on each cpu. The cpus must also be busy
enough to make scheduling of the per-cpu work slower. And the machine must be
big enough (64+ cpus in our case).

In our case it was a massive series of mlock calls in a map-reduce job while
other tasks wrote logs (and so generated a flow of new pages into the per-cpu
vectors). The mlock calls were serialized by the mutex and accumulated latency
of up to 10 seconds and more.

The kernel has not called lru_add_drain_all() on mlock paths since 4.15, but
the same scenario can be triggered by fadvise(POSIX_FADV_DONTNEED) or any
other remaining user.
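
A hypothetical reproducer for such a storm could be as simple as hammering
fadvise from several processes at once (the file path and iteration count
below are arbitrary placeholders; the file needs cached pages for the calls to
reach the drain):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "testfile";	/* placeholder path */
	int fd = open(path, O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	for (int i = 0; i < 100000; i++) {
		/* len == 0 means "to the end of the file"; when cached pages are
		 * still sitting in per-cpu vectors this call may end up in
		 * lru_add_drain_all() */
		int err = posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);

		if (err) {
			fprintf(stderr, "posix_fadvise: %s\n", strerror(err));
			break;
		}
	}

	close(fd);
	return 0;
}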


Re: [PATCH v2] mm/swap: piggyback lru_add_drain_all() calls

2019-10-04 Thread Konstantin Khlebnikov

On 04/10/2019 16.12, Michal Hocko wrote:

On Fri 04-10-19 16:09:22, Konstantin Khlebnikov wrote:

This is very slow operation. There is no reason to do it again if somebody
else already drained all per-cpu vectors while we waited for lock.

Piggyback on drain started and finished while we waited for lock:
all pages pended at the time of our enter were drained from vectors.

Callers like POSIX_FADV_DONTNEED retry their operations once after
draining per-cpu vectors when pages have unexpected references.


This describes why we need to wait for preexisted pages on the pvecs but
the changelog doesn't say anything about improvements this leads to.
In other words what kind of workloads benefit from it?


Right now POSIX_FADV_DONTNEED is the top user, because it has to freeze page
references when removing them from the cache. invalidate_bdev() calls it for
the same reason. Both are triggered from userspace, so it's easy to generate a
storm.

mlock/mlockall no longer call lru_add_drain_all() - I've seen serious
slowdowns from it on older kernels.

There are some less obvious paths in memory migration/CMA/offlining
which shouldn't be called frequently.




Signed-off-by: Konstantin Khlebnikov 
---
  mm/swap.c |   16 +++++++++++++++-
  1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/mm/swap.c b/mm/swap.c
index 38c3fa4308e2..5ba948a9d82a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -708,9 +708,10 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
  */
 void lru_add_drain_all(void)
 {
+	static seqcount_t seqcount = SEQCNT_ZERO(seqcount);
 	static DEFINE_MUTEX(lock);
 	static struct cpumask has_work;
-	int cpu;
+	int cpu, seq;
 
 	/*
 	 * Make sure nobody triggers this path before mm_percpu_wq is fully
@@ -719,7 +720,19 @@ void lru_add_drain_all(void)
 	if (WARN_ON(!mm_percpu_wq))
 		return;
 
+	seq = raw_read_seqcount_latch(&seqcount);
+
 	mutex_lock(&lock);
+
+	/*
+	 * Piggyback on drain started and finished while we waited for lock:
+	 * all pages pended at the time of our enter were drained from vectors.
+	 */
+	if (__read_seqcount_retry(&seqcount, seq))
+		goto done;
+
+	raw_write_seqcount_latch(&seqcount);
+
 	cpumask_clear(&has_work);
 
 	for_each_online_cpu(cpu) {
@@ -740,6 +753,7 @@ void lru_add_drain_all(void)
 	for_each_cpu(cpu, &has_work)
 		flush_work(&per_cpu(lru_add_drain_work, cpu));
 
+done:
 	mutex_unlock(&lock);
 }
 #else





[PATCH v2] mm/swap: piggyback lru_add_drain_all() calls

2019-10-04 Thread Konstantin Khlebnikov
This is a very slow operation. There is no reason to do it again if somebody
else already drained all per-cpu vectors while we waited for the lock.

Piggyback on a drain that started and finished while we waited for the lock:
all pages pending at the time of our entry were drained from the vectors.

Callers like POSIX_FADV_DONTNEED retry their operations once after
draining per-cpu vectors when pages have unexpected references.

Signed-off-by: Konstantin Khlebnikov 
---
 mm/swap.c |   16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/mm/swap.c b/mm/swap.c
index 38c3fa4308e2..5ba948a9d82a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -708,9 +708,10 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
  */
 void lru_add_drain_all(void)
 {
+	static seqcount_t seqcount = SEQCNT_ZERO(seqcount);
 	static DEFINE_MUTEX(lock);
 	static struct cpumask has_work;
-	int cpu;
+	int cpu, seq;
 
 	/*
 	 * Make sure nobody triggers this path before mm_percpu_wq is fully
@@ -719,7 +720,19 @@ void lru_add_drain_all(void)
 	if (WARN_ON(!mm_percpu_wq))
 		return;
 
+	seq = raw_read_seqcount_latch(&seqcount);
+
 	mutex_lock(&lock);
+
+	/*
+	 * Piggyback on drain started and finished while we waited for lock:
+	 * all pages pended at the time of our enter were drained from vectors.
+	 */
+	if (__read_seqcount_retry(&seqcount, seq))
+		goto done;
+
+	raw_write_seqcount_latch(&seqcount);
+
 	cpumask_clear(&has_work);
 
 	for_each_online_cpu(cpu) {
@@ -740,6 +753,7 @@ void lru_add_drain_all(void)
 	for_each_cpu(cpu, &has_work)
 		flush_work(&per_cpu(lru_add_drain_work, cpu));
 
+done:
 	mutex_unlock(&lock);
 }
 #else



Re: [PATCH] mm/swap: piggyback lru_add_drain_all() calls

2019-10-04 Thread Konstantin Khlebnikov




On 04/10/2019 15.27, Michal Hocko wrote:

On Fri 04-10-19 05:10:17, Matthew Wilcox wrote:

On Fri, Oct 04, 2019 at 01:11:06PM +0300, Konstantin Khlebnikov wrote:

This is very slow operation. There is no reason to do it again if somebody
else already drained all per-cpu vectors after we waited for lock.
+	seq = raw_read_seqcount_latch(&seqcount);
+
 	mutex_lock(&lock);
+
+	/* Piggyback on drain done by somebody else. */
+	if (__read_seqcount_retry(&seqcount, seq))
+		goto done;
+
+	raw_write_seqcount_latch(&seqcount);
+


Do we really need the seqcount to do this?  Wouldn't a mutex_trylock()
have the same effect?


Yeah, this makes sense. From a correctness point of view it should be ok,
because no caller can expect that the per-cpu pvecs are empty on return.
This might have some runtime effects, in that some paths might retry more -
e.g. the offlining path drains pcp pvecs before migrating the range away; if
there are pages still waiting for a worker to drain them then the migration
would fail and we would retry. But this is not a correctness issue.



A caller might expect that pages it added before the call are drained.
Returning early when mutex_trylock() fails would not guarantee that.

For example, POSIX_FADV_DONTNEED relies on this.


Re: [PATCH] mm/swap: piggyback lru_add_drain_all() calls

2019-10-04 Thread Konstantin Khlebnikov




On 04/10/2019 15.10, Matthew Wilcox wrote:

On Fri, Oct 04, 2019 at 01:11:06PM +0300, Konstantin Khlebnikov wrote:

This is very slow operation. There is no reason to do it again if somebody
else already drained all per-cpu vectors after we waited for lock.
+	seq = raw_read_seqcount_latch(&seqcount);
+
 	mutex_lock(&lock);
+
+	/* Piggyback on drain done by somebody else. */
+	if (__read_seqcount_retry(&seqcount, seq))
+		goto done;
+
+	raw_write_seqcount_latch(&seqcount);
+


Do we really need the seqcount to do this?  Wouldn't a mutex_trylock()
have the same effect?



No, these are completely different semantics.

The operation can be safely skipped only if somebody else started and finished
a drain after the current task called this function. A failed mutex_trylock()
only tells us that a drain is in progress, and that drain may already have
passed our cpu before our pages were queued.

[PATCH] mm/swap: piggyback lru_add_drain_all() calls

2019-10-04 Thread Konstantin Khlebnikov
This is a very slow operation. There is no reason to do it again if somebody
else already drained all per-cpu vectors after we waited for the lock.

Signed-off-by: Konstantin Khlebnikov 
---
 mm/swap.c |   13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/mm/swap.c b/mm/swap.c
index 38c3fa4308e2..6203918e1316 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -708,9 +708,10 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
  */
 void lru_add_drain_all(void)
 {
+	static seqcount_t seqcount = SEQCNT_ZERO(seqcount);
 	static DEFINE_MUTEX(lock);
 	static struct cpumask has_work;
-	int cpu;
+	int cpu, seq;
 
 	/*
 	 * Make sure nobody triggers this path before mm_percpu_wq is fully
@@ -719,7 +720,16 @@ void lru_add_drain_all(void)
 	if (WARN_ON(!mm_percpu_wq))
 		return;
 
+	seq = raw_read_seqcount_latch(&seqcount);
+
 	mutex_lock(&lock);
+
+	/* Piggyback on drain done by somebody else. */
+	if (__read_seqcount_retry(&seqcount, seq))
+		goto done;
+
+	raw_write_seqcount_latch(&seqcount);
+
 	cpumask_clear(&has_work);
 
 	for_each_online_cpu(cpu) {
@@ -740,6 +750,7 @@ void lru_add_drain_all(void)
 	for_each_cpu(cpu, &has_work)
 		flush_work(&per_cpu(lru_add_drain_work, cpu));
 
+done:
 	mutex_unlock(&lock);
 }
 #else


