[PATCH] mm: migrate: initialize err in do_migrate_pages

2021-01-05 Thread Jan Stancek
After commit 236c32eb1096 ("mm: migrate: clean up migrate_prep{_local}"),
do_migrate_pages can return uninitialized variable 'err' (which is
propagated to user-space as error) when 'from' and 'to' nodesets
are identical. This can be reproduced with LTP migrate_pages01,
which calls migrate_pages() with same set for both old/new_nodes.

Add 'err' initialization back.

Fixes: 236c32eb1096 ("mm: migrate: clean up migrate_prep{_local}")
Cc: Zi Yan 
Cc: Yang Shi 
Cc: Jan Kara 
Cc: Matthew Wilcox 
Cc: Mel Gorman 
Cc: Michal Hocko 
Cc: Song Liu 
Cc: Andrew Morton 
Signed-off-by: Jan Stancek 
---
 mm/mempolicy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8cf96bd21341..2c3a86502053 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -,7 +,7 @@ int do_migrate_pages(struct mm_struct *mm, const 
nodemask_t *from,
 const nodemask_t *to, int flags)
 {
int busy = 0;
-   int err;
+   int err = 0;
nodemask_t tmp;
 
migrate_prep();
-- 
2.18.1



Re: [LTP] [PATCH] selftests: vdso: hash entry size on alpha, s390x is 8 bytes

2020-08-07 Thread Jan Stancek


- Original Message -
> 
> - Original Message -
> > Hi!
> > As much as it's worth the changes looks good to me.
> > 
> > @Jan: I guess that we can as well fix this in LTP first then we can try
> >   to get the kernel version fixed...
> 
> Fine by me, I'll give it couple more days then push it.

Pushed.



Re: [LTP] [PATCH] selftests: vdso: hash entry size on alpha, s390x is 8 bytes

2020-08-05 Thread Jan Stancek


- Original Message -
> Hi!
> As much as it's worth the changes looks good to me.
> 
> @Jan: I guess that we can as well fix this in LTP first then we can try
>   to get the kernel version fixed...

Fine by me, I'll give it couple more days then push it.

I did repost it with original author CC-ed:
  
https://lore.kernel.org/lkml/93a07b1808e61babef3a20542cbeb4565d3c8410.1596458924.git.jstan...@redhat.com/
but haven't heard back yet.



[PATCH RESEND] selftests: vdso: hash entry size on alpha,s390x is 8 bytes

2020-08-03 Thread Jan Stancek
parse_vdso.c is crashing on 5.8-rc5 s390x, because it wrongly reads
nbucket as 0:
  Program received signal SIGFPE, Arithmetic exception.
  0x01000f3e in vdso_sym (version=0x1001280 "LINUX_2.6", name=0x100128a 
"__vdso_getcpu") at parse_vdso.c:207
  207 ELF(Word) chain = vdso_info.bucket[elf_hash(name) % 
vdso_info.nbucket];
  (gdb) p vdso_info.nbucket
  $1 = 0

Per readelf source:
  
https://sourceware.org/git/?p=binutils-gdb.git;a=blob;f=binutils/readelf.c;h=2406304fe35a832ac53aa7b1a367f3f7afed4264;hb=HEAD#l10027
and objdump output hash entries are double size on 64bit s390 and alpha:
  0120 <.hash>:
   120:   00 00 00 00 .long   0x
   124:   00 00 00 03 .long   0x0003
   128:   00 00 00 00 .long   0x
   12c:   00 00 00 07 .long   0x0007
   130:   00 00 00 00 .long   0x
   134:   00 00 00 01 .long   0x0001
   138:   00 00 00 00 .long   0x
   13c:   00 00 00 05 .long   0x0005
   140:   00 00 00 00 .long   0x
   144:   00 00 00 06 .long   0x0006
  ...
   16c:   00 00 00 02 .long   0x0002
   170:   00 00 00 00 .long   0x
   174:   00 00 00 03 .long   0x0003
   178:   00 00 00 00 .long   0x
   17c:   00 00 00 04 .long   0x0004

Do similar check in parse_vdso.c and treat hash entries as double word.

Signed-off-by: Jan Stancek 
---
 tools/testing/selftests/vDSO/parse_vdso.c | 48 +++
 1 file changed, 40 insertions(+), 8 deletions(-)

Resend with original author CC-ed.

diff --git a/tools/testing/selftests/vDSO/parse_vdso.c 
b/tools/testing/selftests/vDSO/parse_vdso.c
index 413f75620a35..e23efcbd1c88 100644
--- a/tools/testing/selftests/vDSO/parse_vdso.c
+++ b/tools/testing/selftests/vDSO/parse_vdso.c
@@ -47,8 +47,9 @@ static struct vdso_info
/* Symbol table */
ELF(Sym) *symtab;
const char *symstrings;
-   ELF(Word) *bucket, *chain;
+   void *bucket, *chain;
ELF(Word) nbucket, nchain;
+   bool hash_ent_is_dword;
 
/* Version table */
ELF(Versym) *versym;
@@ -69,6 +70,28 @@ static unsigned long elf_hash(const unsigned char *name)
return h;
 }
 
+/* return value of hash table entry */
+ELF(Word) get_hash_val(void *ptr, ELF(Word) idx)
+{
+   if (vdso_info.hash_ent_is_dword) {
+   ELF(Xword) *table = ptr;
+   /* for vdso assume all values fit in Elf Word */
+   return (ELF(Word)) table[idx];
+   }
+
+   ELF(Word) *table = ptr;
+   return table[idx];
+}
+
+/* return pointer to hash table entry */
+void *get_hash_ptr(void *ptr, ELF(Word) idx)
+{
+   if (vdso_info.hash_ent_is_dword)
+   return &((ELF(Xword) *) ptr)[idx];
+
+   return &((ELF(Word) *) ptr)[idx];
+}
+
 void vdso_init_from_sysinfo_ehdr(uintptr_t base)
 {
size_t i;
@@ -84,6 +107,14 @@ void vdso_init_from_sysinfo_ehdr(uintptr_t base)
return;  /* Wrong ELF class -- check ELF_BITS */
}
 
+   /* 64bit s390 and alpha have hash entry size of 8 bytes */
+   if ((hdr->e_machine == EM_ALPHA
+   || hdr->e_machine == EM_S390)
+   && hdr->e_ident[EI_CLASS] == ELFCLASS64)
+   vdso_info.hash_ent_is_dword = true;
+   else
+   vdso_info.hash_ent_is_dword = false;
+
ELF(Phdr) *pt = (ELF(Phdr)*)(vdso_info.load_addr + hdr->e_phoff);
ELF(Dyn) *dyn = 0;
 
@@ -149,11 +180,11 @@ void vdso_init_from_sysinfo_ehdr(uintptr_t base)
if (!vdso_info.verdef)
vdso_info.versym = 0;
 
-   /* Parse the hash table header. */
-   vdso_info.nbucket = hash[0];
-   vdso_info.nchain = hash[1];
-   vdso_info.bucket = &hash[2];
-   vdso_info.chain = &hash[vdso_info.nbucket + 2];
+
+   vdso_info.nbucket = get_hash_val(hash, 0);
+   vdso_info.nchain = get_hash_val(hash, 1);
+   vdso_info.bucket = get_hash_ptr(hash, 2);
+   vdso_info.chain = get_hash_ptr(hash, vdso_info.nbucket + 2);
 
/* That's all we need. */
vdso_info.valid = true;
@@ -204,9 +235,10 @@ void *vdso_sym(const char *version, const char *name)
return 0;
 
ver_hash = elf_hash(version);
-   ELF(Word) chain = vdso_info.bucket[elf_hash(name) % vdso_info.nbucket];
+   ELF(Word) chain = get_hash_val(vdso_info.bucket,
+   elf_hash(name) % vdso_info.nbucket);
 
-   for (; chain != STN_UNDEF; chain = vdso_info.chain[chain]) {
+   for (; chain != STN_UNDEF; chain = get_hash_val(vdso_info.chain, 
chain)) {
ELF(Sym) *sym = &vdso_info.symtab[chain];
 
/* Check for a defined global or weak function w/ right name. */
-- 
2.18.1



Re: [PATCH] selftests: vdso: hash entry size on alpha,s390x is 8 bytes

2020-08-03 Thread Jan Stancek


- Original Message -
> parse_vdso.c is crashing on 5.8-rc5 s390x, because it wrongly reads
> nbucket as 0:
>   Program received signal SIGFPE, Arithmetic exception.
>   0x01000f3e in vdso_sym (version=0x1001280 "LINUX_2.6",
>   name=0x100128a "__vdso_getcpu") at parse_vdso.c:207
>   207 ELF(Word) chain = vdso_info.bucket[elf_hash(name) %
>   vdso_info.nbucket];
>   (gdb) p vdso_info.nbucket
>   $1 = 0
> 
> Per readelf source:
>   
> https://sourceware.org/git/?p=binutils-gdb.git;a=blob;f=binutils/readelf.c;h=2406304fe35a832ac53aa7b1a367f3f7afed4264;hb=HEAD#l10027
> and objdump output hash entries are double size on 64bit s390 and alpha:
>   0120 <.hash>:
>120:   00 00 00 00 .long   0x
>124:   00 00 00 03 .long   0x0003
>128:   00 00 00 00 .long   0x
>12c:   00 00 00 07 .long   0x0007
>130:   00 00 00 00 .long   0x
>134:   00 00 00 01 .long   0x0001
>138:   00 00 00 00 .long   0x
>13c:   00 00 00 05 .long   0x0005
>140:   00 00 00 00 .long   0x
>144:   00 00 00 06 .long   0x0006
> ...
>16c:   00 00 00 02 .long   0x0002
>170:   00 00 00 00 .long   0x
>174:   00 00 00 03 .long   0x0003
>178:   00 00 00 00 .long   0x
>17c:   00 00 00 04 .long   0x0004
> 
> Do similar check in parse_vdso.c and treat hash entries as double word.

Ping, any thoughts about the issue or patch?



[PATCH] selftests: vdso: hash entry size on alpha,s390x is 8 bytes

2020-07-15 Thread Jan Stancek
parse_vdso.c is crashing on 5.8-rc5 s390x, because it wrongly reads
nbucket as 0:
  Program received signal SIGFPE, Arithmetic exception.
  0x01000f3e in vdso_sym (version=0x1001280 "LINUX_2.6", name=0x100128a 
"__vdso_getcpu") at parse_vdso.c:207
  207 ELF(Word) chain = vdso_info.bucket[elf_hash(name) % 
vdso_info.nbucket];
  (gdb) p vdso_info.nbucket
  $1 = 0

Per readelf source:
  
https://sourceware.org/git/?p=binutils-gdb.git;a=blob;f=binutils/readelf.c;h=2406304fe35a832ac53aa7b1a367f3f7afed4264;hb=HEAD#l10027
and objdump output hash entries are double size on 64bit s390 and alpha:
  0120 <.hash>:
   120:   00 00 00 00 .long   0x
   124:   00 00 00 03 .long   0x0003
   128:   00 00 00 00 .long   0x
   12c:   00 00 00 07 .long   0x0007
   130:   00 00 00 00 .long   0x
   134:   00 00 00 01 .long   0x0001
   138:   00 00 00 00 .long   0x
   13c:   00 00 00 05 .long   0x0005
   140:   00 00 00 00 .long   0x
   144:   00 00 00 06 .long   0x0006
  ...
   16c:   00 00 00 02 .long   0x0002
   170:   00 00 00 00 .long   0x
   174:   00 00 00 03 .long   0x0003
   178:   00 00 00 00 .long   0x
   17c:   00 00 00 04 .long   0x0004

Do similar check in parse_vdso.c and treat hash entries as double word.

Signed-off-by: Jan Stancek 
---
 tools/testing/selftests/vDSO/parse_vdso.c | 48 +++
 1 file changed, 40 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/vDSO/parse_vdso.c 
b/tools/testing/selftests/vDSO/parse_vdso.c
index 413f75620a35..e23efcbd1c88 100644
--- a/tools/testing/selftests/vDSO/parse_vdso.c
+++ b/tools/testing/selftests/vDSO/parse_vdso.c
@@ -47,8 +47,9 @@ static struct vdso_info
/* Symbol table */
ELF(Sym) *symtab;
const char *symstrings;
-   ELF(Word) *bucket, *chain;
+   void *bucket, *chain;
ELF(Word) nbucket, nchain;
+   bool hash_ent_is_dword;
 
/* Version table */
ELF(Versym) *versym;
@@ -69,6 +70,28 @@ static unsigned long elf_hash(const unsigned char *name)
return h;
 }
 
+/* return value of hash table entry */
+ELF(Word) get_hash_val(void *ptr, ELF(Word) idx)
+{
+   if (vdso_info.hash_ent_is_dword) {
+   ELF(Xword) *table = ptr;
+   /* for vdso assume all values fit in Elf Word */
+   return (ELF(Word)) table[idx];
+   }
+
+   ELF(Word) *table = ptr;
+   return table[idx];
+}
+
+/* return pointer to hash table entry */
+void *get_hash_ptr(void *ptr, ELF(Word) idx)
+{
+   if (vdso_info.hash_ent_is_dword)
+   return &((ELF(Xword) *) ptr)[idx];
+
+   return &((ELF(Word) *) ptr)[idx];
+}
+
 void vdso_init_from_sysinfo_ehdr(uintptr_t base)
 {
size_t i;
@@ -84,6 +107,14 @@ void vdso_init_from_sysinfo_ehdr(uintptr_t base)
return;  /* Wrong ELF class -- check ELF_BITS */
}
 
+   /* 64bit s390 and alpha have hash entry size of 8 bytes */
+   if ((hdr->e_machine == EM_ALPHA
+   || hdr->e_machine == EM_S390)
+   && hdr->e_ident[EI_CLASS] == ELFCLASS64)
+   vdso_info.hash_ent_is_dword = true;
+   else
+   vdso_info.hash_ent_is_dword = false;
+
ELF(Phdr) *pt = (ELF(Phdr)*)(vdso_info.load_addr + hdr->e_phoff);
ELF(Dyn) *dyn = 0;
 
@@ -149,11 +180,11 @@ void vdso_init_from_sysinfo_ehdr(uintptr_t base)
if (!vdso_info.verdef)
vdso_info.versym = 0;
 
-   /* Parse the hash table header. */
-   vdso_info.nbucket = hash[0];
-   vdso_info.nchain = hash[1];
-   vdso_info.bucket = &hash[2];
-   vdso_info.chain = &hash[vdso_info.nbucket + 2];
+
+   vdso_info.nbucket = get_hash_val(hash, 0);
+   vdso_info.nchain = get_hash_val(hash, 1);
+   vdso_info.bucket = get_hash_ptr(hash, 2);
+   vdso_info.chain = get_hash_ptr(hash, vdso_info.nbucket + 2);
 
/* That's all we need. */
vdso_info.valid = true;
@@ -204,9 +235,10 @@ void *vdso_sym(const char *version, const char *name)
return 0;
 
ver_hash = elf_hash(version);
-   ELF(Word) chain = vdso_info.bucket[elf_hash(name) % vdso_info.nbucket];
+   ELF(Word) chain = get_hash_val(vdso_info.bucket,
+   elf_hash(name) % vdso_info.nbucket);
 
-   for (; chain != STN_UNDEF; chain = vdso_info.chain[chain]) {
+   for (; chain != STN_UNDEF; chain = get_hash_val(vdso_info.chain, 
chain)) {
ELF(Sym) *sym = &vdso_info.symtab[chain];
 
/* Check for a defined global or weak function w/ right name. */
-- 
2.18.1



Re: [LTP] 303cc571d1: ltp.setns01.fail

2020-06-16 Thread Jan Stancek



- Original Message -
> I'll send a pr for this to Linus this week (or next since I'm on
> vacation this week) and get this fixed. Thanks for spotting this. What's
> the Reported-by: line format that LTP uses?

I'm not sure we ever used one, it's usually various CI systems that
reference LTP. This thread has been started by kernel test robot:
  Reported-by: kernel test robot 



Re: [LTP] 303cc571d1: ltp.setns01.fail

2020-06-15 Thread Jan Stancek



- Original Message -
> Hi!
> > setns01 6  TFAIL  :  setns01.c:176: regular file fd exp_errno=22:
> > errno=EBADF(9): Bad file descriptor
> > setns01 0  TINFO  :  setns(12, 0x2)
> > setns01 7  TFAIL  :  setns01.c:176: regular file fd exp_errno=22:
> > errno=EBADF(9): Bad file descriptor
> > setns01 0  TINFO  :  setns(12, 0x4000)
> > setns01 8  TFAIL  :  setns01.c:176: regular file fd exp_errno=22:
> > errno=EBADF(9): Bad file descriptor
> > setns01 0  TINFO  :  setns(12, 0x2000)
> > setns01 9  TFAIL  :  setns01.c:176: regular file fd exp_errno=22:
> > errno=EBADF(9): Bad file descriptor
> > setns01 0  TINFO  :  setns(12, 0x400)
> > setns0110  TFAIL  :  setns01.c:176: regular file fd exp_errno=22:
> > errno=EBADF(9): Bad file descriptor
> 
> The messages here are a bit cryptic, I will fix that later on, but what
> it means here is that the errno has changed from EINVAL to EBADF in a
> case we pass file descriptor to a regular file to setns().

I posted a series that accepts both errnos about week ago:
  https://lists.linux.it/pipermail/ltp/2020-June/017467.html



Re: LTP: syscalls: regression on mainline - ioctl_loop01 mknod07 setns01

2020-06-05 Thread Jan Stancek



- Original Message -
> Following three test cases reported as regression on Linux mainline kernel
> on x86_64, arm64, arm and i386
> 
>   ltp-syscalls-tests:
> * ioctl_loop01
> * mknod07

Test updated:
  
https://github.com/linux-test-project/ltp/commit/13fcfa2d6bdd1fb71c4528b47170e8e8fb3a8a32

> * setns01

commit 303cc571d107 ("nsproxy: attach to namespaces via pidfds")
changed errno that is returned for regular file from EINVAL to EBADF.
This appears to fit more current man page, so I think we need to fix
test to accept both. (I'm looking into that)



[bug?] LTP pt_test failing after 38bb8d77d0b9 "perf/x86/intel/pt: Split ToPA metadata and page layout"

2019-10-19 Thread Jan Stancek
Hi,

All variants of pt_test:
  
https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/tracing/pt_test/pt_test.c
started failing after:
  38bb8d77d0b9 ("perf/x86/intel/pt: Split ToPA metadata and page layout")

with following error on console/dmesg:
  pt: ToPA ERROR encountered, trying to recover

on Broadwell-EP:

vendor_id   : GenuineIntel
cpu family  : 6
model   : 79
model name  : Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
stepping: 1

Regards,
Jan


Re: [PATCH] fat: fix corruption in fat_alloc_new_dir()

2019-09-08 Thread Jan Stancek



- Original Message -
> Jan Stancek  writes:
> 
> > sb_getblk does not guarantee that buffer_head is uptodate. If there is
> > async read running in parallel for same buffer_head, it can overwrite
> > just initialized msdos_dir_entry, leading to corruption:
> >   FAT-fs (loop0): error, corrupted directory (invalid entries)
> >   FAT-fs (loop0): Filesystem has been set read-only
> >
> > This can happen for example during LTP statx04, which creates loop
> > device, formats it (mkfs.vfat), mounts it and immediately creates
> > a new directory. In parallel, systemd-udevd is probing new block
> > device, which leads to async read.
> >
> >   do_mkdirat  ksys_read
> >vfs_mkdir   vfs_read
> > vfat_mkdir  __vfs_read
> >  fat_alloc_new_dir   new_sync_read
> >/* init de[0], de[1] */blkdev_read_iter
> >generic_file_read_iter
> > generic_file_buffered_read
> >  blkdev_readpage
> >   block_read_full_page
> >
> > Faster reproducer (based on LTP statx04):
> >
> > int main(void)
> > {
> > int i, j, ret, fd, loop_fd, ctrl_fd;
> > int loop_num;
> > char loopdev[256], tmp[256], testfile[256];
> >
> > mkdir("/tmp/mntpoint", 0777);
> > for (i = 0; ; i++) {
> > printf("Iteration: %d\n", i);
> > sprintf(testfile, "/tmp/test.img.%d", getpid());
> >
> > ctrl_fd = open("/dev/loop-control", O_RDWR);
> > loop_num = ioctl(ctrl_fd, LOOP_CTL_GET_FREE);
> > close(ctrl_fd);
> > sprintf(loopdev, "/dev/loop%d", loop_num);
> >
> > fd = open(testfile, O_WRONLY|O_CREAT|O_TRUNC, 0600);
> > fallocate(fd, 0, 0, 256*1024*1024);
> > close(fd);
> >
> > fd = open(testfile, O_RDWR);
> > loop_fd = open(loopdev, O_RDWR);
> > ioctl(loop_fd, LOOP_SET_FD, fd);
> > close(loop_fd);
> > close(fd);
> >
> > sprintf(tmp, "mkfs.vfat %s", loopdev);
> > system(tmp);
> > mount(loopdev, "/tmp/mntpoint", "vfat", 0, NULL);
> >
> > for (j = 0; j < 200; j++) {
> > sprintf(tmp, "/tmp/mntpoint/testdir%d", j);
> > ret = mkdir(tmp, 0777);
> > if (ret) {
> > perror("mkdir");
> > break;
> > }
> > }
> >
> > umount("/tmp/mntpoint");
> > loop_fd = open(loopdev, O_RDWR);
> > ioctl(loop_fd, LOOP_CLR_FD, fd);
> > close(loop_fd);
> > unlink(testfile);
> >
> > if (ret)
> > break;
> > }
> >
> > return 0;
> > }
> >
> > Issue triggers within minute on HPE Apollo 70 (arm64, 64GB RAM, 224 CPUs).
> 
> Using the device while mounting same device doesn't work reliably like
> this race. (getblk() is intentionally used to get the buffer to write
> new data.)

Are you saying this is expected even if 'usage' is just read?

> 
> mount(2) internally opens the device by EXCL mode, so I guess udev opens
> without EXCL (I dont know if it is intent or not).

I gave this a try and added O_EXCL to udev-builtin-blkid.c. My system had 
trouble
booting, it was getting stuck on mounting LVM volumes.

So, I'm not sure how to move forward here. 

Regards,
Jan


[tip: x86/urgent] x86/timer: Force PIT initialization when !X86_FEATURE_ARAT

2019-09-08 Thread tip-bot2 for Jan Stancek
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: afa8b475c1aec185a8e106c48b3832e0b88bc2de
Gitweb:
https://git.kernel.org/tip/afa8b475c1aec185a8e106c48b3832e0b88bc2de
Author:Jan Stancek 
AuthorDate:Sun, 08 Sep 2019 00:50:40 +02:00
Committer: Thomas Gleixner 
CommitterDate: Sun, 08 Sep 2019 09:01:15 +02:00

x86/timer: Force PIT initialization when !X86_FEATURE_ARAT

KVM guests with commit c8c4076723da ("x86/timer: Skip PIT initialization on
modern chipsets") applied to guest kernel have been observed to have
unusually higher CPU usage with symptoms of increase in vm exits for HLT
and MSW_WRITE (MSR_IA32_TSCDEADLINE).

This is caused by older QEMUs lacking support for X86_FEATURE_ARAT.  lapic
clock retains CLOCK_EVT_FEAT_C3STOP and nohz stays inactive.  There's no
usable broadcast device either.

Do the PIT initialization if guest CPU lacks X86_FEATURE_ARAT.  On real
hardware it shouldn't matter as ARAT and DEADLINE come together.

Fixes: c8c4076723da ("x86/timer: Skip PIT initialization on modern chipsets")
Signed-off-by: Jan Stancek 
Signed-off-by: Thomas Gleixner 

---
 arch/x86/kernel/apic/apic.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index dba2828..f91b3ff 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -834,6 +834,10 @@ bool __init apic_needs_pit(void)
if (!boot_cpu_has(X86_FEATURE_APIC))
return true;
 
+   /* Virt guests may lack ARAT, but still have DEADLINE */
+   if (!boot_cpu_has(X86_FEATURE_ARAT))
+   return true;
+
/* Deadline timer is based on TSC so no further PIT action required */
if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
return false;


[tip: x86/urgent] x86/timer: Force PIT initialization when !X86_FEATURE_ARAT

2019-09-08 Thread tip-bot2 for Jan Stancek
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: afa8b475c1aec185a8e106c48b3832e0b88bc2de
Gitweb:
https://git.kernel.org/tip/afa8b475c1aec185a8e106c48b3832e0b88bc2de
Author:Jan Stancek 
AuthorDate:Sun, 08 Sep 2019 00:50:40 +02:00
Committer: Thomas Gleixner 
CommitterDate: Sun, 08 Sep 2019 09:01:15 +02:00

x86/timer: Force PIT initialization when !X86_FEATURE_ARAT

KVM guests with commit c8c4076723da ("x86/timer: Skip PIT initialization on
modern chipsets") applied to guest kernel have been observed to have
unusually higher CPU usage with symptoms of increase in vm exits for HLT
and MSW_WRITE (MSR_IA32_TSCDEADLINE).

This is caused by older QEMUs lacking support for X86_FEATURE_ARAT.  lapic
clock retains CLOCK_EVT_FEAT_C3STOP and nohz stays inactive.  There's no
usable broadcast device either.

Do the PIT initialization if guest CPU lacks X86_FEATURE_ARAT.  On real
hardware it shouldn't matter as ARAT and DEADLINE come together.

Fixes: c8c4076723da ("x86/timer: Skip PIT initialization on modern chipsets")
Signed-off-by: Jan Stancek 
Signed-off-by: Thomas Gleixner 

---
 arch/x86/kernel/apic/apic.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index dba2828..f91b3ff 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -834,6 +834,10 @@ bool __init apic_needs_pit(void)
if (!boot_cpu_has(X86_FEATURE_APIC))
return true;
 
+   /* Virt guests may lack ARAT, but still have DEADLINE */
+   if (!boot_cpu_has(X86_FEATURE_ARAT))
+   return true;
+
/* Deadline timer is based on TSC so no further PIT action required */
if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
return false;


[PATCH] fat: fix corruption in fat_alloc_new_dir()

2019-09-02 Thread Jan Stancek
sb_getblk does not guarantee that buffer_head is uptodate. If there is
async read running in parallel for same buffer_head, it can overwrite
just initialized msdos_dir_entry, leading to corruption:
  FAT-fs (loop0): error, corrupted directory (invalid entries)
  FAT-fs (loop0): Filesystem has been set read-only

This can happen for example during LTP statx04, which creates loop
device, formats it (mkfs.vfat), mounts it and immediately creates
a new directory. In parallel, systemd-udevd is probing new block
device, which leads to async read.

  do_mkdirat  ksys_read
   vfs_mkdir   vfs_read
vfat_mkdir  __vfs_read
 fat_alloc_new_dir   new_sync_read
   /* init de[0], de[1] */blkdev_read_iter
   generic_file_read_iter
generic_file_buffered_read
 blkdev_readpage
  block_read_full_page

Faster reproducer (based on LTP statx04):
- 8< -
int main(void)
{
int i, j, ret, fd, loop_fd, ctrl_fd;
int loop_num;
char loopdev[256], tmp[256], testfile[256];

mkdir("/tmp/mntpoint", 0777);
for (i = 0; ; i++) {
printf("Iteration: %d\n", i);
sprintf(testfile, "/tmp/test.img.%d", getpid());

ctrl_fd = open("/dev/loop-control", O_RDWR);
loop_num = ioctl(ctrl_fd, LOOP_CTL_GET_FREE);
close(ctrl_fd);
sprintf(loopdev, "/dev/loop%d", loop_num);

fd = open(testfile, O_WRONLY|O_CREAT|O_TRUNC, 0600);
fallocate(fd, 0, 0, 256*1024*1024);
close(fd);

fd = open(testfile, O_RDWR);
loop_fd = open(loopdev, O_RDWR);
ioctl(loop_fd, LOOP_SET_FD, fd);
close(loop_fd);
close(fd);

sprintf(tmp, "mkfs.vfat %s", loopdev);
system(tmp);
mount(loopdev, "/tmp/mntpoint", "vfat", 0, NULL);

for (j = 0; j < 200; j++) {
sprintf(tmp, "/tmp/mntpoint/testdir%d", j);
ret = mkdir(tmp, 0777);
if (ret) {
perror("mkdir");
break;
}
}

umount("/tmp/mntpoint");
loop_fd = open(loopdev, O_RDWR);
ioctl(loop_fd, LOOP_CLR_FD, fd);
close(loop_fd);
unlink(testfile);

if (ret)
break;
}

return 0;
}
- 8< -------------

Issue triggers within minute on HPE Apollo 70 (arm64, 64GB RAM, 224 CPUs).

Signed-off-by: Jan Stancek 
Cc: OGAWA Hirofumi 
---
 fs/fat/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 1bda2ab6745b..474fd6873ec8 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -1149,7 +1149,7 @@ int fat_alloc_new_dir(struct inode *dir, struct 
timespec64 *ts)
goto error;
 
blknr = fat_clus_to_blknr(sbi, cluster);
-   bhs[0] = sb_getblk(sb, blknr);
+   bhs[0] = sb_bread(sb, blknr);
if (!bhs[0]) {
err = -ENOMEM;
goto error_free;
-- 
1.8.3.1



Re: [PATCH v5.2 1/2] locking/rwsem: Add missing ACQUIRE to read_slowpath exit when queue is empty

2019-08-27 Thread Jan Stancek


- Original Message -
> From: Jan Stancek 
> 
> [ Upstream commit e1b98fa316648420d0434d9ff5b92ad6609ba6c3 ]
> 
> LTP mtest06 has been observed to occasionally hit "still mapped when
> deleted" and following BUG_ON on arm64.
> 
> The extra mapcount originated from pagefault handler, which handled
> pagefault for vma that has already been detached. vma is detached
> under mmap_sem write lock by detach_vmas_to_be_unmapped(), which
> also invalidates vmacache.
> 
> When the pagefault handler (under mmap_sem read lock) calls
> find_vma(), vmacache_valid() wrongly reports vmacache as valid.
> 
> After rwsem down_read() returns via 'queue empty' path (as of v5.2),
> it does so without an ACQUIRE on sem->count:
> 
>   down_read()
> __down_read()
>   rwsem_down_read_failed()
> __rwsem_down_read_failed_common()
>   raw_spin_lock_irq(&sem->wait_lock);
>   if (list_empty(&sem->wait_list)) {
> if (atomic_long_read(&sem->count) >= 0) {
>   raw_spin_unlock_irq(&sem->wait_lock);
>   return sem;
> 
> The problem can be reproduced by running LTP mtest06 in a loop and
> building the kernel (-j $NCPUS) in parallel. It does reproduces since
> v4.20 on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It
> triggers reliably in about an hour.
> 
> The patched kernel ran fine for 10+ hours.
> 
> Signed-off-by: Jan Stancek 
> Signed-off-by: Peter Zijlstra (Intel) 
> Reviewed-by: Will Deacon 
> Acked-by: Waiman Long 
> Cc: Linus Torvalds 
> Cc: Peter Zijlstra 
> Cc: Thomas Gleixner 
> Cc: dbu...@suse.de
> Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty &
> no writer")
> Link:
> https://lkml.kernel.org/r/50b8914e20d1d62bb2dee42d342836c2c16ebee7.1563438048.git.jstan...@redhat.com
> Signed-off-by: Ingo Molnar 
> Signed-off-by: Sasha Levin 
> ---
> 
> This is a backport for the v5.2 stable tree. There were multiple reports
> of this issue being hit.
> 
> Given that there were a few changes to the code around this, I'd
> appreciate an ack before pulling it in.

ACK, both look good to me.
I also re-ran reproducer with this series applied on top of 5.2.10, it PASS-ed.

Thanks,
Jan


Re: Linux-next-20190823: x86_64/i386: prot_hsymlinks.c:325: Failed to run cmd: useradd hsym

2019-08-27 Thread Jan Stancek



- Original Message -
> On Tue, 2019-08-27 at 06:25 -0400, Jan Stancek wrote:
> > That theory is probably not correct for this case, since EIO I see
> > appears
> > to originate from write and nfs_writeback_result(). This function
> > also
> > produces message we saw in logs from Naresh.
> > 
> > I can't find where/how is resp->count updated on WRITE reply in
> > NFSv2.
> > Issue also goes away with patch below, though I can't speak about its
> > correctness:
> > 
> > NFS version TypeTestReturn code
> > nfsvers=2   tcp -b:base 0
> > nfsvers=2   tcp -g:general  0
> > nfsvers=2   tcp -s:special  0
> > nfsvers=2   tcp -l:lock 0
> > Total time: 141
> > 
> > diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
> > index cbc17a203248..4913c6da270b 100644
> > --- a/fs/nfs/nfs2xdr.c
> > +++ b/fs/nfs/nfs2xdr.c
> > @@ -897,6 +897,16 @@ static int nfs2_xdr_dec_writeres(struct rpc_rqst
> > *req, struct xdr_stream *xdr,
> >  void *data)
> >  {
> > struct nfs_pgio_res *result = data;
> > +   struct rpc_task *rq_task  = req->rq_task;
> > +
> > +   if (rq_task) {
> > +   struct nfs_pgio_args *args = rq_task-
> > >tk_msg.rpc_argp;
> > +
> > +   if (args) {
> > +   result->count = args->count;
> > +   }
> > +   }
> >  
> > /* All NFSv2 writes are "file sync" writes */
> > result->verf->committed = NFS_FILE_SYNC;
> 
> Thanks! I've moved the above to nfs_write_done() so that we do it only
> on success (see
> http://git.linux-nfs.org/?p=trondmy/linux-nfs.git;a=commitdiff;h=3ba5688da709dd0f7d917029c206bc1848a6ae74
> )

Thanks, retested with 3ba5688da, all PASS:

NFS version TypeTestReturn code
nfsvers=2   tcp -b:base 0
nfsvers=2   tcp -g:general  0
nfsvers=2   tcp -s:special  0
nfsvers=2   tcp -l:lock 0

NFS version TypeTestReturn code
nfsvers=3   tcp -b:base 0
nfsvers=3   tcp -g:general  0
nfsvers=3   tcp -s:special  0
nfsvers=3   tcp -l:lock 0
nfsvers=3   tcp6-b:base 0
nfsvers=3   tcp6-g:general  0
nfsvers=3   tcp6-s:special  0
nfsvers=3   tcp6-l:lock 0

NFS version TypeTestReturn code
nfsvers=4   tcp -b:base 0
nfsvers=4   tcp -g:general  0
nfsvers=4   tcp -s:special  0
nfsvers=4   tcp -l:lock 0
nfsvers=4   tcp6-b:base 0
nfsvers=4   tcp6-g:general  0
nfsvers=4   tcp6-s:special  0
nfsvers=4   tcp6-l:lock 0

Feel free to add also:

Reported-by: Naresh Kamboju 
Tested-by: Jan Stancek 


Re: Linux-next-20190823: x86_64/i386: prot_hsymlinks.c:325: Failed to run cmd: useradd hsym

2019-08-27 Thread Jan Stancek


- Original Message -
> On Mon, 2019-08-26 at 19:12 -0400, Jan Stancek wrote:
> > - Original Message -
> > > On Mon, 2019-08-26 at 10:38 -0400, Jan Stancek wrote:
> > > > - Original Message -
> > > > > Hi Jan and Cyril,
> > > > > 
> > > > > On Mon, 26 Aug 2019 at 16:35, Jan Stancek 
> > > > > wrote:
> > > > > > 
> > > > > > - Original Message -
> > > > > > > Hi!
> > > > > > > > Do you see this LTP prot_hsymlinks failure on linux next
> > > > > > > > 20190823 on
> > > > > > > > x86_64 and i386 devices?
> > > > > > > > 
> > > > > > > > test output log,
> > > > > > > > useradd: failure while writing changes to /etc/passwd
> > > > > > > > useradd: /home/hsym was created, but could not be removed
> > > > > > > 
> > > > > > > This looks like an unrelated problem, failure to write to
> > > > > > > /etc/passwd
> > > > > > > probably means that filesystem is full or some problem
> > > > > > > happend
> > > > > > > and how
> > > > > > > is remounted RO.
> > > > > > 
> > > > > > In Naresh' example, root is on NFS:
> > > > > >   root=/dev/nfs rw
> > > > > >  
> > > > > > nfsroot=10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extr
> > > > > > act-
> > > > > > nfsrootfs-tyuevoxm,tcp,hard,intr
> > > > > 
> > > > > Right !
> > > > > root is mounted on NFS.
> > > > > 
> > > > > > 10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract-
> > > > > > nfsrootfs-tyuevoxm
> > > > > > on / type nfs
> > > > > > (rw,relatime,vers=2,rsize=4096,wsize=4096,namlen=255,hard,nol
> > > > > > ock,
> > > > > > proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.66.16.123,
> > > > > > moun
> > > > > > tvers=1,mountproto=tcp,local_lock=all,addr=10.66.16.123)
> > > > > > devtmpfs on /dev type devtmpfs
> > > > > > (rw,relatime,size=3977640k,nr_inodes=994410,mode=755)
> > > > > > 
> > > 
> > > The only thing I can think of that might cause an EIO on NFSv2
> > > would be
> > > this patch
> > > http://git.linux-nfs.org/?p=trondmy/linux-nfs.git;a=commitdiff;h=627d48e597ec5993c4abb3b81dc75e554a07c7c0
> > > assuming that a bind-related error is leaking through.
> > > 
> > > I'd suggest something like the following to fix it up:
> > 
> > No change with that patch,
> > but following one fixes it for me:
> > 
> > diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
> > index 20b3717cd7ca..56cefa0ab804 100644
> > --- a/fs/nfs/pagelist.c
> > +++ b/fs/nfs/pagelist.c
> > @@ -590,7 +590,7 @@ static void nfs_pgio_rpcsetup(struct
> > nfs_pgio_header *hdr,
> > }
> >  
hdr->res.fattr   = &hdr->fattr;
> > -   hdr->res.count   = 0;
> > +   hdr->res.count   = count;
> > hdr->res.eof = 0;
> > hdr->res.verf= >verf;
nfs_fattr_init(&hdr->fattr);
> > 
> > which is functionally revert of "NFS: Fix initialisation of I/O
> > result struct in nfs_pgio_rpcsetup".
> > 
> > This hunk caught my eye, could res.eof == 0 explain those I/O errors?
> 
> Interesting hypothesis. It could if res.count ends up being 0. So does
> the following also fix the problem?

It didn't fix it.

That theory is probably not correct for this case, since EIO I see appears
to originate from write and nfs_writeback_result(). This function also
produces message we saw in logs from Naresh.

I can't find where/how is resp->count updated on WRITE reply in NFSv2.
Issue also goes away with patch below, though I can't speak about its 
correctness:

NFS version TypeTestReturn code
nfsvers=2   tcp -b:base 0
nfsvers=2   tcp -g:general  0
nfsvers=2   tcp -s:special  0
nfsvers=2   tcp -l:lock 0
Total time: 141

diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index cbc17a203248..4913c6da270b 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -897,6 +897,16 @@ static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, 
struct xdr_stream *xdr,
 void *data)
 {
struct nfs_pgio_res *result = data;
+   struct rpc_task *rq_task  = req->rq_task;
+
+   if (rq_task) {
+   struct nfs_pgio_args *args = rq_task->tk_msg.rpc_argp;
+
+   if (args) {
+   result->count = args->count;
+   }
+   }
 
/* All NFSv2 writes are "file sync" writes */
result->verf->committed = NFS_FILE_SYNC;


Re: Linux-next-20190823: x86_64/i386: prot_hsymlinks.c:325: Failed to run cmd: useradd hsym

2019-08-26 Thread Jan Stancek


- Original Message -
> On Mon, 2019-08-26 at 10:38 -0400, Jan Stancek wrote:
> > - Original Message -
> > > Hi Jan and Cyril,
> > > 
> > > On Mon, 26 Aug 2019 at 16:35, Jan Stancek 
> > > wrote:
> > > > 
> > > > 
> > > > - Original Message -
> > > > > Hi!
> > > > > > Do you see this LTP prot_hsymlinks failure on linux next
> > > > > > 20190823 on
> > > > > > x86_64 and i386 devices?
> > > > > > 
> > > > > > test output log,
> > > > > > useradd: failure while writing changes to /etc/passwd
> > > > > > useradd: /home/hsym was created, but could not be removed
> > > > > 
> > > > > This looks like an unrelated problem, failure to write to
> > > > > /etc/passwd
> > > > > probably means that filesystem is full or some problem happend
> > > > > and how
> > > > > is remounted RO.
> > > > 
> > > > In Naresh' example, root is on NFS:
> > > >   root=/dev/nfs rw
> > > >  
> > > > nfsroot=10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract-
> > > > nfsrootfs-tyuevoxm,tcp,hard,intr
> > > 
> > > Right !
> > > root is mounted on NFS.
> > > 
> > > > 10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract-
> > > > nfsrootfs-tyuevoxm
> > > > on / type nfs
> > > > (rw,relatime,vers=2,rsize=4096,wsize=4096,namlen=255,hard,nolock,
> > > > proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.66.16.123,moun
> > > > tvers=1,mountproto=tcp,local_lock=all,addr=10.66.16.123)
> > > > devtmpfs on /dev type devtmpfs
> > > > (rw,relatime,size=3977640k,nr_inodes=994410,mode=755)
> > > > 
> 
> The only thing I can think of that might cause an EIO on NFSv2 would be
> this patch
> http://git.linux-nfs.org/?p=trondmy/linux-nfs.git;a=commitdiff;h=627d48e597ec5993c4abb3b81dc75e554a07c7c0
> assuming that a bind-related error is leaking through.
> 
> I'd suggest something like the following to fix it up:

No change with that patch,
but following one fixes it for me:

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 20b3717cd7ca..56cefa0ab804 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -590,7 +590,7 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr,
}
 
hdr->res.fattr   = &hdr->fattr;
-   hdr->res.count   = 0;
+   hdr->res.count   = count;
hdr->res.eof = 0;
hdr->res.verf= &hdr->verf;
nfs_fattr_init(&hdr->fattr);

which is functionally revert of "NFS: Fix initialisation of I/O result struct 
in nfs_pgio_rpcsetup".

This hunk caught my eye, could res.eof == 0 explain those I/O errors?
/* Emulate the eof flag, which isn't normally needed in NFSv2   
  
 * as it is guaranteed to always return the file attributes 
  
 */ 
  
if (hdr->args.offset + hdr->res.count >= hdr->res.fattr->size)  
  
hdr->res.eof = 1; 


Re: Linux-next-20190823: x86_64/i386: prot_hsymlinks.c:325: Failed to run cmd: useradd hsym

2019-08-26 Thread Jan Stancek


- Original Message -
> Hi Jan and Cyril,
> 
> On Mon, 26 Aug 2019 at 16:35, Jan Stancek  wrote:
> >
> >
> >
> > - Original Message -
> > > Hi!
> > > > Do you see this LTP prot_hsymlinks failure on linux next 20190823 on
> > > > x86_64 and i386 devices?
> > > >
> > > > test output log,
> > > > useradd: failure while writing changes to /etc/passwd
> > > > useradd: /home/hsym was created, but could not be removed
> > >
> > > This looks like an unrelated problem, failure to write to /etc/passwd
> > > probably means that filesystem is full or some problem happend and how
> > > is remounted RO.
> >
> > In Naresh' example, root is on NFS:
> >   root=/dev/nfs rw
> >   
> > nfsroot=10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract-nfsrootfs-tyuevoxm,tcp,hard,intr
> 
> Right !
> root is mounted on NFS.
> 
> >
> > 10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract-nfsrootfs-tyuevoxm
> > on / type nfs
> > (rw,relatime,vers=2,rsize=4096,wsize=4096,namlen=255,hard,nolock,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.66.16.123,mountvers=1,mountproto=tcp,local_lock=all,addr=10.66.16.123)
> > devtmpfs on /dev type devtmpfs
> > (rw,relatime,size=3977640k,nr_inodes=994410,mode=755)
> >
> > Following message repeats couple times in logs:
> >   NFS: Server wrote zero bytes, expected XXX
> >
> > Naresh, can you check if there are any errors on NFS server side?
> 
> I have re-tested the failed tests on next-20190822 and all get pass
> which is also
> using same NFS server [1] [2].

Thanks, that suggests some client side change between next-20190822 and 
next-20190823
might introduced it.

> 
> > Maybe run NFS cthon against that server with client running next-20190822
> > and next-20190823.
> 
> Thanks for the pointers.
> I will setup and run NFS cthon on next-20190822 and next-20190823.

I'll try to reproduce too.

> 
> >
> > >
> > > I do not see the kernel messages from this job anywhere at the job
> > > pages, is it stored somewhere?
> >
> > It appears to be mixed in same log file:
> >   
> > https://qa-reports.linaro.org/lkft/linux-next-oe/build/next-20190823/testrun/886412/log
> 
> For the record the following tests failed on linux -next-20190823 on x86_64
> and i386. The filesystem is mounted on NFS and tests are using
> locally mounted hard drive ( with -d /scratch ).
> 
> Juno-r2 device filesystem mounted on NFS and did not see these errors
> and test getting pass on -next-20190823.
> 
> These failures are reproducible all time on next-20190823 kernel on x86_64
> and i386 device with root mounted on NFS [3] [4] [5] [6].
> 
> I will git bisect to find out which is bad commit.
> 
> prot_hsymlinks: [3]
> --
> useradd: failure while writing changes to /etc/passwd
> useradd: /home/hsym was created, but could not be removed
> userdel: user 'hsym' does not exist
> prot_hsymlinks1  TBROK  :  prot_hsymlinks.c:325: Failed to run
> cmd: useradd hsym
> prot_hsymlinks2  TBROK  :  prot_hsymlinks.c:325: Remaining cases broken
> prot_hsymlinks3  TBROK  :  prot_hsymlinks.c:325: Failed to run
> cmd: userdel -r hsym
> prot_hsymlinks4  TBROK  :  tst_sig.c:234: unexpected signal
> SIGIOT/SIGABRT(6) received (pid = 8324).
> 
> logrotate01: [4]
> -
> compressing log with: /bin/gzip
> error: error creating temp state file /var/lib/logrotate.status.tmp:
> Input/output error
> logrotate011  TFAIL  :  ltpapicmd.c:154: Test #1: logrotate
> command exited with 1 return code. Output:
> 
> sem_unlink_2-2: [5]
> --
> make[3]: Entering directory
> '/opt/ltp/testcases/open_posix_testsuite/conformance/interfaces/sem_unlink'
> cat: write error: Input/output error
> conformance/interfaces/sem_unlink/sem_unlink_2-2: execution: FAILED
> 
> syslog{01 ...10} [6]
> ---
> cp: failed to close '/etc/syslog.conf.ltpback': Input/output error
> syslog011  TBROK  :  ltpapicmd.c:188: failed to backup /etc/syslog.conf
> 
> cp: failed to close '/etc/syslog.conf.ltpback': Input/output error
> syslog021  TBROK  :  ltpapicmd.c:188: failed to backup /etc/syslog.conf
> 
> ...
> cp: failed to close '/etc/syslog.conf.ltpback': Input/output error
> syslog101  TBROK  :  ltpapicmd.c:188: failed to backup /etc/syslog.conf
> 
> ref:
> PASS on 20190222:
> [1] https://lkft.validation.linaro.org/scheduler/job/890446#L1232
> [2] https://lkft.validation.linaro.org/scheduler/job/890454
> 
> FAILED on 20190823:
> [3] https://lkft.validation.linaro.org/scheduler/job/890404#L1245
> [4] https://lkft.validation.linaro.org/scheduler/job/886408#L2544
> [5] https://lkft.validation.linaro.org/scheduler/job/886409#L3088
> [6] https://lkft.validation.linaro.org/scheduler/job/890400#L1234
> 
>  - Naresh
> 


Re: Linux-next-20190823: x86_64/i386: prot_hsymlinks.c:325: Failed to run cmd: useradd hsym

2019-08-26 Thread Jan Stancek



- Original Message -
> Hi!
> > Do you see this LTP prot_hsymlinks failure on linux next 20190823 on
> > x86_64 and i386 devices?
> > 
> > test output log,
> > useradd: failure while writing changes to /etc/passwd
> > useradd: /home/hsym was created, but could not be removed
> 
> This looks like an unrelated problem, failure to write to /etc/passwd
> probably means that filesystem is full or some problem happend and how
> is remounted RO.

In Naresh' example, root is on NFS:
  root=/dev/nfs rw 
nfsroot=10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract-nfsrootfs-tyuevoxm,tcp,hard,intr

10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract-nfsrootfs-tyuevoxm on 
/ type nfs 
(rw,relatime,vers=2,rsize=4096,wsize=4096,namlen=255,hard,nolock,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.66.16.123,mountvers=1,mountproto=tcp,local_lock=all,addr=10.66.16.123)
devtmpfs on /dev type devtmpfs 
(rw,relatime,size=3977640k,nr_inodes=994410,mode=755)

Following message repeats couple times in logs:
  NFS: Server wrote zero bytes, expected XXX

Naresh, can you check if there are any errors on NFS server side?
Maybe run NFS cthon against that server with client running next-20190822 and 
next-20190823.

> 
> I do not see the kernel messages from this job anywhere at the job
> pages, is it stored somewhere?

It appears to be mixed in same log file:
  
https://qa-reports.linaro.org/lkft/linux-next-oe/build/next-20190823/testrun/886412/log


[tip:locking/core] locking/rwsem: Add missing ACQUIRE to read_slowpath exit when queue is empty

2019-07-25 Thread tip-bot for Jan Stancek
Commit-ID:  e1b98fa316648420d0434d9ff5b92ad6609ba6c3
Gitweb: https://git.kernel.org/tip/e1b98fa316648420d0434d9ff5b92ad6609ba6c3
Author: Jan Stancek 
AuthorDate: Thu, 18 Jul 2019 10:51:25 +0200
Committer:  Ingo Molnar 
CommitDate: Thu, 25 Jul 2019 15:39:23 +0200

locking/rwsem: Add missing ACQUIRE to read_slowpath exit when queue is empty

LTP mtest06 has been observed to occasionally hit "still mapped when
deleted" and following BUG_ON on arm64.

The extra mapcount originated from pagefault handler, which handled
pagefault for vma that has already been detached. vma is detached
under mmap_sem write lock by detach_vmas_to_be_unmapped(), which
also invalidates vmacache.

When the pagefault handler (under mmap_sem read lock) calls
find_vma(), vmacache_valid() wrongly reports vmacache as valid.

After rwsem down_read() returns via 'queue empty' path (as of v5.2),
it does so without an ACQUIRE on sem->count:

  down_read()
__down_read()
  rwsem_down_read_failed()
__rwsem_down_read_failed_common()
  raw_spin_lock_irq(&sem->wait_lock);
  if (list_empty(&sem->wait_list)) {
if (atomic_long_read(&sem->count) >= 0) {
  raw_spin_unlock_irq(&sem->wait_lock);
  return sem;

The problem can be reproduced by running LTP mtest06 in a loop and
building the kernel (-j $NCPUS) in parallel. It does reproduce since
v4.20 on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It
triggers reliably in about an hour.

The patched kernel ran fine for 10+ hours.

Signed-off-by: Jan Stancek 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Will Deacon 
Acked-by: Waiman Long 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: dbu...@suse.de
Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty & 
no writer")
Link: 
https://lkml.kernel.org/r/50b8914e20d1d62bb2dee42d342836c2c16ebee7.1563438048.git.jstan...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/locking/rwsem.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index bc91aacaab58..d3ce7c6c42a6 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1036,6 +1036,8 @@ queue:
 */
if (adjustment && !(atomic_long_read(&sem->count) &
 (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
+   /* Provide lock ACQUIRE */
+   smp_acquire__after_ctrl_dep();
raw_spin_unlock_irq(&sem->wait_lock);
rwsem_set_reader_owned(sem);
lockevent_inc(rwsem_rlock_fast);


Re: [PATCH v2] locking/rwsem: add acquire barrier to read_slowpath exit when queue is empty

2019-07-18 Thread Jan Stancek


- Original Message -
> 
> It's simpler like so:
> 
> On Thu, Jul 18, 2019 at 01:04:46PM +0200, Peter Zijlstra wrote:
> > X = 0;
> > 
> > rwsem_down_read()
> >   for (;;) {
> > set_current_state(TASK_UNINTERRUPTIBLE);
> >
> > X = 1;
> >   
> > rwsem_up_write();
> >   
> > rwsem_mark_wake()
> > 
> > atomic_long_add(adjustment, >count);
> > 
> > smp_store_release(>task, NULL);
> > 
> > if (!waiter.task)
> >   break;
> > 
> > ...
> >   }
> > 
> > r = X;
> > 

I see - it looks possible. Thank you for this example.


Re: [PATCH v2] locking/rwsem: add acquire barrier to read_slowpath exit when queue is empty

2019-07-18 Thread Jan Stancek


- Original Message -
> Hi Jan, Waiman, [+Jade and Paul for the litmus test at the end]
> 
> On Wed, Jul 17, 2019 at 09:22:00PM +0200, Jan Stancek wrote:
> > On Wed, Jul 17, 2019 at 10:19:04AM -0400, Waiman Long wrote:
> > > > If you add a comment to the code outlining the issue (preferably as a
> > > > litmus
> > > > test involving sem->count and some shared data which happens to be
> > > > vmacache_seqnum in your test)), then:
> > > > 
> > > > Reviewed-by: Will Deacon 
> > > > 
> > > > Thanks,
> > > > 
> > > > Will
> > > 
> > > Agreed. A comment just above smp_acquire__after_ctrl_dep() on why this
> > > is needed will be great.
> > > 
> > > Other than that,
> > > 
> > > Acked-by: Waiman Long 
> > > 
> > 
> > litmus test looks a bit long, would following be acceptable?
> > 
> > diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
> > index 37524a47f002..d9c96651bfc7 100644
> > --- a/kernel/locking/rwsem.c
> > +++ b/kernel/locking/rwsem.c
> > @@ -1032,6 +1032,13 @@ static inline bool rwsem_reader_phase_trylock(struct
> > rw_semaphore *sem,
> >  */
> > if (adjustment && !(atomic_long_read(&sem->count) &
> >  (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
> > +   /*
> > +* down_read() issued ACQUIRE on enter, but we can race
> > +* with writer who did RELEASE only after us.
> > +* ACQUIRE here makes sure reader operations happen only
> > +* after all writer ones.
> > +*/
> 
> How about an abridged form of the litmus test here, just to show the cod
> flow? e.g.:
> 
> /*
>  * We need to ensure ACQUIRE semantics when reading sem->count so that
>  * we pair with the RELEASE store performed by an unlocking/downgrading
>  * writer.
>  *
>  * P0 (writer)P1 (reader)
>  *
>  * down_write(sem);
>  * 
>  * downgrade_write(sem);
>  * -> fetch_add_release(&sem->count)
>  *
>  *down_read_slowpath(sem);
>  *-> atomic_read(&sem->count)
>  *   
>  *   smp_acquire__after_ctrl_dep()
>  *
>  */

Works for me. The code is at 3 level of indentation, but I can try
to squeeze it in for v4.

> 
> In writing this, I also noticed that we don't have any explicit ordering
> at the end of the reader slowpath when we wait on the queue but get woken
> immediately:
> 
>   if (!waiter.task)
>   break;
> 
> Am I missing something?

I'm assuming this isn't problem, because set_current_state() on line above
is using smp_store_mb().

> 
> > +   smp_acquire__after_ctrl_dep();
> > raw_spin_unlock_irq(&sem->wait_lock);
> > rwsem_set_reader_owned(sem);
> > lockevent_inc(rwsem_rlock_fast);
> > 
> > 
> > with litmus test in commit log:
> > --- 8< 
> > C rwsem
> > 
> > {
> > atomic_t rwsem_count = ATOMIC_INIT(1);
> > int vmacache_seqnum = 10;
> > }
> > 
> > P0(int *vmacache_seqnum, atomic_t *rwsem_count)
> > {
> > r0 = READ_ONCE(*vmacache_seqnum);
> > WRITE_ONCE(*vmacache_seqnum, r0 + 1);
> > /* downgrade_write */
> > r1 = atomic_fetch_add_release(-1+256, rwsem_count);
> > }
> > 
> > P1(int *vmacache_seqnum, atomic_t *rwsem_count, spinlock_t *sem_wait_lock)
> > {
> > /* rwsem_read_trylock */
> > r0 = atomic_add_return_acquire(256, rwsem_count);
> > /* rwsem_down_read_slowpath */
> > spin_lock(sem_wait_lock);
> > r0 = atomic_read(rwsem_count);
> > if ((r0 & 1) == 0) {
> > // BUG: needs barrier
> > spin_unlock(sem_wait_lock);
> > r1 = READ_ONCE(*vmacache_seqnum);
> > }
> > }
> > exists (1:r1=10)
> > --- 8< 
> 
> Thanks for writing this! It's definitely worth sticking it in the commit
> log, but Paul and Jade might also like to include it as part of their litmus
> test repository too.
> 
> Will
> 


[PATCH v3] locking/rwsem: add acquire barrier to read_slowpath exit when queue is empty

2019-07-18 Thread Jan Stancek
LTP mtest06 has been observed to rarely hit "still mapped when deleted"
and following BUG_ON on arm64:
  page:7e02fa37e480 refcount:3 mapcount:1 mapping:80be3d678ab0 index:0x0
  xfs_address_space_operations [xfs]
  flags: 0xbfffe00037(locked|referenced|uptodate|lru|active)
  page dumped because: VM_BUG_ON_PAGE(page_mapped(page))
  [ cut here ]
  kernel BUG at mm/filemap.c:171!
  Internal error: Oops - BUG: 0 [#1] SMP
  CPU: 220 PID: 154292 Comm: mmap1 Not tainted 5.2.0-0ecfebd.cki #1
  Hardware name: HPE Apollo 70 /C01_APACHE_MB , BIOS L50_5.13_1.10 05/17/2019
  pstate: 40400089 (nZcv daIf +PAN -UAO)
  pc : unaccount_page_cache_page+0x17c/0x1a0
  lr : unaccount_page_cache_page+0x17c/0x1a0
  Call trace:
  unaccount_page_cache_page+0x17c/0x1a0
  delete_from_page_cache_batch+0xa0/0x300
  truncate_inode_pages_range+0x1b8/0x640
  truncate_inode_pages_final+0x88/0xa8
  evict+0x1a0/0x1d8
  iput+0x150/0x240
  dentry_unlink_inode+0x120/0x130
  __dentry_kill+0xd8/0x1d0
  dentry_kill+0x88/0x248
  dput+0x168/0x1b8
  __fput+0xe8/0x208
  fput+0x20/0x30
  task_work_run+0xc0/0xf0
  do_notify_resume+0x2b0/0x328
  work_pending+0x8/0x10

The extra mapcount originated from pagefault handler, which handled
pagefault for vma that has already been detached. vma is detached
under mmap_sem write lock by detach_vmas_to_be_unmapped(), which
also invalidates vmacache.

When pagefault handler (under mmap_sem read lock) called find_vma(),
vmacache_valid() wrongly reported vmacache as valid.

After rwsem down_read() returns via 'queue empty' path (as of v5.2),
it does so without issuing read_acquire on sem->count:
  down_read
__down_read
  rwsem_down_read_failed
__rwsem_down_read_failed_common
  raw_spin_lock_irq(&sem->wait_lock);
  if (list_empty(&sem->wait_list)) {
if (atomic_long_read(&sem->count) >= 0) {
  raw_spin_unlock_irq(&sem->wait_lock);
  return sem;

Suspected problem here is that last *_acquire on down_read() side
happens before write side issues *_release:
  1. writer: has the lock
  2. reader: down_read() issues *read_acquire on entry
  3. writer: mm->vmacache_seqnum++; downgrades lock (*fetch_add_release)
  4. reader: __rwsem_down_read_failed_common() finds it can take lock and 
returns
  5. reader: observes stale mm->vmacache_seqnum

--- 8< 
C rwsem

{
atomic_t rwsem_count = ATOMIC_INIT(1);
int vmacache_seqnum = 10;
}

P0(int *vmacache_seqnum, atomic_t *rwsem_count)
{
r0 = READ_ONCE(*vmacache_seqnum);
WRITE_ONCE(*vmacache_seqnum, r0 + 1);
/* downgrade_write */
r1 = atomic_fetch_add_release(-1+256, rwsem_count);
}

P1(int *vmacache_seqnum, atomic_t *rwsem_count, spinlock_t *sem_wait_lock)
{
/* rwsem_read_trylock */
r0 = atomic_add_return_acquire(256, rwsem_count);
/* rwsem_down_read_slowpath */
spin_lock(sem_wait_lock);
r0 = atomic_read(rwsem_count);
if ((r0 & 1) == 0) {
// BUG: needs barrier
spin_unlock(sem_wait_lock);
r1 = READ_ONCE(*vmacache_seqnum);
}
}
exists (1:r1=10)
--- >8 

I can reproduce the problem by running LTP mtest06 in a loop and building
kernel (-j $NCPUS) in parallel. It does reproduce since v4.20 up to v5.2
on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It triggers reliably
within ~hour. Patched kernel ran fine for 10+ hours with clean dmesg.
Tests were done against v5.2, since commit cf69482d62d9 ("locking/rwsem:
Enable readers spinning on writer") makes it much harder to reproduce.

v2: Move barrier after test (Waiman Long)
Use smp_acquire__after_ctrl_dep() (Peter Zijlstra)
v3: Add comment to barrier (Waiman Long, Will Deacon)
Add litmus test

Related: 
https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c
Related: commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in 
munmap")
Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty & 
no writer")

Signed-off-by: Jan Stancek 
Reviewed-by: Will Deacon 
Acked-by: Waiman Long 
Cc: sta...@vger.kernel.org # v4.20+
Cc: Waiman Long 
Cc: Davidlohr Bueso 
Cc: Will Deacon 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
---
 kernel/locking/rwsem.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 37524a47f002..fe02aef39e9d 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1032,6 +1032,13 @@ static inline bool rwsem_reader_phase_trylock(struct 
rw_semaphore *sem,
 */
if (adjustment && !(atomic_long_read(&sem->count) &
 (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
+ 

Re: [PATCH v2] locking/rwsem: add acquire barrier to read_slowpath exit when queue is empty

2019-07-17 Thread Jan Stancek

On Wed, Jul 17, 2019 at 10:19:04AM -0400, Waiman Long wrote:

If you add a comment to the code outlining the issue (preferably as a litmus
test involving sem->count and some shared data which happens to be
vmacache_seqnum in your test)), then:

Reviewed-by: Will Deacon 

Thanks,

Will


Agreed. A comment just above smp_acquire__after_ctrl_dep() on why this
is needed will be great.

Other than that,

Acked-by: Waiman Long 



litmus test looks a bit long, would following be acceptable?

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 37524a47f002..d9c96651bfc7 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1032,6 +1032,13 @@ static inline bool rwsem_reader_phase_trylock(struct 
rw_semaphore *sem,
 */
if (adjustment && !(atomic_long_read(&sem->count) &
 (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
+   /*
+* down_read() issued ACQUIRE on enter, but we can race
+* with writer who did RELEASE only after us.
+* ACQUIRE here makes sure reader operations happen only
+* after all writer ones.
+*/
+   smp_acquire__after_ctrl_dep();
raw_spin_unlock_irq(&sem->wait_lock);
rwsem_set_reader_owned(sem);
lockevent_inc(rwsem_rlock_fast);


with litmus test in commit log:
--- 8< 
C rwsem

{
atomic_t rwsem_count = ATOMIC_INIT(1);
int vmacache_seqnum = 10;
}

P0(int *vmacache_seqnum, atomic_t *rwsem_count)
{
r0 = READ_ONCE(*vmacache_seqnum);
WRITE_ONCE(*vmacache_seqnum, r0 + 1);
/* downgrade_write */
r1 = atomic_fetch_add_release(-1+256, rwsem_count);
}

P1(int *vmacache_seqnum, atomic_t *rwsem_count, spinlock_t *sem_wait_lock)
{
/* rwsem_read_trylock */
r0 = atomic_add_return_acquire(256, rwsem_count);
/* rwsem_down_read_slowpath */
spin_lock(sem_wait_lock);
r0 = atomic_read(rwsem_count);
if ((r0 & 1) == 0) {
// BUG: needs barrier
spin_unlock(sem_wait_lock);
r1 = READ_ONCE(*vmacache_seqnum);
}
}
exists (1:r1=10)
--- 8< 

Thanks,
Jan



[PATCH v2] locking/rwsem: add acquire barrier to read_slowpath exit when queue is empty

2019-07-17 Thread Jan Stancek
LTP mtest06 has been observed to rarely hit "still mapped when deleted"
and following BUG_ON on arm64:
  page:7e02fa37e480 refcount:3 mapcount:1 mapping:80be3d678ab0 index:0x0
  xfs_address_space_operations [xfs]
  flags: 0xbfffe00037(locked|referenced|uptodate|lru|active)
  page dumped because: VM_BUG_ON_PAGE(page_mapped(page))
  [ cut here ]
  kernel BUG at mm/filemap.c:171!
  Internal error: Oops - BUG: 0 [#1] SMP
  CPU: 220 PID: 154292 Comm: mmap1 Not tainted 5.2.0-0ecfebd.cki #1
  Hardware name: HPE Apollo 70 /C01_APACHE_MB , BIOS L50_5.13_1.10 05/17/2019
  pstate: 40400089 (nZcv daIf +PAN -UAO)
  pc : unaccount_page_cache_page+0x17c/0x1a0
  lr : unaccount_page_cache_page+0x17c/0x1a0
  Call trace:
  unaccount_page_cache_page+0x17c/0x1a0
  delete_from_page_cache_batch+0xa0/0x300
  truncate_inode_pages_range+0x1b8/0x640
  truncate_inode_pages_final+0x88/0xa8
  evict+0x1a0/0x1d8
  iput+0x150/0x240
  dentry_unlink_inode+0x120/0x130
  __dentry_kill+0xd8/0x1d0
  dentry_kill+0x88/0x248
  dput+0x168/0x1b8
  __fput+0xe8/0x208
  fput+0x20/0x30
  task_work_run+0xc0/0xf0
  do_notify_resume+0x2b0/0x328
  work_pending+0x8/0x10

The extra mapcount originated from pagefault handler, which handled
pagefault for vma that has already been detached. vma is detached
under mmap_sem write lock by detach_vmas_to_be_unmapped(), which
also invalidates vmacache.

When pagefault handler (under mmap_sem read lock) called find_vma(),
vmacache_valid() wrongly reported vmacache as valid.

After rwsem down_read() returns via 'queue empty' path (as of v5.2),
it does so without issuing read_acquire on sem->count:
  down_read
__down_read
  rwsem_down_read_failed
__rwsem_down_read_failed_common
  raw_spin_lock_irq(&sem->wait_lock);
  if (list_empty(&sem->wait_list)) {
if (atomic_long_read(&sem->count) >= 0) {
  raw_spin_unlock_irq(&sem->wait_lock);
  return sem;

Suspected problem here is that last *_acquire on down_read() side
happens before write side issues *_release:
  1. writer: has the lock
  2. reader: down_read() issues *read_acquire on entry
  3. writer: mm->vmacache_seqnum++; downgrades lock (*fetch_add_release)
  4. reader: __rwsem_down_read_failed_common() finds it can take lock and 
returns
  5. reader: observes stale mm->vmacache_seqnum

I can reproduce the problem by running LTP mtest06 in a loop and building
kernel (-j $NCPUS) in parallel. It does reproduce since v4.20 up to v5.2
on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It triggers reliably
within ~hour. Patched kernel ran fine for 10+ hours with clean dmesg.
Tests were done against v5.2, since commit cf69482d62d9 ("locking/rwsem:
Enable readers spinning on writer") makes it much harder to reproduce.

v2: Move barrier after test (Waiman Long)
Use smp_acquire__after_ctrl_dep() (Peter Zijlstra)

Related: 
https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c
Related: commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in 
munmap")
Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty & 
no writer")

Signed-off-by: Jan Stancek 
Cc: sta...@vger.kernel.org # v4.20+
Cc: Waiman Long 
Cc: Davidlohr Bueso 
Cc: Will Deacon 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
---
 kernel/locking/rwsem.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 37524a47f002..5ac72b60608b 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1032,6 +1032,7 @@ static inline bool rwsem_reader_phase_trylock(struct 
rw_semaphore *sem,
 */
if (adjustment && !(atomic_long_read(&sem->count) &
 (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
+   smp_acquire__after_ctrl_dep();
raw_spin_unlock_irq(&sem->wait_lock);
rwsem_set_reader_owned(sem);
lockevent_inc(rwsem_rlock_fast);
-- 
1.8.3.1



Re: [PATCH] locking/rwsem: use read_acquire in read_slowpath exit when queue is empty

2019-07-16 Thread Jan Stancek



- Original Message -
> On 7/16/19 12:04 PM, Jan Stancek wrote:
> > LTP mtest06 has been observed to rarely hit "still mapped when deleted"
> > and following BUG_ON on arm64:
> >   page:7e02fa37e480 refcount:3 mapcount:1 mapping:80be3d678ab0
> >   index:0x0
> >   xfs_address_space_operations [xfs]
> >   flags: 0xbfffe00037(locked|referenced|uptodate|lru|active)
> >   page dumped because: VM_BUG_ON_PAGE(page_mapped(page))
> >   [ cut here ]
> >   kernel BUG at mm/filemap.c:171!
> >   Internal error: Oops - BUG: 0 [#1] SMP
> >   CPU: 220 PID: 154292 Comm: mmap1 Not tainted 5.2.0-0ecfebd.cki #1
> >   Hardware name: HPE Apollo 70 /C01_APACHE_MB , BIOS L50_5.13_1.10
> >   05/17/2019
> >   pstate: 40400089 (nZcv daIf +PAN -UAO)
> >   pc : unaccount_page_cache_page+0x17c/0x1a0
> >   lr : unaccount_page_cache_page+0x17c/0x1a0
> >   Call trace:
> >   unaccount_page_cache_page+0x17c/0x1a0
> >   delete_from_page_cache_batch+0xa0/0x300
> >   truncate_inode_pages_range+0x1b8/0x640
> >   truncate_inode_pages_final+0x88/0xa8
> >   evict+0x1a0/0x1d8
> >   iput+0x150/0x240
> >   dentry_unlink_inode+0x120/0x130
> >   __dentry_kill+0xd8/0x1d0
> >   dentry_kill+0x88/0x248
> >   dput+0x168/0x1b8
> >   __fput+0xe8/0x208
> >   fput+0x20/0x30
> >   task_work_run+0xc0/0xf0
> >   do_notify_resume+0x2b0/0x328
> >   work_pending+0x8/0x10
> >
> > The extra mapcount originated from pagefault handler, which handled
> > pagefault for vma that has already been detached. vma is detached
> > under mmap_sem write lock by detach_vmas_to_be_unmapped(), which
> > also invalidates vmacache.
> >
> > When pagefault handler (under mmap_sem read lock) called find_vma(),
> > vmacache_valid() wrongly reported vmacache as valid.
> >
> > After rwsem down_read() returns via 'queue empty' path (as of v5.2),
> > it does so without issuing read_acquire on sem->count:
> >   down_read
> > __down_read
> >   rwsem_down_read_failed
> > __rwsem_down_read_failed_common
> >   raw_spin_lock_irq(&sem->wait_lock);
> >   if (list_empty(&sem->wait_list)) {
> > if (atomic_long_read(&sem->count) >= 0) {
> >   raw_spin_unlock_irq(&sem->wait_lock);
> >   return sem;
> >
> > Suspected problem here is that last *_acquire on down_read() side
> > happens before write side issues *_release:
> >   1. writer: has the lock
> >   2. reader: down_read() issues *read_acquire on entry
> >   3. writer: mm->vmacache_seqnum++; downgrades lock (*fetch_add_release)
> >   4. reader: __rwsem_down_read_failed_common() finds it can take lock and
> >   returns
> >   5. reader: observes stale mm->vmacache_seqnum
> >
> > I can reproduce the problem by running LTP mtest06 in a loop and building
> > kernel (-j $NCPUS) in parallel. It does reproduce since v4.20 up to v5.2
> > on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It triggers reliably
> > within ~hour. Patched kernel ran fine for 5+ hours with clean dmesg.
> > Tests were done against v5.2, since commit cf69482d62d9 ("locking/rwsem:
> > Enable readers spinning on writer") makes it much harder to reproduce.
> The explanation makes sense to me. Thanks for the investigation.
> >
> > Related:
> > https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c
> > Related: commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in
> > munmap")
> > Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty
> > & no writer")
> >
> > Signed-off-by: Jan Stancek 
> > Cc: Waiman Long 
> > Cc: Davidlohr Bueso 
> > Cc: Will Deacon 
> > Cc: Peter Zijlstra 
> > Cc: Ingo Molnar 
> > ---
> >  kernel/locking/rwsem.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
> > index 37524a47f002..757b198d7a5b 100644
> > --- a/kernel/locking/rwsem.c
> > +++ b/kernel/locking/rwsem.c
> > @@ -1030,7 +1030,7 @@ static inline bool rwsem_reader_phase_trylock(struct
> > rw_semaphore *sem,
> >  * exit the slowpath and return immediately as its
> >  * RWSEM_READER_BIAS has already been set in the count.
> >  */
> > -   if (adjustment && !(atomic_long_read(&sem->count) &
> > +   if (adjustment && !(atomic_long_read_acquire(&sem->count) &
> >  (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
> > raw_spin_unlock_irq(&sem->wait_lock);
> > rwsem_set_reader_owned(sem);
> 
> The chance of taking this path is not that high. So instead of
> increasing the cost of the test by adding an acquire barrier,

Yes, that is good point.

> how about
> just adding smp_mb__after_spinlock() before spin_unlock_irq(). This
> should have the same effect of making sure that no stale data will be
> used in the read-lock critical section.

I'll redo the tests with smp_mb__after_spinlock().

Thanks,
Jan

> 
> -Longman
> 
> 


[PATCH] locking/rwsem: use read_acquire in read_slowpath exit when queue is empty

2019-07-16 Thread Jan Stancek
LTP mtest06 has been observed to rarely hit "still mapped when deleted"
and following BUG_ON on arm64:
  page:7e02fa37e480 refcount:3 mapcount:1 mapping:80be3d678ab0 index:0x0
  xfs_address_space_operations [xfs]
  flags: 0xbfffe00037(locked|referenced|uptodate|lru|active)
  page dumped because: VM_BUG_ON_PAGE(page_mapped(page))
  [ cut here ]
  kernel BUG at mm/filemap.c:171!
  Internal error: Oops - BUG: 0 [#1] SMP
  CPU: 220 PID: 154292 Comm: mmap1 Not tainted 5.2.0-0ecfebd.cki #1
  Hardware name: HPE Apollo 70 /C01_APACHE_MB , BIOS L50_5.13_1.10 05/17/2019
  pstate: 40400089 (nZcv daIf +PAN -UAO)
  pc : unaccount_page_cache_page+0x17c/0x1a0
  lr : unaccount_page_cache_page+0x17c/0x1a0
  Call trace:
  unaccount_page_cache_page+0x17c/0x1a0
  delete_from_page_cache_batch+0xa0/0x300
  truncate_inode_pages_range+0x1b8/0x640
  truncate_inode_pages_final+0x88/0xa8
  evict+0x1a0/0x1d8
  iput+0x150/0x240
  dentry_unlink_inode+0x120/0x130
  __dentry_kill+0xd8/0x1d0
  dentry_kill+0x88/0x248
  dput+0x168/0x1b8
  __fput+0xe8/0x208
  fput+0x20/0x30
  task_work_run+0xc0/0xf0
  do_notify_resume+0x2b0/0x328
  work_pending+0x8/0x10

The extra mapcount originated from pagefault handler, which handled
pagefault for vma that has already been detached. vma is detached
under mmap_sem write lock by detach_vmas_to_be_unmapped(), which
also invalidates vmacache.

When pagefault handler (under mmap_sem read lock) called find_vma(),
vmacache_valid() wrongly reported vmacache as valid.

After rwsem down_read() returns via 'queue empty' path (as of v5.2),
it does so without issuing read_acquire on sem->count:
  down_read
__down_read
  rwsem_down_read_failed
__rwsem_down_read_failed_common
  raw_spin_lock_irq(&sem->wait_lock);
  if (list_empty(&sem->wait_list)) {
if (atomic_long_read(&sem->count) >= 0) {
  raw_spin_unlock_irq(&sem->wait_lock);
  return sem;

Suspected problem here is that last *_acquire on down_read() side
happens before write side issues *_release:
  1. writer: has the lock
  2. reader: down_read() issues *read_acquire on entry
  3. writer: mm->vmacache_seqnum++; downgrades lock (*fetch_add_release)
  4. reader: __rwsem_down_read_failed_common() finds it can take lock and 
returns
  5. reader: observes stale mm->vmacache_seqnum

I can reproduce the problem by running LTP mtest06 in a loop and building
kernel (-j $NCPUS) in parallel. It does reproduce since v4.20 up to v5.2
on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It triggers reliably
within ~hour. Patched kernel ran fine for 5+ hours with clean dmesg.
Tests were done against v5.2, since commit cf69482d62d9 ("locking/rwsem:
Enable readers spinning on writer") makes it much harder to reproduce.

Related: 
https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c
Related: commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in 
munmap")
Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty & 
no writer")

Signed-off-by: Jan Stancek 
Cc: Waiman Long 
Cc: Davidlohr Bueso 
Cc: Will Deacon 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
---
 kernel/locking/rwsem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 37524a47f002..757b198d7a5b 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1030,7 +1030,7 @@ static inline bool rwsem_reader_phase_trylock(struct 
rw_semaphore *sem,
 * exit the slowpath and return immediately as its
 * RWSEM_READER_BIAS has already been set in the count.
 */
-   if (adjustment && !(atomic_long_read(&sem->count) &
+   if (adjustment && !(atomic_long_read_acquire(&sem->count) &
 (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
raw_spin_unlock_irq(&sem->wait_lock);
rwsem_set_reader_owned(sem);
-- 
1.8.3.1



Re: [LTP] [mm] 8c7829b04c: ltp.overcommit_memory01.fail

2019-05-20 Thread Jan Stancek



- Original Message -
> FYI, we noticed the following commit (built with gcc-7):
> 
> commit: 8c7829b04c523cdc732cb77f59f03320e09f3386 ("mm: fix false-positive
> OVERCOMMIT_GUESS failures")

This matches report from Naresh:
  http://lists.linux.it/pipermail/ltp/2019-May/011962.html

> https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git master
> 
> in testcase: ltp
> with following parameters:
> 
>   disk: 1HDD
>   test: mm-01
> 
> test-description: The LTP testsuite contains a collection of tools for
> testing the Linux kernel and related features.
> test-url: http://linux-test-project.github.io/
> 
> 
> on test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 4G
> 
> caused below changes (please refer to attached dmesg/kmsg for entire
> log/backtrace):
> 
> 
> If you fix the issue, kindly add following tag
> Reported-by: kernel test robot 
> 
> 
> [  554.112267 ] <<>>
> [  554.112269 ]
> [  554.115806 ] tag=overcommit_memory01 stime=1558074982
> [  554.115809 ]
> [  554.119303 ] cmdline="overcommit_memory"
> [  554.119306 ]
> [  554.121962 ] contacts=""
> [  554.121965 ]
> [  554.124459 ] analysis=exit
> [  554.124463 ]
> [  554.127140 ] <<>>
> [  554.127144 ]
> [  554.131401 ] tst_test.c:1096: INFO: Timeout per run is 0h 05m 00s
> [  554.131404 ]
> [  554.136248 ] overcommit_memory.c:117: INFO: MemTotal is 4033124 kB
> [  554.136251 ]
> [  554.141365 ] overcommit_memory.c:119: INFO: SwapTotal is 268435452 kB
> [  554.141368 ]
> [  554.146664 ] overcommit_memory.c:123: INFO: CommitLimit is 270452012 kB
> [  554.14 ]
> [  554.151375 ] mem.c:814: INFO: set overcommit_ratio to 50
> [  554.151378 ]
> [  554.155796 ] mem.c:814: INFO: set overcommit_memory to 2
> [  554.155799 ]
> [  554.159951 ] overcommit_memory.c:190: INFO: malloc 538608648 kB failed
> [  554.159953 ]
> [  554.164801 ] overcommit_memory.c:211: PASS: alloc failed as expected
> [  554.164804 ]
> [  554.170196 ] overcommit_memory.c:190: INFO: malloc 270452012 kB failed
> [  554.170199 ]
> [  554.175344 ] overcommit_memory.c:211: PASS: alloc failed as expected
> [  554.175347 ]
> [  554.180932 ] overcommit_memory.c:186: INFO: malloc 134651878 kB
> successfully
> [  554.180935 ]
> [  554.186522 ] overcommit_memory.c:205: PASS: alloc passed as expected
> [  554.186524 ]
> [  554.191234 ] mem.c:814: INFO: set overcommit_memory to 0
> [  554.191237 ]
> [  554.196930 ] overcommit_memory.c:186: INFO: malloc 134734122 kB
> successfully
> [  554.196933 ]
> [  554.202738 ] overcommit_memory.c:205: PASS: alloc passed as expected
> [  554.202742 ]
> [  554.208563 ] overcommit_memory.c:190: INFO: malloc 538936488 kB failed
> [  554.208566 ]
> [  554.214355 ] overcommit_memory.c:211: PASS: alloc failed as expected
> [  554.214357 ]
> [  554.220506 ] overcommit_memory.c:186: INFO: malloc 272468576 kB
> successfully
> [  554.220509 ]
> [  554.226564 ] overcommit_memory.c:213: FAIL: alloc passed, expected to fail
> [  554.226568 ]
> [  554.231870 ] mem.c:814: INFO: set overcommit_memory to 1
> [  554.231873 ]
> [  554.237819 ] overcommit_memory.c:186: INFO: malloc 136234288 kB
> successfully
> [  554.237821 ]
> [  554.243711 ] overcommit_memory.c:205: PASS: alloc passed as expected
> [  554.243713 ]
> [  554.249791 ] overcommit_memory.c:186: INFO: malloc 272468576 kB
> successfully
> [  554.249794 ]
> [  554.255069 ] overcommit_memory.c:205: PASS: alloc passed as expected
> [  554.255073 ]
> [  554.261594 ] overcommit_memory.c:186: INFO: malloc 544937152 kB
> successfully
> [  554.261597 ]
> [  554.267855 ] overcommit_memory.c:205: PASS: alloc passed as expected
> [  554.267858 ]
> [  554.273445 ] mem.c:814: INFO: set overcommit_memory to 0
> [  554.273448 ]
> [  554.278882 ] mem.c:814: INFO: set overcommit_ratio to 50
> [  554.278885 ]
> [  554.282419 ]
> [  554.284170 ] Summary:
> [  554.284173 ]
> [  554.287476 ] passed   8
> [  554.287480 ]
> [  554.290740 ] failed   1
> [  554.290743 ]
> [  554.293971 ] skipped  0
> [  554.293974 ]
> [  554.297123 ] warnings 0
> [  554.297126 ]
> [  554.300746 ] <<>>
> [  554.300749 ]
> [  554.304489 ] initiation_status="ok"
> [  554.304492 ]
> [  554.309762 ] duration=0 termination_type=exited termination_id=1
> corefile=no
> [  554.309765 ]
> [  554.313681 ] cutime=0 cstime=0
> [  554.313685 ]
> [  554.316880 ] <<>>
> 
> 
> 
> To reproduce:
> 
> # build kernel
>   cd linux
>   cp config-5.1.0-10246-g8c7829b .config
>   make HOSTCC=gcc-7 CC=gcc-7 ARCH=x86_64 olddefconfig
>   make HOSTCC=gcc-7 CC=gcc-7 ARCH=x86_64 prepare
>   make HOSTCC=gcc-7 CC=gcc-7 ARCH=x86_64 modules_prepare
>   make HOSTCC=gcc-7 CC=gcc-7 ARCH=x86_64 SHELL=/bin/bash
>   make HOSTCC=gcc-7 CC=gcc-7 ARCH=x86_64 bzImage
> 
> 
> git clone https://github.com/intel/lkp-tests.git
> cd lkp-tests
> find lib/ | cpio -o -H newc --quiet | gzip > modules.cgz
>   bin/lkp qemu -k  -m modules.cgz job-script # job-script is 
> attached
>   in this 

Re: [v2 PATCH] mm: mmu_gather: remove __tlb_reset_range() for force flush

2019-05-16 Thread Jan Stancek



- Original Message -
> On Mon, May 13, 2019 at 04:01:09PM -0700, Yang Shi wrote:
> > 
> > 
> > On 5/13/19 9:38 AM, Will Deacon wrote:
> > > On Fri, May 10, 2019 at 07:26:54AM +0800, Yang Shi wrote:
> > > > diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> > > > index 99740e1..469492d 100644
> > > > --- a/mm/mmu_gather.c
> > > > +++ b/mm/mmu_gather.c
> > > > @@ -245,14 +245,39 @@ void tlb_finish_mmu(struct mmu_gather *tlb,
> > > >   {
> > > > /*
> > > >  * If there are parallel threads are doing PTE changes on same 
> > > > range
> > > > -* under non-exclusive lock(e.g., mmap_sem read-side) but defer 
> > > > TLB
> > > > -* flush by batching, a thread has stable TLB entry can fail to 
> > > > flush
> > > > -* the TLB by observing pte_none|!pte_dirty, for example so 
> > > > flush TLB
> > > > -* forcefully if we detect parallel PTE batching threads.
> > > > +* under non-exclusive lock (e.g., mmap_sem read-side) but 
> > > > defer TLB
> > > > +* flush by batching, one thread may end up seeing inconsistent 
> > > > PTEs
> > > > +* and result in having stale TLB entries.  So flush TLB 
> > > > forcefully
> > > > +* if we detect parallel PTE batching threads.
> > > > +*
> > > > +* However, some syscalls, e.g. munmap(), may free page tables, 
> > > > this
> > > > +* needs force flush everything in the given range. Otherwise 
> > > > this
> > > > +* may result in having stale TLB entries for some 
> > > > architectures,
> > > > +* e.g. aarch64, that could specify flush what level TLB.
> > > >  */
> > > > -   if (mm_tlb_flush_nested(tlb->mm)) {
> > > > -   __tlb_reset_range(tlb);
> > > > -   __tlb_adjust_range(tlb, start, end - start);
> > > > +   if (mm_tlb_flush_nested(tlb->mm) && !tlb->fullmm) {
> > > > +   /*
> > > > +* Since we can't tell what we actually should have
> > > > +* flushed, flush everything in the given range.
> > > > +*/
> > > > +   tlb->freed_tables = 1;
> > > > +   tlb->cleared_ptes = 1;
> > > > +   tlb->cleared_pmds = 1;
> > > > +   tlb->cleared_puds = 1;
> > > > +   tlb->cleared_p4ds = 1;
> > > > +
> > > > +   /*
> > > > +* Some architectures, e.g. ARM, that have range 
> > > > invalidation
> > > > +* and care about VM_EXEC for I-Cache invalidation, 
> > > > need force
> > > > +* vma_exec set.
> > > > +*/
> > > > +   tlb->vma_exec = 1;
> > > > +
> > > > +   /* Force vma_huge clear to guarantee safer flush */
> > > > +   tlb->vma_huge = 0;
> > > > +
> > > > +   tlb->start = start;
> > > > +   tlb->end = end;
> > > > }
> > > Whilst I think this is correct, it would be interesting to see whether
> > > or not it's actually faster than just nuking the whole mm, as I mentioned
> > > before.
> > > 
> > > At least in terms of getting a short-term fix, I'd prefer the diff below
> > > if it's not measurably worse.
> > 
> > I did a quick test with ebizzy (96 threads with 5 iterations) on my x86 VM,
> > it shows slightly slowdown on records/s but much more sys time spent with
> > fullmm flush, the below is the data.
> > 
> >     nofullmm fullmm
> > ops (records/s)  225606  225119
> > sys (s)    0.69    1.14
> > 
> > It looks the slight reduction of records/s is caused by the increase of sys
> > time.
> 
> That's not what I expected, and I'm unable to explain why moving to fullmm
> would /increase/ the system time. I would've thought the time spent doing
> the invalidation would decrease, with the downside that the TLB is cold
> when returning back to userspace.
> 

I tried ebizzy with various parameters (malloc vs mmap, ran it for hour),
but performance was very similar for both patches.

So, I was looking for workload that would demonstrate the largest difference.
Inspired by python xml-rpc, which can handle each request in new thread,
I tried following [1]:

16 threads, each looping 100k times over:
  mmap(16M)
  touch 1 page
  madvise(DONTNEED)
  munmap(16M)

This yields quite significant difference for 2 patches when running on
my 46 CPU arm host. I checked it twice - applied patch, recompiled, rebooted,
but numbers stayed +- couple seconds the same.

Does it somewhat match your expectation?

v2 patch
-
real2m33.460s
user0m3.359s
sys 15m32.307s

real2m33.895s
user0m2.749s
sys 16m34.500s

real2m35.666s
user0m3.528s
sys 15m23.377s

real2m32.898s
user0m2.789s
sys 16m18.801s

real2m33.087s
user0m3.565s
sys 16m23.815s


fullmm version
---
real0m46.811s
user0m1.596s
sys 

Re: LTP: mm: overcommit_memory01, 03...06 failed

2019-05-15 Thread Jan Stancek



- Original Message -
> ltp-mm-tests failed on Linux mainline kernel  5.1.0,
>   * overcommit_memory01 overcommit_memory
>   * overcommit_memory03 overcommit_memory -R 30
>   * overcommit_memory04 overcommit_memory -R 80
>   * overcommit_memory05 overcommit_memory -R 100
>   * overcommit_memory06 overcommit_memory -R 200
> 
> mem.c:814: INFO: set overcommit_memory to 0
> overcommit_memory.c:185: INFO: malloc 8094844 kB successfully
> overcommit_memory.c:204: PASS: alloc passed as expected
> overcommit_memory.c:189: INFO: malloc 32379376 kB failed
> overcommit_memory.c:210: PASS: alloc failed as expected
> overcommit_memory.c:185: INFO: malloc 16360216 kB successfully
> overcommit_memory.c:212: FAIL: alloc passed, expected to fail
> 
> Failed test log,
> https://lkft.validation.linaro.org/scheduler/job/726417#L22852
> 
> LTP version 20190115
> 
> Test case link,
> https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/tunable/overcommit_memory.c#L212
> 
> First bad commit:
> git branch master
> git commit e0654264c4806dc436b291294a0fbf9be7571ab6
> git describe v5.1-10706-ge0654264c480
> git repo https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
> 
> Last good commit:
> git branch master
> git commit 7e9890a3500d95c01511a4c45b7e7192dfa47ae2
> git describe v5.1-10326-g7e9890a3500d
> git repo https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Heuristic changed a bit in:
  commit 8c7829b04c523cdc732cb77f59f03320e09f3386
  Author: Johannes Weiner 
  Date:   Mon May 13 17:21:50 2019 -0700
mm: fix false-positive OVERCOMMIT_GUESS failures

LTP tries to allocate "mem_total + swap_total":
  alloc_and_check(sum_total, EXPECT_FAIL);
which now presumably falls short to trigger failure.

> 
> Best regards
> Naresh Kamboju
> 


Re: [v2 PATCH] mm: mmu_gather: remove __tlb_reset_range() for force flush

2019-05-14 Thread Jan Stancek


- Original Message -
> 
> 
> On May 13, 2019 4:01 PM, Yang Shi  wrote:
> 
> 
> On 5/13/19 9:38 AM, Will Deacon wrote:
> > On Fri, May 10, 2019 at 07:26:54AM +0800, Yang Shi wrote:
> >> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> >> index 99740e1..469492d 100644
> >> --- a/mm/mmu_gather.c
> >> +++ b/mm/mmu_gather.c
> >> @@ -245,14 +245,39 @@ void tlb_finish_mmu(struct mmu_gather *tlb,
> >>   {
> >>   /*
> >>* If there are parallel threads are doing PTE changes on same range
> >> - * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB
> >> - * flush by batching, a thread has stable TLB entry can fail to flush
> >> - * the TLB by observing pte_none|!pte_dirty, for example so flush TLB
> >> - * forcefully if we detect parallel PTE batching threads.
> >> + * under non-exclusive lock (e.g., mmap_sem read-side) but defer TLB
> >> + * flush by batching, one thread may end up seeing inconsistent PTEs
> >> + * and result in having stale TLB entries.  So flush TLB forcefully
> >> + * if we detect parallel PTE batching threads.
> >> + *
> >> + * However, some syscalls, e.g. munmap(), may free page tables, this
> >> + * needs force flush everything in the given range. Otherwise this
> >> + * may result in having stale TLB entries for some architectures,
> >> + * e.g. aarch64, that could specify flush what level TLB.
> >>*/
> >> -if (mm_tlb_flush_nested(tlb->mm)) {
> >> -__tlb_reset_range(tlb);
> >> -__tlb_adjust_range(tlb, start, end - start);
> >> +if (mm_tlb_flush_nested(tlb->mm) && !tlb->fullmm) {
> >> +/*
> >> + * Since we can't tell what we actually should have
> >> + * flushed, flush everything in the given range.
> >> + */
> >> +tlb->freed_tables = 1;
> >> +tlb->cleared_ptes = 1;
> >> +tlb->cleared_pmds = 1;
> >> +tlb->cleared_puds = 1;
> >> +tlb->cleared_p4ds = 1;
> >> +
> >> +/*
> >> + * Some architectures, e.g. ARM, that have range invalidation
> >> + * and care about VM_EXEC for I-Cache invalidation, need
> >> force
> >> + * vma_exec set.
> >> + */
> >> +tlb->vma_exec = 1;
> >> +
> >> +/* Force vma_huge clear to guarantee safer flush */
> >> +tlb->vma_huge = 0;
> >> +
> >> +tlb->start = start;
> >> +tlb->end = end;
> >>   }
> > Whilst I think this is correct, it would be interesting to see whether
> > or not it's actually faster than just nuking the whole mm, as I mentioned
> > before.
> >
> > At least in terms of getting a short-term fix, I'd prefer the diff below
> > if it's not measurably worse.
> 
> I did a quick test with ebizzy (96 threads with 5 iterations) on my x86
> VM, it shows slightly slowdown on records/s but much more sys time spent
> with fullmm flush, the below is the data.
> 
>  nofullmm fullmm
> ops (records/s)  225606  225119
> sys (s)0.691.14
> 
> It looks the slight reduction of records/s is caused by the increase of
> sys time.
> 
> >
> > Will
> >
> > --->8
> >
> > diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> > index 99740e1dd273..cc251422d307 100644
> > --- a/mm/mmu_gather.c
> > +++ b/mm/mmu_gather.c
> > @@ -251,8 +251,9 @@ void tlb_finish_mmu(struct mmu_gather *tlb,
> > * forcefully if we detect parallel PTE batching threads.
> > */
> >if (mm_tlb_flush_nested(tlb->mm)) {
> > + tlb->fullmm = 1;
> >__tlb_reset_range(tlb);
> > - __tlb_adjust_range(tlb, start, end - start);
> > + tlb->freed_tables = 1;
> >}
> >
> >tlb_flush_mmu(tlb);
> 
> 
> I think that this should have set need_flush_all and not fullmm.
> 

Wouldn't that skip the flush?

If fullmm == 0, then __tlb_reset_range() sets tlb->end = 0.
  tlb_flush_mmu
tlb_flush_mmu_tlbonly
  if (!tlb->end)
 return

Replacing fullmm with need_flush_all, brings the problem back / reproducer 
hangs.


Re: [PATCH] mm: mmu_gather: remove __tlb_reset_range() for force flush

2019-05-09 Thread Jan Stancek


- Original Message -
> 
> 
> On 5/9/19 2:06 PM, Jan Stancek wrote:
> > - Original Message -
> >>
> >> On 5/9/19 11:24 AM, Peter Zijlstra wrote:
> >>> On Thu, May 09, 2019 at 05:36:29PM +, Nadav Amit wrote:
> >>>>> On May 9, 2019, at 3:38 AM, Peter Zijlstra 
> >>>>> wrote:
> >>>>> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> >>>>> index 99740e1dd273..fe768f8d612e 100644
> >>>>> --- a/mm/mmu_gather.c
> >>>>> +++ b/mm/mmu_gather.c
> >>>>> @@ -244,15 +244,20 @@ void tlb_finish_mmu(struct mmu_gather *tlb,
> >>>>> unsigned long start, unsigned long end)
> >>>>> {
> >>>>> /*
> >>>>> -* If there are parallel threads are doing PTE changes on same 
> >>>>> range
> >>>>> -* under non-exclusive lock(e.g., mmap_sem read-side) but defer 
> >>>>> TLB
> >>>>> -* flush by batching, a thread has stable TLB entry can fail to 
> >>>>> flush
> >>>>> -* the TLB by observing pte_none|!pte_dirty, for example so 
> >>>>> flush TLB
> >>>>> -* forcefully if we detect parallel PTE batching threads.
> >>>>> +* Sensible comment goes here..
> >>>>>  */
> >>>>> -   if (mm_tlb_flush_nested(tlb->mm)) {
> >>>>> -   __tlb_reset_range(tlb);
> >>>>> -   __tlb_adjust_range(tlb, start, end - start);
> >>>>> +   if (mm_tlb_flush_nested(tlb->mm) && !tlb->full_mm) {
> >>>>> +   /*
> >>>>> +* Since we're can't tell what we actually should have
> >>>>> +* flushed flush everything in the given range.
> >>>>> +*/
> >>>>> +   tlb->start = start;
> >>>>> +   tlb->end = end;
> >>>>> +   tlb->freed_tables = 1;
> >>>>> +   tlb->cleared_ptes = 1;
> >>>>> +   tlb->cleared_pmds = 1;
> >>>>> +   tlb->cleared_puds = 1;
> >>>>> +   tlb->cleared_p4ds = 1;
> >>>>> }
> >>>>>
> >>>>> tlb_flush_mmu(tlb);
> >>>> As a simple optimization, I think it is possible to hold multiple
> >>>> nesting
> >>>> counters in the mm, similar to tlb_flush_pending, for freed_tables,
> >>>> cleared_ptes, etc.
> >>>>
> >>>> The first time you set tlb->freed_tables, you also atomically increase
> >>>> mm->tlb_flush_freed_tables. Then, in tlb_flush_mmu(), you just use
> >>>> mm->tlb_flush_freed_tables instead of tlb->freed_tables.
> >>> That sounds fraught with races and expensive; I would much prefer to not
> >>> go there for this arguably rare case.
> >>>
> >>> Consider such fun cases as where CPU-0 sees and clears a PTE, CPU-1
> >>> races and doesn't see that PTE. Therefore CPU-0 sets and counts
> >>> cleared_ptes. Then if CPU-1 flushes while CPU-0 is still in mmu_gather,
> >>> it will see cleared_ptes count increased and flush that granularity,
> >>> OTOH if CPU-1 flushes after CPU-0 completes, it will not and potentiall
> >>> miss an invalidate it should have had.
> >>>
> >>> This whole concurrent mmu_gather stuff is horrible.
> >>>
> >>> /me ponders more
> >>>
> >>> So I think the fundamental race here is this:
> >>>
> >>>   CPU-0   CPU-1
> >>>
> >>>   tlb_gather_mmu(.start=1,tlb_gather_mmu(.start=2,
> >>>  .end=3);.end=4);
> >>>
> >>>   ptep_get_and_clear_full(2)
> >>>   tlb_remove_tlb_entry(2);
> >>>   __tlb_remove_page();
> >>>   if (pte_present(2)) // nope
> >>>
> >>>   tlb_finish_mmu();
> >>>
> >>>   // continue without TLBI(2)
> >>>   // whoopsie
> >>>
> >>>   tlb_finish_mmu();
> >>> tlb_f

Re: [PATCH] mm: mmu_gather: remove __tlb_reset_range() for force flush

2019-05-09 Thread Jan Stancek


- Original Message -
> 
> 
> On 5/9/19 11:24 AM, Peter Zijlstra wrote:
> > On Thu, May 09, 2019 at 05:36:29PM +, Nadav Amit wrote:
> >>> On May 9, 2019, at 3:38 AM, Peter Zijlstra  wrote:
> >>> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> >>> index 99740e1dd273..fe768f8d612e 100644
> >>> --- a/mm/mmu_gather.c
> >>> +++ b/mm/mmu_gather.c
> >>> @@ -244,15 +244,20 @@ void tlb_finish_mmu(struct mmu_gather *tlb,
> >>>   unsigned long start, unsigned long end)
> >>> {
> >>>   /*
> >>> -  * If there are parallel threads are doing PTE changes on same range
> >>> -  * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB
> >>> -  * flush by batching, a thread has stable TLB entry can fail to flush
> >>> -  * the TLB by observing pte_none|!pte_dirty, for example so flush TLB
> >>> -  * forcefully if we detect parallel PTE batching threads.
> >>> +  * Sensible comment goes here..
> >>>*/
> >>> - if (mm_tlb_flush_nested(tlb->mm)) {
> >>> - __tlb_reset_range(tlb);
> >>> - __tlb_adjust_range(tlb, start, end - start);
> >>> + if (mm_tlb_flush_nested(tlb->mm) && !tlb->full_mm) {
> >>> + /*
> >>> +  * Since we're can't tell what we actually should have
> >>> +  * flushed flush everything in the given range.
> >>> +  */
> >>> + tlb->start = start;
> >>> + tlb->end = end;
> >>> + tlb->freed_tables = 1;
> >>> + tlb->cleared_ptes = 1;
> >>> + tlb->cleared_pmds = 1;
> >>> + tlb->cleared_puds = 1;
> >>> + tlb->cleared_p4ds = 1;
> >>>   }
> >>>
> >>>   tlb_flush_mmu(tlb);
> >> As a simple optimization, I think it is possible to hold multiple nesting
> >> counters in the mm, similar to tlb_flush_pending, for freed_tables,
> >> cleared_ptes, etc.
> >>
> >> The first time you set tlb->freed_tables, you also atomically increase
> >> mm->tlb_flush_freed_tables. Then, in tlb_flush_mmu(), you just use
> >> mm->tlb_flush_freed_tables instead of tlb->freed_tables.
> > That sounds fraught with races and expensive; I would much prefer to not
> > go there for this arguably rare case.
> >
> > Consider such fun cases as where CPU-0 sees and clears a PTE, CPU-1
> > races and doesn't see that PTE. Therefore CPU-0 sets and counts
> > cleared_ptes. Then if CPU-1 flushes while CPU-0 is still in mmu_gather,
> > it will see cleared_ptes count increased and flush that granularity,
> > OTOH if CPU-1 flushes after CPU-0 completes, it will not and potentiall
> > miss an invalidate it should have had.
> >
> > This whole concurrent mmu_gather stuff is horrible.
> >
> >/me ponders more
> >
> > So I think the fundamental race here is this:
> >
> > CPU-0   CPU-1
> >
> > tlb_gather_mmu(.start=1,tlb_gather_mmu(.start=2,
> >.end=3);.end=4);
> >
> > ptep_get_and_clear_full(2)
> > tlb_remove_tlb_entry(2);
> > __tlb_remove_page();
> > if (pte_present(2)) // nope
> >
> > tlb_finish_mmu();
> >
> > // continue without TLBI(2)
> > // whoopsie
> >
> > tlb_finish_mmu();
> >   tlb_flush()   ->  TLBI(2)
> 
> I'm not quite sure if this is the case Jan really met. But, according to
> his test, once correct tlb->freed_tables and tlb->cleared_* are set, his
> test works well.

My theory was following sequence:

t1: map_write_unmap() t2: dummy()

  map_address = mmap()
  map_address[i] = 'b'
  munmap(map_address)
  downgrade_write(&mm->mmap_sem);
  unmap_region()
  tlb_gather_mmu()
inc_tlb_flush_pending(tlb->mm);
  free_pgtables()
tlb->freed_tables = 1
tlb->cleared_pmds = 1

pthread_exit()
madvise(thread_stack, 8M, MADV_DONTNEED)
  zap_page_range()
tlb_gather_mmu()
  inc_tlb_flush_pending(tlb->mm);

  tlb_finish_mmu()
if (mm_tlb_flush_nested(tlb->mm))
  __tlb_reset_range()
tlb->freed_tables = 0
tlb->cleared_pmds = 0
__flush_tlb_range(last_level = 0)
  ...
  map_address = mmap()
map_address[i] = 'b'
  
  # PTE appeared valid to me,
  # so I suspected stale TLB entry at higher level as result of 
"freed_tables = 0"


I'm happy to apply/run any debug patches to get more data that would help.

> 
> >
> >
> > And we can fix that by having tlb_finish_mmu() sync up. Never let a
> > concurrent tlb_finish_mmu() complete until all concurrenct mmu_gathers
> > have completed.
> 
> Not sure if this will scale well.
> 
> >
> > This should not be too hard to make happen.
> 
> 


Re: [PATCH] mm: mmu_gather: remove __tlb_reset_range() for force flush

2019-05-09 Thread Jan Stancek


> > I don't think we can elide the call __tlb_reset_range() entirely, since I
> > think we do want to clear the freed_pXX bits to ensure that we walk the
> > range with the smallest mapping granule that we have. Otherwise couldn't we
> > have a problem if we hit a PMD that had been cleared, but the TLB
> > invalidation for the PTEs that used to be linked below it was still
> > pending?
> 
> That's tlb->cleared_p*, and yes agreed. That is, right until some
> architecture has level dependent TLBI instructions, at which point we'll
> need to have them all set instead of cleared.
> 
> > Perhaps we should just set fullmm if we see that here's a concurrent
> > unmapper rather than do a worst-case range invalidation. Do you have a
> > feeling
> > for often the mm_tlb_flush_nested() triggers in practice?
> 
> Quite a bit for certain workloads I imagine, that was the whole point of
> doing it.
> 
> Anyway; am I correct in understanding that the actual problem is that
> we've cleared freed_tables and the ARM64 tlb_flush() will then not
> invalidate the cache and badness happens?

That is my understanding, only last level is flushed, which is not enough.

> 
> Because so far nobody has actually provided a coherent description of
> the actual problem we're trying to solve. But I'm thinking something
> like the below ought to do.

I applied it (and fixed small typo: s/tlb->full_mm/tlb->fullmm/).
It fixes the problem for me.

> 
> 
> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> index 99740e1dd273..fe768f8d612e 100644
> --- a/mm/mmu_gather.c
> +++ b/mm/mmu_gather.c
> @@ -244,15 +244,20 @@ void tlb_finish_mmu(struct mmu_gather *tlb,
>   unsigned long start, unsigned long end)
>  {
>   /*
> -  * If there are parallel threads are doing PTE changes on same range
> -  * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB
> -  * flush by batching, a thread has stable TLB entry can fail to flush
> -  * the TLB by observing pte_none|!pte_dirty, for example so flush TLB
> -  * forcefully if we detect parallel PTE batching threads.
> +  * Sensible comment goes here..
>*/
> - if (mm_tlb_flush_nested(tlb->mm)) {
> - __tlb_reset_range(tlb);
> - __tlb_adjust_range(tlb, start, end - start);
> + if (mm_tlb_flush_nested(tlb->mm) && !tlb->full_mm) {
> + /*
> +  * Since we're can't tell what we actually should have
> +  * flushed flush everything in the given range.
> +  */
> + tlb->start = start;
> + tlb->end = end;
> + tlb->freed_tables = 1;
> + tlb->cleared_ptes = 1;
> + tlb->cleared_pmds = 1;
> + tlb->cleared_puds = 1;
> + tlb->cleared_p4ds = 1;
>   }
>  
>   tlb_flush_mmu(tlb);
> 


[PATCH v3] mm/memory.c: do_fault: avoid usage of stale vm_area_struct

2019-03-02 Thread Jan Stancek
LTP testcase mtest06 [1] can trigger a crash on s390x running 5.0.0-rc8.
This is a stress test, where one thread mmaps/writes/munmaps memory area
and other thread is trying to read from it:

  CPU: 0 PID: 2611 Comm: mmap1 Not tainted 5.0.0-rc8+ #51
  Hardware name: IBM 2964 N63 400 (z/VM 6.4.0)
  Krnl PSW : 0404e0018000 001ac8d8 (__lock_acquire+0x7/0x7a8)
  Call Trace:
  ([<>]   (null))
   [<001adae4>] lock_acquire+0xec/0x258
   [<0080d1ac>] _raw_spin_lock_bh+0x5c/0x98
   [<0012a780>] page_table_free+0x48/0x1a8
   [<002f6e54>] do_fault+0xdc/0x670
   [<002fadae>] __handle_mm_fault+0x416/0x5f0
   [<002fb138>] handle_mm_fault+0x1b0/0x320
   [<001248cc>] do_dat_exception+0x19c/0x2c8
   [<0080e5ee>] pgm_check_handler+0x19e/0x200

page_table_free() is called with NULL mm parameter, but because
"0" is a valid address on s390 (see S390_lowcore), it keeps
going until it eventually crashes in lockdep's lock_acquire.
This crash is reproducible at least since 4.14.

Problem is that "vmf->vma" used in do_fault() can become stale.
Because mmap_sem may be released, other threads can come in,
call munmap() and cause "vma" be returned to kmem cache, and
get zeroed/re-initialized and re-used:

handle_mm_fault   |
  __handle_mm_fault   |
do_fault  |
  vma = vmf->vma  |
  do_read_fault   |
__do_fault|
  vma->vm_ops->fault(vmf);|
mmap_sem is released  |
  |
  | do_munmap()
  |   remove_vma_list()
  | remove_vma()
  |   vm_area_free()
  | # vma is released
  | ...
  | # same vma is allocated
  | # from kmem cache
  | do_mmap()
  |   vm_area_alloc()
  | memset(vma, 0, ...)
  |
  pte_free(vma->vm_mm, ...);  |
page_table_free   |
      spin_lock_bh(&mm->context.lock);|
   |

Cache mm_struct to avoid using potentially stale "vma".

[1] 
https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c

Signed-off-by: Jan Stancek 
Reviewed-by: Andrea Arcangeli 
---
 mm/memory.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mm/memory.c b/mm/memory.c
index e11ca9dd823f..e8d69ade5acc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3517,10 +3517,13 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
  * but allow concurrent faults).
  * The mmap_sem may have been released depending on flags and our
  * return value.  See filemap_fault() and __lock_page_or_retry().
+ * If mmap_sem is released, vma may become invalid (for example
+ * by other thread calling munmap()).
  */
 static vm_fault_t do_fault(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
+   struct mm_struct *vm_mm = vma->vm_mm;
vm_fault_t ret;
 
/*
@@ -3561,7 +3564,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
 
/* preallocated pagetable is unused: free it */
if (vmf->prealloc_pte) {
-   pte_free(vma->vm_mm, vmf->prealloc_pte);
+   pte_free(vm_mm, vmf->prealloc_pte);
vmf->prealloc_pte = NULL;
}
return ret;
-- 
1.8.3.1



Re: [PATCH v2] mm/memory.c: do_fault: avoid usage of stale vm_area_struct

2019-03-02 Thread Jan Stancek



- Original Message -
> Hello Jan,
> 
> On Sat, Mar 02, 2019 at 07:19:39PM +0100, Jan Stancek wrote:
> > +   struct mm_struct *vm_mm = READ_ONCE(vma->vm_mm);
> 
> The vma->vm_mm cannot change under gcc there, so no need of
> READ_ONCE. The release of mmap_sem has release semantics so the
> vma->vm_mm access cannot be reordered after up_read(mmap_sem) either.
> 
> Other than the above detail:
> 
> Reviewed-by: Andrea Arcangeli 

Thank you for review, I dropped READ_ONCE and sent v3 with your
Reviewed-by included. I also successfully re-ran tests over-night.

> Would this not need a corresponding WRITE_ONCE() in vma_init() ?

There's at least 2 context switches between, so I think it wouldn't matter.
My concern was gcc optimizing out vm_mm, and vma->vm_mm access happening only
after do_read_fault().



[PATCH v2] mm/memory.c: do_fault: avoid usage of stale vm_area_struct

2019-03-02 Thread Jan Stancek
LTP testcase mtest06 [1] can trigger a crash on s390x running 5.0.0-rc8.
This is a stress test, where one thread mmaps/writes/munmaps memory area
and other thread is trying to read from it:

  CPU: 0 PID: 2611 Comm: mmap1 Not tainted 5.0.0-rc8+ #51
  Hardware name: IBM 2964 N63 400 (z/VM 6.4.0)
  Krnl PSW : 0404e0018000 001ac8d8 (__lock_acquire+0x7/0x7a8)
  Call Trace:
  ([<>]   (null))
   [<001adae4>] lock_acquire+0xec/0x258
   [<0080d1ac>] _raw_spin_lock_bh+0x5c/0x98
   [<0012a780>] page_table_free+0x48/0x1a8
   [<002f6e54>] do_fault+0xdc/0x670
   [<002fadae>] __handle_mm_fault+0x416/0x5f0
   [<002fb138>] handle_mm_fault+0x1b0/0x320
   [<001248cc>] do_dat_exception+0x19c/0x2c8
   [<0080e5ee>] pgm_check_handler+0x19e/0x200

page_table_free() is called with NULL mm parameter, but because
"0" is a valid address on s390 (see S390_lowcore), it keeps
going until it eventually crashes in lockdep's lock_acquire.
This crash is reproducible at least since 4.14.

Problem is that "vmf->vma" used in do_fault() can become stale.
Because mmap_sem may be released, other threads can come in,
call munmap() and cause "vma" be returned to kmem cache, and
get zeroed/re-initialized and re-used:

handle_mm_fault   |
  __handle_mm_fault   |
do_fault  |
  vma = vmf->vma  |
  do_read_fault   |
__do_fault|
  vma->vm_ops->fault(vmf);|
mmap_sem is released  |
  |
  | do_munmap()
  |   remove_vma_list()
  | remove_vma()
  |   vm_area_free()
  | # vma is released
  | ...
  | # same vma is allocated
  | # from kmem cache
  | do_mmap()
  |   vm_area_alloc()
  | memset(vma, 0, ...)
  |
  pte_free(vma->vm_mm, ...);  |
page_table_free   |
      spin_lock_bh(&mm->context.lock);|
   |

Cache mm_struct to avoid using potentially stale "vma".

[1] 
https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c

Signed-off-by: Jan Stancek 
---
 mm/memory.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mm/memory.c b/mm/memory.c
index e11ca9dd823f..6c1afc1ece50 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3517,10 +3517,13 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
  * but allow concurrent faults).
  * The mmap_sem may have been released depending on flags and our
  * return value.  See filemap_fault() and __lock_page_or_retry().
+ * If mmap_sem is released, vma may become invalid (for example
+ * by other thread calling munmap()).
  */
 static vm_fault_t do_fault(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
+   struct mm_struct *vm_mm = READ_ONCE(vma->vm_mm);
vm_fault_t ret;
 
/*
@@ -3561,7 +3564,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
 
/* preallocated pagetable is unused: free it */
if (vmf->prealloc_pte) {
-   pte_free(vma->vm_mm, vmf->prealloc_pte);
+   pte_free(vm_mm, vmf->prealloc_pte);
vmf->prealloc_pte = NULL;
}
return ret;
-- 
1.8.3.1



Re: [PATCH] mm/memory.c: do_fault: avoid usage of stale vm_area_struct

2019-03-02 Thread Jan Stancek



- Original Message -
> On Sat, Mar 02, 2019 at 04:11:26PM +0100, Jan Stancek wrote:
> > Problem is that "vmf->vma" used in do_fault() can become stale.
> > Because mmap_sem may be released, other threads can come in,
> > call munmap() and cause "vma" be returned to kmem cache, and
> > get zeroed/re-initialized and re-used:
> 
> > This patch pins mm_struct and stores its value, to avoid using
> > potentially stale "vma" when calling pte_free().
> 
> OK, we need to cache the mm_struct, but why do we need the extra atomic op?
> There's surely no way the mm can be freed while the thread is in the middle
> of handling a fault.

You're right, I was needlessly paranoid.

> 
> ie I would drop these lines:

I'll send v2.

Thanks,
Jan

> 
> > +   mmgrab(vm_mm);
> > +
> ...
> > +
> > +   mmdrop(vm_mm);
> > +
> 


[PATCH] mm/memory.c: do_fault: avoid usage of stale vm_area_struct

2019-03-02 Thread Jan Stancek
LTP testcase mtest06 [1] can trigger a crash on s390x running 5.0.0-rc8.
This is a stress test, where one thread mmaps/writes/munmaps memory area
and other thread is trying to read from it:

  CPU: 0 PID: 2611 Comm: mmap1 Not tainted 5.0.0-rc8+ #51
  Hardware name: IBM 2964 N63 400 (z/VM 6.4.0)
  Krnl PSW : 0404e0018000 001ac8d8 (__lock_acquire+0x7/0x7a8)
  Call Trace:
  ([<>]   (null))
   [<001adae4>] lock_acquire+0xec/0x258
   [<0080d1ac>] _raw_spin_lock_bh+0x5c/0x98
   [<0012a780>] page_table_free+0x48/0x1a8
   [<002f6e54>] do_fault+0xdc/0x670
   [<002fadae>] __handle_mm_fault+0x416/0x5f0
   [<002fb138>] handle_mm_fault+0x1b0/0x320
   [<001248cc>] do_dat_exception+0x19c/0x2c8
   [<0080e5ee>] pgm_check_handler+0x19e/0x200

page_table_free() is called with NULL mm parameter, but because
"0" is a valid address on s390 (see S390_lowcore), it keeps
going until it eventually crashes in lockdep's lock_acquire.
This crash is reproducible at least since 4.14.

Problem is that "vmf->vma" used in do_fault() can become stale.
Because mmap_sem may be released, other threads can come in,
call munmap() and cause "vma" be returned to kmem cache, and
get zeroed/re-initialized and re-used:

handle_mm_fault   |
  __handle_mm_fault   |
do_fault  |
  vma = vmf->vma  |
  do_read_fault   |
__do_fault|
  vma->vm_ops->fault(vmf);|
mmap_sem is released  |
  |
  | do_munmap()
  |   remove_vma_list()
  | remove_vma()
  |   vm_area_free()
  | # vma is released
  | ...
  | # same vma is allocated
  | # from kmem cache
  | do_mmap()
  |   vm_area_alloc()
  | memset(vma, 0, ...)
  |
  pte_free(vma->vm_mm, ...);  |
page_table_free   |
  spin_lock_bh(&mm->context.lock);|
    <crash>  |

This patch pins mm_struct and stores its value, to avoid using
potentially stale "vma" when calling pte_free().

[1] 
https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c

Signed-off-by: Jan Stancek 
---
 mm/memory.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/mm/memory.c b/mm/memory.c
index e11ca9dd823f..1287ee9acbdc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3517,12 +3517,17 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
  * but allow concurrent faults).
  * The mmap_sem may have been released depending on flags and our
  * return value.  See filemap_fault() and __lock_page_or_retry().
+ * If mmap_sem is released, vma may become invalid (for example
+ * by other thread calling munmap()).
  */
 static vm_fault_t do_fault(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
+   struct mm_struct *vm_mm = READ_ONCE(vma->vm_mm);
vm_fault_t ret;
 
+   mmgrab(vm_mm);
+
/*
 * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
 */
@@ -3561,9 +3566,12 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
 
/* preallocated pagetable is unused: free it */
if (vmf->prealloc_pte) {
-   pte_free(vma->vm_mm, vmf->prealloc_pte);
+   pte_free(vm_mm, vmf->prealloc_pte);
vmf->prealloc_pte = NULL;
}
+
+   mmdrop(vm_mm);
+
return ret;
 }
 
-- 
1.8.3.1



Re: [PATCH v2] mm: page_mapped: don't assume compound page is huge or THP

2019-02-04 Thread Jan Stancek



- Original Message -
> On Fri, Nov 30, 2018 at 1:07 PM Jan Stancek  wrote:
> >
> > LTP proc01 testcase has been observed to rarely trigger crashes
> > on arm64:
> > page_mapped+0x78/0xb4
> > stable_page_flags+0x27c/0x338
> > kpageflags_read+0xfc/0x164
> > proc_reg_read+0x7c/0xb8
> > __vfs_read+0x58/0x178
> > vfs_read+0x90/0x14c
> > SyS_read+0x60/0xc0
> >
> > Issue is that page_mapped() assumes that if compound page is not
> > huge, then it must be THP. But if this is 'normal' compound page
> > (COMPOUND_PAGE_DTOR), then following loop can keep running
> > (for HPAGE_PMD_NR iterations) until it tries to read from memory
> > that isn't mapped and triggers a panic:
> > for (i = 0; i < hpage_nr_pages(page); i++) {
> > if (atomic_read(&page[i]._mapcount) >= 0)
> > return true;
> > }
> >
> > I could replicate this on x86 (v4.20-rc4-98-g60b548237fed) only
> > with a custom kernel module [1] which:
> > - allocates compound page (PAGEC) of order 1
> > - allocates 2 normal pages (COPY), which are initialized to 0xff
> >   (to satisfy _mapcount >= 0)
> > - 2 PAGEC page structs are copied to address of first COPY page
> > - second page of COPY is marked as not present
> > - call to page_mapped(COPY) now triggers fault on access to 2nd
> >   COPY page at offset 0x30 (_mapcount)
> >
> > [1]
> > https://github.com/jstancek/reproducers/blob/master/kernel/page_mapped_crash/repro.c
> >
> > Fix the loop to iterate for "1 << compound_order" pages.
> >
> > Debugged-by: Laszlo Ersek 
> > Suggested-by: "Kirill A. Shutemov" 
> > Signed-off-by: Jan Stancek 
> > ---
> >  mm/util.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > Changes in v2:
> > - change the loop instead so we check also mapcount of subpages
> >
> > diff --git a/mm/util.c b/mm/util.c
> > index 8bf08b5b5760..5c9c7359ee8a 100644
> > --- a/mm/util.c
> > +++ b/mm/util.c
> > @@ -478,7 +478,7 @@ bool page_mapped(struct page *page)
> > return true;
> > if (PageHuge(page))
> > return false;
> > -   for (i = 0; i < hpage_nr_pages(page); i++) {
> > +   for (i = 0; i < (1 << compound_order(page)); i++) {
> > if (atomic_read(&page[i]._mapcount) >= 0)
> > return true;
> > }
> > --
> > 1.8.3.1
> 
> Hi all
> 
> This patch landed in the 4.9-stable tree starting from 4.9.151 and it
> broke our MIPS1004kc system with CONFIG_HIGHMEM=y.

Hi,

are you using THP (CONFIG_TRANSPARENT_HUGEPAGE)?

The changed line should affect only THP and normal compound pages,
so a test with THP disabled might be interesting. 

> 
> The breakage consists of random processes dying with SIGILL or SIGSEGV
> when we stress test the system with high memory pressure and explicit
> memory compaction requested through /proc/sys/vm/compact_memory.
> Reverting this patch fixes the crashes.
> 
> We can put some effort on debugging if there are no obvious
> explanations for this. Keep in mind that this is 32-bit system with
> HIGHMEM.

Nothing obvious that I can see. I've been trying to reproduce on
32-bit x86 Fedora with no luck so far.

Regards,
Jan


[PATCH v2] mm: page_mapped: don't assume compound page is huge or THP

2018-11-30 Thread Jan Stancek
LTP proc01 testcase has been observed to rarely trigger crashes
on arm64:
page_mapped+0x78/0xb4
stable_page_flags+0x27c/0x338
kpageflags_read+0xfc/0x164
proc_reg_read+0x7c/0xb8
__vfs_read+0x58/0x178
vfs_read+0x90/0x14c
SyS_read+0x60/0xc0

Issue is that page_mapped() assumes that if compound page is not
huge, then it must be THP. But if this is 'normal' compound page
(COMPOUND_PAGE_DTOR), then following loop can keep running
(for HPAGE_PMD_NR iterations) until it tries to read from memory
that isn't mapped and triggers a panic:
for (i = 0; i < hpage_nr_pages(page); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
}

I could replicate this on x86 (v4.20-rc4-98-g60b548237fed) only
with a custom kernel module [1] which:
- allocates compound page (PAGEC) of order 1
- allocates 2 normal pages (COPY), which are initialized to 0xff
  (to satisfy _mapcount >= 0)
- 2 PAGEC page structs are copied to address of first COPY page
- second page of COPY is marked as not present
- call to page_mapped(COPY) now triggers fault on access to 2nd
  COPY page at offset 0x30 (_mapcount)

[1] 
https://github.com/jstancek/reproducers/blob/master/kernel/page_mapped_crash/repro.c

Fix the loop to iterate for "1 << compound_order" pages.

Debugged-by: Laszlo Ersek 
Suggested-by: "Kirill A. Shutemov" 
Signed-off-by: Jan Stancek 
---
 mm/util.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Changes in v2:
- change the loop instead so we check also mapcount of subpages

diff --git a/mm/util.c b/mm/util.c
index 8bf08b5b5760..5c9c7359ee8a 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -478,7 +478,7 @@ bool page_mapped(struct page *page)
return true;
if (PageHuge(page))
return false;
-   for (i = 0; i < hpage_nr_pages(page); i++) {
+   for (i = 0; i < (1 << compound_order(page)); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
}
-- 
1.8.3.1



[PATCH v2] mm: page_mapped: don't assume compound page is huge or THP

2018-11-30 Thread Jan Stancek
LTP proc01 testcase has been observed to rarely trigger crashes
on arm64:
page_mapped+0x78/0xb4
stable_page_flags+0x27c/0x338
kpageflags_read+0xfc/0x164
proc_reg_read+0x7c/0xb8
__vfs_read+0x58/0x178
vfs_read+0x90/0x14c
SyS_read+0x60/0xc0

Issue is that page_mapped() assumes that if compound page is not
huge, then it must be THP. But if this is 'normal' compound page
(COMPOUND_PAGE_DTOR), then following loop can keep running
(for HPAGE_PMD_NR iterations) until it tries to read from memory
that isn't mapped and triggers a panic:
for (i = 0; i < hpage_nr_pages(page); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
}

I could replicate this on x86 (v4.20-rc4-98-g60b548237fed) only
with a custom kernel module [1] which:
- allocates compound page (PAGEC) of order 1
- allocates 2 normal pages (COPY), which are initialized to 0xff
  (to satisfy _mapcount >= 0)
- 2 PAGEC page structs are copied to address of first COPY page
- second page of COPY is marked as not present
- call to page_mapped(COPY) now triggers fault on access to 2nd
  COPY page at offset 0x30 (_mapcount)

[1] 
https://github.com/jstancek/reproducers/blob/master/kernel/page_mapped_crash/repro.c

Fix the loop to iterate for "1 << compound_order" pages.

Debugged-by: Laszlo Ersek 
Suggested-by: "Kirill A. Shutemov" 
Signed-off-by: Jan Stancek 
---
 mm/util.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Changes in v2:
- change the loop instead so we check also mapcount of subpages

diff --git a/mm/util.c b/mm/util.c
index 8bf08b5b5760..5c9c7359ee8a 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -478,7 +478,7 @@ bool page_mapped(struct page *page)
return true;
if (PageHuge(page))
return false;
-   for (i = 0; i < hpage_nr_pages(page); i++) {
+   for (i = 0; i < (1 << compound_order(page)); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
}
-- 
1.8.3.1



[PATCH] mm: page_mapped: don't assume compound page is huge or THP

2018-11-29 Thread Jan Stancek
LTP proc01 testcase has been observed to rarely trigger crashes
on arm64:
page_mapped+0x78/0xb4
stable_page_flags+0x27c/0x338
kpageflags_read+0xfc/0x164
proc_reg_read+0x7c/0xb8
__vfs_read+0x58/0x178
vfs_read+0x90/0x14c
SyS_read+0x60/0xc0

Issue is that page_mapped() assumes that if compound page is not
huge, then it must be THP. But if this is 'normal' compound page
(COMPOUND_PAGE_DTOR), then following loop can keep running until
it tries to read from memory that isn't mapped and triggers a panic:
for (i = 0; i < hpage_nr_pages(page); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
}

I could replicate this on x86 (v4.20-rc4-98-g60b548237fed) only
with a custom kernel module [1] which:
- allocates compound page (PAGEC) of order 1
- allocates 2 normal pages (COPY), which are initialized to 0xff
  (to satisfy _mapcount >= 0)
- 2 PAGEC page structs are copied to address of first COPY page
- second page of COPY is marked as not present
- call to page_mapped(COPY) now triggers fault on access to 2nd COPY
  page at offset 0x30 (_mapcount)

[1] 
https://github.com/jstancek/reproducers/blob/master/kernel/page_mapped_crash/repro.c

This patch modifies page_mapped() to check for 'normal'
compound pages (COMPOUND_PAGE_DTOR).

Debugged-by: Laszlo Ersek 
Signed-off-by: Jan Stancek 
---
 include/linux/mm.h | 9 +
 mm/util.c  | 2 ++
 2 files changed, 11 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5411de93a363..18b0bb953f92 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -700,6 +700,15 @@ static inline compound_page_dtor 
*get_compound_page_dtor(struct page *page)
return compound_page_dtors[page[1].compound_dtor];
 }
 
+static inline int PageNormalCompound(struct page *page)
+{
+   if (!PageCompound(page))
+   return 0;
+
+   page = compound_head(page);
+   return page[1].compound_dtor == COMPOUND_PAGE_DTOR;
+}
+
 static inline unsigned int compound_order(struct page *page)
 {
if (!PageHead(page))
diff --git a/mm/util.c b/mm/util.c
index 8bf08b5b5760..06c1640cb7b3 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -478,6 +478,8 @@ bool page_mapped(struct page *page)
return true;
if (PageHuge(page))
return false;
+   if (PageNormalCompound(page))
+   return false;
for (i = 0; i < hpage_nr_pages(page); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
-- 
1.8.3.1



[PATCH] mm: page_mapped: don't assume compound page is huge or THP

2018-11-29 Thread Jan Stancek
LTP proc01 testcase has been observed to rarely trigger crashes
on arm64:
page_mapped+0x78/0xb4
stable_page_flags+0x27c/0x338
kpageflags_read+0xfc/0x164
proc_reg_read+0x7c/0xb8
__vfs_read+0x58/0x178
vfs_read+0x90/0x14c
SyS_read+0x60/0xc0

Issue is that page_mapped() assumes that if compound page is not
huge, then it must be THP. But if this is 'normal' compound page
(COMPOUND_PAGE_DTOR), then following loop can keep running until
it tries to read from memory that isn't mapped and triggers a panic:
for (i = 0; i < hpage_nr_pages(page); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
}

I could replicate this on x86 (v4.20-rc4-98-g60b548237fed) only
with a custom kernel module [1] which:
- allocates compound page (PAGEC) of order 1
- allocates 2 normal pages (COPY), which are initialized to 0xff
  (to satisfy _mapcount >= 0)
- 2 PAGEC page structs are copied to address of first COPY page
- second page of COPY is marked as not present
- call to page_mapped(COPY) now triggers fault on access to 2nd COPY
  page at offset 0x30 (_mapcount)

[1] 
https://github.com/jstancek/reproducers/blob/master/kernel/page_mapped_crash/repro.c

This patch modifies page_mapped() to check for 'normal'
compound pages (COMPOUND_PAGE_DTOR).

Debugged-by: Laszlo Ersek 
Signed-off-by: Jan Stancek 
---
 include/linux/mm.h | 9 +
 mm/util.c  | 2 ++
 2 files changed, 11 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5411de93a363..18b0bb953f92 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -700,6 +700,15 @@ static inline compound_page_dtor 
*get_compound_page_dtor(struct page *page)
return compound_page_dtors[page[1].compound_dtor];
 }
 
+static inline int PageNormalCompound(struct page *page)
+{
+   if (!PageCompound(page))
+   return 0;
+
+   page = compound_head(page);
+   return page[1].compound_dtor == COMPOUND_PAGE_DTOR;
+}
+
 static inline unsigned int compound_order(struct page *page)
 {
if (!PageHead(page))
diff --git a/mm/util.c b/mm/util.c
index 8bf08b5b5760..06c1640cb7b3 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -478,6 +478,8 @@ bool page_mapped(struct page *page)
return true;
if (PageHuge(page))
return false;
+   if (PageNormalCompound(page))
+   return false;
for (i = 0; i < hpage_nr_pages(page); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
-- 
1.8.3.1



Re: [LTP] [PATCH 4.17 00/67] 4.17.7-stable review

2018-07-16 Thread Jan Stancek



- Original Message -
> On 16 July 2018 at 13:04, Greg Kroah-Hartman 
> wrote:
> > This is the start of the stable review cycle for the 4.17.7 release.
> > There are 67 patches in this series, all will be posted as a response
> > to this one.  If anyone has any issues with these being applied, please
> > let me know.
> >
> > Responses should be made by Wed Jul 18 07:34:11 UTC 2018.
> > Anything received after that time might be too late.
> >
> > The whole patch series can be found in one patch at:
> > 
> > https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.17.7-rc1.gz
> > or in the git tree and branch at:
> > 
> > git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
> > linux-4.17.y
> > and the diffstat can be found below.
> >
> > thanks,
> >
> > greg k-h
> 
> 
> Results from Linaro’s test farm.
> Regressions detected.
> 
> LTP syscalls failed test cases on all devices arm64, arm32 and x86_64,
>  - creat08
>  - open10

This is consequence of:
  0fa3ecd87848 ("Fix up non-directory creation in SGID directories")

So likely course of action here will be modifying both tests to not
expect S_ISGID in scenario described in commit above.

[adding Dave Chinner]

@Dave: Does XFS need separate patch for problem described in commit
0fa3ecd87848? XFS doesn't appear to use inode_init_owner() function,
both tests currently PASS the check for S_ISGID on file created by
non-group member as of v4.18-rc5.

Regards,
Jan

> 
> Reported this bug internally on upstream Linux mainline week back.
> Now this bug happening on 4.17, 4.14, 4.9 and 4.4
> 
> creat08 and open10 failed with this error,
> TFAIL  : testdir.B.3132/setgid: Incorrect modes, setgid bit should be set
> 
> Test case description:
> /*
>  * NAME
>  * creat08.c - Verifies that the group ID and setgid bit are
>  *set correctly when a new file is created.
>  *(ported from SPIE, section2/iosuite/creat5.c,
>  * by Airong Zhang )
>  * CALLS
>  * creat
>  *
>  * ALGORITHM
>  * Create two directories, one with the group ID of this process
>  * and the setgid bit not set, and the other with a group ID
>  * other than that of this process and with the setgid bit set.
>  * In each directory, create a file with and without the setgid
>  * bit set in the creation modes. Verify that the modes and group
>  * ID are correct on each of the 4 files.
>  * As root, create a file with the setgid bit on in the
>  * directory with the setgid bit.
>  * This tests the SVID3 create group semantics.
>  */
> 
> Block2 testing:
> /*--*/
> /* Block2: Create two files in testdir.B, one with the setgid   */
> /* bit set in the creation modes and the other without. */
> /*Both should inherit the group ID of the parent   */
> /*directory, group2.   */
> /*--*/
> 
> 
> Test results comparison on mainline versions,
> https://qa-reports.linaro.org/lkft/linux-mainline-oe/tests/ltp-syscalls-tests/creat08
> https://qa-reports.linaro.org/lkft/linux-mainline-oe/tests/ltp-syscalls-tests/open10
> 
> Test results comparison on 4.17 versions,
> https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/tests/ltp-syscalls-tests/creat08
> https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/tests/ltp-syscalls-tests/open10
> 
> Bug report link,
> https://bugs.linaro.org/show_bug.cgi?id=3940
> 
> Summary
> 
> 
> kernel: 4.17.7-rc1
> git repo:
> https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
> git branch: linux-4.17.y
> git commit: bc0bd9e05fa1e213c689620eb4cba825c03dcc4a
> git describe: v4.17.6-68-gbc0bd9e05fa1
> Test details:
> https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/build/v4.17.6-68-gbc0bd9e05fa1
> 
> Regressions (compared to build v4.17.6-67-g3b02b1fd1975)
> 
> 
> LTP syscalls failed test cases on all devices,
> 
>  - creat08
>  - open10
> 
> --
> Linaro LKFT
> https://lkft.linaro.org
> 
> --
> Mailing list info: https://lists.linux.it/listinfo/ltp
> 


Re: [LTP] [PATCH 4.17 00/67] 4.17.7-stable review

2018-07-16 Thread Jan Stancek



- Original Message -
> On 16 July 2018 at 13:04, Greg Kroah-Hartman 
> wrote:
> > This is the start of the stable review cycle for the 4.17.7 release.
> > There are 67 patches in this series, all will be posted as a response
> > to this one.  If anyone has any issues with these being applied, please
> > let me know.
> >
> > Responses should be made by Wed Jul 18 07:34:11 UTC 2018.
> > Anything received after that time might be too late.
> >
> > The whole patch series can be found in one patch at:
> > 
> > https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.17.7-rc1.gz
> > or in the git tree and branch at:
> > 
> > git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
> > linux-4.17.y
> > and the diffstat can be found below.
> >
> > thanks,
> >
> > greg k-h
> 
> 
> Results from Linaro’s test farm.
> Regressions detected.
> 
> LTP syscalls failed test cases on all devices arm64, arm32 and x86_64,
>  - creat08
>  - open10

This is consequence of:
  0fa3ecd87848 ("Fix up non-directory creation in SGID directories")

So likely course of action here will be modifying both tests to not
expect S_ISGID in scenario described in commit above.

[adding Dave Chinner]

@Dave: Does XFS need separate patch for problem described in commit
0fa3ecd87848? XFS doesn't appear to use inode_init_owner() function,
both tests currently PASS the check for S_ISGID on file created by
non-group member as of v4.18-rc5.

Regards,
Jan

> 
> Reported this bug internally on upstream Linux mainline week back.
> Now this bug happening on 4.17, 4.14, 4.9 and 4.4
> 
> creat08 and open10 failed with this error,
> TFAIL  : testdir.B.3132/setgid: Incorrect modes, setgid bit should be set
> 
> Test case description:
> /*
>  * NAME
>  * creat08.c - Verifies that the group ID and setgid bit are
>  *set correctly when a new file is created.
>  *(ported from SPIE, section2/iosuite/creat5.c,
>  * by Airong Zhang )
>  * CALLS
>  * creat
>  *
>  * ALGORITHM
>  * Create two directories, one with the group ID of this process
>  * and the setgid bit not set, and the other with a group ID
>  * other than that of this process and with the setgid bit set.
>  * In each directory, create a file with and without the setgid
>  * bit set in the creation modes. Verify that the modes and group
>  * ID are correct on each of the 4 files.
>  * As root, create a file with the setgid bit on in the
>  * directory with the setgid bit.
>  * This tests the SVID3 create group semantics.
>  */
> 
> Block2 testing:
> /*--*/
> /* Block2: Create two files in testdir.B, one with the setgid   */
> /* bit set in the creation modes and the other without. */
> /*Both should inherit the group ID of the parent   */
> /*directory, group2.   */
> /*--*/
> 
> 
> Test results comparison on mainline versions,
> https://qa-reports.linaro.org/lkft/linux-mainline-oe/tests/ltp-syscalls-tests/creat08
> https://qa-reports.linaro.org/lkft/linux-mainline-oe/tests/ltp-syscalls-tests/open10
> 
> Test results comparison on 4.17 versions,
> https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/tests/ltp-syscalls-tests/creat08
> https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/tests/ltp-syscalls-tests/open10
> 
> Bug report link,
> https://bugs.linaro.org/show_bug.cgi?id=3940
> 
> Summary
> 
> 
> kernel: 4.17.7-rc1
> git repo:
> https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
> git branch: linux-4.17.y
> git commit: bc0bd9e05fa1e213c689620eb4cba825c03dcc4a
> git describe: v4.17.6-68-gbc0bd9e05fa1
> Test details:
> https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/build/v4.17.6-68-gbc0bd9e05fa1
> 
> Regressions (compared to build v4.17.6-67-g3b02b1fd1975)
> 
> 
> LTP syscalls failed test cases on all devices,
> 
>  - creat08
>  - open10
> 
> --
> Linaro LKFT
> https://lkft.linaro.org
> 
> --
> Mailing list info: https://lists.linux.it/listinfo/ltp
> 


Re: LTP CVE cve-2017-17053 test failed on x86_64 device

2018-06-20 Thread Jan Stancek



- Original Message -
> LTP CVE cve-2017-17053 test failed on x86_64 device.
> FAIL on linux-next, mainline, and stable-rc-4.17.
> PASS on stable-rc 4.16, 4.14, 4.9 and 4.4 kernel.
> 
> Test FAIL case output,
> tst_test.c:1015: INFO: Timeout per run is 0h 15m 00s
> tst_taint.c:88: BROK: Kernel is already tainted: 512

You seem to be running into some nfs warning early during boot [1][2],
that taints the kernel. cve-2017-17053 test doesn't do much,
it fails in setup because it finds kernel already tainted.

Looks similar to:
  https://www.spinics.net/lists/linux-nfs/msg68064.html

Regards,
Jan

[1]
[   78.886529] [ cut here ]
[   78.891155] DEBUG_LOCKS_WARN_ON(sem->owner != ((struct task_struct *)(1UL << 
0)))
[   78.891161] WARNING: CPU: 0 PID: 33 at 
/srv/oe/build/tmp-rpb-glibc/work-shared/intel-core2-32/kernel-source/kernel/locking/rwsem.c:217
 up_read_non_owner+0x5d/0x70
[   78.913141] Modules linked in: fuse
[   78.916633] CPU: 0 PID: 33 Comm: kworker/0:1 Not tainted 4.18.0-rc1 #1
[   78.923150] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS 2.0b 
07/27/2017
[   78.930623] Workqueue: nfsiod rpc_async_release
[   78.935154] RIP: 0010:up_read_non_owner+0x5d/0x70
[   78.939851] Code: 00 5b 5d c3 e8 54 9f 3a 00 85 c0 74 de 8b 05 ba 40 6a 02 
85 c0 75 d4 48 c7 c6 c8 ce 8f 9c 48 c7 c7 93 8e 8e 9c e8 33 6b fa ff <0f> 0b eb 
bd 0f 1f 44 00 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 
[   78.958717] RSP: 0018:b46b419ebdd0 EFLAGS: 00010282
[   78.963935] RAX:  RBX: 882d1c390130 RCX: 0006
[   78.971059] RDX: 0007 RSI: 0001 RDI: 882d2fc15730
[   78.978184] RBP: b46b419ebdd8 R08:  R09: 
[   78.985318] R10: b46b419ebdd0 R11:  R12: 882d1ad56800
[   78.985319] R13: 882d1d133000 R14:  R15: 882d1f047840
[   78.985319] FS:  () GS:882d2fc0() 
knlGS:
[   78.985320] CS:  0010 DS:  ES:  CR0: 80050033
[   78.985321] CR2: 56385d962ae0 CR3: 21e1e002 CR4: 003606f0
[   78.985321] DR0:  DR1:  DR2: 
[   78.985322] DR3:  DR6: fffe0ff0 DR7: 0400
[   78.985322] Call Trace:
[[0;32m  OK  [[   78.985327]  nfs_async_unlink_release+0x32/0x80
[   78.985329]  rpc_free_task+0x30/0x50
[   78.985330]  rpc_async_release+0x12/0x20
[   78.985332]  process_one_work+0x278/0x670
0m] [   78.985335]  worker_thread+0x4d/0x410
[   78.985337]  kthread+0x10d/0x140
[   78.985338]  ? rescuer_thread+0x3a0/0x3a0
[   78.985339]  ? kthread_create_worker_on_cpu+0x70/0x70
[   78.985341]  ret_from_fork+0x3a/0x50
[   78.985344] irq event stamp: 36545
[   78.985346] hardirqs last  enabled at (36545): [] 
_raw_spin_unlock_irq+0x2c/0x40
[   78.985347] hardirqs last disabled at (36544): [] 
_raw_spin_lock_irq+0x13/0x50
[   78.985348] softirqs last  enabled at (34878): [] 
reg_todo+0x260/0x2f0
[   78.985350] softirqs last disabled at (34876): [] 
reg_todo+0x171/0x2f0
[   79.111787] ---[ end trace bed1f41cfea3a4c5 ]---

[2] https://lkft.validation.linaro.org/scheduler/job/290429#L893

> Summary:
> passed   0
> failed   0
> skipped  0
> warnings 0
> 
> Test comments,
> /* Regression test for CVE-2017-17053, original reproducer can be found
>  * here:
>  *
>  
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ccd5b3235180eef3cfec337df1c8554ab151b5cc
>  *
>  * Be careful! This test may crash your kernel!
>  */
> 
> Full test log:
> https://lkft.validation.linaro.org/scheduler/job/290429#L9581
> https://lkft.validation.linaro.org/scheduler/job/259217#L10308
> 
> History of the test case shows failed on
> Linux next kernel,
> https://qa-reports.linaro.org/lkft/linux-next-oe/tests/ltp-cve-tests/cve-2017-17053
> 
> Linux mainline kernel,
> https://qa-reports.linaro.org/lkft/linux-mainline-oe/tests/ltp-cve-tests \
> /cve-2017-17053
> ^ Please join link
> 
> Linux stable rc 4.17,
> https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/tests/ltp-cve-tests
> \
> /cve-2017-17053
> ^ Please join link
> 
> Test PASS on 4.16 kernel.
> https://qa-reports.linaro.org/lkft/linux-stable-rc-4.16-oe/tests/ltp-cve-tests
> \
> /cve-2017-17053
> ^ Please join link
> 
> Best regards
> Naresh Kamboju
> 


Re: LTP CVE cve-2017-17053 test failed on x86_64 device

2018-06-20 Thread Jan Stancek



- Original Message -
> LTP CVE cve-2017-17053 test failed on x86_64 device.
> FAIL on linux-next, mainline, and stable-rc-4.17.
> PASS on stable-rc 4.16, 4.14, 4.9 and 4.4 kernel.
> 
> Test FAIL case output,
> tst_test.c:1015: INFO: Timeout per run is 0h 15m 00s
> tst_taint.c:88: BROK: Kernel is already tainted: 512

You seem to be running into some nfs warning early during boot [1][2],
that taints the kernel. cve-2017-17053 test doesn't do much,
it fails in setup because it finds kernel already tainted.

Looks similar to:
  https://www.spinics.net/lists/linux-nfs/msg68064.html

Regards,
Jan

[1]
[   78.886529] [ cut here ]
[   78.891155] DEBUG_LOCKS_WARN_ON(sem->owner != ((struct task_struct *)(1UL << 
0)))
[   78.891161] WARNING: CPU: 0 PID: 33 at 
/srv/oe/build/tmp-rpb-glibc/work-shared/intel-core2-32/kernel-source/kernel/locking/rwsem.c:217
 up_read_non_owner+0x5d/0x70
[   78.913141] Modules linked in: fuse
[   78.916633] CPU: 0 PID: 33 Comm: kworker/0:1 Not tainted 4.18.0-rc1 #1
[   78.923150] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS 2.0b 
07/27/2017
[   78.930623] Workqueue: nfsiod rpc_async_release
[   78.935154] RIP: 0010:up_read_non_owner+0x5d/0x70
[   78.939851] Code: 00 5b 5d c3 e8 54 9f 3a 00 85 c0 74 de 8b 05 ba 40 6a 02 
85 c0 75 d4 48 c7 c6 c8 ce 8f 9c 48 c7 c7 93 8e 8e 9c e8 33 6b fa ff <0f> 0b eb 
bd 0f 1f 44 00 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 
[   78.958717] RSP: 0018:b46b419ebdd0 EFLAGS: 00010282
[   78.963935] RAX:  RBX: 882d1c390130 RCX: 0006
[   78.971059] RDX: 0007 RSI: 0001 RDI: 882d2fc15730
[   78.978184] RBP: b46b419ebdd8 R08:  R09: 
[   78.985318] R10: b46b419ebdd0 R11:  R12: 882d1ad56800
[   78.985319] R13: 882d1d133000 R14:  R15: 882d1f047840
[   78.985319] FS:  () GS:882d2fc0() 
knlGS:
[   78.985320] CS:  0010 DS:  ES:  CR0: 80050033
[   78.985321] CR2: 56385d962ae0 CR3: 21e1e002 CR4: 003606f0
[   78.985321] DR0:  DR1:  DR2: 
[   78.985322] DR3:  DR6: fffe0ff0 DR7: 0400
[   78.985322] Call Trace:
[[0;32m  OK  [[   78.985327]  nfs_async_unlink_release+0x32/0x80
[   78.985329]  rpc_free_task+0x30/0x50
[   78.985330]  rpc_async_release+0x12/0x20
[   78.985332]  process_one_work+0x278/0x670
0m] [   78.985335]  worker_thread+0x4d/0x410
[   78.985337]  kthread+0x10d/0x140
[   78.985338]  ? rescuer_thread+0x3a0/0x3a0
[   78.985339]  ? kthread_create_worker_on_cpu+0x70/0x70
[   78.985341]  ret_from_fork+0x3a/0x50
[   78.985344] irq event stamp: 36545
[   78.985346] hardirqs last  enabled at (36545): [] 
_raw_spin_unlock_irq+0x2c/0x40
[   78.985347] hardirqs last disabled at (36544): [] 
_raw_spin_lock_irq+0x13/0x50
[   78.985348] softirqs last  enabled at (34878): [] 
reg_todo+0x260/0x2f0
[   78.985350] softirqs last disabled at (34876): [] 
reg_todo+0x171/0x2f0
[   79.111787] ---[ end trace bed1f41cfea3a4c5 ]---

[2] https://lkft.validation.linaro.org/scheduler/job/290429#L893

> Summary:
> passed   0
> failed   0
> skipped  0
> warnings 0
> 
> Test comments,
> /* Regression test for CVE-2017-17053, original reproducer can be found
>  * here:
>  *
>  
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ccd5b3235180eef3cfec337df1c8554ab151b5cc
>  *
>  * Be careful! This test may crash your kernel!
>  */
> 
> Full test log:
> https://lkft.validation.linaro.org/scheduler/job/290429#L9581
> https://lkft.validation.linaro.org/scheduler/job/259217#L10308
> 
> History of the test case shows failed on
> Linux next kernel,
> https://qa-reports.linaro.org/lkft/linux-next-oe/tests/ltp-cve-tests/cve-2017-17053
> 
> Linux mainline kernel,
> https://qa-reports.linaro.org/lkft/linux-mainline-oe/tests/ltp-cve-tests \
> /cve-2017-17053
> ^ Please join link
> 
> Linux stable rc 4.17,
> https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/tests/ltp-cve-tests
> \
> /cve-2017-17053
> ^ Please join link
> 
> Test PASS on 4.16 kernel.
> https://qa-reports.linaro.org/lkft/linux-stable-rc-4.16-oe/tests/ltp-cve-tests
> \
> /cve-2017-17053
> ^ Please join link
> 
> Best regards
> Naresh Kamboju
> 


Re: [LTP] [PATCH 4.4 00/24] 4.4.137-stable review

2018-06-14 Thread Jan Stancek


- Original Message -
> On Thu, Jun 14, 2018 at 05:49:52AM -0400, Jan Stancek wrote:
> > 
> > - Original Message -
> > > On Thu, Jun 14, 2018 at 02:24:25PM +0530, Naresh Kamboju wrote:
> > > > On 14 June 2018 at 12:04, Greg Kroah-Hartman
> > > > 
> > > > wrote:
> > > > > On Wed, Jun 13, 2018 at 10:48:50PM -0300, Rafael Tinoco wrote:
> > > > >> On 13 June 2018 at 18:08, Rafael David Tinoco
> > > > >>  wrote:
> > > > >> > On Wed, Jun 13, 2018 at 6:00 PM, Greg Kroah-Hartman
> > > > >> >  wrote:
> > > > >> >> On Wed, Jun 13, 2018 at 05:47:49PM -0300, Rafael Tinoco wrote:
> > > > >> >>> Results from Linaro’s test farm.
> > > > >> >>> Regressions detected.
> > > > >> >>>
> > > > >> >>> NOTE:
> > > > >> >>>
> > > > >> >>> 1) LTP vma03 test (cve-2011-2496) broken on v4.4-137-rc1 because
> > > > >> >>> of:
> > > > >> >>>
> > > > >> >>>  6ea1dc96a03a mmap: relax file size limit for regular files
> > > > >> >>>  bd2f9ce5bacb mmap: introduce sane default mmap limits
> > > > >> >>>
> > > > >> >>>discussion:
> > > > >> >>>
> > > > >> >>>  https://github.com/linux-test-project/ltp/issues/341
> > > > >> >>>
> > > > >> >>>mainline commit (v4.13-rc7):
> > > > >> >>>
> > > > >> >>>  0cc3b0ec23ce Clarify (and fix) MAX_LFS_FILESIZE macros
> > > > >> >>>
> > > > >> >>>should be backported to 4.4.138-rc2 and fixes the issue.
> > > > >> >>
> > > > >> >> Really?  That commit says it fixes c2a9737f45e2 ("vfs,mm: fix a
> > > > >> >> dead
> > > > >> >> loop in truncate_inode_pages_range()") which is not in 4.4.y at
> > > > >> >> all.
> > > > >> >>
> > > > >> >> Did you test this out?
> > > > >> >
> > > > >> > Yes, the LTP contains the tests (last comment is the final test
> > > > >> > for
> > > > >> > arm32, right before Jan tests i686).
> > > > >> >
> > > > >> > Fixing MAX_LFS_FILESIZE fixes the new limit for mmap() brought by
> > > > >> > those 2 commits (file_mmap_size_max()).
> > > > >> > offset tested by the LTP test is 0xfffe000.
> > > > >> > file_mmap_size_max gives: 0x000 as max value, but only
> > > > >> > after
> > > > >> > the mentioned patch.
> > > > >> >
> > > > >> > Original intent for this fix was other though.
> > > > >>
> > > > >> To clarify this a bit further.
> > > > >>
> > > > >> The LTP CVE test is breaking in the first call to mmap(), even
> > > > >> before
> > > > >> trying to remap and test the security issue. That start happening in
> > > > >> this round because of those mmap() changes and the offset used in
> > > > >> the
> > > > >> LTP test. Linus changed limit checks and made them to be related to
> > > > >> MAX_LFS_FILESIZE. Unfortunately, in 4.4 stable, we were missing the
> > > > >> fix for MAX_LFS_FILESIZE (which before commit 0cc3b0ec23ce was less
> > > > >> than the REAL 32 bit limit).
> > > > >>
> > > > >> Commit 0cc3b0ec23ce was made because an user noticed the FS limit
> > > > >> not
> > > > >> being what it should be. In our case, the 4.4 stable kernel, we are
> > > > >> facing this 32 bit lower limit (than the real 32 bit real limit),
> > > > >> because of the LTP CVE test, so we need this fix to have the real 32
> > > > >> bit limit set for that macro (mmap limits did not use that macro
> > > > >> before).
> > > > >>
> > > > >> I have tested in arm32 and Jan Stancek, who first responded to LTP
> > > > >> issue, has tested this in i686 and both worked after that patch was
> >

Re: [LTP] [PATCH 4.4 00/24] 4.4.137-stable review

2018-06-14 Thread Jan Stancek


- Original Message -
> On Thu, Jun 14, 2018 at 05:49:52AM -0400, Jan Stancek wrote:
> > 
> > - Original Message -
> > > On Thu, Jun 14, 2018 at 02:24:25PM +0530, Naresh Kamboju wrote:
> > > > On 14 June 2018 at 12:04, Greg Kroah-Hartman
> > > > 
> > > > wrote:
> > > > > On Wed, Jun 13, 2018 at 10:48:50PM -0300, Rafael Tinoco wrote:
> > > > >> On 13 June 2018 at 18:08, Rafael David Tinoco
> > > > >>  wrote:
> > > > >> > On Wed, Jun 13, 2018 at 6:00 PM, Greg Kroah-Hartman
> > > > >> >  wrote:
> > > > >> >> On Wed, Jun 13, 2018 at 05:47:49PM -0300, Rafael Tinoco wrote:
> > > > >> >>> Results from Linaro’s test farm.
> > > > >> >>> Regressions detected.
> > > > >> >>>
> > > > >> >>> NOTE:
> > > > >> >>>
> > > > >> >>> 1) LTP vma03 test (cve-2011-2496) broken on v4.4-137-rc1 because
> > > > >> >>> of:
> > > > >> >>>
> > > > >> >>>  6ea1dc96a03a mmap: relax file size limit for regular files
> > > > >> >>>  bd2f9ce5bacb mmap: introduce sane default mmap limits
> > > > >> >>>
> > > > >> >>>discussion:
> > > > >> >>>
> > > > >> >>>  https://github.com/linux-test-project/ltp/issues/341
> > > > >> >>>
> > > > >> >>>mainline commit (v4.13-rc7):
> > > > >> >>>
> > > > >> >>>  0cc3b0ec23ce Clarify (and fix) MAX_LFS_FILESIZE macros
> > > > >> >>>
> > > > >> >>>should be backported to 4.4.138-rc2 and fixes the issue.
> > > > >> >>
> > > > >> >> Really?  That commit says it fixes c2a9737f45e2 ("vfs,mm: fix a
> > > > >> >> dead
> > > > >> >> loop in truncate_inode_pages_range()") which is not in 4.4.y at
> > > > >> >> all.
> > > > >> >>
> > > > >> >> Did you test this out?
> > > > >> >
> > > > >> > Yes, the LTP contains the tests (last comment is the final test
> > > > >> > for
> > > > >> > arm32, right before Jan tests i686).
> > > > >> >
> > > > >> > Fixing MAX_LFS_FILESIZE fixes the new limit for mmap() brought by
> > > > >> > those 2 commits (file_mmap_size_max()).
> > > > >> > offset tested by the LTP test is 0xfffe000.
> > > > >> > file_mmap_size_max gives: 0x000 as max value, but only
> > > > >> > after
> > > > >> > the mentioned patch.
> > > > >> >
> > > > >> > Original intent for this fix was other though.
> > > > >>
> > > > >> To clarify this a bit further.
> > > > >>
> > > > >> The LTP CVE test is breaking in the first call to mmap(), even
> > > > >> before
> > > > >> trying to remap and test the security issue. That start happening in
> > > > >> this round because of those mmap() changes and the offset used in
> > > > >> the
> > > > >> LTP test. Linus changed limit checks and made them to be related to
> > > > >> MAX_LFS_FILESIZE. Unfortunately, in 4.4 stable, we were missing the
> > > > >> fix for MAX_LFS_FILESIZE (which before commit 0cc3b0ec23ce was less
> > > > >> than the REAL 32 bit limit).
> > > > >>
> > > > >> Commit 0cc3b0ec23ce was made because an user noticed the FS limit
> > > > >> not
> > > > >> being what it should be. In our case, the 4.4 stable kernel, we are
> > > > >> facing this 32 bit lower limit (than the real 32 bit real limit),
> > > > >> because of the LTP CVE test, so we need this fix to have the real 32
> > > > >> bit limit set for that macro (mmap limits did not use that macro
> > > > >> before).
> > > > >>
> > > > >> I have tested in arm32 and Jan Stancek, who first responded to LTP
> > > > >> issue, has tested this in i686 and both worked after that patch was
> >

Re: [LTP] [PATCH 4.4 00/24] 4.4.137-stable review

2018-06-14 Thread Jan Stancek


- Original Message -
> On Thu, Jun 14, 2018 at 02:24:25PM +0530, Naresh Kamboju wrote:
> > On 14 June 2018 at 12:04, Greg Kroah-Hartman 
> > wrote:
> > > On Wed, Jun 13, 2018 at 10:48:50PM -0300, Rafael Tinoco wrote:
> > >> On 13 June 2018 at 18:08, Rafael David Tinoco
> > >>  wrote:
> > >> > On Wed, Jun 13, 2018 at 6:00 PM, Greg Kroah-Hartman
> > >> >  wrote:
> > >> >> On Wed, Jun 13, 2018 at 05:47:49PM -0300, Rafael Tinoco wrote:
> > >> >>> Results from Linaro’s test farm.
> > >> >>> Regressions detected.
> > >> >>>
> > >> >>> NOTE:
> > >> >>>
> > >> >>> 1) LTP vma03 test (cve-2011-2496) broken on v4.4-137-rc1 because of:
> > >> >>>
> > >> >>>  6ea1dc96a03a mmap: relax file size limit for regular files
> > >> >>>  bd2f9ce5bacb mmap: introduce sane default mmap limits
> > >> >>>
> > >> >>>discussion:
> > >> >>>
> > >> >>>  https://github.com/linux-test-project/ltp/issues/341
> > >> >>>
> > >> >>>mainline commit (v4.13-rc7):
> > >> >>>
> > >> >>>  0cc3b0ec23ce Clarify (and fix) MAX_LFS_FILESIZE macros
> > >> >>>
> > >> >>>should be backported to 4.4.138-rc2 and fixes the issue.
> > >> >>
> > >> >> Really?  That commit says it fixes c2a9737f45e2 ("vfs,mm: fix a dead
> > >> >> loop in truncate_inode_pages_range()") which is not in 4.4.y at all.
> > >> >>
> > >> >> Did you test this out?
> > >> >
> > >> > Yes, the LTP contains the tests (last comment is the final test for
> > >> > arm32, right before Jan tests i686).
> > >> >
> > >> > Fixing MAX_LFS_FILESIZE fixes the new limit for mmap() brought by
> > >> > those 2 commits (file_mmap_size_max()).
> > >> > offset tested by the LTP test is 0xfffe000.
> > >> > file_mmap_size_max gives: 0x000 as max value, but only after
> > >> > the mentioned patch.
> > >> >
> > >> > Original intent for this fix was other though.
> > >>
> > >> To clarify this a bit further.
> > >>
> > >> The LTP CVE test is breaking in the first call to mmap(), even before
> > >> trying to remap and test the security issue. That start happening in
> > >> this round because of those mmap() changes and the offset used in the
> > >> LTP test. Linus changed limit checks and made them to be related to
> > >> MAX_LFS_FILESIZE. Unfortunately, in 4.4 stable, we were missing the
> > >> fix for MAX_LFS_FILESIZE (which before commit 0cc3b0ec23ce was less
> > >> than the REAL 32 bit limit).
> > >>
> > >> Commit 0cc3b0ec23ce was made because an user noticed the FS limit not
> > >> being what it should be. In our case, the 4.4 stable kernel, we are
> > >> facing this 32 bit lower limit (than the real 32 bit real limit),
> > >> because of the LTP CVE test, so we need this fix to have the real 32
> > >> bit limit set for that macro (mmap limits did not use that macro
> > >> before).
> > >>
> > >> I have tested in arm32 and Jan Stancek, who first responded to LTP
> > >> issue, has tested this in i686 and both worked after that patch was
> > >> included to v4.4-137-rc1 (my last test was even with 4.4.138-rc1).
> > >>
> > >> Hope that helps a bit.
> > >
> > > Ok, thanks, it didn't apply cleanly but I've fixed it up now.
> > 
> > On the latest 4.4.138-rc1,
> > LTP "cve-2011-2496" test still fails on arm32 beagleboard x15 and qemu_arm.
> > 
> > Summary
> > 
> > kernel: 4.4.138-rc1
> > git repo:
> > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
> > git branch: linux-4.4.y
> > git commit: 7d690c56754ef7be647fbcf7bcdceebd59926b3f
> > git describe: v4.4.137-15-g7d690c56754e
> > Test details:
> > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.137-15-g7d690c56754e
> 
> Ok, but what does this mean?  Is there a commit somewhere that I need to
> pick up for 4.4.y that is already in newer kernels?

Hi Greg,

I think the expectations was that:
  0cc3b0ec23ce Clarify (and fix) MAX_LFS_FILESIZE macros
has been included to linux-4.4.y HEAD, so they re-ran the tests.

Report from Naresh above looks like original report: LTP vma03 is cve-2011-2496 
test.

Regards,
Jan

> 
> I have no idea what that cve is
>, as I never track them, and it's
> something that was reported to predate the 4.4 kernel release :)
> 
> thanks,
> 
> greg k-h
> 
> --
> Mailing list info: https://lists.linux.it/listinfo/ltp
> 


Re: [LTP] [PATCH 4.4 00/24] 4.4.137-stable review

2018-06-14 Thread Jan Stancek


- Original Message -
> On Thu, Jun 14, 2018 at 02:24:25PM +0530, Naresh Kamboju wrote:
> > On 14 June 2018 at 12:04, Greg Kroah-Hartman 
> > wrote:
> > > On Wed, Jun 13, 2018 at 10:48:50PM -0300, Rafael Tinoco wrote:
> > >> On 13 June 2018 at 18:08, Rafael David Tinoco
> > >>  wrote:
> > >> > On Wed, Jun 13, 2018 at 6:00 PM, Greg Kroah-Hartman
> > >> >  wrote:
> > >> >> On Wed, Jun 13, 2018 at 05:47:49PM -0300, Rafael Tinoco wrote:
> > >> >>> Results from Linaro’s test farm.
> > >> >>> Regressions detected.
> > >> >>>
> > >> >>> NOTE:
> > >> >>>
> > >> >>> 1) LTP vma03 test (cve-2011-2496) broken on v4.4-137-rc1 because of:
> > >> >>>
> > >> >>>  6ea1dc96a03a mmap: relax file size limit for regular files
> > >> >>>  bd2f9ce5bacb mmap: introduce sane default mmap limits
> > >> >>>
> > >> >>>discussion:
> > >> >>>
> > >> >>>  https://github.com/linux-test-project/ltp/issues/341
> > >> >>>
> > >> >>>mainline commit (v4.13-rc7):
> > >> >>>
> > >> >>>  0cc3b0ec23ce Clarify (and fix) MAX_LFS_FILESIZE macros
> > >> >>>
> > >> >>>should be backported to 4.4.138-rc2 and fixes the issue.
> > >> >>
> > >> >> Really?  That commit says it fixes c2a9737f45e2 ("vfs,mm: fix a dead
> > >> >> loop in truncate_inode_pages_range()") which is not in 4.4.y at all.
> > >> >>
> > >> >> Did you test this out?
> > >> >
> > >> > Yes, the LTP contains the tests (last comment is the final test for
> > >> > arm32, right before Jan tests i686).
> > >> >
> > >> > Fixing MAX_LFS_FILESIZE fixes the new limit for mmap() brought by
> > >> > those 2 commits (file_mmap_size_max()).
> > >> > offset tested by the LTP test is 0xfffe000.
> > >> > file_mmap_size_max gives: 0x000 as max value, but only after
> > >> > the mentioned patch.
> > >> >
> > >> > Original intent for this fix was other though.
> > >>
> > >> To clarify this a bit further.
> > >>
> > >> The LTP CVE test is breaking in the first call to mmap(), even before
> > >> trying to remap and test the security issue. That start happening in
> > >> this round because of those mmap() changes and the offset used in the
> > >> LTP test. Linus changed limit checks and made them to be related to
> > >> MAX_LFS_FILESIZE. Unfortunately, in 4.4 stable, we were missing the
> > >> fix for MAX_LFS_FILESIZE (which before commit 0cc3b0ec23ce was less
> > >> than the REAL 32 bit limit).
> > >>
> > >> Commit 0cc3b0ec23ce was made because an user noticed the FS limit not
> > >> being what it should be. In our case, the 4.4 stable kernel, we are
> > >> facing this 32 bit lower limit (than the real 32 bit real limit),
> > >> because of the LTP CVE test, so we need this fix to have the real 32
> > >> bit limit set for that macro (mmap limits did not use that macro
> > >> before).
> > >>
> > >> I have tested in arm32 and Jan Stancek, who first responded to LTP
> > >> issue, has tested this in i686 and both worked after that patch was
> > >> included to v4.4-137-rc1 (my last test was even with 4.4.138-rc1).
> > >>
> > >> Hope that helps a bit.
> > >
> > > Ok, thanks, it didn't apply cleanly but I've fixed it up now.
> > 
> > On the latest 4.4.138-rc1,
> > LTP "cve-2011-2496" test still fails on arm32 beagleboard x15 and qemu_arm.
> > 
> > Summary
> > 
> > kernel: 4.4.138-rc1
> > git repo:
> > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
> > git branch: linux-4.4.y
> > git commit: 7d690c56754ef7be647fbcf7bcdceebd59926b3f
> > git describe: v4.4.137-15-g7d690c56754e
> > Test details:
> > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.137-15-g7d690c56754e
> 
> Ok, but what does this mean?  Is there a commit somewhere that I need to
> pick up for 4.4.y that is already in newer kernels?

Hi Greg,

I think the expectations was that:
  0cc3b0ec23ce Clarify (and fix) MAX_LFS_FILESIZE macros
has been included to linux-4.4.y HEAD, so they re-ran the tests.

Report from Naresh above looks like original report: LTP vma03 is cve-2011-2496 
test.

Regards,
Jan

> 
> I have no idea what that cve is
>, as I never track them, and it's
> something that was reported to predate the 4.4 kernel release :)
> 
> thanks,
> 
> greg k-h
> 
> --
> Mailing list info: https://lists.linux.it/listinfo/ltp
> 


[PATCH] virtio_balloon: fix increment of vb->num_pfns in fill_balloon()

2017-12-01 Thread Jan Stancek
commit c7cdff0e8647 ("virtio_balloon: fix deadlock on OOM")
changed code to increment vb->num_pfns before call to
set_page_pfns(), which used to happen only after.

This patch fixes boot hang for me on ppc64le KVM guests.

Fixes: c7cdff0e8647 ("virtio_balloon: fix deadlock on OOM")
Cc: Michael S. Tsirkin <m...@redhat.com>
Cc: Tetsuo Handa <penguin-ker...@i-love.sakura.ne.jp>
Cc: Michal Hocko <mho...@suse.com>
Cc: Wei Wang <wei.w.w...@intel.com>
Signed-off-by: Jan Stancek <jstan...@redhat.com>
---
 drivers/virtio/virtio_balloon.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 7960746f7597..a1fb52cb3f0a 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -174,13 +174,12 @@ static unsigned fill_balloon(struct virtio_balloon *vb, 
size_t num)
while ((page = balloon_page_pop())) {
balloon_page_enqueue(>vb_dev_info, page);
 
-   vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE;
-
set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
if (!virtio_has_feature(vb->vdev,
VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
adjust_managed_page_count(page, -1);
+   vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE;
}
 
num_allocated_pages = vb->num_pfns;
-- 
1.8.3.1



[PATCH] virtio_balloon: fix increment of vb->num_pfns in fill_balloon()

2017-12-01 Thread Jan Stancek
commit c7cdff0e8647 ("virtio_balloon: fix deadlock on OOM")
changed code to increment vb->num_pfns before call to
set_page_pfns(), which used to happen only after.

This patch fixes boot hang for me on ppc64le KVM guests.

Fixes: c7cdff0e8647 ("virtio_balloon: fix deadlock on OOM")
Cc: Michael S. Tsirkin 
Cc: Tetsuo Handa 
Cc: Michal Hocko 
Cc: Wei Wang 
Signed-off-by: Jan Stancek 
---
 drivers/virtio/virtio_balloon.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 7960746f7597..a1fb52cb3f0a 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -174,13 +174,12 @@ static unsigned fill_balloon(struct virtio_balloon *vb, 
size_t num)
while ((page = balloon_page_pop())) {
balloon_page_enqueue(>vb_dev_info, page);
 
-   vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE;
-
set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
if (!virtio_has_feature(vb->vdev,
VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
adjust_managed_page_count(page, -1);
+   vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE;
}
 
num_allocated_pages = vb->num_pfns;
-- 
1.8.3.1



Re: [Patch V3] crypto: x86/sha1 : Fix reads beyond the number of blocks passed

2017-08-02 Thread Jan Stancek



- Original Message -
> On Wed, 2017-08-02 at 10:13 -0700, Megha Dey wrote:
> > It was reported that the sha1 AVX2 function(sha1_transform_avx2) is
> > reading ahead beyond its intended data, and causing a crash if the next
> > block is beyond page boundary:
> > http://marc.info/?l=linux-crypto-vger&m=149373371023377
> > 
> > This patch makes sure that there is no overflow for any buffer length.
> > 
> > It passes the tests written by Jan Stancek that revealed this problem:
> > https://github.com/jstancek/sha1-avx2-crash
> 
> Hi Jan,
> 
> Is it ok to add your Tested-by?

Yes, v3 patch is exactly the diff I was testing.

Regards,
Jan


Re: [Patch V3] crypto: x86/sha1 : Fix reads beyond the number of blocks passed

2017-08-02 Thread Jan Stancek



- Original Message -
> On Wed, 2017-08-02 at 10:13 -0700, Megha Dey wrote:
> > It was reported that the sha1 AVX2 function(sha1_transform_avx2) is
> > reading ahead beyond its intended data, and causing a crash if the next
> > block is beyond page boundary:
> > http://marc.info/?l=linux-crypto-vger&m=149373371023377
> > 
> > This patch makes sure that there is no overflow for any buffer length.
> > 
> > It passes the tests written by Jan Stancek that revealed this problem:
> > https://github.com/jstancek/sha1-avx2-crash
> 
> Hi Jan,
> 
> Is it ok to add your Tested-by?

Yes, v3 patch is exactly the diff I was testing.

Regards,
Jan


Re: [Patch V2] crypto: x86/sha1 : Fix reads beyond the number of blocks passed

2017-08-02 Thread Jan Stancek
On 08/02/2017 02:38 AM, Megha Dey wrote:
> It was reported that the sha1 AVX2 function(sha1_transform_avx2) is
> reading ahead beyond its intended data, and causing a crash if the next
> block is beyond page boundary:
> http://marc.info/?l=linux-crypto-vger&m=149373371023377
> 
> This patch makes sure that there is no overflow for any buffer length.
> 
> It passes the tests written by Jan Stancek that revealed this problem:
> https://github.com/jstancek/sha1-avx2-crash
> 
> Jan, can you verify this fix?

Hi,

Looks good, my tests (below) PASS-ed.

I updated reproducer at [1] to try more than just single scenario. It
now tries thousands of combinations with different starting offset within
page and sizes of data chunks.

(without patch) v4.13-rc3-102-g26c5ceb with avx2 enabled:
[  106.455175] sha_test module loaded
[  106.460458] data is at 0x8c88e58f8000, page_after_data: 
0x8c88e58fa000
[  106.468625] sha_test testing hash: sha1-generic, maxlen: 8192
[  127.091237] sha_test hash: sha1-generic calculated 1764032 hashes
[  127.098038] sha_test testing hash: sha1-ni, maxlen: 8192
[  127.108805] failed to alloc sha1-ni
[  127.112703] sha_test testing hash: sha1-avx, maxlen: 8192
[  141.166611] sha_test hash: sha1-avx calculated 1764032 hashes
[  141.173023] sha_test testing hash: sha1-avx2, maxlen: 8192
[  141.180046] BUG: unable to handle kernel paging request at 8c88e58fa000
[  141.187817] IP: _begin+0x28/0x187
[  141.191512] PGD 89c578067
[  141.191513] P4D 89c578067
[  141.194529] PUD c61f0a063
[  141.197545] PMD c64cb1063
[  141.200561] PTE 800c658fa062
[  141.203577]
[  141.208832] Oops:  [#1] SMP
...

With patch "[Patch V2] crypto: x86/sha1 : Fix reads beyond the number of blocks 
passed":
[  406.323781] sha_test module loaded
[  406.329062] data is at 0x93ab97108000, page_after_data: 
0x93ab9710a000
[  406.337185] sha_test testing hash: sha1-generic, maxlen: 8192
[  426.157242] sha_test hash: sha1-generic calculated 1764032 hashes
[  426.164043] sha_test testing hash: sha1-ni, maxlen: 8192
[  426.174244] failed to alloc sha1-ni
[  426.178139] sha_test testing hash: sha1-avx, maxlen: 8192
[  440.226240] sha_test hash: sha1-avx calculated 1764032 hashes
[  440.232651] sha_test testing hash: sha1-avx2, maxlen: 8192
[  452.497988] sha_test hash: sha1-avx2 calculated 1764032 hashes
[  452.504495] sha_test testing hash: sha1-ssse3, maxlen: 8192
[  467.319316] sha_test hash: sha1-ssse3 calculated 1764032 hashes
[  467.325922] sha_test done
[  471.827083] sha_test module unloaded

I also ran a test at [2], which is comparing hashes produced by
kernel with hashes produced by user-space utilities (e.g. 'sha1sum').
This test also PASS-ed.

Regards,
Jan

[1] https://github.com/jstancek/sha1-avx2-crash
[2] https://github.com/jstancek/kernel_sha_test/tree/sha1-avx2-4.13

> Herbert, can you re-enable sha1-avx2 once Jan has checked it out and
> revert commit b82ce24426a4071da9529d726057e4e642948667 ?
> 
> Originally-by: Ilya Albrekht <ilya.albre...@intel.com>
> Signed-off-by: Megha Dey <megha@linux.intel.com>
> Reported-by: Jan Stancek <jstan...@redhat.com>




Re: [Patch V2] crypto: x86/sha1 : Fix reads beyond the number of blocks passed

2017-08-02 Thread Jan Stancek
On 08/02/2017 02:38 AM, Megha Dey wrote:
> It was reported that the sha1 AVX2 function(sha1_transform_avx2) is
> reading ahead beyond its intended data, and causing a crash if the next
> block is beyond page boundary:
> http://marc.info/?l=linux-crypto-vger&m=149373371023377
> 
> This patch makes sure that there is no overflow for any buffer length.
> 
> It passes the tests written by Jan Stancek that revealed this problem:
> https://github.com/jstancek/sha1-avx2-crash
> 
> Jan, can you verify this fix?

Hi,

Looks good, my tests (below) PASS-ed.

I updated reproducer at [1] to try more than just single scenario. It
now tries thousands of combinations with different starting offset within
page and sizes of data chunks.

(without patch) v4.13-rc3-102-g26c5ceb with avx2 enabled:
[  106.455175] sha_test module loaded
[  106.460458] data is at 0x8c88e58f8000, page_after_data: 
0x8c88e58fa000
[  106.468625] sha_test testing hash: sha1-generic, maxlen: 8192
[  127.091237] sha_test hash: sha1-generic calculated 1764032 hashes
[  127.098038] sha_test testing hash: sha1-ni, maxlen: 8192
[  127.108805] failed to alloc sha1-ni
[  127.112703] sha_test testing hash: sha1-avx, maxlen: 8192
[  141.166611] sha_test hash: sha1-avx calculated 1764032 hashes
[  141.173023] sha_test testing hash: sha1-avx2, maxlen: 8192
[  141.180046] BUG: unable to handle kernel paging request at 8c88e58fa000
[  141.187817] IP: _begin+0x28/0x187
[  141.191512] PGD 89c578067
[  141.191513] P4D 89c578067
[  141.194529] PUD c61f0a063
[  141.197545] PMD c64cb1063
[  141.200561] PTE 800c658fa062
[  141.203577]
[  141.208832] Oops:  [#1] SMP
...

With patch "[Patch V2] crypto: x86/sha1 : Fix reads beyond the number of blocks 
passed":
[  406.323781] sha_test module loaded
[  406.329062] data is at 0x93ab97108000, page_after_data: 
0x93ab9710a000
[  406.337185] sha_test testing hash: sha1-generic, maxlen: 8192
[  426.157242] sha_test hash: sha1-generic calculated 1764032 hashes
[  426.164043] sha_test testing hash: sha1-ni, maxlen: 8192
[  426.174244] failed to alloc sha1-ni
[  426.178139] sha_test testing hash: sha1-avx, maxlen: 8192
[  440.226240] sha_test hash: sha1-avx calculated 1764032 hashes
[  440.232651] sha_test testing hash: sha1-avx2, maxlen: 8192
[  452.497988] sha_test hash: sha1-avx2 calculated 1764032 hashes
[  452.504495] sha_test testing hash: sha1-ssse3, maxlen: 8192
[  467.319316] sha_test hash: sha1-ssse3 calculated 1764032 hashes
[  467.325922] sha_test done
[  471.827083] sha_test module unloaded

I also ran a test at [2], which is comparing hashes produced by
kernel with hashes produced by user-space utilities (e.g. 'sha1sum').
This test also PASS-ed.

Regards,
Jan

[1] https://github.com/jstancek/sha1-avx2-crash
[2] https://github.com/jstancek/kernel_sha_test/tree/sha1-avx2-4.13

> Herbert, can you re-enable sha1-avx2 once Jan has checked it out and
> revert commit b82ce24426a4071da9529d726057e4e642948667 ?
> 
> Originally-by: Ilya Albrekht 
> Signed-off-by: Megha Dey 
> Reported-by: Jan Stancek 




Re: [PATCH 0/2] key payload access with just rcu_read_lock()

2017-03-01 Thread Jan Stancek




- Original Message -
> From: "David Howells" <dhowe...@redhat.com>
> To: "Jan Stancek" <jstan...@redhat.com>
> Cc: dhowe...@redhat.com, linux-kernel@vger.kernel.org, 
> linux-...@vger.kernel.org, bcodd...@redhat.com,
> asav...@redhat.com
> Sent: Wednesday, 1 March, 2017 10:40:13 AM
> Subject: Re: [PATCH 0/2] key payload access with just rcu_read_lock()
> 
> Jan Stancek <jstan...@redhat.com> wrote:
> 
> > That problem didn't show up with my NFS based reproducer.
> > I re-run it again with latest version of your patch, plus also
> > keyutils testsuite. Both completed OK for me, dmesg looks clean.
> 
> Can I put you down as a Tested-by?

Yes. Thanks for having a look at this.

Regards,
Jan


Re: [PATCH 0/2] key payload access with just rcu_read_lock()

2017-03-01 Thread Jan Stancek




- Original Message -
> From: "David Howells" 
> To: "Jan Stancek" 
> Cc: dhowe...@redhat.com, linux-kernel@vger.kernel.org, 
> linux-...@vger.kernel.org, bcodd...@redhat.com,
> asav...@redhat.com
> Sent: Wednesday, 1 March, 2017 10:40:13 AM
> Subject: Re: [PATCH 0/2] key payload access with just rcu_read_lock()
> 
> Jan Stancek  wrote:
> 
> > That problem didn't show up with my NFS based reproducer.
> > I re-run it again with latest version of your patch, plus also
> > keyutils testsuite. Both completed OK for me, dmesg looks clean.
> 
> Can I put you down as a Tested-by?

Yes. Thanks for having a look at this.

Regards,
Jan


Re: [PATCH 0/2] key payload access with just rcu_read_lock()

2017-02-28 Thread Jan Stancek

- Original Message -
> Here's an updated patch with fixed user_key_payload_locked() and user_read().
> 

That problem didn't show up with my NFS based reproducer.
I re-run it again with latest version of your patch, plus also
keyutils testsuite. Both completed OK for me, dmesg looks clean.

Regards,
Jan


Re: [PATCH 0/2] key payload access with just rcu_read_lock()

2017-02-28 Thread Jan Stancek

- Original Message -
> Here's an updated patch with fixed user_key_payload_locked() and user_read().
> 

That problem didn't show up with my NFS based reproducer.
I re-run it again with latest version of your patch, plus also
keyutils testsuite. Both completed OK for me, dmesg looks clean.

Regards,
Jan


Re: [PATCH 0/2] key payload access with just rcu_read_lock()

2017-02-28 Thread Jan Stancek


- Original Message -
> Jan Stancek <jstan...@redhat.com> wrote:
> 
> > Looks like there are still couple users that need updating,
> > I'm hitting following compilation error:
> 
> Aargh - I remembered to grep for rcu_dereference_key() but not
> user_key_payload().
> 
> How about the attached?

v2 builds OK (on top of 4.10), "suspicious RCU usage"
warning is gone and NFS connectathon is PASS-ing cleanly.

Regards,
Jan


Re: [PATCH 0/2] key payload access with just rcu_read_lock()

2017-02-28 Thread Jan Stancek


- Original Message -
> Jan Stancek  wrote:
> 
> > Looks like there are still couple users that need updating,
> > I'm hitting following compilation error:
> 
> Aargh - I remembered to grep for rcu_dereference_key() but not
> user_key_payload().
> 
> How about the attached?

v2 builds OK (on top of 4.10), "suspicious RCU usage"
warning is gone and NFS connectathon is PASS-ing cleanly.

Regards,
Jan


Re: [PATCH 0/2] key payload access with just rcu_read_lock()

2017-02-27 Thread Jan Stancek


- Original Message -
> From: "David Howells" <dhowe...@redhat.com>
> To: "Jan Stancek" <jstan...@redhat.com>
> Cc: dhowe...@redhat.com, linux-kernel@vger.kernel.org, 
> linux-...@vger.kernel.org, bcodd...@redhat.com,
> asav...@redhat.com
> Sent: Monday, 27 February, 2017 11:04:21 PM
> Subject: Re: [PATCH 0/2] key payload access with just rcu_read_lock()
> 
> Jan Stancek <jstan...@redhat.com> wrote:
> 
> > this is a follow-up for "suspicious RCU usage" warning described
> > in these 2 linux-nfs threads:
> >   http://marc.info/?t=14755883033&r=1&w=2
> >   http://marc.info/?t=14877677051&r=1&w=2
> > 
> > Did you have something like in mind?
> 
> How about the attached?  It's similar to what you did, but I made the split
> functions different from the original so that all users have to reconsider
> what they actually want.
> 
> David

Looks like there are still couple users that need updating,
I'm hitting following compilation error:

...
  CC [M]  fs/cifs/connect.o
fs/cifs/connect.c: In function ‘cifs_set_cifscreds’:
fs/cifs/connect.c:2442:2: error: implicit declaration of function 
‘user_key_payload’ [-Werror=implicit-function-declaration]
  upayload = user_key_payload(key);
  ^
fs/cifs/connect.c:2442:11: error: assignment makes pointer from integer without 
a cast [-Werror]
  upayload = user_key_payload(key);
   ^

# grep "user_key_payload(" -r . --include=*.c
./drivers/md/dm-crypt.c:ukp = user_key_payload(key);
./fs/cifs/connect.c:upayload = user_key_payload(key);
./fs/crypto/keyinfo.c:  ukp = user_key_payload(keyring_key);
./lib/digsig.c: ukp = user_key_payload(key);

# grep "user_key_payload(" -r . --include=*.h
./fs/ecryptfs/ecryptfs_kernel.h:return (struct 
ecryptfs_auth_tok *)user_key_payload(key)->data;

Regards,
Jan


Re: [PATCH 0/2] key payload access with just rcu_read_lock()

2017-02-27 Thread Jan Stancek


- Original Message -
> From: "David Howells" 
> To: "Jan Stancek" 
> Cc: dhowe...@redhat.com, linux-kernel@vger.kernel.org, 
> linux-...@vger.kernel.org, bcodd...@redhat.com,
> asav...@redhat.com
> Sent: Monday, 27 February, 2017 11:04:21 PM
> Subject: Re: [PATCH 0/2] key payload access with just rcu_read_lock()
> 
> Jan Stancek  wrote:
> 
> > this is a follow-up for "suspicious RCU usage" warning described
> > in these 2 linux-nfs threads:
> >   http://marc.info/?t=14755883033&r=1&w=2
> >   http://marc.info/?t=14877677051&r=1&w=2
> > 
> > Did you have something like in mind?
> 
> How about the attached?  It's similar to what you did, but I made the split
> functions different from the original so that all users have to reconsider
> what they actually want.
> 
> David

Looks like there are still couple users that need updating,
I'm hitting following compilation error:

...
  CC [M]  fs/cifs/connect.o
fs/cifs/connect.c: In function ‘cifs_set_cifscreds’:
fs/cifs/connect.c:2442:2: error: implicit declaration of function 
‘user_key_payload’ [-Werror=implicit-function-declaration]
  upayload = user_key_payload(key);
  ^
fs/cifs/connect.c:2442:11: error: assignment makes pointer from integer without 
a cast [-Werror]
  upayload = user_key_payload(key);
   ^

# grep "user_key_payload(" -r . --include=*.c
./drivers/md/dm-crypt.c:ukp = user_key_payload(key);
./fs/cifs/connect.c:upayload = user_key_payload(key);
./fs/crypto/keyinfo.c:  ukp = user_key_payload(keyring_key);
./lib/digsig.c: ukp = user_key_payload(key);

# grep "user_key_payload(" -r . --include=*.h
./fs/ecryptfs/ecryptfs_kernel.h:return (struct 
ecryptfs_auth_tok *)user_key_payload(key)->data;

Regards,
Jan


[PATCH 1/2] KEYS: add user_key_payload_rcu()

2017-02-22 Thread Jan Stancek
user_key_payload() is wrapper for rcu_dereference_protected(),
and can't be used with just rcu_read_lock() held.

This patch adds user_key_payload_rcu() for accessing key
payload in RCU read-side section, without the need
to hold key semaphore.

Signed-off-by: Jan Stancek <jstan...@redhat.com>
---
 Documentation/security/keys.txt | 5 +
 include/keys/user-type.h| 5 +
 include/linux/key.h | 3 +++
 3 files changed, 13 insertions(+)

diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt
index 3849814bfe6d..da89a854edd2 100644
--- a/Documentation/security/keys.txt
+++ b/Documentation/security/keys.txt
@@ -1152,8 +1152,13 @@ access the data:
  wrap the RCU calls to this element:
 
rcu_assign_keypointer(struct key *key, void *data);
+
+   /* access payload with semaphore held */
void *rcu_dereference_key(struct key *key);
 
+   /* access payload in RCU read-side section*/
+   void *rcu_read_dereference_key(struct key *key);
+
 
 ===
 DEFINING A KEY TYPE
diff --git a/include/keys/user-type.h b/include/keys/user-type.h
index c56fef40f53e..521bf3369904 100644
--- a/include/keys/user-type.h
+++ b/include/keys/user-type.h
@@ -53,6 +53,11 @@ static inline const struct user_key_payload 
*user_key_payload(const struct key *
return (struct user_key_payload *)rcu_dereference_key(key);
 }
 
+static inline const struct user_key_payload *user_key_payload_rcu(const struct 
key *key)
+{
+   return (struct user_key_payload *)rcu_read_dereference_key(key);
+}
+
 #endif /* CONFIG_KEYS */
 
 #endif /* _KEYS_USER_TYPE_H */
diff --git a/include/linux/key.h b/include/linux/key.h
index 722914798f37..b6a8c5896761 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -358,6 +358,9 @@ static inline bool key_is_instantiated(const struct key 
*key)
(rcu_dereference_protected((KEY)->payload.rcu_data0,\
   rwsem_is_locked(&((struct key 
*)(KEY))->sem)))
 
+#define rcu_read_dereference_key(KEY)   \
+   (rcu_dereference((KEY)->payload.rcu_data0))
+
 #define rcu_assign_keypointer(KEY, PAYLOAD)\
 do {   \
rcu_assign_pointer((KEY)->payload.rcu_data0, (PAYLOAD));\
-- 
1.8.3.1



[PATCH 1/2] KEYS: add user_key_payload_rcu()

2017-02-22 Thread Jan Stancek
user_key_payload() is wrapper for rcu_dereference_protected(),
and can't be used with just rcu_read_lock() held.

This patch adds user_key_payload_rcu() for accessing key
payload in RCU read-side section, without the need
to hold key semaphore.

Signed-off-by: Jan Stancek 
---
 Documentation/security/keys.txt | 5 +
 include/keys/user-type.h| 5 +
 include/linux/key.h | 3 +++
 3 files changed, 13 insertions(+)

diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt
index 3849814bfe6d..da89a854edd2 100644
--- a/Documentation/security/keys.txt
+++ b/Documentation/security/keys.txt
@@ -1152,8 +1152,13 @@ access the data:
  wrap the RCU calls to this element:
 
rcu_assign_keypointer(struct key *key, void *data);
+
+   /* access payload with semaphore held */
void *rcu_dereference_key(struct key *key);
 
+   /* access payload in RCU read-side section*/
+   void *rcu_read_dereference_key(struct key *key);
+
 
 ===
 DEFINING A KEY TYPE
diff --git a/include/keys/user-type.h b/include/keys/user-type.h
index c56fef40f53e..521bf3369904 100644
--- a/include/keys/user-type.h
+++ b/include/keys/user-type.h
@@ -53,6 +53,11 @@ static inline const struct user_key_payload 
*user_key_payload(const struct key *
return (struct user_key_payload *)rcu_dereference_key(key);
 }
 
+static inline const struct user_key_payload *user_key_payload_rcu(const struct 
key *key)
+{
+   return (struct user_key_payload *)rcu_read_dereference_key(key);
+}
+
 #endif /* CONFIG_KEYS */
 
 #endif /* _KEYS_USER_TYPE_H */
diff --git a/include/linux/key.h b/include/linux/key.h
index 722914798f37..b6a8c5896761 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -358,6 +358,9 @@ static inline bool key_is_instantiated(const struct key 
*key)
(rcu_dereference_protected((KEY)->payload.rcu_data0,\
   rwsem_is_locked(&((struct key 
*)(KEY))->sem)))
 
+#define rcu_read_dereference_key(KEY)   \
+   (rcu_dereference((KEY)->payload.rcu_data0))
+
 #define rcu_assign_keypointer(KEY, PAYLOAD)\
 do {   \
rcu_assign_pointer((KEY)->payload.rcu_data0, (PAYLOAD));\
-- 
1.8.3.1



[PATCH 2/2] NFS: use user_key_payload_rcu() in RCU read-side section

2017-02-22 Thread Jan Stancek
Use RCU read-side accessor for key payload, as calling
user_key_payload() without semaphore held leads to
following warning:

  ===
  [ INFO: suspicious RCU usage. ]
  4.10.0 #1 Tainted: GW
  ---
  ./include/keys/user-type.h:53 suspicious rcu_dereference_protected() usage!
  other info that might help us debug this:
  rcu_scheduler_active = 2, debug_locks = 0
  1 lock held by mount.nfs/5987:
#0:  (rcu_read_lock){..}, at: [] 
nfs_idmap_get_key+0x15c/0x420 [nfsv4]
  stack backtrace:
  CPU: 1 PID: 5987 Comm: mount.nfs Tainted: GW   4.10.0 #1
  Call Trace:
dump_stack+0xe8/0x154 (unreliable)
lockdep_rcu_suspicious+0x140/0x190
nfs_idmap_get_key+0x380/0x420 [nfsv4]
nfs_map_name_to_uid+0x2a0/0x3b0 [nfsv4]
decode_getfattr_attrs+0xfac/0x16b0 [nfsv4]
decode_getfattr_generic.constprop.106+0xbc/0x150 [nfsv4]
nfs4_xdr_dec_lookup_root+0xac/0xb0 [nfsv4]
rpcauth_unwrap_resp+0xe8/0x140 [sunrpc]
call_decode+0x29c/0x910 [sunrpc]
__rpc_execute+0x140/0x8f0 [sunrpc]
rpc_run_task+0x170/0x200 [sunrpc]
nfs4_call_sync_sequence+0x68/0xa0 [nfsv4]
_nfs4_lookup_root.isra.44+0xd0/0xf0 [nfsv4]
nfs4_lookup_root+0xe0/0x350 [nfsv4]
nfs4_lookup_root_sec+0x70/0xa0 [nfsv4]
nfs4_find_root_sec+0xc4/0x100 [nfsv4]
nfs4_proc_get_rootfh+0x5c/0xf0 [nfsv4]
nfs4_get_rootfh+0x6c/0x190 [nfsv4]
nfs4_server_common_setup+0xc4/0x260 [nfsv4]
nfs4_create_server+0x278/0x3c0 [nfsv4]
nfs4_remote_mount+0x50/0xb0 [nfsv4]
mount_fs+0x74/0x210
vfs_kern_mount+0x78/0x220
nfs_do_root_mount+0xb0/0x140 [nfsv4]
nfs4_try_mount+0x60/0x100 [nfsv4]
nfs_fs_mount+0x5ec/0xda0 [nfs]
mount_fs+0x74/0x210
vfs_kern_mount+0x78/0x220
do_mount+0x254/0xf70
SyS_mount+0x94/0x100
system_call+0x38/0xe0

Signed-off-by: Jan Stancek <jstan...@redhat.com>
---
 fs/nfs/nfs4idmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index c444285bb1b1..835c163f61af 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -316,7 +316,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t 
namelen,
if (ret < 0)
goto out_up;
 
-   payload = user_key_payload(rkey);
+   payload = user_key_payload_rcu(rkey);
if (IS_ERR_OR_NULL(payload)) {
ret = PTR_ERR(payload);
goto out_up;
-- 
1.8.3.1



[PATCH 2/2] NFS: use user_key_payload_rcu() in RCU read-side section

2017-02-22 Thread Jan Stancek
Use RCU read-side accessor for key payload, as calling
user_key_payload() without semaphore held leads to
following warning:

  ===
  [ INFO: suspicious RCU usage. ]
  4.10.0 #1 Tainted: GW
  ---
  ./include/keys/user-type.h:53 suspicious rcu_dereference_protected() usage!
  other info that might help us debug this:
  rcu_scheduler_active = 2, debug_locks = 0
  1 lock held by mount.nfs/5987:
#0:  (rcu_read_lock){..}, at: [] 
nfs_idmap_get_key+0x15c/0x420 [nfsv4]
  stack backtrace:
  CPU: 1 PID: 5987 Comm: mount.nfs Tainted: GW   4.10.0 #1
  Call Trace:
dump_stack+0xe8/0x154 (unreliable)
lockdep_rcu_suspicious+0x140/0x190
nfs_idmap_get_key+0x380/0x420 [nfsv4]
nfs_map_name_to_uid+0x2a0/0x3b0 [nfsv4]
decode_getfattr_attrs+0xfac/0x16b0 [nfsv4]
decode_getfattr_generic.constprop.106+0xbc/0x150 [nfsv4]
nfs4_xdr_dec_lookup_root+0xac/0xb0 [nfsv4]
rpcauth_unwrap_resp+0xe8/0x140 [sunrpc]
call_decode+0x29c/0x910 [sunrpc]
__rpc_execute+0x140/0x8f0 [sunrpc]
rpc_run_task+0x170/0x200 [sunrpc]
nfs4_call_sync_sequence+0x68/0xa0 [nfsv4]
_nfs4_lookup_root.isra.44+0xd0/0xf0 [nfsv4]
nfs4_lookup_root+0xe0/0x350 [nfsv4]
nfs4_lookup_root_sec+0x70/0xa0 [nfsv4]
nfs4_find_root_sec+0xc4/0x100 [nfsv4]
nfs4_proc_get_rootfh+0x5c/0xf0 [nfsv4]
nfs4_get_rootfh+0x6c/0x190 [nfsv4]
nfs4_server_common_setup+0xc4/0x260 [nfsv4]
nfs4_create_server+0x278/0x3c0 [nfsv4]
nfs4_remote_mount+0x50/0xb0 [nfsv4]
mount_fs+0x74/0x210
vfs_kern_mount+0x78/0x220
nfs_do_root_mount+0xb0/0x140 [nfsv4]
nfs4_try_mount+0x60/0x100 [nfsv4]
nfs_fs_mount+0x5ec/0xda0 [nfs]
mount_fs+0x74/0x210
vfs_kern_mount+0x78/0x220
do_mount+0x254/0xf70
SyS_mount+0x94/0x100
system_call+0x38/0xe0

Signed-off-by: Jan Stancek 
---
 fs/nfs/nfs4idmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index c444285bb1b1..835c163f61af 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -316,7 +316,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t 
namelen,
if (ret < 0)
goto out_up;
 
-   payload = user_key_payload(rkey);
+   payload = user_key_payload_rcu(rkey);
if (IS_ERR_OR_NULL(payload)) {
ret = PTR_ERR(payload);
goto out_up;
-- 
1.8.3.1



[PATCH 0/2] key payload access with just rcu_read_lock()

2017-02-22 Thread Jan Stancek
Hi David,

this is a follow-up for "suspicious RCU usage" warning described
in these 2 linux-nfs threads:
  http://marc.info/?t=14755883033&r=1&w=2
  http://marc.info/?t=14877677051&r=1&w=2

Did you have something like in mind?

Thanks,
Jan

Jan Stancek (2):
  KEYS: add user_key_payload_rcu()
  NFS: use user_key_payload_rcu() in RCU read-side section

 Documentation/security/keys.txt | 5 +
 fs/nfs/nfs4idmap.c  | 2 +-
 include/keys/user-type.h| 5 +
 include/linux/key.h | 3 +++
 4 files changed, 14 insertions(+), 1 deletion(-)

-- 
1.8.3.1



[PATCH 0/2] key payload access with just rcu_read_lock()

2017-02-22 Thread Jan Stancek
Hi David,

this is a follow-up for "suspicious RCU usage" warning described
in these 2 linux-nfs threads:
  http://marc.info/?t=14755883033&r=1&w=2
  http://marc.info/?t=14877677051&r=1&w=2

Did you have something like in mind?

Thanks,
Jan

Jan Stancek (2):
  KEYS: add user_key_payload_rcu()
  NFS: use user_key_payload_rcu() in RCU read-side section

 Documentation/security/keys.txt | 5 +
 fs/nfs/nfs4idmap.c  | 2 +-
 include/keys/user-type.h| 5 +
 include/linux/key.h | 3 +++
 4 files changed, 14 insertions(+), 1 deletion(-)

-- 
1.8.3.1



[tip:perf/urgent] perf header: Make build_cpu_topology skip offline/absent CPUs

2017-02-21 Thread tip-bot for Jan Stancek
Commit-ID:  43db2843a4a41cc8cdb6ab696639aeee1f4d5062
Gitweb: http://git.kernel.org/tip/43db2843a4a41cc8cdb6ab696639aeee1f4d5062
Author: Jan Stancek <jstan...@redhat.com>
AuthorDate: Fri, 17 Feb 2017 12:10:25 +0100
Committer:  Arnaldo Carvalho de Melo <a...@redhat.com>
CommitDate: Fri, 17 Feb 2017 12:37:04 -0300

perf header: Make build_cpu_topology skip offline/absent CPUs

When build_cpu_topo() encounters offline/absent CPUs, it fails to find any
sysfs entries and returns failure.

This leads to build_cpu_topology() and write_cpu_topology() failing as
well.

Because HEADER_CPU_TOPOLOGY has not been written, read leaves cpu_topology_map
NULL and we get NULL ptr deref at:

  ...
   cmd_test
__cmd_test
 test_and_print
  run_test
   test_session_topology
check_cpu_topology

  36: Session topology   :
  --- start ---
  test child forked, pid 14902
  templ file: /tmp/perf-test-4CKocW
  failed to write feature HEADER_CPU_TOPOLOGY
  perf: Segmentation fault
  Obtained 9 stack frames.
  ./perf(sighandler_dump_stack+0x41) [0x5095f1]
  /lib64/libc.so.6(+0x35250) [0x7f4b7c3c9250]
  ./perf(test_session_topology+0x1db) [0x490ceb]
  ./perf() [0x475b68]
  ./perf(cmd_test+0x5b9) [0x4763c9]
  ./perf() [0x4945a3]
  ./perf(main+0x69f) [0x427e8f]
  /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f4b7c3b5b35]
  ./perf() [0x427fb9]
  test child interrupted
   end 
  Session topology: FAILED!

This patch makes build_cpu_topology() skip offline/absent CPUs, by checking
their presence against cpu_map built from online CPUs.

Signed-off-by: Jan Stancek <jstan...@redhat.com>
Acked-by: Jiri Olsa <jo...@kernel.org>
Cc: Alexander Shishkin <alexander.shish...@linux.intel.com>
Cc: Masami Hiramatsu <mhira...@kernel.org>
Cc: Peter Zijlstra <pet...@infradead.org>
Link: 
http://lkml.kernel.org/r/a271b770175524f4961d4903af33798358a4a518.1487146877.git.jstan...@redhat.com
Signed-off-by: Arnaldo Carvalho de Melo <a...@redhat.com>
---
 tools/perf/util/header.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 3d12c16..1222f6c 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -505,24 +505,31 @@ static void free_cpu_topo(struct cpu_topo *tp)
 
 static struct cpu_topo *build_cpu_topology(void)
 {
-   struct cpu_topo *tp;
+   struct cpu_topo *tp = NULL;
void *addr;
u32 nr, i;
size_t sz;
long ncpus;
int ret = -1;
+   struct cpu_map *map;
 
ncpus = sysconf(_SC_NPROCESSORS_CONF);
if (ncpus < 0)
return NULL;
 
+   /* build online CPU map */
+   map = cpu_map__new(NULL);
+   if (map == NULL) {
+   pr_debug("failed to get system cpumap\n");
+   return NULL;
+   }
+
nr = (u32)(ncpus & UINT_MAX);
 
sz = nr * sizeof(char *);
-
addr = calloc(1, sizeof(*tp) + 2 * sz);
if (!addr)
-   return NULL;
+   goto out_free;
 
tp = addr;
tp->cpu_nr = nr;
@@ -532,10 +539,16 @@ static struct cpu_topo *build_cpu_topology(void)
tp->thread_siblings = addr;
 
for (i = 0; i < nr; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
+
ret = build_cpu_topo(tp, i);
if (ret < 0)
break;
}
+
+out_free:
+   cpu_map__put(map);
if (ret) {
free_cpu_topo(tp);
tp = NULL;


[tip:perf/urgent] perf cpumap: Add cpu__max_present_cpu()

2017-02-21 Thread tip-bot for Jan Stancek
Commit-ID:  92a7e1278005b6bb3459affc50b2b6e2464e7e7c
Gitweb: http://git.kernel.org/tip/92a7e1278005b6bb3459affc50b2b6e2464e7e7c
Author: Jan Stancek <jstan...@redhat.com>
AuthorDate: Fri, 17 Feb 2017 12:10:24 +0100
Committer:  Arnaldo Carvalho de Melo <a...@redhat.com>
CommitDate: Fri, 17 Feb 2017 12:33:05 -0300

perf cpumap: Add cpu__max_present_cpu()

Similar to cpu__max_cpu() (which returns the max possible CPU), returns
the max present CPU.

Signed-off-by: Jan Stancek <jstan...@redhat.com>
Acked-by: Jiri Olsa <jo...@kernel.org>
Cc: Alexander Shishkin <alexander.shish...@linux.intel.com>
Cc: Masami Hiramatsu <mhira...@kernel.org>
Cc: Peter Zijlstra <pet...@infradead.org>
Link: 
http://lkml.kernel.org/r/8ea4601b5cacc49927235b4ebac424bd6eeccb06.1487146877.git.jstan...@redhat.com
Signed-off-by: Arnaldo Carvalho de Melo <a...@redhat.com>
---
 tools/perf/util/cpumap.c | 22 ++
 tools/perf/util/cpumap.h |  1 +
 2 files changed, 23 insertions(+)

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 2c0b522..8c75049 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -9,6 +9,7 @@
 #include "asm/bug.h"
 
 static int max_cpu_num;
+static int max_present_cpu_num;
 static int max_node_num;
 static int *cpunode_map;
 
@@ -442,6 +443,7 @@ static void set_max_cpu_num(void)
 
/* set up default */
max_cpu_num = 4096;
+   max_present_cpu_num = 4096;
 
mnt = sysfs__mountpoint();
if (!mnt)
@@ -455,6 +457,17 @@ static void set_max_cpu_num(void)
}
 
ret = get_max_num(path, &max_cpu_num);
+   if (ret)
+   goto out;
+
+   /* get the highest present cpu number for a sparse allocation */
+   ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt);
+   if (ret == PATH_MAX) {
+   pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX);
+   goto out;
+   }
+
+   ret = get_max_num(path, &max_present_cpu_num);
 
 out:
if (ret)
@@ -505,6 +518,15 @@ int cpu__max_cpu(void)
return max_cpu_num;
 }
 
+int cpu__max_present_cpu(void)
+{
+   if (unlikely(!max_present_cpu_num))
+   set_max_cpu_num();
+
+   return max_present_cpu_num;
+}
+
+
 int cpu__get_node(int cpu)
 {
if (unlikely(cpunode_map == NULL)) {
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 06bd689..1a0549a 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -62,6 +62,7 @@ int cpu__setup_cpunode_map(void);
 
 int cpu__max_node(void);
 int cpu__max_cpu(void);
+int cpu__max_present_cpu(void);
 int cpu__get_node(int cpu);
 
 int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,


[tip:perf/urgent] perf tools: Replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map

2017-02-21 Thread tip-bot for Jan Stancek
Commit-ID:  da8a58b56c661681f9b2fd2fa59c6da3a5bac8d1
Gitweb: http://git.kernel.org/tip/da8a58b56c661681f9b2fd2fa59c6da3a5bac8d1
Author: Jan Stancek <jstan...@redhat.com>
AuthorDate: Fri, 17 Feb 2017 12:10:26 +0100
Committer:  Arnaldo Carvalho de Melo <a...@redhat.com>
CommitDate: Fri, 17 Feb 2017 12:56:35 -0300

perf tools: Replace _SC_NPROCESSORS_CONF with max_present_cpu in 
cpu_topology_map

There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs:

1. offline/absent CPUs will have their socket_id and core_id set to -1
   which triggers:
   "socket_id number is too big.You may need to upgrade the perf tool."

2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on
   _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above.
   Users of perf_env.cpu[] are using CPU id as index. This can lead
   to read beyond what was allocated:
   ==19991== Invalid read of size 4
   ==19991==at 0x490CEB: check_cpu_topology (topology.c:69)
   ==19991==by 0x490CEB: test_session_topology (topology.c:106)
   ...

For example:
  _SC_NPROCESSORS_CONF == 16
  available: 2 nodes (0-1)
  node 0 cpus: 0 6 8 10 16 22 24 26
  node 0 size: 12004 MB
  node 0 free: 9470 MB
  node 1 cpus: 1 7 9 11 23 25 27
  node 1 size: 12093 MB
  node 1 free: 9406 MB
  node distances:
  node   0   1
0:  10  20
1:  20  10

This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF
to max_present_cpu and updates any user of cpu_topology_map to iterate
with nr_cpus_avail.

As a consequence HEADER_CPU_TOPOLOGY core_id and socket_id lists get longer,
but maintain compatibility with pre-patch state - index to cpu_topology_map is
CPU id.

  perf test 36 -v
  36: Session topology   :
  --- start ---
  test child forked, pid 22211
  templ file: /tmp/perf-test-gmdX5i
  CPU 0, core 0, socket 0
  CPU 1, core 0, socket 1
  CPU 6, core 10, socket 0
  CPU 7, core 10, socket 1
  CPU 8, core 1, socket 0
  CPU 9, core 1, socket 1
  CPU 10, core 9, socket 0
  CPU 11, core 9, socket 1
  CPU 16, core 0, socket 0
  CPU 22, core 10, socket 0
  CPU 23, core 10, socket 1
  CPU 24, core 1, socket 0
  CPU 25, core 1, socket 1
  CPU 26, core 9, socket 0
  CPU 27, core 9, socket 1
  test child finished with 0
   end 
  Session topology: Ok

Signed-off-by: Jan Stancek <jstan...@redhat.com>
Cc: Alexander Shishkin <alexander.shish...@linux.intel.com>
Cc: Jiri Olsa <jo...@kernel.org>
Cc: Masami Hiramatsu <mhira...@kernel.org>
Cc: Peter Zijlstra <pet...@infradead.org>
Link: 
http://lkml.kernel.org/r/d7c05c6445fca74a8442c2c73cfffd349c52c44f.1487146877.git.jstan...@redhat.com
Signed-off-by: Arnaldo Carvalho de Melo <a...@redhat.com>
---
 tools/perf/builtin-stat.c   |  2 +-
 tools/perf/tests/topology.c |  4 +++-
 tools/perf/util/env.c   |  2 +-
 tools/perf/util/header.c| 16 +---
 4 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index f287191..ca27a8a 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, 
struct cpu_map *map, int idx)
 
cpu = map->map[idx];
 
-   if (cpu >= env->nr_cpus_online)
+   if (cpu >= env->nr_cpus_avail)
return -1;
 
return cpu;
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 98fe69a..803f893 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map)
session = perf_session__new(&file, false, NULL);
TEST_ASSERT_VAL("can't get session", session);
 
-   for (i = 0; i < session->header.env.nr_cpus_online; i++) {
+   for (i = 0; i < session->header.env.nr_cpus_avail; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
pr_debug("CPU %d, core %d, socket %d\n", i,
 session->header.env.cpu[i].core_id,
 session->header.env.cpu[i].socket_id);
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index bb964e8..075fc77 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env)
return 0;
 
if (env->nr_cpus_avail == 0)
-   env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF);
+   env->nr_cpus_avail = cpu__max_present_cpu();
 
nr_cpus = env->nr_cpus_avail;
if (nr_cpus == -1)
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 1222f6c..05714d5 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -295,11 +295,7 @@ static int write_nrcpus(int fd, struct perf_header *h 
__maybe_unused,
u32 nrc, nra;
int

[tip:perf/urgent] perf header: Make build_cpu_topology skip offline/absent CPUs

2017-02-21 Thread tip-bot for Jan Stancek
Commit-ID:  43db2843a4a41cc8cdb6ab696639aeee1f4d5062
Gitweb: http://git.kernel.org/tip/43db2843a4a41cc8cdb6ab696639aeee1f4d5062
Author: Jan Stancek 
AuthorDate: Fri, 17 Feb 2017 12:10:25 +0100
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Fri, 17 Feb 2017 12:37:04 -0300

perf header: Make build_cpu_topology skip offline/absent CPUs

When build_cpu_topo() encounters offline/absent CPUs, it fails to find any
sysfs entries and returns failure.

This leads to build_cpu_topology() and write_cpu_topology() failing as
well.

Because HEADER_CPU_TOPOLOGY has not been written, read leaves cpu_topology_map
NULL and we get NULL ptr deref at:

  ...
   cmd_test
__cmd_test
 test_and_print
  run_test
   test_session_topology
check_cpu_topology

  36: Session topology   :
  --- start ---
  test child forked, pid 14902
  templ file: /tmp/perf-test-4CKocW
  failed to write feature HEADER_CPU_TOPOLOGY
  perf: Segmentation fault
  Obtained 9 stack frames.
  ./perf(sighandler_dump_stack+0x41) [0x5095f1]
  /lib64/libc.so.6(+0x35250) [0x7f4b7c3c9250]
  ./perf(test_session_topology+0x1db) [0x490ceb]
  ./perf() [0x475b68]
  ./perf(cmd_test+0x5b9) [0x4763c9]
  ./perf() [0x4945a3]
  ./perf(main+0x69f) [0x427e8f]
  /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f4b7c3b5b35]
  ./perf() [0x427fb9]
  test child interrupted
   end 
  Session topology: FAILED!

This patch makes build_cpu_topology() skip offline/absent CPUs, by checking
their presence against cpu_map built from online CPUs.

Signed-off-by: Jan Stancek 
Acked-by: Jiri Olsa 
Cc: Alexander Shishkin 
Cc: Masami Hiramatsu 
Cc: Peter Zijlstra 
Link: 
http://lkml.kernel.org/r/a271b770175524f4961d4903af33798358a4a518.1487146877.git.jstan...@redhat.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/util/header.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 3d12c16..1222f6c 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -505,24 +505,31 @@ static void free_cpu_topo(struct cpu_topo *tp)
 
 static struct cpu_topo *build_cpu_topology(void)
 {
-   struct cpu_topo *tp;
+   struct cpu_topo *tp = NULL;
void *addr;
u32 nr, i;
size_t sz;
long ncpus;
int ret = -1;
+   struct cpu_map *map;
 
ncpus = sysconf(_SC_NPROCESSORS_CONF);
if (ncpus < 0)
return NULL;
 
+   /* build online CPU map */
+   map = cpu_map__new(NULL);
+   if (map == NULL) {
+   pr_debug("failed to get system cpumap\n");
+   return NULL;
+   }
+
nr = (u32)(ncpus & UINT_MAX);
 
sz = nr * sizeof(char *);
-
addr = calloc(1, sizeof(*tp) + 2 * sz);
if (!addr)
-   return NULL;
+   goto out_free;
 
tp = addr;
tp->cpu_nr = nr;
@@ -532,10 +539,16 @@ static struct cpu_topo *build_cpu_topology(void)
tp->thread_siblings = addr;
 
for (i = 0; i < nr; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
+
ret = build_cpu_topo(tp, i);
if (ret < 0)
break;
}
+
+out_free:
+   cpu_map__put(map);
if (ret) {
free_cpu_topo(tp);
tp = NULL;


[tip:perf/urgent] perf cpumap: Add cpu__max_present_cpu()

2017-02-21 Thread tip-bot for Jan Stancek
Commit-ID:  92a7e1278005b6bb3459affc50b2b6e2464e7e7c
Gitweb: http://git.kernel.org/tip/92a7e1278005b6bb3459affc50b2b6e2464e7e7c
Author: Jan Stancek 
AuthorDate: Fri, 17 Feb 2017 12:10:24 +0100
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Fri, 17 Feb 2017 12:33:05 -0300

perf cpumap: Add cpu__max_present_cpu()

Similar to cpu__max_cpu() (which returns the max possible CPU), returns
the max present CPU.

Signed-off-by: Jan Stancek 
Acked-by: Jiri Olsa 
Cc: Alexander Shishkin 
Cc: Masami Hiramatsu 
Cc: Peter Zijlstra 
Link: 
http://lkml.kernel.org/r/8ea4601b5cacc49927235b4ebac424bd6eeccb06.1487146877.git.jstan...@redhat.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/util/cpumap.c | 22 ++
 tools/perf/util/cpumap.h |  1 +
 2 files changed, 23 insertions(+)

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 2c0b522..8c75049 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -9,6 +9,7 @@
 #include "asm/bug.h"
 
 static int max_cpu_num;
+static int max_present_cpu_num;
 static int max_node_num;
 static int *cpunode_map;
 
@@ -442,6 +443,7 @@ static void set_max_cpu_num(void)
 
/* set up default */
max_cpu_num = 4096;
+   max_present_cpu_num = 4096;
 
mnt = sysfs__mountpoint();
if (!mnt)
@@ -455,6 +457,17 @@ static void set_max_cpu_num(void)
}
 
ret = get_max_num(path, &max_cpu_num);
+   if (ret)
+   goto out;
+
+   /* get the highest present cpu number for a sparse allocation */
+   ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt);
+   if (ret == PATH_MAX) {
+   pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX);
+   goto out;
+   }
+
+   ret = get_max_num(path, &max_present_cpu_num);
 
 out:
if (ret)
@@ -505,6 +518,15 @@ int cpu__max_cpu(void)
return max_cpu_num;
 }
 
+int cpu__max_present_cpu(void)
+{
+   if (unlikely(!max_present_cpu_num))
+   set_max_cpu_num();
+
+   return max_present_cpu_num;
+}
+
+
 int cpu__get_node(int cpu)
 {
if (unlikely(cpunode_map == NULL)) {
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 06bd689..1a0549a 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -62,6 +62,7 @@ int cpu__setup_cpunode_map(void);
 
 int cpu__max_node(void);
 int cpu__max_cpu(void);
+int cpu__max_present_cpu(void);
 int cpu__get_node(int cpu);
 
 int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,


[tip:perf/urgent] perf tools: Replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map

2017-02-21 Thread tip-bot for Jan Stancek
Commit-ID:  da8a58b56c661681f9b2fd2fa59c6da3a5bac8d1
Gitweb: http://git.kernel.org/tip/da8a58b56c661681f9b2fd2fa59c6da3a5bac8d1
Author: Jan Stancek 
AuthorDate: Fri, 17 Feb 2017 12:10:26 +0100
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Fri, 17 Feb 2017 12:56:35 -0300

perf tools: Replace _SC_NPROCESSORS_CONF with max_present_cpu in 
cpu_topology_map

There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs:

1. offline/absent CPUs will have their socket_id and core_id set to -1
   which triggers:
   "socket_id number is too big.You may need to upgrade the perf tool."

2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on
   _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above.
   Users of perf_env.cpu[] are using CPU id as index. This can lead
   to read beyond what was allocated:
   ==19991== Invalid read of size 4
   ==19991==at 0x490CEB: check_cpu_topology (topology.c:69)
   ==19991==by 0x490CEB: test_session_topology (topology.c:106)
   ...

For example:
  _SC_NPROCESSORS_CONF == 16
  available: 2 nodes (0-1)
  node 0 cpus: 0 6 8 10 16 22 24 26
  node 0 size: 12004 MB
  node 0 free: 9470 MB
  node 1 cpus: 1 7 9 11 23 25 27
  node 1 size: 12093 MB
  node 1 free: 9406 MB
  node distances:
  node   0   1
0:  10  20
1:  20  10

This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF
to max_present_cpu and updates any user of cpu_topology_map to iterate
with nr_cpus_avail.

As a consequence HEADER_CPU_TOPOLOGY core_id and socket_id lists get longer,
but maintain compatibility with pre-patch state - index to cpu_topology_map is
CPU id.

  perf test 36 -v
  36: Session topology   :
  --- start ---
  test child forked, pid 22211
  templ file: /tmp/perf-test-gmdX5i
  CPU 0, core 0, socket 0
  CPU 1, core 0, socket 1
  CPU 6, core 10, socket 0
  CPU 7, core 10, socket 1
  CPU 8, core 1, socket 0
  CPU 9, core 1, socket 1
  CPU 10, core 9, socket 0
  CPU 11, core 9, socket 1
  CPU 16, core 0, socket 0
  CPU 22, core 10, socket 0
  CPU 23, core 10, socket 1
  CPU 24, core 1, socket 0
  CPU 25, core 1, socket 1
  CPU 26, core 9, socket 0
  CPU 27, core 9, socket 1
  test child finished with 0
   end 
  Session topology: Ok

Signed-off-by: Jan Stancek 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Masami Hiramatsu 
Cc: Peter Zijlstra 
Link: 
http://lkml.kernel.org/r/d7c05c6445fca74a8442c2c73cfffd349c52c44f.1487146877.git.jstan...@redhat.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/builtin-stat.c   |  2 +-
 tools/perf/tests/topology.c |  4 +++-
 tools/perf/util/env.c   |  2 +-
 tools/perf/util/header.c| 16 +---
 4 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index f287191..ca27a8a 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, 
struct cpu_map *map, int idx)
 
cpu = map->map[idx];
 
-   if (cpu >= env->nr_cpus_online)
+   if (cpu >= env->nr_cpus_avail)
return -1;
 
return cpu;
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 98fe69a..803f893 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map)
session = perf_session__new(&file, false, NULL);
TEST_ASSERT_VAL("can't get session", session);
 
-   for (i = 0; i < session->header.env.nr_cpus_online; i++) {
+   for (i = 0; i < session->header.env.nr_cpus_avail; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
pr_debug("CPU %d, core %d, socket %d\n", i,
 session->header.env.cpu[i].core_id,
 session->header.env.cpu[i].socket_id);
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index bb964e8..075fc77 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env)
return 0;
 
if (env->nr_cpus_avail == 0)
-   env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF);
+   env->nr_cpus_avail = cpu__max_present_cpu();
 
nr_cpus = env->nr_cpus_avail;
if (nr_cpus == -1)
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 1222f6c..05714d5 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -295,11 +295,7 @@ static int write_nrcpus(int fd, struct perf_header *h 
__maybe_unused,
u32 nrc, nra;
int ret;
 
-   nr = sysconf(_SC_NPROCESSORS_CONF);
-   if (nr < 0)
-   return -1;
-
-   nrc = (u32)(nr & UINT_MAX);
+   nrc = cpu__max_present_cpu();
 
nr = sysconf(_SC

[PATCH v4 3/3] perf: Replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map

2017-02-20 Thread Jan Stancek
There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs:

1. offline/absent CPUs will have their socket_id and core_id set to -1
   which triggers:
   "socket_id number is too big.You may need to upgrade the perf tool."

2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on
   _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above.
   Users of perf_env.cpu[] are using CPU id as index. This can lead
   to read beyond what was allocated:
   ==19991== Invalid read of size 4
   ==19991==at 0x490CEB: check_cpu_topology (topology.c:69)
   ==19991==by 0x490CEB: test_session_topology (topology.c:106)
   ...

For example:
  _SC_NPROCESSORS_CONF == 16
  available: 2 nodes (0-1)
  node 0 cpus: 0 6 8 10 16 22 24 26
  node 0 size: 12004 MB
  node 0 free: 9470 MB
  node 1 cpus: 1 7 9 11 23 25 27
  node 1 size: 12093 MB
  node 1 free: 9406 MB
  node distances:
  node   0   1
0:  10  20
1:  20  10

This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF
to max_present_cpu and updates any user of cpu_topology_map to iterate
with nr_cpus_avail. As consequence HEADER_CPU_TOPOLOGY core_id and
socket_id lists get longer, but maintain compatibility with pre-patch
state - index to cpu_topology_map is CPU id.

  # ./perf test 36 -v
  36: Session topology   :
  --- start ---
  test child forked, pid 22211
  templ file: /tmp/perf-test-gmdX5i
  CPU 0, core 0, socket 0
  CPU 1, core 0, socket 1
  CPU 6, core 10, socket 0
  CPU 7, core 10, socket 1
  CPU 8, core 1, socket 0
  CPU 9, core 1, socket 1
  CPU 10, core 9, socket 0
  CPU 11, core 9, socket 1
  CPU 16, core 0, socket 0
  CPU 22, core 10, socket 0
  CPU 23, core 10, socket 1
  CPU 24, core 1, socket 0
  CPU 25, core 1, socket 1
  CPU 26, core 9, socket 0
  CPU 27, core 9, socket 1
  test child finished with 0
   end 
  Session topology: Ok

Signed-off-by: Jan Stancek <jstan...@redhat.com>
---
 tools/perf/builtin-stat.c   |  2 +-
 tools/perf/tests/topology.c |  4 +++-
 tools/perf/util/env.c   |  2 +-
 tools/perf/util/header.c| 20 +---
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a02f2e965628..9e2be1c52fbd 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, 
struct cpu_map *map, int idx)
 
cpu = map->map[idx];
 
-   if (cpu >= env->nr_cpus_online)
+   if (cpu >= env->nr_cpus_avail)
return -1;
 
return cpu;
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 98fe69ac553c..803f893550d6 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map)
session = perf_session__new(&file, false, NULL);
TEST_ASSERT_VAL("can't get session", session);
 
-   for (i = 0; i < session->header.env.nr_cpus_online; i++) {
+   for (i = 0; i < session->header.env.nr_cpus_avail; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
pr_debug("CPU %d, core %d, socket %d\n", i,
 session->header.env.cpu[i].core_id,
 session->header.env.cpu[i].socket_id);
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index bb964e86b09d..075fc77286bf 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env)
return 0;
 
if (env->nr_cpus_avail == 0)
-   env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF);
+   env->nr_cpus_avail = cpu__max_present_cpu();
 
nr_cpus = env->nr_cpus_avail;
if (nr_cpus == -1)
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index ca0f12fb5fd3..e97cbf78ef21 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -293,11 +293,7 @@ static int write_nrcpus(int fd, struct perf_header *h 
__maybe_unused,
u32 nrc, nra;
int ret;
 
-   nr = sysconf(_SC_NPROCESSORS_CONF);
-   if (nr < 0)
-   return -1;
-
-   nrc = (u32)(nr & UINT_MAX);
+   nrc = cpu__max_present_cpu();
 
nr = sysconf(_SC_NPROCESSORS_ONLN);
if (nr < 0)
@@ -511,9 +507,7 @@ static struct cpu_topo *build_cpu_topology(void)
int ret = -1;
struct cpu_map *map;
 
-   ncpus = sysconf(_SC_NPROCESSORS_CONF);
-   if (ncpus < 0)
-   return NULL;
+   ncpus = cpu__max_present_cpu();
 
/* build online CPU map */
map = cpu_map__new(NULL);
@@ -1137,7 +1131,7 @@ static void print_cpu_topology(struct perf_header *ph, 
int fd __maybe_unused,
 {
int nr, i;
char *str;
-   int cpu_nr = ph->env.nr_cpus_online;
+   int cpu_nr = ph->env.nr_cpus_avail;

[PATCH v4 2/3] perf header: Make build_cpu_topology skip offline/absent CPUs

2017-02-20 Thread Jan Stancek
When build_cpu_topo() encounters offline/absent CPUs,
it fails to find any sysfs entries and returns failure.
This leads to build_cpu_topology() and write_cpu_topology()
failing as well.

Because HEADER_CPU_TOPOLOGY has not been written, read leaves
cpu_topology_map NULL and we get NULL ptr deref at:
 ...
  cmd_test
   __cmd_test
test_and_print
 run_test
  test_session_topology
   check_cpu_topology

  # perf test 36
  36: Session topology   :
  --- start ---
  test child forked, pid 14902
  templ file: /tmp/perf-test-4CKocW
  failed to write feature HEADER_CPU_TOPOLOGY
  perf: Segmentation fault
  Obtained 9 stack frames.
  ./perf(sighandler_dump_stack+0x41) [0x5095f1]
  /lib64/libc.so.6(+0x35250) [0x7f4b7c3c9250]
  ./perf(test_session_topology+0x1db) [0x490ceb]
  ./perf() [0x475b68]
  ./perf(cmd_test+0x5b9) [0x4763c9]
  ./perf() [0x4945a3]
  ./perf(main+0x69f) [0x427e8f]
  /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f4b7c3b5b35]
  ./perf() [0x427fb9]
  test child interrupted
   end 
  Session topology: FAILED!

This patch makes build_cpu_topology() skip offline/absent CPUs,
by checking their presence against cpu_map built from online CPUs.

Signed-off-by: Jan Stancek <jstan...@redhat.com>
---
 tools/perf/util/header.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index d89c9c7ef4e5..ca0f12fb5fd3 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -503,24 +503,31 @@ static void free_cpu_topo(struct cpu_topo *tp)
 
 static struct cpu_topo *build_cpu_topology(void)
 {
-   struct cpu_topo *tp;
+   struct cpu_topo *tp = NULL;
void *addr;
u32 nr, i;
size_t sz;
long ncpus;
int ret = -1;
+   struct cpu_map *map;
 
ncpus = sysconf(_SC_NPROCESSORS_CONF);
if (ncpus < 0)
return NULL;
 
+   /* build online CPU map */
+   map = cpu_map__new(NULL);
+   if (map == NULL) {
+   pr_debug("failed to get system cpumap\n");
+   return NULL;
+   }
+
nr = (u32)(ncpus & UINT_MAX);
 
sz = nr * sizeof(char *);
-
addr = calloc(1, sizeof(*tp) + 2 * sz);
if (!addr)
-   return NULL;
+   goto out_free;
 
tp = addr;
tp->cpu_nr = nr;
@@ -530,10 +537,16 @@ static struct cpu_topo *build_cpu_topology(void)
tp->thread_siblings = addr;
 
for (i = 0; i < nr; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
+
ret = build_cpu_topo(tp, i);
if (ret < 0)
break;
}
+
+out_free:
+   cpu_map__put(map);
if (ret) {
free_cpu_topo(tp);
tp = NULL;
-- 
1.8.3.1



[PATCH v4 1/3] perf cpu_map: Add cpu__max_present_cpu()

2017-02-20 Thread Jan Stancek
Similar to cpu__max_cpu() (which returns max possible CPU),
returns max present CPU.

Signed-off-by: Jan Stancek <jstan...@redhat.com>
---
 tools/perf/util/cpumap.c | 22 ++
 tools/perf/util/cpumap.h |  1 +
 2 files changed, 23 insertions(+)

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 2c0b52264a46..8c7504939113 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -9,6 +9,7 @@
 #include "asm/bug.h"
 
 static int max_cpu_num;
+static int max_present_cpu_num;
 static int max_node_num;
 static int *cpunode_map;
 
@@ -442,6 +443,7 @@ static void set_max_cpu_num(void)
 
/* set up default */
max_cpu_num = 4096;
+   max_present_cpu_num = 4096;
 
mnt = sysfs__mountpoint();
if (!mnt)
@@ -455,6 +457,17 @@ static void set_max_cpu_num(void)
}
 
ret = get_max_num(path, &max_cpu_num);
+   if (ret)
+   goto out;
+
+   /* get the highest present cpu number for a sparse allocation */
+   ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt);
+   if (ret == PATH_MAX) {
+   pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX);
+   goto out;
+   }
+
+   ret = get_max_num(path, &max_present_cpu_num);
 
 out:
if (ret)
@@ -505,6 +518,15 @@ int cpu__max_cpu(void)
return max_cpu_num;
 }
 
+int cpu__max_present_cpu(void)
+{
+   if (unlikely(!max_present_cpu_num))
+   set_max_cpu_num();
+
+   return max_present_cpu_num;
+}
+
+
 int cpu__get_node(int cpu)
 {
if (unlikely(cpunode_map == NULL)) {
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 06bd689f5989..1a0549af8f5c 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -62,6 +62,7 @@ static inline bool cpu_map__empty(const struct cpu_map *map)
 
 int cpu__max_node(void);
 int cpu__max_cpu(void);
+int cpu__max_present_cpu(void);
 int cpu__get_node(int cpu);
 
 int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
-- 
1.8.3.1



[PATCH v4 3/3] perf: Replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map

2017-02-20 Thread Jan Stancek
There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs:

1. offline/absent CPUs will have their socket_id and core_id set to -1
   which triggers:
   "socket_id number is too big.You may need to upgrade the perf tool."

2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on
   _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above.
   Users of perf_env.cpu[] are using CPU id as index. This can lead
   to read beyond what was allocated:
   ==19991== Invalid read of size 4
   ==19991==at 0x490CEB: check_cpu_topology (topology.c:69)
   ==19991==by 0x490CEB: test_session_topology (topology.c:106)
   ...

For example:
  _SC_NPROCESSORS_CONF == 16
  available: 2 nodes (0-1)
  node 0 cpus: 0 6 8 10 16 22 24 26
  node 0 size: 12004 MB
  node 0 free: 9470 MB
  node 1 cpus: 1 7 9 11 23 25 27
  node 1 size: 12093 MB
  node 1 free: 9406 MB
  node distances:
  node   0   1
0:  10  20
1:  20  10

This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF
to max_present_cpu and updates any user of cpu_topology_map to iterate
with nr_cpus_avail. As consequence HEADER_CPU_TOPOLOGY core_id and
socket_id lists get longer, but maintain compatibility with pre-patch
state - index to cpu_topology_map is CPU id.

  # ./perf test 36 -v
  36: Session topology   :
  --- start ---
  test child forked, pid 22211
  templ file: /tmp/perf-test-gmdX5i
  CPU 0, core 0, socket 0
  CPU 1, core 0, socket 1
  CPU 6, core 10, socket 0
  CPU 7, core 10, socket 1
  CPU 8, core 1, socket 0
  CPU 9, core 1, socket 1
  CPU 10, core 9, socket 0
  CPU 11, core 9, socket 1
  CPU 16, core 0, socket 0
  CPU 22, core 10, socket 0
  CPU 23, core 10, socket 1
  CPU 24, core 1, socket 0
  CPU 25, core 1, socket 1
  CPU 26, core 9, socket 0
  CPU 27, core 9, socket 1
  test child finished with 0
   end 
  Session topology: Ok

Signed-off-by: Jan Stancek 
---
 tools/perf/builtin-stat.c   |  2 +-
 tools/perf/tests/topology.c |  4 +++-
 tools/perf/util/env.c   |  2 +-
 tools/perf/util/header.c| 20 +---
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a02f2e965628..9e2be1c52fbd 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, 
struct cpu_map *map, int idx)
 
cpu = map->map[idx];
 
-   if (cpu >= env->nr_cpus_online)
+   if (cpu >= env->nr_cpus_avail)
return -1;
 
return cpu;
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 98fe69ac553c..803f893550d6 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map)
session = perf_session__new(&file, false, NULL);
TEST_ASSERT_VAL("can't get session", session);
 
-   for (i = 0; i < session->header.env.nr_cpus_online; i++) {
+   for (i = 0; i < session->header.env.nr_cpus_avail; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
pr_debug("CPU %d, core %d, socket %d\n", i,
 session->header.env.cpu[i].core_id,
 session->header.env.cpu[i].socket_id);
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index bb964e86b09d..075fc77286bf 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env)
return 0;
 
if (env->nr_cpus_avail == 0)
-   env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF);
+   env->nr_cpus_avail = cpu__max_present_cpu();
 
nr_cpus = env->nr_cpus_avail;
if (nr_cpus == -1)
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index ca0f12fb5fd3..e97cbf78ef21 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -293,11 +293,7 @@ static int write_nrcpus(int fd, struct perf_header *h 
__maybe_unused,
u32 nrc, nra;
int ret;
 
-   nr = sysconf(_SC_NPROCESSORS_CONF);
-   if (nr < 0)
-   return -1;
-
-   nrc = (u32)(nr & UINT_MAX);
+   nrc = cpu__max_present_cpu();
 
nr = sysconf(_SC_NPROCESSORS_ONLN);
if (nr < 0)
@@ -511,9 +507,7 @@ static struct cpu_topo *build_cpu_topology(void)
int ret = -1;
struct cpu_map *map;
 
-   ncpus = sysconf(_SC_NPROCESSORS_CONF);
-   if (ncpus < 0)
-   return NULL;
+   ncpus = cpu__max_present_cpu();
 
/* build online CPU map */
map = cpu_map__new(NULL);
@@ -1137,7 +1131,7 @@ static void print_cpu_topology(struct perf_header *ph, 
int fd __maybe_unused,
 {
int nr, i;
char *str;
-   int cpu_nr = ph->env.nr_cpus_online;
+   int cpu_nr = ph->env.nr_cpus_avail;

[PATCH v4 2/3] perf header: Make build_cpu_topology skip offline/absent CPUs

2017-02-20 Thread Jan Stancek
When build_cpu_topo() encounters offline/absent CPUs,
it fails to find any sysfs entries and returns failure.
This leads to build_cpu_topology() and write_cpu_topology()
failing as well.

Because HEADER_CPU_TOPOLOGY has not been written, read leaves
cpu_topology_map NULL and we get NULL ptr deref at:
 ...
  cmd_test
   __cmd_test
test_and_print
 run_test
  test_session_topology
   check_cpu_topology

  # perf test 36
  36: Session topology   :
  --- start ---
  test child forked, pid 14902
  templ file: /tmp/perf-test-4CKocW
  failed to write feature HEADER_CPU_TOPOLOGY
  perf: Segmentation fault
  Obtained 9 stack frames.
  ./perf(sighandler_dump_stack+0x41) [0x5095f1]
  /lib64/libc.so.6(+0x35250) [0x7f4b7c3c9250]
  ./perf(test_session_topology+0x1db) [0x490ceb]
  ./perf() [0x475b68]
  ./perf(cmd_test+0x5b9) [0x4763c9]
  ./perf() [0x4945a3]
  ./perf(main+0x69f) [0x427e8f]
  /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f4b7c3b5b35]
  ./perf() [0x427fb9]
  test child interrupted
   end 
  Session topology: FAILED!

This patch makes build_cpu_topology() skip offline/absent CPUs,
by checking their presence against cpu_map built from online CPUs.

Signed-off-by: Jan Stancek 
---
 tools/perf/util/header.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index d89c9c7ef4e5..ca0f12fb5fd3 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -503,24 +503,31 @@ static void free_cpu_topo(struct cpu_topo *tp)
 
 static struct cpu_topo *build_cpu_topology(void)
 {
-   struct cpu_topo *tp;
+   struct cpu_topo *tp = NULL;
void *addr;
u32 nr, i;
size_t sz;
long ncpus;
int ret = -1;
+   struct cpu_map *map;
 
ncpus = sysconf(_SC_NPROCESSORS_CONF);
if (ncpus < 0)
return NULL;
 
+   /* build online CPU map */
+   map = cpu_map__new(NULL);
+   if (map == NULL) {
+   pr_debug("failed to get system cpumap\n");
+   return NULL;
+   }
+
nr = (u32)(ncpus & UINT_MAX);
 
sz = nr * sizeof(char *);
-
addr = calloc(1, sizeof(*tp) + 2 * sz);
if (!addr)
-   return NULL;
+   goto out_free;
 
tp = addr;
tp->cpu_nr = nr;
@@ -530,10 +537,16 @@ static struct cpu_topo *build_cpu_topology(void)
tp->thread_siblings = addr;
 
for (i = 0; i < nr; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
+
ret = build_cpu_topo(tp, i);
if (ret < 0)
break;
}
+
+out_free:
+   cpu_map__put(map);
if (ret) {
free_cpu_topo(tp);
tp = NULL;
-- 
1.8.3.1



[PATCH v4 1/3] perf cpu_map: Add cpu__max_present_cpu()

2017-02-20 Thread Jan Stancek
Similar to cpu__max_cpu() (which returns max possible CPU),
returns max present CPU.

Signed-off-by: Jan Stancek 
---
 tools/perf/util/cpumap.c | 22 ++
 tools/perf/util/cpumap.h |  1 +
 2 files changed, 23 insertions(+)

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 2c0b52264a46..8c7504939113 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -9,6 +9,7 @@
 #include "asm/bug.h"
 
 static int max_cpu_num;
+static int max_present_cpu_num;
 static int max_node_num;
 static int *cpunode_map;
 
@@ -442,6 +443,7 @@ static void set_max_cpu_num(void)
 
/* set up default */
max_cpu_num = 4096;
+   max_present_cpu_num = 4096;
 
mnt = sysfs__mountpoint();
if (!mnt)
@@ -455,6 +457,17 @@ static void set_max_cpu_num(void)
}
 
ret = get_max_num(path, &max_cpu_num);
+   if (ret)
+   goto out;
+
+   /* get the highest present cpu number for a sparse allocation */
+   ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt);
+   if (ret == PATH_MAX) {
+   pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX);
+   goto out;
+   }
+
+   ret = get_max_num(path, &max_present_cpu_num);
 
 out:
if (ret)
@@ -505,6 +518,15 @@ int cpu__max_cpu(void)
return max_cpu_num;
 }
 
+int cpu__max_present_cpu(void)
+{
+   if (unlikely(!max_present_cpu_num))
+   set_max_cpu_num();
+
+   return max_present_cpu_num;
+}
+
+
 int cpu__get_node(int cpu)
 {
if (unlikely(cpunode_map == NULL)) {
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 06bd689f5989..1a0549af8f5c 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -62,6 +62,7 @@ static inline bool cpu_map__empty(const struct cpu_map *map)
 
 int cpu__max_node(void);
 int cpu__max_cpu(void);
+int cpu__max_present_cpu(void);
 int cpu__get_node(int cpu);
 
 int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
-- 
1.8.3.1



[PATCH v4 0/3] perf: topology on systems with offline/absent CPUs

2017-02-20 Thread Jan Stancek
Changes in v4:
- rebase on top of 4.10
- fix style of commit logs
- add comment to process_cpu_topology()

This series aims to address issues on systems with offline/absent CPUs:
1. cpu_topology_map size
   Allocation based on _SC_NPROCESSORS_CONF is too small,
   there can be CPU IDs that are higher, which causes invalid reads. 
2. build_cpu_topology fails for offline/absent CPUs
   It is trying to parse sysfs entries, that do not exist.
3. perf topology test fails

Changes affect HEADER_NRCPUS and HEADER_CPU_TOPOLOGY:

For example on a system like this:
  _SC_NPROCESSORS_CONF == 16
  available: 2 nodes (0-1)
  node 0 cpus: 0 6 8 10 16 22 24 26
  node 0 size: 12004 MB
  node 0 free: 9470 MB
  node 1 cpus: 1 7 9 11 23 25 27
  node 1 size: 12093 MB
  node 1 free: 9406 MB
  node distances:
  node   0   1
0:  10  20
1:  20  10

HEADER_NRCPUS before this series:
nr_cpus_online = 15
nr_cpus_available = 16

HEADER_CPU_TOPOLOGY before this series:
core_sib_nr: 2
core_siblings: 0,6,8,10,16,22,24,26
core_siblings: 1,7,9,11,23,25,27
thread_sib_nr: 8
thread_siblings: 0,16
thread_siblings: 1
thread_siblings: 6,22
thread_siblings: 7,23
thread_siblings: 8,24
thread_siblings: 9,25
thread_siblings: 10,26
thread_siblings: 11,27
core_id: 0, socket_id: 0
core_id: 0, socket_id: 1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: 10, socket_id: 0
core_id: 10, socket_id: 1
core_id: 1, socket_id: 0
core_id: 1, socket_id: 1
core_id: 9, socket_id: 0
core_id: 9, socket_id: 1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1

HEADER_NRCPUS with this series applied:
nr_cpus_online = 15
nr_cpus_available = 28

HEADER_CPU_TOPOLOGY with this series applied:
core_sib_nr: 2
core_siblings: 0,6,8,10,16,22,24,26
core_siblings: 1,7,9,11,23,25,27
thread_sib_nr: 8
thread_siblings: 0,16
thread_siblings: 1
thread_siblings: 6,22
thread_siblings: 7,23
thread_siblings: 8,24
thread_siblings: 9,25
thread_siblings: 10,26
thread_siblings: 11,27
core_id: 0, socket_id: 0
core_id: 0, socket_id: 1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: 10, socket_id: 0
core_id: 10, socket_id: 1
core_id: 1, socket_id: 0
core_id: 1, socket_id: 1
core_id: 9, socket_id: 0
core_id: 9, socket_id: 1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: 0, socket_id: 0
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: 10, socket_id: 0
core_id: 10, socket_id: 1
core_id: 1, socket_id: 0
core_id: 1, socket_id: 1
core_id: 9, socket_id: 0
core_id: 9, socket_id: 1

Jan Stancek (3):
  perf cpu_map: Add cpu__max_present_cpu()
  perf header: Make build_cpu_topology skip offline/absent CPUs
  perf: Replace _SC_NPROCESSORS_CONF with max_present_cpu in
cpu_topology_map

 tools/perf/builtin-stat.c   |  2 +-
 tools/perf/tests/topology.c |  4 +++-
 tools/perf/util/cpumap.c| 22 ++
 tools/perf/util/cpumap.h|  1 +
 tools/perf/util/env.c   |  2 +-
 tools/perf/util/header.c| 37 -
 6 files changed, 52 insertions(+), 16 deletions(-)

-- 
1.8.3.1



[PATCH v4 0/3] perf: topology on systems with offline/absent CPUs

2017-02-20 Thread Jan Stancek
Changes in v4:
- rebase on top of 4.10
- fix style of commit logs
- add comment to process_cpu_topology()

This series aims to address issues on systems with offline/absent CPUs:
1. cpu_topology_map size
   Allocation based on _SC_NPROCESSORS_CONF is too small,
   there can be CPU IDs that are higher, which causes invalid reads. 
2. build_cpu_topology fails for offline/absent CPUs
   It is trying to parse sysfs entries, that do not exist.
3. perf topology test fails

Changes affect HEADER_NRCPUS and HEADER_CPU_TOPOLOGY:

For example on a system like this:
  _SC_NPROCESSORS_CONF == 16
  available: 2 nodes (0-1)
  node 0 cpus: 0 6 8 10 16 22 24 26
  node 0 size: 12004 MB
  node 0 free: 9470 MB
  node 1 cpus: 1 7 9 11 23 25 27
  node 1 size: 12093 MB
  node 1 free: 9406 MB
  node distances:
  node   0   1
0:  10  20
1:  20  10

HEADER_NRCPUS before this series:
nr_cpus_online = 15
nr_cpus_available = 16

HEADER_CPU_TOPOLOGY before this series:
core_sib_nr: 2
core_siblings: 0,6,8,10,16,22,24,26
core_siblings: 1,7,9,11,23,25,27
thread_sib_nr: 8
thread_siblings: 0,16
thread_siblings: 1
thread_siblings: 6,22
thread_siblings: 7,23
thread_siblings: 8,24
thread_siblings: 9,25
thread_siblings: 10,26
thread_siblings: 11,27
core_id: 0, socket_id: 0
core_id: 0, socket_id: 1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: 10, socket_id: 0
core_id: 10, socket_id: 1
core_id: 1, socket_id: 0
core_id: 1, socket_id: 1
core_id: 9, socket_id: 0
core_id: 9, socket_id: 1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1

HEADER_NRCPUS with this series applied:
nr_cpus_online = 15
nr_cpus_available = 28

HEADER_CPU_TOPOLOGY with this series applied:
core_sib_nr: 2
core_siblings: 0,6,8,10,16,22,24,26
core_siblings: 1,7,9,11,23,25,27
thread_sib_nr: 8
thread_siblings: 0,16
thread_siblings: 1
thread_siblings: 6,22
thread_siblings: 7,23
thread_siblings: 8,24
thread_siblings: 9,25
thread_siblings: 10,26
thread_siblings: 11,27
core_id: 0, socket_id: 0
core_id: 0, socket_id: 1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: 10, socket_id: 0
core_id: 10, socket_id: 1
core_id: 1, socket_id: 0
core_id: 1, socket_id: 1
core_id: 9, socket_id: 0
core_id: 9, socket_id: 1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: 0, socket_id: 0
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: 10, socket_id: 0
core_id: 10, socket_id: 1
core_id: 1, socket_id: 0
core_id: 1, socket_id: 1
core_id: 9, socket_id: 0
core_id: 9, socket_id: 1

Jan Stancek (3):
  perf cpu_map: Add cpu__max_present_cpu()
  perf header: Make build_cpu_topology skip offline/absent CPUs
  perf: Replace _SC_NPROCESSORS_CONF with max_present_cpu in
cpu_topology_map

 tools/perf/builtin-stat.c   |  2 +-
 tools/perf/tests/topology.c |  4 +++-
 tools/perf/util/cpumap.c| 22 ++
 tools/perf/util/cpumap.h|  1 +
 tools/perf/util/env.c   |  2 +-
 tools/perf/util/header.c| 37 -
 6 files changed, 52 insertions(+), 16 deletions(-)

-- 
1.8.3.1



[PATCH v3 2/3] perf: make build_cpu_topology skip offline/absent CPUs

2017-02-17 Thread Jan Stancek
When build_cpu_topo() encounters offline/absent CPUs,
it fails to find any sysfs entries and returns failure.
This leads to build_cpu_topology() and write_cpu_topology()
failing as well.

Because HEADER_CPU_TOPOLOGY has not been written, read leaves
cpu_topology_map NULL and we get NULL ptr deref at:

 ...
  cmd_test
   __cmd_test
test_and_print
 run_test
  test_session_topology
   check_cpu_topology

36: Session topology   :
--- start ---
test child forked, pid 14902
templ file: /tmp/perf-test-4CKocW
failed to write feature HEADER_CPU_TOPOLOGY
perf: Segmentation fault
Obtained 9 stack frames.
./perf(sighandler_dump_stack+0x41) [0x5095f1]
/lib64/libc.so.6(+0x35250) [0x7f4b7c3c9250]
./perf(test_session_topology+0x1db) [0x490ceb]
./perf() [0x475b68]
./perf(cmd_test+0x5b9) [0x4763c9]
./perf() [0x4945a3]
./perf(main+0x69f) [0x427e8f]
/lib64/libc.so.6(__libc_start_main+0xf5) [0x7f4b7c3b5b35]
./perf() [0x427fb9]
test child interrupted
 end 
Session topology: FAILED!

This patch makes build_cpu_topology() skip offline/absent CPUs,
by checking their presence against cpu_map built from online CPUs.

Signed-off-by: Jan Stancek <jstan...@redhat.com>
---
 tools/perf/util/header.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

Changes in v3:
- make ret = -1 by default, so we fail in case build_cpu_topo
  doesn't run at all

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index d89c9c7ef4e5..ca0f12fb5fd3 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -503,24 +503,31 @@ static void free_cpu_topo(struct cpu_topo *tp)
 
 static struct cpu_topo *build_cpu_topology(void)
 {
-   struct cpu_topo *tp;
+   struct cpu_topo *tp = NULL;
void *addr;
u32 nr, i;
size_t sz;
long ncpus;
int ret = -1;
+   struct cpu_map *map;
 
ncpus = sysconf(_SC_NPROCESSORS_CONF);
if (ncpus < 0)
return NULL;
 
+   /* build online CPU map */
+   map = cpu_map__new(NULL);
+   if (map == NULL) {
+   pr_debug("failed to get system cpumap\n");
+   return NULL;
+   }
+
nr = (u32)(ncpus & UINT_MAX);
 
sz = nr * sizeof(char *);
-
addr = calloc(1, sizeof(*tp) + 2 * sz);
if (!addr)
-   return NULL;
+   goto out_free;
 
tp = addr;
tp->cpu_nr = nr;
@@ -530,10 +537,16 @@ static struct cpu_topo *build_cpu_topology(void)
tp->thread_siblings = addr;
 
for (i = 0; i < nr; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
+
ret = build_cpu_topo(tp, i);
if (ret < 0)
break;
}
+
+out_free:
+   cpu_map__put(map);
if (ret) {
free_cpu_topo(tp);
tp = NULL;
-- 
1.8.3.1



[PATCH v3 1/3] perf: add cpu__max_present_cpu()

2017-02-17 Thread Jan Stancek
Similar to cpu__max_cpu() (which returns max possible CPU),
returns max present CPU.

Signed-off-by: Jan Stancek <jstan...@redhat.com>
---
 tools/perf/util/cpumap.c | 22 ++
 tools/perf/util/cpumap.h |  1 +
 2 files changed, 23 insertions(+)

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 2c0b52264a46..8c7504939113 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -9,6 +9,7 @@
 #include "asm/bug.h"
 
 static int max_cpu_num;
+static int max_present_cpu_num;
 static int max_node_num;
 static int *cpunode_map;
 
@@ -442,6 +443,7 @@ static void set_max_cpu_num(void)
 
/* set up default */
max_cpu_num = 4096;
+   max_present_cpu_num = 4096;
 
mnt = sysfs__mountpoint();
if (!mnt)
@@ -455,6 +457,17 @@ static void set_max_cpu_num(void)
}
 
ret = get_max_num(path, &max_cpu_num);
+   if (ret)
+   goto out;
+
+   /* get the highest present cpu number for a sparse allocation */
+   ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt);
+   if (ret == PATH_MAX) {
+   pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX);
+   goto out;
+   }
+
+   ret = get_max_num(path, &max_present_cpu_num);
 
 out:
if (ret)
@@ -505,6 +518,15 @@ int cpu__max_cpu(void)
return max_cpu_num;
 }
 
+int cpu__max_present_cpu(void)
+{
+   if (unlikely(!max_present_cpu_num))
+   set_max_cpu_num();
+
+   return max_present_cpu_num;
+}
+
+
 int cpu__get_node(int cpu)
 {
if (unlikely(cpunode_map == NULL)) {
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 06bd689f5989..1a0549af8f5c 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -62,6 +62,7 @@ static inline bool cpu_map__empty(const struct cpu_map *map)
 
 int cpu__max_node(void);
 int cpu__max_cpu(void);
+int cpu__max_present_cpu(void);
 int cpu__get_node(int cpu);
 
 int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
-- 
1.8.3.1



[PATCH v3 3/3] perf: replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map

2017-02-17 Thread Jan Stancek
There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs:

1. offline/absent CPUs will have their socket_id and core_id set to -1
   which triggers:
   "socket_id number is too big.You may need to upgrade the perf tool."

2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on
   _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above.
   Users of perf_env.cpu[] are using CPU id as index. This can lead
   to read beyond what was allocated:
   ==19991== Invalid read of size 4
   ==19991==at 0x490CEB: check_cpu_topology (topology.c:69)
   ==19991==by 0x490CEB: test_session_topology (topology.c:106)
   ...

For example:
  _SC_NPROCESSORS_CONF == 16
  available: 2 nodes (0-1)
  node 0 cpus: 0 6 8 10 16 22 24 26
  node 0 size: 12004 MB
  node 0 free: 9470 MB
  node 1 cpus: 1 7 9 11 23 25 27
  node 1 size: 12093 MB
  node 1 free: 9406 MB
  node distances:
  node   0   1
0:  10  20
1:  20  10

This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF
to max_present_cpu and updates any user of cpu_topology_map to iterate
with nr_cpus_avail. As consequence HEADER_CPU_TOPOLOGY core_id and
socket_id lists get longer, but maintain compatibility with pre-patch
state - index to cpu_topology_map is CPU id.

 ./perf test 36 -v
36: Session topology   :
--- start ---
test child forked, pid 22211
templ file: /tmp/perf-test-gmdX5i
CPU 0, core 0, socket 0
CPU 1, core 0, socket 1
CPU 6, core 10, socket 0
CPU 7, core 10, socket 1
CPU 8, core 1, socket 0
CPU 9, core 1, socket 1
CPU 10, core 9, socket 0
CPU 11, core 9, socket 1
CPU 16, core 0, socket 0
CPU 22, core 10, socket 0
CPU 23, core 10, socket 1
CPU 24, core 1, socket 0
CPU 25, core 1, socket 1
CPU 26, core 9, socket 0
CPU 27, core 9, socket 1
test child finished with 0
 end 
Session topology: Ok

Signed-off-by: Jan Stancek <jstan...@redhat.com>
---
 tools/perf/builtin-stat.c   |  2 +-
 tools/perf/tests/topology.c |  4 +++-
 tools/perf/util/env.c   |  2 +-
 tools/perf/util/header.c| 16 +---
 4 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a02f2e965628..9e2be1c52fbd 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, 
struct cpu_map *map, int idx)
 
cpu = map->map[idx];
 
-   if (cpu >= env->nr_cpus_online)
+   if (cpu >= env->nr_cpus_avail)
return -1;
 
return cpu;
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 98fe69ac553c..803f893550d6 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map)
session = perf_session__new(&file, false, NULL);
TEST_ASSERT_VAL("can't get session", session);
 
-   for (i = 0; i < session->header.env.nr_cpus_online; i++) {
+   for (i = 0; i < session->header.env.nr_cpus_avail; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
pr_debug("CPU %d, core %d, socket %d\n", i,
 session->header.env.cpu[i].core_id,
 session->header.env.cpu[i].socket_id);
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index bb964e86b09d..075fc77286bf 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env)
return 0;
 
if (env->nr_cpus_avail == 0)
-   env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF);
+   env->nr_cpus_avail = cpu__max_present_cpu();
 
nr_cpus = env->nr_cpus_avail;
if (nr_cpus == -1)
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index ca0f12fb5fd3..3a79a0f10bf6 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -293,11 +293,7 @@ static int write_nrcpus(int fd, struct perf_header *h 
__maybe_unused,
u32 nrc, nra;
int ret;
 
-   nr = sysconf(_SC_NPROCESSORS_CONF);
-   if (nr < 0)
-   return -1;
-
-   nrc = (u32)(nr & UINT_MAX);
+   nrc = cpu__max_present_cpu();
 
nr = sysconf(_SC_NPROCESSORS_ONLN);
if (nr < 0)
@@ -511,9 +507,7 @@ static struct cpu_topo *build_cpu_topology(void)
int ret = -1;
struct cpu_map *map;
 
-   ncpus = sysconf(_SC_NPROCESSORS_CONF);
-   if (ncpus < 0)
-   return NULL;
+   ncpus = cpu__max_present_cpu();
 
/* build online CPU map */
map = cpu_map__new(NULL);
@@ -1137,7 +1131,7 @@ static void print_cpu_topology(struct perf_header *ph, 
int fd __maybe_unused,
 {
int nr, i;
char *str;
-   int cpu_nr = ph->env.nr_cpus_online;
+   int cpu_nr = ph

[PATCH v3 2/3] perf: make build_cpu_topology skip offline/absent CPUs

2017-02-17 Thread Jan Stancek
When build_cpu_topo() encounters offline/absent CPUs,
it fails to find any sysfs entries and returns failure.
This leads to build_cpu_topology() and write_cpu_topology()
failing as well.

Because HEADER_CPU_TOPOLOGY has not been written, read leaves
cpu_topology_map NULL and we get NULL ptr deref at:

 ...
  cmd_test
   __cmd_test
test_and_print
 run_test
  test_session_topology
   check_cpu_topology

36: Session topology   :
--- start ---
test child forked, pid 14902
templ file: /tmp/perf-test-4CKocW
failed to write feature HEADER_CPU_TOPOLOGY
perf: Segmentation fault
Obtained 9 stack frames.
./perf(sighandler_dump_stack+0x41) [0x5095f1]
/lib64/libc.so.6(+0x35250) [0x7f4b7c3c9250]
./perf(test_session_topology+0x1db) [0x490ceb]
./perf() [0x475b68]
./perf(cmd_test+0x5b9) [0x4763c9]
./perf() [0x4945a3]
./perf(main+0x69f) [0x427e8f]
/lib64/libc.so.6(__libc_start_main+0xf5) [0x7f4b7c3b5b35]
./perf() [0x427fb9]
test child interrupted
 end 
Session topology: FAILED!

This patch makes build_cpu_topology() skip offline/absent CPUs,
by checking their presence against cpu_map built from online CPUs.

Signed-off-by: Jan Stancek 
---
 tools/perf/util/header.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

Changes in v3:
- make ret = -1 by default, so we fail in case build_cpu_topo
  doesn't run at all

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index d89c9c7ef4e5..ca0f12fb5fd3 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -503,24 +503,31 @@ static void free_cpu_topo(struct cpu_topo *tp)
 
 static struct cpu_topo *build_cpu_topology(void)
 {
-   struct cpu_topo *tp;
+   struct cpu_topo *tp = NULL;
void *addr;
u32 nr, i;
size_t sz;
long ncpus;
int ret = -1;
+   struct cpu_map *map;
 
ncpus = sysconf(_SC_NPROCESSORS_CONF);
if (ncpus < 0)
return NULL;
 
+   /* build online CPU map */
+   map = cpu_map__new(NULL);
+   if (map == NULL) {
+   pr_debug("failed to get system cpumap\n");
+   return NULL;
+   }
+
nr = (u32)(ncpus & UINT_MAX);
 
sz = nr * sizeof(char *);
-
addr = calloc(1, sizeof(*tp) + 2 * sz);
if (!addr)
-   return NULL;
+   goto out_free;
 
tp = addr;
tp->cpu_nr = nr;
@@ -530,10 +537,16 @@ static struct cpu_topo *build_cpu_topology(void)
tp->thread_siblings = addr;
 
for (i = 0; i < nr; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
+
ret = build_cpu_topo(tp, i);
if (ret < 0)
break;
}
+
+out_free:
+   cpu_map__put(map);
if (ret) {
free_cpu_topo(tp);
tp = NULL;
-- 
1.8.3.1



[PATCH v3 1/3] perf: add cpu__max_present_cpu()

2017-02-17 Thread Jan Stancek
Similar to cpu__max_cpu() (which returns max possible CPU),
returns max present CPU.

Signed-off-by: Jan Stancek 
---
 tools/perf/util/cpumap.c | 22 ++
 tools/perf/util/cpumap.h |  1 +
 2 files changed, 23 insertions(+)

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 2c0b52264a46..8c7504939113 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -9,6 +9,7 @@
 #include "asm/bug.h"
 
 static int max_cpu_num;
+static int max_present_cpu_num;
 static int max_node_num;
 static int *cpunode_map;
 
@@ -442,6 +443,7 @@ static void set_max_cpu_num(void)
 
/* set up default */
max_cpu_num = 4096;
+   max_present_cpu_num = 4096;
 
mnt = sysfs__mountpoint();
if (!mnt)
@@ -455,6 +457,17 @@ static void set_max_cpu_num(void)
}
 
	ret = get_max_num(path, &max_cpu_num);
+   if (ret)
+   goto out;
+
+   /* get the highest present cpu number for a sparse allocation */
+   ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt);
+   if (ret == PATH_MAX) {
+   pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX);
+   goto out;
+   }
+
	ret = get_max_num(path, &max_present_cpu_num);
 
 out:
if (ret)
@@ -505,6 +518,15 @@ int cpu__max_cpu(void)
return max_cpu_num;
 }
 
+int cpu__max_present_cpu(void)
+{
+   if (unlikely(!max_present_cpu_num))
+   set_max_cpu_num();
+
+   return max_present_cpu_num;
+}
+
+
 int cpu__get_node(int cpu)
 {
if (unlikely(cpunode_map == NULL)) {
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 06bd689f5989..1a0549af8f5c 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -62,6 +62,7 @@ static inline bool cpu_map__empty(const struct cpu_map *map)
 
 int cpu__max_node(void);
 int cpu__max_cpu(void);
+int cpu__max_present_cpu(void);
 int cpu__get_node(int cpu);
 
 int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
-- 
1.8.3.1



[PATCH v3 3/3] perf: replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map

2017-02-17 Thread Jan Stancek
There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs:

1. offline/absent CPUs will have their socket_id and core_id set to -1
   which triggers:
   "socket_id number is too big.You may need to upgrade the perf tool."

2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on
   _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above.
   Users of perf_env.cpu[] are using CPU id as index. This can lead
   to read beyond what was allocated:
   ==19991== Invalid read of size 4
   ==19991==at 0x490CEB: check_cpu_topology (topology.c:69)
   ==19991==by 0x490CEB: test_session_topology (topology.c:106)
   ...

For example:
  _SC_NPROCESSORS_CONF == 16
  available: 2 nodes (0-1)
  node 0 cpus: 0 6 8 10 16 22 24 26
  node 0 size: 12004 MB
  node 0 free: 9470 MB
  node 1 cpus: 1 7 9 11 23 25 27
  node 1 size: 12093 MB
  node 1 free: 9406 MB
  node distances:
  node   0   1
0:  10  20
1:  20  10

This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF
to max_present_cpu and updates any user of cpu_topology_map to iterate
with nr_cpus_avail. As consequence HEADER_CPU_TOPOLOGY core_id and
socket_id lists get longer, but maintain compatibility with pre-patch
state - index to cpu_topology_map is CPU id.

 ./perf test 36 -v
36: Session topology   :
--- start ---
test child forked, pid 22211
templ file: /tmp/perf-test-gmdX5i
CPU 0, core 0, socket 0
CPU 1, core 0, socket 1
CPU 6, core 10, socket 0
CPU 7, core 10, socket 1
CPU 8, core 1, socket 0
CPU 9, core 1, socket 1
CPU 10, core 9, socket 0
CPU 11, core 9, socket 1
CPU 16, core 0, socket 0
CPU 22, core 10, socket 0
CPU 23, core 10, socket 1
CPU 24, core 1, socket 0
CPU 25, core 1, socket 1
CPU 26, core 9, socket 0
CPU 27, core 9, socket 1
test child finished with 0
 end 
Session topology: Ok

Signed-off-by: Jan Stancek 
---
 tools/perf/builtin-stat.c   |  2 +-
 tools/perf/tests/topology.c |  4 +++-
 tools/perf/util/env.c   |  2 +-
 tools/perf/util/header.c| 16 +---
 4 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a02f2e965628..9e2be1c52fbd 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, 
struct cpu_map *map, i
 
cpu = map->map[idx];
 
-   if (cpu >= env->nr_cpus_online)
+   if (cpu >= env->nr_cpus_avail)
return -1;
 
return cpu;
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 98fe69ac553c..803f893550d6 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map)
	session = perf_session__new(&file, false, NULL);
TEST_ASSERT_VAL("can't get session", session);
 
-   for (i = 0; i < session->header.env.nr_cpus_online; i++) {
+   for (i = 0; i < session->header.env.nr_cpus_avail; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
pr_debug("CPU %d, core %d, socket %d\n", i,
 session->header.env.cpu[i].core_id,
 session->header.env.cpu[i].socket_id);
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index bb964e86b09d..075fc77286bf 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env)
return 0;
 
if (env->nr_cpus_avail == 0)
-   env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF);
+   env->nr_cpus_avail = cpu__max_present_cpu();
 
nr_cpus = env->nr_cpus_avail;
if (nr_cpus == -1)
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index ca0f12fb5fd3..3a79a0f10bf6 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -293,11 +293,7 @@ static int write_nrcpus(int fd, struct perf_header *h 
__maybe_unused,
u32 nrc, nra;
int ret;
 
-   nr = sysconf(_SC_NPROCESSORS_CONF);
-   if (nr < 0)
-   return -1;
-
-   nrc = (u32)(nr & UINT_MAX);
+   nrc = cpu__max_present_cpu();
 
nr = sysconf(_SC_NPROCESSORS_ONLN);
if (nr < 0)
@@ -511,9 +507,7 @@ static struct cpu_topo *build_cpu_topology(void)
int ret = -1;
struct cpu_map *map;
 
-   ncpus = sysconf(_SC_NPROCESSORS_CONF);
-   if (ncpus < 0)
-   return NULL;
+   ncpus = cpu__max_present_cpu();
 
/* build online CPU map */
map = cpu_map__new(NULL);
@@ -1137,7 +1131,7 @@ static void print_cpu_topology(struct perf_header *ph, 
int fd __maybe_unused,
 {
int nr, i;
char *str;
-   int cpu_nr = ph->env.nr_cpus_online;
+   int cpu_nr = ph->env.nr_cpus_avail;
 

Re: [PATCH v2 2/3] perf: make build_cpu_topology skip offline/absent CPUs

2017-02-15 Thread Jan Stancek


- Original Message -
> From: "Jiri Olsa" <jo...@redhat.com>
> To: "Jan Stancek" <jstan...@redhat.com>
> Cc: linux-kernel@vger.kernel.org, pet...@infradead.org, mi...@redhat.com, 
> a...@kernel.org, "alexander shishkin"
> <alexander.shish...@linux.intel.com>, jo...@kernel.org, mhira...@kernel.org
> Sent: Tuesday, 14 February, 2017 12:01:10 PM
> Subject: Re: [PATCH v2 2/3] perf: make build_cpu_topology skip offline/absent 
> CPUs
> 
> On Mon, Feb 13, 2017 at 04:34:35PM +0100, Jan Stancek wrote:
> 
> SNIP
> 
> > This patch makes build_cpu_topology() skip offline/absent CPUs,
> > by checking their presence against cpu_map built from online CPUs.
> > 
> > Signed-off-by: Jan Stancek <jstan...@redhat.com>
> > ---
> >  tools/perf/util/header.c | 21 +
> >  1 file changed, 17 insertions(+), 4 deletions(-)
> > 
> > Changes in v2:
> > - drop out label, use return NULL where possible
> > 
> > diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
> > index d89c9c7ef4e5..4b0ea4e92e9d 100644
> > --- a/tools/perf/util/header.c
> > +++ b/tools/perf/util/header.c
> > @@ -503,24 +503,31 @@ static void free_cpu_topo(struct cpu_topo *tp)
> >  
> >  static struct cpu_topo *build_cpu_topology(void)
> >  {
> > -   struct cpu_topo *tp;
> > +   struct cpu_topo *tp = NULL;
> > void *addr;
> > u32 nr, i;
> > size_t sz;
> > long ncpus;
> > -   int ret = -1;
> > +   int ret = 0;
> 
> hum, shouldn't we fail if we don't find any online cpu?
> 
> jirka

You're right, there's no reason to change this. I'll send v3.


Re: [PATCH v2 2/3] perf: make build_cpu_topology skip offline/absent CPUs

2017-02-15 Thread Jan Stancek


- Original Message -
> From: "Jiri Olsa" 
> To: "Jan Stancek" 
> Cc: linux-kernel@vger.kernel.org, pet...@infradead.org, mi...@redhat.com, 
> a...@kernel.org, "alexander shishkin"
> , jo...@kernel.org, mhira...@kernel.org
> Sent: Tuesday, 14 February, 2017 12:01:10 PM
> Subject: Re: [PATCH v2 2/3] perf: make build_cpu_topology skip offline/absent 
> CPUs
> 
> On Mon, Feb 13, 2017 at 04:34:35PM +0100, Jan Stancek wrote:
> 
> SNIP
> 
> > This patch makes build_cpu_topology() skip offline/absent CPUs,
> > by checking their presence against cpu_map built from online CPUs.
> > 
> > Signed-off-by: Jan Stancek 
> > ---
> >  tools/perf/util/header.c | 21 +
> >  1 file changed, 17 insertions(+), 4 deletions(-)
> > 
> > Changes in v2:
> > - drop out label, use return NULL where possible
> > 
> > diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
> > index d89c9c7ef4e5..4b0ea4e92e9d 100644
> > --- a/tools/perf/util/header.c
> > +++ b/tools/perf/util/header.c
> > @@ -503,24 +503,31 @@ static void free_cpu_topo(struct cpu_topo *tp)
> >  
> >  static struct cpu_topo *build_cpu_topology(void)
> >  {
> > -   struct cpu_topo *tp;
> > +   struct cpu_topo *tp = NULL;
> > void *addr;
> > u32 nr, i;
> > size_t sz;
> > long ncpus;
> > -   int ret = -1;
> > +   int ret = 0;
> 
> hum, shouldn't we fail if we don't find any online cpu?
> 
> jirka

You're right, there's no reason to change this. I'll send v3.


[PATCH v2 1/3] perf: add cpu__max_present_cpu()

2017-02-13 Thread Jan Stancek
Similar to cpu__max_cpu() (which returns max possible CPU),
returns max present CPU.

Signed-off-by: Jan Stancek <jstan...@redhat.com>
---
 tools/perf/util/cpumap.c | 22 ++
 tools/perf/util/cpumap.h |  1 +
 2 files changed, 23 insertions(+)

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 2c0b52264a46..8c7504939113 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -9,6 +9,7 @@
 #include "asm/bug.h"
 
 static int max_cpu_num;
+static int max_present_cpu_num;
 static int max_node_num;
 static int *cpunode_map;
 
@@ -442,6 +443,7 @@ static void set_max_cpu_num(void)
 
/* set up default */
max_cpu_num = 4096;
+   max_present_cpu_num = 4096;
 
mnt = sysfs__mountpoint();
if (!mnt)
@@ -455,6 +457,17 @@ static void set_max_cpu_num(void)
}
 
	ret = get_max_num(path, &max_cpu_num);
+   if (ret)
+   goto out;
+
+   /* get the highest present cpu number for a sparse allocation */
+   ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt);
+   if (ret == PATH_MAX) {
+   pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX);
+   goto out;
+   }
+
	ret = get_max_num(path, &max_present_cpu_num);
 
 out:
if (ret)
@@ -505,6 +518,15 @@ int cpu__max_cpu(void)
return max_cpu_num;
 }
 
+int cpu__max_present_cpu(void)
+{
+   if (unlikely(!max_present_cpu_num))
+   set_max_cpu_num();
+
+   return max_present_cpu_num;
+}
+
+
 int cpu__get_node(int cpu)
 {
if (unlikely(cpunode_map == NULL)) {
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 06bd689f5989..1a0549af8f5c 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -62,6 +62,7 @@ static inline bool cpu_map__empty(const struct cpu_map *map)
 
 int cpu__max_node(void);
 int cpu__max_cpu(void);
+int cpu__max_present_cpu(void);
 int cpu__get_node(int cpu);
 
 int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
-- 
1.8.3.1



[PATCH v2 1/3] perf: add cpu__max_present_cpu()

2017-02-13 Thread Jan Stancek
Similar to cpu__max_cpu() (which returns max possible CPU),
returns max present CPU.

Signed-off-by: Jan Stancek 
---
 tools/perf/util/cpumap.c | 22 ++
 tools/perf/util/cpumap.h |  1 +
 2 files changed, 23 insertions(+)

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 2c0b52264a46..8c7504939113 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -9,6 +9,7 @@
 #include "asm/bug.h"
 
 static int max_cpu_num;
+static int max_present_cpu_num;
 static int max_node_num;
 static int *cpunode_map;
 
@@ -442,6 +443,7 @@ static void set_max_cpu_num(void)
 
/* set up default */
max_cpu_num = 4096;
+   max_present_cpu_num = 4096;
 
mnt = sysfs__mountpoint();
if (!mnt)
@@ -455,6 +457,17 @@ static void set_max_cpu_num(void)
}
 
	ret = get_max_num(path, &max_cpu_num);
+   if (ret)
+   goto out;
+
+   /* get the highest present cpu number for a sparse allocation */
+   ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt);
+   if (ret == PATH_MAX) {
+   pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX);
+   goto out;
+   }
+
	ret = get_max_num(path, &max_present_cpu_num);
 
 out:
if (ret)
@@ -505,6 +518,15 @@ int cpu__max_cpu(void)
return max_cpu_num;
 }
 
+int cpu__max_present_cpu(void)
+{
+   if (unlikely(!max_present_cpu_num))
+   set_max_cpu_num();
+
+   return max_present_cpu_num;
+}
+
+
 int cpu__get_node(int cpu)
 {
if (unlikely(cpunode_map == NULL)) {
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 06bd689f5989..1a0549af8f5c 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -62,6 +62,7 @@ static inline bool cpu_map__empty(const struct cpu_map *map)
 
 int cpu__max_node(void);
 int cpu__max_cpu(void);
+int cpu__max_present_cpu(void);
 int cpu__get_node(int cpu);
 
 int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
-- 
1.8.3.1



[PATCH v2 3/3] perf: replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map

2017-02-13 Thread Jan Stancek
There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs:

1. offline/absent CPUs will have their socket_id and core_id set to -1
   which triggers:
   "socket_id number is too big.You may need to upgrade the perf tool."

2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on
   _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above.
   Users of perf_env.cpu[] are using CPU id as index. This can lead
   to read beyond what was allocated:
   ==19991== Invalid read of size 4
   ==19991==at 0x490CEB: check_cpu_topology (topology.c:69)
   ==19991==by 0x490CEB: test_session_topology (topology.c:106)
   ...

For example:
  _SC_NPROCESSORS_CONF == 16
  available: 2 nodes (0-1)
  node 0 cpus: 0 6 8 10 16 22 24 26
  node 0 size: 12004 MB
  node 0 free: 9470 MB
  node 1 cpus: 1 7 9 11 23 25 27
  node 1 size: 12093 MB
  node 1 free: 9406 MB
  node distances:
  node   0   1
0:  10  20
1:  20  10

This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF
to max_present_cpu and updates any user of cpu_topology_map to iterate
with nr_cpus_avail. As consequence HEADER_CPU_TOPOLOGY core_id and
socket_id lists get longer, but maintain compatibility with pre-patch
state - index to cpu_topology_map is CPU id.

 ./perf test 36 -v
36: Session topology   :
--- start ---
test child forked, pid 22211
templ file: /tmp/perf-test-gmdX5i
CPU 0, core 0, socket 0
CPU 1, core 0, socket 1
CPU 6, core 10, socket 0
CPU 7, core 10, socket 1
CPU 8, core 1, socket 0
CPU 9, core 1, socket 1
CPU 10, core 9, socket 0
CPU 11, core 9, socket 1
CPU 16, core 0, socket 0
CPU 22, core 10, socket 0
CPU 23, core 10, socket 1
CPU 24, core 1, socket 0
CPU 25, core 1, socket 1
CPU 26, core 9, socket 0
CPU 27, core 9, socket 1
test child finished with 0
 end 
Session topology: Ok

Signed-off-by: Jan Stancek <jstan...@redhat.com>
---
 tools/perf/builtin-stat.c   |  2 +-
 tools/perf/tests/topology.c |  4 +++-
 tools/perf/util/env.c   |  2 +-
 tools/perf/util/header.c| 16 +---
 4 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a02f2e965628..9e2be1c52fbd 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, 
struct cpu_map *map, i
 
cpu = map->map[idx];
 
-   if (cpu >= env->nr_cpus_online)
+   if (cpu >= env->nr_cpus_avail)
return -1;
 
return cpu;
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 98fe69ac553c..803f893550d6 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map)
	session = perf_session__new(&file, false, NULL);
TEST_ASSERT_VAL("can't get session", session);
 
-   for (i = 0; i < session->header.env.nr_cpus_online; i++) {
+   for (i = 0; i < session->header.env.nr_cpus_avail; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
pr_debug("CPU %d, core %d, socket %d\n", i,
 session->header.env.cpu[i].core_id,
 session->header.env.cpu[i].socket_id);
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index bb964e86b09d..075fc77286bf 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env)
return 0;
 
if (env->nr_cpus_avail == 0)
-   env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF);
+   env->nr_cpus_avail = cpu__max_present_cpu();
 
nr_cpus = env->nr_cpus_avail;
if (nr_cpus == -1)
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 4b0ea4e92e9d..e9384d17ae7d 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -293,11 +293,7 @@ static int write_nrcpus(int fd, struct perf_header *h 
__maybe_unused,
u32 nrc, nra;
int ret;
 
-   nr = sysconf(_SC_NPROCESSORS_CONF);
-   if (nr < 0)
-   return -1;
-
-   nrc = (u32)(nr & UINT_MAX);
+   nrc = cpu__max_present_cpu();
 
nr = sysconf(_SC_NPROCESSORS_ONLN);
if (nr < 0)
@@ -511,9 +507,7 @@ static struct cpu_topo *build_cpu_topology(void)
int ret = 0;
struct cpu_map *map;
 
-   ncpus = sysconf(_SC_NPROCESSORS_CONF);
-   if (ncpus < 0)
-   return NULL;
+   ncpus = cpu__max_present_cpu();
 
/* build online CPU map */
map = cpu_map__new(NULL);
@@ -1137,7 +1131,7 @@ static void print_cpu_topology(struct perf_header *ph, 
int fd __maybe_unused,
 {
int nr, i;
char *str;
-   int cpu_nr = ph->env.nr_cpus_online;
+   int cpu_nr = ph

[PATCH v2 2/3] perf: make build_cpu_topology skip offline/absent CPUs

2017-02-13 Thread Jan Stancek
When build_cpu_topo() encounters offline/absent CPUs,
it fails to find any sysfs entries and returns failure.
This leads to build_cpu_topology() and write_cpu_topology()
failing as well.

Because HEADER_CPU_TOPOLOGY has not been written, read leaves
cpu_topology_map NULL and we get NULL ptr deref at:

 ...
  cmd_test
   __cmd_test
test_and_print
 run_test
  test_session_topology
   check_cpu_topology

36: Session topology   :
--- start ---
test child forked, pid 14902
templ file: /tmp/perf-test-4CKocW
failed to write feature HEADER_CPU_TOPOLOGY
perf: Segmentation fault
Obtained 9 stack frames.
./perf(sighandler_dump_stack+0x41) [0x5095f1]
/lib64/libc.so.6(+0x35250) [0x7f4b7c3c9250]
./perf(test_session_topology+0x1db) [0x490ceb]
./perf() [0x475b68]
./perf(cmd_test+0x5b9) [0x4763c9]
./perf() [0x4945a3]
./perf(main+0x69f) [0x427e8f]
/lib64/libc.so.6(__libc_start_main+0xf5) [0x7f4b7c3b5b35]
./perf() [0x427fb9]
test child interrupted
 end 
Session topology: FAILED!

This patch makes build_cpu_topology() skip offline/absent CPUs,
by checking their presence against cpu_map built from online CPUs.

Signed-off-by: Jan Stancek <jstan...@redhat.com>
---
 tools/perf/util/header.c | 21 +
 1 file changed, 17 insertions(+), 4 deletions(-)

Changes in v2:
- drop out label, use return NULL where possible

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index d89c9c7ef4e5..4b0ea4e92e9d 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -503,24 +503,31 @@ static void free_cpu_topo(struct cpu_topo *tp)
 
 static struct cpu_topo *build_cpu_topology(void)
 {
-   struct cpu_topo *tp;
+   struct cpu_topo *tp = NULL;
void *addr;
u32 nr, i;
size_t sz;
long ncpus;
-   int ret = -1;
+   int ret = 0;
+   struct cpu_map *map;
 
ncpus = sysconf(_SC_NPROCESSORS_CONF);
if (ncpus < 0)
return NULL;
 
+   /* build online CPU map */
+   map = cpu_map__new(NULL);
+   if (map == NULL) {
+   pr_debug("failed to get system cpumap\n");
+   return NULL;
+   }
+
nr = (u32)(ncpus & UINT_MAX);
 
sz = nr * sizeof(char *);
-
addr = calloc(1, sizeof(*tp) + 2 * sz);
if (!addr)
-   return NULL;
+   goto out_free;
 
tp = addr;
tp->cpu_nr = nr;
@@ -530,10 +537,16 @@ static struct cpu_topo *build_cpu_topology(void)
tp->thread_siblings = addr;
 
for (i = 0; i < nr; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
+
ret = build_cpu_topo(tp, i);
if (ret < 0)
break;
}
+
+out_free:
+   cpu_map__put(map);
if (ret) {
free_cpu_topo(tp);
tp = NULL;
-- 
1.8.3.1



[PATCH v2 3/3] perf: replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map

2017-02-13 Thread Jan Stancek
There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs:

1. offline/absent CPUs will have their socket_id and core_id set to -1
   which triggers:
   "socket_id number is too big.You may need to upgrade the perf tool."

2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on
   _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above.
   Users of perf_env.cpu[] are using CPU id as index. This can lead
   to read beyond what was allocated:
   ==19991== Invalid read of size 4
   ==19991==at 0x490CEB: check_cpu_topology (topology.c:69)
   ==19991==by 0x490CEB: test_session_topology (topology.c:106)
   ...

For example:
  _SC_NPROCESSORS_CONF == 16
  available: 2 nodes (0-1)
  node 0 cpus: 0 6 8 10 16 22 24 26
  node 0 size: 12004 MB
  node 0 free: 9470 MB
  node 1 cpus: 1 7 9 11 23 25 27
  node 1 size: 12093 MB
  node 1 free: 9406 MB
  node distances:
  node   0   1
0:  10  20
1:  20  10

This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF
to max_present_cpu and updates any user of cpu_topology_map to iterate
with nr_cpus_avail. As consequence HEADER_CPU_TOPOLOGY core_id and
socket_id lists get longer, but maintain compatibility with pre-patch
state - index to cpu_topology_map is CPU id.

 ./perf test 36 -v
36: Session topology   :
--- start ---
test child forked, pid 22211
templ file: /tmp/perf-test-gmdX5i
CPU 0, core 0, socket 0
CPU 1, core 0, socket 1
CPU 6, core 10, socket 0
CPU 7, core 10, socket 1
CPU 8, core 1, socket 0
CPU 9, core 1, socket 1
CPU 10, core 9, socket 0
CPU 11, core 9, socket 1
CPU 16, core 0, socket 0
CPU 22, core 10, socket 0
CPU 23, core 10, socket 1
CPU 24, core 1, socket 0
CPU 25, core 1, socket 1
CPU 26, core 9, socket 0
CPU 27, core 9, socket 1
test child finished with 0
 end 
Session topology: Ok

Signed-off-by: Jan Stancek 
---
 tools/perf/builtin-stat.c   |  2 +-
 tools/perf/tests/topology.c |  4 +++-
 tools/perf/util/env.c   |  2 +-
 tools/perf/util/header.c| 16 +---
 4 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a02f2e965628..9e2be1c52fbd 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, 
struct cpu_map *map, i
 
cpu = map->map[idx];
 
-   if (cpu >= env->nr_cpus_online)
+   if (cpu >= env->nr_cpus_avail)
return -1;
 
return cpu;
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 98fe69ac553c..803f893550d6 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map)
	session = perf_session__new(&file, false, NULL);
TEST_ASSERT_VAL("can't get session", session);
 
-   for (i = 0; i < session->header.env.nr_cpus_online; i++) {
+   for (i = 0; i < session->header.env.nr_cpus_avail; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
pr_debug("CPU %d, core %d, socket %d\n", i,
 session->header.env.cpu[i].core_id,
 session->header.env.cpu[i].socket_id);
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index bb964e86b09d..075fc77286bf 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env)
return 0;
 
if (env->nr_cpus_avail == 0)
-   env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF);
+   env->nr_cpus_avail = cpu__max_present_cpu();
 
nr_cpus = env->nr_cpus_avail;
if (nr_cpus == -1)
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 4b0ea4e92e9d..e9384d17ae7d 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -293,11 +293,7 @@ static int write_nrcpus(int fd, struct perf_header *h 
__maybe_unused,
u32 nrc, nra;
int ret;
 
-   nr = sysconf(_SC_NPROCESSORS_CONF);
-   if (nr < 0)
-   return -1;
-
-   nrc = (u32)(nr & UINT_MAX);
+   nrc = cpu__max_present_cpu();
 
nr = sysconf(_SC_NPROCESSORS_ONLN);
if (nr < 0)
@@ -511,9 +507,7 @@ static struct cpu_topo *build_cpu_topology(void)
int ret = 0;
struct cpu_map *map;
 
-   ncpus = sysconf(_SC_NPROCESSORS_CONF);
-   if (ncpus < 0)
-   return NULL;
+   ncpus = cpu__max_present_cpu();
 
/* build online CPU map */
map = cpu_map__new(NULL);
@@ -1137,7 +1131,7 @@ static void print_cpu_topology(struct perf_header *ph, 
int fd __maybe_unused,
 {
int nr, i;
char *str;
-   int cpu_nr = ph->env.nr_cpus_online;
+   int cpu_nr = ph->env.nr_cpus_avail;
 

  1   2   3   >