[PATCH] mm: migrate: initialize err in do_migrate_pages

2021-01-05 Thread Jan Stancek
After commit 236c32eb1096 ("mm: migrate: clean up migrate_prep{_local}"),
do_migrate_pages can return uninitialized variable 'err' (which is
propagated to user-space as error) when 'from' and 'to' nodesets
are identical. This can be reproduced with LTP migrate_pages01,
which calls migrate_pages() with same set for both old/new_nodes.

Add 'err' initialization back.

Fixes: 236c32eb1096 ("mm: migrate: clean up migrate_prep{_local}")
Cc: Zi Yan 
Cc: Yang Shi 
Cc: Jan Kara 
Cc: Matthew Wilcox 
Cc: Mel Gorman 
Cc: Michal Hocko 
Cc: Song Liu 
Cc: Andrew Morton 
Signed-off-by: Jan Stancek 
---
 mm/mempolicy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8cf96bd21341..2c3a86502053 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -,7 +,7 @@ int do_migrate_pages(struct mm_struct *mm, const 
nodemask_t *from,
 const nodemask_t *to, int flags)
 {
int busy = 0;
-   int err;
+   int err = 0;
nodemask_t tmp;
 
migrate_prep();
-- 
2.18.1



Re: [LTP] [PATCH] selftests: vdso: hash entry size on alpha, s390x is 8 bytes

2020-08-07 Thread Jan Stancek


- Original Message -
> 
> - Original Message -
> > Hi!
> > As much as it's worth the changes looks good to me.
> > 
> > @Jan: I guess that we can as well fix this in LTP first then we can try
> >   to get the kernel version fixed...
> 
> Fine by me, I'll give it couple more days then push it.

Pushed.



Re: [LTP] [PATCH] selftests: vdso: hash entry size on alpha, s390x is 8 bytes

2020-08-05 Thread Jan Stancek


- Original Message -
> Hi!
> As much as it's worth the changes looks good to me.
> 
> @Jan: I guess that we can as well fix this in LTP first then we can try
>   to get the kernel version fixed...

Fine by me, I'll give it couple more days then push it.

I did repost it with original author CC-ed:
  
https://lore.kernel.org/lkml/93a07b1808e61babef3a20542cbeb4565d3c8410.1596458924.git.jstan...@redhat.com/
but haven't heard back yet.



[PATCH RESEND] selftests: vdso: hash entry size on alpha,s390x is 8 bytes

2020-08-03 Thread Jan Stancek
parse_vdso.c is crashing on 5.8-rc5 s390x, because it wrongly reads
nbucket as 0:
  Program received signal SIGFPE, Arithmetic exception.
  0x01000f3e in vdso_sym (version=0x1001280 "LINUX_2.6", name=0x100128a 
"__vdso_getcpu") at parse_vdso.c:207
  207 ELF(Word) chain = vdso_info.bucket[elf_hash(name) % 
vdso_info.nbucket];
  (gdb) p vdso_info.nbucket
  $1 = 0

Per readelf source:
  
https://sourceware.org/git/?p=binutils-gdb.git;a=blob;f=binutils/readelf.c;h=2406304fe35a832ac53aa7b1a367f3f7afed4264;hb=HEAD#l10027
and objdump output hash entries are double size on 64bit s390 and alpha:
  0120 <.hash>:
   120:   00 00 00 00 .long   0x00000000
   124:   00 00 00 03 .long   0x00000003
   128:   00 00 00 00 .long   0x00000000
   12c:   00 00 00 07 .long   0x00000007
   130:   00 00 00 00 .long   0x00000000
   134:   00 00 00 01 .long   0x00000001
   138:   00 00 00 00 .long   0x00000000
   13c:   00 00 00 05 .long   0x00000005
   140:   00 00 00 00 .long   0x00000000
   144:   00 00 00 06 .long   0x00000006
  ...
   16c:   00 00 00 02 .long   0x00000002
   170:   00 00 00 00 .long   0x00000000
   174:   00 00 00 03 .long   0x00000003
   178:   00 00 00 00 .long   0x00000000
   17c:   00 00 00 04 .long   0x00000004

Do similar check in parse_vdso.c and treat hash entries as double word.

Signed-off-by: Jan Stancek 
---
 tools/testing/selftests/vDSO/parse_vdso.c | 48 +++
 1 file changed, 40 insertions(+), 8 deletions(-)

Resend with original author CC-ed.

diff --git a/tools/testing/selftests/vDSO/parse_vdso.c 
b/tools/testing/selftests/vDSO/parse_vdso.c
index 413f75620a35..e23efcbd1c88 100644
--- a/tools/testing/selftests/vDSO/parse_vdso.c
+++ b/tools/testing/selftests/vDSO/parse_vdso.c
@@ -47,8 +47,9 @@ static struct vdso_info
/* Symbol table */
ELF(Sym) *symtab;
const char *symstrings;
-   ELF(Word) *bucket, *chain;
+   void *bucket, *chain;
ELF(Word) nbucket, nchain;
+   bool hash_ent_is_dword;
 
/* Version table */
ELF(Versym) *versym;
@@ -69,6 +70,28 @@ static unsigned long elf_hash(const unsigned char *name)
return h;
 }
 
+/* return value of hash table entry */
+ELF(Word) get_hash_val(void *ptr, ELF(Word) idx)
+{
+   if (vdso_info.hash_ent_is_dword) {
+   ELF(Xword) *table = ptr;
+   /* for vdso assume all values fit in Elf Word */
+   return (ELF(Word)) table[idx];
+   }
+
+   ELF(Word) *table = ptr;
+   return table[idx];
+}
+
+/* return pointer to hash table entry */
+void *get_hash_ptr(void *ptr, ELF(Word) idx)
+{
+   if (vdso_info.hash_ent_is_dword)
+   return &((ELF(Xword) *) ptr)[idx];
+
+   return &((ELF(Word) *) ptr)[idx];
+}
+
 void vdso_init_from_sysinfo_ehdr(uintptr_t base)
 {
size_t i;
@@ -84,6 +107,14 @@ void vdso_init_from_sysinfo_ehdr(uintptr_t base)
return;  /* Wrong ELF class -- check ELF_BITS */
}
 
+   /* 64bit s390 and alpha have hash entry size of 8 bytes */
+   if ((hdr->e_machine == EM_ALPHA
+   || hdr->e_machine == EM_S390)
+   && hdr->e_ident[EI_CLASS] == ELFCLASS64)
+   vdso_info.hash_ent_is_dword = true;
+   else
+   vdso_info.hash_ent_is_dword = false;
+
ELF(Phdr) *pt = (ELF(Phdr)*)(vdso_info.load_addr + hdr->e_phoff);
ELF(Dyn) *dyn = 0;
 
@@ -149,11 +180,11 @@ void vdso_init_from_sysinfo_ehdr(uintptr_t base)
if (!vdso_info.verdef)
vdso_info.versym = 0;
 
-   /* Parse the hash table header. */
-   vdso_info.nbucket = hash[0];
-   vdso_info.nchain = hash[1];
-   vdso_info.bucket = &hash[2];
-   vdso_info.chain = &hash[vdso_info.nbucket + 2];
+
+   vdso_info.nbucket = get_hash_val(hash, 0);
+   vdso_info.nchain = get_hash_val(hash, 1);
+   vdso_info.bucket = get_hash_ptr(hash, 2);
+   vdso_info.chain = get_hash_ptr(hash, vdso_info.nbucket + 2);
 
/* That's all we need. */
vdso_info.valid = true;
@@ -204,9 +235,10 @@ void *vdso_sym(const char *version, const char *name)
return 0;
 
ver_hash = elf_hash(version);
-   ELF(Word) chain = vdso_info.bucket[elf_hash(name) % vdso_info.nbucket];
+   ELF(Word) chain = get_hash_val(vdso_info.bucket,
+   elf_hash(name) % vdso_info.nbucket);
 
-   for (; chain != STN_UNDEF; chain = vdso_info.chain[chain]) {
+   for (; chain != STN_UNDEF; chain = get_hash_val(vdso_info.chain, 
chain)) {
ELF(Sym) *sym = &vdso_info.symtab[chain];
 
/* Check for a defined global or weak function w/ right name. */
-- 
2.18.1



Re: [PATCH] selftests: vdso: hash entry size on alpha,s390x is 8 bytes

2020-08-03 Thread Jan Stancek


- Original Message -
> parse_vdso.c is crashing on 5.8-rc5 s390x, because it wrongly reads
> nbucket as 0:
>   Program received signal SIGFPE, Arithmetic exception.
>   0x01000f3e in vdso_sym (version=0x1001280 "LINUX_2.6",
>   name=0x100128a "__vdso_getcpu") at parse_vdso.c:207
>   207 ELF(Word) chain = vdso_info.bucket[elf_hash(name) %
>   vdso_info.nbucket];
>   (gdb) p vdso_info.nbucket
>   $1 = 0
> 
> Per readelf source:
>   
> https://sourceware.org/git/?p=binutils-gdb.git;a=blob;f=binutils/readelf.c;h=2406304fe35a832ac53aa7b1a367f3f7afed4264;hb=HEAD#l10027
> and objdump output hash entries are double size on 64bit s390 and alpha:
>   0120 <.hash>:
>120:   00 00 00 00 .long   0x
>124:   00 00 00 03 .long   0x0003
>128:   00 00 00 00 .long   0x
>12c:   00 00 00 07 .long   0x0007
>130:   00 00 00 00 .long   0x
>134:   00 00 00 01 .long   0x0001
>138:   00 00 00 00 .long   0x
>13c:   00 00 00 05 .long   0x0005
>140:   00 00 00 00 .long   0x
>144:   00 00 00 06 .long   0x0006
> ...
>16c:   00 00 00 02 .long   0x0002
>170:   00 00 00 00 .long   0x
>174:   00 00 00 03 .long   0x0003
>178:   00 00 00 00 .long   0x
>17c:   00 00 00 04 .long   0x0004
> 
> Do similar check in parse_vdso.c and treat hash entries as double word.

Ping, any thoughts about the issue or patch?



[PATCH] selftests: vdso: hash entry size on alpha,s390x is 8 bytes

2020-07-15 Thread Jan Stancek
parse_vdso.c is crashing on 5.8-rc5 s390x, because it wrongly reads
nbucket as 0:
  Program received signal SIGFPE, Arithmetic exception.
  0x01000f3e in vdso_sym (version=0x1001280 "LINUX_2.6", name=0x100128a 
"__vdso_getcpu") at parse_vdso.c:207
  207 ELF(Word) chain = vdso_info.bucket[elf_hash(name) % 
vdso_info.nbucket];
  (gdb) p vdso_info.nbucket
  $1 = 0

Per readelf source:
  
https://sourceware.org/git/?p=binutils-gdb.git;a=blob;f=binutils/readelf.c;h=2406304fe35a832ac53aa7b1a367f3f7afed4264;hb=HEAD#l10027
and objdump output hash entries are double size on 64bit s390 and alpha:
  0120 <.hash>:
   120:   00 00 00 00 .long   0x00000000
   124:   00 00 00 03 .long   0x00000003
   128:   00 00 00 00 .long   0x00000000
   12c:   00 00 00 07 .long   0x00000007
   130:   00 00 00 00 .long   0x00000000
   134:   00 00 00 01 .long   0x00000001
   138:   00 00 00 00 .long   0x00000000
   13c:   00 00 00 05 .long   0x00000005
   140:   00 00 00 00 .long   0x00000000
   144:   00 00 00 06 .long   0x00000006
  ...
   16c:   00 00 00 02 .long   0x00000002
   170:   00 00 00 00 .long   0x00000000
   174:   00 00 00 03 .long   0x00000003
   178:   00 00 00 00 .long   0x00000000
   17c:   00 00 00 04 .long   0x00000004

Do similar check in parse_vdso.c and treat hash entries as double word.

Signed-off-by: Jan Stancek 
---
 tools/testing/selftests/vDSO/parse_vdso.c | 48 +++
 1 file changed, 40 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/vDSO/parse_vdso.c 
b/tools/testing/selftests/vDSO/parse_vdso.c
index 413f75620a35..e23efcbd1c88 100644
--- a/tools/testing/selftests/vDSO/parse_vdso.c
+++ b/tools/testing/selftests/vDSO/parse_vdso.c
@@ -47,8 +47,9 @@ static struct vdso_info
/* Symbol table */
ELF(Sym) *symtab;
const char *symstrings;
-   ELF(Word) *bucket, *chain;
+   void *bucket, *chain;
ELF(Word) nbucket, nchain;
+   bool hash_ent_is_dword;
 
/* Version table */
ELF(Versym) *versym;
@@ -69,6 +70,28 @@ static unsigned long elf_hash(const unsigned char *name)
return h;
 }
 
+/* return value of hash table entry */
+ELF(Word) get_hash_val(void *ptr, ELF(Word) idx)
+{
+   if (vdso_info.hash_ent_is_dword) {
+   ELF(Xword) *table = ptr;
+   /* for vdso assume all values fit in Elf Word */
+   return (ELF(Word)) table[idx];
+   }
+
+   ELF(Word) *table = ptr;
+   return table[idx];
+}
+
+/* return pointer to hash table entry */
+void *get_hash_ptr(void *ptr, ELF(Word) idx)
+{
+   if (vdso_info.hash_ent_is_dword)
+   return &((ELF(Xword) *) ptr)[idx];
+
+   return &((ELF(Word) *) ptr)[idx];
+}
+
 void vdso_init_from_sysinfo_ehdr(uintptr_t base)
 {
size_t i;
@@ -84,6 +107,14 @@ void vdso_init_from_sysinfo_ehdr(uintptr_t base)
return;  /* Wrong ELF class -- check ELF_BITS */
}
 
+   /* 64bit s390 and alpha have hash entry size of 8 bytes */
+   if ((hdr->e_machine == EM_ALPHA
+   || hdr->e_machine == EM_S390)
+   && hdr->e_ident[EI_CLASS] == ELFCLASS64)
+   vdso_info.hash_ent_is_dword = true;
+   else
+   vdso_info.hash_ent_is_dword = false;
+
ELF(Phdr) *pt = (ELF(Phdr)*)(vdso_info.load_addr + hdr->e_phoff);
ELF(Dyn) *dyn = 0;
 
@@ -149,11 +180,11 @@ void vdso_init_from_sysinfo_ehdr(uintptr_t base)
if (!vdso_info.verdef)
vdso_info.versym = 0;
 
-   /* Parse the hash table header. */
-   vdso_info.nbucket = hash[0];
-   vdso_info.nchain = hash[1];
-   vdso_info.bucket = &hash[2];
-   vdso_info.chain = &hash[vdso_info.nbucket + 2];
+
+   vdso_info.nbucket = get_hash_val(hash, 0);
+   vdso_info.nchain = get_hash_val(hash, 1);
+   vdso_info.bucket = get_hash_ptr(hash, 2);
+   vdso_info.chain = get_hash_ptr(hash, vdso_info.nbucket + 2);
 
/* That's all we need. */
vdso_info.valid = true;
@@ -204,9 +235,10 @@ void *vdso_sym(const char *version, const char *name)
return 0;
 
ver_hash = elf_hash(version);
-   ELF(Word) chain = vdso_info.bucket[elf_hash(name) % vdso_info.nbucket];
+   ELF(Word) chain = get_hash_val(vdso_info.bucket,
+   elf_hash(name) % vdso_info.nbucket);
 
-   for (; chain != STN_UNDEF; chain = vdso_info.chain[chain]) {
+   for (; chain != STN_UNDEF; chain = get_hash_val(vdso_info.chain, 
chain)) {
ELF(Sym) *sym = &vdso_info.symtab[chain];
 
/* Check for a defined global or weak function w/ right name. */
-- 
2.18.1



Re: [LTP] 303cc571d1: ltp.setns01.fail

2020-06-16 Thread Jan Stancek



- Original Message -
> I'll send a pr for this to Linus this week (or next since I'm on
> vacation this week) and get this fixed. Thanks for spotting this. What's
> the Reported-by: line format that LTP uses?

I'm not sure we ever used one, it's usually various CI systems that
reference LTP. This thread has been started by kernel test robot:
  Reported-by: kernel test robot 



Re: [LTP] 303cc571d1: ltp.setns01.fail

2020-06-15 Thread Jan Stancek



- Original Message -
> Hi!
> > setns01 6  TFAIL  :  setns01.c:176: regular file fd exp_errno=22:
> > errno=EBADF(9): Bad file descriptor
> > setns01 0  TINFO  :  setns(12, 0x2)
> > setns01 7  TFAIL  :  setns01.c:176: regular file fd exp_errno=22:
> > errno=EBADF(9): Bad file descriptor
> > setns01 0  TINFO  :  setns(12, 0x4000)
> > setns01 8  TFAIL  :  setns01.c:176: regular file fd exp_errno=22:
> > errno=EBADF(9): Bad file descriptor
> > setns01 0  TINFO  :  setns(12, 0x2000)
> > setns01 9  TFAIL  :  setns01.c:176: regular file fd exp_errno=22:
> > errno=EBADF(9): Bad file descriptor
> > setns01 0  TINFO  :  setns(12, 0x400)
> > setns0110  TFAIL  :  setns01.c:176: regular file fd exp_errno=22:
> > errno=EBADF(9): Bad file descriptor
> 
> The messages here are a bit cryptic, I will fix that later on, but what
> it means here is that the errno has changed from EINVAL to EBADF in a
> case we pass file descriptor to a regular file to setns().

I posted a series that accepts both errnos about week ago:
  https://lists.linux.it/pipermail/ltp/2020-June/017467.html



Re: LTP: syscalls: regression on mainline - ioctl_loop01 mknod07 setns01

2020-06-05 Thread Jan Stancek



- Original Message -
> Following three test cases reported as regression on Linux mainline kernel
> on x86_64, arm64, arm and i386
> 
>   ltp-syscalls-tests:
> * ioctl_loop01
> * mknod07

Test updated:
  
https://github.com/linux-test-project/ltp/commit/13fcfa2d6bdd1fb71c4528b47170e8e8fb3a8a32

> * setns01

commit 303cc571d107 ("nsproxy: attach to namespaces via pidfds")
changed errno that is returned for regular file from EINVAL to EBADF.
This appears to fit more current man page, so I think we need to fix
test to accept both. (I'm looking into that)



[bug?] LTP pt_test failing after 38bb8d77d0b9 "perf/x86/intel/pt: Split ToPA metadata and page layout"

2019-10-19 Thread Jan Stancek
Hi,

All variants of pt_test:
  
https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/tracing/pt_test/pt_test.c
started failing after:
  38bb8d77d0b9 ("perf/x86/intel/pt: Split ToPA metadata and page layout")

with following error on console/dmesg:
  pt: ToPA ERROR encountered, trying to recover

on Broadwell-EP:

vendor_id   : GenuineIntel
cpu family  : 6
model   : 79
model name  : Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
stepping: 1

Regards,
Jan


Re: [PATCH] fat: fix corruption in fat_alloc_new_dir()

2019-09-08 Thread Jan Stancek



- Original Message -
> Jan Stancek  writes:
> 
> > sb_getblk does not guarantee that buffer_head is uptodate. If there is
> > async read running in parallel for same buffer_head, it can overwrite
> > just initialized msdos_dir_entry, leading to corruption:
> >   FAT-fs (loop0): error, corrupted directory (invalid entries)
> >   FAT-fs (loop0): Filesystem has been set read-only
> >
> > This can happen for example during LTP statx04, which creates loop
> > device, formats it (mkfs.vfat), mounts it and immediately creates
> > a new directory. In parallel, systemd-udevd is probing new block
> > device, which leads to async read.
> >
> >   do_mkdirat  ksys_read
> >vfs_mkdir   vfs_read
> > vfat_mkdir  __vfs_read
> >  fat_alloc_new_dir   new_sync_read
> >/* init de[0], de[1] */blkdev_read_iter
> >generic_file_read_iter
> > generic_file_buffered_read
> >  blkdev_readpage
> >   block_read_full_page
> >
> > Faster reproducer (based on LTP statx04):
> >
> > int main(void)
> > {
> > int i, j, ret, fd, loop_fd, ctrl_fd;
> > int loop_num;
> > char loopdev[256], tmp[256], testfile[256];
> >
> > mkdir("/tmp/mntpoint", 0777);
> > for (i = 0; ; i++) {
> > printf("Iteration: %d\n", i);
> > sprintf(testfile, "/tmp/test.img.%d", getpid());
> >
> > ctrl_fd = open("/dev/loop-control", O_RDWR);
> > loop_num = ioctl(ctrl_fd, LOOP_CTL_GET_FREE);
> > close(ctrl_fd);
> > sprintf(loopdev, "/dev/loop%d", loop_num);
> >
> > fd = open(testfile, O_WRONLY|O_CREAT|O_TRUNC, 0600);
> > fallocate(fd, 0, 0, 256*1024*1024);
> > close(fd);
> >
> > fd = open(testfile, O_RDWR);
> > loop_fd = open(loopdev, O_RDWR);
> > ioctl(loop_fd, LOOP_SET_FD, fd);
> > close(loop_fd);
> > close(fd);
> >
> > sprintf(tmp, "mkfs.vfat %s", loopdev);
> > system(tmp);
> > mount(loopdev, "/tmp/mntpoint", "vfat", 0, NULL);
> >
> > for (j = 0; j < 200; j++) {
> > sprintf(tmp, "/tmp/mntpoint/testdir%d", j);
> > ret = mkdir(tmp, 0777);
> > if (ret) {
> > perror("mkdir");
> > break;
> > }
> > }
> >
> > umount("/tmp/mntpoint");
> > loop_fd = open(loopdev, O_RDWR);
> > ioctl(loop_fd, LOOP_CLR_FD, fd);
> > close(loop_fd);
> > unlink(testfile);
> >
> > if (ret)
> > break;
> > }
> >
> > return 0;
> > }
> >
> > Issue triggers within minute on HPE Apollo 70 (arm64, 64GB RAM, 224 CPUs).
> 
> Using the device while mounting same device doesn't work reliably like
> this race. (getblk() is intentionally used to get the buffer to write
> new data.)

Are you saying this is expected even if 'usage' is just read?

> 
> mount(2) internally opens the device by EXCL mode, so I guess udev opens
> without EXCL (I dont know if it is intent or not).

I gave this a try and added O_EXCL to udev-builtin-blkid.c. My system had 
trouble
booting, it was getting stuck on mounting LVM volumes.

So, I'm not sure how to move forward here. 

Regards,
Jan


[tip: x86/urgent] x86/timer: Force PIT initialization when !X86_FEATURE_ARAT

2019-09-08 Thread tip-bot2 for Jan Stancek
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: afa8b475c1aec185a8e106c48b3832e0b88bc2de
Gitweb:
https://git.kernel.org/tip/afa8b475c1aec185a8e106c48b3832e0b88bc2de
Author:Jan Stancek 
AuthorDate:Sun, 08 Sep 2019 00:50:40 +02:00
Committer: Thomas Gleixner 
CommitterDate: Sun, 08 Sep 2019 09:01:15 +02:00

x86/timer: Force PIT initialization when !X86_FEATURE_ARAT

KVM guests with commit c8c4076723da ("x86/timer: Skip PIT initialization on
modern chipsets") applied to guest kernel have been observed to have
unusually higher CPU usage with symptoms of increase in vm exits for HLT
and MSW_WRITE (MSR_IA32_TSCDEADLINE).

This is caused by older QEMUs lacking support for X86_FEATURE_ARAT.  lapic
clock retains CLOCK_EVT_FEAT_C3STOP and nohz stays inactive.  There's no
usable broadcast device either.

Do the PIT initialization if guest CPU lacks X86_FEATURE_ARAT.  On real
hardware it shouldn't matter as ARAT and DEADLINE come together.

Fixes: c8c4076723da ("x86/timer: Skip PIT initialization on modern chipsets")
Signed-off-by: Jan Stancek 
Signed-off-by: Thomas Gleixner 

---
 arch/x86/kernel/apic/apic.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index dba2828..f91b3ff 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -834,6 +834,10 @@ bool __init apic_needs_pit(void)
if (!boot_cpu_has(X86_FEATURE_APIC))
return true;
 
+   /* Virt guests may lack ARAT, but still have DEADLINE */
+   if (!boot_cpu_has(X86_FEATURE_ARAT))
+   return true;
+
/* Deadline timer is based on TSC so no further PIT action required */
if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
return false;


[tip: x86/urgent] x86/timer: Force PIT initialization when !X86_FEATURE_ARAT

2019-09-08 Thread tip-bot2 for Jan Stancek
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: afa8b475c1aec185a8e106c48b3832e0b88bc2de
Gitweb:
https://git.kernel.org/tip/afa8b475c1aec185a8e106c48b3832e0b88bc2de
Author:Jan Stancek 
AuthorDate:Sun, 08 Sep 2019 00:50:40 +02:00
Committer: Thomas Gleixner 
CommitterDate: Sun, 08 Sep 2019 09:01:15 +02:00

x86/timer: Force PIT initialization when !X86_FEATURE_ARAT

KVM guests with commit c8c4076723da ("x86/timer: Skip PIT initialization on
modern chipsets") applied to guest kernel have been observed to have
unusually higher CPU usage with symptoms of increase in vm exits for HLT
and MSW_WRITE (MSR_IA32_TSCDEADLINE).

This is caused by older QEMUs lacking support for X86_FEATURE_ARAT.  lapic
clock retains CLOCK_EVT_FEAT_C3STOP and nohz stays inactive.  There's no
usable broadcast device either.

Do the PIT initialization if guest CPU lacks X86_FEATURE_ARAT.  On real
hardware it shouldn't matter as ARAT and DEADLINE come together.

Fixes: c8c4076723da ("x86/timer: Skip PIT initialization on modern chipsets")
Signed-off-by: Jan Stancek 
Signed-off-by: Thomas Gleixner 

---
 arch/x86/kernel/apic/apic.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index dba2828..f91b3ff 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -834,6 +834,10 @@ bool __init apic_needs_pit(void)
if (!boot_cpu_has(X86_FEATURE_APIC))
return true;
 
+   /* Virt guests may lack ARAT, but still have DEADLINE */
+   if (!boot_cpu_has(X86_FEATURE_ARAT))
+   return true;
+
/* Deadline timer is based on TSC so no further PIT action required */
if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
return false;


[PATCH] fat: fix corruption in fat_alloc_new_dir()

2019-09-02 Thread Jan Stancek
sb_getblk does not guarantee that buffer_head is uptodate. If there is
async read running in parallel for same buffer_head, it can overwrite
just initialized msdos_dir_entry, leading to corruption:
  FAT-fs (loop0): error, corrupted directory (invalid entries)
  FAT-fs (loop0): Filesystem has been set read-only

This can happen for example during LTP statx04, which creates loop
device, formats it (mkfs.vfat), mounts it and immediately creates
a new directory. In parallel, systemd-udevd is probing new block
device, which leads to async read.

  do_mkdirat  ksys_read
   vfs_mkdir   vfs_read
vfat_mkdir  __vfs_read
 fat_alloc_new_dir   new_sync_read
   /* init de[0], de[1] */   blkdev_read_iter
   generic_file_read_iter
generic_file_buffered_read
 blkdev_readpage
  block_read_full_page

Faster reproducer (based on LTP statx04):
- 8< -
int main(void)
{
int i, j, ret, fd, loop_fd, ctrl_fd;
int loop_num;
char loopdev[256], tmp[256], testfile[256];

mkdir("/tmp/mntpoint", 0777);
for (i = 0; ; i++) {
printf("Iteration: %d\n", i);
sprintf(testfile, "/tmp/test.img.%d", getpid());

ctrl_fd = open("/dev/loop-control", O_RDWR);
loop_num = ioctl(ctrl_fd, LOOP_CTL_GET_FREE);
close(ctrl_fd);
sprintf(loopdev, "/dev/loop%d", loop_num);

fd = open(testfile, O_WRONLY|O_CREAT|O_TRUNC, 0600);
fallocate(fd, 0, 0, 256*1024*1024);
close(fd);

fd = open(testfile, O_RDWR);
loop_fd = open(loopdev, O_RDWR);
ioctl(loop_fd, LOOP_SET_FD, fd);
close(loop_fd);
close(fd);

sprintf(tmp, "mkfs.vfat %s", loopdev);
system(tmp);
mount(loopdev, "/tmp/mntpoint", "vfat", 0, NULL);

for (j = 0; j < 200; j++) {
sprintf(tmp, "/tmp/mntpoint/testdir%d", j);
ret = mkdir(tmp, 0777);
if (ret) {
perror("mkdir");
break;
}
}

umount("/tmp/mntpoint");
loop_fd = open(loopdev, O_RDWR);
ioctl(loop_fd, LOOP_CLR_FD, fd);
close(loop_fd);
unlink(testfile);

if (ret)
break;
}

return 0;
}
- 8< -------------

Issue triggers within minute on HPE Apollo 70 (arm64, 64GB RAM, 224 CPUs).

Signed-off-by: Jan Stancek 
Cc: OGAWA Hirofumi 
---
 fs/fat/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 1bda2ab6745b..474fd6873ec8 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -1149,7 +1149,7 @@ int fat_alloc_new_dir(struct inode *dir, struct 
timespec64 *ts)
goto error;
 
blknr = fat_clus_to_blknr(sbi, cluster);
-   bhs[0] = sb_getblk(sb, blknr);
+   bhs[0] = sb_bread(sb, blknr);
if (!bhs[0]) {
err = -ENOMEM;
goto error_free;
-- 
1.8.3.1



Re: [PATCH v5.2 1/2] locking/rwsem: Add missing ACQUIRE to read_slowpath exit when queue is empty

2019-08-27 Thread Jan Stancek


- Original Message -
> From: Jan Stancek 
> 
> [ Upstream commit e1b98fa316648420d0434d9ff5b92ad6609ba6c3 ]
> 
> LTP mtest06 has been observed to occasionally hit "still mapped when
> deleted" and following BUG_ON on arm64.
> 
> The extra mapcount originated from pagefault handler, which handled
> pagefault for vma that has already been detached. vma is detached
> under mmap_sem write lock by detach_vmas_to_be_unmapped(), which
> also invalidates vmacache.
> 
> When the pagefault handler (under mmap_sem read lock) calls
> find_vma(), vmacache_valid() wrongly reports vmacache as valid.
> 
> After rwsem down_read() returns via 'queue empty' path (as of v5.2),
> it does so without an ACQUIRE on sem->count:
> 
>   down_read()
> __down_read()
>   rwsem_down_read_failed()
> __rwsem_down_read_failed_common()
>   raw_spin_lock_irq(&sem->wait_lock);
>   if (list_empty(&sem->wait_list)) {
> if (atomic_long_read(&sem->count) >= 0) {
>   raw_spin_unlock_irq(&sem->wait_lock);
>   return sem;
> 
> The problem can be reproduced by running LTP mtest06 in a loop and
> building the kernel (-j $NCPUS) in parallel. It does reproduces since
> v4.20 on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It
> triggers reliably in about an hour.
> 
> The patched kernel ran fine for 10+ hours.
> 
> Signed-off-by: Jan Stancek 
> Signed-off-by: Peter Zijlstra (Intel) 
> Reviewed-by: Will Deacon 
> Acked-by: Waiman Long 
> Cc: Linus Torvalds 
> Cc: Peter Zijlstra 
> Cc: Thomas Gleixner 
> Cc: dbu...@suse.de
> Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty &
> no writer")
> Link:
> https://lkml.kernel.org/r/50b8914e20d1d62bb2dee42d342836c2c16ebee7.1563438048.git.jstan...@redhat.com
> Signed-off-by: Ingo Molnar 
> Signed-off-by: Sasha Levin 
> ---
> 
> This is a backport for the v5.2 stable tree. There were multiple reports
> of this issue being hit.
> 
> Given that there were a few changes to the code around this, I'd
> appreciate an ack before pulling it in.

ACK, both look good to me.
I also re-ran reproducer with this series applied on top of 5.2.10, it PASS-ed.

Thanks,
Jan


Re: Linux-next-20190823: x86_64/i386: prot_hsymlinks.c:325: Failed to run cmd: useradd hsym

2019-08-27 Thread Jan Stancek



- Original Message -
> On Tue, 2019-08-27 at 06:25 -0400, Jan Stancek wrote:
> > That theory is probably not correct for this case, since EIO I see
> > appears
> > to originate from write and nfs_writeback_result(). This function
> > also
> > produces message we saw in logs from Naresh.
> > 
> > I can't find where/how is resp->count updated on WRITE reply in
> > NFSv2.
> > Issue also goes away with patch below, though I can't speak about its
> > correctness:
> > 
> > NFS version TypeTestReturn code
> > nfsvers=2   tcp -b:base 0
> > nfsvers=2   tcp -g:general  0
> > nfsvers=2   tcp -s:special  0
> > nfsvers=2   tcp -l:lock 0
> > Total time: 141
> > 
> > diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
> > index cbc17a203248..4913c6da270b 100644
> > --- a/fs/nfs/nfs2xdr.c
> > +++ b/fs/nfs/nfs2xdr.c
> > @@ -897,6 +897,16 @@ static int nfs2_xdr_dec_writeres(struct rpc_rqst
> > *req, struct xdr_stream *xdr,
> >  void *data)
> >  {
> > struct nfs_pgio_res *result = data;
> > +   struct rpc_task *rq_task  = req->rq_task;
> > +
> > +   if (rq_task) {
> > +   struct nfs_pgio_args *args = rq_task-
> > >tk_msg.rpc_argp;
> > +
> > +   if (args) {
> > +   result->count = args->count;
> > +   }
> > +   }
> >  
> > /* All NFSv2 writes are "file sync" writes */
> > result->verf->committed = NFS_FILE_SYNC;
> 
> Thanks! I've moved the above to nfs_write_done() so that we do it only
> on success (see
> http://git.linux-nfs.org/?p=trondmy/linux-nfs.git;a=commitdiff;h=3ba5688da709dd0f7d917029c206bc1848a6ae74
> )

Thanks, retested with 3ba5688da, all PASS:

NFS version TypeTestReturn code
nfsvers=2   tcp -b:base 0
nfsvers=2   tcp -g:general  0
nfsvers=2   tcp -s:special  0
nfsvers=2   tcp -l:lock 0

NFS version TypeTestReturn code
nfsvers=3   tcp -b:base 0
nfsvers=3   tcp -g:general  0
nfsvers=3   tcp -s:special  0
nfsvers=3   tcp -l:lock 0
nfsvers=3   tcp6-b:base 0
nfsvers=3   tcp6-g:general  0
nfsvers=3   tcp6-s:special  0
nfsvers=3   tcp6-l:lock 0

NFS version TypeTestReturn code
nfsvers=4   tcp -b:base 0
nfsvers=4   tcp -g:general  0
nfsvers=4   tcp -s:special  0
nfsvers=4   tcp -l:lock 0
nfsvers=4   tcp6-b:base 0
nfsvers=4   tcp6-g:general  0
nfsvers=4   tcp6-s:special  0
nfsvers=4   tcp6-l:lock 0

Feel free to add also:

Reported-by: Naresh Kamboju 
Tested-by: Jan Stancek 


Re: Linux-next-20190823: x86_64/i386: prot_hsymlinks.c:325: Failed to run cmd: useradd hsym

2019-08-27 Thread Jan Stancek


- Original Message -
> On Mon, 2019-08-26 at 19:12 -0400, Jan Stancek wrote:
> > - Original Message -
> > > On Mon, 2019-08-26 at 10:38 -0400, Jan Stancek wrote:
> > > > - Original Message -
> > > > > Hi Jan and Cyril,
> > > > > 
> > > > > On Mon, 26 Aug 2019 at 16:35, Jan Stancek 
> > > > > wrote:
> > > > > > 
> > > > > > - Original Message -
> > > > > > > Hi!
> > > > > > > > Do you see this LTP prot_hsymlinks failure on linux next
> > > > > > > > 20190823 on
> > > > > > > > x86_64 and i386 devices?
> > > > > > > > 
> > > > > > > > test output log,
> > > > > > > > useradd: failure while writing changes to /etc/passwd
> > > > > > > > useradd: /home/hsym was created, but could not be removed
> > > > > > > 
> > > > > > > This looks like an unrelated problem, failure to write to
> > > > > > > /etc/passwd
> > > > > > > probably means that filesystem is full or some problem
> > > > > > > happend
> > > > > > > and how
> > > > > > > is remounted RO.
> > > > > > 
> > > > > > In Naresh' example, root is on NFS:
> > > > > >   root=/dev/nfs rw
> > > > > >  
> > > > > > nfsroot=10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extr
> > > > > > act-
> > > > > > nfsrootfs-tyuevoxm,tcp,hard,intr
> > > > > 
> > > > > Right !
> > > > > root is mounted on NFS.
> > > > > 
> > > > > > 10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract-
> > > > > > nfsrootfs-tyuevoxm
> > > > > > on / type nfs
> > > > > > (rw,relatime,vers=2,rsize=4096,wsize=4096,namlen=255,hard,nol
> > > > > > ock,
> > > > > > proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.66.16.123,
> > > > > > moun
> > > > > > tvers=1,mountproto=tcp,local_lock=all,addr=10.66.16.123)
> > > > > > devtmpfs on /dev type devtmpfs
> > > > > > (rw,relatime,size=3977640k,nr_inodes=994410,mode=755)
> > > > > > 
> > > 
> > > The only thing I can think of that might cause an EIO on NFSv2
> > > would be
> > > this patch
> > > http://git.linux-nfs.org/?p=trondmy/linux-nfs.git;a=commitdiff;h=627d48e597ec5993c4abb3b81dc75e554a07c7c0
> > > assuming that a bind-related error is leaking through.
> > > 
> > > I'd suggest something like the following to fix it up:
> > 
> > No change with that patch,
> > but following one fixes it for me:
> > 
> > diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
> > index 20b3717cd7ca..56cefa0ab804 100644
> > --- a/fs/nfs/pagelist.c
> > +++ b/fs/nfs/pagelist.c
> > @@ -590,7 +590,7 @@ static void nfs_pgio_rpcsetup(struct
> > nfs_pgio_header *hdr,
> > }
> >  
> > hdr->res.fattr   = &hdr->fattr;
> > -   hdr->res.count   = 0;
> > +   hdr->res.count   = count;
> > hdr->res.eof = 0;
> > hdr->res.verf= &hdr->verf;
> > nfs_fattr_init(&hdr->fattr);
> > 
> > which is functionally revert of "NFS: Fix initialisation of I/O
> > result struct in nfs_pgio_rpcsetup".
> > 
> > This hunk caught my eye, could res.eof == 0 explain those I/O errors?
> 
> Interesting hypothesis. It could if res.count ends up being 0. So does
> the following also fix the problem?

It didn't fix it.

That theory is probably not correct for this case, since EIO I see appears
to originate from write and nfs_writeback_result(). This function also
produces message we saw in logs from Naresh.

I can't find where/how is resp->count updated on WRITE reply in NFSv2.
Issue also goes away with patch below, though I can't speak about its 
correctness:

NFS version TypeTestReturn code
nfsvers=2   tcp -b:base 0
nfsvers=2   tcp -g:general  0
nfsvers=2   tcp -s:special  0
nfsvers=2   tcp -l:lock 0
Total time: 141

diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index cbc17a203248..4913c6da270b 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -897,6 +897,16 @@ static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, 
struct xdr_stream *xdr,
 void *data)
 {
struct nfs_pgio_res *result = data;
+   struct rpc_task *rq_task  = req->rq_task;
+
+   if (rq_task) {
+   struct nfs_pgio_args *args = rq_task->tk_msg.rpc_argp;
+
+   if (args) {
+   result->count = args->count;
+   }
+   }
 
/* All NFSv2 writes are "file sync" writes */
result->verf->committed = NFS_FILE_SYNC;


Re: Linux-next-20190823: x86_64/i386: prot_hsymlinks.c:325: Failed to run cmd: useradd hsym

2019-08-26 Thread Jan Stancek


- Original Message -
> On Mon, 2019-08-26 at 10:38 -0400, Jan Stancek wrote:
> > - Original Message -
> > > Hi Jan and Cyril,
> > > 
> > > On Mon, 26 Aug 2019 at 16:35, Jan Stancek 
> > > wrote:
> > > > 
> > > > 
> > > > - Original Message -
> > > > > Hi!
> > > > > > Do you see this LTP prot_hsymlinks failure on linux next
> > > > > > 20190823 on
> > > > > > x86_64 and i386 devices?
> > > > > > 
> > > > > > test output log,
> > > > > > useradd: failure while writing changes to /etc/passwd
> > > > > > useradd: /home/hsym was created, but could not be removed
> > > > > 
> > > > > This looks like an unrelated problem, failure to write to
> > > > > /etc/passwd
> > > > > probably means that filesystem is full or some problem happend
> > > > > and how
> > > > > is remounted RO.
> > > > 
> > > > In Naresh' example, root is on NFS:
> > > >   root=/dev/nfs rw
> > > >  
> > > > nfsroot=10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract-
> > > > nfsrootfs-tyuevoxm,tcp,hard,intr
> > > 
> > > Right !
> > > root is mounted on NFS.
> > > 
> > > > 10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract-
> > > > nfsrootfs-tyuevoxm
> > > > on / type nfs
> > > > (rw,relatime,vers=2,rsize=4096,wsize=4096,namlen=255,hard,nolock,
> > > > proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.66.16.123,moun
> > > > tvers=1,mountproto=tcp,local_lock=all,addr=10.66.16.123)
> > > > devtmpfs on /dev type devtmpfs
> > > > (rw,relatime,size=3977640k,nr_inodes=994410,mode=755)
> > > > 
> 
> The only thing I can think of that might cause an EIO on NFSv2 would be
> this patch
> http://git.linux-nfs.org/?p=trondmy/linux-nfs.git;a=commitdiff;h=627d48e597ec5993c4abb3b81dc75e554a07c7c0
> assuming that a bind-related error is leaking through.
> 
> I'd suggest something like the following to fix it up:

No change with that patch,
but following one fixes it for me:

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 20b3717cd7ca..56cefa0ab804 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -590,7 +590,7 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr,
}
 
hdr->res.fattr   = &hdr->fattr;
-   hdr->res.count   = 0;
+   hdr->res.count   = count;
hdr->res.eof = 0;
hdr->res.verf= &hdr->verf;
nfs_fattr_init(&hdr->fattr);

which is functionally revert of "NFS: Fix initialisation of I/O result struct 
in nfs_pgio_rpcsetup".

This hunk caught my eye, could res.eof == 0 explain those I/O errors?
/* Emulate the eof flag, which isn't normally needed in NFSv2   
  
 * as it is guaranteed to always return the file attributes 
  
 */ 
  
if (hdr->args.offset + hdr->res.count >= hdr->res.fattr->size)  
  
hdr->res.eof = 1; 


Re: Linux-next-20190823: x86_64/i386: prot_hsymlinks.c:325: Failed to run cmd: useradd hsym

2019-08-26 Thread Jan Stancek


- Original Message -
> Hi Jan and Cyril,
> 
> On Mon, 26 Aug 2019 at 16:35, Jan Stancek  wrote:
> >
> >
> >
> > - Original Message -
> > > Hi!
> > > > Do you see this LTP prot_hsymlinks failure on linux next 20190823 on
> > > > x86_64 and i386 devices?
> > > >
> > > > test output log,
> > > > useradd: failure while writing changes to /etc/passwd
> > > > useradd: /home/hsym was created, but could not be removed
> > >
> > > This looks like an unrelated problem, failure to write to /etc/passwd
> > > probably means that filesystem is full or some problem happend and how
> > > is remounted RO.
> >
> > In Naresh' example, root is on NFS:
> >   root=/dev/nfs rw
> >   
> > nfsroot=10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract-nfsrootfs-tyuevoxm,tcp,hard,intr
> 
> Right !
> root is mounted on NFS.
> 
> >
> > 10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract-nfsrootfs-tyuevoxm
> > on / type nfs
> > (rw,relatime,vers=2,rsize=4096,wsize=4096,namlen=255,hard,nolock,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.66.16.123,mountvers=1,mountproto=tcp,local_lock=all,addr=10.66.16.123)
> > devtmpfs on /dev type devtmpfs
> > (rw,relatime,size=3977640k,nr_inodes=994410,mode=755)
> >
> > Following message repeats couple times in logs:
> >   NFS: Server wrote zero bytes, expected XXX
> >
> > Naresh, can you check if there are any errors on NFS server side?
> 
> I have re-tested the failed tests on next-20190822 and all get pass
> which is also
> using same NFS server [1] [2].

Thanks, that suggests some client side change between next-20190822 and 
next-20190823
might have introduced it.

> 
> > Maybe run NFS cthon against that server with client running next-20190822
> > and next-20190823.
> 
> Thanks for the pointers.
> I will setup and run NFS cthon on next-20190822 and next-20190823.

I'll try to reproduce too.

> 
> >
> > >
> > > I do not see the kernel messages from this job anywhere at the job
> > > pages, is it stored somewhere?
> >
> > It appears to be mixed in same log file:
> >   
> > https://qa-reports.linaro.org/lkft/linux-next-oe/build/next-20190823/testrun/886412/log
> 
> For the record the following tests failed on linux -next-20190823 on x86_64
> and i386. The filesystem is mounted on NFS and tests are using
> locally mounted hard drive ( with -d /scratch ).
> 
> Juno-r2 device filesystem mounted on NFS and did not see these errors
> and test getting pass on -next-20190823.
> 
> These failures are reproducible all time on next-20190823 kernel on x86_64
> and i386 device with root mounted on NFS [3] [4] [5] [6].
> 
> I will git bisect to find out which is bad commit.
> 
> prot_hsymlinks: [3]
> --
> useradd: failure while writing changes to /etc/passwd
> useradd: /home/hsym was created, but could not be removed
> userdel: user 'hsym' does not exist
> prot_hsymlinks1  TBROK  :  prot_hsymlinks.c:325: Failed to run
> cmd: useradd hsym
> prot_hsymlinks2  TBROK  :  prot_hsymlinks.c:325: Remaining cases broken
> prot_hsymlinks3  TBROK  :  prot_hsymlinks.c:325: Failed to run
> cmd: userdel -r hsym
> prot_hsymlinks4  TBROK  :  tst_sig.c:234: unexpected signal
> SIGIOT/SIGABRT(6) received (pid = 8324).
> 
> logrotate01: [4]
> -
> compressing log with: /bin/gzip
> error: error creating temp state file /var/lib/logrotate.status.tmp:
> Input/output error
> logrotate011  TFAIL  :  ltpapicmd.c:154: Test #1: logrotate
> command exited with 1 return code. Output:
> 
> sem_unlink_2-2: [5]
> --
> make[3]: Entering directory
> '/opt/ltp/testcases/open_posix_testsuite/conformance/interfaces/sem_unlink'
> cat: write error: Input/output error
> conformance/interfaces/sem_unlink/sem_unlink_2-2: execution: FAILED
> 
> syslog{01 ...10} [6]
> ---
> cp: failed to close '/etc/syslog.conf.ltpback': Input/output error
> syslog011  TBROK  :  ltpapicmd.c:188: failed to backup /etc/syslog.conf
> 
> cp: failed to close '/etc/syslog.conf.ltpback': Input/output error
> syslog021  TBROK  :  ltpapicmd.c:188: failed to backup /etc/syslog.conf
> 
> ...
> cp: failed to close '/etc/syslog.conf.ltpback': Input/output error
> syslog101  TBROK  :  ltpapicmd.c:188: failed to backup /etc/syslog.conf
> 
> ref:
> PASS on 20190222:
> [1] https://lkft.validation.linaro.org/scheduler/job/890446#L1232
> [2] https://lkft.validation.linaro.org/scheduler/job/890454
> 
> FAILED on 20190823:
> [3] https://lkft.validation.linaro.org/scheduler/job/890404#L1245
> [4] https://lkft.validation.linaro.org/scheduler/job/886408#L2544
> [5] https://lkft.validation.linaro.org/scheduler/job/886409#L3088
> [6] https://lkft.validation.linaro.org/scheduler/job/890400#L1234
> 
>  - Naresh
> 


Re: Linux-next-20190823: x86_64/i386: prot_hsymlinks.c:325: Failed to run cmd: useradd hsym

2019-08-26 Thread Jan Stancek



- Original Message -
> Hi!
> > Do you see this LTP prot_hsymlinks failure on linux next 20190823 on
> > x86_64 and i386 devices?
> > 
> > test output log,
> > useradd: failure while writing changes to /etc/passwd
> > useradd: /home/hsym was created, but could not be removed
> 
> This looks like an unrelated problem, failure to write to /etc/passwd
> probably means that filesystem is full or some problem happend and how
> is remounted RO.

In Naresh' example, root is on NFS:
  root=/dev/nfs rw 
nfsroot=10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract-nfsrootfs-tyuevoxm,tcp,hard,intr

10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract-nfsrootfs-tyuevoxm on 
/ type nfs 
(rw,relatime,vers=2,rsize=4096,wsize=4096,namlen=255,hard,nolock,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.66.16.123,mountvers=1,mountproto=tcp,local_lock=all,addr=10.66.16.123)
devtmpfs on /dev type devtmpfs 
(rw,relatime,size=3977640k,nr_inodes=994410,mode=755)

Following message repeats couple times in logs:
  NFS: Server wrote zero bytes, expected XXX

Naresh, can you check if there are any errors on NFS server side?
Maybe run NFS cthon against that server with client running next-20190822 and 
next-20190823.

> 
> I do not see the kernel messages from this job anywhere at the job
> pages, is it stored somewhere?

It appears to be mixed in same log file:
  
https://qa-reports.linaro.org/lkft/linux-next-oe/build/next-20190823/testrun/886412/log


[tip:locking/core] locking/rwsem: Add missing ACQUIRE to read_slowpath exit when queue is empty

2019-07-25 Thread tip-bot for Jan Stancek
Commit-ID:  e1b98fa316648420d0434d9ff5b92ad6609ba6c3
Gitweb: https://git.kernel.org/tip/e1b98fa316648420d0434d9ff5b92ad6609ba6c3
Author: Jan Stancek 
AuthorDate: Thu, 18 Jul 2019 10:51:25 +0200
Committer:  Ingo Molnar 
CommitDate: Thu, 25 Jul 2019 15:39:23 +0200

locking/rwsem: Add missing ACQUIRE to read_slowpath exit when queue is empty

LTP mtest06 has been observed to occasionally hit "still mapped when
deleted" and following BUG_ON on arm64.

The extra mapcount originated from pagefault handler, which handled
pagefault for vma that has already been detached. vma is detached
under mmap_sem write lock by detach_vmas_to_be_unmapped(), which
also invalidates vmacache.

When the pagefault handler (under mmap_sem read lock) calls
find_vma(), vmacache_valid() wrongly reports vmacache as valid.

After rwsem down_read() returns via 'queue empty' path (as of v5.2),
it does so without an ACQUIRE on sem->count:

  down_read()
__down_read()
  rwsem_down_read_failed()
__rwsem_down_read_failed_common()
  raw_spin_lock_irq(&sem->wait_lock);
  if (list_empty(&sem->wait_list)) {
if (atomic_long_read(&sem->count) >= 0) {
  raw_spin_unlock_irq(&sem->wait_lock);
  return sem;

The problem can be reproduced by running LTP mtest06 in a loop and
building the kernel (-j $NCPUS) in parallel. It does reproduce since
v4.20 on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It
triggers reliably in about an hour.

The patched kernel ran fine for 10+ hours.

Signed-off-by: Jan Stancek 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Will Deacon 
Acked-by: Waiman Long 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: dbu...@suse.de
Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty & 
no writer")
Link: 
https://lkml.kernel.org/r/50b8914e20d1d62bb2dee42d342836c2c16ebee7.1563438048.git.jstan...@redhat.com
Signed-off-by: Ingo Molnar 
---
 kernel/locking/rwsem.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index bc91aacaab58..d3ce7c6c42a6 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1036,6 +1036,8 @@ queue:
 */
if (adjustment && !(atomic_long_read(&sem->count) &
 (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
+   /* Provide lock ACQUIRE */
+   smp_acquire__after_ctrl_dep();
raw_spin_unlock_irq(&sem->wait_lock);
rwsem_set_reader_owned(sem);
lockevent_inc(rwsem_rlock_fast);


Re: [PATCH v2] locking/rwsem: add acquire barrier to read_slowpath exit when queue is empty

2019-07-18 Thread Jan Stancek


- Original Message -
> 
> It's simpler like so:
> 
> On Thu, Jul 18, 2019 at 01:04:46PM +0200, Peter Zijlstra wrote:
> > X = 0;
> > 
> > rwsem_down_read()
> >   for (;;) {
> > set_current_state(TASK_UNINTERRUPTIBLE);
> >
> > X = 1;
> >   
> > rwsem_up_write();
> >   
> > rwsem_mark_wake()
> > 
> > atomic_long_add(adjustment, &sem->count);
> > 
> > smp_store_release(&waiter->task, NULL);
> > 
> > if (!waiter.task)
> >   break;
> > 
> > ...
> >   }
> > 
> > r = X;
> > 

I see - it looks possible. Thank you for this example.


Re: [PATCH v2] locking/rwsem: add acquire barrier to read_slowpath exit when queue is empty

2019-07-18 Thread Jan Stancek


- Original Message -
> Hi Jan, Waiman, [+Jade and Paul for the litmus test at the end]
> 
> On Wed, Jul 17, 2019 at 09:22:00PM +0200, Jan Stancek wrote:
> > On Wed, Jul 17, 2019 at 10:19:04AM -0400, Waiman Long wrote:
> > > > If you add a comment to the code outlining the issue (preferably as a
> > > > litmus
> > > > test involving sem->count and some shared data which happens to be
> > > > vmacache_seqnum in your test)), then:
> > > > 
> > > > Reviewed-by: Will Deacon 
> > > > 
> > > > Thanks,
> > > > 
> > > > Will
> > > 
> > > Agreed. A comment just above smp_acquire__after_ctrl_dep() on why this
> > > is needed will be great.
> > > 
> > > Other than that,
> > > 
> > > Acked-by: Waiman Long 
> > > 
> > 
> > litmus test looks a bit long, would following be acceptable?
> > 
> > diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
> > index 37524a47f002..d9c96651bfc7 100644
> > --- a/kernel/locking/rwsem.c
> > +++ b/kernel/locking/rwsem.c
> > @@ -1032,6 +1032,13 @@ static inline bool rwsem_reader_phase_trylock(struct
> > rw_semaphore *sem,
> >  */
> > if (adjustment && !(atomic_long_read(&sem->count) &
> >  (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
> > +   /*
> > +* down_read() issued ACQUIRE on enter, but we can race
> > +* with writer who did RELEASE only after us.
> > +* ACQUIRE here makes sure reader operations happen only
> > +* after all writer ones.
> > +*/
> 
> How about an abridged form of the litmus test here, just to show the code
> flow? e.g.:
> 
> /*
>  * We need to ensure ACQUIRE semantics when reading sem->count so that
>  * we pair with the RELEASE store performed by an unlocking/downgrading
>  * writer.
>  *
>  * P0 (writer)P1 (reader)
>  *
>  * down_write(sem);
>  * 
>  * downgrade_write(sem);
>  * -> fetch_add_release(&sem->count)
>  *
>  *down_read_slowpath(sem);
>  *-> atomic_read(&sem->count)
>  *   
>  *   smp_acquire__after_ctrl_dep()
>  *
>  */

Works for me. The code is at 3 level of indentation, but I can try
to squeeze it in for v4.

> 
> In writing this, I also noticed that we don't have any explicit ordering
> at the end of the reader slowpath when we wait on the queue but get woken
> immediately:
> 
>   if (!waiter.task)
>   break;
> 
> Am I missing something?

I'm assuming this isn't problem, because set_current_state() on line above
is using smp_store_mb().

> 
> > +   smp_acquire__after_ctrl_dep();
> > raw_spin_unlock_irq(&sem->wait_lock);
> > rwsem_set_reader_owned(sem);
> > lockevent_inc(rwsem_rlock_fast);
> > 
> > 
> > with litmus test in commit log:
> > --- 8< 
> > C rwsem
> > 
> > {
> > atomic_t rwsem_count = ATOMIC_INIT(1);
> > int vmacache_seqnum = 10;
> > }
> > 
> > P0(int *vmacache_seqnum, atomic_t *rwsem_count)
> > {
> > r0 = READ_ONCE(*vmacache_seqnum);
> > WRITE_ONCE(*vmacache_seqnum, r0 + 1);
> > /* downgrade_write */
> > r1 = atomic_fetch_add_release(-1+256, rwsem_count);
> > }
> > 
> > P1(int *vmacache_seqnum, atomic_t *rwsem_count, spinlock_t *sem_wait_lock)
> > {
> > /* rwsem_read_trylock */
> > r0 = atomic_add_return_acquire(256, rwsem_count);
> > /* rwsem_down_read_slowpath */
> > spin_lock(sem_wait_lock);
> > r0 = atomic_read(rwsem_count);
> > if ((r0 & 1) == 0) {
> > // BUG: needs barrier
> > spin_unlock(sem_wait_lock);
> > r1 = READ_ONCE(*vmacache_seqnum);
> > }
> > }
> > exists (1:r1=10)
> > --- 8< 
> 
> Thanks for writing this! It's definitely worth sticking it in the commit
> log, but Paul and Jade might also like to include it as part of their litmus
> test repository too.
> 
> Will
> 


[PATCH v3] locking/rwsem: add acquire barrier to read_slowpath exit when queue is empty

2019-07-18 Thread Jan Stancek
LTP mtest06 has been observed to rarely hit "still mapped when deleted"
and following BUG_ON on arm64:
  page:7e02fa37e480 refcount:3 mapcount:1 mapping:80be3d678ab0 index:0x0
  xfs_address_space_operations [xfs]
  flags: 0xbfffe00037(locked|referenced|uptodate|lru|active)
  page dumped because: VM_BUG_ON_PAGE(page_mapped(page))
  [ cut here ]
  kernel BUG at mm/filemap.c:171!
  Internal error: Oops - BUG: 0 [#1] SMP
  CPU: 220 PID: 154292 Comm: mmap1 Not tainted 5.2.0-0ecfebd.cki #1
  Hardware name: HPE Apollo 70 /C01_APACHE_MB , BIOS L50_5.13_1.10 05/17/2019
  pstate: 40400089 (nZcv daIf +PAN -UAO)
  pc : unaccount_page_cache_page+0x17c/0x1a0
  lr : unaccount_page_cache_page+0x17c/0x1a0
  Call trace:
  unaccount_page_cache_page+0x17c/0x1a0
  delete_from_page_cache_batch+0xa0/0x300
  truncate_inode_pages_range+0x1b8/0x640
  truncate_inode_pages_final+0x88/0xa8
  evict+0x1a0/0x1d8
  iput+0x150/0x240
  dentry_unlink_inode+0x120/0x130
  __dentry_kill+0xd8/0x1d0
  dentry_kill+0x88/0x248
  dput+0x168/0x1b8
  __fput+0xe8/0x208
  fput+0x20/0x30
  task_work_run+0xc0/0xf0
  do_notify_resume+0x2b0/0x328
  work_pending+0x8/0x10

The extra mapcount originated from pagefault handler, which handled
pagefault for vma that has already been detached. vma is detached
under mmap_sem write lock by detach_vmas_to_be_unmapped(), which
also invalidates vmacache.

When pagefault handler (under mmap_sem read lock) called find_vma(),
vmacache_valid() wrongly reported vmacache as valid.

After rwsem down_read() returns via 'queue empty' path (as of v5.2),
it does so without issuing read_acquire on sem->count:
  down_read
__down_read
  rwsem_down_read_failed
__rwsem_down_read_failed_common
  raw_spin_lock_irq(&sem->wait_lock);
  if (list_empty(&sem->wait_list)) {
if (atomic_long_read(&sem->count) >= 0) {
  raw_spin_unlock_irq(&sem->wait_lock);
  return sem;

Suspected problem here is that last *_acquire on down_read() side
happens before write side issues *_release:
  1. writer: has the lock
  2. reader: down_read() issues *read_acquire on entry
  3. writer: mm->vmacache_seqnum++; downgrades lock (*fetch_add_release)
  4. reader: __rwsem_down_read_failed_common() finds it can take lock and 
returns
  5. reader: observes stale mm->vmacache_seqnum

--- 8< 
C rwsem

{
atomic_t rwsem_count = ATOMIC_INIT(1);
int vmacache_seqnum = 10;
}

P0(int *vmacache_seqnum, atomic_t *rwsem_count)
{
r0 = READ_ONCE(*vmacache_seqnum);
WRITE_ONCE(*vmacache_seqnum, r0 + 1);
/* downgrade_write */
r1 = atomic_fetch_add_release(-1+256, rwsem_count);
}

P1(int *vmacache_seqnum, atomic_t *rwsem_count, spinlock_t *sem_wait_lock)
{
/* rwsem_read_trylock */
r0 = atomic_add_return_acquire(256, rwsem_count);
/* rwsem_down_read_slowpath */
spin_lock(sem_wait_lock);
r0 = atomic_read(rwsem_count);
if ((r0 & 1) == 0) {
// BUG: needs barrier
spin_unlock(sem_wait_lock);
r1 = READ_ONCE(*vmacache_seqnum);
}
}
exists (1:r1=10)
--- >8 

I can reproduce the problem by running LTP mtest06 in a loop and building
kernel (-j $NCPUS) in parallel. It does reproduce since v4.20 up to v5.2
on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It triggers reliably
within ~hour. Patched kernel ran fine for 10+ hours with clean dmesg.
Tests were done against v5.2, since commit cf69482d62d9 ("locking/rwsem:
Enable readers spinning on writer") makes it much harder to reproduce.

v2: Move barrier after test (Waiman Long)
Use smp_acquire__after_ctrl_dep() (Peter Zijlstra)
v3: Add comment to barrier (Waiman Long, Will Deacon)
Add litmus test

Related: 
https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c
Related: commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in 
munmap")
Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty & 
no writer")

Signed-off-by: Jan Stancek 
Reviewed-by: Will Deacon 
Acked-by: Waiman Long 
Cc: sta...@vger.kernel.org # v4.20+
Cc: Waiman Long 
Cc: Davidlohr Bueso 
Cc: Will Deacon 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
---
 kernel/locking/rwsem.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 37524a47f002..fe02aef39e9d 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1032,6 +1032,13 @@ static inline bool rwsem_reader_phase_trylock(struct 
rw_semaphore *sem,
 */
if (adjustment && !(atomic_long_read(&sem->count) &
 (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {

Re: [PATCH v2] locking/rwsem: add acquire barrier to read_slowpath exit when queue is empty

2019-07-17 Thread Jan Stancek

On Wed, Jul 17, 2019 at 10:19:04AM -0400, Waiman Long wrote:

If you add a comment to the code outlining the issue (preferably as a litmus
test involving sem->count and some shared data which happens to be
vmacache_seqnum in your test)), then:

Reviewed-by: Will Deacon 

Thanks,

Will


Agreed. A comment just above smp_acquire__after_ctrl_dep() on why this
is needed will be great.

Other than that,

Acked-by: Waiman Long 



litmus test looks a bit long, would following be acceptable?

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 37524a47f002..d9c96651bfc7 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1032,6 +1032,13 @@ static inline bool rwsem_reader_phase_trylock(struct 
rw_semaphore *sem,
 */
if (adjustment && !(atomic_long_read(&sem->count) &
 (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
+   /*
+* down_read() issued ACQUIRE on enter, but we can race
+* with writer who did RELEASE only after us.
+* ACQUIRE here makes sure reader operations happen only
+* after all writer ones.
+*/
+   smp_acquire__after_ctrl_dep();
raw_spin_unlock_irq(&sem->wait_lock);
rwsem_set_reader_owned(sem);
lockevent_inc(rwsem_rlock_fast);


with litmus test in commit log:
--- 8< 
C rwsem

{
atomic_t rwsem_count = ATOMIC_INIT(1);
int vmacache_seqnum = 10;
}

P0(int *vmacache_seqnum, atomic_t *rwsem_count)
{
r0 = READ_ONCE(*vmacache_seqnum);
WRITE_ONCE(*vmacache_seqnum, r0 + 1);
/* downgrade_write */
r1 = atomic_fetch_add_release(-1+256, rwsem_count);
}

P1(int *vmacache_seqnum, atomic_t *rwsem_count, spinlock_t *sem_wait_lock)
{
/* rwsem_read_trylock */
r0 = atomic_add_return_acquire(256, rwsem_count);
/* rwsem_down_read_slowpath */
spin_lock(sem_wait_lock);
r0 = atomic_read(rwsem_count);
if ((r0 & 1) == 0) {
// BUG: needs barrier
spin_unlock(sem_wait_lock);
r1 = READ_ONCE(*vmacache_seqnum);
}
}
exists (1:r1=10)
--- 8< 

Thanks,
Jan



[PATCH v2] locking/rwsem: add acquire barrier to read_slowpath exit when queue is empty

2019-07-17 Thread Jan Stancek
LTP mtest06 has been observed to rarely hit "still mapped when deleted"
and following BUG_ON on arm64:
  page:7e02fa37e480 refcount:3 mapcount:1 mapping:80be3d678ab0 index:0x0
  xfs_address_space_operations [xfs]
  flags: 0xbfffe00037(locked|referenced|uptodate|lru|active)
  page dumped because: VM_BUG_ON_PAGE(page_mapped(page))
  [ cut here ]
  kernel BUG at mm/filemap.c:171!
  Internal error: Oops - BUG: 0 [#1] SMP
  CPU: 220 PID: 154292 Comm: mmap1 Not tainted 5.2.0-0ecfebd.cki #1
  Hardware name: HPE Apollo 70 /C01_APACHE_MB , BIOS L50_5.13_1.10 05/17/2019
  pstate: 40400089 (nZcv daIf +PAN -UAO)
  pc : unaccount_page_cache_page+0x17c/0x1a0
  lr : unaccount_page_cache_page+0x17c/0x1a0
  Call trace:
  unaccount_page_cache_page+0x17c/0x1a0
  delete_from_page_cache_batch+0xa0/0x300
  truncate_inode_pages_range+0x1b8/0x640
  truncate_inode_pages_final+0x88/0xa8
  evict+0x1a0/0x1d8
  iput+0x150/0x240
  dentry_unlink_inode+0x120/0x130
  __dentry_kill+0xd8/0x1d0
  dentry_kill+0x88/0x248
  dput+0x168/0x1b8
  __fput+0xe8/0x208
  fput+0x20/0x30
  task_work_run+0xc0/0xf0
  do_notify_resume+0x2b0/0x328
  work_pending+0x8/0x10

The extra mapcount originated from pagefault handler, which handled
pagefault for vma that has already been detached. vma is detached
under mmap_sem write lock by detach_vmas_to_be_unmapped(), which
also invalidates vmacache.

When pagefault handler (under mmap_sem read lock) called find_vma(),
vmacache_valid() wrongly reported vmacache as valid.

After rwsem down_read() returns via 'queue empty' path (as of v5.2),
it does so without issuing read_acquire on sem->count:
  down_read
__down_read
  rwsem_down_read_failed
__rwsem_down_read_failed_common
  raw_spin_lock_irq(&sem->wait_lock);
  if (list_empty(&sem->wait_list)) {
if (atomic_long_read(&sem->count) >= 0) {
  raw_spin_unlock_irq(&sem->wait_lock);
  return sem;

Suspected problem here is that last *_acquire on down_read() side
happens before write side issues *_release:
  1. writer: has the lock
  2. reader: down_read() issues *read_acquire on entry
  3. writer: mm->vmacache_seqnum++; downgrades lock (*fetch_add_release)
  4. reader: __rwsem_down_read_failed_common() finds it can take lock and 
returns
  5. reader: observes stale mm->vmacache_seqnum

I can reproduce the problem by running LTP mtest06 in a loop and building
kernel (-j $NCPUS) in parallel. It does reproduce since v4.20 up to v5.2
on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It triggers reliably
within ~hour. Patched kernel ran fine for 10+ hours with clean dmesg.
Tests were done against v5.2, since commit cf69482d62d9 ("locking/rwsem:
Enable readers spinning on writer") makes it much harder to reproduce.

v2: Move barrier after test (Waiman Long)
Use smp_acquire__after_ctrl_dep() (Peter Zijlstra)

Related: 
https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c
Related: commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in 
munmap")
Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty & 
no writer")

Signed-off-by: Jan Stancek 
Cc: sta...@vger.kernel.org # v4.20+
Cc: Waiman Long 
Cc: Davidlohr Bueso 
Cc: Will Deacon 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
---
 kernel/locking/rwsem.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 37524a47f002..5ac72b60608b 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1032,6 +1032,7 @@ static inline bool rwsem_reader_phase_trylock(struct 
rw_semaphore *sem,
 */
if (adjustment && !(atomic_long_read(&sem->count) &
 (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
+   smp_acquire__after_ctrl_dep();
raw_spin_unlock_irq(&sem->wait_lock);
rwsem_set_reader_owned(sem);
lockevent_inc(rwsem_rlock_fast);
-- 
1.8.3.1



Re: [PATCH] locking/rwsem: use read_acquire in read_slowpath exit when queue is empty

2019-07-16 Thread Jan Stancek



- Original Message -
> On 7/16/19 12:04 PM, Jan Stancek wrote:
> > LTP mtest06 has been observed to rarely hit "still mapped when deleted"
> > and following BUG_ON on arm64:
> >   page:7e02fa37e480 refcount:3 mapcount:1 mapping:80be3d678ab0
> >   index:0x0
> >   xfs_address_space_operations [xfs]
> >   flags: 0xbfffe00037(locked|referenced|uptodate|lru|active)
> >   page dumped because: VM_BUG_ON_PAGE(page_mapped(page))
> >   [ cut here ]
> >   kernel BUG at mm/filemap.c:171!
> >   Internal error: Oops - BUG: 0 [#1] SMP
> >   CPU: 220 PID: 154292 Comm: mmap1 Not tainted 5.2.0-0ecfebd.cki #1
> >   Hardware name: HPE Apollo 70 /C01_APACHE_MB , BIOS L50_5.13_1.10
> >   05/17/2019
> >   pstate: 40400089 (nZcv daIf +PAN -UAO)
> >   pc : unaccount_page_cache_page+0x17c/0x1a0
> >   lr : unaccount_page_cache_page+0x17c/0x1a0
> >   Call trace:
> >   unaccount_page_cache_page+0x17c/0x1a0
> >   delete_from_page_cache_batch+0xa0/0x300
> >   truncate_inode_pages_range+0x1b8/0x640
> >   truncate_inode_pages_final+0x88/0xa8
> >   evict+0x1a0/0x1d8
> >   iput+0x150/0x240
> >   dentry_unlink_inode+0x120/0x130
> >   __dentry_kill+0xd8/0x1d0
> >   dentry_kill+0x88/0x248
> >   dput+0x168/0x1b8
> >   __fput+0xe8/0x208
> >   fput+0x20/0x30
> >   task_work_run+0xc0/0xf0
> >   do_notify_resume+0x2b0/0x328
> >   work_pending+0x8/0x10
> >
> > The extra mapcount originated from pagefault handler, which handled
> > pagefault for vma that has already been detached. vma is detached
> > under mmap_sem write lock by detach_vmas_to_be_unmapped(), which
> > also invalidates vmacache.
> >
> > When pagefault handler (under mmap_sem read lock) called find_vma(),
> > vmacache_valid() wrongly reported vmacache as valid.
> >
> > After rwsem down_read() returns via 'queue empty' path (as of v5.2),
> > it does so without issuing read_acquire on sem->count:
> >   down_read
> > __down_read
> >   rwsem_down_read_failed
> > __rwsem_down_read_failed_common
> >   raw_spin_lock_irq(&sem->wait_lock);
> >   if (list_empty(&sem->wait_list)) {
> > if (atomic_long_read(&sem->count) >= 0) {
> >   raw_spin_unlock_irq(&sem->wait_lock);
> >   return sem;
> >
> > Suspected problem here is that last *_acquire on down_read() side
> > happens before write side issues *_release:
> >   1. writer: has the lock
> >   2. reader: down_read() issues *read_acquire on entry
> >   3. writer: mm->vmacache_seqnum++; downgrades lock (*fetch_add_release)
> >   4. reader: __rwsem_down_read_failed_common() finds it can take lock and
> >   returns
> >   5. reader: observes stale mm->vmacache_seqnum
> >
> > I can reproduce the problem by running LTP mtest06 in a loop and building
> > kernel (-j $NCPUS) in parallel. It does reproduce since v4.20 up to v5.2
> > on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It triggers reliably
> > within ~hour. Patched kernel ran fine for 5+ hours with clean dmesg.
> > Tests were done against v5.2, since commit cf69482d62d9 ("locking/rwsem:
> > Enable readers spinning on writer") makes it much harder to reproduce.
> The explanation makes sense to me. Thanks for the investigation.
> >
> > Related:
> > https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c
> > Related: commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in
> > munmap")
> > Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty
> > & no writer")
> >
> > Signed-off-by: Jan Stancek 
> > Cc: Waiman Long 
> > Cc: Davidlohr Bueso 
> > Cc: Will Deacon 
> > Cc: Peter Zijlstra 
> > Cc: Ingo Molnar 
> > ---
> >  kernel/locking/rwsem.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
> > index 37524a47f002..757b198d7a5b 100644
> > --- a/kernel/locking/rwsem.c
> > +++ b/kernel/locking/rwsem.c
> > @@ -1030,7 +1030,7 @@ static inline bool rwsem_reader_phase_trylock(struct
> > rw_semaphore *sem,
> >  * exit the slowpath and return immediately as its
> >  * RWSEM_READER_BIAS has already been set in the count.
> >  */
> > -   if (adjustment && !(atomic_long_read(&sem->count) &
> > +   if (adjustment && !(atomic_long_read_acquire(&sem->count) &
> >  (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
> > raw_spin_unlock_irq(&sem->wait_lock);
> > rwsem_set_reader_owned(sem);
> 
> The chance of taking this path is not that high. So instead of
> increasing the cost of the test by adding an acquire barrier,

Yes, that is good point.

> how about
> just adding smp_mb__after_spinlock() before spin_unlock_irq(). This
> should have the same effect of making sure that no stale data will be
> used in the read-lock critical section.

I'll redo the tests with smp_mb__after_spinlock().

Thanks,
Jan

> 
> -Longman
> 
> 


[PATCH] locking/rwsem: use read_acquire in read_slowpath exit when queue is empty

2019-07-16 Thread Jan Stancek
LTP mtest06 has been observed to rarely hit "still mapped when deleted"
and following BUG_ON on arm64:
  page:7e02fa37e480 refcount:3 mapcount:1 mapping:80be3d678ab0 index:0x0
  xfs_address_space_operations [xfs]
  flags: 0xbfffe00037(locked|referenced|uptodate|lru|active)
  page dumped because: VM_BUG_ON_PAGE(page_mapped(page))
  [ cut here ]
  kernel BUG at mm/filemap.c:171!
  Internal error: Oops - BUG: 0 [#1] SMP
  CPU: 220 PID: 154292 Comm: mmap1 Not tainted 5.2.0-0ecfebd.cki #1
  Hardware name: HPE Apollo 70 /C01_APACHE_MB , BIOS L50_5.13_1.10 05/17/2019
  pstate: 40400089 (nZcv daIf +PAN -UAO)
  pc : unaccount_page_cache_page+0x17c/0x1a0
  lr : unaccount_page_cache_page+0x17c/0x1a0
  Call trace:
  unaccount_page_cache_page+0x17c/0x1a0
  delete_from_page_cache_batch+0xa0/0x300
  truncate_inode_pages_range+0x1b8/0x640
  truncate_inode_pages_final+0x88/0xa8
  evict+0x1a0/0x1d8
  iput+0x150/0x240
  dentry_unlink_inode+0x120/0x130
  __dentry_kill+0xd8/0x1d0
  dentry_kill+0x88/0x248
  dput+0x168/0x1b8
  __fput+0xe8/0x208
  fput+0x20/0x30
  task_work_run+0xc0/0xf0
  do_notify_resume+0x2b0/0x328
  work_pending+0x8/0x10

The extra mapcount originated from pagefault handler, which handled
pagefault for vma that has already been detached. vma is detached
under mmap_sem write lock by detach_vmas_to_be_unmapped(), which
also invalidates vmacache.

When pagefault handler (under mmap_sem read lock) called find_vma(),
vmacache_valid() wrongly reported vmacache as valid.

After rwsem down_read() returns via 'queue empty' path (as of v5.2),
it does so without issuing read_acquire on sem->count:
  down_read
__down_read
  rwsem_down_read_failed
__rwsem_down_read_failed_common
  raw_spin_lock_irq(&sem->wait_lock);
  if (list_empty(&sem->wait_list)) {
if (atomic_long_read(&sem->count) >= 0) {
  raw_spin_unlock_irq(&sem->wait_lock);
  return sem;

Suspected problem here is that last *_acquire on down_read() side
happens before write side issues *_release:
  1. writer: has the lock
  2. reader: down_read() issues *read_acquire on entry
  3. writer: mm->vmacache_seqnum++; downgrades lock (*fetch_add_release)
  4. reader: __rwsem_down_read_failed_common() finds it can take lock and 
returns
  5. reader: observes stale mm->vmacache_seqnum

I can reproduce the problem by running LTP mtest06 in a loop and building
kernel (-j $NCPUS) in parallel. It does reproduce since v4.20 up to v5.2
on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It triggers reliably
within ~hour. Patched kernel ran fine for 5+ hours with clean dmesg.
Tests were done against v5.2, since commit cf69482d62d9 ("locking/rwsem:
Enable readers spinning on writer") makes it much harder to reproduce.

Related: 
https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c
Related: commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in 
munmap")
Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty & 
no writer")

Signed-off-by: Jan Stancek 
Cc: Waiman Long 
Cc: Davidlohr Bueso 
Cc: Will Deacon 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
---
 kernel/locking/rwsem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 37524a47f002..757b198d7a5b 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1030,7 +1030,7 @@ static inline bool rwsem_reader_phase_trylock(struct 
rw_semaphore *sem,
 * exit the slowpath and return immediately as its
 * RWSEM_READER_BIAS has already been set in the count.
 */
-   if (adjustment && !(atomic_long_read(&sem->count) &
+   if (adjustment && !(atomic_long_read_acquire(&sem->count) &
 (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
raw_spin_unlock_irq(&sem->wait_lock);
rwsem_set_reader_owned(sem);
-- 
1.8.3.1



Re: [LTP] [mm] 8c7829b04c: ltp.overcommit_memory01.fail

2019-05-20 Thread Jan Stancek



- Original Message -
> FYI, we noticed the following commit (built with gcc-7):
> 
> commit: 8c7829b04c523cdc732cb77f59f03320e09f3386 ("mm: fix false-positive
> OVERCOMMIT_GUESS failures")

This matches report from Naresh:
  http://lists.linux.it/pipermail/ltp/2019-May/011962.html

> https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git master
> 
> in testcase: ltp
> with following parameters:
> 
>   disk: 1HDD
>   test: mm-01
> 
> test-description: The LTP testsuite contains a collection of tools for
> testing the Linux kernel and related features.
> test-url: http://linux-test-project.github.io/
> 
> 
> on test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 4G
> 
> caused below changes (please refer to attached dmesg/kmsg for entire
> log/backtrace):
> 
> 
> If you fix the issue, kindly add following tag
> Reported-by: kernel test robot 
> 
> 
> [  554.112267 ] <<>>
> [  554.112269 ]
> [  554.115806 ] tag=overcommit_memory01 stime=1558074982
> [  554.115809 ]
> [  554.119303 ] cmdline="overcommit_memory"
> [  554.119306 ]
> [  554.121962 ] contacts=""
> [  554.121965 ]
> [  554.124459 ] analysis=exit
> [  554.124463 ]
> [  554.127140 ] <<>>
> [  554.127144 ]
> [  554.131401 ] tst_test.c:1096: INFO: Timeout per run is 0h 05m 00s
> [  554.131404 ]
> [  554.136248 ] overcommit_memory.c:117: INFO: MemTotal is 4033124 kB
> [  554.136251 ]
> [  554.141365 ] overcommit_memory.c:119: INFO: SwapTotal is 268435452 kB
> [  554.141368 ]
> [  554.146664 ] overcommit_memory.c:123: INFO: CommitLimit is 270452012 kB
> [  554.14 ]
> [  554.151375 ] mem.c:814: INFO: set overcommit_ratio to 50
> [  554.151378 ]
> [  554.155796 ] mem.c:814: INFO: set overcommit_memory to 2
> [  554.155799 ]
> [  554.159951 ] overcommit_memory.c:190: INFO: malloc 538608648 kB failed
> [  554.159953 ]
> [  554.164801 ] overcommit_memory.c:211: PASS: alloc failed as expected
> [  554.164804 ]
> [  554.170196 ] overcommit_memory.c:190: INFO: malloc 270452012 kB failed
> [  554.170199 ]
> [  554.175344 ] overcommit_memory.c:211: PASS: alloc failed as expected
> [  554.175347 ]
> [  554.180932 ] overcommit_memory.c:186: INFO: malloc 134651878 kB
> successfully
> [  554.180935 ]
> [  554.186522 ] overcommit_memory.c:205: PASS: alloc passed as expected
> [  554.186524 ]
> [  554.191234 ] mem.c:814: INFO: set overcommit_memory to 0
> [  554.191237 ]
> [  554.196930 ] overcommit_memory.c:186: INFO: malloc 134734122 kB
> successfully
> [  554.196933 ]
> [  554.202738 ] overcommit_memory.c:205: PASS: alloc passed as expected
> [  554.202742 ]
> [  554.208563 ] overcommit_memory.c:190: INFO: malloc 538936488 kB failed
> [  554.208566 ]
> [  554.214355 ] overcommit_memory.c:211: PASS: alloc failed as expected
> [  554.214357 ]
> [  554.220506 ] overcommit_memory.c:186: INFO: malloc 272468576 kB
> successfully
> [  554.220509 ]
> [  554.226564 ] overcommit_memory.c:213: FAIL: alloc passed, expected to fail
> [  554.226568 ]
> [  554.231870 ] mem.c:814: INFO: set overcommit_memory to 1
> [  554.231873 ]
> [  554.237819 ] overcommit_memory.c:186: INFO: malloc 136234288 kB
> successfully
> [  554.237821 ]
> [  554.243711 ] overcommit_memory.c:205: PASS: alloc passed as expected
> [  554.243713 ]
> [  554.249791 ] overcommit_memory.c:186: INFO: malloc 272468576 kB
> successfully
> [  554.249794 ]
> [  554.255069 ] overcommit_memory.c:205: PASS: alloc passed as expected
> [  554.255073 ]
> [  554.261594 ] overcommit_memory.c:186: INFO: malloc 544937152 kB
> successfully
> [  554.261597 ]
> [  554.267855 ] overcommit_memory.c:205: PASS: alloc passed as expected
> [  554.267858 ]
> [  554.273445 ] mem.c:814: INFO: set overcommit_memory to 0
> [  554.273448 ]
> [  554.278882 ] mem.c:814: INFO: set overcommit_ratio to 50
> [  554.278885 ]
> [  554.282419 ]
> [  554.284170 ] Summary:
> [  554.284173 ]
> [  554.287476 ] passed   8
> [  554.287480 ]
> [  554.290740 ] failed   1
> [  554.290743 ]
> [  554.293971 ] skipped  0
> [  554.293974 ]
> [  554.297123 ] warnings 0
> [  554.297126 ]
> [  554.300746 ] <<>>
> [  554.300749 ]
> [  554.304489 ] initiation_status="ok"
> [  554.304492 ]
> [  554.309762 ] duration=0 termination_type=exited termination_id=1
> corefile=no
> [  554.309765 ]
> [  554.313681 ] cutime=0 cstime=0
> [  554.313685 ]
> [  554.316880 ] <<>>
> 
> 
> 
> To reproduce:
> 
> # build kernel
>   cd linux
>   cp config-5.1.0-10246-g8c7829b .config
>   make HOSTCC=gcc-7 CC=gcc-7 ARCH=x86_64 olddefconfig
>   make HOSTCC=gcc-7 CC=gcc-7 ARCH=x86_64 prepare
>   make HOSTCC=gcc-7 CC=gcc-7 ARCH=x86_64 modules_prepare
>   make HOSTCC=gcc-7 CC=gcc-7 ARCH=x86_64 SHELL=/bin/bash
>   make HOSTCC=gcc-7 CC=gcc-7 ARCH=x86_64 bzImage
> 
> 
> git clone https://github.com/intel/lkp-tests.git
> cd lkp-tests
> find lib/ | cpio -o -H newc --quiet | gzip > modules.cgz
>   bin/lkp qemu -k  -m modules.cgz job-script # job-script is 
> attached
>   in this ema

Re: [v2 PATCH] mm: mmu_gather: remove __tlb_reset_range() for force flush

2019-05-16 Thread Jan Stancek



- Original Message -
> On Mon, May 13, 2019 at 04:01:09PM -0700, Yang Shi wrote:
> > 
> > 
> > On 5/13/19 9:38 AM, Will Deacon wrote:
> > > On Fri, May 10, 2019 at 07:26:54AM +0800, Yang Shi wrote:
> > > > diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> > > > index 99740e1..469492d 100644
> > > > --- a/mm/mmu_gather.c
> > > > +++ b/mm/mmu_gather.c
> > > > @@ -245,14 +245,39 @@ void tlb_finish_mmu(struct mmu_gather *tlb,
> > > >   {
> > > > /*
> > > >  * If there are parallel threads are doing PTE changes on same 
> > > > range
> > > > -* under non-exclusive lock(e.g., mmap_sem read-side) but defer 
> > > > TLB
> > > > -* flush by batching, a thread has stable TLB entry can fail to 
> > > > flush
> > > > -* the TLB by observing pte_none|!pte_dirty, for example so 
> > > > flush TLB
> > > > -* forcefully if we detect parallel PTE batching threads.
> > > > +* under non-exclusive lock (e.g., mmap_sem read-side) but 
> > > > defer TLB
> > > > +* flush by batching, one thread may end up seeing inconsistent 
> > > > PTEs
> > > > +* and result in having stale TLB entries.  So flush TLB 
> > > > forcefully
> > > > +* if we detect parallel PTE batching threads.
> > > > +*
> > > > +* However, some syscalls, e.g. munmap(), may free page tables, 
> > > > this
> > > > +* needs force flush everything in the given range. Otherwise 
> > > > this
> > > > +* may result in having stale TLB entries for some 
> > > > architectures,
> > > > +* e.g. aarch64, that could specify flush what level TLB.
> > > >  */
> > > > -   if (mm_tlb_flush_nested(tlb->mm)) {
> > > > -   __tlb_reset_range(tlb);
> > > > -   __tlb_adjust_range(tlb, start, end - start);
> > > > +   if (mm_tlb_flush_nested(tlb->mm) && !tlb->fullmm) {
> > > > +   /*
> > > > +* Since we can't tell what we actually should have
> > > > +* flushed, flush everything in the given range.
> > > > +*/
> > > > +   tlb->freed_tables = 1;
> > > > +   tlb->cleared_ptes = 1;
> > > > +   tlb->cleared_pmds = 1;
> > > > +   tlb->cleared_puds = 1;
> > > > +   tlb->cleared_p4ds = 1;
> > > > +
> > > > +   /*
> > > > +* Some architectures, e.g. ARM, that have range 
> > > > invalidation
> > > > +* and care about VM_EXEC for I-Cache invalidation, 
> > > > need force
> > > > +* vma_exec set.
> > > > +*/
> > > > +   tlb->vma_exec = 1;
> > > > +
> > > > +   /* Force vma_huge clear to guarantee safer flush */
> > > > +   tlb->vma_huge = 0;
> > > > +
> > > > +   tlb->start = start;
> > > > +   tlb->end = end;
> > > > }
> > > Whilst I think this is correct, it would be interesting to see whether
> > > or not it's actually faster than just nuking the whole mm, as I mentioned
> > > before.
> > > 
> > > At least in terms of getting a short-term fix, I'd prefer the diff below
> > > if it's not measurably worse.
> > 
> > I did a quick test with ebizzy (96 threads with 5 iterations) on my x86 VM,
> > it shows slightly slowdown on records/s but much more sys time spent with
> > fullmm flush, the below is the data.
> > 
> >     nofullmm fullmm
> > ops (records/s)  225606  225119
> > sys (s)    0.69    1.14
> > 
> > It looks the slight reduction of records/s is caused by the increase of sys
> > time.
> 
> That's not what I expected, and I'm unable to explain why moving to fullmm
> would /increase/ the system time. I would've thought the time spent doing
> the invalidation would decrease, with the downside that the TLB is cold
> when returning back to userspace.
> 

I tried ebizzy with various parameters (malloc vs mmap, ran it for an hour),
but performance was very similar for both patches.

So, I was looking for a workload that would demonstrate the largest difference.
Inspired by python xml-rpc, which can handle each request in new thread,
I tried following [1]:

16 threads, each looping 100k times over:
  mmap(16M)
  touch 1 page
  madvise(DONTNEED)
  munmap(16M)

This yields quite significant difference for 2 patches when running on
my 46 CPU arm host. I checked it twice - applied patch, recompiled, rebooted,
but numbers stayed +- couple seconds the same.

Does it somewhat match your expectation?

v2 patch
-
real2m33.460s
user0m3.359s
sys 15m32.307s

real2m33.895s
user0m2.749s
sys 16m34.500s

real2m35.666s
user0m3.528s
sys 15m23.377s

real2m32.898s
user0m2.789s
sys 16m18.801s

real2m33.087s
user0m3.565s
sys 16m23.815s


fullmm version
---
real0m46.811s
user0m1.596s
sys 1m

Re: LTP: mm: overcommit_memory01, 03...06 failed

2019-05-15 Thread Jan Stancek



- Original Message -
> ltp-mm-tests failed on Linux mainline kernel  5.1.0,
>   * overcommit_memory01 overcommit_memory
>   * overcommit_memory03 overcommit_memory -R 30
>   * overcommit_memory04 overcommit_memory -R 80
>   * overcommit_memory05 overcommit_memory -R 100
>   * overcommit_memory06 overcommit_memory -R 200
> 
> mem.c:814: INFO: set overcommit_memory to 0
> overcommit_memory.c:185: INFO: malloc 8094844 kB successfully
> overcommit_memory.c:204: PASS: alloc passed as expected
> overcommit_memory.c:189: INFO: malloc 32379376 kB failed
> overcommit_memory.c:210: PASS: alloc failed as expected
> overcommit_memory.c:185: INFO: malloc 16360216 kB successfully
> overcommit_memory.c:212: FAIL: alloc passed, expected to fail
> 
> Failed test log,
> https://lkft.validation.linaro.org/scheduler/job/726417#L22852
> 
> LTP version 20190115
> 
> Test case link,
> https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/tunable/overcommit_memory.c#L212
> 
> First bad commit:
> git branch master
> git commit e0654264c4806dc436b291294a0fbf9be7571ab6
> git describe v5.1-10706-ge0654264c480
> git repo https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
> 
> Last good commit:
> git branch master
> git commit 7e9890a3500d95c01511a4c45b7e7192dfa47ae2
> git describe v5.1-10326-g7e9890a3500d
> git repo https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Heuristic changed a bit in:
  commit 8c7829b04c523cdc732cb77f59f03320e09f3386
  Author: Johannes Weiner 
  Date:   Mon May 13 17:21:50 2019 -0700
mm: fix false-positive OVERCOMMIT_GUESS failures

LTP tries to allocate "mem_total + swap_total":
  alloc_and_check(sum_total, EXPECT_FAIL);
which now presumably falls short of triggering the failure.

> 
> Best regards
> Naresh Kamboju
> 


Re: [v2 PATCH] mm: mmu_gather: remove __tlb_reset_range() for force flush

2019-05-14 Thread Jan Stancek


- Original Message -
> 
> 
> On May 13, 2019 4:01 PM, Yang Shi  wrote:
> 
> 
> On 5/13/19 9:38 AM, Will Deacon wrote:
> > On Fri, May 10, 2019 at 07:26:54AM +0800, Yang Shi wrote:
> >> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> >> index 99740e1..469492d 100644
> >> --- a/mm/mmu_gather.c
> >> +++ b/mm/mmu_gather.c
> >> @@ -245,14 +245,39 @@ void tlb_finish_mmu(struct mmu_gather *tlb,
> >>   {
> >>   /*
> >>* If there are parallel threads are doing PTE changes on same range
> >> - * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB
> >> - * flush by batching, a thread has stable TLB entry can fail to flush
> >> - * the TLB by observing pte_none|!pte_dirty, for example so flush TLB
> >> - * forcefully if we detect parallel PTE batching threads.
> >> + * under non-exclusive lock (e.g., mmap_sem read-side) but defer TLB
> >> + * flush by batching, one thread may end up seeing inconsistent PTEs
> >> + * and result in having stale TLB entries.  So flush TLB forcefully
> >> + * if we detect parallel PTE batching threads.
> >> + *
> >> + * However, some syscalls, e.g. munmap(), may free page tables, this
> >> + * needs force flush everything in the given range. Otherwise this
> >> + * may result in having stale TLB entries for some architectures,
> >> + * e.g. aarch64, that could specify flush what level TLB.
> >>*/
> >> -if (mm_tlb_flush_nested(tlb->mm)) {
> >> -__tlb_reset_range(tlb);
> >> -__tlb_adjust_range(tlb, start, end - start);
> >> +if (mm_tlb_flush_nested(tlb->mm) && !tlb->fullmm) {
> >> +/*
> >> + * Since we can't tell what we actually should have
> >> + * flushed, flush everything in the given range.
> >> + */
> >> +tlb->freed_tables = 1;
> >> +tlb->cleared_ptes = 1;
> >> +tlb->cleared_pmds = 1;
> >> +tlb->cleared_puds = 1;
> >> +tlb->cleared_p4ds = 1;
> >> +
> >> +/*
> >> + * Some architectures, e.g. ARM, that have range invalidation
> >> + * and care about VM_EXEC for I-Cache invalidation, need
> >> force
> >> + * vma_exec set.
> >> + */
> >> +tlb->vma_exec = 1;
> >> +
> >> +/* Force vma_huge clear to guarantee safer flush */
> >> +tlb->vma_huge = 0;
> >> +
> >> +tlb->start = start;
> >> +tlb->end = end;
> >>   }
> > Whilst I think this is correct, it would be interesting to see whether
> > or not it's actually faster than just nuking the whole mm, as I mentioned
> > before.
> >
> > At least in terms of getting a short-term fix, I'd prefer the diff below
> > if it's not measurably worse.
> 
> I did a quick test with ebizzy (96 threads with 5 iterations) on my x86
> VM, it shows slightly slowdown on records/s but much more sys time spent
> with fullmm flush, the below is the data.
> 
>  nofullmm fullmm
> ops (records/s)  225606  225119
> sys (s)0.691.14
> 
> It looks the slight reduction of records/s is caused by the increase of
> sys time.
> 
> >
> > Will
> >
> > --->8
> >
> > diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> > index 99740e1dd273..cc251422d307 100644
> > --- a/mm/mmu_gather.c
> > +++ b/mm/mmu_gather.c
> > @@ -251,8 +251,9 @@ void tlb_finish_mmu(struct mmu_gather *tlb,
> > * forcefully if we detect parallel PTE batching threads.
> > */
> >if (mm_tlb_flush_nested(tlb->mm)) {
> > + tlb->fullmm = 1;
> >__tlb_reset_range(tlb);
> > - __tlb_adjust_range(tlb, start, end - start);
> > + tlb->freed_tables = 1;
> >}
> >
> >tlb_flush_mmu(tlb);
> 
> 
> I think that this should have set need_flush_all and not fullmm.
> 

Wouldn't that skip the flush?

If fullmm == 0, then __tlb_reset_range() sets tlb->end = 0.
  tlb_flush_mmu
tlb_flush_mmu_tlbonly
  if (!tlb->end)
 return

Replacing fullmm with need_flush_all, brings the problem back / reproducer 
hangs.


Re: [PATCH] mm: mmu_gather: remove __tlb_reset_range() for force flush

2019-05-09 Thread Jan Stancek


- Original Message -
> 
> 
> On 5/9/19 2:06 PM, Jan Stancek wrote:
> > - Original Message -
> >>
> >> On 5/9/19 11:24 AM, Peter Zijlstra wrote:
> >>> On Thu, May 09, 2019 at 05:36:29PM +, Nadav Amit wrote:
> >>>>> On May 9, 2019, at 3:38 AM, Peter Zijlstra 
> >>>>> wrote:
> >>>>> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> >>>>> index 99740e1dd273..fe768f8d612e 100644
> >>>>> --- a/mm/mmu_gather.c
> >>>>> +++ b/mm/mmu_gather.c
> >>>>> @@ -244,15 +244,20 @@ void tlb_finish_mmu(struct mmu_gather *tlb,
> >>>>> unsigned long start, unsigned long end)
> >>>>> {
> >>>>> /*
> >>>>> -* If there are parallel threads are doing PTE changes on same 
> >>>>> range
> >>>>> -* under non-exclusive lock(e.g., mmap_sem read-side) but defer 
> >>>>> TLB
> >>>>> -* flush by batching, a thread has stable TLB entry can fail to 
> >>>>> flush
> >>>>> -* the TLB by observing pte_none|!pte_dirty, for example so 
> >>>>> flush TLB
> >>>>> -* forcefully if we detect parallel PTE batching threads.
> >>>>> +* Sensible comment goes here..
> >>>>>  */
> >>>>> -   if (mm_tlb_flush_nested(tlb->mm)) {
> >>>>> -   __tlb_reset_range(tlb);
> >>>>> -   __tlb_adjust_range(tlb, start, end - start);
> >>>>> +   if (mm_tlb_flush_nested(tlb->mm) && !tlb->full_mm) {
> >>>>> +   /*
> >>>>> +* Since we're can't tell what we actually should have
> >>>>> +* flushed flush everything in the given range.
> >>>>> +*/
> >>>>> +   tlb->start = start;
> >>>>> +   tlb->end = end;
> >>>>> +   tlb->freed_tables = 1;
> >>>>> +   tlb->cleared_ptes = 1;
> >>>>> +   tlb->cleared_pmds = 1;
> >>>>> +   tlb->cleared_puds = 1;
> >>>>> +   tlb->cleared_p4ds = 1;
> >>>>> }
> >>>>>
> >>>>> tlb_flush_mmu(tlb);
> >>>> As a simple optimization, I think it is possible to hold multiple
> >>>> nesting
> >>>> counters in the mm, similar to tlb_flush_pending, for freed_tables,
> >>>> cleared_ptes, etc.
> >>>>
> >>>> The first time you set tlb->freed_tables, you also atomically increase
> >>>> mm->tlb_flush_freed_tables. Then, in tlb_flush_mmu(), you just use
> >>>> mm->tlb_flush_freed_tables instead of tlb->freed_tables.
> >>> That sounds fraught with races and expensive; I would much prefer to not
> >>> go there for this arguably rare case.
> >>>
> >>> Consider such fun cases as where CPU-0 sees and clears a PTE, CPU-1
> >>> races and doesn't see that PTE. Therefore CPU-0 sets and counts
> >>> cleared_ptes. Then if CPU-1 flushes while CPU-0 is still in mmu_gather,
> >>> it will see cleared_ptes count increased and flush that granularity,
> >>> OTOH if CPU-1 flushes after CPU-0 completes, it will not and potentiall
> >>> miss an invalidate it should have had.
> >>>
> >>> This whole concurrent mmu_gather stuff is horrible.
> >>>
> >>> /me ponders more
> >>>
> >>> So I think the fundamental race here is this:
> >>>
> >>>   CPU-0   CPU-1
> >>>
> >>>   tlb_gather_mmu(.start=1,tlb_gather_mmu(.start=2,
> >>>  .end=3);.end=4);
> >>>
> >>>   ptep_get_and_clear_full(2)
> >>>   tlb_remove_tlb_entry(2);
> >>>   __tlb_remove_page();
> >>>   if (pte_present(2)) // nope
> >>>
> >>>   tlb_finish_mmu();
> >>>
> >>>   // continue without TLBI(2)
> >>>   // whoopsie
> >>>
> >>>   tlb_finish_mmu();
> >&g

Re: [PATCH] mm: mmu_gather: remove __tlb_reset_range() for force flush

2019-05-09 Thread Jan Stancek


- Original Message -
> 
> 
> On 5/9/19 11:24 AM, Peter Zijlstra wrote:
> > On Thu, May 09, 2019 at 05:36:29PM +, Nadav Amit wrote:
> >>> On May 9, 2019, at 3:38 AM, Peter Zijlstra  wrote:
> >>> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> >>> index 99740e1dd273..fe768f8d612e 100644
> >>> --- a/mm/mmu_gather.c
> >>> +++ b/mm/mmu_gather.c
> >>> @@ -244,15 +244,20 @@ void tlb_finish_mmu(struct mmu_gather *tlb,
> >>>   unsigned long start, unsigned long end)
> >>> {
> >>>   /*
> >>> -  * If there are parallel threads are doing PTE changes on same range
> >>> -  * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB
> >>> -  * flush by batching, a thread has stable TLB entry can fail to flush
> >>> -  * the TLB by observing pte_none|!pte_dirty, for example so flush TLB
> >>> -  * forcefully if we detect parallel PTE batching threads.
> >>> +  * Sensible comment goes here..
> >>>*/
> >>> - if (mm_tlb_flush_nested(tlb->mm)) {
> >>> - __tlb_reset_range(tlb);
> >>> - __tlb_adjust_range(tlb, start, end - start);
> >>> + if (mm_tlb_flush_nested(tlb->mm) && !tlb->full_mm) {
> >>> + /*
> >>> +  * Since we're can't tell what we actually should have
> >>> +  * flushed flush everything in the given range.
> >>> +  */
> >>> + tlb->start = start;
> >>> + tlb->end = end;
> >>> + tlb->freed_tables = 1;
> >>> + tlb->cleared_ptes = 1;
> >>> + tlb->cleared_pmds = 1;
> >>> + tlb->cleared_puds = 1;
> >>> + tlb->cleared_p4ds = 1;
> >>>   }
> >>>
> >>>   tlb_flush_mmu(tlb);
> >> As a simple optimization, I think it is possible to hold multiple nesting
> >> counters in the mm, similar to tlb_flush_pending, for freed_tables,
> >> cleared_ptes, etc.
> >>
> >> The first time you set tlb->freed_tables, you also atomically increase
> >> mm->tlb_flush_freed_tables. Then, in tlb_flush_mmu(), you just use
> >> mm->tlb_flush_freed_tables instead of tlb->freed_tables.
> > That sounds fraught with races and expensive; I would much prefer to not
> > go there for this arguably rare case.
> >
> > Consider such fun cases as where CPU-0 sees and clears a PTE, CPU-1
> > races and doesn't see that PTE. Therefore CPU-0 sets and counts
> > cleared_ptes. Then if CPU-1 flushes while CPU-0 is still in mmu_gather,
> > it will see cleared_ptes count increased and flush that granularity,
> > OTOH if CPU-1 flushes after CPU-0 completes, it will not and potentiall
> > miss an invalidate it should have had.
> >
> > This whole concurrent mmu_gather stuff is horrible.
> >
> >/me ponders more
> >
> > So I think the fundamental race here is this:
> >
> > CPU-0   CPU-1
> >
> > tlb_gather_mmu(.start=1,tlb_gather_mmu(.start=2,
> >.end=3);.end=4);
> >
> > ptep_get_and_clear_full(2)
> > tlb_remove_tlb_entry(2);
> > __tlb_remove_page();
> > if (pte_present(2)) // nope
> >
> > tlb_finish_mmu();
> >
> > // continue without TLBI(2)
> > // whoopsie
> >
> > tlb_finish_mmu();
> >   tlb_flush()   ->  TLBI(2)
> 
> I'm not quite sure if this is the case Jan really met. But, according to
> his test, once correct tlb->freed_tables and tlb->cleared_* are set, his
> test works well.

My theory was the following sequence:

t1: map_write_unmap() t2: dummy()

  map_address = mmap()
  map_address[i] = 'b'
  munmap(map_address)
  downgrade_write(&mm->mmap_sem);
  unmap_region()
  tlb_gather_mmu()
inc_tlb_flush_pending(tlb->mm);
  free_pgtables()
tlb->freed_tables = 1
tlb->cleared_pmds = 1

pthread_exit()
madvise(thread_stack, 8M, MADV_DONTNEED)
  zap_page_range()
tlb_gather_mmu()
  inc_tlb_flush_pending(tlb->mm);

  tlb_finish_mmu()
if (mm_tlb_flush_nested(tlb->mm))
  __tlb_reset_range()
tlb->freed_tables = 0
tlb->cleared_pmds = 0
__flush_tlb_range(last_level = 0)
  ...
  map_address = mmap()
map_address[i] = 'b'
  
  # PTE appeared valid to me,
  # so I suspected stale TLB entry at higher level as result of 
"freed_tables = 0"


I'm happy to apply/run any debug patches to get more data that would help.

> 
> >
> >
> > And we can fix that by having tlb_finish_mmu() sync up. Never let a
> > concurrent tlb_finish_mmu() complete until all concurrenct mmu_gathers
> > have completed.
> 
> Not sure if this will scale well.
> 
> >
> > This should not be too hard to make happen.
> 
> 


Re: [PATCH] mm: mmu_gather: remove __tlb_reset_range() for force flush

2019-05-09 Thread Jan Stancek


> > I don't think we can elide the call __tlb_reset_range() entirely, since I
> > think we do want to clear the freed_pXX bits to ensure that we walk the
> > range with the smallest mapping granule that we have. Otherwise couldn't we
> > have a problem if we hit a PMD that had been cleared, but the TLB
> > invalidation for the PTEs that used to be linked below it was still
> > pending?
> 
> That's tlb->cleared_p*, and yes agreed. That is, right until some
> architecture has level dependent TLBI instructions, at which point we'll
> need to have them all set instead of cleared.
> 
> > Perhaps we should just set fullmm if we see that here's a concurrent
> > unmapper rather than do a worst-case range invalidation. Do you have a
> > feeling
> > for often the mm_tlb_flush_nested() triggers in practice?
> 
> Quite a bit for certain workloads I imagine, that was the whole point of
> doing it.
> 
> Anyway; am I correct in understanding that the actual problem is that
> we've cleared freed_tables and the ARM64 tlb_flush() will then not
> invalidate the cache and badness happens?

That is my understanding: only the last level is flushed, which is not enough.

> 
> Because so far nobody has actually provided a coherent description of
> the actual problem we're trying to solve. But I'm thinking something
> like the below ought to do.

I applied it (and fixed small typo: s/tlb->full_mm/tlb->fullmm/).
It fixes the problem for me.

> 
> 
> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> index 99740e1dd273..fe768f8d612e 100644
> --- a/mm/mmu_gather.c
> +++ b/mm/mmu_gather.c
> @@ -244,15 +244,20 @@ void tlb_finish_mmu(struct mmu_gather *tlb,
>   unsigned long start, unsigned long end)
>  {
>   /*
> -  * If there are parallel threads are doing PTE changes on same range
> -  * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB
> -  * flush by batching, a thread has stable TLB entry can fail to flush
> -  * the TLB by observing pte_none|!pte_dirty, for example so flush TLB
> -  * forcefully if we detect parallel PTE batching threads.
> +  * Sensible comment goes here..
>*/
> - if (mm_tlb_flush_nested(tlb->mm)) {
> - __tlb_reset_range(tlb);
> - __tlb_adjust_range(tlb, start, end - start);
> + if (mm_tlb_flush_nested(tlb->mm) && !tlb->full_mm) {
> + /*
> +  * Since we're can't tell what we actually should have
> +  * flushed flush everything in the given range.
> +  */
> + tlb->start = start;
> + tlb->end = end;
> + tlb->freed_tables = 1;
> + tlb->cleared_ptes = 1;
> + tlb->cleared_pmds = 1;
> + tlb->cleared_puds = 1;
> + tlb->cleared_p4ds = 1;
>   }
>  
>   tlb_flush_mmu(tlb);
> 


[PATCH v3] mm/memory.c: do_fault: avoid usage of stale vm_area_struct

2019-03-02 Thread Jan Stancek
LTP testcase mtest06 [1] can trigger a crash on s390x running 5.0.0-rc8.
This is a stress test, where one thread mmaps/writes/munmaps memory area
and other thread is trying to read from it:

  CPU: 0 PID: 2611 Comm: mmap1 Not tainted 5.0.0-rc8+ #51
  Hardware name: IBM 2964 N63 400 (z/VM 6.4.0)
  Krnl PSW : 0404e0018000 001ac8d8 (__lock_acquire+0x7/0x7a8)
  Call Trace:
  ([<>]   (null))
   [<001adae4>] lock_acquire+0xec/0x258
   [<0080d1ac>] _raw_spin_lock_bh+0x5c/0x98
   [<0012a780>] page_table_free+0x48/0x1a8
   [<002f6e54>] do_fault+0xdc/0x670
   [<002fadae>] __handle_mm_fault+0x416/0x5f0
   [<002fb138>] handle_mm_fault+0x1b0/0x320
   [<001248cc>] do_dat_exception+0x19c/0x2c8
   [<0080e5ee>] pgm_check_handler+0x19e/0x200

page_table_free() is called with NULL mm parameter, but because
"0" is a valid address on s390 (see S390_lowcore), it keeps
going until it eventually crashes in lockdep's lock_acquire.
This crash is reproducible at least since 4.14.

Problem is that "vmf->vma" used in do_fault() can become stale.
Because mmap_sem may be released, other threads can come in,
call munmap() and cause "vma" be returned to kmem cache, and
get zeroed/re-initialized and re-used:

handle_mm_fault   |
  __handle_mm_fault   |
do_fault  |
  vma = vmf->vma  |
  do_read_fault   |
__do_fault|
  vma->vm_ops->fault(vmf);|
mmap_sem is released  |
  |
  | do_munmap()
  |   remove_vma_list()
  | remove_vma()
  |   vm_area_free()
  | # vma is released
  | ...
  | # same vma is allocated
  | # from kmem cache
  | do_mmap()
  |   vm_area_alloc()
  | memset(vma, 0, ...)
  |
  pte_free(vma->vm_mm, ...);  |
page_table_free   |
  spin_lock_bh(&mm->context.lock);|
   |

Cache mm_struct to avoid using potentially stale "vma".

[1] 
https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c

Signed-off-by: Jan Stancek 
Reviewed-by: Andrea Arcangeli 
---
 mm/memory.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mm/memory.c b/mm/memory.c
index e11ca9dd823f..e8d69ade5acc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3517,10 +3517,13 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
  * but allow concurrent faults).
  * The mmap_sem may have been released depending on flags and our
  * return value.  See filemap_fault() and __lock_page_or_retry().
+ * If mmap_sem is released, vma may become invalid (for example
+ * by other thread calling munmap()).
  */
 static vm_fault_t do_fault(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
+   struct mm_struct *vm_mm = vma->vm_mm;
vm_fault_t ret;
 
/*
@@ -3561,7 +3564,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
 
/* preallocated pagetable is unused: free it */
if (vmf->prealloc_pte) {
-   pte_free(vma->vm_mm, vmf->prealloc_pte);
+   pte_free(vm_mm, vmf->prealloc_pte);
vmf->prealloc_pte = NULL;
}
return ret;
-- 
1.8.3.1



Re: [PATCH v2] mm/memory.c: do_fault: avoid usage of stale vm_area_struct

2019-03-02 Thread Jan Stancek



- Original Message -
> Hello Jan,
> 
> On Sat, Mar 02, 2019 at 07:19:39PM +0100, Jan Stancek wrote:
> > +   struct mm_struct *vm_mm = READ_ONCE(vma->vm_mm);
> 
> The vma->vm_mm cannot change under gcc there, so no need of
> READ_ONCE. The release of mmap_sem has release semantics so the
> vma->vm_mm access cannot be reordered after up_read(mmap_sem) either.
> 
> Other than the above detail:
> 
> Reviewed-by: Andrea Arcangeli 

Thank you for review, I dropped READ_ONCE and sent v3 with your
Reviewed-by included. I also successfully re-ran tests over-night.

> Would this not need a corresponding WRITE_ONCE() in vma_init() ?

There are at least 2 context switches in between, so I think it wouldn't matter.
My concern was gcc optimizing out vm_mm, and vma->vm_mm access happening only
after do_read_fault().



[PATCH v2] mm/memory.c: do_fault: avoid usage of stale vm_area_struct

2019-03-02 Thread Jan Stancek
LTP testcase mtest06 [1] can trigger a crash on s390x running 5.0.0-rc8.
This is a stress test, where one thread mmaps/writes/munmaps memory area
and other thread is trying to read from it:

  CPU: 0 PID: 2611 Comm: mmap1 Not tainted 5.0.0-rc8+ #51
  Hardware name: IBM 2964 N63 400 (z/VM 6.4.0)
  Krnl PSW : 0404e0018000 001ac8d8 (__lock_acquire+0x7/0x7a8)
  Call Trace:
  ([<>]   (null))
   [<001adae4>] lock_acquire+0xec/0x258
   [<0080d1ac>] _raw_spin_lock_bh+0x5c/0x98
   [<0012a780>] page_table_free+0x48/0x1a8
   [<002f6e54>] do_fault+0xdc/0x670
   [<002fadae>] __handle_mm_fault+0x416/0x5f0
   [<002fb138>] handle_mm_fault+0x1b0/0x320
   [<001248cc>] do_dat_exception+0x19c/0x2c8
   [<0080e5ee>] pgm_check_handler+0x19e/0x200

page_table_free() is called with NULL mm parameter, but because
"0" is a valid address on s390 (see S390_lowcore), it keeps
going until it eventually crashes in lockdep's lock_acquire.
This crash is reproducible at least since 4.14.

Problem is that "vmf->vma" used in do_fault() can become stale.
Because mmap_sem may be released, other threads can come in,
call munmap() and cause "vma" be returned to kmem cache, and
get zeroed/re-initialized and re-used:

handle_mm_fault   |
  __handle_mm_fault   |
do_fault  |
  vma = vmf->vma  |
  do_read_fault   |
__do_fault|
  vma->vm_ops->fault(vmf);|
mmap_sem is released  |
  |
  | do_munmap()
  |   remove_vma_list()
  | remove_vma()
  |   vm_area_free()
  | # vma is released
  | ...
  | # same vma is allocated
  | # from kmem cache
  | do_mmap()
  |   vm_area_alloc()
  | memset(vma, 0, ...)
  |
  pte_free(vma->vm_mm, ...);  |
page_table_free   |
  spin_lock_bh(&mm->context.lock);|
   |

Cache mm_struct to avoid using potentially stale "vma".

[1] 
https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c

Signed-off-by: Jan Stancek 
---
 mm/memory.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mm/memory.c b/mm/memory.c
index e11ca9dd823f..6c1afc1ece50 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3517,10 +3517,13 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
  * but allow concurrent faults).
  * The mmap_sem may have been released depending on flags and our
  * return value.  See filemap_fault() and __lock_page_or_retry().
+ * If mmap_sem is released, vma may become invalid (for example
+ * by other thread calling munmap()).
  */
 static vm_fault_t do_fault(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
+   struct mm_struct *vm_mm = READ_ONCE(vma->vm_mm);
vm_fault_t ret;
 
/*
@@ -3561,7 +3564,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
 
/* preallocated pagetable is unused: free it */
if (vmf->prealloc_pte) {
-   pte_free(vma->vm_mm, vmf->prealloc_pte);
+   pte_free(vm_mm, vmf->prealloc_pte);
vmf->prealloc_pte = NULL;
}
return ret;
-- 
1.8.3.1



Re: [PATCH] mm/memory.c: do_fault: avoid usage of stale vm_area_struct

2019-03-02 Thread Jan Stancek



- Original Message -
> On Sat, Mar 02, 2019 at 04:11:26PM +0100, Jan Stancek wrote:
> > Problem is that "vmf->vma" used in do_fault() can become stale.
> > Because mmap_sem may be released, other threads can come in,
> > call munmap() and cause "vma" be returned to kmem cache, and
> > get zeroed/re-initialized and re-used:
> 
> > This patch pins mm_struct and stores its value, to avoid using
> > potentially stale "vma" when calling pte_free().
> 
> OK, we need to cache the mm_struct, but why do we need the extra atomic op?
> There's surely no way the mm can be freed while the thread is in the middle
> of handling a fault.

You're right, I was needlessly paranoid.

> 
> ie I would drop these lines:

I'll send v2.

Thanks,
Jan

> 
> > +   mmgrab(vm_mm);
> > +
> ...
> > +
> > +   mmdrop(vm_mm);
> > +
> 


[PATCH] mm/memory.c: do_fault: avoid usage of stale vm_area_struct

2019-03-02 Thread Jan Stancek
LTP testcase mtest06 [1] can trigger a crash on s390x running 5.0.0-rc8.
This is a stress test, where one thread mmaps/writes/munmaps memory area
and other thread is trying to read from it:

  CPU: 0 PID: 2611 Comm: mmap1 Not tainted 5.0.0-rc8+ #51
  Hardware name: IBM 2964 N63 400 (z/VM 6.4.0)
  Krnl PSW : 0404e0018000 001ac8d8 (__lock_acquire+0x7/0x7a8)
  Call Trace:
  ([<>]   (null))
   [<001adae4>] lock_acquire+0xec/0x258
   [<0080d1ac>] _raw_spin_lock_bh+0x5c/0x98
   [<0012a780>] page_table_free+0x48/0x1a8
   [<002f6e54>] do_fault+0xdc/0x670
   [<002fadae>] __handle_mm_fault+0x416/0x5f0
   [<002fb138>] handle_mm_fault+0x1b0/0x320
   [<001248cc>] do_dat_exception+0x19c/0x2c8
   [<0080e5ee>] pgm_check_handler+0x19e/0x200

page_table_free() is called with NULL mm parameter, but because
"0" is a valid address on s390 (see S390_lowcore), it keeps
going until it eventually crashes in lockdep's lock_acquire.
This crash is reproducible at least since 4.14.

Problem is that "vmf->vma" used in do_fault() can become stale.
Because mmap_sem may be released, other threads can come in,
call munmap() and cause "vma" to be returned to kmem cache, and
get zeroed/re-initialized and re-used:

handle_mm_fault   |
  __handle_mm_fault   |
do_fault  |
  vma = vmf->vma  |
  do_read_fault   |
__do_fault|
  vma->vm_ops->fault(vmf);|
mmap_sem is released  |
  |
  | do_munmap()
  |   remove_vma_list()
  | remove_vma()
  |   vm_area_free()
  | # vma is released
  | ...
  | # same vma is allocated
  | # from kmem cache
  | do_mmap()
  |   vm_area_alloc()
  | memset(vma, 0, ...)
  |
  pte_free(vma->vm_mm, ...);  |
page_table_free   |
  spin_lock_bh(&mm->context.lock);|
   |

This patch pins mm_struct and stores its value, to avoid using
potentially stale "vma" when calling pte_free().

[1] 
https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c

Signed-off-by: Jan Stancek 
---
 mm/memory.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/mm/memory.c b/mm/memory.c
index e11ca9dd823f..1287ee9acbdc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3517,12 +3517,17 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
  * but allow concurrent faults).
  * The mmap_sem may have been released depending on flags and our
  * return value.  See filemap_fault() and __lock_page_or_retry().
+ * If mmap_sem is released, vma may become invalid (for example
+ * by other thread calling munmap()).
  */
 static vm_fault_t do_fault(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
+   struct mm_struct *vm_mm = READ_ONCE(vma->vm_mm);
vm_fault_t ret;
 
+   mmgrab(vm_mm);
+
/*
 * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
 */
@@ -3561,9 +3566,12 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
 
/* preallocated pagetable is unused: free it */
if (vmf->prealloc_pte) {
-   pte_free(vma->vm_mm, vmf->prealloc_pte);
+   pte_free(vm_mm, vmf->prealloc_pte);
vmf->prealloc_pte = NULL;
}
+
+   mmdrop(vm_mm);
+
return ret;
 }
 
-- 
1.8.3.1



Re: [PATCH v2] mm: page_mapped: don't assume compound page is huge or THP

2019-02-04 Thread Jan Stancek



- Original Message -
> On Fri, Nov 30, 2018 at 1:07 PM Jan Stancek  wrote:
> >
> > LTP proc01 testcase has been observed to rarely trigger crashes
> > on arm64:
> > page_mapped+0x78/0xb4
> > stable_page_flags+0x27c/0x338
> > kpageflags_read+0xfc/0x164
> > proc_reg_read+0x7c/0xb8
> > __vfs_read+0x58/0x178
> > vfs_read+0x90/0x14c
> > SyS_read+0x60/0xc0
> >
> > Issue is that page_mapped() assumes that if compound page is not
> > huge, then it must be THP. But if this is 'normal' compound page
> > (COMPOUND_PAGE_DTOR), then following loop can keep running
> > (for HPAGE_PMD_NR iterations) until it tries to read from memory
> > that isn't mapped and triggers a panic:
> > for (i = 0; i < hpage_nr_pages(page); i++) {
> > if (atomic_read(&page[i]._mapcount) >= 0)
> > return true;
> > }
> >
> > I could replicate this on x86 (v4.20-rc4-98-g60b548237fed) only
> > with a custom kernel module [1] which:
> > - allocates compound page (PAGEC) of order 1
> > - allocates 2 normal pages (COPY), which are initialized to 0xff
> >   (to satisfy _mapcount >= 0)
> > - 2 PAGEC page structs are copied to address of first COPY page
> > - second page of COPY is marked as not present
> > - call to page_mapped(COPY) now triggers fault on access to 2nd
> >   COPY page at offset 0x30 (_mapcount)
> >
> > [1]
> > https://github.com/jstancek/reproducers/blob/master/kernel/page_mapped_crash/repro.c
> >
> > Fix the loop to iterate for "1 << compound_order" pages.
> >
> > Debugged-by: Laszlo Ersek 
> > Suggested-by: "Kirill A. Shutemov" 
> > Signed-off-by: Jan Stancek 
> > ---
> >  mm/util.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > Changes in v2:
> > - change the loop instead so we check also mapcount of subpages
> >
> > diff --git a/mm/util.c b/mm/util.c
> > index 8bf08b5b5760..5c9c7359ee8a 100644
> > --- a/mm/util.c
> > +++ b/mm/util.c
> > @@ -478,7 +478,7 @@ bool page_mapped(struct page *page)
> > return true;
> > if (PageHuge(page))
> > return false;
> > -   for (i = 0; i < hpage_nr_pages(page); i++) {
> > +   for (i = 0; i < (1 << compound_order(page)); i++) {
> > if (atomic_read(&page[i]._mapcount) >= 0)
> > return true;
> > }
> > --
> > 1.8.3.1
> 
> Hi all
> 
> This patch landed in the 4.9-stable tree starting from 4.9.151 and it
> broke our MIPS1004kc system with CONFIG_HIGHMEM=y.

Hi,

are you using THP (CONFIG_TRANSPARENT_HUGEPAGE)?

The changed line should affect only THP and normal compound pages,
so a test with THP disabled might be interesting. 

> 
> The breakage consists of random processes dying with SIGILL or SIGSEGV
> when we stress test the system with high memory pressure and explicit
> memory compaction requested through /proc/sys/vm/compact_memory.
> Reverting this patch fixes the crashes.
> 
> We can put some effort on debugging if there are no obvious
> explanations for this. Keep in mind that this is 32-bit system with
> HIGHMEM.

Nothing obvious that I can see. I've been trying to reproduce on
32-bit x86 Fedora with no luck so far.

Regards,
Jan


[PATCH v2] mm: page_mapped: don't assume compound page is huge or THP

2018-11-30 Thread Jan Stancek
LTP proc01 testcase has been observed to rarely trigger crashes
on arm64:
page_mapped+0x78/0xb4
stable_page_flags+0x27c/0x338
kpageflags_read+0xfc/0x164
proc_reg_read+0x7c/0xb8
__vfs_read+0x58/0x178
vfs_read+0x90/0x14c
SyS_read+0x60/0xc0

Issue is that page_mapped() assumes that if compound page is not
huge, then it must be THP. But if this is 'normal' compound page
(COMPOUND_PAGE_DTOR), then following loop can keep running
(for HPAGE_PMD_NR iterations) until it tries to read from memory
that isn't mapped and triggers a panic:
for (i = 0; i < hpage_nr_pages(page); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
}

I could replicate this on x86 (v4.20-rc4-98-g60b548237fed) only
with a custom kernel module [1] which:
- allocates compound page (PAGEC) of order 1
- allocates 2 normal pages (COPY), which are initialized to 0xff
  (to satisfy _mapcount >= 0)
- 2 PAGEC page structs are copied to address of first COPY page
- second page of COPY is marked as not present
- call to page_mapped(COPY) now triggers fault on access to 2nd
  COPY page at offset 0x30 (_mapcount)

[1] 
https://github.com/jstancek/reproducers/blob/master/kernel/page_mapped_crash/repro.c

Fix the loop to iterate for "1 << compound_order" pages.

Debugged-by: Laszlo Ersek 
Suggested-by: "Kirill A. Shutemov" 
Signed-off-by: Jan Stancek 
---
 mm/util.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Changes in v2:
- change the loop instead so we check also mapcount of subpages

diff --git a/mm/util.c b/mm/util.c
index 8bf08b5b5760..5c9c7359ee8a 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -478,7 +478,7 @@ bool page_mapped(struct page *page)
return true;
if (PageHuge(page))
return false;
-   for (i = 0; i < hpage_nr_pages(page); i++) {
+   for (i = 0; i < (1 << compound_order(page)); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
}
-- 
1.8.3.1



[PATCH] mm: page_mapped: don't assume compound page is huge or THP

2018-11-29 Thread Jan Stancek
LTP proc01 testcase has been observed to rarely trigger crashes
on arm64:
page_mapped+0x78/0xb4
stable_page_flags+0x27c/0x338
kpageflags_read+0xfc/0x164
proc_reg_read+0x7c/0xb8
__vfs_read+0x58/0x178
vfs_read+0x90/0x14c
SyS_read+0x60/0xc0

Issue is that page_mapped() assumes that if compound page is not
huge, then it must be THP. But if this is 'normal' compound page
(COMPOUND_PAGE_DTOR), then following loop can keep running until
it tries to read from memory that isn't mapped and triggers a panic:
for (i = 0; i < hpage_nr_pages(page); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
}

I could replicate this on x86 (v4.20-rc4-98-g60b548237fed) only
with a custom kernel module [1] which:
- allocates compound page (PAGEC) of order 1
- allocates 2 normal pages (COPY), which are initialized to 0xff
  (to satisfy _mapcount >= 0)
- 2 PAGEC page structs are copied to address of first COPY page
- second page of COPY is marked as not present
- call to page_mapped(COPY) now triggers fault on access to 2nd COPY
  page at offset 0x30 (_mapcount)

[1] 
https://github.com/jstancek/reproducers/blob/master/kernel/page_mapped_crash/repro.c

This patch modifies page_mapped() to check for 'normal'
compound pages (COMPOUND_PAGE_DTOR).

Debugged-by: Laszlo Ersek 
Signed-off-by: Jan Stancek 
---
 include/linux/mm.h | 9 +
 mm/util.c  | 2 ++
 2 files changed, 11 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5411de93a363..18b0bb953f92 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -700,6 +700,15 @@ static inline compound_page_dtor 
*get_compound_page_dtor(struct page *page)
return compound_page_dtors[page[1].compound_dtor];
 }
 
+static inline int PageNormalCompound(struct page *page)
+{
+   if (!PageCompound(page))
+   return 0;
+
+   page = compound_head(page);
+   return page[1].compound_dtor == COMPOUND_PAGE_DTOR;
+}
+
 static inline unsigned int compound_order(struct page *page)
 {
if (!PageHead(page))
diff --git a/mm/util.c b/mm/util.c
index 8bf08b5b5760..06c1640cb7b3 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -478,6 +478,8 @@ bool page_mapped(struct page *page)
return true;
if (PageHuge(page))
return false;
+   if (PageNormalCompound(page))
+   return false;
for (i = 0; i < hpage_nr_pages(page); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
-- 
1.8.3.1



Re: [LTP] [PATCH 4.17 00/67] 4.17.7-stable review

2018-07-16 Thread Jan Stancek



- Original Message -
> On 16 July 2018 at 13:04, Greg Kroah-Hartman 
> wrote:
> > This is the start of the stable review cycle for the 4.17.7 release.
> > There are 67 patches in this series, all will be posted as a response
> > to this one.  If anyone has any issues with these being applied, please
> > let me know.
> >
> > Responses should be made by Wed Jul 18 07:34:11 UTC 2018.
> > Anything received after that time might be too late.
> >
> > The whole patch series can be found in one patch at:
> > 
> > https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.17.7-rc1.gz
> > or in the git tree and branch at:
> > 
> > git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
> > linux-4.17.y
> > and the diffstat can be found below.
> >
> > thanks,
> >
> > greg k-h
> 
> 
> Results from Linaro’s test farm.
> Regressions detected.
> 
> LTP syscalls failed test cases on all devices arm64, arm32 and x86_64,
>  - creat08
>  - open10

This is consequence of:
  0fa3ecd87848 ("Fix up non-directory creation in SGID directories")

So likely cause of action here will be modifying both tests to not
expect S_ISGID in scenario described in commit above.

[adding Dave Chinner]

@Dave: Does XFS need separate patch for problem described in commit
0fa3ecd87848? XFS doesn't appear to use inode_init_owner() function,
both tests currently PASS the check for S_ISGID on file created by
non-group member as of v4.18-rc5.

Regards,
Jan

> 
> Reported this bug internally on upstream Linux mainline week back.
> Now this bug happening on 4.17, 4.14, 4.9 and 4.4
> 
> creat08 and open10 failed with this error,
> TFAIL  : testdir.B.3132/setgid: Incorrect modes, setgid bit should be set
> 
> Test case description:
> /*
>  * NAME
>  * creat08.c - Verifies that the group ID and setgid bit are
>  *set correctly when a new file is created.
>  *(ported from SPIE, section2/iosuite/creat5.c,
>  * by Airong Zhang )
>  * CALLS
>  * creat
>  *
>  * ALGORITHM
>  * Create two directories, one with the group ID of this process
>  * and the setgid bit not set, and the other with a group ID
>  * other than that of this process and with the setgid bit set.
>  * In each directory, create a file with and without the setgid
>  * bit set in the creation modes. Verify that the modes and group
>  * ID are correct on each of the 4 files.
>  * As root, create a file with the setgid bit on in the
>  * directory with the setgid bit.
>  * This tests the SVID3 create group semantics.
>  */
> 
> Block2 testing:
> /*--*/
> /* Block2: Create two files in testdir.B, one with the setgid   */
> /* bit set in the creation modes and the other without. */
> /*Both should inherit the group ID of the parent   */
> /*directory, group2.   */
> /*--*/
> 
> 
> Test results comparison on mainline versions,
> https://qa-reports.linaro.org/lkft/linux-mainline-oe/tests/ltp-syscalls-tests/creat08
> https://qa-reports.linaro.org/lkft/linux-mainline-oe/tests/ltp-syscalls-tests/open10
> 
> Test results comparison on 4.17 versions,
> https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/tests/ltp-syscalls-tests/creat08
> https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/tests/ltp-syscalls-tests/open10
> 
> Bug report link,
> https://bugs.linaro.org/show_bug.cgi?id=3940
> 
> Summary
> 
> 
> kernel: 4.17.7-rc1
> git repo:
> https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
> git branch: linux-4.17.y
> git commit: bc0bd9e05fa1e213c689620eb4cba825c03dcc4a
> git describe: v4.17.6-68-gbc0bd9e05fa1
> Test details:
> https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/build/v4.17.6-68-gbc0bd9e05fa1
> 
> Regressions (compared to build v4.17.6-67-g3b02b1fd1975)
> 
> 
> LTP syscalls failed test cases on all devices,
> 
>  - creat08
>  - open10
> 
> --
> Linaro LKFT
> https://lkft.linaro.org
> 
> --
> Mailing list info: https://lists.linux.it/listinfo/ltp
> 


Re: LTP CVE cve-2017-17053 test failed on x86_64 device

2018-06-20 Thread Jan Stancek



- Original Message -
> LTP CVE cve-2017-17053 test failed on x86_64 device.
> FAIL on linux-next, mainline, and stable-rc-4.17.
> PASS on stable-rc 4.16, 4.14, 4.9 and 4.4 kernel.
> 
> Test FAIL case output,
> tst_test.c:1015: INFO: Timeout per run is 0h 15m 00s
> tst_taint.c:88: BROK: Kernel is already tainted: 512

You seem to be running into some nfs warning early during boot [1][2],
that taints the kernel. cve-2017-17053 test doesn't do much,
it fails in setup because it finds kernel already tainted.

Looks similar to:
  https://www.spinics.net/lists/linux-nfs/msg68064.html

Regards,
Jan

[1]
[   78.886529] [ cut here ]
[   78.891155] DEBUG_LOCKS_WARN_ON(sem->owner != ((struct task_struct *)(1UL << 
0)))
[   78.891161] WARNING: CPU: 0 PID: 33 at 
/srv/oe/build/tmp-rpb-glibc/work-shared/intel-core2-32/kernel-source/kernel/locking/rwsem.c:217
 up_read_non_owner+0x5d/0x70
[   78.913141] Modules linked in: fuse
[   78.916633] CPU: 0 PID: 33 Comm: kworker/0:1 Not tainted 4.18.0-rc1 #1
[   78.923150] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS 2.0b 
07/27/2017
[   78.930623] Workqueue: nfsiod rpc_async_release
[   78.935154] RIP: 0010:up_read_non_owner+0x5d/0x70
[   78.939851] Code: 00 5b 5d c3 e8 54 9f 3a 00 85 c0 74 de 8b 05 ba 40 6a 02 
85 c0 75 d4 48 c7 c6 c8 ce 8f 9c 48 c7 c7 93 8e 8e 9c e8 33 6b fa ff <0f> 0b eb 
bd 0f 1f 44 00 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 
[   78.958717] RSP: 0018:b46b419ebdd0 EFLAGS: 00010282
[   78.963935] RAX:  RBX: 882d1c390130 RCX: 0006
[   78.971059] RDX: 0007 RSI: 0001 RDI: 882d2fc15730
[   78.978184] RBP: b46b419ebdd8 R08:  R09: 
[   78.985318] R10: b46b419ebdd0 R11:  R12: 882d1ad56800
[   78.985319] R13: 882d1d133000 R14:  R15: 882d1f047840
[   78.985319] FS:  () GS:882d2fc0() 
knlGS:
[   78.985320] CS:  0010 DS:  ES:  CR0: 80050033
[   78.985321] CR2: 56385d962ae0 CR3: 21e1e002 CR4: 003606f0
[   78.985321] DR0:  DR1:  DR2: 
[   78.985322] DR3:  DR6: fffe0ff0 DR7: 0400
[   78.985322] Call Trace:
[[0;32m  OK  [[   78.985327]  nfs_async_unlink_release+0x32/0x80
[   78.985329]  rpc_free_task+0x30/0x50
[   78.985330]  rpc_async_release+0x12/0x20
[   78.985332]  process_one_work+0x278/0x670
0m] [   78.985335]  worker_thread+0x4d/0x410
[   78.985337]  kthread+0x10d/0x140
[   78.985338]  ? rescuer_thread+0x3a0/0x3a0
[   78.985339]  ? kthread_create_worker_on_cpu+0x70/0x70
[   78.985341]  ret_from_fork+0x3a/0x50
[   78.985344] irq event stamp: 36545
[   78.985346] hardirqs last  enabled at (36545): [] 
_raw_spin_unlock_irq+0x2c/0x40
[   78.985347] hardirqs last disabled at (36544): [] 
_raw_spin_lock_irq+0x13/0x50
[   78.985348] softirqs last  enabled at (34878): [] 
reg_todo+0x260/0x2f0
[   78.985350] softirqs last disabled at (34876): [] 
reg_todo+0x171/0x2f0
[   79.111787] ---[ end trace bed1f41cfea3a4c5 ]---

[2] https://lkft.validation.linaro.org/scheduler/job/290429#L893

> Summary:
> passed   0
> failed   0
> skipped  0
> warnings 0
> 
> Test comments,
> /* Regression test for CVE-2017-17053, original reproducer can be found
>  * here:
>  *
>  
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ccd5b3235180eef3cfec337df1c8554ab151b5cc
>  *
>  * Be careful! This test may crash your kernel!
>  */
> 
> Full test log:
> https://lkft.validation.linaro.org/scheduler/job/290429#L9581
> https://lkft.validation.linaro.org/scheduler/job/259217#L10308
> 
> History of the test case shows failed on
> Linux next kernel,
> https://qa-reports.linaro.org/lkft/linux-next-oe/tests/ltp-cve-tests/cve-2017-17053
> 
> Linux mainline kernel,
> https://qa-reports.linaro.org/lkft/linux-mainline-oe/tests/ltp-cve-tests \
> /cve-2017-17053
> ^ Please join link
> 
> Linux stable rc 4.17,
> https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/tests/ltp-cve-tests
> \
> /cve-2017-17053
> ^ Please join link
> 
> Test PASS on 4.16 kernel.
> https://qa-reports.linaro.org/lkft/linux-stable-rc-4.16-oe/tests/ltp-cve-tests
> \
> /cve-2017-17053
> ^ Please join link
> 
> Best regards
> Naresh Kamboju
> 


Re: [LTP] [PATCH 4.4 00/24] 4.4.137-stable review

2018-06-14 Thread Jan Stancek


- Original Message -
> On Thu, Jun 14, 2018 at 05:49:52AM -0400, Jan Stancek wrote:
> > 
> > - Original Message -
> > > On Thu, Jun 14, 2018 at 02:24:25PM +0530, Naresh Kamboju wrote:
> > > > On 14 June 2018 at 12:04, Greg Kroah-Hartman
> > > > 
> > > > wrote:
> > > > > On Wed, Jun 13, 2018 at 10:48:50PM -0300, Rafael Tinoco wrote:
> > > > >> On 13 June 2018 at 18:08, Rafael David Tinoco
> > > > >>  wrote:
> > > > >> > On Wed, Jun 13, 2018 at 6:00 PM, Greg Kroah-Hartman
> > > > >> >  wrote:
> > > > >> >> On Wed, Jun 13, 2018 at 05:47:49PM -0300, Rafael Tinoco wrote:
> > > > >> >>> Results from Linaro’s test farm.
> > > > >> >>> Regressions detected.
> > > > >> >>>
> > > > >> >>> NOTE:
> > > > >> >>>
> > > > >> >>> 1) LTP vma03 test (cve-2011-2496) broken on v4.4-137-rc1 because
> > > > >> >>> of:
> > > > >> >>>
> > > > >> >>>  6ea1dc96a03a mmap: relax file size limit for regular files
> > > > >> >>>  bd2f9ce5bacb mmap: introduce sane default mmap limits
> > > > >> >>>
> > > > >> >>>discussion:
> > > > >> >>>
> > > > >> >>>  https://github.com/linux-test-project/ltp/issues/341
> > > > >> >>>
> > > > >> >>>mainline commit (v4.13-rc7):
> > > > >> >>>
> > > > >> >>>  0cc3b0ec23ce Clarify (and fix) MAX_LFS_FILESIZE macros
> > > > >> >>>
> > > > >> >>>should be backported to 4.4.138-rc2 and fixes the issue.
> > > > >> >>
> > > > >> >> Really?  That commit says it fixes c2a9737f45e2 ("vfs,mm: fix a
> > > > >> >> dead
> > > > >> >> loop in truncate_inode_pages_range()") which is not in 4.4.y at
> > > > >> >> all.
> > > > >> >>
> > > > >> >> Did you test this out?
> > > > >> >
> > > > >> > Yes, the LTP contains the tests (last comment is the final test
> > > > >> > for
> > > > >> > arm32, right before Jan tests i686).
> > > > >> >
> > > > >> > Fixing MAX_LFS_FILESIZE fixes the new limit for mmap() brought by
> > > > >> > those 2 commits (file_mmap_size_max()).
> > > > >> > offset tested by the LTP test is 0xfffe000.
> > > > >> > file_mmap_size_max gives: 0x000 as max value, but only
> > > > >> > after
> > > > >> > the mentioned patch.
> > > > >> >
> > > > >> > Original intent for this fix was other though.
> > > > >>
> > > > >> To clarify this a bit further.
> > > > >>
> > > > >> The LTP CVE test is breaking in the first call to mmap(), even
> > > > >> before
> > > > >> trying to remap and test the security issue. That start happening in
> > > > >> this round because of those mmap() changes and the offset used in
> > > > >> the
> > > > >> LTP test. Linus changed limit checks and made them to be related to
> > > > >> MAX_LFS_FILESIZE. Unfortunately, in 4.4 stable, we were missing the
> > > > >> fix for MAX_LFS_FILESIZE (which before commit 0cc3b0ec23ce was less
> > > > >> than the REAL 32 bit limit).
> > > > >>
> > > > >> Commit 0cc3b0ec23ce was made because an user noticed the FS limit
> > > > >> not
> > > > >> being what it should be. In our case, the 4.4 stable kernel, we are
> > > > >> facing this 32 bit lower limit (than the real 32 bit real limit),
> > > > >> because of the LTP CVE test, so we need this fix to have the real 32
> > > > >> bit limit set for that macro (mmap limits did not use that macro
> > > > >> before).
> > > > >>
> > > > >> I have tested in arm32 and Jan Stancek, who first responded to LTP
> > > > >> issue, has tested this in i686 and both worked after that patch was
> >

Re: [LTP] [PATCH 4.4 00/24] 4.4.137-stable review

2018-06-14 Thread Jan Stancek


- Original Message -
> On Thu, Jun 14, 2018 at 02:24:25PM +0530, Naresh Kamboju wrote:
> > On 14 June 2018 at 12:04, Greg Kroah-Hartman 
> > wrote:
> > > On Wed, Jun 13, 2018 at 10:48:50PM -0300, Rafael Tinoco wrote:
> > >> On 13 June 2018 at 18:08, Rafael David Tinoco
> > >>  wrote:
> > >> > On Wed, Jun 13, 2018 at 6:00 PM, Greg Kroah-Hartman
> > >> >  wrote:
> > >> >> On Wed, Jun 13, 2018 at 05:47:49PM -0300, Rafael Tinoco wrote:
> > >> >>> Results from Linaro’s test farm.
> > >> >>> Regressions detected.
> > >> >>>
> > >> >>> NOTE:
> > >> >>>
> > >> >>> 1) LTP vma03 test (cve-2011-2496) broken on v4.4-137-rc1 because of:
> > >> >>>
> > >> >>>  6ea1dc96a03a mmap: relax file size limit for regular files
> > >> >>>  bd2f9ce5bacb mmap: introduce sane default mmap limits
> > >> >>>
> > >> >>>discussion:
> > >> >>>
> > >> >>>  https://github.com/linux-test-project/ltp/issues/341
> > >> >>>
> > >> >>>mainline commit (v4.13-rc7):
> > >> >>>
> > >> >>>  0cc3b0ec23ce Clarify (and fix) MAX_LFS_FILESIZE macros
> > >> >>>
> > >> >>>should be backported to 4.4.138-rc2 and fixes the issue.
> > >> >>
> > >> >> Really?  That commit says it fixes c2a9737f45e2 ("vfs,mm: fix a dead
> > >> >> loop in truncate_inode_pages_range()") which is not in 4.4.y at all.
> > >> >>
> > >> >> Did you test this out?
> > >> >
> > >> > Yes, the LTP contains the tests (last comment is the final test for
> > >> > arm32, right before Jan tests i686).
> > >> >
> > >> > Fixing MAX_LFS_FILESIZE fixes the new limit for mmap() brought by
> > >> > those 2 commits (file_mmap_size_max()).
> > >> > offset tested by the LTP test is 0xfffe000.
> > >> > file_mmap_size_max gives: 0x000 as max value, but only after
> > >> > the mentioned patch.
> > >> >
> > >> > Original intent for this fix was other though.
> > >>
> > >> To clarify this a bit further.
> > >>
> > >> The LTP CVE test is breaking in the first call to mmap(), even before
> > >> trying to remap and test the security issue. That start happening in
> > >> this round because of those mmap() changes and the offset used in the
> > >> LTP test. Linus changed limit checks and made them to be related to
> > >> MAX_LFS_FILESIZE. Unfortunately, in 4.4 stable, we were missing the
> > >> fix for MAX_LFS_FILESIZE (which before commit 0cc3b0ec23ce was less
> > >> than the REAL 32 bit limit).
> > >>
> > >> Commit 0cc3b0ec23ce was made because an user noticed the FS limit not
> > >> being what it should be. In our case, the 4.4 stable kernel, we are
> > >> facing this 32 bit lower limit (than the real 32 bit real limit),
> > >> because of the LTP CVE test, so we need this fix to have the real 32
> > >> bit limit set for that macro (mmap limits did not use that macro
> > >> before).
> > >>
> > >> I have tested in arm32 and Jan Stancek, who first responded to LTP
> > >> issue, has tested this in i686 and both worked after that patch was
> > >> included to v4.4-137-rc1 (my last test was even with 4.4.138-rc1).
> > >>
> > >> Hope that helps a bit.
> > >
> > > Ok, thanks, it didn't apply cleanly but I've fixed it up now.
> > 
> > On the latest 4.4.138-rc1,
> > LTP "cve-2011-2496" test still fails on arm32 beagleboard x15 and qemu_arm.
> > 
> > Summary
> > 
> > kernel: 4.4.138-rc1
> > git repo:
> > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
> > git branch: linux-4.4.y
> > git commit: 7d690c56754ef7be647fbcf7bcdceebd59926b3f
> > git describe: v4.4.137-15-g7d690c56754e
> > Test details:
> > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.137-15-g7d690c56754e
> 
> Ok, but what does this mean?  Is there a commit somewhere that I need to
> pick up for 4.4.y that is already in newer kernels?

Hi Greg,

I think the expectation was that:
  0cc3b0ec23ce Clarify (and fix) MAX_LFS_FILESIZE macros
has been included to linux-4.4.y HEAD, so they re-ran the tests.

Report from Naresh above looks like original report: LTP vma03 is cve-2011-2496 
test.

Regards,
Jan

> 
> I have no idea what that cve is, as I never track them, and it's
> something that was reported to predate the 4.4 kernel release :)
> 
> thanks,
> 
> greg k-h
> 
> --
> Mailing list info: https://lists.linux.it/listinfo/ltp
> 


[PATCH] virtio_balloon: fix increment of vb->num_pfns in fill_balloon()

2017-12-01 Thread Jan Stancek
commit c7cdff0e8647 ("virtio_balloon: fix deadlock on OOM")
changed code to increment vb->num_pfns before call to
set_page_pfns(), which used to happen only after.

This patch fixes boot hang for me on ppc64le KVM guests.

Fixes: c7cdff0e8647 ("virtio_balloon: fix deadlock on OOM")
Cc: Michael S. Tsirkin 
Cc: Tetsuo Handa 
Cc: Michal Hocko 
Cc: Wei Wang 
Signed-off-by: Jan Stancek 
---
 drivers/virtio/virtio_balloon.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 7960746f7597..a1fb52cb3f0a 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -174,13 +174,12 @@ static unsigned fill_balloon(struct virtio_balloon *vb, 
size_t num)
while ((page = balloon_page_pop(&pages))) {
balloon_page_enqueue(&vb->vb_dev_info, page);
 
-   vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE;
-
set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
if (!virtio_has_feature(vb->vdev,
VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
adjust_managed_page_count(page, -1);
+   vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE;
}
 
num_allocated_pages = vb->num_pfns;
-- 
1.8.3.1



Re: [Patch V3] crypto: x86/sha1 : Fix reads beyond the number of blocks passed

2017-08-02 Thread Jan Stancek



- Original Message -
> On Wed, 2017-08-02 at 10:13 -0700, Megha Dey wrote:
> > It was reported that the sha1 AVX2 function(sha1_transform_avx2) is
> > reading ahead beyond its intended data, and causing a crash if the next
> > block is beyond page boundary:
> > http://marc.info/?l=linux-crypto-vger&m=149373371023377
> > 
> > This patch makes sure that there is no overflow for any buffer length.
> > 
> > It passes the tests written by Jan Stancek that revealed this problem:
> > https://github.com/jstancek/sha1-avx2-crash
> 
> Hi Jan,
> 
> Is it ok to add your Tested-by?

Yes, v3 patch is exactly the diff I was testing.

Regards,
Jan


Re: [Patch V2] crypto: x86/sha1 : Fix reads beyond the number of blocks passed

2017-08-02 Thread Jan Stancek
On 08/02/2017 02:38 AM, Megha Dey wrote:
> It was reported that the sha1 AVX2 function(sha1_transform_avx2) is
> reading ahead beyond its intended data, and causing a crash if the next
> block is beyond page boundary:
> http://marc.info/?l=linux-crypto-vger&m=149373371023377
> 
> This patch makes sure that there is no overflow for any buffer length.
> 
> It passes the tests written by Jan Stancek that revealed this problem:
> https://github.com/jstancek/sha1-avx2-crash
> 
> Jan, can you verify this fix?

Hi,

Looks good, my tests (below) PASS-ed.

I updated reproducer at [1] to try more than just single scenario. It
now tries thousands of combinations with different starting offset within
page and sizes of data chunks.

(without patch) v4.13-rc3-102-g26c5ceb with avx2 enabled:
[  106.455175] sha_test module loaded
[  106.460458] data is at 0x8c88e58f8000, page_after_data: 
0x8c88e58fa000
[  106.468625] sha_test testing hash: sha1-generic, maxlen: 8192
[  127.091237] sha_test hash: sha1-generic calculated 1764032 hashes
[  127.098038] sha_test testing hash: sha1-ni, maxlen: 8192
[  127.108805] failed to alloc sha1-ni
[  127.112703] sha_test testing hash: sha1-avx, maxlen: 8192
[  141.166611] sha_test hash: sha1-avx calculated 1764032 hashes
[  141.173023] sha_test testing hash: sha1-avx2, maxlen: 8192
[  141.180046] BUG: unable to handle kernel paging request at 8c88e58fa000
[  141.187817] IP: _begin+0x28/0x187
[  141.191512] PGD 89c578067
[  141.191513] P4D 89c578067
[  141.194529] PUD c61f0a063
[  141.197545] PMD c64cb1063
[  141.200561] PTE 800c658fa062
[  141.203577]
[  141.208832] Oops:  [#1] SMP
...

With patch "[Patch V2] crypto: x86/sha1 : Fix reads beyond the number of blocks 
passed":
[  406.323781] sha_test module loaded
[  406.329062] data is at 0x93ab97108000, page_after_data: 
0x93ab9710a000
[  406.337185] sha_test testing hash: sha1-generic, maxlen: 8192
[  426.157242] sha_test hash: sha1-generic calculated 1764032 hashes
[  426.164043] sha_test testing hash: sha1-ni, maxlen: 8192
[  426.174244] failed to alloc sha1-ni
[  426.178139] sha_test testing hash: sha1-avx, maxlen: 8192
[  440.226240] sha_test hash: sha1-avx calculated 1764032 hashes
[  440.232651] sha_test testing hash: sha1-avx2, maxlen: 8192
[  452.497988] sha_test hash: sha1-avx2 calculated 1764032 hashes
[  452.504495] sha_test testing hash: sha1-ssse3, maxlen: 8192
[  467.319316] sha_test hash: sha1-ssse3 calculated 1764032 hashes
[  467.325922] sha_test done
[  471.827083] sha_test module unloaded

I also ran a test at [2], which is comparing hashes produced by
kernel with hashes produced by user-space utilities (e.g. 'sha1sum').
This test also PASS-ed.

Regards,
Jan

[1] https://github.com/jstancek/sha1-avx2-crash
[2] https://github.com/jstancek/kernel_sha_test/tree/sha1-avx2-4.13

> Herbert, can you re-enable sha1-avx2 once Jan has checked it out and
> revert commit b82ce24426a4071da9529d726057e4e642948667 ?
> 
> Originally-by: Ilya Albrekht 
> Signed-off-by: Megha Dey 
> Reported-by: Jan Stancek 




Re: [PATCH 0/2] key payload access with just rcu_read_lock()

2017-03-01 Thread Jan Stancek




- Original Message -
> From: "David Howells" 
> To: "Jan Stancek" 
> Cc: dhowe...@redhat.com, linux-kernel@vger.kernel.org, 
> linux-...@vger.kernel.org, bcodd...@redhat.com,
> asav...@redhat.com
> Sent: Wednesday, 1 March, 2017 10:40:13 AM
> Subject: Re: [PATCH 0/2] key payload access with just rcu_read_lock()
> 
> Jan Stancek  wrote:
> 
> > That problem didn't show up with my NFS based reproducer.
> > I re-run it again with latest version of your patch, plus also
> > keyutils testsuite. Both completed OK for me, dmesg looks clean.
> 
> Can I put you down as a Tested-by?

Yes. Thanks for having a look at this.

Regards,
Jan


Re: [PATCH 0/2] key payload access with just rcu_read_lock()

2017-02-28 Thread Jan Stancek

- Original Message -
> Here's an updated patch with fixed user_key_payload_locked() and user_read().
> 

That problem didn't show up with my NFS based reproducer.
I re-run it again with latest version of your patch, plus also
keyutils testsuite. Both completed OK for me, dmesg looks clean.

Regards,
Jan


Re: [PATCH 0/2] key payload access with just rcu_read_lock()

2017-02-28 Thread Jan Stancek


- Original Message -
> Jan Stancek  wrote:
> 
> > Looks like there are still couple users that need updating,
> > I'm hitting following compilation error:
> 
> Aargh - I remembered to grep for rcu_dereference_key() but not
> user_key_payload().
> 
> How about the attached?

v2 builds OK (on top of 4.10), "suspicious RCU usage"
warning is gone and NFS connectathon is PASS-ing cleanly.

Regards,
Jan


Re: [PATCH 0/2] key payload access with just rcu_read_lock()

2017-02-27 Thread Jan Stancek


- Original Message -
> From: "David Howells" 
> To: "Jan Stancek" 
> Cc: dhowe...@redhat.com, linux-kernel@vger.kernel.org, 
> linux-...@vger.kernel.org, bcodd...@redhat.com,
> asav...@redhat.com
> Sent: Monday, 27 February, 2017 11:04:21 PM
> Subject: Re: [PATCH 0/2] key payload access with just rcu_read_lock()
> 
> Jan Stancek  wrote:
> 
> > this is a follow-up for "suspicious RCU usage" warning described
> > in these 2 linux-nfs threads:
> >   http://marc.info/?t=14755883033&r=1&w=2
> >   http://marc.info/?t=14877677051&r=1&w=2
> > 
> > Did you have something like in mind?
> 
> How about the attached?  It's similar to what you did, but I made the split
> functions different from the original so that all users have to reconsider
> what they actually want.
> 
> David

Looks like there are still couple users that need updating,
I'm hitting following compilation error:

...
  CC [M]  fs/cifs/connect.o
fs/cifs/connect.c: In function ‘cifs_set_cifscreds’:
fs/cifs/connect.c:2442:2: error: implicit declaration of function 
‘user_key_payload’ [-Werror=implicit-function-declaration]
  upayload = user_key_payload(key);
  ^
fs/cifs/connect.c:2442:11: error: assignment makes pointer from integer without 
a cast [-Werror]
  upayload = user_key_payload(key);
   ^

# grep "user_key_payload(" -r . --include=*.c
./drivers/md/dm-crypt.c:ukp = user_key_payload(key);
./fs/cifs/connect.c:upayload = user_key_payload(key);
./fs/crypto/keyinfo.c:  ukp = user_key_payload(keyring_key);
./lib/digsig.c: ukp = user_key_payload(key);

# grep "user_key_payload(" -r . --include=*.h
./fs/ecryptfs/ecryptfs_kernel.h:return (struct 
ecryptfs_auth_tok *)user_key_payload(key)->data;

Regards,
Jan


[PATCH 1/2] KEYS: add user_key_payload_rcu()

2017-02-22 Thread Jan Stancek
user_key_payload() is wrapper for rcu_dereference_protected(),
and can't be used with just rcu_read_lock() held.

This patch adds user_key_payload_rcu() for accessing key
payload in RCU read-side section, without the need
to hold key semaphore.

Signed-off-by: Jan Stancek 
---
 Documentation/security/keys.txt | 5 +
 include/keys/user-type.h| 5 +
 include/linux/key.h | 3 +++
 3 files changed, 13 insertions(+)

diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt
index 3849814bfe6d..da89a854edd2 100644
--- a/Documentation/security/keys.txt
+++ b/Documentation/security/keys.txt
@@ -1152,8 +1152,13 @@ access the data:
  wrap the RCU calls to this element:
 
rcu_assign_keypointer(struct key *key, void *data);
+
+   /* access payload with semaphore held */
void *rcu_dereference_key(struct key *key);
 
+   /* access payload in RCU read-side section*/
+   void *rcu_read_dereference_key(struct key *key);
+
 
 ===
 DEFINING A KEY TYPE
diff --git a/include/keys/user-type.h b/include/keys/user-type.h
index c56fef40f53e..521bf3369904 100644
--- a/include/keys/user-type.h
+++ b/include/keys/user-type.h
@@ -53,6 +53,11 @@ static inline const struct user_key_payload 
*user_key_payload(const struct key *
return (struct user_key_payload *)rcu_dereference_key(key);
 }
 
+static inline const struct user_key_payload *user_key_payload_rcu(const struct 
key *key)
+{
+   return (struct user_key_payload *)rcu_read_dereference_key(key);
+}
+
 #endif /* CONFIG_KEYS */
 
 #endif /* _KEYS_USER_TYPE_H */
diff --git a/include/linux/key.h b/include/linux/key.h
index 722914798f37..b6a8c5896761 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -358,6 +358,9 @@ static inline bool key_is_instantiated(const struct key 
*key)
(rcu_dereference_protected((KEY)->payload.rcu_data0,\
   rwsem_is_locked(&((struct key 
*)(KEY))->sem)))
 
+#define rcu_read_dereference_key(KEY)   \
+   (rcu_dereference((KEY)->payload.rcu_data0))
+
 #define rcu_assign_keypointer(KEY, PAYLOAD)\
 do {   \
rcu_assign_pointer((KEY)->payload.rcu_data0, (PAYLOAD));\
-- 
1.8.3.1



[PATCH 2/2] NFS: use user_key_payload_rcu() in RCU read-side section

2017-02-22 Thread Jan Stancek
Use RCU read-side accessor for key payload, as calling
user_key_payload() without semaphore held leads to
following warning:

  ===
  [ INFO: suspicious RCU usage. ]
  4.10.0 #1 Tainted: GW
  ---
  ./include/keys/user-type.h:53 suspicious rcu_dereference_protected() usage!
  other info that might help us debug this:
  rcu_scheduler_active = 2, debug_locks = 0
  1 lock held by mount.nfs/5987:
#0:  (rcu_read_lock){..}, at: [] 
nfs_idmap_get_key+0x15c/0x420 [nfsv4]
  stack backtrace:
  CPU: 1 PID: 5987 Comm: mount.nfs Tainted: GW   4.10.0 #1
  Call Trace:
dump_stack+0xe8/0x154 (unreliable)
lockdep_rcu_suspicious+0x140/0x190
nfs_idmap_get_key+0x380/0x420 [nfsv4]
nfs_map_name_to_uid+0x2a0/0x3b0 [nfsv4]
decode_getfattr_attrs+0xfac/0x16b0 [nfsv4]
decode_getfattr_generic.constprop.106+0xbc/0x150 [nfsv4]
nfs4_xdr_dec_lookup_root+0xac/0xb0 [nfsv4]
rpcauth_unwrap_resp+0xe8/0x140 [sunrpc]
call_decode+0x29c/0x910 [sunrpc]
__rpc_execute+0x140/0x8f0 [sunrpc]
rpc_run_task+0x170/0x200 [sunrpc]
nfs4_call_sync_sequence+0x68/0xa0 [nfsv4]
_nfs4_lookup_root.isra.44+0xd0/0xf0 [nfsv4]
nfs4_lookup_root+0xe0/0x350 [nfsv4]
nfs4_lookup_root_sec+0x70/0xa0 [nfsv4]
nfs4_find_root_sec+0xc4/0x100 [nfsv4]
nfs4_proc_get_rootfh+0x5c/0xf0 [nfsv4]
nfs4_get_rootfh+0x6c/0x190 [nfsv4]
nfs4_server_common_setup+0xc4/0x260 [nfsv4]
nfs4_create_server+0x278/0x3c0 [nfsv4]
nfs4_remote_mount+0x50/0xb0 [nfsv4]
mount_fs+0x74/0x210
vfs_kern_mount+0x78/0x220
nfs_do_root_mount+0xb0/0x140 [nfsv4]
nfs4_try_mount+0x60/0x100 [nfsv4]
nfs_fs_mount+0x5ec/0xda0 [nfs]
mount_fs+0x74/0x210
vfs_kern_mount+0x78/0x220
do_mount+0x254/0xf70
SyS_mount+0x94/0x100
system_call+0x38/0xe0

Signed-off-by: Jan Stancek 
---
 fs/nfs/nfs4idmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index c444285bb1b1..835c163f61af 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -316,7 +316,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t 
namelen,
if (ret < 0)
goto out_up;
 
-   payload = user_key_payload(rkey);
+   payload = user_key_payload_rcu(rkey);
if (IS_ERR_OR_NULL(payload)) {
ret = PTR_ERR(payload);
goto out_up;
-- 
1.8.3.1



[PATCH 0/2] key payload access with just rcu_read_lock()

2017-02-22 Thread Jan Stancek
Hi David,

this is a follow-up for "suspicious RCU usage" warning described
in these 2 linux-nfs threads:
  http://marc.info/?t=14755883033&r=1&w=2
  http://marc.info/?t=14877677051&r=1&w=2

Did you have something like in mind?

Thanks,
Jan

Jan Stancek (2):
  KEYS: add user_key_payload_rcu()
  NFS: use user_key_payload_rcu() in RCU read-side section

 Documentation/security/keys.txt | 5 +
 fs/nfs/nfs4idmap.c  | 2 +-
 include/keys/user-type.h| 5 +
 include/linux/key.h | 3 +++
 4 files changed, 14 insertions(+), 1 deletion(-)

-- 
1.8.3.1



[tip:perf/urgent] perf header: Make build_cpu_topology skip offline/absent CPUs

2017-02-21 Thread tip-bot for Jan Stancek
Commit-ID:  43db2843a4a41cc8cdb6ab696639aeee1f4d5062
Gitweb: http://git.kernel.org/tip/43db2843a4a41cc8cdb6ab696639aeee1f4d5062
Author: Jan Stancek 
AuthorDate: Fri, 17 Feb 2017 12:10:25 +0100
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Fri, 17 Feb 2017 12:37:04 -0300

perf header: Make build_cpu_topology skip offline/absent CPUs

When build_cpu_topo() encounters offline/absent CPUs, it fails to find any
sysfs entries and returns failure.

This leads to build_cpu_topology() and write_cpu_topology() failing as
well.

Because HEADER_CPU_TOPOLOGY has not been written, read leaves cpu_topology_map
NULL and we get NULL ptr deref at:

  ...
   cmd_test
__cmd_test
 test_and_print
  run_test
   test_session_topology
check_cpu_topology

  36: Session topology   :
  --- start ---
  test child forked, pid 14902
  templ file: /tmp/perf-test-4CKocW
  failed to write feature HEADER_CPU_TOPOLOGY
  perf: Segmentation fault
  Obtained 9 stack frames.
  ./perf(sighandler_dump_stack+0x41) [0x5095f1]
  /lib64/libc.so.6(+0x35250) [0x7f4b7c3c9250]
  ./perf(test_session_topology+0x1db) [0x490ceb]
  ./perf() [0x475b68]
  ./perf(cmd_test+0x5b9) [0x4763c9]
  ./perf() [0x4945a3]
  ./perf(main+0x69f) [0x427e8f]
  /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f4b7c3b5b35]
  ./perf() [0x427fb9]
  test child interrupted
   end 
  Session topology: FAILED!

This patch makes build_cpu_topology() skip offline/absent CPUs, by checking
their presence against cpu_map built from online CPUs.

Signed-off-by: Jan Stancek 
Acked-by: Jiri Olsa 
Cc: Alexander Shishkin 
Cc: Masami Hiramatsu 
Cc: Peter Zijlstra 
Link: 
http://lkml.kernel.org/r/a271b770175524f4961d4903af33798358a4a518.1487146877.git.jstan...@redhat.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/util/header.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 3d12c16..1222f6c 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -505,24 +505,31 @@ static void free_cpu_topo(struct cpu_topo *tp)
 
 static struct cpu_topo *build_cpu_topology(void)
 {
-   struct cpu_topo *tp;
+   struct cpu_topo *tp = NULL;
void *addr;
u32 nr, i;
size_t sz;
long ncpus;
int ret = -1;
+   struct cpu_map *map;
 
ncpus = sysconf(_SC_NPROCESSORS_CONF);
if (ncpus < 0)
return NULL;
 
+   /* build online CPU map */
+   map = cpu_map__new(NULL);
+   if (map == NULL) {
+   pr_debug("failed to get system cpumap\n");
+   return NULL;
+   }
+
nr = (u32)(ncpus & UINT_MAX);
 
sz = nr * sizeof(char *);
-
addr = calloc(1, sizeof(*tp) + 2 * sz);
if (!addr)
-   return NULL;
+   goto out_free;
 
tp = addr;
tp->cpu_nr = nr;
@@ -532,10 +539,16 @@ static struct cpu_topo *build_cpu_topology(void)
tp->thread_siblings = addr;
 
for (i = 0; i < nr; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
+
ret = build_cpu_topo(tp, i);
if (ret < 0)
break;
}
+
+out_free:
+   cpu_map__put(map);
if (ret) {
free_cpu_topo(tp);
tp = NULL;


[tip:perf/urgent] perf cpumap: Add cpu__max_present_cpu()

2017-02-21 Thread tip-bot for Jan Stancek
Commit-ID:  92a7e1278005b6bb3459affc50b2b6e2464e7e7c
Gitweb: http://git.kernel.org/tip/92a7e1278005b6bb3459affc50b2b6e2464e7e7c
Author: Jan Stancek 
AuthorDate: Fri, 17 Feb 2017 12:10:24 +0100
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Fri, 17 Feb 2017 12:33:05 -0300

perf cpumap: Add cpu__max_present_cpu()

Similar to cpu__max_cpu() (which returns the max possible CPU), returns
the max present CPU.

Signed-off-by: Jan Stancek 
Acked-by: Jiri Olsa 
Cc: Alexander Shishkin 
Cc: Masami Hiramatsu 
Cc: Peter Zijlstra 
Link: 
http://lkml.kernel.org/r/8ea4601b5cacc49927235b4ebac424bd6eeccb06.1487146877.git.jstan...@redhat.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/util/cpumap.c | 22 ++
 tools/perf/util/cpumap.h |  1 +
 2 files changed, 23 insertions(+)

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 2c0b522..8c75049 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -9,6 +9,7 @@
 #include "asm/bug.h"
 
 static int max_cpu_num;
+static int max_present_cpu_num;
 static int max_node_num;
 static int *cpunode_map;
 
@@ -442,6 +443,7 @@ static void set_max_cpu_num(void)
 
/* set up default */
max_cpu_num = 4096;
+   max_present_cpu_num = 4096;
 
mnt = sysfs__mountpoint();
if (!mnt)
@@ -455,6 +457,17 @@ static void set_max_cpu_num(void)
}
 
ret = get_max_num(path, &max_cpu_num);
+   if (ret)
+   goto out;
+
+   /* get the highest present cpu number for a sparse allocation */
+   ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt);
+   if (ret == PATH_MAX) {
+   pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX);
+   goto out;
+   }
+
+   ret = get_max_num(path, &max_present_cpu_num);
 
 out:
if (ret)
@@ -505,6 +518,15 @@ int cpu__max_cpu(void)
return max_cpu_num;
 }
 
+int cpu__max_present_cpu(void)
+{
+   if (unlikely(!max_present_cpu_num))
+   set_max_cpu_num();
+
+   return max_present_cpu_num;
+}
+
+
 int cpu__get_node(int cpu)
 {
if (unlikely(cpunode_map == NULL)) {
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 06bd689..1a0549a 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -62,6 +62,7 @@ int cpu__setup_cpunode_map(void);
 
 int cpu__max_node(void);
 int cpu__max_cpu(void);
+int cpu__max_present_cpu(void);
 int cpu__get_node(int cpu);
 
 int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,


[tip:perf/urgent] perf tools: Replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map

2017-02-21 Thread tip-bot for Jan Stancek
Commit-ID:  da8a58b56c661681f9b2fd2fa59c6da3a5bac8d1
Gitweb: http://git.kernel.org/tip/da8a58b56c661681f9b2fd2fa59c6da3a5bac8d1
Author: Jan Stancek 
AuthorDate: Fri, 17 Feb 2017 12:10:26 +0100
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Fri, 17 Feb 2017 12:56:35 -0300

perf tools: Replace _SC_NPROCESSORS_CONF with max_present_cpu in 
cpu_topology_map

There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs:

1. offline/absent CPUs will have their socket_id and core_id set to -1
   which triggers:
   "socket_id number is too big.You may need to upgrade the perf tool."

2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on
   _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above.
   Users of perf_env.cpu[] are using CPU id as index. This can lead
   to read beyond what was allocated:
   ==19991== Invalid read of size 4
   ==19991==at 0x490CEB: check_cpu_topology (topology.c:69)
   ==19991==by 0x490CEB: test_session_topology (topology.c:106)
   ...

For example:
  _SC_NPROCESSORS_CONF == 16
  available: 2 nodes (0-1)
  node 0 cpus: 0 6 8 10 16 22 24 26
  node 0 size: 12004 MB
  node 0 free: 9470 MB
  node 1 cpus: 1 7 9 11 23 25 27
  node 1 size: 12093 MB
  node 1 free: 9406 MB
  node distances:
  node   0   1
0:  10  20
1:  20  10

This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF
to max_present_cpu and updates any user of cpu_topology_map to iterate
with nr_cpus_avail.

As a consequence HEADER_CPU_TOPOLOGY core_id and socket_id lists get longer,
but maintain compatibility with pre-patch state - index to cpu_topology_map is
CPU id.

  perf test 36 -v
  36: Session topology   :
  --- start ---
  test child forked, pid 22211
  templ file: /tmp/perf-test-gmdX5i
  CPU 0, core 0, socket 0
  CPU 1, core 0, socket 1
  CPU 6, core 10, socket 0
  CPU 7, core 10, socket 1
  CPU 8, core 1, socket 0
  CPU 9, core 1, socket 1
  CPU 10, core 9, socket 0
  CPU 11, core 9, socket 1
  CPU 16, core 0, socket 0
  CPU 22, core 10, socket 0
  CPU 23, core 10, socket 1
  CPU 24, core 1, socket 0
  CPU 25, core 1, socket 1
  CPU 26, core 9, socket 0
  CPU 27, core 9, socket 1
  test child finished with 0
   end 
  Session topology: Ok

Signed-off-by: Jan Stancek 
Cc: Alexander Shishkin 
Cc: Jiri Olsa 
Cc: Masami Hiramatsu 
Cc: Peter Zijlstra 
Link: 
http://lkml.kernel.org/r/d7c05c6445fca74a8442c2c73cfffd349c52c44f.1487146877.git.jstan...@redhat.com
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/builtin-stat.c   |  2 +-
 tools/perf/tests/topology.c |  4 +++-
 tools/perf/util/env.c   |  2 +-
 tools/perf/util/header.c| 16 +---
 4 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index f287191..ca27a8a 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, 
struct cpu_map *map, i
 
cpu = map->map[idx];
 
-   if (cpu >= env->nr_cpus_online)
+   if (cpu >= env->nr_cpus_avail)
return -1;
 
return cpu;
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 98fe69a..803f893 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map)
session = perf_session__new(&file, false, NULL);
TEST_ASSERT_VAL("can't get session", session);
 
-   for (i = 0; i < session->header.env.nr_cpus_online; i++) {
+   for (i = 0; i < session->header.env.nr_cpus_avail; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
pr_debug("CPU %d, core %d, socket %d\n", i,
 session->header.env.cpu[i].core_id,
 session->header.env.cpu[i].socket_id);
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index bb964e8..075fc77 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env)
return 0;
 
if (env->nr_cpus_avail == 0)
-   env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF);
+   env->nr_cpus_avail = cpu__max_present_cpu();
 
nr_cpus = env->nr_cpus_avail;
if (nr_cpus == -1)
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 1222f6c..05714d5 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -295,11 +295,7 @@ static int write_nrcpus(int fd, struct perf_header *h 
__maybe_unused,
u32 nrc, nra;
int ret;
 
-   nr = sysconf(_SC_NPROCESSORS_CONF);
-   if (nr < 0)
-   return -1;
-
-   nrc = (u32)(nr & UINT_MAX);
+   nrc = cpu__max_present_cpu();
 
nr = sy

[PATCH v4 3/3] perf: Replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map

2017-02-20 Thread Jan Stancek
There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs:

1. offline/absent CPUs will have their socket_id and core_id set to -1
   which triggers:
   "socket_id number is too big.You may need to upgrade the perf tool."

2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on
   _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above.
   Users of perf_env.cpu[] are using CPU id as index. This can lead
   to read beyond what was allocated:
   ==19991== Invalid read of size 4
   ==19991==at 0x490CEB: check_cpu_topology (topology.c:69)
   ==19991==by 0x490CEB: test_session_topology (topology.c:106)
   ...

For example:
  _SC_NPROCESSORS_CONF == 16
  available: 2 nodes (0-1)
  node 0 cpus: 0 6 8 10 16 22 24 26
  node 0 size: 12004 MB
  node 0 free: 9470 MB
  node 1 cpus: 1 7 9 11 23 25 27
  node 1 size: 12093 MB
  node 1 free: 9406 MB
  node distances:
  node   0   1
0:  10  20
1:  20  10

This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF
to max_present_cpu and updates any user of cpu_topology_map to iterate
with nr_cpus_avail. As a consequence HEADER_CPU_TOPOLOGY core_id and
socket_id lists get longer, but maintain compatibility with pre-patch
state - index to cpu_topology_map is CPU id.

  # ./perf test 36 -v
  36: Session topology   :
  --- start ---
  test child forked, pid 22211
  templ file: /tmp/perf-test-gmdX5i
  CPU 0, core 0, socket 0
  CPU 1, core 0, socket 1
  CPU 6, core 10, socket 0
  CPU 7, core 10, socket 1
  CPU 8, core 1, socket 0
  CPU 9, core 1, socket 1
  CPU 10, core 9, socket 0
  CPU 11, core 9, socket 1
  CPU 16, core 0, socket 0
  CPU 22, core 10, socket 0
  CPU 23, core 10, socket 1
  CPU 24, core 1, socket 0
  CPU 25, core 1, socket 1
  CPU 26, core 9, socket 0
  CPU 27, core 9, socket 1
  test child finished with 0
   end 
  Session topology: Ok

Signed-off-by: Jan Stancek 
---
 tools/perf/builtin-stat.c   |  2 +-
 tools/perf/tests/topology.c |  4 +++-
 tools/perf/util/env.c   |  2 +-
 tools/perf/util/header.c| 20 +---
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a02f2e965628..9e2be1c52fbd 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, 
struct cpu_map *map, i
 
cpu = map->map[idx];
 
-   if (cpu >= env->nr_cpus_online)
+   if (cpu >= env->nr_cpus_avail)
return -1;
 
return cpu;
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 98fe69ac553c..803f893550d6 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map)
session = perf_session__new(&file, false, NULL);
TEST_ASSERT_VAL("can't get session", session);
 
-   for (i = 0; i < session->header.env.nr_cpus_online; i++) {
+   for (i = 0; i < session->header.env.nr_cpus_avail; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
pr_debug("CPU %d, core %d, socket %d\n", i,
 session->header.env.cpu[i].core_id,
 session->header.env.cpu[i].socket_id);
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index bb964e86b09d..075fc77286bf 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env)
return 0;
 
if (env->nr_cpus_avail == 0)
-   env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF);
+   env->nr_cpus_avail = cpu__max_present_cpu();
 
nr_cpus = env->nr_cpus_avail;
if (nr_cpus == -1)
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index ca0f12fb5fd3..e97cbf78ef21 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -293,11 +293,7 @@ static int write_nrcpus(int fd, struct perf_header *h 
__maybe_unused,
u32 nrc, nra;
int ret;
 
-   nr = sysconf(_SC_NPROCESSORS_CONF);
-   if (nr < 0)
-   return -1;
-
-   nrc = (u32)(nr & UINT_MAX);
+   nrc = cpu__max_present_cpu();
 
nr = sysconf(_SC_NPROCESSORS_ONLN);
if (nr < 0)
@@ -511,9 +507,7 @@ static struct cpu_topo *build_cpu_topology(void)
int ret = -1;
struct cpu_map *map;
 
-   ncpus = sysconf(_SC_NPROCESSORS_CONF);
-   if (ncpus < 0)
-   return NULL;
+   ncpus = cpu__max_present_cpu();
 
/* build online CPU map */
map = cpu_map__new(NULL);
@@ -1137,7 +1131,7 @@ static void print_cpu_topology(struct perf_header *ph, 
int fd __maybe_unused,
 {
int nr, i;
char *str;
-   int cpu_nr = ph->env

[PATCH v4 2/3] perf header: Make build_cpu_topology skip offline/absent CPUs

2017-02-20 Thread Jan Stancek
When build_cpu_topo() encounters offline/absent CPUs,
it fails to find any sysfs entries and returns failure.
This leads to build_cpu_topology() and write_cpu_topology()
failing as well.

Because HEADER_CPU_TOPOLOGY has not been written, read leaves
cpu_topology_map NULL and we get NULL ptr deref at:
 ...
  cmd_test
   __cmd_test
test_and_print
 run_test
  test_session_topology
   check_cpu_topology

  # perf test 36
  36: Session topology   :
  --- start ---
  test child forked, pid 14902
  templ file: /tmp/perf-test-4CKocW
  failed to write feature HEADER_CPU_TOPOLOGY
  perf: Segmentation fault
  Obtained 9 stack frames.
  ./perf(sighandler_dump_stack+0x41) [0x5095f1]
  /lib64/libc.so.6(+0x35250) [0x7f4b7c3c9250]
  ./perf(test_session_topology+0x1db) [0x490ceb]
  ./perf() [0x475b68]
  ./perf(cmd_test+0x5b9) [0x4763c9]
  ./perf() [0x4945a3]
  ./perf(main+0x69f) [0x427e8f]
  /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f4b7c3b5b35]
  ./perf() [0x427fb9]
  test child interrupted
   end 
  Session topology: FAILED!

This patch makes build_cpu_topology() skip offline/absent CPUs,
by checking their presence against cpu_map built from online CPUs.

Signed-off-by: Jan Stancek 
---
 tools/perf/util/header.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index d89c9c7ef4e5..ca0f12fb5fd3 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -503,24 +503,31 @@ static void free_cpu_topo(struct cpu_topo *tp)
 
 static struct cpu_topo *build_cpu_topology(void)
 {
-   struct cpu_topo *tp;
+   struct cpu_topo *tp = NULL;
void *addr;
u32 nr, i;
size_t sz;
long ncpus;
int ret = -1;
+   struct cpu_map *map;
 
ncpus = sysconf(_SC_NPROCESSORS_CONF);
if (ncpus < 0)
return NULL;
 
+   /* build online CPU map */
+   map = cpu_map__new(NULL);
+   if (map == NULL) {
+   pr_debug("failed to get system cpumap\n");
+   return NULL;
+   }
+
nr = (u32)(ncpus & UINT_MAX);
 
sz = nr * sizeof(char *);
-
addr = calloc(1, sizeof(*tp) + 2 * sz);
if (!addr)
-   return NULL;
+   goto out_free;
 
tp = addr;
tp->cpu_nr = nr;
@@ -530,10 +537,16 @@ static struct cpu_topo *build_cpu_topology(void)
tp->thread_siblings = addr;
 
for (i = 0; i < nr; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
+
ret = build_cpu_topo(tp, i);
if (ret < 0)
break;
}
+
+out_free:
+   cpu_map__put(map);
if (ret) {
free_cpu_topo(tp);
tp = NULL;
-- 
1.8.3.1



[PATCH v4 1/3] perf cpu_map: Add cpu__max_present_cpu()

2017-02-20 Thread Jan Stancek
Similar to cpu__max_cpu() (which returns the max possible CPU),
returns the max present CPU.

Signed-off-by: Jan Stancek 
---
 tools/perf/util/cpumap.c | 22 ++
 tools/perf/util/cpumap.h |  1 +
 2 files changed, 23 insertions(+)

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 2c0b52264a46..8c7504939113 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -9,6 +9,7 @@
 #include "asm/bug.h"
 
 static int max_cpu_num;
+static int max_present_cpu_num;
 static int max_node_num;
 static int *cpunode_map;
 
@@ -442,6 +443,7 @@ static void set_max_cpu_num(void)
 
/* set up default */
max_cpu_num = 4096;
+   max_present_cpu_num = 4096;
 
mnt = sysfs__mountpoint();
if (!mnt)
@@ -455,6 +457,17 @@ static void set_max_cpu_num(void)
}
 
ret = get_max_num(path, &max_cpu_num);
+   if (ret)
+   goto out;
+
+   /* get the highest present cpu number for a sparse allocation */
+   ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt);
+   if (ret == PATH_MAX) {
+   pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX);
+   goto out;
+   }
+
+   ret = get_max_num(path, &max_present_cpu_num);
 
 out:
if (ret)
@@ -505,6 +518,15 @@ int cpu__max_cpu(void)
return max_cpu_num;
 }
 
+int cpu__max_present_cpu(void)
+{
+   if (unlikely(!max_present_cpu_num))
+   set_max_cpu_num();
+
+   return max_present_cpu_num;
+}
+
+
 int cpu__get_node(int cpu)
 {
if (unlikely(cpunode_map == NULL)) {
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 06bd689f5989..1a0549af8f5c 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -62,6 +62,7 @@ static inline bool cpu_map__empty(const struct cpu_map *map)
 
 int cpu__max_node(void);
 int cpu__max_cpu(void);
+int cpu__max_present_cpu(void);
 int cpu__get_node(int cpu);
 
 int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
-- 
1.8.3.1



[PATCH v4 0/3] perf: topology on systems with offline/absent CPUs

2017-02-20 Thread Jan Stancek
Changes in v4:
- rebase on top of 4.10
- fix style of commit logs
- add comment to process_cpu_topology()

This series aims to address issues on systems with offline/absent CPUs:
1. cpu_topology_map size
   Allocation based on _SC_NPROCESSORS_CONF is too small,
   there can be CPU IDs that are higher, which causes invalid reads. 
2. build_cpu_topology fails for offline/absent CPUs
   It is trying to parse sysfs entries, that do not exist.
3. perf topology test fails

Changes affect HEADER_NRCPUS and HEADER_CPU_TOPOLOGY:

For example on a system like this:
  _SC_NPROCESSORS_CONF == 16
  available: 2 nodes (0-1)
  node 0 cpus: 0 6 8 10 16 22 24 26
  node 0 size: 12004 MB
  node 0 free: 9470 MB
  node 1 cpus: 1 7 9 11 23 25 27
  node 1 size: 12093 MB
  node 1 free: 9406 MB
  node distances:
  node   0   1
0:  10  20
1:  20  10

HEADER_NRCPUS before this series:
nr_cpus_online = 15
nr_cpus_available = 16

HEADER_CPU_TOPOLOGY before this series:
core_sib_nr: 2
core_siblings: 0,6,8,10,16,22,24,26
core_siblings: 1,7,9,11,23,25,27
thread_sib_nr: 8
thread_siblings: 0,16
thread_siblings: 1
thread_siblings: 6,22
thread_siblings: 7,23
thread_siblings: 8,24
thread_siblings: 9,25
thread_siblings: 10,26
thread_siblings: 11,27
core_id: 0, socket_id: 0
core_id: 0, socket_id: 1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: 10, socket_id: 0
core_id: 10, socket_id: 1
core_id: 1, socket_id: 0
core_id: 1, socket_id: 1
core_id: 9, socket_id: 0
core_id: 9, socket_id: 1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1

HEADER_NRCPUS with this series applied:
nr_cpus_online = 15
nr_cpus_available = 28

HEADER_CPU_TOPOLOGY with this series applied:
core_sib_nr: 2
core_siblings: 0,6,8,10,16,22,24,26
core_siblings: 1,7,9,11,23,25,27
thread_sib_nr: 8
thread_siblings: 0,16
thread_siblings: 1
thread_siblings: 6,22
thread_siblings: 7,23
thread_siblings: 8,24
thread_siblings: 9,25
thread_siblings: 10,26
thread_siblings: 11,27
core_id: 0, socket_id: 0
core_id: 0, socket_id: 1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: 10, socket_id: 0
core_id: 10, socket_id: 1
core_id: 1, socket_id: 0
core_id: 1, socket_id: 1
core_id: 9, socket_id: 0
core_id: 9, socket_id: 1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: 0, socket_id: 0
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: 10, socket_id: 0
core_id: 10, socket_id: 1
core_id: 1, socket_id: 0
core_id: 1, socket_id: 1
core_id: 9, socket_id: 0
core_id: 9, socket_id: 1

Jan Stancek (3):
  perf cpu_map: Add cpu__max_present_cpu()
  perf header: Make build_cpu_topology skip offline/absent CPUs
  perf: Replace _SC_NPROCESSORS_CONF with max_present_cpu in
cpu_topology_map

 tools/perf/builtin-stat.c   |  2 +-
 tools/perf/tests/topology.c |  4 +++-
 tools/perf/util/cpumap.c| 22 ++
 tools/perf/util/cpumap.h|  1 +
 tools/perf/util/env.c   |  2 +-
 tools/perf/util/header.c| 37 -
 6 files changed, 52 insertions(+), 16 deletions(-)

-- 
1.8.3.1



[PATCH v3 2/3] perf: make build_cpu_topology skip offline/absent CPUs

2017-02-17 Thread Jan Stancek
When build_cpu_topo() encounters offline/absent CPUs,
it fails to find any sysfs entries and returns failure.
This leads to build_cpu_topology() and write_cpu_topology()
failing as well.

Because HEADER_CPU_TOPOLOGY has not been written, read leaves
cpu_topology_map NULL and we get NULL ptr deref at:

 ...
  cmd_test
   __cmd_test
test_and_print
 run_test
  test_session_topology
   check_cpu_topology

36: Session topology   :
--- start ---
test child forked, pid 14902
templ file: /tmp/perf-test-4CKocW
failed to write feature HEADER_CPU_TOPOLOGY
perf: Segmentation fault
Obtained 9 stack frames.
./perf(sighandler_dump_stack+0x41) [0x5095f1]
/lib64/libc.so.6(+0x35250) [0x7f4b7c3c9250]
./perf(test_session_topology+0x1db) [0x490ceb]
./perf() [0x475b68]
./perf(cmd_test+0x5b9) [0x4763c9]
./perf() [0x4945a3]
./perf(main+0x69f) [0x427e8f]
/lib64/libc.so.6(__libc_start_main+0xf5) [0x7f4b7c3b5b35]
./perf() [0x427fb9]
test child interrupted
 end 
Session topology: FAILED!

This patch makes build_cpu_topology() skip offline/absent CPUs,
by checking their presence against cpu_map built from online CPUs.

Signed-off-by: Jan Stancek 
---
 tools/perf/util/header.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

Changes in v3:
- make ret = -1 by default, so we fail in case build_cpu_topo
  doesn't run at all

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index d89c9c7ef4e5..ca0f12fb5fd3 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -503,24 +503,31 @@ static void free_cpu_topo(struct cpu_topo *tp)
 
 static struct cpu_topo *build_cpu_topology(void)
 {
-   struct cpu_topo *tp;
+   struct cpu_topo *tp = NULL;
void *addr;
u32 nr, i;
size_t sz;
long ncpus;
int ret = -1;
+   struct cpu_map *map;
 
ncpus = sysconf(_SC_NPROCESSORS_CONF);
if (ncpus < 0)
return NULL;
 
+   /* build online CPU map */
+   map = cpu_map__new(NULL);
+   if (map == NULL) {
+   pr_debug("failed to get system cpumap\n");
+   return NULL;
+   }
+
nr = (u32)(ncpus & UINT_MAX);
 
sz = nr * sizeof(char *);
-
addr = calloc(1, sizeof(*tp) + 2 * sz);
if (!addr)
-   return NULL;
+   goto out_free;
 
tp = addr;
tp->cpu_nr = nr;
@@ -530,10 +537,16 @@ static struct cpu_topo *build_cpu_topology(void)
tp->thread_siblings = addr;
 
for (i = 0; i < nr; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
+
ret = build_cpu_topo(tp, i);
if (ret < 0)
break;
}
+
+out_free:
+   cpu_map__put(map);
if (ret) {
free_cpu_topo(tp);
tp = NULL;
-- 
1.8.3.1



[PATCH v3 1/3] perf: add cpu__max_present_cpu()

2017-02-17 Thread Jan Stancek
Similar to cpu__max_cpu() (which returns max possible CPU),
returns max present CPU.

Signed-off-by: Jan Stancek 
---
 tools/perf/util/cpumap.c | 22 ++
 tools/perf/util/cpumap.h |  1 +
 2 files changed, 23 insertions(+)

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 2c0b52264a46..8c7504939113 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -9,6 +9,7 @@
 #include "asm/bug.h"
 
 static int max_cpu_num;
+static int max_present_cpu_num;
 static int max_node_num;
 static int *cpunode_map;
 
@@ -442,6 +443,7 @@ static void set_max_cpu_num(void)
 
/* set up default */
max_cpu_num = 4096;
+   max_present_cpu_num = 4096;
 
mnt = sysfs__mountpoint();
if (!mnt)
@@ -455,6 +457,17 @@ static void set_max_cpu_num(void)
}
 
ret = get_max_num(path, &max_cpu_num);
+   if (ret)
+   goto out;
+
+   /* get the highest present cpu number for a sparse allocation */
+   ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt);
+   if (ret == PATH_MAX) {
+   pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX);
+   goto out;
+   }
+
+   ret = get_max_num(path, &max_present_cpu_num);
 
 out:
if (ret)
@@ -505,6 +518,15 @@ int cpu__max_cpu(void)
return max_cpu_num;
 }
 
+int cpu__max_present_cpu(void)
+{
+   if (unlikely(!max_present_cpu_num))
+   set_max_cpu_num();
+
+   return max_present_cpu_num;
+}
+
+
 int cpu__get_node(int cpu)
 {
if (unlikely(cpunode_map == NULL)) {
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 06bd689f5989..1a0549af8f5c 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -62,6 +62,7 @@ static inline bool cpu_map__empty(const struct cpu_map *map)
 
 int cpu__max_node(void);
 int cpu__max_cpu(void);
+int cpu__max_present_cpu(void);
 int cpu__get_node(int cpu);
 
 int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
-- 
1.8.3.1



[PATCH v3 3/3] perf: replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map

2017-02-17 Thread Jan Stancek
There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs:

1. offline/absent CPUs will have their socket_id and core_id set to -1
   which triggers:
   "socket_id number is too big.You may need to upgrade the perf tool."

2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on
   _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above.
   Users of perf_env.cpu[] are using CPU id as index. This can lead
   to read beyond what was allocated:
   ==19991== Invalid read of size 4
   ==19991==at 0x490CEB: check_cpu_topology (topology.c:69)
   ==19991==by 0x490CEB: test_session_topology (topology.c:106)
   ...

For example:
  _SC_NPROCESSORS_CONF == 16
  available: 2 nodes (0-1)
  node 0 cpus: 0 6 8 10 16 22 24 26
  node 0 size: 12004 MB
  node 0 free: 9470 MB
  node 1 cpus: 1 7 9 11 23 25 27
  node 1 size: 12093 MB
  node 1 free: 9406 MB
  node distances:
  node   0   1
0:  10  20
1:  20  10

This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF
to max_present_cpu and updates any user of cpu_topology_map to iterate
with nr_cpus_avail. As consequence HEADER_CPU_TOPOLOGY core_id and
socket_id lists get longer, but maintain compatibility with pre-patch
state - index to cpu_topology_map is CPU id.

 ./perf test 36 -v
36: Session topology   :
--- start ---
test child forked, pid 22211
templ file: /tmp/perf-test-gmdX5i
CPU 0, core 0, socket 0
CPU 1, core 0, socket 1
CPU 6, core 10, socket 0
CPU 7, core 10, socket 1
CPU 8, core 1, socket 0
CPU 9, core 1, socket 1
CPU 10, core 9, socket 0
CPU 11, core 9, socket 1
CPU 16, core 0, socket 0
CPU 22, core 10, socket 0
CPU 23, core 10, socket 1
CPU 24, core 1, socket 0
CPU 25, core 1, socket 1
CPU 26, core 9, socket 0
CPU 27, core 9, socket 1
test child finished with 0
 end 
Session topology: Ok

Signed-off-by: Jan Stancek 
---
 tools/perf/builtin-stat.c   |  2 +-
 tools/perf/tests/topology.c |  4 +++-
 tools/perf/util/env.c   |  2 +-
 tools/perf/util/header.c| 16 +---
 4 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a02f2e965628..9e2be1c52fbd 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, 
struct cpu_map *map, i
 
cpu = map->map[idx];
 
-   if (cpu >= env->nr_cpus_online)
+   if (cpu >= env->nr_cpus_avail)
return -1;
 
return cpu;
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 98fe69ac553c..803f893550d6 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map)
session = perf_session__new(&file, false, NULL);
TEST_ASSERT_VAL("can't get session", session);
 
-   for (i = 0; i < session->header.env.nr_cpus_online; i++) {
+   for (i = 0; i < session->header.env.nr_cpus_avail; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
pr_debug("CPU %d, core %d, socket %d\n", i,
 session->header.env.cpu[i].core_id,
 session->header.env.cpu[i].socket_id);
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index bb964e86b09d..075fc77286bf 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env)
return 0;
 
if (env->nr_cpus_avail == 0)
-   env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF);
+   env->nr_cpus_avail = cpu__max_present_cpu();
 
nr_cpus = env->nr_cpus_avail;
if (nr_cpus == -1)
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index ca0f12fb5fd3..3a79a0f10bf6 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -293,11 +293,7 @@ static int write_nrcpus(int fd, struct perf_header *h 
__maybe_unused,
u32 nrc, nra;
int ret;
 
-   nr = sysconf(_SC_NPROCESSORS_CONF);
-   if (nr < 0)
-   return -1;
-
-   nrc = (u32)(nr & UINT_MAX);
+   nrc = cpu__max_present_cpu();
 
nr = sysconf(_SC_NPROCESSORS_ONLN);
if (nr < 0)
@@ -511,9 +507,7 @@ static struct cpu_topo *build_cpu_topology(void)
int ret = -1;
struct cpu_map *map;
 
-   ncpus = sysconf(_SC_NPROCESSORS_CONF);
-   if (ncpus < 0)
-   return NULL;
+   ncpus = cpu__max_present_cpu();
 
/* build online CPU map */
map = cpu_map__new(NULL);
@@ -1137,7 +1131,7 @@ static void print_cpu_topology(struct perf_header *ph, 
int fd __maybe_unused,
 {
int nr, i;
char *str;
-   int cpu_nr = ph->env.nr_cpus_online;
+   int cpu_nr = ph->env.nr_cpus_avail;

Re: [PATCH v2 2/3] perf: make build_cpu_topology skip offline/absent CPUs

2017-02-15 Thread Jan Stancek


- Original Message -
> From: "Jiri Olsa" 
> To: "Jan Stancek" 
> Cc: linux-kernel@vger.kernel.org, pet...@infradead.org, mi...@redhat.com, 
> a...@kernel.org, "alexander shishkin"
> , jo...@kernel.org, mhira...@kernel.org
> Sent: Tuesday, 14 February, 2017 12:01:10 PM
> Subject: Re: [PATCH v2 2/3] perf: make build_cpu_topology skip offline/absent 
> CPUs
> 
> On Mon, Feb 13, 2017 at 04:34:35PM +0100, Jan Stancek wrote:
> 
> SNIP
> 
> > This patch makes build_cpu_topology() skip offline/absent CPUs,
> > by checking their presence against cpu_map built from online CPUs.
> > 
> > Signed-off-by: Jan Stancek 
> > ---
> >  tools/perf/util/header.c | 21 +
> >  1 file changed, 17 insertions(+), 4 deletions(-)
> > 
> > Changes in v2:
> > - drop out label, use return NULL where possible
> > 
> > diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
> > index d89c9c7ef4e5..4b0ea4e92e9d 100644
> > --- a/tools/perf/util/header.c
> > +++ b/tools/perf/util/header.c
> > @@ -503,24 +503,31 @@ static void free_cpu_topo(struct cpu_topo *tp)
> >  
> >  static struct cpu_topo *build_cpu_topology(void)
> >  {
> > -   struct cpu_topo *tp;
> > +   struct cpu_topo *tp = NULL;
> > void *addr;
> > u32 nr, i;
> > size_t sz;
> > long ncpus;
> > -   int ret = -1;
> > +   int ret = 0;
> 
> hum, shoudn't we fail if we dont find any online cpu?
> 
> jirka

You're right, there's no reason to change this. I'll send v3.


[PATCH v2 1/3] perf: add cpu__max_present_cpu()

2017-02-13 Thread Jan Stancek
Similar to cpu__max_cpu() (which returns max possible CPU),
returns max present CPU.

Signed-off-by: Jan Stancek 
---
 tools/perf/util/cpumap.c | 22 ++
 tools/perf/util/cpumap.h |  1 +
 2 files changed, 23 insertions(+)

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 2c0b52264a46..8c7504939113 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -9,6 +9,7 @@
 #include "asm/bug.h"
 
 static int max_cpu_num;
+static int max_present_cpu_num;
 static int max_node_num;
 static int *cpunode_map;
 
@@ -442,6 +443,7 @@ static void set_max_cpu_num(void)
 
/* set up default */
max_cpu_num = 4096;
+   max_present_cpu_num = 4096;
 
mnt = sysfs__mountpoint();
if (!mnt)
@@ -455,6 +457,17 @@ static void set_max_cpu_num(void)
}
 
ret = get_max_num(path, &max_cpu_num);
+   if (ret)
+   goto out;
+
+   /* get the highest present cpu number for a sparse allocation */
+   ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt);
+   if (ret == PATH_MAX) {
+   pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX);
+   goto out;
+   }
+
+   ret = get_max_num(path, &max_present_cpu_num);
 
 out:
if (ret)
@@ -505,6 +518,15 @@ int cpu__max_cpu(void)
return max_cpu_num;
 }
 
+int cpu__max_present_cpu(void)
+{
+   if (unlikely(!max_present_cpu_num))
+   set_max_cpu_num();
+
+   return max_present_cpu_num;
+}
+
+
 int cpu__get_node(int cpu)
 {
if (unlikely(cpunode_map == NULL)) {
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 06bd689f5989..1a0549af8f5c 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -62,6 +62,7 @@ static inline bool cpu_map__empty(const struct cpu_map *map)
 
 int cpu__max_node(void);
 int cpu__max_cpu(void);
+int cpu__max_present_cpu(void);
 int cpu__get_node(int cpu);
 
 int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
-- 
1.8.3.1



[PATCH v2 3/3] perf: replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map

2017-02-13 Thread Jan Stancek
There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs:

1. offline/absent CPUs will have their socket_id and core_id set to -1
   which triggers:
   "socket_id number is too big.You may need to upgrade the perf tool."

2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on
   _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above.
   Users of perf_env.cpu[] are using CPU id as index. This can lead
   to read beyond what was allocated:
   ==19991== Invalid read of size 4
   ==19991==at 0x490CEB: check_cpu_topology (topology.c:69)
   ==19991==by 0x490CEB: test_session_topology (topology.c:106)
   ...

For example:
  _SC_NPROCESSORS_CONF == 16
  available: 2 nodes (0-1)
  node 0 cpus: 0 6 8 10 16 22 24 26
  node 0 size: 12004 MB
  node 0 free: 9470 MB
  node 1 cpus: 1 7 9 11 23 25 27
  node 1 size: 12093 MB
  node 1 free: 9406 MB
  node distances:
  node   0   1
0:  10  20
1:  20  10

This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF
to max_present_cpu and updates any user of cpu_topology_map to iterate
with nr_cpus_avail. As consequence HEADER_CPU_TOPOLOGY core_id and
socket_id lists get longer, but maintain compatibility with pre-patch
state - index to cpu_topology_map is CPU id.

 ./perf test 36 -v
36: Session topology   :
--- start ---
test child forked, pid 22211
templ file: /tmp/perf-test-gmdX5i
CPU 0, core 0, socket 0
CPU 1, core 0, socket 1
CPU 6, core 10, socket 0
CPU 7, core 10, socket 1
CPU 8, core 1, socket 0
CPU 9, core 1, socket 1
CPU 10, core 9, socket 0
CPU 11, core 9, socket 1
CPU 16, core 0, socket 0
CPU 22, core 10, socket 0
CPU 23, core 10, socket 1
CPU 24, core 1, socket 0
CPU 25, core 1, socket 1
CPU 26, core 9, socket 0
CPU 27, core 9, socket 1
test child finished with 0
 end 
Session topology: Ok

Signed-off-by: Jan Stancek 
---
 tools/perf/builtin-stat.c   |  2 +-
 tools/perf/tests/topology.c |  4 +++-
 tools/perf/util/env.c   |  2 +-
 tools/perf/util/header.c| 16 +---
 4 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index a02f2e965628..9e2be1c52fbd 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, 
struct cpu_map *map, i
 
cpu = map->map[idx];
 
-   if (cpu >= env->nr_cpus_online)
+   if (cpu >= env->nr_cpus_avail)
return -1;
 
return cpu;
diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 98fe69ac553c..803f893550d6 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map)
session = perf_session__new(&file, false, NULL);
TEST_ASSERT_VAL("can't get session", session);
 
-   for (i = 0; i < session->header.env.nr_cpus_online; i++) {
+   for (i = 0; i < session->header.env.nr_cpus_avail; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
pr_debug("CPU %d, core %d, socket %d\n", i,
 session->header.env.cpu[i].core_id,
 session->header.env.cpu[i].socket_id);
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index bb964e86b09d..075fc77286bf 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env)
return 0;
 
if (env->nr_cpus_avail == 0)
-   env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF);
+   env->nr_cpus_avail = cpu__max_present_cpu();
 
nr_cpus = env->nr_cpus_avail;
if (nr_cpus == -1)
diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index 4b0ea4e92e9d..e9384d17ae7d 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -293,11 +293,7 @@ static int write_nrcpus(int fd, struct perf_header *h 
__maybe_unused,
u32 nrc, nra;
int ret;
 
-   nr = sysconf(_SC_NPROCESSORS_CONF);
-   if (nr < 0)
-   return -1;
-
-   nrc = (u32)(nr & UINT_MAX);
+   nrc = cpu__max_present_cpu();
 
nr = sysconf(_SC_NPROCESSORS_ONLN);
if (nr < 0)
@@ -511,9 +507,7 @@ static struct cpu_topo *build_cpu_topology(void)
int ret = 0;
struct cpu_map *map;
 
-   ncpus = sysconf(_SC_NPROCESSORS_CONF);
-   if (ncpus < 0)
-   return NULL;
+   ncpus = cpu__max_present_cpu();
 
/* build online CPU map */
map = cpu_map__new(NULL);
@@ -1137,7 +1131,7 @@ static void print_cpu_topology(struct perf_header *ph, 
int fd __maybe_unused,
 {
int nr, i;
char *str;
-   int cpu_nr = ph->env.nr_cpus_online;
+   int cpu_nr = ph->env.nr_cpus_avail;

[PATCH v2 2/3] perf: make build_cpu_topology skip offline/absent CPUs

2017-02-13 Thread Jan Stancek
When build_cpu_topo() encounters offline/absent CPUs,
it fails to find any sysfs entries and returns failure.
This leads to build_cpu_topology() and write_cpu_topology()
failing as well.

Because HEADER_CPU_TOPOLOGY has not been written, read leaves
cpu_topology_map NULL and we get NULL ptr deref at:

 ...
  cmd_test
   __cmd_test
test_and_print
 run_test
  test_session_topology
   check_cpu_topology

36: Session topology   :
--- start ---
test child forked, pid 14902
templ file: /tmp/perf-test-4CKocW
failed to write feature HEADER_CPU_TOPOLOGY
perf: Segmentation fault
Obtained 9 stack frames.
./perf(sighandler_dump_stack+0x41) [0x5095f1]
/lib64/libc.so.6(+0x35250) [0x7f4b7c3c9250]
./perf(test_session_topology+0x1db) [0x490ceb]
./perf() [0x475b68]
./perf(cmd_test+0x5b9) [0x4763c9]
./perf() [0x4945a3]
./perf(main+0x69f) [0x427e8f]
/lib64/libc.so.6(__libc_start_main+0xf5) [0x7f4b7c3b5b35]
./perf() [0x427fb9]
test child interrupted
 end 
Session topology: FAILED!

This patch makes build_cpu_topology() skip offline/absent CPUs,
by checking their presence against cpu_map built from online CPUs.

Signed-off-by: Jan Stancek 
---
 tools/perf/util/header.c | 21 +
 1 file changed, 17 insertions(+), 4 deletions(-)

Changes in v2:
- drop out label, use return NULL where possible

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index d89c9c7ef4e5..4b0ea4e92e9d 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -503,24 +503,31 @@ static void free_cpu_topo(struct cpu_topo *tp)
 
 static struct cpu_topo *build_cpu_topology(void)
 {
-   struct cpu_topo *tp;
+   struct cpu_topo *tp = NULL;
void *addr;
u32 nr, i;
size_t sz;
long ncpus;
-   int ret = -1;
+   int ret = 0;
+   struct cpu_map *map;
 
ncpus = sysconf(_SC_NPROCESSORS_CONF);
if (ncpus < 0)
return NULL;
 
+   /* build online CPU map */
+   map = cpu_map__new(NULL);
+   if (map == NULL) {
+   pr_debug("failed to get system cpumap\n");
+   return NULL;
+   }
+
nr = (u32)(ncpus & UINT_MAX);
 
sz = nr * sizeof(char *);
-
addr = calloc(1, sizeof(*tp) + 2 * sz);
if (!addr)
-   return NULL;
+   goto out_free;
 
tp = addr;
tp->cpu_nr = nr;
@@ -530,10 +537,16 @@ static struct cpu_topo *build_cpu_topology(void)
tp->thread_siblings = addr;
 
for (i = 0; i < nr; i++) {
+   if (!cpu_map__has(map, i))
+   continue;
+
ret = build_cpu_topo(tp, i);
if (ret < 0)
break;
}
+
+out_free:
+   cpu_map__put(map);
if (ret) {
free_cpu_topo(tp);
tp = NULL;
-- 
1.8.3.1



Re: [PATCH] perf: fix topology test on systems with sparse CPUs

2017-02-02 Thread Jan Stancek
> 
> > When build_cpu_topo() encounters offline/absent CPUs,
> > it fails to find any sysfs entries and returns failure.
> > This leads to build_cpu_topology() and write_cpu_topology()
> > failing as well.
> > 
> > Because HEADER_CPU_TOPOLOGY has not been written, read leaves
> > cpu_topology_map NULL and we get NULL ptr deref at:
> > 
> >  ...
> >   cmd_test
> >__cmd_test
> > test_and_print
> >  run_test
> >   test_session_topology
> >check_cpu_topology
> 
> So IIUIC that's the key issue here.. write_cpu_topology that fails
> to write the TOPO data and following readers crashing on processing
> uncomplete data? if thats the case write_cpu_topology needs to
> be fixed, instead of doing workarounds

It's already late when you are in write_cpu_topology(), because
build_cpu_topology() returned you NULL - there's nothing to write.
That's why patch aims to fix this in build_cpu_topology().

> 
> SNIP
> 
> > u32 nr, i;
> > size_t sz;
> > long ncpus;
> > -   int ret = -1;
> > +   int ret = 0;
> > +   struct cpu_map *map;
> >  
> > ncpus = sysconf(_SC_NPROCESSORS_CONF);
> > if (ncpus < 0)
> > -   return NULL;
> > +   goto out;
> 
> can just return NULL
> 
> > +
> > +   /* build online CPU map */
> > +   map = cpu_map__new(NULL);
> > +   if (map == NULL) {
> > +   pr_debug("failed to get system cpumap\n");
> > +   goto out;
> > +   }
> >  
> > nr = (u32)(ncpus & UINT_MAX);
> >  
> > sz = nr * sizeof(char *);
> > -
> > addr = calloc(1, sizeof(*tp) + 2 * sz);
> > if (!addr)
> > -   return NULL;
> > +   goto out_free;
> >  
> > tp = addr;
> > tp->cpu_nr = nr;
> > @@ -530,14 +537,21 @@ static struct cpu_topo *build_cpu_topology(void)
> > tp->thread_siblings = addr;
> >  
> > for (i = 0; i < nr; i++) {
> > +   if (!cpu_map__has(map, i))
> > +   continue;
> > +
> 
> so this prevents build_cpu_topo to fail due to missing topology
> info because cpu is offline.. can it fail for other reasons?

It's unlikely, though I suppose if you couldn't open and read something
from sysfs (say sysfs is not mounted) it can fail for online CPU too.

> 
> 
> > ret = build_cpu_topo(tp, i);
> > if (ret < 0)
> > break;
> 

SNIP

> For example:
>   _SC_NPROCESSORS_CONF == 16
>   available: 2 nodes (0-1)
>   node 0 cpus: 0 6 8 10 16 22 24 26
>   node 0 size: 12004 MB
>   node 0 free: 9470 MB
>   node 1 cpus: 1 7 9 11 23 25 27
>   node 1 size: 12093 MB
>   node 1 free: 9406 MB
>   node distances:
>   node   0   1
> 0:  10  20
> 1:  20  10
> so what's max_present_cpu in this example?

It's 28, which is the number of core_id/socket_id entries,
for CPUs 0 up to 27.

Regards,
Jan


Re: [PATCH] perf: fix topology test on systems with sparse CPUs

2017-01-31 Thread Jan Stancek
On 01/30/2017 07:49 PM, Jiri Olsa wrote:
> so basically we're changing from avail to online cpus
> 
> have you checked all the users of this FEATURE
> if such change is ok?

Jiri,

It wasn't OK as there are other users who index cpu_topology_map by CPU id.
I decided to give the alternative a try (attached): keep cpu_topology_map
indexed by CPU id, but extend it to fit max present CPU.

So for a system like this one:
  _SC_NPROCESSORS_CONF == 16
  available: 2 nodes (0-1)
  node 0 cpus: 0 6 8 10 16 22 24 26
  node 0 size: 12004 MB
  node 0 free: 9470 MB
  node 1 cpus: 1 7 9 11 23 25 27
  node 1 size: 12093 MB
  node 1 free: 9406 MB
  node distances:
  node   0   1
0:  10  20
1:  20  10

HEADER_NRCPUS before:
nr_cpus_online = 15
nr_cpus_available = 16

HEADER_CPU_TOPOLOGY before:
core_sib_nr: 2
core_siblings: 0,6,8,10,16,22,24,26
core_siblings: 1,7,9,11,23,25,27
thread_sib_nr: 8
thread_siblings: 0,16
thread_siblings: 1
thread_siblings: 6,22
thread_siblings: 7,23
thread_siblings: 8,24
thread_siblings: 9,25
thread_siblings: 10,26
thread_siblings: 11,27
core_id: 0, socket_id: 0
core_id: 0, socket_id: 1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: 10, socket_id: 0
core_id: 10, socket_id: 1
core_id: 1, socket_id: 0
core_id: 1, socket_id: 1
core_id: 9, socket_id: 0
core_id: 9, socket_id: 1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1

HEADER_NRCPUS after:
nr_cpus_online = 15
nr_cpus_available = 28

HEADER_CPU_TOPOLOGY after:
core_sib_nr: 2
core_siblings: 0,6,8,10,16,22,24,26
core_siblings: 1,7,9,11,23,25,27
thread_sib_nr: 8
thread_siblings: 0,16
thread_siblings: 1
thread_siblings: 6,22
thread_siblings: 7,23
thread_siblings: 8,24
thread_siblings: 9,25
thread_siblings: 10,26
thread_siblings: 11,27
core_id: 0, socket_id: 0
core_id: 0, socket_id: 1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: 10, socket_id: 0
core_id: 10, socket_id: 1
core_id: 1, socket_id: 0
core_id: 1, socket_id: 1
core_id: 9, socket_id: 0
core_id: 9, socket_id: 1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: 0, socket_id: 0
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: -1, socket_id: -1
core_id: 10, socket_id: 0
core_id: 10, socket_id: 1
core_id: 1, socket_id: 0
core_id: 1, socket_id: 1
core_id: 9, socket_id: 0
core_id: 9, socket_id: 1

Regards,
Jan

>From 9bf8ece1e397b851beedaeceeb0cd07421ff6f43 Mon Sep 17 00:00:00 2001
Message-Id: <9bf8ece1e397b851beedaeceeb0cd07421ff6f43.1485877985.git.jstan...@redhat.com>
From: Jan Stancek 
Date: Tue, 31 Jan 2017 14:41:46 +0100
Subject: [PATCH 1/3 v2] perf: add cpu__max_present_cpu()

Similar to cpu__max_cpu() (which returns max possible CPU),
returns max present CPU.

Signed-off-by: Jan Stancek 
---
 tools/perf/util/cpumap.c | 22 ++
 tools/perf/util/cpumap.h |  1 +
 2 files changed, 23 insertions(+)

diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c
index 2c0b52264a46..8c7504939113 100644
--- a/tools/perf/util/cpumap.c
+++ b/tools/perf/util/cpumap.c
@@ -9,6 +9,7 @@
 #include "asm/bug.h"
 
 static int max_cpu_num;
+static int max_present_cpu_num;
 static int max_node_num;
 static int *cpunode_map;
 
@@ -442,6 +443,7 @@ static void set_max_cpu_num(void)
 
 	/* set up default */
 	max_cpu_num = 4096;
+	max_present_cpu_num = 4096;
 
 	mnt = sysfs__mountpoint();
 	if (!mnt)
@@ -455,6 +457,17 @@ static void set_max_cpu_num(void)
 	}
 
 	ret = get_max_num(path, &max_cpu_num);
+	if (ret)
+		goto out;
+
+	/* get the highest present cpu number for a sparse allocation */
+	ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt);
+	if (ret == PATH_MAX) {
+		pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX);
+		goto out;
+	}
+
+	ret = get_max_num(path, &max_present_cpu_num);
 
 out:
 	if (ret)
@@ -505,6 +518,15 @@ int cpu__max_cpu(void)
 	return max_cpu_num;
 }
 
+int cpu__max_present_cpu(void)
+{
+	if (unlikely(!max_present_cpu_num))
+		set_max_cpu_num();
+
+	return max_present_cpu_num;
+}
+
+
 int cpu__get_node(int cpu)
 {
 	if (unlikely(cpunode_map == NULL)) {
diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h
index 06bd689f5989..1a0549af8f5c 100644
--- a/tools/perf/util/cpumap.h
+++ b/tools/perf/util/cpumap.h
@@ -62,6 +62,7 @@ static inline bool cpu_map__empty(const struct cpu_map *map)
 
 int cpu__max_node(void);
 int cpu__max_cpu(void);
+int cpu__max_present_cpu(void);
 int cpu__get_node(int cpu);
 
 int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
-- 
1.8.3.1

>From 8f23543a314f359ee0625345bbc6d54b0e278358 Mon Sep 17 00:00:00 2001
Message-Id: <8f23543a314f3

Re: [PATCH] perf: fix topology test on systems with sparse CPUs

2017-01-30 Thread Jan Stancek




- Original Message -
> From: "Jiri Olsa" 
> To: "Jan Stancek" 
> Cc: linux-kernel@vger.kernel.org, pet...@infradead.org, mi...@redhat.com, 
> a...@kernel.org, "alexander shishkin"
> , jo...@kernel.org, mhira...@kernel.org, 
> "rui teng"
> , suka...@linux.vnet.ibm.com
> Sent: Monday, 30 January, 2017 7:49:08 PM
> Subject: Re: [PATCH] perf: fix topology test on systems with sparse CPUs
> 
> On Mon, Jan 30, 2017 at 05:53:34PM +0100, Jan Stancek wrote:
> 
> SNIP
> 
> > +   ret = build_cpu_topo(tp, cpu);
> > if (ret < 0)
> > break;
> > }
> > +
> > +out_free:
> > +   cpu_map__put(map);
> > if (ret) {
> > free_cpu_topo(tp);
> > tp = NULL;
> > }
> > +out:
> > return tp;
> >  }
> >  
> > @@ -575,7 +579,7 @@ static int write_cpu_topology(int fd, struct
> > perf_header *h __maybe_unused,
> > if (ret < 0)
> > goto done;
> >  
> > -   for (j = 0; j < perf_env.nr_cpus_avail; j++) {
> > +   for (j = 0; j < perf_env.nr_cpus_online; j++) {
> 
> so basically we're changing from avail to online cpus
> 
> have you checked all the users of this FEATURE
> if such change is ok?

You're right, I missed some. Looking again, I see at least
perf_env__get_core() could break.

Regards,
Jan


[PATCH] perf: fix topology test on systems with sparse CPUs

2017-01-30 Thread Jan Stancek
Topology test fails on systems with sparse CPUs, e.g.
CPU not present or offline:

36: Test topology in session :
--- start ---
test child forked, pid 23703
templ file: /tmp/perf-test-i2rNki
failed to write feature 13
perf: Segmentation fault

available: 2 nodes (0-1)
node 0 cpus: 0 6 8 10 16 22 24 26
node 0 size: 11797 MB
node 0 free: 10526 MB
node 1 cpus: 1 7 9 11 17 23 25 27
node 1 size: 12065 MB
node 1 free: 10770 MB
node distances:
node   0   1
  0:  10  20
  1:  20  10

Enumerating CPU ids from 0 to _SC_NPROCESSORS_CONF-1 in header.env.cpu[]
doesn't work on system like one above, because some ids are higher than
number of CPUs, and there can be gaps.

On top of that, if CPU is offline, we can't get topology info from
sysfs entries, because they don't exist for offline CPUs.

This patch stores topology data only for online CPUs in header.env.cpu[]
list, regardless of their CPU ids, and then uses cpu_map to translate
index to actual CPU id.

Example:
coreid socketid for CPU0
coreid socketid for CPU1
coreid socketid for CPU6
coreid socketid for CPU7
...

Alternative is we go from 0 to highest CPU id, but CPUs which are
missing would contain some dummy values in topology data.

Example:
coreid socketid for CPU0
coreid socketid for CPU1
-1 -1
-1 -1
-1 -1
-1 -1
coreid socketid for CPU6
coreid socketid for CPU7
...

Signed-off-by: Jan Stancek 
---
 tools/perf/tests/topology.c |  7 ---
 tools/perf/util/env.c   | 40 
 tools/perf/util/header.c| 36 
 3 files changed, 52 insertions(+), 31 deletions(-)

diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c
index 98fe69ac553c..7b0b621ea8c0 100644
--- a/tools/perf/tests/topology.c
+++ b/tools/perf/tests/topology.c
@@ -66,17 +66,18 @@ static int check_cpu_topology(char *path, struct cpu_map 
*map)
TEST_ASSERT_VAL("can't get session", session);
 
for (i = 0; i < session->header.env.nr_cpus_online; i++) {
-   pr_debug("CPU %d, core %d, socket %d\n", i,
+   pr_debug("CPU %d, core %d, socket %d\n", map->map[i],
 session->header.env.cpu[i].core_id,
 session->header.env.cpu[i].socket_id);
}
 
for (i = 0; i < map->nr; i++) {
+   int cpu = map->map[i];
TEST_ASSERT_VAL("Core ID doesn't match",
-   (session->header.env.cpu[map->map[i]].core_id == 
(cpu_map__get_core(map, i, NULL) & 0x)));
+   (session->header.env.cpu[i].core_id == 
(cpu_map__get_core_id(cpu) & 0x)));
 
TEST_ASSERT_VAL("Socket ID doesn't match",
-   (session->header.env.cpu[map->map[i]].socket_id == 
cpu_map__get_socket(map, i, NULL)));
+   (session->header.env.cpu[i].socket_id == 
cpu_map__get_socket_id(cpu)));
}
 
perf_session__delete(session);
diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c
index bb964e86b09d..0c2cae807a61 100644
--- a/tools/perf/util/env.c
+++ b/tools/perf/util/env.c
@@ -60,29 +60,45 @@ int perf_env__set_cmdline(struct perf_env *env, int argc, 
const char *argv[])
 
 int perf_env__read_cpu_topology_map(struct perf_env *env)
 {
-   int cpu, nr_cpus;
+   int cpu, nr_cpus, i, err = 0;
+   struct cpu_map *map;
 
if (env->cpu != NULL)
return 0;
 
-   if (env->nr_cpus_avail == 0)
-   env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF);
+   map = cpu_map__new(NULL);
+   if (map == NULL) {
+   pr_debug("failed to get system cpumap\n");
+   err = -ENOMEM;
+   goto out;
+   }
+
+   if (env->nr_cpus_online == 0)
+   env->nr_cpus_online = map->nr;
 
-   nr_cpus = env->nr_cpus_avail;
-   if (nr_cpus == -1)
-   return -EINVAL;
+   nr_cpus = env->nr_cpus_online;
+   if (nr_cpus == -1 || map->nr < nr_cpus) {
+   err = -EINVAL;
+   goto out_free;
+   }
 
env->cpu = calloc(nr_cpus, sizeof(env->cpu[0]));
-   if (env->cpu == NULL)
-   return -ENOMEM;
+   if (env->cpu == NULL) {
+   err = -ENOMEM;
+   goto out_free;
+   }
 
-   for (cpu = 0; cpu < nr_cpus; ++cpu) {
-   env->cpu[cpu].core_id   = cpu_map__get_core_id(cpu);
-   env->cpu[cpu].socket_id = cpu_map__get_socket_id(cpu);
+   for (i = 0; i < nr_cpus; i++) {
+   cpu = map->map[i];
+   env->cpu[i].core_id = cpu_map__get_core_id(cpu);
+   env->cpu[i].socket_id   = cpu_map__get_socket_id(cpu);
}
 
env->nr_cpus_avail = nr_cpus;
-   return 0;
+out_

unable to load modules with CONFIG_MODVERSIONS=y after commit 8ab2ae655b

2016-12-06 Thread Jan Stancek
Hi,

Starting with 4.9-rc8 / commit 8ab2ae655b ("default exported asm symbols to 
zero")
I'm running into issue with kernel built with CONFIG_MODVERSIONS=y
and (older) binutils (binutils-2.25.1-20.base.el7.ppc64le). 

Modules fail to load, for example:

[3.163646] Found checksum 0 vs module 4829A47E 
[3.163787] dm_mod: disagrees about version of symbol memcpy 
[3.163862] dm_mod: Unknown symbol memcpy (err -22)

Bisect led me to 8ab2ae655b, reverting it allows boot to
progress as before.

Regards,
Jan


Re: [PATCH 0/1] mm/hugetlb: fix huge page reservation leak in private mapping error paths

2016-10-20 Thread Jan Stancek




- Original Message -
> From: "Mike Kravetz" 
> To: linux...@kvack.org, linux-kernel@vger.kernel.org
> Cc: "Aneesh Kumar K . V" , "Naoya Horiguchi" 
> , "Michal
> Hocko" , "Kirill A . Shutemov" 
> , "Hillf Danton"
> , "Dave Hansen" , "Jan 
> Stancek" , "Mike
> Kravetz" 
> Sent: Thursday, 20 October, 2016 5:11:16 AM
> Subject: [PATCH 0/1] mm/hugetlb: fix huge page reservation leak in private 
> mapping error paths
> 
> This issue was discovered by Jan Stancek as described in
> https://lkml.kernel.org/r/57ff7bb4.1070...@redhat.com
> 
> Error paths in hugetlb_cow() and hugetlb_no_page() do not properly clean
> up reservation entries when freeing a newly allocated huge page.  This
> issue was introduced with commit 67961f9db8c4 ("mm/hugetlb: fix huge page
> reserve accounting for private mappings).  That commit uses the information
> in private mapping reserve maps to determine if a reservation was already
> consumed.  This is important in the case of hole punch and truncate as the
> pages are released, but reservation entries are not restored.
> 
> This patch restores the reserve entries in hugetlb_cow and hugetlb_no_page
> such that reserve entries are consistent with the global reservation count.
> 
> The huge page reservation code is quite hard to follow, and this patch
> makes it even more complex.  One thought I had was to change the way
> hole punch and truncate work so that private mapping pages are not thrown
> away.  This would eliminate the need for this patch as well as 67961f9db8c4.
> It would change the existing semantics (as seen by the user) in this area,
> but I believe the documentation (man pages) say the behavior is unspecified.
> This could be a future change as well as rewriting the existing reservation
> code to make it easier to understand/maintain.  Thoughts?
> 
> In any case, this patch addresses the immediate issue.

Mike,

Just to confirm, I ran this patch on my setup (without the patch from Aneesh)
with libhugetlbfs testsuite in loop for several hours. There were no
ENOMEM/OOM failures, I did not observe resv leak after it finished.

Regards,
Jan

> 
> Mike Kravetz (1):
>   mm/hugetlb: fix huge page reservation leak in private mapping error
> paths
> 
>  mm/hugetlb.c | 66
>  
>  1 file changed, 66 insertions(+)
> 
> --
> 2.7.4
> 
> 


Re: [bug/regression] libhugetlbfs testsuite failures and OOMs eventually kill my system

2016-10-18 Thread Jan Stancek




- Original Message -
> Jan Stancek  writes:
> > Hi Mike,
> >
> > Revert of 67961f9db8c4 helps, I let whole suite run for 100 iterations,
> > there were no issues.
> >
> > I cut down reproducer and removed last mmap/write/munmap as that is enough
> > to reproduce the problem. Then I started introducing some traces into
> > kernel
> > and noticed that on ppc I get 3 faults, while on x86 I get only 2.
> >
> > Interesting is the 2nd fault, that is first write after mapping as PRIVATE.
> > Following condition fails on ppc first time:
> > if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
> > but it's immediately followed by fault that looks identical
> > and in that one it evaluates as true.
> >
> > Same with alloc_huge_page(), on x86_64 it's called twice, on ppc three
> > times.
> > In 2nd call vma_needs_reservation() returns 0, in 3rd it returns 1.
> >
> >  ppc -> 2nd and 3rd fault ---
> > mmap(MAP_PRIVATE)
> > hugetlb_fault address: 3e00, flags: 55
> > hugetlb_cow old_page: f10fc000
> > alloc_huge_page ret: f110
> > hugetlb_cow ptep: c00455b27cf8, pte_same: 0
> > free_huge_page page: f110, restore_reserve: 1
> > hugetlb_fault address: 3e00, flags: 55
> > hugetlb_cow old_page: f10fc000
> > alloc_huge_page ret: f110
> > hugetlb_cow ptep: c00455b27cf8, pte_same: 1
> >
> > --- x86_64 -> 2nd fault ---
> > mmap(MAP_PRIVATE)
> > hugetlb_fault address: 7f71a420, flags: 55
> > hugetlb_cow address 0x7f71a420, old_page: ea0008d2
> > alloc_huge_page ret: ea0008d38000
> > hugetlb_cow ptep: 8802314c7908, pte_same: 1
> >
> > Regards,
> > Jan
> >
> 
> Can you check with the below patch. I ran the corrupt-by-cow-opt test with
> this patch
> and resv count got correctly updated.

I am running libhugetlbfs suite with patch below in loop for
~2 hours now and I don't see any problems/ENOMEMs/OOMs or
leaked resv pages:

0   hugepages-16384kB/free_hugepages
0   hugepages-16384kB/nr_hugepages
0   hugepages-16384kB/nr_hugepages_mempolicy
0   hugepages-16384kB/nr_overcommit_hugepages
0   hugepages-16384kB/resv_hugepages
0   hugepages-16384kB/surplus_hugepages
0   hugepages-16777216kB/free_hugepages
0   hugepages-16777216kB/nr_hugepages
0   hugepages-16777216kB/nr_hugepages_mempolicy
0   hugepages-16777216kB/nr_overcommit_hugepages
0   hugepages-16777216kB/resv_hugepages
0   hugepages-16777216kB/surplus_hugepages

Regards,
Jan

> 
> commit fb2e0c081d2922c8aaa49bbe166472aac68ef5e1
> Author: Aneesh Kumar K.V 
> Date:   Tue Oct 18 11:23:11 2016 +0530
> 
> mm/hugetlb: Use the right pte val for compare in hugetlb_cow
> 
> We cannot use the pte value used in set_pte_at for pte_same comparison,
> because archs like ppc64, filter/add new pte flag in set_pte_at. Instead
> fetch the pte value inside hugetlb_cow. We are comparing pte value to
> make sure the pte didn't change since we dropped the page table lock.
> hugetlb_cow get called with page table lock held, and we can take a copy
> of the pte value before we drop the page table lock.
> 
> Signed-off-by: Aneesh Kumar K.V 
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index ec49d9ef1eef..da8fbd02b92e 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -3386,15 +3386,17 @@ static void unmap_ref_private(struct mm_struct *mm,
> struct vm_area_struct *vma,
>   * Keep the pte_same checks anyway to make transition from the mutex easier.
>   */
>  static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
> - unsigned long address, pte_t *ptep, pte_t pte,
> - struct page *pagecache_page, spinlock_t *ptl)
> +unsigned long address, pte_t *ptep,
> +struct page *pagecache_page, spinlock_t *ptl)
>  {
> + pte_t pte;
>   struct hstate *h = hstate_vma(vma);
>   struct page *old_page, *new_page;
>   int ret = 0, outside_reserve = 0;
>   unsigned long mmun_start;   /* For mmu_notifiers */
>   unsigned long mmun_end; /* For mmu_notifiers */
>  
> + pte = huge_ptep_get(ptep);
>   old_page = pte_page(pte);
>  
>  retry_avoidcopy:
> @@ -3668,7 +3670,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct
> vm_area_struct *vma,
>   hugetlb_count_add(pages_per_huge_page(h), mm);
>   if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
>   /* Optimization, do the COW witho

Re: [bug/regression] libhugetlbfs testsuite failures and OOMs eventually kill my system

2016-10-17 Thread Jan Stancek

- Original Message -
> From: "Mike Kravetz" 
> To: "Jan Stancek" , linux...@kvack.org, 
> linux-kernel@vger.kernel.org
> Cc: "hillf zj" , "dave hansen" 
> , "kirill shutemov"
> , mho...@suse.cz, n-horigu...@ah.jp.nec.com, 
> "aneesh kumar"
> , "iamjoonsoo kim" 
> Sent: Saturday, 15 October, 2016 1:57:31 AM
> Subject: Re: [bug/regression] libhugetlbfs testsuite failures and OOMs 
> eventually kill my system
> 
> 
> It is pretty consistent that we leak a reserve page every time this
> test is run.
> 
> The interesting thing is that corrupt-by-cow-opt is a very simple
> test case.  commit 67961f9db8c4 potentially changes the return value
> of the functions vma_has_reserves() and vma_needs/commit_reservation()
> for the owner (HPAGE_RESV_OWNER) of private mappings.  running the
> test with and without the commit results in the same return values for
> these routines on x86.  And, no leaked reserve pages.
> 
> Is it possible to revert this commit and run the libhugetlbs tests
> (func and stress) again while monitoring the counts in /sys?  The
> counts should go to zero after cleanup as you describe above.  I just
> want to make sure that this commit is causing all the problems you
> are seeing.  If it is, then we can consider reverting and I can try
> to think of another way to address the original issue.
> 
> Thanks for your efforts on this.  I can not reproduce on x86 or sparc
> and do not see any similar symptoms on these architectures.
> 
> --
> Mike Kravetz
> 

Hi Mike,

Revert of 67961f9db8c4 helps, I let whole suite run for 100 iterations,
there were no issues.

I cut down reproducer and removed last mmap/write/munmap as that is enough
to reproduce the problem. Then I started introducing some traces into kernel
and noticed that on ppc I get 3 faults, while on x86 I get only 2.

Interesting is the 2nd fault, that is first write after mapping as PRIVATE.
Following condition fails on ppc first time:
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
but it's immediately followed by fault that looks identical
and in that one it evaluates as true.

Same with alloc_huge_page(), on x86_64 it's called twice, on ppc three times.
In 2nd call vma_needs_reservation() returns 0, in 3rd it returns 1.

 ppc -> 2nd and 3rd fault ---
mmap(MAP_PRIVATE)
hugetlb_fault address: 3e00, flags: 55
hugetlb_cow old_page: f10fc000
alloc_huge_page ret: f110
hugetlb_cow ptep: c00455b27cf8, pte_same: 0
free_huge_page page: f110, restore_reserve: 1
hugetlb_fault address: 3e00, flags: 55
hugetlb_cow old_page: f10fc000
alloc_huge_page ret: f110
hugetlb_cow ptep: c00455b27cf8, pte_same: 1

--- x86_64 -> 2nd fault ---
mmap(MAP_PRIVATE)
hugetlb_fault address: 7f71a420, flags: 55
hugetlb_cow address 0x7f71a420, old_page: ea0008d2
alloc_huge_page ret: ea0008d38000
hugetlb_cow ptep: 8802314c7908, pte_same: 1

Regards,
Jan


Re: [bug/regression] libhugetlbfs testsuite failures and OOMs eventually kill my system

2016-10-14 Thread Jan Stancek
On 10/14/2016 01:26 AM, Mike Kravetz wrote:
> 
> Hi Jan,
> 
> Any chance you can get the contents of /sys/kernel/mm/hugepages
> before and after the first run of libhugetlbfs testsuite on Power?
> Perhaps a script like:
> 
> cd /sys/kernel/mm/hugepages
> for f in hugepages-*/*; do
>   n=`cat $f`;
>   echo -e "$n\t$f";
> done
> 
> Just want to make sure the numbers look as they should.
> 

Hi Mike,

Numbers are below. I have also isolated a single testcase from "func"
group of tests: corrupt-by-cow-opt [1]. This test stops working if I
run it 19 times (with 20 hugepages). And if I disable this test,
"func" group tests can all pass repeatedly.

[1] 
https://github.com/libhugetlbfs/libhugetlbfs/blob/master/tests/corrupt-by-cow-opt.c

Regards,
Jan

Kernel is v4.8-14230-gb67be92, with reboot between each run.
1) Only func tests
System boot
After setup:
20  hugepages-16384kB/free_hugepages
20  hugepages-16384kB/nr_hugepages
20  hugepages-16384kB/nr_hugepages_mempolicy
0   hugepages-16384kB/nr_overcommit_hugepages
0   hugepages-16384kB/resv_hugepages
0   hugepages-16384kB/surplus_hugepages
0   hugepages-16777216kB/free_hugepages
0   hugepages-16777216kB/nr_hugepages
0   hugepages-16777216kB/nr_hugepages_mempolicy
0   hugepages-16777216kB/nr_overcommit_hugepages
0   hugepages-16777216kB/resv_hugepages
0   hugepages-16777216kB/surplus_hugepages

After func tests:
** TEST SUMMARY
*  16M
*  32-bit 64-bit
* Total testcases: 0 85
* Skipped: 0  0
*PASS: 0 81
*FAIL: 0  4
*Killed by signal: 0  0
*   Bad configuration: 0  0
*   Expected FAIL: 0  0
* Unexpected PASS: 0  0
* Strange test result: 0  0

26  hugepages-16384kB/free_hugepages
26  hugepages-16384kB/nr_hugepages
26  hugepages-16384kB/nr_hugepages_mempolicy
0   hugepages-16384kB/nr_overcommit_hugepages
1   hugepages-16384kB/resv_hugepages
0   hugepages-16384kB/surplus_hugepages
0   hugepages-16777216kB/free_hugepages
0   hugepages-16777216kB/nr_hugepages
0   hugepages-16777216kB/nr_hugepages_mempolicy
0   hugepages-16777216kB/nr_overcommit_hugepages
0   hugepages-16777216kB/resv_hugepages
0   hugepages-16777216kB/surplus_hugepages

After test cleanup:
 umount -a -t hugetlbfs
 hugeadm --pool-pages-max ${HPSIZE}:0

1   hugepages-16384kB/free_hugepages
1   hugepages-16384kB/nr_hugepages
1   hugepages-16384kB/nr_hugepages_mempolicy
0   hugepages-16384kB/nr_overcommit_hugepages
1   hugepages-16384kB/resv_hugepages
1   hugepages-16384kB/surplus_hugepages
0   hugepages-16777216kB/free_hugepages
0   hugepages-16777216kB/nr_hugepages
0   hugepages-16777216kB/nr_hugepages_mempolicy
0   hugepages-16777216kB/nr_overcommit_hugepages
0   hugepages-16777216kB/resv_hugepages
0   hugepages-16777216kB/surplus_hugepages

---

2) Only stress tests
System boot
After setup:
20  hugepages-16384kB/free_hugepages
20  hugepages-16384kB/nr_hugepages
20  hugepages-16384kB/nr_hugepages_mempolicy
0   hugepages-16384kB/nr_overcommit_hugepages
0   hugepages-16384kB/resv_hugepages
0   hugepages-16384kB/surplus_hugepages
0   hugepages-16777216kB/free_hugepages
0   hugepages-16777216kB/nr_hugepages
0   hugepages-16777216kB/nr_hugepages_mempolicy
0   hugepages-16777216kB/nr_overcommit_hugepages
0   hugepages-16777216kB/resv_hugepages
0   hugepages-16777216kB/surplus_hugepages

After stress tests:
20  hugepages-16384kB/free_hugepages
20  hugepages-16384kB/nr_hugepages
20  hugepages-16384kB/nr_hugepages_mempolicy
0   hugepages-16384kB/nr_overcommit_hugepages
17  hugepages-16384kB/resv_hugepages
0   hugepages-16384kB/surplus_hugepages
0   hugepages-16777216kB/free_hugepages
0   hugepages-16777216kB/nr_hugepages
0   hugepages-16777216kB/nr_hugepages_mempolicy
0   hugepages-16777216kB/nr_overcommit_hugepages
0   hugepages-16777216kB/resv_hugepages
0   hugepages-16777216kB/surplus_hugepages

After cleanup:
17  hugepages-16384kB/free_hugepages
17  hugepages-16384kB/nr_hugepages
17  hugepages-16384kB/nr_hugepages_mempolicy
0   hugepages-16384kB/nr_overcommit_hugepages
17  hugepages-16384kB/resv_hugepages
17  hugepages-16384kB/surplus_hugepages
0   hugepages-16777216kB/free_hugepages
0   hugepages-16777216kB/nr_hugepages
0   hugepages-16777216kB/nr_hugepages_mempolicy
0   hugepages-16777216kB/nr_overcommit_hugepages
0   hugepages-16777216kB/resv_hugepages
0   hugepages-16777216kB/surplus_hugepages

---

3) only corrupt-by-cow-opt

System boot
After setup:
20  hugepages-16384kB/free_hugepages
20  hugepages-16384kB/nr_hugepages
20  hugepages-16384kB/nr_hugepages_mempolicy
0   hugepages-16384kB/nr_overcommit_hugepages
0   huge

[bug/regression] libhugetlbfs testsuite failures and OOMs eventually kill my system

2016-10-13 Thread Jan Stancek
Hi,

I'm running into ENOMEM failures with libhugetlbfs testsuite [1] on
a power8 lpar system running 4.8 or latest git [2]. Repeated runs of
this suite trigger multiple OOMs, that eventually kill entire system,
it usually takes 3-5 runs:

 * Total System Memory..:  18024 MB
 * Shared Mem Max Mapping...:320 MB
 * System Huge Page Size: 16 MB
 * Available Huge Pages.: 20
 * Total size of Huge Pages.:320 MB
 * Remaining System Memory..:  17704 MB
 * Huge Page User Group.:  hugepages (1001)

I see this only on ppc (BE/LE), x86_64 seems unaffected and successfully
ran the tests for ~12 hours.

Bisect has identified following patch as culprit:
  commit 67961f9db8c477026ea20ce05761bde6f8bf85b0
  Author: Mike Kravetz 
  Date:   Wed Jun 8 15:33:42 2016 -0700
mm/hugetlb: fix huge page reserve accounting for private mappings


Following patch (made with my limited insight) applied to
latest git [2] fixes the problem for me:

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ec49d9e..7261583 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1876,7 +1876,7 @@ static long __vma_reservation_common(struct hstate *h,
 * return value of this routine is the opposite of the
 * value returned from reserve map manipulation routines above.
 */
-   if (ret)
+   if (ret >= 0)
return 0;
else
return 1;

Regards,
Jan

[1] https://github.com/libhugetlbfs/libhugetlbfs
[2] v4.8-14230-gb67be92


Re: [PATCH] perf powerpc: Don't call perf_event_disable from atomic context

2016-10-05 Thread Jan Stancek


- Original Message -
> From: "Michael Ellerman" 
> To: "Jiri Olsa" , "Peter Zijlstra" 
> Cc: "lkml" , "Ingo Molnar" , 
> "Michael Neuling" ,
> "Paul Mackerras" , "Alexander Shishkin" 
> , "Jan Stancek"
> 
> Sent: Tuesday, 4 October, 2016 6:08:27 AM
> Subject: Re: [PATCH] perf powerpc: Don't call perf_event_disable from atomic 
> context
> 
> Jiri Olsa  writes:
> 
> > The trinity syscall fuzzer triggered following WARN on powerpc:
> >   WARNING: CPU: 9 PID: 2998 at arch/powerpc/kernel/hw_breakpoint.c:278
> >   ...
> >   NIP [c093aedc] .hw_breakpoint_handler+0x28c/0x2b0
> >   LR [c093aed8] .hw_breakpoint_handler+0x288/0x2b0
> >   Call Trace:
> >   [c002f7933580] [c093aed8] .hw_breakpoint_handler+0x288/0x2b0
> >   (unreliable)
> >   [c002f7933630] [c00f671c] .notifier_call_chain+0x7c/0xf0
> >   [c002f79336d0] [c00f6abc]
> >   .__atomic_notifier_call_chain+0xbc/0x1c0
> >   [c002f7933780] [c00f6c40] .notify_die+0x70/0xd0
> >   [c002f7933820] [c001a74c] .do_break+0x4c/0x100
> >   [c002f7933920] [c00089fc] handle_dabr_fault+0x14/0x48
> 
> Is that the full stack trace? It doesn't look like it.
> 
> And were you running trinity as root or regular user?

As regular user:

# adduser dummy
# su dummy /mnt/testarea/trinity --children $proc_num -m --syslog -q -T DIE

Regards,
Jan


[PATCH] crypto: testmgr - add guard to dst buffer for ahash_export

2016-09-28 Thread Jan Stancek
Add a guard to 'state' buffer and warn if its consistency after
call to crypto_ahash_export() changes, so that any write that
goes beyond advertised statesize (and thus causing potential
memory corruption [1]) is more visible.

[1] https://marc.info/?l=linux-crypto-vger&m=147467656516085

Signed-off-by: Jan Stancek 
Cc: Herbert Xu 
Cc: Marcelo Cerri 
---
 crypto/testmgr.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 5c9d5a5e7b65..96343bcae01e 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -209,16 +209,19 @@ static int ahash_partial_update(struct ahash_request 
**preq,
char *state;
struct ahash_request *req;
int statesize, ret = -EINVAL;
+   const char guard[] = { 0x00, 0xba, 0xad, 0x00 };
 
req = *preq;
statesize = crypto_ahash_statesize(
crypto_ahash_reqtfm(req));
-   state = kmalloc(statesize, GFP_KERNEL);
+   state = kmalloc(statesize + sizeof(guard), GFP_KERNEL);
if (!state) {
pr_err("alt: hash: Failed to alloc state for %s\n", algo);
goto out_nostate;
}
+   memcpy(state + statesize, guard, sizeof(guard));
ret = crypto_ahash_export(req, state);
+   WARN_ON(memcmp(state + statesize, guard, sizeof(guard)));
if (ret) {
pr_err("alt: hash: Failed to export() for %s\n", algo);
goto out;
-- 
1.8.3.1



Re: [bug] crypto/vmx/p8_ghash memory corruption in 4.8-rc7

2016-09-28 Thread Jan Stancek
> Jan,
> 
> Can you check if the problem occurs with this patch?

No issues in over-night test with this patch.

> --- a/drivers/crypto/vmx/vmx.c
> +++ b/drivers/crypto/vmx/vmx.c
> @@ -28,6 +28,8 @@
>  #include 
>  #include 
>  
> +int p8_ghash_fallback_descsize(void);
> +
>  extern struct shash_alg p8_ghash_alg;
>  extern struct crypto_alg p8_aes_alg;
>  extern struct crypto_alg p8_aes_cbc_alg;
> @@ -45,6 +47,7 @@ int __init p8_init(void)
>  {
>   int ret = 0;
>   struct crypto_alg **alg_it;
> + int ghash_descsize;
>  
>   for (alg_it = algs; *alg_it; alg_it++) {
>   ret = crypto_register_alg(*alg_it);
> @@ -59,6 +62,12 @@ int __init p8_init(void)
>   if (ret)
>   return ret;
>  
> + ghash_descsize = p8_ghash_fallback_descsize();
> + if (ghash_descsize < 0) {
> + printk(KERN_ERR "Cannot get descsize for p8_ghash fallback\n");
> + return ghash_descsize;
> + }
> + p8_ghash_alg.descsize += ghash_descsize;
>   ret = crypto_register_shash(&p8_ghash_alg);
>   if (ret) {
>   for (alg_it = algs; *alg_it; alg_it++)

I'd suggest to move this inside vmx/ghash.c to a new function, so all p8_ghash
related code is at single place. Then p8_init() would just call the new
function:
-ret = crypto_register_shash(&p8_ghash_alg);
+ret = register_p8_ghash();

Regards,
Jan


Re: [bug] crypto/vmx/p8_ghash memory corruption in 4.8-rc7

2016-09-28 Thread Jan Stancek




- Original Message -
> From: "Herbert Xu" 
> To: "Marcelo Cerri" 
> Cc: "Jan Stancek" , "rui y wang" , 
> mhce...@linux.vnet.ibm.com,
> leosi...@linux.vnet.ibm.com, pfsmor...@linux.vnet.ibm.com, 
> linux-cry...@vger.kernel.org,
> linuxppc-...@lists.ozlabs.org, linux-kernel@vger.kernel.org
> Sent: Wednesday, 28 September, 2016 4:45:49 AM
> Subject: Re: [bug] crypto/vmx/p8_ghash memory corruption in 4.8-rc7
> 
> On Tue, Sep 27, 2016 at 04:46:44PM -0300, Marcelo Cerri wrote:
> > 
> > Can you check if the problem occurs with this patch?
> 
> In light of the fact that padlock-sha is the correct example
> to follow, you only need to add one line to the init_tfm fucntion
> to update the descsize based on that of the fallback.

Thanks for clearing up how this works in padlock-sha, but
we are not exactly in same situation with p8_ghash.

p8_ghash_init_tfm() already updates descsize. Problem in original report
is that without custom export/import/statesize p8_ghash_alg.statesize
gets initialized by shash_prepare_alg() to alg->descsize:

crash> p p8_ghash_alg.statesize
$1 = 56

testmgr allocates space for export based on crypto_shash_statesize(),
but shash_default_export() writes based on crypto_shash_descsize():
[8.297902] state: c004b873aa80, statesize: 56
[8.297932] shash_default_export memcpy c004b873aa80 c004b8607da0, 
len: 76

so I think we need either:
1) make sure p8_ghash_alg.descsize is correct before we register shash,
   this is what Marcelo's last patch is doing
2) provide custom export/import/statesize for p8_ghash_alg

Regards,
Jan

> 
> Thanks,
> --
> Email: Herbert Xu 
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
> 


Re: [bug] crypto/vmx/p8_ghash memory corruption in 4.8-rc7

2016-09-27 Thread Jan Stancek




- Original Message -
> From: "Herbert Xu" 
> To: "Marcelo Cerri" 
> Cc: "Jan Stancek" , "rui y wang" , 
> mhce...@linux.vnet.ibm.com,
> leosi...@linux.vnet.ibm.com, pfsmor...@linux.vnet.ibm.com, 
> linux-cry...@vger.kernel.org,
> linuxppc-...@lists.ozlabs.org, linux-kernel@vger.kernel.org
> Sent: Tuesday, 27 September, 2016 5:08:26 AM
> Subject: Re: [bug] crypto/vmx/p8_ghash memory corruption in 4.8-rc7
> 
> On Mon, Sep 26, 2016 at 02:43:17PM -0300, Marcelo Cerri wrote:
> > 
> > Wouldn't be enough to provide a pair of import/export functions as the
> > padlock-sha driver does?
> 
> I don't think that will help as ultimately you need to call the
> export function on the fallback and that's what requires the extra
> memory.  In fact very operation involving the fallback will need
> that extra memory too.

So, if we extended p8_ghash_desc_ctx to accommodate fallback_desc's ctx
and then provided statesize/import/export, would that be acceptable?

struct p8_ghash_desc_ctx {
...
struct shash_desc fallback_desc;
+   char fallback_ctx[sizeof(struct ghash_desc_ctx)];


Also, does that mean that padlock_sha has similar problem?
It does not seem to reserve any space for fallback __ctx and it calls
init()/update()/export() with padlock_sha_desc's fallback:

struct padlock_sha_desc {
struct shash_desc fallback;
};

static struct shash_alg sha1_alg = {
.descsize   =   sizeof(struct padlock_sha_desc),

Regards,
Jan

> 
> Cheers,
> --
> Email: Herbert Xu 
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
> 


Re: [bug] crypto/vmx/p8_ghash memory corruption in 4.8-rc7

2016-09-26 Thread Jan Stancek



- Original Message -
> From: "Marcelo Cerri" 
> To: "Jan Stancek" 
> Cc: "rui y wang" , herb...@gondor.apana.org.au, 
> mhce...@linux.vnet.ibm.com,
> leosi...@linux.vnet.ibm.com, pfsmor...@linux.vnet.ibm.com, 
> linux-cry...@vger.kernel.org,
> linuxppc-...@lists.ozlabs.org, linux-kernel@vger.kernel.org
> Sent: Monday, 26 September, 2016 4:15:10 PM
> Subject: Re: [bug] crypto/vmx/p8_ghash memory corruption in 4.8-rc7
> 
> Hi Jan,
> 
> Just out of curiosity, have you tried to use "76" on both values to
> check if the problem still happens?

I did, I haven't seen any panics with such patch:

@@ -211,7 +212,7 @@ struct shash_alg p8_ghash_alg = {
.update = p8_ghash_update,
.final = p8_ghash_final,
.setkey = p8_ghash_setkey,
-   .descsize = sizeof(struct p8_ghash_desc_ctx),
+   .descsize = sizeof(struct p8_ghash_desc_ctx) + 20,
.base = {
 .cra_name = "ghash",
 .cra_driver_name = "p8_ghash",


[bug] crypto/vmx/p8_ghash memory corruption in 4.8-rc7

2016-09-23 Thread Jan Stancek
Hi,

I'm chasing a memory corruption with 4.8-rc7 as I'm observing random Oopses
on ppc BE/LE systems (lpars, KVM guests). About 30% of issues is that
module list gets corrupted, and "cat /proc/modules" or "lsmod" triggers
an Oops, for example:

[   88.486041] Unable to handle kernel paging request for data at address 
0x0020
...
[   88.487658] NIP [c020f820] m_show+0xa0/0x240
[   88.487689] LR [c020f834] m_show+0xb4/0x240
[   88.487719] Call Trace:
[   88.487736] [c004b605bbb0] [c020f834] m_show+0xb4/0x240 
(unreliable)
[   88.487796] [c004b605bc50] [c045e73c] seq_read+0x36c/0x520
[   88.487843] [c004b605bcf0] [c04e1014] proc_reg_read+0x84/0x120
[   88.487889] [c004b605bd30] [c040df88] vfs_read+0xf8/0x380
[   88.487934] [c004b605bde0] [c040fd40] SyS_read+0x60/0x110
[   88.487981] [c004b605be30] [c0009590] system_call+0x38/0xec

0x20 offset is module_use->source, module_use is NULL because module.source_list
gets corrupted.

The source of corruption appears to originate from a 'ahash' test for p8_ghash:

cryptomgr_test
 alg_test
  alg_test_hash
   test_hash
__test_hash
 ahash_partial_update
  shash_async_export
   memcpy

With some extra traces [1], I'm seeing that ahash_partial_update() allocates 56 
bytes
for 'state', and then crypto_ahash_export() writes 76 bytes into it:

[5.970887] __test_hash alg name p8_ghash, result: c4333ac0, key: 
c004b860a500, req: c004b860a380
[5.970963] state: c4333f00, statesize: 56
[5.970995] shash_default_export memcpy c4333f00 c004b860a3e0, 
len: 76

This seems to directly correspond with:
  p8_ghash_alg.descsize = sizeof(struct p8_ghash_desc_ctx) == 56
  shash_tfm->descsize = sizeof(struct p8_ghash_desc_ctx) + 
crypto_shash_descsize(fallback) == 56 + 20
where 20 is presumably coming from "ghash_alg.descsize".

My gut feeling was that these 2 should match, but I'd love to hear
what crypto people think.

Thank you,
Jan

[1]
diff --git a/crypto/shash.c b/crypto/shash.c
index a051541..49fe182 100644
--- a/crypto/shash.c
+++ b/crypto/shash.c
@@ -188,6 +188,8 @@ EXPORT_SYMBOL_GPL(crypto_shash_digest);

 static int shash_default_export(struct shash_desc *desc, void *out)
 {
+   int len = crypto_shash_descsize(desc->tfm);
+   printk("shash_default_export memcpy %p %p, len: %d\n", out, 
shash_desc_ctx(desc), len);
memcpy(out, shash_desc_ctx(desc), crypto_shash_descsize(desc->tfm));
return 0;
 }
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index 5c9d5a5..2e54579 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -218,6 +218,8 @@ static int ahash_partial_update(struct ahash_request **preq,
pr_err("alt: hash: Failed to alloc state for %s\n", algo);
goto out_nostate;
}
+   printk("state: %p, statesize: %d\n", state, statesize);
+
ret = crypto_ahash_export(req, state);
if (ret) {
pr_err("alt: hash: Failed to export() for %s\n", algo);
@@ -288,6 +290,7 @@ static int __test_hash(struct crypto_ahash *tfm, struct 
hash_testvec *template,
   "%s\n", algo);
goto out_noreq;
}
+   printk("__test_hash alg name %s, result: %p, key: %p, req: %p\n", algo, 
result, key, req);
ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
   tcrypt_complete, &tresult);


Re: [bug] pwritev02 hang on s390x with 4.8.0-rc7

2016-09-20 Thread Jan Stancek



- Original Message -
> From: "Al Viro" 
> To: "Jan Stancek" 
> Cc: linux-kernel@vger.kernel.org
> Sent: Tuesday, 20 September, 2016 5:06:57 PM
> Subject: Re: [bug] pwritev02 hang on s390x with 4.8.0-rc7
> 
> On Tue, Sep 20, 2016 at 02:56:06PM +0200, Jan Stancek wrote:
> > Hi,
> > 
> > I'm hitting a regression with LTP's pwritev02 [1] on s390x with 4.8.0-rc7.
> > Specifically the EFAULT case, which is passing an iovec with invalid base
> > address:
> >   #define CHUNK 64
> >   static struct iovec wr_iovec3[] = {
> > {(char *)-1, CHUNK},
> >   };
> > hangs with 100% cpu usage and not very helpful stack trace:
> >   # cat /proc/28544/stack
> >   [<1000>] 0x1000
> >   [] 0x
> > 
> > The problem starts with d4690f1e1cda "fix iov_iter_fault_in_readable()".
> > 
> > Before this commit fault_in_pages_readable() called __get_user() on start
> > address which presumably failed with -EFAULT immediately.
> > 
> > With this commit applied fault_in_multipages_readable() appears to return 0
> > for the case when "start" is invalid but "end" overflows. Then according to
> > my traces we keep spinning inside while loop in iomap_write_actor().
> 
> Cute.  Let me see if I understand what's going on there: we have a wraparound
> that would've been caught by most of access_ok(), but not on an architectures
> where access_ok() is a no-op; in that case the loop is skipped and we
> just check the last address, which passes and we get a false positive.
> Bug is real and it's definitely -stable fodder.
> 
> I'm not sure that the fix you propose is right, though.  Note that ERR_PTR()
> is not a valid address on any architecture, so any wraparound automatically
> means -EFAULT and we can simply check unlikely(uaddr > end) and bugger off
> if it holds.  while() turns into do-while(), of course, and the same is
> needed for the read side.
> 
> Could you check if the following works for you?

This fixes pwritev02 hang.

I ran all syscalls tests from LTP, and I see a change in behaviour
of couple other tests (writev01, writev03 and writev04 [1]) in 4.8.0-rc7.

These call writev() with partially invalid iovecs, and now fail with
EFAULT, while with previous -rc6 kernel they returned number of bytes
written before they encountered invalid iovec record.
This should be reproducible also on x86.

Regards,
Jan

[1] 
https://github.com/linux-test-project/ltp/tree/master/testcases/kernel/syscalls/writev

> 
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index 66a1260..7e3d537 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -571,56 +571,56 @@ static inline int fault_in_pages_readable(const char
> __user *uaddr, int size)
>   */
>  static inline int fault_in_multipages_writeable(char __user *uaddr, int
>  size)
>  {
> - int ret = 0;
>   char __user *end = uaddr + size - 1;
>  
>   if (unlikely(size == 0))
> - return ret;
> + return 0;
>  
> + if (unlikely(uaddr > end))
> + return -EFAULT;
>   /*
>* Writing zeroes into userspace here is OK, because we know that if
>* the zero gets there, we'll be overwriting it.
>*/
> - while (uaddr <= end) {
> - ret = __put_user(0, uaddr);
> - if (ret != 0)
> - return ret;
> + do {
> + if (unlikely(__put_user(0, uaddr) != 0))
> + return -EFAULT;
>   uaddr += PAGE_SIZE;
> - }
> + } while (uaddr <= end);
>  
>   /* Check whether the range spilled into the next page. */
>   if (((unsigned long)uaddr & PAGE_MASK) ==
>   ((unsigned long)end & PAGE_MASK))
> - ret = __put_user(0, end);
> + return __put_user(0, end);
>  
> - return ret;
> + return 0;
>  }
>  
>  static inline int fault_in_multipages_readable(const char __user *uaddr,
>  int size)
>  {
>   volatile char c;
> - int ret = 0;
>   const char __user *end = uaddr + size - 1;
>  
>   if (unlikely(size == 0))
> - return ret;
> + return 0;
>  
> - while (uaddr <= end) {
> - ret = __get_user(c, uaddr);
> - if (ret != 0)
> - return ret;
> + if (unlikely(uaddr > end))
> + return -EFAULT;
> +
> + do {
> + if (unlikely(__get_user(c, uaddr) != 0))
> + return

[bug] pwritev02 hang on s390x with 4.8.0-rc7

2016-09-20 Thread Jan Stancek
Hi,

I'm hitting a regression with LTP's pwritev02 [1] on s390x with 4.8.0-rc7.
Specifically the EFAULT case, which is passing an iovec with invalid base 
address:
  #define CHUNK 64
  static struct iovec wr_iovec3[] = {
{(char *)-1, CHUNK},
  };
hangs with 100% cpu usage and not very helpful stack trace:
  # cat /proc/28544/stack
  [<1000>] 0x1000
  [] 0x

The problem starts with d4690f1e1cda "fix iov_iter_fault_in_readable()".

Before this commit fault_in_pages_readable() called __get_user() on start
address which presumably failed with -EFAULT immediately.

With this commit applied fault_in_multipages_readable() appears to return 0
for the case when "start" is invalid but "end" overflows. Then according to
my traces we keep spinning inside while loop in iomap_write_actor().

Patch below makes the issue go away for me:

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 66a1260b33de..e443dbd2b5df 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -600,11 +600,11 @@ static inline int fault_in_multipages_readable(const char 
__user *uaddr,
   int size)
 {
volatile char c;
-   int ret = 0;
+   int ret = -EFAULT;
const char __user *end = uaddr + size - 1;

if (unlikely(size == 0))
-   return ret;
+   return 0;

while (uaddr <= end) {
ret = __get_user(c, uaddr);

Regards,
Jan

[1] 
https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/syscalls/pwritev/pwritev02.c


[tip:perf/urgent] perf tests: objdump output can contain multi byte chunks

2016-08-04 Thread tip-bot for Jan Stancek
Commit-ID:  b2d0dbf09772d091368261ce95db3afce45d994d
Gitweb: http://git.kernel.org/tip/b2d0dbf09772d091368261ce95db3afce45d994d
Author: Jan Stancek 
AuthorDate: Tue, 12 Jan 2016 11:07:44 +0100
Committer:  Arnaldo Carvalho de Melo 
CommitDate: Tue, 2 Aug 2016 16:42:51 -0300

perf tests: objdump output can contain multi byte chunks

objdump's raw insn output can vary across architectures on the number of
bytes per chunk (bpc) displayed and their endianness.

The code-reading test relied on reading objdump output as 1 bpc. Kaixu
Xia reported test failure on ARM64, where objdump displays 4 bpc:

  70c48:f90027bf strxzr, [x29,#72]
  70c4c:91224000 addx0, x0, #0x890
  70c50:f90023a0 strx0, [x29,#64]

This patch adds support to read raw insn output for any bpc length.
In case of 2+ bpc it also guesses objdump's display endian.

Reported-and-Tested-by: Kaixu Xia 
Signed-off-by: Jan Stancek 
Acked-by: Adrian Hunter 
Cc: Corey Ashford 
Cc: David Ahern 
Cc: Frederic Weisbecker 
Cc: Jiri Olsa 
Cc: Namhyung Kim 
Cc: Paul Mackerras 
Cc: Peter Zijlstra 
Link: 
http://lkml.kernel.org/r/07f0f7bcbda78deb423298708ef9b6a54d6b92bd.1452592712.git.jstan...@redhat.com
[ Fix up pr_fmt() call to use %zd for size_t variables, fixing the build on 
Ubuntu cross-compiling to armhf and ppc64 ]
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/tests/code-reading.c | 100 
 1 file changed, 71 insertions(+), 29 deletions(-)

diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c
index 68a69a1..2af156a 100644
--- a/tools/perf/tests/code-reading.c
+++ b/tools/perf/tests/code-reading.c
@@ -33,44 +33,86 @@ static unsigned int hex(char c)
return c - 'A' + 10;
 }
 
-static size_t read_objdump_line(const char *line, size_t line_len, void *buf,
- size_t len)
+static size_t read_objdump_chunk(const char **line, unsigned char **buf,
+size_t *buf_len)
 {
-   const char *p;
-   size_t i, j = 0;
-
-   /* Skip to a colon */
-   p = strchr(line, ':');
-   if (!p)
-   return 0;
-   i = p + 1 - line;
+   size_t bytes_read = 0;
+   unsigned char *chunk_start = *buf;
 
/* Read bytes */
-   while (j < len) {
+   while (*buf_len > 0) {
char c1, c2;
 
-   /* Skip spaces */
-   for (; i < line_len; i++) {
-   if (!isspace(line[i]))
-   break;
-   }
/* Get 2 hex digits */
-   if (i >= line_len || !isxdigit(line[i]))
+   c1 = *(*line)++;
+   if (!isxdigit(c1))
break;
-   c1 = line[i++];
-   if (i >= line_len || !isxdigit(line[i]))
+   c2 = *(*line)++;
+   if (!isxdigit(c2))
break;
-   c2 = line[i++];
-   /* Followed by a space */
-   if (i < line_len && line[i] && !isspace(line[i]))
+
+   /* Store byte and advance buf */
+   **buf = (hex(c1) << 4) | hex(c2);
+   (*buf)++;
+   (*buf_len)--;
+   bytes_read++;
+
+   /* End of chunk? */
+   if (isspace(**line))
break;
-   /* Store byte */
-   *(unsigned char *)buf = (hex(c1) << 4) | hex(c2);
-   buf += 1;
-   j++;
}
+
+   /*
+* objdump will display raw insn as LE if code endian
+* is LE and bytes_per_chunk > 1. In that case reverse
+* the chunk we just read.
+*
+* see disassemble_bytes() at binutils/objdump.c for details
+* how objdump chooses display endian)
+*/
+   if (bytes_read > 1 && !bigendian()) {
+   unsigned char *chunk_end = chunk_start + bytes_read - 1;
+   unsigned char tmp;
+
+   while (chunk_start < chunk_end) {
+   tmp = *chunk_start;
+   *chunk_start = *chunk_end;
+   *chunk_end = tmp;
+   chunk_start++;
+   chunk_end--;
+   }
+   }
+
+   return bytes_read;
+}
+
+static size_t read_objdump_line(const char *line, unsigned char *buf,
+   size_t buf_len)
+{
+   const char *p;
+   size_t ret, bytes_read = 0;
+
+   /* Skip to a colon */
+   p = strchr(line, ':');
+   if (!p)
+   return 0;
+   p++;
+
+   /* Skip initial spaces */
+   while (*p) {
+   if (!isspace(*p))
+   break;
+   p++;
+   }
+
+   do {
+   ret = read_objdump_chun

[PATCH] crypto: qat - make qat_asym_algs.o depend on asn1 headers

2016-06-30 Thread Jan Stancek
Parallel build can sporadically fail because asn1 headers may
not be built yet by the time qat_asym_algs.o is compiled:
  drivers/crypto/qat/qat_common/qat_asym_algs.c:55:32: fatal error: 
qat_rsapubkey-asn1.h: No such file or directory
   #include "qat_rsapubkey-asn1.h"

Signed-off-by: Jan Stancek 
Cc: Tadeusz Struk 
Cc: Herbert Xu 
---
 drivers/crypto/qat/qat_common/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/crypto/qat/qat_common/Makefile 
b/drivers/crypto/qat/qat_common/Makefile
index 6d74b91f2152..5fc3dbb9ada0 100644
--- a/drivers/crypto/qat/qat_common/Makefile
+++ b/drivers/crypto/qat/qat_common/Makefile
@@ -2,6 +2,7 @@ $(obj)/qat_rsapubkey-asn1.o: $(obj)/qat_rsapubkey-asn1.c \
 $(obj)/qat_rsapubkey-asn1.h
 $(obj)/qat_rsaprivkey-asn1.o: $(obj)/qat_rsaprivkey-asn1.c \
  $(obj)/qat_rsaprivkey-asn1.h
+$(obj)/qat_asym_algs.o: $(obj)/qat_rsapubkey-asn1.h 
$(obj)/qat_rsaprivkey-asn1.h
 
 clean-files += qat_rsapubkey-asn1.c qat_rsapubkey-asn1.h
 clean-files += qat_rsaprivkey-asn1.c qat_rsaprivkey-asn1.h
-- 
1.8.3.1



Re: [PATCH] mm/hugetlb: use EOPNOTSUPP in hugetlb sysctl handlers

2016-03-05 Thread Jan Stancek




- Original Message -
> From: "Andrew Morton" 
> To: "Jan Stancek" 
> Cc: linux...@kvack.org, linux-kernel@vger.kernel.org, 
> n-horigu...@ah.jp.nec.com, "mike kravetz"
> , "hillf zj" , "kirill 
> shutemov"
> , "dave hansen" 
> , "paul gortmaker"
> 
> Sent: Friday, 4 March, 2016 10:38:07 PM
> Subject: Re: [PATCH] mm/hugetlb: use EOPNOTSUPP in hugetlb sysctl handlers
> 
> On Thu,  3 Mar 2016 11:02:51 +0100 Jan Stancek  wrote:
> 
> > Replace ENOTSUPP with EOPNOTSUPP. If hugepages are not supported,
> > this value is propagated to userspace. EOPNOTSUPP is part of uapi
> > and is widely supported by libc libraries.
> 
> hm, what is the actual user-visible effect of this change?  Does it fix
> some misbehaviour?
> 

It gives nicer message to user, rather than:
# cat /proc/sys/vm/nr_hugepages
cat: /proc/sys/vm/nr_hugepages: Unknown error 524

And also LTP's proc01 test was failing because this ret code (524)
was unexpected:
proc01  1  TFAIL  :  proc01.c:396: read failed: /proc/sys/vm/nr_hugepages: 
errno=???(524): Unknown error 524
proc01  2  TFAIL  :  proc01.c:396: read failed: 
/proc/sys/vm/nr_hugepages_mempolicy: errno=???(524): Unknown error 524
proc01  3  TFAIL  :  proc01.c:396: read failed: 
/proc/sys/vm/nr_overcommit_hugepages: errno=???(524): Unknown error 524

Regards,
Jan


[PATCH] mm/hugetlb: use EOPNOTSUPP in hugetlb sysctl handlers

2016-03-03 Thread Jan Stancek
Replace ENOTSUPP with EOPNOTSUPP. If hugepages are not supported,
this value is propagated to userspace. EOPNOTSUPP is part of uapi
and is widely supported by libc libraries.

Cc: Andrew Morton 
Cc: Naoya Horiguchi 
Cc: Mike Kravetz 
Cc: Hillf Danton 
Cc: "Kirill A. Shutemov" 
Cc: Dave Hansen 
Cc: Paul Gortmaker 

Signed-off-by: Jan Stancek 
---
 mm/hugetlb.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 01f2b48c8618..851a29928a99 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2751,7 +2751,7 @@ static int hugetlb_sysctl_handler_common(bool 
obey_mempolicy,
int ret;
 
if (!hugepages_supported())
-   return -ENOTSUPP;
+   return -EOPNOTSUPP;
 
table->data = &tmp;
table->maxlen = sizeof(unsigned long);
@@ -2792,7 +2792,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, 
int write,
int ret;
 
if (!hugepages_supported())
-   return -ENOTSUPP;
+   return -EOPNOTSUPP;
 
tmp = h->nr_overcommit_huge_pages;
 
-- 
1.8.3.1



Re: [BUG] scheduler doesn't balance thread to idle cpu for 3 seconds

2016-02-08 Thread Jan Stancek
On 01/29/2016 11:33 AM, Jan Stancek wrote:
>>
>> Also note that I don't think failing this test is a bug per se.
>> Undesirable maybe, but within spec, since SIGALRM is process wide, so it
>> being delivered to the SCHED_OTHER task is accepted, and SCHED_OTHER has
>> no timeliness guarantees.
>>
>> That said; if I could reliably reproduce I'd have a go at fixing this, I
>> suspect there's a 'fun' problem at the bottom of this.
> 
> Thanks for trying, I'll see if I can find some more reliable way.

I think I have found a more reliably way, however it requires an older
stable kernel: 3.12.53 up to 4.1.17.

Consider following scenario:
- all tasks on system have RT sched class
- main thread of reproducer becomes the only SCHED_OTHER task on system
- when alarm(2) expires, main thread is woken up on cpu that is occupied by
  busy looping RT thread (low_priority_thread)
- because main thread was sleeping for 2 seconds, its load has decayed to 0
- the only chance for main thread to run is if it gets balanced to idle CPU
- task_tick_fair() doesn't run, there is RT task running on this CPU
- main thread is on cfs run queue but its load stays 0
- load balancer never sees this CPU (group) as busy

Attached is reproducer and script, which tries to trigger scenario above.
I can reproduce it with 4.1.17 on baremetal 4 CPU x86_64 with about 1:50 chance.

In this setup failure state persists for a long time, perhaps indefinitely.
I tried extending RUNTIME to 10 minutes, main thread still wouldn't run.

One more clue: I could work around this issue if I forced an 
update_entity_load_avg()
on sched_entities that have not been updated for some time, as part of
periodic rebalance_domains() call.

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c7c1d28..1b5fe80 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5264,6 +5264,7 @@ static void update_blocked_averages(int cpu)
struct rq *rq = cpu_rq(cpu);
struct cfs_rq *cfs_rq;
unsigned long flags;
+   struct rb_node *rb;

raw_spin_lock_irqsave(&rq->lock, flags);
update_rq_clock(rq);
@@ -5281,6 +5282,19 @@ static void update_blocked_averages(int cpu)
}

raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+   cfs_rq = &(cpu_rq(cpu)->cfs);
+   for (rb = rb_first_postorder(&cfs_rq->tasks_timeline); rb; rb = 
rb_next_postorder(rb)) {
+   struct sched_entity *se = rb_entry(rb, struct sched_entity, 
run_node);
+
+   // Task on rq has not been updated for 500ms :-(
+   if ((cfs_rq_clock_task(cfs_rq) - se->avg.last_runnable_update) 
> 500L * (1 << 20))
+   update_entity_load_avg(se, 1);
+   }
 }

 /*

Regards,
Jan

/*
 * reproducer v3 for:
 * [BUG] scheduler doesn't balance thread to idle cpu for 3 seconds
 *
 * Based on LTP's pthread_cond_wait_1.c
 *
 */

#define _GNU_SOURCE
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

#define ERROR_PREFIX "unexpected error: "

#define HIGH_PRIORITY 10
#define LOW_PRIORITY  5
#define RUNTIME   5
#define POLICYSCHED_RR

#define PTS_PASS 0
#define PTS_FAIL 1
#define PTS_UNRESOLVED 2

pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

/* Flags that the threads use to indicate events */
volatile int woken_up = 0;
volatile int low_done = 0;

/* Signal handler that handle the ALRM and wakes up
 * the high priority thread
 */
void signal_handler(int sig)
{
(void) sig;
if (pthread_cond_signal(&cond) != 0) {
printf(ERROR_PREFIX "pthread_cond_signal\n");
exit(PTS_UNRESOLVED);
}
}

/* Utility function to find difference between two time values */
float timediff(struct timespec t2, struct timespec t1)
{
float diff = t2.tv_sec - t1.tv_sec;
diff += (t2.tv_nsec - t1.tv_nsec) / 10.0;
return diff;
}

void *hi_priority_thread(void *tmp)
{
struct sched_param param;
int policy;
int rc = 0;

(void) tmp;
param.sched_priority = HIGH_PRIORITY;

rc = pthread_setschedparam(pthread_self(), POLICY, ¶m);
if (rc != 0) {
printf(ERROR_PREFIX "pthread_setschedparam\n");
exit(PTS_UNRESOLVED);
}
rc = pthread_getschedparam(pthread_self(), &policy, ¶m);
if (rc != 0) {
printf(ERROR_PREFIX "pthread_getschedparam\n");
exit(PTS_UNRESOLVED);
}
if ((policy != POLICY) || (param.sched_priority != HIGH_PRIORITY)) {
printf("Error: the policy or priority not correct\n");
exit(PTS_UNRESOLVED);
}

/* Install a signal handler for ALRM */
if

Re: [BUG] scheduler doesn't balance thread to idle cpu for 3 seconds

2016-01-29 Thread Jan Stancek




- Original Message -
> From: "Peter Zijlstra" 
> To: "Jan Stancek" 
> Cc: "alex shi" , "guz fnst" , 
> mi...@redhat.com, jo...@redhat.com,
> r...@redhat.com, linux-kernel@vger.kernel.org
> Sent: Friday, 29 January, 2016 11:15:22 AM
> Subject: Re: [BUG] scheduler doesn't balance thread to idle cpu for 3 seconds
> 
> On Thu, Jan 28, 2016 at 01:43:13PM -0500, Jan Stancek wrote:
> > > How long should I have to wait for a fail?
> > 
> > It's about 1000-2000 iterations for me, which I think you covered
> > by now in those 2 hours.
> 
> So I've been running:
> 
>   while ! ./pthread_cond_wait_1 ; do sleep 1; done
> 
> overnight on the machine, and have yet to hit a wobbly -- that is, its
> still running.

I have seen similar result.

Then, instead of turning CPUs off, I spawned more low prio threads to scale
with number of CPUs on system:

@@ -213,10 +213,14 @@
printf(ERROR_PREFIX "pthread_attr_setschedparam\n");
exit(PTS_UNRESOLVED);
}
-   rc = pthread_create(&low_id, &low_attr, low_priority_thread, NULL);
-   if (rc != 0) {
-   printf(ERROR_PREFIX "pthread_create\n");
-   exit(PTS_UNRESOLVED);
+
+   int i, ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+   for (i = 0; i < ncpus - 1; i++) {
+   rc = pthread_create(&low_id, &low_attr, low_priority_thread, 
NULL);
+   if (rc != 0) {
+   printf(ERROR_PREFIX "pthread_create\n");
+   exit(PTS_UNRESOLVED);
+   }

and let this ran on 3 bare metal x86 systems over night (v4.5-rc1). It
failed on 2 systems (12 and 24 CPUs) with 1:1000 chance, it never failed
on 3rd one (4 CPUs).

> 
> Also note that I don't think failing this test is a bug per se.
> Undesirable maybe, but within spec, since SIGALRM is process wide, so it
> being delivered to the SCHED_OTHER task is accepted, and SCHED_OTHER has
> no timeliness guarantees.
> 
> That said; if I could reliably reproduce I'd have a go at fixing this, I
> suspect there's a 'fun' problem at the bottom of this.

Thanks for trying, I'll see if I can find some more reliable way.

Regards,
Jan


Re: [BUG] scheduler doesn't balance thread to idle cpu for 3 seconds

2016-01-28 Thread Jan Stancek




- Original Message -
> From: "Peter Zijlstra" 
> To: "Jan Stancek" 
> Cc: "alex shi" , "guz fnst" , 
> mi...@redhat.com, jo...@redhat.com,
> r...@redhat.com, linux-kernel@vger.kernel.org
> Sent: Thursday, 28 January, 2016 6:49:03 PM
> Subject: Re: [BUG] scheduler doesn't balance thread to idle cpu for 3 seconds
> 
> On Thu, Jan 28, 2016 at 04:55:02PM +0100, Jan Stancek wrote:
> > On 01/27/2016 03:52 PM, Jan Stancek wrote:
> > > Hello,
> > > 
> > > pthread_cond_wait_1/2 [1] is rarely failing for me on 4.5.0-rc1,
> > > on x86_64 KVM guest with 2 CPUs.
> > > 
> > > This test [1]:
> > > - spawns 2 SCHED_RR threads
> > > - first thread with higher priority sets alarm for 2 seconds and blocks
> > > on condition
> > > - second thread with lower priority is busy looping for 5 seconds
> > > - after 2 seconds alarm signal arrives and handler signals condition
> > > - high priority thread should resume running
> > 
> > I have slightly modified testcase, so it will finish immediately when high
> > prio
> > thread is done. And also to allow it to compile outside of openposix
> > testsuite.
> 
> Yeah, I 'fixed' the testcase too.
> 
> So I've had it run for almost 2 hours without a single fail. I've
> hot-plugged my cpu count down to 2.

I can try that too. I'm mostly seeing this on s390 and x86_64 KVM guests,
both have 2 CPUs.

Have you noticed if iteration times vary or if they stay consistently
at ~2 seconds?

> 
> How long should I have to wait for a fail?

It's about 1000-2000 iterations for me, which I think you covered
by now in those 2 hours.

Regards,
Jan





Re: [BUG] scheduler doesn't balance thread to idle cpu for 3 seconds

2016-01-28 Thread Jan Stancek
On 01/27/2016 03:52 PM, Jan Stancek wrote:
> Hello,
> 
> pthread_cond_wait_1/2 [1] is rarely failing for me on 4.5.0-rc1,
> on x86_64 KVM guest with 2 CPUs.
> 
> This test [1]:
> - spawns 2 SCHED_RR threads
> - first thread with higher priority sets alarm for 2 seconds and blocks on 
> condition
> - second thread with lower priority is busy looping for 5 seconds
> - after 2 seconds alarm signal arrives and handler signals condition
> - high priority thread should resume running

I have slightly modified testcase, so it will finish immediately when high prio
thread is done. And also to allow it to compile outside of openposix testsuite.

Testcase is attached. I'm running it in following way:

gcc -O2 -pthread pthread_cond_wait_1.c
while [ True ]; do
  time ./a.out
  sleep 1
done

for couple thousand iterations. About half of those are
on system booted with init=/bin/bash.

> 
> But rarely I see that high priority thread doesn't resume running until
> low priority thread completes its 5 second busy loop.
> 
> Looking at traces (short version attached, long version at [2]),
> I see that after 2 seconds scheduler tries to wake up main thread, but it
> appears to do that on same CPU where SCHED_RR low prio thread is running,
> so nothing happens. Then scheduler makes numerous balance attempts,
> but main thread is not balanced to idle CPU.
> 
> My guess is this started with following commit, which changed 
> weighted_cpuload():
>   commit b92486cbf2aa230d00f160664858495c81d2b37b
>   Author: Alex Shi 
>   Date:   Thu Jun 20 10:18:50 2013 +0800
> sched: Compute runnable load avg in cpu_load and cpu_avg_load_per_task

Here are some numbers gathered from kernels with HEAD at b92486c and
previous commit 83dfd52. System is 2 CPU KVM guest.

Each iteration measures how long it took for testcase to finish.
Ideally it should take about 2 seconds.

1. HEAD at 83dfd52 sched: Update cpu load after task_tick

  finish time [s]  |   iterations
--
[2,   2.2] |   3134
[  2.2,   2.5] | 18
[  2.5, 3] |  0
[3, 4] |  0
[4, 5] |  0
[5,   999] |  0


2. HEAD at b92486c sched: Compute runnable load avg in cpu_load and 
cpu_avg_load_per_task

  finish time [s]  |   iterations
--
[2,   2.2] |   1617
[  2.2,   2.5] | 38
[  2.5, 3] |727
[3, 4] |399
[4, 5] | 17
[5,   999] | 11

Regards,
Jan

> 
> I could reproduce it with HEAD set at above commit, I couldn't reproduce it
> with 3.10 kernel so far.
> 
> Regards,
> Jan
> 
> [1] 
> https://github.com/linux-test-project/ltp/blob/master/testcases/open_posix_testsuite/functional/threads/condvar/pthread_cond_wait_1.c
> [2] http://jan.stancek.eu/tmp/pthread_cond_wait_failure/sched-trace1.tar.bz2
> 

/*
 * Copyright (c) 2004, QUALCOMM Inc. All rights reserved.
 * Created by:  abisain REMOVE-THIS AT qualcomm DOT com
 * This file is licensed under the GPL license.  For the full content
 * of this license, see the COPYING file at the top level of this
 * source tree.

 * Test that pthread_cond_signal()
 *   shall wakeup a high priority thread even when a low priority thread
 *   is running

 * Steps:
 * 1. Create a condition variable
 * 2. Create a high priority thread and make it wait on the cond
 * 3. Create a low priority thread and let it busy-loop
 * 4. Signal the cond in a signal handler and check that high
 *priority thread got woken up
 *
 */

#include 
#include 
#include 
#include 
#include 
#include 

#define TEST "5-1"
#define AREA "scheduler"
#define ERROR_PREFIX "unexpected error: " AREA " " TEST ": "

#define HIGH_PRIORITY 10
#define LOW_PRIORITY  5
#define RUNTIME   5
#define POLICYSCHED_RR

#define PTS_PASS 0
#define PTS_FAIL 1
#define PTS_UNRESOLVED 2


/* mutex required by the cond variable */
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
/* condition variable that threads block on*/
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

/* Flags that the threads use to indicate events */
volatile int woken_up = 0;
volatile int low_done = 0;

/* Signal handler that handle the ALRM and wakes up
 * the high priority thread
 */
void signal_handler(int sig)
{
(void) sig;
if (pthread_cond_signal(&cond) != 0) {
printf(ERROR_PREFIX "pthread_cond_signal\n");
exit(PTS_UNRESOLVED);
}
}

/* Utility function to find difference between two time values */
float timediff(struct timespec t2, struct timespec t1)
{
float diff = t2.tv_sec - t1.tv_sec;
diff += (t2.tv_nsec - t1.tv_nsec) / 10.0;
return diff;
}

void *hi_priority_thread(void *tmp)
{
   

[BUG] scheduler doesn't balance thread to idle cpu for 3 seconds

2016-01-27 Thread Jan Stancek
Hello,

pthread_cond_wait_1/2 [1] is rarely failing for me on 4.5.0-rc1,
on x86_64 KVM guest with 2 CPUs.

This test [1]:
- spawns 2 SCHED_RR threads
- first thread with higher priority sets alarm for 2 seconds and blocks on 
condition
- second thread with lower priority is busy looping for 5 seconds
- after 2 seconds alarm signal arrives and handler signals condition
- high priority thread should resume running

But rarely I see that high priority thread doesn't resume running until
low priority thread completes its 5 second busy loop.

Looking at traces (short version attached, long version at [2]),
I see that after 2 seconds scheduler tries to wake up main thread, but it
appears to do that on same CPU where SCHED_RR low prio thread is running,
so nothing happens. Then scheduler makes numerous balance attempts,
but main thread is not balanced to idle CPU.

My guess is this started with following commit, which changed 
weighted_cpuload():
  commit b92486cbf2aa230d00f160664858495c81d2b37b
  Author: Alex Shi 
  Date:   Thu Jun 20 10:18:50 2013 +0800
sched: Compute runnable load avg in cpu_load and cpu_avg_load_per_task

I could reproduce it with HEAD set at above commit, I couldn't reproduce it
with 3.10 kernel so far.

Regards,
Jan

[1] 
https://github.com/linux-test-project/ltp/blob/master/testcases/open_posix_testsuite/functional/threads/condvar/pthread_cond_wait_1.c
[2] http://jan.stancek.eu/tmp/pthread_cond_wait_failure/sched-trace1.tar.bz2
# test starts running
bash-5626  [001]    294.609021: sched_process_fork: comm=bash 
pid=5626 child_comm=bash child_pid=10005
bash-5626  [001] d...   294.609023: sched_migrate_task: comm=bash 
pid=10005 prio=120 orig_cpu=1 dest_cpu=0
bash-5626  [001] d...   294.609029: sched_wakeup_new: comm=bash 
pid=10005 prio=120 target_cpu=000
  -0 [000] d...   294.609073: sched_switch: 
prev_comm=swapper/0 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=bash 
next_pid=10005 next_prio=120
 condvar_pthread-10005 [000] d.h.   294.609355: sched_stat_runtime: comm=bash 
pid=10005 runtime=330015 [ns] vruntime=937586860 [ns]
 condvar_pthread-10005 [000] d...   294.610222: sched_stat_runtime: 
comm=condvar_pthread pid=10005 runtime=82 [ns] vruntime=938453522 [ns]
 condvar_pthread-10005 [000] d...   294.610224: sched_migrate_task: 
comm=condvar_pthread pid=10005 prio=120 orig_cpu=0 dest_cpu=0
 condvar_pthread-10005 [000]    294.610231: sched_process_fork: 
comm=condvar_pthread pid=10005 child_comm=condvar_pthread child_pid=10006
# high prio thread (10006) is started
 condvar_pthread-10005 [000] d...   294.610232: sched_migrate_task: 
comm=condvar_pthread pid=10006 prio=120 orig_cpu=0 dest_cpu=0
 condvar_pthread-10005 [000] d...   294.610233: sched_stat_runtime: 
comm=condvar_pthread pid=10005 runtime=11929 [ns] vruntime=938465451 [ns]
 condvar_pthread-10005 [000] d...   294.610238: sched_wakeup_new: 
comm=condvar_pthread pid=10006 prio=120 target_cpu=000
 condvar_pthread-10005 [000] d...   294.610267: sched_stat_runtime: 
comm=condvar_pthread pid=10005 runtime=34190 [ns] vruntime=938499641 [ns]
 condvar_pthread-10005 [000] d...   294.610268: sched_migrate_task: 
comm=condvar_pthread pid=10005 prio=120 orig_cpu=0 dest_cpu=0
 condvar_pthread-10005 [000]    294.610270: sched_process_fork: 
comm=condvar_pthread pid=10005 child_comm=condvar_pthread child_pid=10007
# low prio thread (10007) is started
 condvar_pthread-10005 [000] d...   294.610271: sched_migrate_task: 
comm=condvar_pthread pid=10007 prio=120 orig_cpu=0 dest_cpu=1
 condvar_pthread-10005 [000] d...   294.610277: sched_wakeup_new: 
comm=condvar_pthread pid=10007 prio=120 target_cpu=001
 condvar_pthread-10005 [000] d...   294.610298: sched_stat_runtime: 
comm=condvar_pthread pid=10005 runtime=31353 [ns] vruntime=938530994 [ns]
 condvar_pthread-10005 [000] d...   294.610302: sched_switch: 
prev_comm=condvar_pthread prev_pid=10005 prev_prio=120 prev_state=S ==> 
next_comm=condvar_pthread next_pid=10006 next_prio=120
 condvar_pthread-10006 [000] d...   294.610311: sched_stat_runtime: 
comm=condvar_pthread pid=10006 runtime=12355 [ns] vruntime=944465872 [ns]
  -0 [001] d...   294.610318: sched_switch: 
prev_comm=swapper/1 prev_pid=0 prev_prio=120 prev_state=R ==> 
next_comm=condvar_pthread next_pid=10007 next_prio=120
 condvar_pthread-10007 [001] d...   294.610332: sched_stat_runtime: 
comm=condvar_pthread pid=10007 runtime=59910 [ns] vruntime=581696119 [ns]
 condvar_pthread-10006 [000] d...   294.610340: sched_switch: 
prev_comm=condvar_pthread prev_pid=10006 prev_prio=89 prev_state=S ==> 
next_comm=swapper/0 next_pid=0 next_prio=120
 condvar_pthread-10007 [001] d...   296.571389: sched_switch: 
prev_comm=condvar_pthread prev_pid=10007 prev_prio=94 prev_state=R ==> 
next_comm=watchdog/1 next_pid=13 next_prio=0
  watchdog/1-13[001] d...   296.571393: sched_migrate_task: 
comm=condvar_pthread pid=10007 prio=94 orig_cpu=1 dest_cpu=0
  -0

Re: [BUG] perf test 21("Test object code reading") failure on ARM64

2015-12-19 Thread Jan Stancek
On Sat, Dec 19, 2015 at 11:04:21AM +0800, xiakaixu wrote:
> 
> >>>...
> > 
> > Hi,
> > 
> > What is your objdump version?
> 
> Hi,
> 
> Sorry for the late reply.
> 
> # objdump --version
> GNU objdump (GNU Binutils) 2.25.
> 
> I am sure that the system is Little endian.
> > 

I have attached a patch if you care to try it with your setup.
If it still fails, output from -v and last objdump command output
would be helpful.

Regards,
Jan
diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c
index a767a6400c5c..faf726865fac 100644
--- a/tools/perf/tests/code-reading.c
+++ b/tools/perf/tests/code-reading.c
@@ -33,44 +33,83 @@ static unsigned int hex(char c)
return c - 'A' + 10;
 }
 
-static size_t read_objdump_line(const char *line, size_t line_len, void *buf,
- size_t len)
+static size_t read_objdump_chunk(const char **line, unsigned char **buf,
+size_t *buf_len)
+{
+   size_t bytes_read = 0;
+   unsigned char *chunk_start = *buf;
+
+   /* Read bytes */
+   while (*buf_len > 0) {
+   char c1, c2;
+
+   /* Get 2 hex digits */
+   c1 = *(*line)++;
+   if (!isxdigit(c1))
+   break;
+   c2 = *(*line)++;
+   if (!isxdigit(c2))
+   break;
+
+   /* Store byte and advance buf */
+   **buf = (hex(c1) << 4) | hex(c2);
+   (*buf)++;
+   (*buf_len)--;
+   bytes_read++;
+
+   /* End of chunk? */
+   if (isspace(**line))
+   break;
+   }
+
+   /*
+* objdump will display raw insn as LE if code endian
+* is LE and bytes_per_chunk > 1. In that case reverse
+* the chunk we just read.
+*/
+   if (bytes_read > 1 && !bigendian()) {
+   unsigned char *chunk_end = chunk_start + bytes_read - 1;
+   unsigned char tmp;
+
+   while (chunk_start < chunk_end) {
+   tmp = *chunk_start;
+   *chunk_start = *chunk_end;
+   *chunk_end = tmp;
+   chunk_start++;
+   chunk_end--;
+   }
+   }
+
+   return bytes_read;
+}
+
+static size_t read_objdump_line(const char *line, unsigned char *buf,
+   size_t buf_len)
 {
const char *p;
-   size_t i, j = 0;
+   size_t ret, bytes_read = 0;
 
/* Skip to a colon */
p = strchr(line, ':');
if (!p)
return 0;
-   i = p + 1 - line;
+   p++;
 
-   /* Read bytes */
-   while (j < len) {
-   char c1, c2;
-
-   /* Skip spaces */
-   for (; i < line_len; i++) {
-   if (!isspace(line[i]))
-   break;
-   }
-   /* Get 2 hex digits */
-   if (i >= line_len || !isxdigit(line[i]))
-   break;
-   c1 = line[i++];
-   if (i >= line_len || !isxdigit(line[i]))
-   break;
-   c2 = line[i++];
-   /* Followed by a space */
-   if (i < line_len && line[i] && !isspace(line[i]))
+   /* Skip initial spaces */
+   while (*p) {
+   if (!isspace(*p))
break;
-   /* Store byte */
-   *(unsigned char *)buf = (hex(c1) << 4) | hex(c2);
-   buf += 1;
-   j++;
+   p++;
}
+
+   do {
+   ret = read_objdump_chunk(&p, &buf, &buf_len);
+   bytes_read += ret;
+   p++;
+   } while (ret > 0);
+
/* return number of successfully read bytes */
-   return j;
+   return bytes_read;
 }
 
 static int read_objdump_output(FILE *f, void *buf, size_t *len, u64 start_addr)
@@ -95,7 +134,7 @@ static int read_objdump_output(FILE *f, void *buf, size_t 
*len, u64 start_addr)
}
 
/* read objdump data into temporary buffer */
-   read_bytes = read_objdump_line(line, ret, tmp, sizeof(tmp));
+   read_bytes = read_objdump_line(line, tmp, sizeof(tmp));
if (!read_bytes)
continue;
 
@@ -152,7 +191,7 @@ static int read_via_objdump(const char *filename, u64 addr, 
void *buf,
 
ret = read_objdump_output(f, buf, &len, addr);
if (len) {
-   pr_debug("objdump read too few bytes\n");
+   pr_debug("objdump read too few bytes: %lu\n", len);
if (!ret)
ret = len;
}


  1   2   >