[PATCH] mm: migrate: initialize err in do_migrate_pages
After commit 236c32eb1096 ("mm: migrate: clean up migrate_prep{_local}")', do_migrate_pages can return uninitialized variable 'err' (which is propagated to user-space as error) when 'from' and 'to' nodesets are identical. This can be reproduced with LTP migrate_pages01, which calls migrate_pages() with same set for both old/new_nodes. Add 'err' initialization back. Fixes: 236c32eb1096 ("mm: migrate: clean up migrate_prep{_local}") Cc: Zi Yan Cc: Yang Shi Cc: Jan Kara Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Song Liu Cc: Andrew Morton Signed-off-by: Jan Stancek --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8cf96bd21341..2c3a86502053 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -,7 +,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { int busy = 0; - int err; + int err = 0; nodemask_t tmp; migrate_prep(); -- 2.18.1
Re: [LTP] [PATCH] selftests: vdso: hash entry size on alpha, s390x is 8 bytes
- Original Message - > > - Original Message - > > Hi! > > As much as it's worth the changes looks good to me. > > > > @Jan: I guess that we can as well fix this in LTP first then we can try > > to get the kernel version fixed... > > Fine by me, I'll give it couple more days then push it. Pushed.
Re: [LTP] [PATCH] selftests: vdso: hash entry size on alpha, s390x is 8 bytes
- Original Message - > Hi! > As much as it's worth the changes looks good to me. > > @Jan: I guess that we can as well fix this in LTP first then we can try > to get the kernel version fixed... Fine by me, I'll give it couple more days then push it. I did repost it with original author CC-ed: https://lore.kernel.org/lkml/93a07b1808e61babef3a20542cbeb4565d3c8410.1596458924.git.jstan...@redhat.com/ but haven't heard back yet.
[PATCH RESEND] selftests: vdso: hash entry size on alpha,s390x is 8 bytes
parse_vdso.c is crashing on 5.8-rc5 s390x, because it wrongly reads nbucket as 0: Program received signal SIGFPE, Arithmetic exception. 0x01000f3e in vdso_sym (version=0x1001280 "LINUX_2.6", name=0x100128a "__vdso_getcpu") at parse_vdso.c:207 207 ELF(Word) chain = vdso_info.bucket[elf_hash(name) % vdso_info.nbucket]; (gdb) p vdso_info.nbucket $1 = 0 Per readelf source: https://sourceware.org/git/?p=binutils-gdb.git;a=blob;f=binutils/readelf.c;h=2406304fe35a832ac53aa7b1a367f3f7afed4264;hb=HEAD#l10027 and objdump output hash entries are double size on 64bit s390 and alpha: 0120 <.hash>: 120: 00 00 00 00 .long 0x 124: 00 00 00 03 .long 0x0003 128: 00 00 00 00 .long 0x 12c: 00 00 00 07 .long 0x0007 130: 00 00 00 00 .long 0x 134: 00 00 00 01 .long 0x0001 138: 00 00 00 00 .long 0x 13c: 00 00 00 05 .long 0x0005 140: 00 00 00 00 .long 0x 144: 00 00 00 06 .long 0x0006 ... 16c: 00 00 00 02 .long 0x0002 170: 00 00 00 00 .long 0x 174: 00 00 00 03 .long 0x0003 178: 00 00 00 00 .long 0x 17c: 00 00 00 04 .long 0x0004 Do similar check in parse_vdso.c and treat hash entries as double word. Signed-off-by: Jan Stancek --- tools/testing/selftests/vDSO/parse_vdso.c | 48 +++ 1 file changed, 40 insertions(+), 8 deletions(-) Resend with original author CC-ed. diff --git a/tools/testing/selftests/vDSO/parse_vdso.c b/tools/testing/selftests/vDSO/parse_vdso.c index 413f75620a35..e23efcbd1c88 100644 --- a/tools/testing/selftests/vDSO/parse_vdso.c +++ b/tools/testing/selftests/vDSO/parse_vdso.c @@ -47,8 +47,9 @@ static struct vdso_info /* Symbol table */ ELF(Sym) *symtab; const char *symstrings; - ELF(Word) *bucket, *chain; + void *bucket, *chain; ELF(Word) nbucket, nchain; + bool hash_ent_is_dword; /* Version table */ ELF(Versym) *versym; @@ -69,6 +70,28 @@ static unsigned long elf_hash(const unsigned char *name) return h; } +/* return value of hash table entry */ +ELF(Word) get_hash_val(void *ptr, ELF(Word) idx) +{ + if (vdso_info.hash_ent_is_dword) { + ELF(Xword) *table = ptr; + /* for vdso assume all values fit in Elf Word */ + return (ELF(Word)) table[idx]; + } + + ELF(Word) *table = ptr; + return table[idx]; +} + +/* return pointer to hash table entry */ +void *get_hash_ptr(void *ptr, ELF(Word) idx) +{ + if (vdso_info.hash_ent_is_dword) + return &((ELF(Xword) *) ptr)[idx]; + + return &((ELF(Word) *) ptr)[idx]; +} + void vdso_init_from_sysinfo_ehdr(uintptr_t base) { size_t i; @@ -84,6 +107,14 @@ void vdso_init_from_sysinfo_ehdr(uintptr_t base) return; /* Wrong ELF class -- check ELF_BITS */ } + /* 64bit s390 and alpha have hash entry size of 8 bytes */ + if ((hdr->e_machine == EM_ALPHA + || hdr->e_machine == EM_S390) + && hdr->e_ident[EI_CLASS] == ELFCLASS64) + vdso_info.hash_ent_is_dword = true; + else + vdso_info.hash_ent_is_dword = false; + ELF(Phdr) *pt = (ELF(Phdr)*)(vdso_info.load_addr + hdr->e_phoff); ELF(Dyn) *dyn = 0; @@ -149,11 +180,11 @@ void vdso_init_from_sysinfo_ehdr(uintptr_t base) if (!vdso_info.verdef) vdso_info.versym = 0; - /* Parse the hash table header. */ - vdso_info.nbucket = hash[0]; - vdso_info.nchain = hash[1]; - vdso_info.bucket = [2]; - vdso_info.chain = [vdso_info.nbucket + 2]; + + vdso_info.nbucket = get_hash_val(hash, 0); + vdso_info.nchain = get_hash_val(hash, 1); + vdso_info.bucket = get_hash_ptr(hash, 2); + vdso_info.chain = get_hash_ptr(hash, vdso_info.nbucket + 2); /* That's all we need. */ vdso_info.valid = true; @@ -204,9 +235,10 @@ void *vdso_sym(const char *version, const char *name) return 0; ver_hash = elf_hash(version); - ELF(Word) chain = vdso_info.bucket[elf_hash(name) % vdso_info.nbucket]; + ELF(Word) chain = get_hash_val(vdso_info.bucket, + elf_hash(name) % vdso_info.nbucket); - for (; chain != STN_UNDEF; chain = vdso_info.chain[chain]) { + for (; chain != STN_UNDEF; chain = get_hash_val(vdso_info.chain, chain)) { ELF(Sym) *sym = _info.symtab[chain]; /* Check for a defined global or weak function w/ right name. */ -- 2.18.1
Re: [PATCH] selftests: vdso: hash entry size on alpha,s390x is 8 bytes
- Original Message - > parse_vdso.c is crashing on 5.8-rc5 s390x, because it wrongly reads > nbucket as 0: > Program received signal SIGFPE, Arithmetic exception. > 0x01000f3e in vdso_sym (version=0x1001280 "LINUX_2.6", > name=0x100128a "__vdso_getcpu") at parse_vdso.c:207 > 207 ELF(Word) chain = vdso_info.bucket[elf_hash(name) % > vdso_info.nbucket]; > (gdb) p vdso_info.nbucket > $1 = 0 > > Per readelf source: > > https://sourceware.org/git/?p=binutils-gdb.git;a=blob;f=binutils/readelf.c;h=2406304fe35a832ac53aa7b1a367f3f7afed4264;hb=HEAD#l10027 > and objdump output hash entries are double size on 64bit s390 and alpha: > 0120 <.hash>: >120: 00 00 00 00 .long 0x >124: 00 00 00 03 .long 0x0003 >128: 00 00 00 00 .long 0x >12c: 00 00 00 07 .long 0x0007 >130: 00 00 00 00 .long 0x >134: 00 00 00 01 .long 0x0001 >138: 00 00 00 00 .long 0x >13c: 00 00 00 05 .long 0x0005 >140: 00 00 00 00 .long 0x >144: 00 00 00 06 .long 0x0006 > ... >16c: 00 00 00 02 .long 0x0002 >170: 00 00 00 00 .long 0x >174: 00 00 00 03 .long 0x0003 >178: 00 00 00 00 .long 0x >17c: 00 00 00 04 .long 0x0004 > > Do similar check in parse_vdso.c and treat hash entries as double word. Ping, any thoughts about the issue or patch?
[PATCH] selftests: vdso: hash entry size on alpha,s390x is 8 bytes
parse_vdso.c is crashing on 5.8-rc5 s390x, because it wrongly reads nbucket as 0: Program received signal SIGFPE, Arithmetic exception. 0x01000f3e in vdso_sym (version=0x1001280 "LINUX_2.6", name=0x100128a "__vdso_getcpu") at parse_vdso.c:207 207 ELF(Word) chain = vdso_info.bucket[elf_hash(name) % vdso_info.nbucket]; (gdb) p vdso_info.nbucket $1 = 0 Per readelf source: https://sourceware.org/git/?p=binutils-gdb.git;a=blob;f=binutils/readelf.c;h=2406304fe35a832ac53aa7b1a367f3f7afed4264;hb=HEAD#l10027 and objdump output hash entries are double size on 64bit s390 and alpha: 0120 <.hash>: 120: 00 00 00 00 .long 0x 124: 00 00 00 03 .long 0x0003 128: 00 00 00 00 .long 0x 12c: 00 00 00 07 .long 0x0007 130: 00 00 00 00 .long 0x 134: 00 00 00 01 .long 0x0001 138: 00 00 00 00 .long 0x 13c: 00 00 00 05 .long 0x0005 140: 00 00 00 00 .long 0x 144: 00 00 00 06 .long 0x0006 ... 16c: 00 00 00 02 .long 0x0002 170: 00 00 00 00 .long 0x 174: 00 00 00 03 .long 0x0003 178: 00 00 00 00 .long 0x 17c: 00 00 00 04 .long 0x0004 Do similar check in parse_vdso.c and treat hash entries as double word. Signed-off-by: Jan Stancek --- tools/testing/selftests/vDSO/parse_vdso.c | 48 +++ 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/vDSO/parse_vdso.c b/tools/testing/selftests/vDSO/parse_vdso.c index 413f75620a35..e23efcbd1c88 100644 --- a/tools/testing/selftests/vDSO/parse_vdso.c +++ b/tools/testing/selftests/vDSO/parse_vdso.c @@ -47,8 +47,9 @@ static struct vdso_info /* Symbol table */ ELF(Sym) *symtab; const char *symstrings; - ELF(Word) *bucket, *chain; + void *bucket, *chain; ELF(Word) nbucket, nchain; + bool hash_ent_is_dword; /* Version table */ ELF(Versym) *versym; @@ -69,6 +70,28 @@ static unsigned long elf_hash(const unsigned char *name) return h; } +/* return value of hash table entry */ +ELF(Word) get_hash_val(void *ptr, ELF(Word) idx) +{ + if (vdso_info.hash_ent_is_dword) { + ELF(Xword) *table = ptr; + /* for vdso assume all values fit in Elf Word */ + return (ELF(Word)) table[idx]; + } + + ELF(Word) *table = ptr; + return table[idx]; +} + +/* return pointer to hash table entry */ +void *get_hash_ptr(void *ptr, ELF(Word) idx) +{ + if (vdso_info.hash_ent_is_dword) + return &((ELF(Xword) *) ptr)[idx]; + + return &((ELF(Word) *) ptr)[idx]; +} + void vdso_init_from_sysinfo_ehdr(uintptr_t base) { size_t i; @@ -84,6 +107,14 @@ void vdso_init_from_sysinfo_ehdr(uintptr_t base) return; /* Wrong ELF class -- check ELF_BITS */ } + /* 64bit s390 and alpha have hash entry size of 8 bytes */ + if ((hdr->e_machine == EM_ALPHA + || hdr->e_machine == EM_S390) + && hdr->e_ident[EI_CLASS] == ELFCLASS64) + vdso_info.hash_ent_is_dword = true; + else + vdso_info.hash_ent_is_dword = false; + ELF(Phdr) *pt = (ELF(Phdr)*)(vdso_info.load_addr + hdr->e_phoff); ELF(Dyn) *dyn = 0; @@ -149,11 +180,11 @@ void vdso_init_from_sysinfo_ehdr(uintptr_t base) if (!vdso_info.verdef) vdso_info.versym = 0; - /* Parse the hash table header. */ - vdso_info.nbucket = hash[0]; - vdso_info.nchain = hash[1]; - vdso_info.bucket = [2]; - vdso_info.chain = [vdso_info.nbucket + 2]; + + vdso_info.nbucket = get_hash_val(hash, 0); + vdso_info.nchain = get_hash_val(hash, 1); + vdso_info.bucket = get_hash_ptr(hash, 2); + vdso_info.chain = get_hash_ptr(hash, vdso_info.nbucket + 2); /* That's all we need. */ vdso_info.valid = true; @@ -204,9 +235,10 @@ void *vdso_sym(const char *version, const char *name) return 0; ver_hash = elf_hash(version); - ELF(Word) chain = vdso_info.bucket[elf_hash(name) % vdso_info.nbucket]; + ELF(Word) chain = get_hash_val(vdso_info.bucket, + elf_hash(name) % vdso_info.nbucket); - for (; chain != STN_UNDEF; chain = vdso_info.chain[chain]) { + for (; chain != STN_UNDEF; chain = get_hash_val(vdso_info.chain, chain)) { ELF(Sym) *sym = _info.symtab[chain]; /* Check for a defined global or weak function w/ right name. */ -- 2.18.1
Re: [LTP] 303cc571d1: ltp.setns01.fail
- Original Message - > I'll send a pr for this to Linus this week (or next since I'm on > vacation this week) and get this fixed. Thanks for spotting this. What's > the Reported-by: line format that LTP uses? I'm not sure we ever used one, it's usually various CI systems that reference LTP. This thread has been started by kernel test robot: Reported-by: kernel test robot
Re: [LTP] 303cc571d1: ltp.setns01.fail
- Original Message - > Hi! > > setns01 6 TFAIL : setns01.c:176: regular file fd exp_errno=22: > > errno=EBADF(9): Bad file descriptor > > setns01 0 TINFO : setns(12, 0x2) > > setns01 7 TFAIL : setns01.c:176: regular file fd exp_errno=22: > > errno=EBADF(9): Bad file descriptor > > setns01 0 TINFO : setns(12, 0x4000) > > setns01 8 TFAIL : setns01.c:176: regular file fd exp_errno=22: > > errno=EBADF(9): Bad file descriptor > > setns01 0 TINFO : setns(12, 0x2000) > > setns01 9 TFAIL : setns01.c:176: regular file fd exp_errno=22: > > errno=EBADF(9): Bad file descriptor > > setns01 0 TINFO : setns(12, 0x400) > > setns0110 TFAIL : setns01.c:176: regular file fd exp_errno=22: > > errno=EBADF(9): Bad file descriptor > > The messages here are a bit cryptic, I will fix that later on, but what > it means here is that the errno has changed from EINVAL to EBADF in a > case we pass file descriptor to a regular file to setns(). I posted a series that accepts both errnos about week ago: https://lists.linux.it/pipermail/ltp/2020-June/017467.html
Re: LTP: syscalls: regression on mainline - ioctl_loop01 mknod07 setns01
- Original Message - > Following three test cases reported as regression on Linux mainline kernel > on x86_64, arm64, arm and i386 > > ltp-syscalls-tests: > * ioctl_loop01 > * mknod07 Test updated: https://github.com/linux-test-project/ltp/commit/13fcfa2d6bdd1fb71c4528b47170e8e8fb3a8a32 > * setns01 commit 303cc571d107 ("nsproxy: attach to namespaces via pidfds") changed errno that is returned for regular file from EINVAL to EBADF. This appears to fit more current man page, so I think we need to fix test to accept both. (I'm looking into that)
[bug?] LTP pt_test failing after 38bb8d77d0b9 "perf/x86/intel/pt: Split ToPA metadata and page layout"
Hi, All variants of pt_test: https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/tracing/pt_test/pt_test.c started failing after: 38bb8d77d0b9 ("perf/x86/intel/pt: Split ToPA metadata and page layout") with following error on console/dmesg: pt: ToPA ERROR encountered, trying to recover on Broadwell-EP: vendor_id : GenuineIntel cpu family : 6 model : 79 model name : Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz stepping: 1 Regards, Jan
Re: [PATCH] fat: fix corruption in fat_alloc_new_dir()
- Original Message - > Jan Stancek writes: > > > sb_getblk does not guarantee that buffer_head is uptodate. If there is > > async read running in parallel for same buffer_head, it can overwrite > > just initialized msdos_dir_entry, leading to corruption: > > FAT-fs (loop0): error, corrupted directory (invalid entries) > > FAT-fs (loop0): Filesystem has been set read-only > > > > This can happen for example during LTP statx04, which creates loop > > device, formats it (mkfs.vfat), mounts it and immediately creates > > a new directory. In parallel, systemd-udevd is probing new block > > device, which leads to async read. > > > > do_mkdirat ksys_read > >vfs_mkdir vfs_read > > vfat_mkdir __vfs_read > > fat_alloc_new_dir new_sync_read > >/* init de[0], de[1] */blkdev_read_iter > >generic_file_read_iter > > generic_file_buffered_read > > blkdev_readpage > > block_read_full_page > > > > Faster reproducer (based on LTP statx04): > > > > int main(void) > > { > > int i, j, ret, fd, loop_fd, ctrl_fd; > > int loop_num; > > char loopdev[256], tmp[256], testfile[256]; > > > > mkdir("/tmp/mntpoint", 0777); > > for (i = 0; ; i++) { > > printf("Iteration: %d\n", i); > > sprintf(testfile, "/tmp/test.img.%d", getpid()); > > > > ctrl_fd = open("/dev/loop-control", O_RDWR); > > loop_num = ioctl(ctrl_fd, LOOP_CTL_GET_FREE); > > close(ctrl_fd); > > sprintf(loopdev, "/dev/loop%d", loop_num); > > > > fd = open(testfile, O_WRONLY|O_CREAT|O_TRUNC, 0600); > > fallocate(fd, 0, 0, 256*1024*1024); > > close(fd); > > > > fd = open(testfile, O_RDWR); > > loop_fd = open(loopdev, O_RDWR); > > ioctl(loop_fd, LOOP_SET_FD, fd); > > close(loop_fd); > > close(fd); > > > > sprintf(tmp, "mkfs.vfat %s", loopdev); > > system(tmp); > > mount(loopdev, "/tmp/mntpoint", "vfat", 0, NULL); > > > > for (j = 0; j < 200; j++) { > > sprintf(tmp, "/tmp/mntpoint/testdir%d", j); > > ret = mkdir(tmp, 0777); > > if (ret) { > > perror("mkdir"); > > break; > > } > > } > > > > umount("/tmp/mntpoint"); > > loop_fd = open(loopdev, O_RDWR); > > ioctl(loop_fd, LOOP_CLR_FD, fd); > > close(loop_fd); > > unlink(testfile); > > > > if (ret) > > break; > > } > > > > return 0; > > } > > > > Issue triggers within minute on HPE Apollo 70 (arm64, 64GB RAM, 224 CPUs). > > Using the device while mounting same device doesn't work reliably like > this race. (getblk() is intentionally used to get the buffer to write > new data.) Are you saying this is expected even if 'usage' is just read? > > mount(2) internally opens the device by EXCL mode, so I guess udev opens > without EXCL (I dont know if it is intent or not). I gave this a try and added O_EXCL to udev-builtin-blkid.c. My system had trouble booting, it was getting stuck on mounting LVM volumes. So, I'm not sure how to move forward here. Regards, Jan
[tip: x86/urgent] x86/timer: Force PIT initialization when !X86_FEATURE_ARAT
The following commit has been merged into the x86/urgent branch of tip: Commit-ID: afa8b475c1aec185a8e106c48b3832e0b88bc2de Gitweb: https://git.kernel.org/tip/afa8b475c1aec185a8e106c48b3832e0b88bc2de Author:Jan Stancek AuthorDate:Sun, 08 Sep 2019 00:50:40 +02:00 Committer: Thomas Gleixner CommitterDate: Sun, 08 Sep 2019 09:01:15 +02:00 x86/timer: Force PIT initialization when !X86_FEATURE_ARAT KVM guests with commit c8c4076723da ("x86/timer: Skip PIT initialization on modern chipsets") applied to guest kernel have been observed to have unusually higher CPU usage with symptoms of increase in vm exits for HLT and MSW_WRITE (MSR_IA32_TSCDEADLINE). This is caused by older QEMUs lacking support for X86_FEATURE_ARAT. lapic clock retains CLOCK_EVT_FEAT_C3STOP and nohz stays inactive. There's no usable broadcast device either. Do the PIT initialization if guest CPU lacks X86_FEATURE_ARAT. On real hardware it shouldn't matter as ARAT and DEADLINE come together. Fixes: c8c4076723da ("x86/timer: Skip PIT initialization on modern chipsets") Signed-off-by: Jan Stancek Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/apic.c | 4 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index dba2828..f91b3ff 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -834,6 +834,10 @@ bool __init apic_needs_pit(void) if (!boot_cpu_has(X86_FEATURE_APIC)) return true; + /* Virt guests may lack ARAT, but still have DEADLINE */ + if (!boot_cpu_has(X86_FEATURE_ARAT)) + return true; + /* Deadline timer is based on TSC so no further PIT action required */ if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) return false;
[tip: x86/urgent] x86/timer: Force PIT initialization when !X86_FEATURE_ARAT
The following commit has been merged into the x86/urgent branch of tip: Commit-ID: afa8b475c1aec185a8e106c48b3832e0b88bc2de Gitweb: https://git.kernel.org/tip/afa8b475c1aec185a8e106c48b3832e0b88bc2de Author:Jan Stancek AuthorDate:Sun, 08 Sep 2019 00:50:40 +02:00 Committer: Thomas Gleixner CommitterDate: Sun, 08 Sep 2019 09:01:15 +02:00 x86/timer: Force PIT initialization when !X86_FEATURE_ARAT KVM guests with commit c8c4076723da ("x86/timer: Skip PIT initialization on modern chipsets") applied to guest kernel have been observed to have unusually higher CPU usage with symptoms of increase in vm exits for HLT and MSW_WRITE (MSR_IA32_TSCDEADLINE). This is caused by older QEMUs lacking support for X86_FEATURE_ARAT. lapic clock retains CLOCK_EVT_FEAT_C3STOP and nohz stays inactive. There's no usable broadcast device either. Do the PIT initialization if guest CPU lacks X86_FEATURE_ARAT. On real hardware it shouldn't matter as ARAT and DEADLINE come together. Fixes: c8c4076723da ("x86/timer: Skip PIT initialization on modern chipsets") Signed-off-by: Jan Stancek Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/apic.c | 4 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index dba2828..f91b3ff 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -834,6 +834,10 @@ bool __init apic_needs_pit(void) if (!boot_cpu_has(X86_FEATURE_APIC)) return true; + /* Virt guests may lack ARAT, but still have DEADLINE */ + if (!boot_cpu_has(X86_FEATURE_ARAT)) + return true; + /* Deadline timer is based on TSC so no further PIT action required */ if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) return false;
[PATCH] fat: fix corruption in fat_alloc_new_dir()
sb_getblk does not guarantee that buffer_head is uptodate. If there is async read running in parallel for same buffer_head, it can overwrite just initialized msdos_dir_entry, leading to corruption: FAT-fs (loop0): error, corrupted directory (invalid entries) FAT-fs (loop0): Filesystem has been set read-only This can happen for example during LTP statx04, which creates loop device, formats it (mkfs.vfat), mounts it and immediately creates a new directory. In parallel, systemd-udevd is probing new block device, which leads to async read. do_mkdirat ksys_read vfs_mkdir vfs_read vfat_mkdir __vfs_read fat_alloc_new_dir new_sync_read /* init de[0], de[1] */blkdev_read_iter generic_file_read_iter generic_file_buffered_read blkdev_readpage block_read_full_page Faster reproducer (based on LTP statx04): - 8< - int main(void) { int i, j, ret, fd, loop_fd, ctrl_fd; int loop_num; char loopdev[256], tmp[256], testfile[256]; mkdir("/tmp/mntpoint", 0777); for (i = 0; ; i++) { printf("Iteration: %d\n", i); sprintf(testfile, "/tmp/test.img.%d", getpid()); ctrl_fd = open("/dev/loop-control", O_RDWR); loop_num = ioctl(ctrl_fd, LOOP_CTL_GET_FREE); close(ctrl_fd); sprintf(loopdev, "/dev/loop%d", loop_num); fd = open(testfile, O_WRONLY|O_CREAT|O_TRUNC, 0600); fallocate(fd, 0, 0, 256*1024*1024); close(fd); fd = open(testfile, O_RDWR); loop_fd = open(loopdev, O_RDWR); ioctl(loop_fd, LOOP_SET_FD, fd); close(loop_fd); close(fd); sprintf(tmp, "mkfs.vfat %s", loopdev); system(tmp); mount(loopdev, "/tmp/mntpoint", "vfat", 0, NULL); for (j = 0; j < 200; j++) { sprintf(tmp, "/tmp/mntpoint/testdir%d", j); ret = mkdir(tmp, 0777); if (ret) { perror("mkdir"); break; } } umount("/tmp/mntpoint"); loop_fd = open(loopdev, O_RDWR); ioctl(loop_fd, LOOP_CLR_FD, fd); close(loop_fd); unlink(testfile); if (ret) break; } return 0; } - 8< ------------- Issue triggers within minute on HPE Apollo 70 (arm64, 64GB RAM, 224 CPUs). Signed-off-by: Jan Stancek Cc: OGAWA Hirofumi --- fs/fat/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 1bda2ab6745b..474fd6873ec8 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -1149,7 +1149,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts) goto error; blknr = fat_clus_to_blknr(sbi, cluster); - bhs[0] = sb_getblk(sb, blknr); + bhs[0] = sb_bread(sb, blknr); if (!bhs[0]) { err = -ENOMEM; goto error_free; -- 1.8.3.1
Re: [PATCH v5.2 1/2] locking/rwsem: Add missing ACQUIRE to read_slowpath exit when queue is empty
- Original Message - > From: Jan Stancek > > [ Upstream commit e1b98fa316648420d0434d9ff5b92ad6609ba6c3 ] > > LTP mtest06 has been observed to occasionally hit "still mapped when > deleted" and following BUG_ON on arm64. > > The extra mapcount originated from pagefault handler, which handled > pagefault for vma that has already been detached. vma is detached > under mmap_sem write lock by detach_vmas_to_be_unmapped(), which > also invalidates vmacache. > > When the pagefault handler (under mmap_sem read lock) calls > find_vma(), vmacache_valid() wrongly reports vmacache as valid. > > After rwsem down_read() returns via 'queue empty' path (as of v5.2), > it does so without an ACQUIRE on sem->count: > > down_read() > __down_read() > rwsem_down_read_failed() > __rwsem_down_read_failed_common() > raw_spin_lock_irq(>wait_lock); > if (list_empty(>wait_list)) { > if (atomic_long_read(>count) >= 0) { > raw_spin_unlock_irq(>wait_lock); > return sem; > > The problem can be reproduced by running LTP mtest06 in a loop and > building the kernel (-j $NCPUS) in parallel. It does reproduces since > v4.20 on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It > triggers reliably in about an hour. > > The patched kernel ran fine for 10+ hours. > > Signed-off-by: Jan Stancek > Signed-off-by: Peter Zijlstra (Intel) > Reviewed-by: Will Deacon > Acked-by: Waiman Long > Cc: Linus Torvalds > Cc: Peter Zijlstra > Cc: Thomas Gleixner > Cc: dbu...@suse.de > Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty & > no writer") > Link: > https://lkml.kernel.org/r/50b8914e20d1d62bb2dee42d342836c2c16ebee7.1563438048.git.jstan...@redhat.com > Signed-off-by: Ingo Molnar > Signed-off-by: Sasha Levin > --- > > This is a backport for the v5.2 stable tree. There were multiple reports > of this issue being hit. > > Given that there were a few changes to the code around this, I'd > appreciate an ack before pulling it in. ACK, both look good to me. I also re-ran reproducer with this series applied on top of 5.2.10, it PASS-ed. Thanks, Jan
Re: Linux-next-20190823: x86_64/i386: prot_hsymlinks.c:325: Failed to run cmd: useradd hsym
- Original Message - > On Tue, 2019-08-27 at 06:25 -0400, Jan Stancek wrote: > > That theory is probably not correct for this case, since EIO I see > > appears > > to originate from write and nfs_writeback_result(). This function > > also > > produces message we saw in logs from Naresh. > > > > I can't find where/how is resp->count updated on WRITE reply in > > NFSv2. > > Issue also goes away with patch below, though I can't speak about its > > correctness: > > > > NFS version TypeTestReturn code > > nfsvers=2 tcp -b:base 0 > > nfsvers=2 tcp -g:general 0 > > nfsvers=2 tcp -s:special 0 > > nfsvers=2 tcp -l:lock 0 > > Total time: 141 > > > > diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c > > index cbc17a203248..4913c6da270b 100644 > > --- a/fs/nfs/nfs2xdr.c > > +++ b/fs/nfs/nfs2xdr.c > > @@ -897,6 +897,16 @@ static int nfs2_xdr_dec_writeres(struct rpc_rqst > > *req, struct xdr_stream *xdr, > > void *data) > > { > > struct nfs_pgio_res *result = data; > > + struct rpc_task *rq_task = req->rq_task; > > + > > + if (rq_task) { > > + struct nfs_pgio_args *args = rq_task- > > >tk_msg.rpc_argp; > > + > > + if (args) { > > + result->count = args->count; > > + } > > + } > > > > /* All NFSv2 writes are "file sync" writes */ > > result->verf->committed = NFS_FILE_SYNC; > > Thanks! I've moved the above to nfs_write_done() so that we do it only > on success (see > http://git.linux-nfs.org/?p=trondmy/linux-nfs.git;a=commitdiff;h=3ba5688da709dd0f7d917029c206bc1848a6ae74 > ) Thanks, retested with 3ba5688da, all PASS: NFS version TypeTestReturn code nfsvers=2 tcp -b:base 0 nfsvers=2 tcp -g:general 0 nfsvers=2 tcp -s:special 0 nfsvers=2 tcp -l:lock 0 NFS version TypeTestReturn code nfsvers=3 tcp -b:base 0 nfsvers=3 tcp -g:general 0 nfsvers=3 tcp -s:special 0 nfsvers=3 tcp -l:lock 0 nfsvers=3 tcp6-b:base 0 nfsvers=3 tcp6-g:general 0 nfsvers=3 tcp6-s:special 0 nfsvers=3 tcp6-l:lock 0 NFS version TypeTestReturn code nfsvers=4 tcp -b:base 0 nfsvers=4 tcp -g:general 0 nfsvers=4 tcp -s:special 0 nfsvers=4 tcp -l:lock 0 nfsvers=4 tcp6-b:base 0 nfsvers=4 tcp6-g:general 0 nfsvers=4 tcp6-s:special 0 nfsvers=4 tcp6-l:lock 0 Feel free to add also: Reported-by: Naresh Kamboju Tested-by: Jan Stancek
Re: Linux-next-20190823: x86_64/i386: prot_hsymlinks.c:325: Failed to run cmd: useradd hsym
- Original Message - > On Mon, 2019-08-26 at 19:12 -0400, Jan Stancek wrote: > > - Original Message - > > > On Mon, 2019-08-26 at 10:38 -0400, Jan Stancek wrote: > > > > - Original Message - > > > > > Hi Jan and Cyril, > > > > > > > > > > On Mon, 26 Aug 2019 at 16:35, Jan Stancek > > > > > wrote: > > > > > > > > > > > > - Original Message - > > > > > > > Hi! > > > > > > > > Do you see this LTP prot_hsymlinks failure on linux next > > > > > > > > 20190823 on > > > > > > > > x86_64 and i386 devices? > > > > > > > > > > > > > > > > test output log, > > > > > > > > useradd: failure while writing changes to /etc/passwd > > > > > > > > useradd: /home/hsym was created, but could not be removed > > > > > > > > > > > > > > This looks like an unrelated problem, failure to write to > > > > > > > /etc/passwd > > > > > > > probably means that filesystem is full or some problem > > > > > > > happend > > > > > > > and how > > > > > > > is remounted RO. > > > > > > > > > > > > In Naresh' example, root is on NFS: > > > > > > root=/dev/nfs rw > > > > > > > > > > > > nfsroot=10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extr > > > > > > act- > > > > > > nfsrootfs-tyuevoxm,tcp,hard,intr > > > > > > > > > > Right ! > > > > > root is mounted on NFS. > > > > > > > > > > > 10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract- > > > > > > nfsrootfs-tyuevoxm > > > > > > on / type nfs > > > > > > (rw,relatime,vers=2,rsize=4096,wsize=4096,namlen=255,hard,nol > > > > > > ock, > > > > > > proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.66.16.123, > > > > > > moun > > > > > > tvers=1,mountproto=tcp,local_lock=all,addr=10.66.16.123) > > > > > > devtmpfs on /dev type devtmpfs > > > > > > (rw,relatime,size=3977640k,nr_inodes=994410,mode=755) > > > > > > > > > > > > The only thing I can think of that might cause an EIO on NFSv2 > > > would be > > > this patch > > > http://git.linux-nfs.org/?p=trondmy/linux-nfs.git;a=commitdiff;h=627d48e597ec5993c4abb3b81dc75e554a07c7c0 > > > assuming that a bind-related error is leaking through. > > > > > > I'd suggest something like the following to fix it up: > > > > No change with that patch, > > but following one fixes it for me: > > > > diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c > > index 20b3717cd7ca..56cefa0ab804 100644 > > --- a/fs/nfs/pagelist.c > > +++ b/fs/nfs/pagelist.c > > @@ -590,7 +590,7 @@ static void nfs_pgio_rpcsetup(struct > > nfs_pgio_header *hdr, > > } > > > > hdr->res.fattr = >fattr; > > - hdr->res.count = 0; > > + hdr->res.count = count; > > hdr->res.eof = 0; > > hdr->res.verf= >verf; > > nfs_fattr_init(>fattr); > > > > which is functionally revert of "NFS: Fix initialisation of I/O > > result struct in nfs_pgio_rpcsetup". > > > > This hunk caught my eye, could res.eof == 0 explain those I/O errors? > > Interesting hypothesis. It could if res.count ends up being 0. So does > the following also fix the problem? It didn't fix it. That theory is probably not correct for this case, since EIO I see appears to originate from write and nfs_writeback_result(). This function also produces message we saw in logs from Naresh. I can't find where/how is resp->count updated on WRITE reply in NFSv2. Issue also goes away with patch below, though I can't speak about its correctness: NFS version TypeTestReturn code nfsvers=2 tcp -b:base 0 nfsvers=2 tcp -g:general 0 nfsvers=2 tcp -s:special 0 nfsvers=2 tcp -l:lock 0 Total time: 141 diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index cbc17a203248..4913c6da270b 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -897,6 +897,16 @@ static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr, void *data) { struct nfs_pgio_res *result = data; + struct rpc_task *rq_task = req->rq_task; + + if (rq_task) { + struct nfs_pgio_args *args = rq_task->tk_msg.rpc_argp; + + if (args) { + result->count = args->count; + } + } /* All NFSv2 writes are "file sync" writes */ result->verf->committed = NFS_FILE_SYNC;
Re: Linux-next-20190823: x86_64/i386: prot_hsymlinks.c:325: Failed to run cmd: useradd hsym
- Original Message - > On Mon, 2019-08-26 at 10:38 -0400, Jan Stancek wrote: > > - Original Message - > > > Hi Jan and Cyril, > > > > > > On Mon, 26 Aug 2019 at 16:35, Jan Stancek > > > wrote: > > > > > > > > > > > > - Original Message - > > > > > Hi! > > > > > > Do you see this LTP prot_hsymlinks failure on linux next > > > > > > 20190823 on > > > > > > x86_64 and i386 devices? > > > > > > > > > > > > test output log, > > > > > > useradd: failure while writing changes to /etc/passwd > > > > > > useradd: /home/hsym was created, but could not be removed > > > > > > > > > > This looks like an unrelated problem, failure to write to > > > > > /etc/passwd > > > > > probably means that filesystem is full or some problem happend > > > > > and how > > > > > is remounted RO. > > > > > > > > In Naresh' example, root is on NFS: > > > > root=/dev/nfs rw > > > > > > > > nfsroot=10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract- > > > > nfsrootfs-tyuevoxm,tcp,hard,intr > > > > > > Right ! > > > root is mounted on NFS. > > > > > > > 10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract- > > > > nfsrootfs-tyuevoxm > > > > on / type nfs > > > > (rw,relatime,vers=2,rsize=4096,wsize=4096,namlen=255,hard,nolock, > > > > proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.66.16.123,moun > > > > tvers=1,mountproto=tcp,local_lock=all,addr=10.66.16.123) > > > > devtmpfs on /dev type devtmpfs > > > > (rw,relatime,size=3977640k,nr_inodes=994410,mode=755) > > > > > > The only thing I can think of that might cause an EIO on NFSv2 would be > this patch > http://git.linux-nfs.org/?p=trondmy/linux-nfs.git;a=commitdiff;h=627d48e597ec5993c4abb3b81dc75e554a07c7c0 > assuming that a bind-related error is leaking through. > > I'd suggest something like the following to fix it up: No change with that patch, but following one fixes it for me: diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 20b3717cd7ca..56cefa0ab804 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -590,7 +590,7 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, } hdr->res.fattr = >fattr; - hdr->res.count = 0; + hdr->res.count = count; hdr->res.eof = 0; hdr->res.verf= >verf; nfs_fattr_init(>fattr); which is functionally revert of "NFS: Fix initialisation of I/O result struct in nfs_pgio_rpcsetup". This hunk caught my eye, could res.eof == 0 explain those I/O errors? /* Emulate the eof flag, which isn't normally needed in NFSv2 * as it is guaranteed to always return the file attributes */ if (hdr->args.offset + hdr->res.count >= hdr->res.fattr->size) hdr->res.eof = 1;
Re: Linux-next-20190823: x86_64/i386: prot_hsymlinks.c:325: Failed to run cmd: useradd hsym
- Original Message - > Hi Jan and Cyril, > > On Mon, 26 Aug 2019 at 16:35, Jan Stancek wrote: > > > > > > > > - Original Message - > > > Hi! > > > > Do you see this LTP prot_hsymlinks failure on linux next 20190823 on > > > > x86_64 and i386 devices? > > > > > > > > test output log, > > > > useradd: failure while writing changes to /etc/passwd > > > > useradd: /home/hsym was created, but could not be removed > > > > > > This looks like an unrelated problem, failure to write to /etc/passwd > > > probably means that filesystem is full or some problem happend and how > > > is remounted RO. > > > > In Naresh' example, root is on NFS: > > root=/dev/nfs rw > > > > nfsroot=10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract-nfsrootfs-tyuevoxm,tcp,hard,intr > > Right ! > root is mounted on NFS. > > > > > 10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract-nfsrootfs-tyuevoxm > > on / type nfs > > (rw,relatime,vers=2,rsize=4096,wsize=4096,namlen=255,hard,nolock,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.66.16.123,mountvers=1,mountproto=tcp,local_lock=all,addr=10.66.16.123) > > devtmpfs on /dev type devtmpfs > > (rw,relatime,size=3977640k,nr_inodes=994410,mode=755) > > > > Following message repeats couple times in logs: > > NFS: Server wrote zero bytes, expected XXX > > > > Naresh, can you check if there are any errors on NFS server side? > > I have re-tested the failed tests on next-20190822 and all get pass > which is also > using same NFS server [1] [2]. Thanks, that suggests some client side change between next-20190822 and next-20190823 might introduced it. > > > Maybe run NFS cthon against that server with client running next-20190822 > > and next-20190823. > > Thanks for the pointers. > I will setup and run NFS cthon on next-20190822 and next-20190823. I'll try to reproduce too. > > > > > > > > > I do not see the kernel messages from this job anywhere at the job > > > pages, is it stored somewhere? > > > > It appears to be mixed in same log file: > > > > https://qa-reports.linaro.org/lkft/linux-next-oe/build/next-20190823/testrun/886412/log > > For the record the following tests failed on linux -next-20190823 on x86_64 > and i386. The filesystem is mounted on NFS and tests are using > locally mounted hard drive ( with -d /scratch ). > > Juno-r2 device filesystem mounted on NFS and did not see these errors > and test getting pass on -next-20190823. > > These failures are reproducible all time on next-20190823 kernel on x86_64 > and i386 device with root mounted on NFS [3] [4] [5] [6]. > > I will git bisect to find out which is bad commit. > > prot_hsymlinks: [3] > -- > useradd: failure while writing changes to /etc/passwd > useradd: /home/hsym was created, but could not be removed > userdel: user 'hsym' does not exist > prot_hsymlinks1 TBROK : prot_hsymlinks.c:325: Failed to run > cmd: useradd hsym > prot_hsymlinks2 TBROK : prot_hsymlinks.c:325: Remaining cases broken > prot_hsymlinks3 TBROK : prot_hsymlinks.c:325: Failed to run > cmd: userdel -r hsym > prot_hsymlinks4 TBROK : tst_sig.c:234: unexpected signal > SIGIOT/SIGABRT(6) received (pid = 8324). > > logrotate01: [4] > - > compressing log with: /bin/gzip > error: error creating temp state file /var/lib/logrotate.status.tmp: > Input/output error > logrotate011 TFAIL : ltpapicmd.c:154: Test #1: logrotate > command exited with 1 return code. Output: > > sem_unlink_2-2: [5] > -- > make[3]: Entering directory > '/opt/ltp/testcases/open_posix_testsuite/conformance/interfaces/sem_unlink' > cat: write error: Input/output error > conformance/interfaces/sem_unlink/sem_unlink_2-2: execution: FAILED > > syslog{01 ...10} [6] > --- > cp: failed to close '/etc/syslog.conf.ltpback': Input/output error > syslog011 TBROK : ltpapicmd.c:188: failed to backup /etc/syslog.conf > > cp: failed to close '/etc/syslog.conf.ltpback': Input/output error > syslog021 TBROK : ltpapicmd.c:188: failed to backup /etc/syslog.conf > > ... > cp: failed to close '/etc/syslog.conf.ltpback': Input/output error > syslog101 TBROK : ltpapicmd.c:188: failed to backup /etc/syslog.conf > > ref: > PASS on 20190222: > [1] https://lkft.validation.linaro.org/scheduler/job/890446#L1232 > [2] https://lkft.validation.linaro.org/scheduler/job/890454 > > FAILED on 20190823: > [3] https://lkft.validation.linaro.org/scheduler/job/890404#L1245 > [4] https://lkft.validation.linaro.org/scheduler/job/886408#L2544 > [5] https://lkft.validation.linaro.org/scheduler/job/886409#L3088 > [6] https://lkft.validation.linaro.org/scheduler/job/890400#L1234 > > - Naresh >
Re: Linux-next-20190823: x86_64/i386: prot_hsymlinks.c:325: Failed to run cmd: useradd hsym
- Original Message - > Hi! > > Do you see this LTP prot_hsymlinks failure on linux next 20190823 on > > x86_64 and i386 devices? > > > > test output log, > > useradd: failure while writing changes to /etc/passwd > > useradd: /home/hsym was created, but could not be removed > > This looks like an unrelated problem, failure to write to /etc/passwd > probably means that filesystem is full or some problem happend and how > is remounted RO. In Naresh' example, root is on NFS: root=/dev/nfs rw nfsroot=10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract-nfsrootfs-tyuevoxm,tcp,hard,intr 10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract-nfsrootfs-tyuevoxm on / type nfs (rw,relatime,vers=2,rsize=4096,wsize=4096,namlen=255,hard,nolock,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.66.16.123,mountvers=1,mountproto=tcp,local_lock=all,addr=10.66.16.123) devtmpfs on /dev type devtmpfs (rw,relatime,size=3977640k,nr_inodes=994410,mode=755) Following message repeats couple times in logs: NFS: Server wrote zero bytes, expected XXX Naresh, can you check if there are any errors on NFS server side? Maybe run NFS cthon against that server with client running next-20190822 and next-20190823. > > I do not see the kernel messages from this job anywhere at the job > pages, is it stored somewhere? It appears to be mixed in same log file: https://qa-reports.linaro.org/lkft/linux-next-oe/build/next-20190823/testrun/886412/log
[tip:locking/core] locking/rwsem: Add missing ACQUIRE to read_slowpath exit when queue is empty
Commit-ID: e1b98fa316648420d0434d9ff5b92ad6609ba6c3 Gitweb: https://git.kernel.org/tip/e1b98fa316648420d0434d9ff5b92ad6609ba6c3 Author: Jan Stancek AuthorDate: Thu, 18 Jul 2019 10:51:25 +0200 Committer: Ingo Molnar CommitDate: Thu, 25 Jul 2019 15:39:23 +0200 locking/rwsem: Add missing ACQUIRE to read_slowpath exit when queue is empty LTP mtest06 has been observed to occasionally hit "still mapped when deleted" and following BUG_ON on arm64. The extra mapcount originated from pagefault handler, which handled pagefault for vma that has already been detached. vma is detached under mmap_sem write lock by detach_vmas_to_be_unmapped(), which also invalidates vmacache. When the pagefault handler (under mmap_sem read lock) calls find_vma(), vmacache_valid() wrongly reports vmacache as valid. After rwsem down_read() returns via 'queue empty' path (as of v5.2), it does so without an ACQUIRE on sem->count: down_read() __down_read() rwsem_down_read_failed() __rwsem_down_read_failed_common() raw_spin_lock_irq(>wait_lock); if (list_empty(>wait_list)) { if (atomic_long_read(>count) >= 0) { raw_spin_unlock_irq(>wait_lock); return sem; The problem can be reproduced by running LTP mtest06 in a loop and building the kernel (-j $NCPUS) in parallel. It does reproduces since v4.20 on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It triggers reliably in about an hour. The patched kernel ran fine for 10+ hours. Signed-off-by: Jan Stancek Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Will Deacon Acked-by: Waiman Long Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dbu...@suse.de Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty & no writer") Link: https://lkml.kernel.org/r/50b8914e20d1d62bb2dee42d342836c2c16ebee7.1563438048.git.jstan...@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index bc91aacaab58..d3ce7c6c42a6 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1036,6 +1036,8 @@ queue: */ if (adjustment && !(atomic_long_read(>count) & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { + /* Provide lock ACQUIRE */ + smp_acquire__after_ctrl_dep(); raw_spin_unlock_irq(>wait_lock); rwsem_set_reader_owned(sem); lockevent_inc(rwsem_rlock_fast);
Re: [PATCH v2] locking/rwsem: add acquire barrier to read_slowpath exit when queue is empty
- Original Message - > > It's simpler like so: > > On Thu, Jul 18, 2019 at 01:04:46PM +0200, Peter Zijlstra wrote: > > X = 0; > > > > rwsem_down_read() > > for (;;) { > > set_current_state(TASK_UNINTERRUPTIBLE); > > > > X = 1; > > > > rwsem_up_write(); > > > > rwsem_mark_wake() > > > > atomic_long_add(adjustment, >count); > > > > smp_store_release(>task, NULL); > > > > if (!waiter.task) > > break; > > > > ... > > } > > > > r = X; > > I see - it looks possible. Thank you for this example.
Re: [PATCH v2] locking/rwsem: add acquire barrier to read_slowpath exit when queue is empty
- Original Message - > Hi Jan, Waiman, [+Jade and Paul for the litmus test at the end] > > On Wed, Jul 17, 2019 at 09:22:00PM +0200, Jan Stancek wrote: > > On Wed, Jul 17, 2019 at 10:19:04AM -0400, Waiman Long wrote: > > > > If you add a comment to the code outlining the issue (preferably as a > > > > litmus > > > > test involving sem->count and some shared data which happens to be > > > > vmacache_seqnum in your test)), then: > > > > > > > > Reviewed-by: Will Deacon > > > > > > > > Thanks, > > > > > > > > Will > > > > > > Agreed. A comment just above smp_acquire__after_ctrl_dep() on why this > > > is needed will be great. > > > > > > Other than that, > > > > > > Acked-by: Waiman Long > > > > > > > litmus test looks a bit long, would following be acceptable? > > > > diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c > > index 37524a47f002..d9c96651bfc7 100644 > > --- a/kernel/locking/rwsem.c > > +++ b/kernel/locking/rwsem.c > > @@ -1032,6 +1032,13 @@ static inline bool rwsem_reader_phase_trylock(struct > > rw_semaphore *sem, > > */ > > if (adjustment && !(atomic_long_read(>count) & > > (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { > > + /* > > +* down_read() issued ACQUIRE on enter, but we can race > > +* with writer who did RELEASE only after us. > > +* ACQUIRE here makes sure reader operations happen only > > +* after all writer ones. > > +*/ > > How about an abridged form of the litmus test here, just to show the cod > flow? e.g.: > > /* > * We need to ensure ACQUIRE semantics when reading sem->count so that > * we pair with the RELEASE store performed by an unlocking/downgrading > * writer. > * > * P0 (writer)P1 (reader) > * > * down_write(sem); > * > * downgrade_write(sem); > * -> fetch_add_release(>count) > * > *down_read_slowpath(sem); > *-> atomic_read(>count) > * > * smp_acquire__after_ctrl_dep() > * > */ Works for me. The code is at 3 level of indentation, but I can try to squeeze it in for v4. > > In writing this, I also noticed that we don't have any explicit ordering > at the end of the reader slowpath when we wait on the queue but get woken > immediately: > > if (!waiter.task) > break; > > Am I missing something? I'm assuming this isn't problem, because set_current_state() on line above is using smp_store_mb(). > > > + smp_acquire__after_ctrl_dep(); > > raw_spin_unlock_irq(>wait_lock); > > rwsem_set_reader_owned(sem); > > lockevent_inc(rwsem_rlock_fast); > > > > > > with litmus test in commit log: > > --- 8< > > C rwsem > > > > { > > atomic_t rwsem_count = ATOMIC_INIT(1); > > int vmacache_seqnum = 10; > > } > > > > P0(int *vmacache_seqnum, atomic_t *rwsem_count) > > { > > r0 = READ_ONCE(*vmacache_seqnum); > > WRITE_ONCE(*vmacache_seqnum, r0 + 1); > > /* downgrade_write */ > > r1 = atomic_fetch_add_release(-1+256, rwsem_count); > > } > > > > P1(int *vmacache_seqnum, atomic_t *rwsem_count, spinlock_t *sem_wait_lock) > > { > > /* rwsem_read_trylock */ > > r0 = atomic_add_return_acquire(256, rwsem_count); > > /* rwsem_down_read_slowpath */ > > spin_lock(sem_wait_lock); > > r0 = atomic_read(rwsem_count); > > if ((r0 & 1) == 0) { > > // BUG: needs barrier > > spin_unlock(sem_wait_lock); > > r1 = READ_ONCE(*vmacache_seqnum); > > } > > } > > exists (1:r1=10) > > --- 8< > > Thanks for writing this! It's definitely worth sticking it in the commit > log, but Paul and Jade might also like to include it as part of their litmus > test repository too. > > Will >
[PATCH v3] locking/rwsem: add acquire barrier to read_slowpath exit when queue is empty
LTP mtest06 has been observed to rarely hit "still mapped when deleted" and following BUG_ON on arm64: page:7e02fa37e480 refcount:3 mapcount:1 mapping:80be3d678ab0 index:0x0 xfs_address_space_operations [xfs] flags: 0xbfffe00037(locked|referenced|uptodate|lru|active) page dumped because: VM_BUG_ON_PAGE(page_mapped(page)) [ cut here ] kernel BUG at mm/filemap.c:171! Internal error: Oops - BUG: 0 [#1] SMP CPU: 220 PID: 154292 Comm: mmap1 Not tainted 5.2.0-0ecfebd.cki #1 Hardware name: HPE Apollo 70 /C01_APACHE_MB , BIOS L50_5.13_1.10 05/17/2019 pstate: 40400089 (nZcv daIf +PAN -UAO) pc : unaccount_page_cache_page+0x17c/0x1a0 lr : unaccount_page_cache_page+0x17c/0x1a0 Call trace: unaccount_page_cache_page+0x17c/0x1a0 delete_from_page_cache_batch+0xa0/0x300 truncate_inode_pages_range+0x1b8/0x640 truncate_inode_pages_final+0x88/0xa8 evict+0x1a0/0x1d8 iput+0x150/0x240 dentry_unlink_inode+0x120/0x130 __dentry_kill+0xd8/0x1d0 dentry_kill+0x88/0x248 dput+0x168/0x1b8 __fput+0xe8/0x208 fput+0x20/0x30 task_work_run+0xc0/0xf0 do_notify_resume+0x2b0/0x328 work_pending+0x8/0x10 The extra mapcount originated from pagefault handler, which handled pagefault for vma that has already been detached. vma is detached under mmap_sem write lock by detach_vmas_to_be_unmapped(), which also invalidates vmacache. When pagefault handler (under mmap_sem read lock) called find_vma(), vmacache_valid() wrongly reported vmacache as valid. After rwsem down_read() returns via 'queue empty' path (as of v5.2), it does so without issuing read_acquire on sem->count: down_read __down_read rwsem_down_read_failed __rwsem_down_read_failed_common raw_spin_lock_irq(>wait_lock); if (list_empty(>wait_list)) { if (atomic_long_read(>count) >= 0) { raw_spin_unlock_irq(>wait_lock); return sem; Suspected problem here is that last *_acquire on down_read() side happens before write side issues *_release: 1. writer: has the lock 2. reader: down_read() issues *read_acquire on entry 3. writer: mm->vmacache_seqnum++; downgrades lock (*fetch_add_release) 4. reader: __rwsem_down_read_failed_common() finds it can take lock and returns 5. reader: observes stale mm->vmacache_seqnum --- 8< C rwsem { atomic_t rwsem_count = ATOMIC_INIT(1); int vmacache_seqnum = 10; } P0(int *vmacache_seqnum, atomic_t *rwsem_count) { r0 = READ_ONCE(*vmacache_seqnum); WRITE_ONCE(*vmacache_seqnum, r0 + 1); /* downgrade_write */ r1 = atomic_fetch_add_release(-1+256, rwsem_count); } P1(int *vmacache_seqnum, atomic_t *rwsem_count, spinlock_t *sem_wait_lock) { /* rwsem_read_trylock */ r0 = atomic_add_return_acquire(256, rwsem_count); /* rwsem_down_read_slowpath */ spin_lock(sem_wait_lock); r0 = atomic_read(rwsem_count); if ((r0 & 1) == 0) { // BUG: needs barrier spin_unlock(sem_wait_lock); r1 = READ_ONCE(*vmacache_seqnum); } } exists (1:r1=10) --- >8 I can reproduce the problem by running LTP mtest06 in a loop and building kernel (-j $NCPUS) in parallel. It does reproduce since v4.20 up to v5.2 on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It triggers reliably within ~hour. Patched kernel ran fine for 10+ hours with clean dmesg. Tests were done against v5.2, since commit cf69482d62d9 ("locking/rwsem: Enable readers spinning on writer") makes it much harder to reproduce. v2: Move barrier after test (Waiman Long) Use smp_acquire__after_ctrl_dep() (Peter Zijlstra) v3: Add comment to barrier (Waiman Long, Will Deacon) Add litmus test Related: https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c Related: commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap") Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty & no writer") Signed-off-by: Jan Stancek Reviewed-by: Will Deacon Acked-by: Waiman Long Cc: sta...@vger.kernel.org # v4.20+ Cc: Waiman Long Cc: Davidlohr Bueso Cc: Will Deacon Cc: Peter Zijlstra Cc: Ingo Molnar --- kernel/locking/rwsem.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 37524a47f002..fe02aef39e9d 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1032,6 +1032,13 @@ static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, */ if (adjustment && !(atomic_long_read(>count) & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { +
Re: [PATCH v2] locking/rwsem: add acquire barrier to read_slowpath exit when queue is empty
On Wed, Jul 17, 2019 at 10:19:04AM -0400, Waiman Long wrote: If you add a comment to the code outlining the issue (preferably as a litmus test involving sem->count and some shared data which happens to be vmacache_seqnum in your test)), then: Reviewed-by: Will Deacon Thanks, Will Agreed. A comment just above smp_acquire__after_ctrl_dep() on why this is needed will be great. Other than that, Acked-by: Waiman Long litmus test looks a bit long, would following be acceptable? diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 37524a47f002..d9c96651bfc7 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1032,6 +1032,13 @@ static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, */ if (adjustment && !(atomic_long_read(>count) & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { + /* +* down_read() issued ACQUIRE on enter, but we can race +* with writer who did RELEASE only after us. +* ACQUIRE here makes sure reader operations happen only +* after all writer ones. +*/ + smp_acquire__after_ctrl_dep(); raw_spin_unlock_irq(>wait_lock); rwsem_set_reader_owned(sem); lockevent_inc(rwsem_rlock_fast); with litmus test in commit log: --- 8< C rwsem { atomic_t rwsem_count = ATOMIC_INIT(1); int vmacache_seqnum = 10; } P0(int *vmacache_seqnum, atomic_t *rwsem_count) { r0 = READ_ONCE(*vmacache_seqnum); WRITE_ONCE(*vmacache_seqnum, r0 + 1); /* downgrade_write */ r1 = atomic_fetch_add_release(-1+256, rwsem_count); } P1(int *vmacache_seqnum, atomic_t *rwsem_count, spinlock_t *sem_wait_lock) { /* rwsem_read_trylock */ r0 = atomic_add_return_acquire(256, rwsem_count); /* rwsem_down_read_slowpath */ spin_lock(sem_wait_lock); r0 = atomic_read(rwsem_count); if ((r0 & 1) == 0) { // BUG: needs barrier spin_unlock(sem_wait_lock); r1 = READ_ONCE(*vmacache_seqnum); } } exists (1:r1=10) --- 8< Thanks, Jan
[PATCH v2] locking/rwsem: add acquire barrier to read_slowpath exit when queue is empty
LTP mtest06 has been observed to rarely hit "still mapped when deleted" and following BUG_ON on arm64: page:7e02fa37e480 refcount:3 mapcount:1 mapping:80be3d678ab0 index:0x0 xfs_address_space_operations [xfs] flags: 0xbfffe00037(locked|referenced|uptodate|lru|active) page dumped because: VM_BUG_ON_PAGE(page_mapped(page)) [ cut here ] kernel BUG at mm/filemap.c:171! Internal error: Oops - BUG: 0 [#1] SMP CPU: 220 PID: 154292 Comm: mmap1 Not tainted 5.2.0-0ecfebd.cki #1 Hardware name: HPE Apollo 70 /C01_APACHE_MB , BIOS L50_5.13_1.10 05/17/2019 pstate: 40400089 (nZcv daIf +PAN -UAO) pc : unaccount_page_cache_page+0x17c/0x1a0 lr : unaccount_page_cache_page+0x17c/0x1a0 Call trace: unaccount_page_cache_page+0x17c/0x1a0 delete_from_page_cache_batch+0xa0/0x300 truncate_inode_pages_range+0x1b8/0x640 truncate_inode_pages_final+0x88/0xa8 evict+0x1a0/0x1d8 iput+0x150/0x240 dentry_unlink_inode+0x120/0x130 __dentry_kill+0xd8/0x1d0 dentry_kill+0x88/0x248 dput+0x168/0x1b8 __fput+0xe8/0x208 fput+0x20/0x30 task_work_run+0xc0/0xf0 do_notify_resume+0x2b0/0x328 work_pending+0x8/0x10 The extra mapcount originated from pagefault handler, which handled pagefault for vma that has already been detached. vma is detached under mmap_sem write lock by detach_vmas_to_be_unmapped(), which also invalidates vmacache. When pagefault handler (under mmap_sem read lock) called find_vma(), vmacache_valid() wrongly reported vmacache as valid. After rwsem down_read() returns via 'queue empty' path (as of v5.2), it does so without issuing read_acquire on sem->count: down_read __down_read rwsem_down_read_failed __rwsem_down_read_failed_common raw_spin_lock_irq(>wait_lock); if (list_empty(>wait_list)) { if (atomic_long_read(>count) >= 0) { raw_spin_unlock_irq(>wait_lock); return sem; Suspected problem here is that last *_acquire on down_read() side happens before write side issues *_release: 1. writer: has the lock 2. reader: down_read() issues *read_acquire on entry 3. writer: mm->vmacache_seqnum++; downgrades lock (*fetch_add_release) 4. reader: __rwsem_down_read_failed_common() finds it can take lock and returns 5. reader: observes stale mm->vmacache_seqnum I can reproduce the problem by running LTP mtest06 in a loop and building kernel (-j $NCPUS) in parallel. It does reproduce since v4.20 up to v5.2 on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It triggers reliably within ~hour. Patched kernel ran fine for 10+ hours with clean dmesg. Tests were done against v5.2, since commit cf69482d62d9 ("locking/rwsem: Enable readers spinning on writer") makes it much harder to reproduce. v2: Move barrier after test (Waiman Long) Use smp_acquire__after_ctrl_dep() (Peter Zijlstra) Related: https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c Related: commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap") Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty & no writer") Signed-off-by: Jan Stancek Cc: sta...@vger.kernel.org # v4.20+ Cc: Waiman Long Cc: Davidlohr Bueso Cc: Will Deacon Cc: Peter Zijlstra Cc: Ingo Molnar --- kernel/locking/rwsem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 37524a47f002..5ac72b60608b 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1032,6 +1032,7 @@ static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, */ if (adjustment && !(atomic_long_read(>count) & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { + smp_acquire__after_ctrl_dep(); raw_spin_unlock_irq(>wait_lock); rwsem_set_reader_owned(sem); lockevent_inc(rwsem_rlock_fast); -- 1.8.3.1
Re: [PATCH] locking/rwsem: use read_acquire in read_slowpath exit when queue is empty
- Original Message - > On 7/16/19 12:04 PM, Jan Stancek wrote: > > LTP mtest06 has been observed to rarely hit "still mapped when deleted" > > and following BUG_ON on arm64: > > page:7e02fa37e480 refcount:3 mapcount:1 mapping:80be3d678ab0 > > index:0x0 > > xfs_address_space_operations [xfs] > > flags: 0xbfffe00037(locked|referenced|uptodate|lru|active) > > page dumped because: VM_BUG_ON_PAGE(page_mapped(page)) > > [ cut here ] > > kernel BUG at mm/filemap.c:171! > > Internal error: Oops - BUG: 0 [#1] SMP > > CPU: 220 PID: 154292 Comm: mmap1 Not tainted 5.2.0-0ecfebd.cki #1 > > Hardware name: HPE Apollo 70 /C01_APACHE_MB , BIOS L50_5.13_1.10 > > 05/17/2019 > > pstate: 40400089 (nZcv daIf +PAN -UAO) > > pc : unaccount_page_cache_page+0x17c/0x1a0 > > lr : unaccount_page_cache_page+0x17c/0x1a0 > > Call trace: > > unaccount_page_cache_page+0x17c/0x1a0 > > delete_from_page_cache_batch+0xa0/0x300 > > truncate_inode_pages_range+0x1b8/0x640 > > truncate_inode_pages_final+0x88/0xa8 > > evict+0x1a0/0x1d8 > > iput+0x150/0x240 > > dentry_unlink_inode+0x120/0x130 > > __dentry_kill+0xd8/0x1d0 > > dentry_kill+0x88/0x248 > > dput+0x168/0x1b8 > > __fput+0xe8/0x208 > > fput+0x20/0x30 > > task_work_run+0xc0/0xf0 > > do_notify_resume+0x2b0/0x328 > > work_pending+0x8/0x10 > > > > The extra mapcount originated from pagefault handler, which handled > > pagefault for vma that has already been detached. vma is detached > > under mmap_sem write lock by detach_vmas_to_be_unmapped(), which > > also invalidates vmacache. > > > > When pagefault handler (under mmap_sem read lock) called find_vma(), > > vmacache_valid() wrongly reported vmacache as valid. > > > > After rwsem down_read() returns via 'queue empty' path (as of v5.2), > > it does so without issuing read_acquire on sem->count: > > down_read > > __down_read > > rwsem_down_read_failed > > __rwsem_down_read_failed_common > > raw_spin_lock_irq(>wait_lock); > > if (list_empty(>wait_list)) { > > if (atomic_long_read(>count) >= 0) { > > raw_spin_unlock_irq(>wait_lock); > > return sem; > > > > Suspected problem here is that last *_acquire on down_read() side > > happens before write side issues *_release: > > 1. writer: has the lock > > 2. reader: down_read() issues *read_acquire on entry > > 3. writer: mm->vmacache_seqnum++; downgrades lock (*fetch_add_release) > > 4. reader: __rwsem_down_read_failed_common() finds it can take lock and > > returns > > 5. reader: observes stale mm->vmacache_seqnum > > > > I can reproduce the problem by running LTP mtest06 in a loop and building > > kernel (-j $NCPUS) in parallel. It does reproduce since v4.20 up to v5.2 > > on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It triggers reliably > > within ~hour. Patched kernel ran fine for 5+ hours with clean dmesg. > > Tests were done against v5.2, since commit cf69482d62d9 ("locking/rwsem: > > Enable readers spinning on writer") makes it much harder to reproduce. > The explanation makes sense to me. Thanks for the investigation. > > > > Related: > > https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c > > Related: commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in > > munmap") > > Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty > > & no writer") > > > > Signed-off-by: Jan Stancek > > Cc: Waiman Long > > Cc: Davidlohr Bueso > > Cc: Will Deacon > > Cc: Peter Zijlstra > > Cc: Ingo Molnar > > --- > > kernel/locking/rwsem.c | 2 +- > > 1 file changed, 1 insertion(+), 1 deletion(-) > > > > diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c > > index 37524a47f002..757b198d7a5b 100644 > > --- a/kernel/locking/rwsem.c > > +++ b/kernel/locking/rwsem.c > > @@ -1030,7 +1030,7 @@ static inline bool rwsem_reader_phase_trylock(struct > > rw_semaphore *sem, > > * exit the slowpath and return immediately as its > > * RWSEM_READER_BIAS has already been set in the count. > > */ > > - if (adjustment && !(atomic_long_read(>count) & > > + if (adjustment && !(atomic_long_read_acquire(>count) & > > (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { > > raw_spin_unlock_irq(>wait_lock); > > rwsem_set_reader_owned(sem); > > The chance of taking this path is not that high. So instead of > increasing the cost of the test by adding an acquire barrier, Yes, that is good point. > how about > just adding smp_mb__after_spinlock() before spin_unlock_irq(). This > should have the same effect of making sure that no stale data will be > used in the read-lock critical section. I'll redo the tests with smp_mb__after_spinlock(). Thanks, Jan > > -Longman > >
[PATCH] locking/rwsem: use read_acquire in read_slowpath exit when queue is empty
LTP mtest06 has been observed to rarely hit "still mapped when deleted" and following BUG_ON on arm64: page:7e02fa37e480 refcount:3 mapcount:1 mapping:80be3d678ab0 index:0x0 xfs_address_space_operations [xfs] flags: 0xbfffe00037(locked|referenced|uptodate|lru|active) page dumped because: VM_BUG_ON_PAGE(page_mapped(page)) [ cut here ] kernel BUG at mm/filemap.c:171! Internal error: Oops - BUG: 0 [#1] SMP CPU: 220 PID: 154292 Comm: mmap1 Not tainted 5.2.0-0ecfebd.cki #1 Hardware name: HPE Apollo 70 /C01_APACHE_MB , BIOS L50_5.13_1.10 05/17/2019 pstate: 40400089 (nZcv daIf +PAN -UAO) pc : unaccount_page_cache_page+0x17c/0x1a0 lr : unaccount_page_cache_page+0x17c/0x1a0 Call trace: unaccount_page_cache_page+0x17c/0x1a0 delete_from_page_cache_batch+0xa0/0x300 truncate_inode_pages_range+0x1b8/0x640 truncate_inode_pages_final+0x88/0xa8 evict+0x1a0/0x1d8 iput+0x150/0x240 dentry_unlink_inode+0x120/0x130 __dentry_kill+0xd8/0x1d0 dentry_kill+0x88/0x248 dput+0x168/0x1b8 __fput+0xe8/0x208 fput+0x20/0x30 task_work_run+0xc0/0xf0 do_notify_resume+0x2b0/0x328 work_pending+0x8/0x10 The extra mapcount originated from pagefault handler, which handled pagefault for vma that has already been detached. vma is detached under mmap_sem write lock by detach_vmas_to_be_unmapped(), which also invalidates vmacache. When pagefault handler (under mmap_sem read lock) called find_vma(), vmacache_valid() wrongly reported vmacache as valid. After rwsem down_read() returns via 'queue empty' path (as of v5.2), it does so without issuing read_acquire on sem->count: down_read __down_read rwsem_down_read_failed __rwsem_down_read_failed_common raw_spin_lock_irq(>wait_lock); if (list_empty(>wait_list)) { if (atomic_long_read(>count) >= 0) { raw_spin_unlock_irq(>wait_lock); return sem; Suspected problem here is that last *_acquire on down_read() side happens before write side issues *_release: 1. writer: has the lock 2. reader: down_read() issues *read_acquire on entry 3. writer: mm->vmacache_seqnum++; downgrades lock (*fetch_add_release) 4. reader: __rwsem_down_read_failed_common() finds it can take lock and returns 5. reader: observes stale mm->vmacache_seqnum I can reproduce the problem by running LTP mtest06 in a loop and building kernel (-j $NCPUS) in parallel. It does reproduce since v4.20 up to v5.2 on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It triggers reliably within ~hour. Patched kernel ran fine for 5+ hours with clean dmesg. Tests were done against v5.2, since commit cf69482d62d9 ("locking/rwsem: Enable readers spinning on writer") makes it much harder to reproduce. Related: https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c Related: commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap") Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty & no writer") Signed-off-by: Jan Stancek Cc: Waiman Long Cc: Davidlohr Bueso Cc: Will Deacon Cc: Peter Zijlstra Cc: Ingo Molnar --- kernel/locking/rwsem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 37524a47f002..757b198d7a5b 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1030,7 +1030,7 @@ static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, * exit the slowpath and return immediately as its * RWSEM_READER_BIAS has already been set in the count. */ - if (adjustment && !(atomic_long_read(>count) & + if (adjustment && !(atomic_long_read_acquire(>count) & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { raw_spin_unlock_irq(>wait_lock); rwsem_set_reader_owned(sem); -- 1.8.3.1
Re: [LTP] [mm] 8c7829b04c: ltp.overcommit_memory01.fail
- Original Message - > FYI, we noticed the following commit (built with gcc-7): > > commit: 8c7829b04c523cdc732cb77f59f03320e09f3386 ("mm: fix false-positive > OVERCOMMIT_GUESS failures") This matches report from Naresh: http://lists.linux.it/pipermail/ltp/2019-May/011962.html > https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git master > > in testcase: ltp > with following parameters: > > disk: 1HDD > test: mm-01 > > test-description: The LTP testsuite contains a collection of tools for > testing the Linux kernel and related features. > test-url: http://linux-test-project.github.io/ > > > on test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 4G > > caused below changes (please refer to attached dmesg/kmsg for entire > log/backtrace): > > > If you fix the issue, kindly add following tag > Reported-by: kernel test robot > > > [ 554.112267 ] <<>> > [ 554.112269 ] > [ 554.115806 ] tag=overcommit_memory01 stime=1558074982 > [ 554.115809 ] > [ 554.119303 ] cmdline="overcommit_memory" > [ 554.119306 ] > [ 554.121962 ] contacts="" > [ 554.121965 ] > [ 554.124459 ] analysis=exit > [ 554.124463 ] > [ 554.127140 ] <<>> > [ 554.127144 ] > [ 554.131401 ] tst_test.c:1096: INFO: Timeout per run is 0h 05m 00s > [ 554.131404 ] > [ 554.136248 ] overcommit_memory.c:117: INFO: MemTotal is 4033124 kB > [ 554.136251 ] > [ 554.141365 ] overcommit_memory.c:119: INFO: SwapTotal is 268435452 kB > [ 554.141368 ] > [ 554.146664 ] overcommit_memory.c:123: INFO: CommitLimit is 270452012 kB > [ 554.14 ] > [ 554.151375 ] mem.c:814: INFO: set overcommit_ratio to 50 > [ 554.151378 ] > [ 554.155796 ] mem.c:814: INFO: set overcommit_memory to 2 > [ 554.155799 ] > [ 554.159951 ] overcommit_memory.c:190: INFO: malloc 538608648 kB failed > [ 554.159953 ] > [ 554.164801 ] overcommit_memory.c:211: PASS: alloc failed as expected > [ 554.164804 ] > [ 554.170196 ] overcommit_memory.c:190: INFO: malloc 270452012 kB failed > [ 554.170199 ] > [ 554.175344 ] overcommit_memory.c:211: PASS: alloc failed as expected > [ 554.175347 ] > [ 554.180932 ] overcommit_memory.c:186: INFO: malloc 134651878 kB > successfully > [ 554.180935 ] > [ 554.186522 ] overcommit_memory.c:205: PASS: alloc passed as expected > [ 554.186524 ] > [ 554.191234 ] mem.c:814: INFO: set overcommit_memory to 0 > [ 554.191237 ] > [ 554.196930 ] overcommit_memory.c:186: INFO: malloc 134734122 kB > successfully > [ 554.196933 ] > [ 554.202738 ] overcommit_memory.c:205: PASS: alloc passed as expected > [ 554.202742 ] > [ 554.208563 ] overcommit_memory.c:190: INFO: malloc 538936488 kB failed > [ 554.208566 ] > [ 554.214355 ] overcommit_memory.c:211: PASS: alloc failed as expected > [ 554.214357 ] > [ 554.220506 ] overcommit_memory.c:186: INFO: malloc 272468576 kB > successfully > [ 554.220509 ] > [ 554.226564 ] overcommit_memory.c:213: FAIL: alloc passed, expected to fail > [ 554.226568 ] > [ 554.231870 ] mem.c:814: INFO: set overcommit_memory to 1 > [ 554.231873 ] > [ 554.237819 ] overcommit_memory.c:186: INFO: malloc 136234288 kB > successfully > [ 554.237821 ] > [ 554.243711 ] overcommit_memory.c:205: PASS: alloc passed as expected > [ 554.243713 ] > [ 554.249791 ] overcommit_memory.c:186: INFO: malloc 272468576 kB > successfully > [ 554.249794 ] > [ 554.255069 ] overcommit_memory.c:205: PASS: alloc passed as expected > [ 554.255073 ] > [ 554.261594 ] overcommit_memory.c:186: INFO: malloc 544937152 kB > successfully > [ 554.261597 ] > [ 554.267855 ] overcommit_memory.c:205: PASS: alloc passed as expected > [ 554.267858 ] > [ 554.273445 ] mem.c:814: INFO: set overcommit_memory to 0 > [ 554.273448 ] > [ 554.278882 ] mem.c:814: INFO: set overcommit_ratio to 50 > [ 554.278885 ] > [ 554.282419 ] > [ 554.284170 ] Summary: > [ 554.284173 ] > [ 554.287476 ] passed 8 > [ 554.287480 ] > [ 554.290740 ] failed 1 > [ 554.290743 ] > [ 554.293971 ] skipped 0 > [ 554.293974 ] > [ 554.297123 ] warnings 0 > [ 554.297126 ] > [ 554.300746 ] <<>> > [ 554.300749 ] > [ 554.304489 ] initiation_status="ok" > [ 554.304492 ] > [ 554.309762 ] duration=0 termination_type=exited termination_id=1 > corefile=no > [ 554.309765 ] > [ 554.313681 ] cutime=0 cstime=0 > [ 554.313685 ] > [ 554.316880 ] <<>> > > > > To reproduce: > > # build kernel > cd linux > cp config-5.1.0-10246-g8c7829b .config > make HOSTCC=gcc-7 CC=gcc-7 ARCH=x86_64 olddefconfig > make HOSTCC=gcc-7 CC=gcc-7 ARCH=x86_64 prepare > make HOSTCC=gcc-7 CC=gcc-7 ARCH=x86_64 modules_prepare > make HOSTCC=gcc-7 CC=gcc-7 ARCH=x86_64 SHELL=/bin/bash > make HOSTCC=gcc-7 CC=gcc-7 ARCH=x86_64 bzImage > > > git clone https://github.com/intel/lkp-tests.git > cd lkp-tests > find lib/ | cpio -o -H newc --quiet | gzip > modules.cgz > bin/lkp qemu -k -m modules.cgz job-script # job-script is > attached > in this
Re: [v2 PATCH] mm: mmu_gather: remove __tlb_reset_range() for force flush
- Original Message - > On Mon, May 13, 2019 at 04:01:09PM -0700, Yang Shi wrote: > > > > > > On 5/13/19 9:38 AM, Will Deacon wrote: > > > On Fri, May 10, 2019 at 07:26:54AM +0800, Yang Shi wrote: > > > > diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c > > > > index 99740e1..469492d 100644 > > > > --- a/mm/mmu_gather.c > > > > +++ b/mm/mmu_gather.c > > > > @@ -245,14 +245,39 @@ void tlb_finish_mmu(struct mmu_gather *tlb, > > > > { > > > > /* > > > > * If there are parallel threads are doing PTE changes on same > > > > range > > > > -* under non-exclusive lock(e.g., mmap_sem read-side) but defer > > > > TLB > > > > -* flush by batching, a thread has stable TLB entry can fail to > > > > flush > > > > -* the TLB by observing pte_none|!pte_dirty, for example so > > > > flush TLB > > > > -* forcefully if we detect parallel PTE batching threads. > > > > +* under non-exclusive lock (e.g., mmap_sem read-side) but > > > > defer TLB > > > > +* flush by batching, one thread may end up seeing inconsistent > > > > PTEs > > > > +* and result in having stale TLB entries. So flush TLB > > > > forcefully > > > > +* if we detect parallel PTE batching threads. > > > > +* > > > > +* However, some syscalls, e.g. munmap(), may free page tables, > > > > this > > > > +* needs force flush everything in the given range. Otherwise > > > > this > > > > +* may result in having stale TLB entries for some > > > > architectures, > > > > +* e.g. aarch64, that could specify flush what level TLB. > > > > */ > > > > - if (mm_tlb_flush_nested(tlb->mm)) { > > > > - __tlb_reset_range(tlb); > > > > - __tlb_adjust_range(tlb, start, end - start); > > > > + if (mm_tlb_flush_nested(tlb->mm) && !tlb->fullmm) { > > > > + /* > > > > +* Since we can't tell what we actually should have > > > > +* flushed, flush everything in the given range. > > > > +*/ > > > > + tlb->freed_tables = 1; > > > > + tlb->cleared_ptes = 1; > > > > + tlb->cleared_pmds = 1; > > > > + tlb->cleared_puds = 1; > > > > + tlb->cleared_p4ds = 1; > > > > + > > > > + /* > > > > +* Some architectures, e.g. ARM, that have range > > > > invalidation > > > > +* and care about VM_EXEC for I-Cache invalidation, > > > > need force > > > > +* vma_exec set. > > > > +*/ > > > > + tlb->vma_exec = 1; > > > > + > > > > + /* Force vma_huge clear to guarantee safer flush */ > > > > + tlb->vma_huge = 0; > > > > + > > > > + tlb->start = start; > > > > + tlb->end = end; > > > > } > > > Whilst I think this is correct, it would be interesting to see whether > > > or not it's actually faster than just nuking the whole mm, as I mentioned > > > before. > > > > > > At least in terms of getting a short-term fix, I'd prefer the diff below > > > if it's not measurably worse. > > > > I did a quick test with ebizzy (96 threads with 5 iterations) on my x86 VM, > > it shows slightly slowdown on records/s but much more sys time spent with > > fullmm flush, the below is the data. > > > > Â Â Â nofullmm fullmm > > ops (records/s) 225606Â 225119 > > sys (s)Â Â Â 0.69Â Â Â 1.14 > > > > It looks the slight reduction of records/s is caused by the increase of sys > > time. > > That's not what I expected, and I'm unable to explain why moving to fullmm > would /increase/ the system time. I would've thought the time spent doing > the invalidation would decrease, with the downside that the TLB is cold > when returning back to userspace. > I tried ebizzy with various parameters (malloc vs mmap, ran it for hour), but performance was very similar for both patches. So, I was looking for workload that would demonstrate the largest difference. Inspired by python xml-rpc, which can handle each request in new thread, I tried following [1]: 16 threads, each looping 100k times over: mmap(16M) touch 1 page madvise(DONTNEED) munmap(16M) This yields quite significant difference for 2 patches when running on my 46 CPU arm host. I checked it twice - applied patch, recompiled, rebooted, but numbers stayed +- couple seconds the same. Does it somewhat match your expectation? v2 patch - real2m33.460s user0m3.359s sys 15m32.307s real2m33.895s user0m2.749s sys 16m34.500s real2m35.666s user0m3.528s sys 15m23.377s real2m32.898s user0m2.789s sys 16m18.801s real2m33.087s user0m3.565s sys 16m23.815s fullmm version --- real0m46.811s user0m1.596s sys
Re: LTP: mm: overcommit_memory01, 03...06 failed
- Original Message - > ltp-mm-tests failed on Linux mainline kernel 5.1.0, > * overcommit_memory01 overcommit_memory > * overcommit_memory03 overcommit_memory -R 30 > * overcommit_memory04 overcommit_memory -R 80 > * overcommit_memory05 overcommit_memory -R 100 > * overcommit_memory06 overcommit_memory -R 200 > > mem.c:814: INFO: set overcommit_memory to 0 > overcommit_memory.c:185: INFO: malloc 8094844 kB successfully > overcommit_memory.c:204: PASS: alloc passed as expected > overcommit_memory.c:189: INFO: malloc 32379376 kB failed > overcommit_memory.c:210: PASS: alloc failed as expected > overcommit_memory.c:185: INFO: malloc 16360216 kB successfully > overcommit_memory.c:212: FAIL: alloc passed, expected to fail > > Failed test log, > https://lkft.validation.linaro.org/scheduler/job/726417#L22852 > > LTP version 20190115 > > Test case link, > https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/tunable/overcommit_memory.c#L212 > > First bad commit: > git branch master > git commit e0654264c4806dc436b291294a0fbf9be7571ab6 > git describe v5.1-10706-ge0654264c480 > git repo https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git > > Last good commit: > git branch master > git commit 7e9890a3500d95c01511a4c45b7e7192dfa47ae2 > git describe v5.1-10326-g7e9890a3500d > git repo https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git Heuristic changed a bit in: commit 8c7829b04c523cdc732cb77f59f03320e09f3386 Author: Johannes Weiner Date: Mon May 13 17:21:50 2019 -0700 mm: fix false-positive OVERCOMMIT_GUESS failures LTP tries to allocate "mem_total + swap_total": alloc_and_check(sum_total, EXPECT_FAIL); which now presumably falls short to trigger failure. > > Best regards > Naresh Kamboju >
Re: [v2 PATCH] mm: mmu_gather: remove __tlb_reset_range() for force flush
- Original Message - > > > On May 13, 2019 4:01 PM, Yang Shi wrote: > > > On 5/13/19 9:38 AM, Will Deacon wrote: > > On Fri, May 10, 2019 at 07:26:54AM +0800, Yang Shi wrote: > >> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c > >> index 99740e1..469492d 100644 > >> --- a/mm/mmu_gather.c > >> +++ b/mm/mmu_gather.c > >> @@ -245,14 +245,39 @@ void tlb_finish_mmu(struct mmu_gather *tlb, > >> { > >> /* > >>* If there are parallel threads are doing PTE changes on same range > >> - * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB > >> - * flush by batching, a thread has stable TLB entry can fail to flush > >> - * the TLB by observing pte_none|!pte_dirty, for example so flush TLB > >> - * forcefully if we detect parallel PTE batching threads. > >> + * under non-exclusive lock (e.g., mmap_sem read-side) but defer TLB > >> + * flush by batching, one thread may end up seeing inconsistent PTEs > >> + * and result in having stale TLB entries. So flush TLB forcefully > >> + * if we detect parallel PTE batching threads. > >> + * > >> + * However, some syscalls, e.g. munmap(), may free page tables, this > >> + * needs force flush everything in the given range. Otherwise this > >> + * may result in having stale TLB entries for some architectures, > >> + * e.g. aarch64, that could specify flush what level TLB. > >>*/ > >> -if (mm_tlb_flush_nested(tlb->mm)) { > >> -__tlb_reset_range(tlb); > >> -__tlb_adjust_range(tlb, start, end - start); > >> +if (mm_tlb_flush_nested(tlb->mm) && !tlb->fullmm) { > >> +/* > >> + * Since we can't tell what we actually should have > >> + * flushed, flush everything in the given range. > >> + */ > >> +tlb->freed_tables = 1; > >> +tlb->cleared_ptes = 1; > >> +tlb->cleared_pmds = 1; > >> +tlb->cleared_puds = 1; > >> +tlb->cleared_p4ds = 1; > >> + > >> +/* > >> + * Some architectures, e.g. ARM, that have range invalidation > >> + * and care about VM_EXEC for I-Cache invalidation, need > >> force > >> + * vma_exec set. > >> + */ > >> +tlb->vma_exec = 1; > >> + > >> +/* Force vma_huge clear to guarantee safer flush */ > >> +tlb->vma_huge = 0; > >> + > >> +tlb->start = start; > >> +tlb->end = end; > >> } > > Whilst I think this is correct, it would be interesting to see whether > > or not it's actually faster than just nuking the whole mm, as I mentioned > > before. > > > > At least in terms of getting a short-term fix, I'd prefer the diff below > > if it's not measurably worse. > > I did a quick test with ebizzy (96 threads with 5 iterations) on my x86 > VM, it shows slightly slowdown on records/s but much more sys time spent > with fullmm flush, the below is the data. > > nofullmm fullmm > ops (records/s) 225606 225119 > sys (s)0.691.14 > > It looks the slight reduction of records/s is caused by the increase of > sys time. > > > > > Will > > > > --->8 > > > > diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c > > index 99740e1dd273..cc251422d307 100644 > > --- a/mm/mmu_gather.c > > +++ b/mm/mmu_gather.c > > @@ -251,8 +251,9 @@ void tlb_finish_mmu(struct mmu_gather *tlb, > > * forcefully if we detect parallel PTE batching threads. > > */ > >if (mm_tlb_flush_nested(tlb->mm)) { > > + tlb->fullmm = 1; > >__tlb_reset_range(tlb); > > - __tlb_adjust_range(tlb, start, end - start); > > + tlb->freed_tables = 1; > >} > > > >tlb_flush_mmu(tlb); > > > I think that this should have set need_flush_all and not fullmm. > Wouldn't that skip the flush? If fulmm == 0, then __tlb_reset_range() sets tlb->end = 0. tlb_flush_mmu tlb_flush_mmu_tlbonly if (!tlb->end) return Replacing fullmm with need_flush_all, brings the problem back / reproducer hangs.
Re: [PATCH] mm: mmu_gather: remove __tlb_reset_range() for force flush
- Original Message - > > > On 5/9/19 2:06 PM, Jan Stancek wrote: > > - Original Message - > >> > >> On 5/9/19 11:24 AM, Peter Zijlstra wrote: > >>> On Thu, May 09, 2019 at 05:36:29PM +, Nadav Amit wrote: > >>>>> On May 9, 2019, at 3:38 AM, Peter Zijlstra > >>>>> wrote: > >>>>> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c > >>>>> index 99740e1dd273..fe768f8d612e 100644 > >>>>> --- a/mm/mmu_gather.c > >>>>> +++ b/mm/mmu_gather.c > >>>>> @@ -244,15 +244,20 @@ void tlb_finish_mmu(struct mmu_gather *tlb, > >>>>> unsigned long start, unsigned long end) > >>>>> { > >>>>> /* > >>>>> -* If there are parallel threads are doing PTE changes on same > >>>>> range > >>>>> -* under non-exclusive lock(e.g., mmap_sem read-side) but defer > >>>>> TLB > >>>>> -* flush by batching, a thread has stable TLB entry can fail to > >>>>> flush > >>>>> -* the TLB by observing pte_none|!pte_dirty, for example so > >>>>> flush TLB > >>>>> -* forcefully if we detect parallel PTE batching threads. > >>>>> +* Sensible comment goes here.. > >>>>> */ > >>>>> - if (mm_tlb_flush_nested(tlb->mm)) { > >>>>> - __tlb_reset_range(tlb); > >>>>> - __tlb_adjust_range(tlb, start, end - start); > >>>>> + if (mm_tlb_flush_nested(tlb->mm) && !tlb->full_mm) { > >>>>> + /* > >>>>> +* Since we're can't tell what we actually should have > >>>>> +* flushed flush everything in the given range. > >>>>> +*/ > >>>>> + tlb->start = start; > >>>>> + tlb->end = end; > >>>>> + tlb->freed_tables = 1; > >>>>> + tlb->cleared_ptes = 1; > >>>>> + tlb->cleared_pmds = 1; > >>>>> + tlb->cleared_puds = 1; > >>>>> + tlb->cleared_p4ds = 1; > >>>>> } > >>>>> > >>>>> tlb_flush_mmu(tlb); > >>>> As a simple optimization, I think it is possible to hold multiple > >>>> nesting > >>>> counters in the mm, similar to tlb_flush_pending, for freed_tables, > >>>> cleared_ptes, etc. > >>>> > >>>> The first time you set tlb->freed_tables, you also atomically increase > >>>> mm->tlb_flush_freed_tables. Then, in tlb_flush_mmu(), you just use > >>>> mm->tlb_flush_freed_tables instead of tlb->freed_tables. > >>> That sounds fraught with races and expensive; I would much prefer to not > >>> go there for this arguably rare case. > >>> > >>> Consider such fun cases as where CPU-0 sees and clears a PTE, CPU-1 > >>> races and doesn't see that PTE. Therefore CPU-0 sets and counts > >>> cleared_ptes. Then if CPU-1 flushes while CPU-0 is still in mmu_gather, > >>> it will see cleared_ptes count increased and flush that granularity, > >>> OTOH if CPU-1 flushes after CPU-0 completes, it will not and potentiall > >>> miss an invalidate it should have had. > >>> > >>> This whole concurrent mmu_gather stuff is horrible. > >>> > >>> /me ponders more > >>> > >>> So I think the fundamental race here is this: > >>> > >>> CPU-0 CPU-1 > >>> > >>> tlb_gather_mmu(.start=1,tlb_gather_mmu(.start=2, > >>> .end=3);.end=4); > >>> > >>> ptep_get_and_clear_full(2) > >>> tlb_remove_tlb_entry(2); > >>> __tlb_remove_page(); > >>> if (pte_present(2)) // nope > >>> > >>> tlb_finish_mmu(); > >>> > >>> // continue without TLBI(2) > >>> // whoopsie > >>> > >>> tlb_finish_mmu(); > >>> tlb_f
Re: [PATCH] mm: mmu_gather: remove __tlb_reset_range() for force flush
- Original Message - > > > On 5/9/19 11:24 AM, Peter Zijlstra wrote: > > On Thu, May 09, 2019 at 05:36:29PM +, Nadav Amit wrote: > >>> On May 9, 2019, at 3:38 AM, Peter Zijlstra wrote: > >>> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c > >>> index 99740e1dd273..fe768f8d612e 100644 > >>> --- a/mm/mmu_gather.c > >>> +++ b/mm/mmu_gather.c > >>> @@ -244,15 +244,20 @@ void tlb_finish_mmu(struct mmu_gather *tlb, > >>> unsigned long start, unsigned long end) > >>> { > >>> /* > >>> - * If there are parallel threads are doing PTE changes on same range > >>> - * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB > >>> - * flush by batching, a thread has stable TLB entry can fail to flush > >>> - * the TLB by observing pte_none|!pte_dirty, for example so flush TLB > >>> - * forcefully if we detect parallel PTE batching threads. > >>> + * Sensible comment goes here.. > >>>*/ > >>> - if (mm_tlb_flush_nested(tlb->mm)) { > >>> - __tlb_reset_range(tlb); > >>> - __tlb_adjust_range(tlb, start, end - start); > >>> + if (mm_tlb_flush_nested(tlb->mm) && !tlb->full_mm) { > >>> + /* > >>> + * Since we're can't tell what we actually should have > >>> + * flushed flush everything in the given range. > >>> + */ > >>> + tlb->start = start; > >>> + tlb->end = end; > >>> + tlb->freed_tables = 1; > >>> + tlb->cleared_ptes = 1; > >>> + tlb->cleared_pmds = 1; > >>> + tlb->cleared_puds = 1; > >>> + tlb->cleared_p4ds = 1; > >>> } > >>> > >>> tlb_flush_mmu(tlb); > >> As a simple optimization, I think it is possible to hold multiple nesting > >> counters in the mm, similar to tlb_flush_pending, for freed_tables, > >> cleared_ptes, etc. > >> > >> The first time you set tlb->freed_tables, you also atomically increase > >> mm->tlb_flush_freed_tables. Then, in tlb_flush_mmu(), you just use > >> mm->tlb_flush_freed_tables instead of tlb->freed_tables. > > That sounds fraught with races and expensive; I would much prefer to not > > go there for this arguably rare case. > > > > Consider such fun cases as where CPU-0 sees and clears a PTE, CPU-1 > > races and doesn't see that PTE. Therefore CPU-0 sets and counts > > cleared_ptes. Then if CPU-1 flushes while CPU-0 is still in mmu_gather, > > it will see cleared_ptes count increased and flush that granularity, > > OTOH if CPU-1 flushes after CPU-0 completes, it will not and potentiall > > miss an invalidate it should have had. > > > > This whole concurrent mmu_gather stuff is horrible. > > > >/me ponders more > > > > So I think the fundamental race here is this: > > > > CPU-0 CPU-1 > > > > tlb_gather_mmu(.start=1,tlb_gather_mmu(.start=2, > >.end=3);.end=4); > > > > ptep_get_and_clear_full(2) > > tlb_remove_tlb_entry(2); > > __tlb_remove_page(); > > if (pte_present(2)) // nope > > > > tlb_finish_mmu(); > > > > // continue without TLBI(2) > > // whoopsie > > > > tlb_finish_mmu(); > > tlb_flush() -> TLBI(2) > > I'm not quite sure if this is the case Jan really met. But, according to > his test, once correct tlb->freed_tables and tlb->cleared_* are set, his > test works well. My theory was following sequence: t1: map_write_unmap() t2: dummy() map_address = mmap() map_address[i] = 'b' munmap(map_address) downgrade_write(>mmap_sem); unmap_region() tlb_gather_mmu() inc_tlb_flush_pending(tlb->mm); free_pgtables() tlb->freed_tables = 1 tlb->cleared_pmds = 1 pthread_exit() madvise(thread_stack, 8M, MADV_DONTNEED) zap_page_range() tlb_gather_mmu() inc_tlb_flush_pending(tlb->mm); tlb_finish_mmu() if (mm_tlb_flush_nested(tlb->mm)) __tlb_reset_range() tlb->freed_tables = 0 tlb->cleared_pmds = 0 __flush_tlb_range(last_level = 0) ... map_address = mmap() map_address[i] = 'b' # PTE appeared valid to me, # so I suspected stale TLB entry at higher level as result of "freed_tables = 0" I'm happy to apply/run any debug patches to get more data that would help. > > > > > > > And we can fix that by having tlb_finish_mmu() sync up. Never let a > > concurrent tlb_finish_mmu() complete until all concurrenct mmu_gathers > > have completed. > > Not sure if this will scale well. > > > > > This should not be too hard to make happen. > >
Re: [PATCH] mm: mmu_gather: remove __tlb_reset_range() for force flush
> > I don't think we can elide the call __tlb_reset_range() entirely, since I > > think we do want to clear the freed_pXX bits to ensure that we walk the > > range with the smallest mapping granule that we have. Otherwise couldn't we > > have a problem if we hit a PMD that had been cleared, but the TLB > > invalidation for the PTEs that used to be linked below it was still > > pending? > > That's tlb->cleared_p*, and yes agreed. That is, right until some > architecture has level dependent TLBI instructions, at which point we'll > need to have them all set instead of cleared. > > > Perhaps we should just set fullmm if we see that here's a concurrent > > unmapper rather than do a worst-case range invalidation. Do you have a > > feeling > > for often the mm_tlb_flush_nested() triggers in practice? > > Quite a bit for certain workloads I imagine, that was the whole point of > doing it. > > Anyway; am I correct in understanding that the actual problem is that > we've cleared freed_tables and the ARM64 tlb_flush() will then not > invalidate the cache and badness happens? That is my understanding, only last level is flushed, which is not enough. > > Because so far nobody has actually provided a coherent description of > the actual problem we're trying to solve. But I'm thinking something > like the below ought to do. I applied it (and fixed small typo: s/tlb->full_mm/tlb->fullmm/). It fixes the problem for me. > > > diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c > index 99740e1dd273..fe768f8d612e 100644 > --- a/mm/mmu_gather.c > +++ b/mm/mmu_gather.c > @@ -244,15 +244,20 @@ void tlb_finish_mmu(struct mmu_gather *tlb, > unsigned long start, unsigned long end) > { > /* > - * If there are parallel threads are doing PTE changes on same range > - * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB > - * flush by batching, a thread has stable TLB entry can fail to flush > - * the TLB by observing pte_none|!pte_dirty, for example so flush TLB > - * forcefully if we detect parallel PTE batching threads. > + * Sensible comment goes here.. >*/ > - if (mm_tlb_flush_nested(tlb->mm)) { > - __tlb_reset_range(tlb); > - __tlb_adjust_range(tlb, start, end - start); > + if (mm_tlb_flush_nested(tlb->mm) && !tlb->full_mm) { > + /* > + * Since we're can't tell what we actually should have > + * flushed flush everything in the given range. > + */ > + tlb->start = start; > + tlb->end = end; > + tlb->freed_tables = 1; > + tlb->cleared_ptes = 1; > + tlb->cleared_pmds = 1; > + tlb->cleared_puds = 1; > + tlb->cleared_p4ds = 1; > } > > tlb_flush_mmu(tlb); >
[PATCH v3] mm/memory.c: do_fault: avoid usage of stale vm_area_struct
LTP testcase mtest06 [1] can trigger a crash on s390x running 5.0.0-rc8. This is a stress test, where one thread mmaps/writes/munmaps memory area and other thread is trying to read from it: CPU: 0 PID: 2611 Comm: mmap1 Not tainted 5.0.0-rc8+ #51 Hardware name: IBM 2964 N63 400 (z/VM 6.4.0) Krnl PSW : 0404e0018000 001ac8d8 (__lock_acquire+0x7/0x7a8) Call Trace: ([<>] (null)) [<001adae4>] lock_acquire+0xec/0x258 [<0080d1ac>] _raw_spin_lock_bh+0x5c/0x98 [<0012a780>] page_table_free+0x48/0x1a8 [<002f6e54>] do_fault+0xdc/0x670 [<002fadae>] __handle_mm_fault+0x416/0x5f0 [<002fb138>] handle_mm_fault+0x1b0/0x320 [<001248cc>] do_dat_exception+0x19c/0x2c8 [<0080e5ee>] pgm_check_handler+0x19e/0x200 page_table_free() is called with NULL mm parameter, but because "0" is a valid address on s390 (see S390_lowcore), it keeps going until it eventually crashes in lockdep's lock_acquire. This crash is reproducible at least since 4.14. Problem is that "vmf->vma" used in do_fault() can become stale. Because mmap_sem may be released, other threads can come in, call munmap() and cause "vma" be returned to kmem cache, and get zeroed/re-initialized and re-used: handle_mm_fault | __handle_mm_fault | do_fault | vma = vmf->vma | do_read_fault | __do_fault| vma->vm_ops->fault(vmf);| mmap_sem is released | | | do_munmap() | remove_vma_list() | remove_vma() | vm_area_free() | # vma is released | ... | # same vma is allocated | # from kmem cache | do_mmap() | vm_area_alloc() | memset(vma, 0, ...) | pte_free(vma->vm_mm, ...); | page_table_free | spin_lock_bh(>context.lock);| | Cache mm_struct to avoid using potentially stale "vma". [1] https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c Signed-off-by: Jan Stancek Reviewed-by: Andrea Arcangeli --- mm/memory.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index e11ca9dd823f..e8d69ade5acc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3517,10 +3517,13 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) * but allow concurrent faults). * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). + * If mmap_sem is released, vma may become invalid (for example + * by other thread calling munmap()). */ static vm_fault_t do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + struct mm_struct *vm_mm = vma->vm_mm; vm_fault_t ret; /* @@ -3561,7 +3564,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf) /* preallocated pagetable is unused: free it */ if (vmf->prealloc_pte) { - pte_free(vma->vm_mm, vmf->prealloc_pte); + pte_free(vm_mm, vmf->prealloc_pte); vmf->prealloc_pte = NULL; } return ret; -- 1.8.3.1
Re: [PATCH v2] mm/memory.c: do_fault: avoid usage of stale vm_area_struct
- Original Message - > Hello Jan, > > On Sat, Mar 02, 2019 at 07:19:39PM +0100, Jan Stancek wrote: > > + struct mm_struct *vm_mm = READ_ONCE(vma->vm_mm); > > The vma->vm_mm cannot change under gcc there, so no need of > READ_ONCE. The release of mmap_sem has release semantics so the > vma->vm_mm access cannot be reordered after up_read(mmap_sem) either. > > Other than the above detail: > > Reviewed-by: Andrea Arcangeli Thank you for review, I dropped READ_ONCE and sent v3 with your Reviewed-by included. I also successfully re-ran tests over-night. > Would this not need a corresponding WRITE_ONCE() in vma_init() ? There's at least 2 context switches between, so I think it wouldn't matter. My concern was gcc optimizing out vm_mm, and vma->vm_mm access happening only after do_read_fault().
[PATCH v2] mm/memory.c: do_fault: avoid usage of stale vm_area_struct
LTP testcase mtest06 [1] can trigger a crash on s390x running 5.0.0-rc8. This is a stress test, where one thread mmaps/writes/munmaps memory area and other thread is trying to read from it: CPU: 0 PID: 2611 Comm: mmap1 Not tainted 5.0.0-rc8+ #51 Hardware name: IBM 2964 N63 400 (z/VM 6.4.0) Krnl PSW : 0404e0018000 001ac8d8 (__lock_acquire+0x7/0x7a8) Call Trace: ([<>] (null)) [<001adae4>] lock_acquire+0xec/0x258 [<0080d1ac>] _raw_spin_lock_bh+0x5c/0x98 [<0012a780>] page_table_free+0x48/0x1a8 [<002f6e54>] do_fault+0xdc/0x670 [<002fadae>] __handle_mm_fault+0x416/0x5f0 [<002fb138>] handle_mm_fault+0x1b0/0x320 [<001248cc>] do_dat_exception+0x19c/0x2c8 [<0080e5ee>] pgm_check_handler+0x19e/0x200 page_table_free() is called with NULL mm parameter, but because "0" is a valid address on s390 (see S390_lowcore), it keeps going until it eventually crashes in lockdep's lock_acquire. This crash is reproducible at least since 4.14. Problem is that "vmf->vma" used in do_fault() can become stale. Because mmap_sem may be released, other threads can come in, call munmap() and cause "vma" be returned to kmem cache, and get zeroed/re-initialized and re-used: handle_mm_fault | __handle_mm_fault | do_fault | vma = vmf->vma | do_read_fault | __do_fault| vma->vm_ops->fault(vmf);| mmap_sem is released | | | do_munmap() | remove_vma_list() | remove_vma() | vm_area_free() | # vma is released | ... | # same vma is allocated | # from kmem cache | do_mmap() | vm_area_alloc() | memset(vma, 0, ...) | pte_free(vma->vm_mm, ...); | page_table_free | spin_lock_bh(>context.lock);| | Cache mm_struct to avoid using potentially stale "vma". [1] https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c Signed-off-by: Jan Stancek --- mm/memory.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index e11ca9dd823f..6c1afc1ece50 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3517,10 +3517,13 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) * but allow concurrent faults). * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). + * If mmap_sem is released, vma may become invalid (for example + * by other thread calling munmap()). */ static vm_fault_t do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + struct mm_struct *vm_mm = READ_ONCE(vma->vm_mm); vm_fault_t ret; /* @@ -3561,7 +3564,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf) /* preallocated pagetable is unused: free it */ if (vmf->prealloc_pte) { - pte_free(vma->vm_mm, vmf->prealloc_pte); + pte_free(vm_mm, vmf->prealloc_pte); vmf->prealloc_pte = NULL; } return ret; -- 1.8.3.1
Re: [PATCH] mm/memory.c: do_fault: avoid usage of stale vm_area_struct
- Original Message - > On Sat, Mar 02, 2019 at 04:11:26PM +0100, Jan Stancek wrote: > > Problem is that "vmf->vma" used in do_fault() can become stale. > > Because mmap_sem may be released, other threads can come in, > > call munmap() and cause "vma" be returned to kmem cache, and > > get zeroed/re-initialized and re-used: > > > This patch pins mm_struct and stores its value, to avoid using > > potentially stale "vma" when calling pte_free(). > > OK, we need to cache the mm_struct, but why do we need the extra atomic op? > There's surely no way the mm can be freed while the thread is in the middle > of handling a fault. You're right, I was needlessly paranoid. > > ie I would drop these lines: I'll send v2. Thanks, Jan > > > + mmgrab(vm_mm); > > + > ... > > + > > + mmdrop(vm_mm); > > + >
[PATCH] mm/memory.c: do_fault: avoid usage of stale vm_area_struct
LTP testcase mtest06 [1] can trigger a crash on s390x running 5.0.0-rc8. This is a stress test, where one thread mmaps/writes/munmaps memory area and other thread is trying to read from it: CPU: 0 PID: 2611 Comm: mmap1 Not tainted 5.0.0-rc8+ #51 Hardware name: IBM 2964 N63 400 (z/VM 6.4.0) Krnl PSW : 0404e0018000 001ac8d8 (__lock_acquire+0x7/0x7a8) Call Trace: ([<>] (null)) [<001adae4>] lock_acquire+0xec/0x258 [<0080d1ac>] _raw_spin_lock_bh+0x5c/0x98 [<0012a780>] page_table_free+0x48/0x1a8 [<002f6e54>] do_fault+0xdc/0x670 [<002fadae>] __handle_mm_fault+0x416/0x5f0 [<002fb138>] handle_mm_fault+0x1b0/0x320 [<001248cc>] do_dat_exception+0x19c/0x2c8 [<0080e5ee>] pgm_check_handler+0x19e/0x200 page_table_free() is called with NULL mm parameter, but because "0" is a valid address on s390 (see S390_lowcore), it keeps going until it eventually crashes in lockdep's lock_acquire. This crash is reproducible at least since 4.14. Problem is that "vmf->vma" used in do_fault() can become stale. Because mmap_sem may be released, other threads can come in, call munmap() and cause "vma" be returned to kmem cache, and get zeroed/re-initialized and re-used: handle_mm_fault | __handle_mm_fault | do_fault | vma = vmf->vma | do_read_fault | __do_fault| vma->vm_ops->fault(vmf);| mmap_sem is released | | | do_munmap() | remove_vma_list() | remove_vma() | vm_area_free() | # vma is released | ... | # same vma is allocated | # from kmem cache | do_mmap() | vm_area_alloc() | memset(vma, 0, ...) | pte_free(vma->vm_mm, ...); | page_table_free | spin_lock_bh(>context.lock);| | This patch pins mm_struct and stores its value, to avoid using potentially stale "vma" when calling pte_free(). [1] https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c Signed-off-by: Jan Stancek --- mm/memory.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index e11ca9dd823f..1287ee9acbdc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3517,12 +3517,17 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) * but allow concurrent faults). * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). + * If mmap_sem is released, vma may become invalid (for example + * by other thread calling munmap()). */ static vm_fault_t do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + struct mm_struct *vm_mm = READ_ONCE(vma->vm_mm); vm_fault_t ret; + mmgrab(vm_mm); + /* * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ @@ -3561,9 +3566,12 @@ static vm_fault_t do_fault(struct vm_fault *vmf) /* preallocated pagetable is unused: free it */ if (vmf->prealloc_pte) { - pte_free(vma->vm_mm, vmf->prealloc_pte); + pte_free(vm_mm, vmf->prealloc_pte); vmf->prealloc_pte = NULL; } + + mmdrop(vm_mm); + return ret; } -- 1.8.3.1
Re: [PATCH v2] mm: page_mapped: don't assume compound page is huge or THP
- Original Message - > On Fri, Nov 30, 2018 at 1:07 PM Jan Stancek wrote: > > > > LTP proc01 testcase has been observed to rarely trigger crashes > > on arm64: > > page_mapped+0x78/0xb4 > > stable_page_flags+0x27c/0x338 > > kpageflags_read+0xfc/0x164 > > proc_reg_read+0x7c/0xb8 > > __vfs_read+0x58/0x178 > > vfs_read+0x90/0x14c > > SyS_read+0x60/0xc0 > > > > Issue is that page_mapped() assumes that if compound page is not > > huge, then it must be THP. But if this is 'normal' compound page > > (COMPOUND_PAGE_DTOR), then following loop can keep running > > (for HPAGE_PMD_NR iterations) until it tries to read from memory > > that isn't mapped and triggers a panic: > > for (i = 0; i < hpage_nr_pages(page); i++) { > > if (atomic_read([i]._mapcount) >= 0) > > return true; > > } > > > > I could replicate this on x86 (v4.20-rc4-98-g60b548237fed) only > > with a custom kernel module [1] which: > > - allocates compound page (PAGEC) of order 1 > > - allocates 2 normal pages (COPY), which are initialized to 0xff > > (to satisfy _mapcount >= 0) > > - 2 PAGEC page structs are copied to address of first COPY page > > - second page of COPY is marked as not present > > - call to page_mapped(COPY) now triggers fault on access to 2nd > > COPY page at offset 0x30 (_mapcount) > > > > [1] > > https://github.com/jstancek/reproducers/blob/master/kernel/page_mapped_crash/repro.c > > > > Fix the loop to iterate for "1 << compound_order" pages. > > > > Debugged-by: Laszlo Ersek > > Suggested-by: "Kirill A. Shutemov" > > Signed-off-by: Jan Stancek > > --- > > mm/util.c | 2 +- > > 1 file changed, 1 insertion(+), 1 deletion(-) > > > > Changes in v2: > > - change the loop instead so we check also mapcount of subpages > > > > diff --git a/mm/util.c b/mm/util.c > > index 8bf08b5b5760..5c9c7359ee8a 100644 > > --- a/mm/util.c > > +++ b/mm/util.c > > @@ -478,7 +478,7 @@ bool page_mapped(struct page *page) > > return true; > > if (PageHuge(page)) > > return false; > > - for (i = 0; i < hpage_nr_pages(page); i++) { > > + for (i = 0; i < (1 << compound_order(page)); i++) { > > if (atomic_read([i]._mapcount) >= 0) > > return true; > > } > > -- > > 1.8.3.1 > > Hi all > > This patch landed in the 4.9-stable tree starting from 4.9.151 and it > broke our MIPS1004kc system with CONFIG_HIGHMEM=y. Hi, are you using THP (CONFIG_TRANSPARENT_HUGEPAGE)? The changed line should affect only THP and normal compound pages, so a test with THP disabled might be interesting. > > The breakage consists of random processes dying with SIGILL or SIGSEGV > when we stress test the system with high memory pressure and explicit > memory compaction requested through /proc/sys/vm/compact_memory. > Reverting this patch fixes the crashes. > > We can put some effort on debugging if there are no obvious > explanations for this. Keep in mind that this is 32-bit system with > HIGHMEM. Nothing obvious that I can see. I've been trying to reproduce on 32-bit x86 Fedora with no luck so far. Regards, Jan
[PATCH v2] mm: page_mapped: don't assume compound page is huge or THP
LTP proc01 testcase has been observed to rarely trigger crashes on arm64: page_mapped+0x78/0xb4 stable_page_flags+0x27c/0x338 kpageflags_read+0xfc/0x164 proc_reg_read+0x7c/0xb8 __vfs_read+0x58/0x178 vfs_read+0x90/0x14c SyS_read+0x60/0xc0 Issue is that page_mapped() assumes that if compound page is not huge, then it must be THP. But if this is 'normal' compound page (COMPOUND_PAGE_DTOR), then following loop can keep running (for HPAGE_PMD_NR iterations) until it tries to read from memory that isn't mapped and triggers a panic: for (i = 0; i < hpage_nr_pages(page); i++) { if (atomic_read([i]._mapcount) >= 0) return true; } I could replicate this on x86 (v4.20-rc4-98-g60b548237fed) only with a custom kernel module [1] which: - allocates compound page (PAGEC) of order 1 - allocates 2 normal pages (COPY), which are initialized to 0xff (to satisfy _mapcount >= 0) - 2 PAGEC page structs are copied to address of first COPY page - second page of COPY is marked as not present - call to page_mapped(COPY) now triggers fault on access to 2nd COPY page at offset 0x30 (_mapcount) [1] https://github.com/jstancek/reproducers/blob/master/kernel/page_mapped_crash/repro.c Fix the loop to iterate for "1 << compound_order" pages. Debugged-by: Laszlo Ersek Suggested-by: "Kirill A. Shutemov" Signed-off-by: Jan Stancek --- mm/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Changes in v2: - change the loop instead so we check also mapcount of subpages diff --git a/mm/util.c b/mm/util.c index 8bf08b5b5760..5c9c7359ee8a 100644 --- a/mm/util.c +++ b/mm/util.c @@ -478,7 +478,7 @@ bool page_mapped(struct page *page) return true; if (PageHuge(page)) return false; - for (i = 0; i < hpage_nr_pages(page); i++) { + for (i = 0; i < (1 << compound_order(page)); i++) { if (atomic_read([i]._mapcount) >= 0) return true; } -- 1.8.3.1
[PATCH v2] mm: page_mapped: don't assume compound page is huge or THP
LTP proc01 testcase has been observed to rarely trigger crashes on arm64: page_mapped+0x78/0xb4 stable_page_flags+0x27c/0x338 kpageflags_read+0xfc/0x164 proc_reg_read+0x7c/0xb8 __vfs_read+0x58/0x178 vfs_read+0x90/0x14c SyS_read+0x60/0xc0 Issue is that page_mapped() assumes that if compound page is not huge, then it must be THP. But if this is 'normal' compound page (COMPOUND_PAGE_DTOR), then following loop can keep running (for HPAGE_PMD_NR iterations) until it tries to read from memory that isn't mapped and triggers a panic: for (i = 0; i < hpage_nr_pages(page); i++) { if (atomic_read([i]._mapcount) >= 0) return true; } I could replicate this on x86 (v4.20-rc4-98-g60b548237fed) only with a custom kernel module [1] which: - allocates compound page (PAGEC) of order 1 - allocates 2 normal pages (COPY), which are initialized to 0xff (to satisfy _mapcount >= 0) - 2 PAGEC page structs are copied to address of first COPY page - second page of COPY is marked as not present - call to page_mapped(COPY) now triggers fault on access to 2nd COPY page at offset 0x30 (_mapcount) [1] https://github.com/jstancek/reproducers/blob/master/kernel/page_mapped_crash/repro.c Fix the loop to iterate for "1 << compound_order" pages. Debugged-by: Laszlo Ersek Suggested-by: "Kirill A. Shutemov" Signed-off-by: Jan Stancek --- mm/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Changes in v2: - change the loop instead so we check also mapcount of subpages diff --git a/mm/util.c b/mm/util.c index 8bf08b5b5760..5c9c7359ee8a 100644 --- a/mm/util.c +++ b/mm/util.c @@ -478,7 +478,7 @@ bool page_mapped(struct page *page) return true; if (PageHuge(page)) return false; - for (i = 0; i < hpage_nr_pages(page); i++) { + for (i = 0; i < (1 << compound_order(page)); i++) { if (atomic_read([i]._mapcount) >= 0) return true; } -- 1.8.3.1
[PATCH] mm: page_mapped: don't assume compound page is huge or THP
LTP proc01 testcase has been observed to rarely trigger crashes on arm64: page_mapped+0x78/0xb4 stable_page_flags+0x27c/0x338 kpageflags_read+0xfc/0x164 proc_reg_read+0x7c/0xb8 __vfs_read+0x58/0x178 vfs_read+0x90/0x14c SyS_read+0x60/0xc0 Issue is that page_mapped() assumes that if compound page is not huge, then it must be THP. But if this is 'normal' compound page (COMPOUND_PAGE_DTOR), then following loop can keep running until it tries to read from memory that isn't mapped and triggers a panic: for (i = 0; i < hpage_nr_pages(page); i++) { if (atomic_read([i]._mapcount) >= 0) return true; } I could replicate this on x86 (v4.20-rc4-98-g60b548237fed) only with a custom kernel module [1] which: - allocates compound page (PAGEC) of order 1 - allocates 2 normal pages (COPY), which are initialized to 0xff (to satisfy _mapcount >= 0) - 2 PAGEC page structs are copied to address of first COPY page - second page of COPY is marked as not present - call to page_mapped(COPY) now triggers fault on access to 2nd COPY page at offset 0x30 (_mapcount) [1] https://github.com/jstancek/reproducers/blob/master/kernel/page_mapped_crash/repro.c This patch modifies page_mapped() to check for 'normal' compound pages (COMPOUND_PAGE_DTOR). Debugged-by: Laszlo Ersek Signed-off-by: Jan Stancek --- include/linux/mm.h | 9 + mm/util.c | 2 ++ 2 files changed, 11 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5411de93a363..18b0bb953f92 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -700,6 +700,15 @@ static inline compound_page_dtor *get_compound_page_dtor(struct page *page) return compound_page_dtors[page[1].compound_dtor]; } +static inline int PageNormalCompound(struct page *page) +{ + if (!PageCompound(page)) + return 0; + + page = compound_head(page); + return page[1].compound_dtor == COMPOUND_PAGE_DTOR; +} + static inline unsigned int compound_order(struct page *page) { if (!PageHead(page)) diff --git a/mm/util.c b/mm/util.c index 8bf08b5b5760..06c1640cb7b3 100644 --- a/mm/util.c +++ b/mm/util.c @@ -478,6 +478,8 @@ bool page_mapped(struct page *page) return true; if (PageHuge(page)) return false; + if (PageNormalCompound(page)) + return false; for (i = 0; i < hpage_nr_pages(page); i++) { if (atomic_read([i]._mapcount) >= 0) return true; -- 1.8.3.1
[PATCH] mm: page_mapped: don't assume compound page is huge or THP
LTP proc01 testcase has been observed to rarely trigger crashes on arm64: page_mapped+0x78/0xb4 stable_page_flags+0x27c/0x338 kpageflags_read+0xfc/0x164 proc_reg_read+0x7c/0xb8 __vfs_read+0x58/0x178 vfs_read+0x90/0x14c SyS_read+0x60/0xc0 Issue is that page_mapped() assumes that if compound page is not huge, then it must be THP. But if this is 'normal' compound page (COMPOUND_PAGE_DTOR), then following loop can keep running until it tries to read from memory that isn't mapped and triggers a panic: for (i = 0; i < hpage_nr_pages(page); i++) { if (atomic_read([i]._mapcount) >= 0) return true; } I could replicate this on x86 (v4.20-rc4-98-g60b548237fed) only with a custom kernel module [1] which: - allocates compound page (PAGEC) of order 1 - allocates 2 normal pages (COPY), which are initialized to 0xff (to satisfy _mapcount >= 0) - 2 PAGEC page structs are copied to address of first COPY page - second page of COPY is marked as not present - call to page_mapped(COPY) now triggers fault on access to 2nd COPY page at offset 0x30 (_mapcount) [1] https://github.com/jstancek/reproducers/blob/master/kernel/page_mapped_crash/repro.c This patch modifies page_mapped() to check for 'normal' compound pages (COMPOUND_PAGE_DTOR). Debugged-by: Laszlo Ersek Signed-off-by: Jan Stancek --- include/linux/mm.h | 9 + mm/util.c | 2 ++ 2 files changed, 11 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5411de93a363..18b0bb953f92 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -700,6 +700,15 @@ static inline compound_page_dtor *get_compound_page_dtor(struct page *page) return compound_page_dtors[page[1].compound_dtor]; } +static inline int PageNormalCompound(struct page *page) +{ + if (!PageCompound(page)) + return 0; + + page = compound_head(page); + return page[1].compound_dtor == COMPOUND_PAGE_DTOR; +} + static inline unsigned int compound_order(struct page *page) { if (!PageHead(page)) diff --git a/mm/util.c b/mm/util.c index 8bf08b5b5760..06c1640cb7b3 100644 --- a/mm/util.c +++ b/mm/util.c @@ -478,6 +478,8 @@ bool page_mapped(struct page *page) return true; if (PageHuge(page)) return false; + if (PageNormalCompound(page)) + return false; for (i = 0; i < hpage_nr_pages(page); i++) { if (atomic_read([i]._mapcount) >= 0) return true; -- 1.8.3.1
Re: [LTP] [PATCH 4.17 00/67] 4.17.7-stable review
- Original Message - > On 16 July 2018 at 13:04, Greg Kroah-Hartman > wrote: > > This is the start of the stable review cycle for the 4.17.7 release. > > There are 67 patches in this series, all will be posted as a response > > to this one. If anyone has any issues with these being applied, please > > let me know. > > > > Responses should be made by Wed Jul 18 07:34:11 UTC 2018. > > Anything received after that time might be too late. > > > > The whole patch series can be found in one patch at: > > > > https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.17.7-rc1.gz > > or in the git tree and branch at: > > > > git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git > > linux-4.17.y > > and the diffstat can be found below. > > > > thanks, > > > > greg k-h > > > Results from Linaro’s test farm. > Regressions detected. > > LTP syscalls failed test cases on all devices arm64, arm32 and x86_64, > - creat08 > - open10 This is consequence of: 0fa3ecd87848 ("Fix up non-directory creation in SGID directories") So likely cause of action here will be modifying both tests to not expect S_ISGID in scenario described in commit above. [adding Dave Chinner] @Dave: Does XFS need separate patch for problem described in commit 0fa3ecd87848? XFS doesn't appear to use inode_init_owner() function, both tests currently PASS the check for S_ISGID on file created by non-group member as of v4.18-rc5. Regards, Jan > > Reported this bug internally on upstream Linux mainline week back. > Now this bug happening on 4.17, 4.14, 4.9 and 4.4 > > creat08 and open10 failed with this error, > TFAIL : testdir.B.3132/setgid: Incorrect modes, setgid bit should be set > > Test case description: > /* > * NAME > * creat08.c - Verifies that the group ID and setgid bit are > *set correctly when a new file is created. > *(ported from SPIE, section2/iosuite/creat5.c, > * by Airong Zhang ) > * CALLS > * creat > * > * ALGORITHM > * Create two directories, one with the group ID of this process > * and the setgid bit not set, and the other with a group ID > * other than that of this process and with the setgid bit set. > * In each directory, create a file with and without the setgid > * bit set in the creation modes. Verify that the modes and group > * ID are correct on each of the 4 files. > * As root, create a file with the setgid bit on in the > * directory with the setgid bit. > * This tests the SVID3 create group semantics. > */ > > Block2 testing: > /*--*/ > /* Block2: Create two files in testdir.B, one with the setgid */ > /* bit set in the creation modes and the other without. */ > /*Both should inherit the group ID of the parent */ > /*directory, group2. */ > /*--*/ > > > Test results comparison on mainline versions, > https://qa-reports.linaro.org/lkft/linux-mainline-oe/tests/ltp-syscalls-tests/creat08 > https://qa-reports.linaro.org/lkft/linux-mainline-oe/tests/ltp-syscalls-tests/open10 > > Test results comparison on 4.17 versions, > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/tests/ltp-syscalls-tests/creat08 > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/tests/ltp-syscalls-tests/open10 > > Bug report link, > https://bugs.linaro.org/show_bug.cgi?id=3940 > > Summary > > > kernel: 4.17.7-rc1 > git repo: > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git > git branch: linux-4.17.y > git commit: bc0bd9e05fa1e213c689620eb4cba825c03dcc4a > git describe: v4.17.6-68-gbc0bd9e05fa1 > Test details: > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/build/v4.17.6-68-gbc0bd9e05fa1 > > Regressions (compared to build v4.17.6-67-g3b02b1fd1975) > > > LTP syscalls failed test cases on all devices, > > - creat08 > - open10 > > -- > Linaro LKFT > https://lkft.linaro.org > > -- > Mailing list info: https://lists.linux.it/listinfo/ltp >
Re: [LTP] [PATCH 4.17 00/67] 4.17.7-stable review
- Original Message - > On 16 July 2018 at 13:04, Greg Kroah-Hartman > wrote: > > This is the start of the stable review cycle for the 4.17.7 release. > > There are 67 patches in this series, all will be posted as a response > > to this one. If anyone has any issues with these being applied, please > > let me know. > > > > Responses should be made by Wed Jul 18 07:34:11 UTC 2018. > > Anything received after that time might be too late. > > > > The whole patch series can be found in one patch at: > > > > https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.17.7-rc1.gz > > or in the git tree and branch at: > > > > git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git > > linux-4.17.y > > and the diffstat can be found below. > > > > thanks, > > > > greg k-h > > > Results from Linaro’s test farm. > Regressions detected. > > LTP syscalls failed test cases on all devices arm64, arm32 and x86_64, > - creat08 > - open10 This is consequence of: 0fa3ecd87848 ("Fix up non-directory creation in SGID directories") So likely cause of action here will be modifying both tests to not expect S_ISGID in scenario described in commit above. [adding Dave Chinner] @Dave: Does XFS need separate patch for problem described in commit 0fa3ecd87848? XFS doesn't appear to use inode_init_owner() function, both tests currently PASS the check for S_ISGID on file created by non-group member as of v4.18-rc5. Regards, Jan > > Reported this bug internally on upstream Linux mainline week back. > Now this bug happening on 4.17, 4.14, 4.9 and 4.4 > > creat08 and open10 failed with this error, > TFAIL : testdir.B.3132/setgid: Incorrect modes, setgid bit should be set > > Test case description: > /* > * NAME > * creat08.c - Verifies that the group ID and setgid bit are > *set correctly when a new file is created. > *(ported from SPIE, section2/iosuite/creat5.c, > * by Airong Zhang ) > * CALLS > * creat > * > * ALGORITHM > * Create two directories, one with the group ID of this process > * and the setgid bit not set, and the other with a group ID > * other than that of this process and with the setgid bit set. > * In each directory, create a file with and without the setgid > * bit set in the creation modes. Verify that the modes and group > * ID are correct on each of the 4 files. > * As root, create a file with the setgid bit on in the > * directory with the setgid bit. > * This tests the SVID3 create group semantics. > */ > > Block2 testing: > /*--*/ > /* Block2: Create two files in testdir.B, one with the setgid */ > /* bit set in the creation modes and the other without. */ > /*Both should inherit the group ID of the parent */ > /*directory, group2. */ > /*--*/ > > > Test results comparison on mainline versions, > https://qa-reports.linaro.org/lkft/linux-mainline-oe/tests/ltp-syscalls-tests/creat08 > https://qa-reports.linaro.org/lkft/linux-mainline-oe/tests/ltp-syscalls-tests/open10 > > Test results comparison on 4.17 versions, > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/tests/ltp-syscalls-tests/creat08 > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/tests/ltp-syscalls-tests/open10 > > Bug report link, > https://bugs.linaro.org/show_bug.cgi?id=3940 > > Summary > > > kernel: 4.17.7-rc1 > git repo: > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git > git branch: linux-4.17.y > git commit: bc0bd9e05fa1e213c689620eb4cba825c03dcc4a > git describe: v4.17.6-68-gbc0bd9e05fa1 > Test details: > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/build/v4.17.6-68-gbc0bd9e05fa1 > > Regressions (compared to build v4.17.6-67-g3b02b1fd1975) > > > LTP syscalls failed test cases on all devices, > > - creat08 > - open10 > > -- > Linaro LKFT > https://lkft.linaro.org > > -- > Mailing list info: https://lists.linux.it/listinfo/ltp >
Re: LTP CVE cve-2017-17053 test failed on x86_64 device
- Original Message - > LTP CVE cve-2017-17053 test failed on x86_64 device. > FAIL on linux-next, mainline, and stable-rc-4.17. > PASS on stable-rc 4.16, 4.14, 4.9 and 4.4 kernel. > > Test FAIL case output, > tst_test.c:1015: INFO: Timeout per run is 0h 15m 00s > tst_taint.c:88: BROK: Kernel is already tainted: 512 You seem to be running into some nfs warning early during boot [1][2], that taints the kernel. cve-2017-17053 test doesn't do much, it fails in setup because it finds kernel already tainted. Looks similar to: https://www.spinics.net/lists/linux-nfs/msg68064.html Regards, Jan [1] [ 78.886529] [ cut here ] [ 78.891155] DEBUG_LOCKS_WARN_ON(sem->owner != ((struct task_struct *)(1UL << 0))) [ 78.891161] WARNING: CPU: 0 PID: 33 at /srv/oe/build/tmp-rpb-glibc/work-shared/intel-core2-32/kernel-source/kernel/locking/rwsem.c:217 up_read_non_owner+0x5d/0x70 [ 78.913141] Modules linked in: fuse [ 78.916633] CPU: 0 PID: 33 Comm: kworker/0:1 Not tainted 4.18.0-rc1 #1 [ 78.923150] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS 2.0b 07/27/2017 [ 78.930623] Workqueue: nfsiod rpc_async_release [ 78.935154] RIP: 0010:up_read_non_owner+0x5d/0x70 [ 78.939851] Code: 00 5b 5d c3 e8 54 9f 3a 00 85 c0 74 de 8b 05 ba 40 6a 02 85 c0 75 d4 48 c7 c6 c8 ce 8f 9c 48 c7 c7 93 8e 8e 9c e8 33 6b fa ff <0f> 0b eb bd 0f 1f 44 00 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 [ 78.958717] RSP: 0018:b46b419ebdd0 EFLAGS: 00010282 [ 78.963935] RAX: RBX: 882d1c390130 RCX: 0006 [ 78.971059] RDX: 0007 RSI: 0001 RDI: 882d2fc15730 [ 78.978184] RBP: b46b419ebdd8 R08: R09: [ 78.985318] R10: b46b419ebdd0 R11: R12: 882d1ad56800 [ 78.985319] R13: 882d1d133000 R14: R15: 882d1f047840 [ 78.985319] FS: () GS:882d2fc0() knlGS: [ 78.985320] CS: 0010 DS: ES: CR0: 80050033 [ 78.985321] CR2: 56385d962ae0 CR3: 21e1e002 CR4: 003606f0 [ 78.985321] DR0: DR1: DR2: [ 78.985322] DR3: DR6: fffe0ff0 DR7: 0400 [ 78.985322] Call Trace: [[0;32m OK [[ 78.985327] nfs_async_unlink_release+0x32/0x80 [ 78.985329] rpc_free_task+0x30/0x50 [ 78.985330] rpc_async_release+0x12/0x20 [ 78.985332] process_one_work+0x278/0x670 0m] [ 78.985335] worker_thread+0x4d/0x410 [ 78.985337] kthread+0x10d/0x140 [ 78.985338] ? rescuer_thread+0x3a0/0x3a0 [ 78.985339] ? kthread_create_worker_on_cpu+0x70/0x70 [ 78.985341] ret_from_fork+0x3a/0x50 [ 78.985344] irq event stamp: 36545 [ 78.985346] hardirqs last enabled at (36545): [] _raw_spin_unlock_irq+0x2c/0x40 [ 78.985347] hardirqs last disabled at (36544): [] _raw_spin_lock_irq+0x13/0x50 [ 78.985348] softirqs last enabled at (34878): [] reg_todo+0x260/0x2f0 [ 78.985350] softirqs last disabled at (34876): [] reg_todo+0x171/0x2f0 [ 79.111787] ---[ end trace bed1f41cfea3a4c5 ]--- [2] https://lkft.validation.linaro.org/scheduler/job/290429#L893 > Summary: > passed 0 > failed 0 > skipped 0 > warnings 0 > > Test comments, > /* Regression test for CVE-2017-17053, original reproducer can be found > * here: > * > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ccd5b3235180eef3cfec337df1c8554ab151b5cc > * > * Be careful! This test may crash your kernel! > */ > > Full test log: > https://lkft.validation.linaro.org/scheduler/job/290429#L9581 > https://lkft.validation.linaro.org/scheduler/job/259217#L10308 > > History of the test case shows failed on > Linux next kernel, > https://qa-reports.linaro.org/lkft/linux-next-oe/tests/ltp-cve-tests/cve-2017-17053 > > Linux mainline kernel, > https://qa-reports.linaro.org/lkft/linux-mainline-oe/tests/ltp-cve-tests \ > /cve-2017-17053 > ^ Please join link > > Linux stable rc 4.17, > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/tests/ltp-cve-tests > \ > /cve-2017-17053 > ^ Please join link > > Test PASS on 4.16 kernel. > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.16-oe/tests/ltp-cve-tests > \ > /cve-2017-17053 > ^ Please join link > > Best regards > Naresh Kamboju >
Re: LTP CVE cve-2017-17053 test failed on x86_64 device
- Original Message - > LTP CVE cve-2017-17053 test failed on x86_64 device. > FAIL on linux-next, mainline, and stable-rc-4.17. > PASS on stable-rc 4.16, 4.14, 4.9 and 4.4 kernel. > > Test FAIL case output, > tst_test.c:1015: INFO: Timeout per run is 0h 15m 00s > tst_taint.c:88: BROK: Kernel is already tainted: 512 You seem to be running into some nfs warning early during boot [1][2], that taints the kernel. cve-2017-17053 test doesn't do much, it fails in setup because it finds kernel already tainted. Looks similar to: https://www.spinics.net/lists/linux-nfs/msg68064.html Regards, Jan [1] [ 78.886529] [ cut here ] [ 78.891155] DEBUG_LOCKS_WARN_ON(sem->owner != ((struct task_struct *)(1UL << 0))) [ 78.891161] WARNING: CPU: 0 PID: 33 at /srv/oe/build/tmp-rpb-glibc/work-shared/intel-core2-32/kernel-source/kernel/locking/rwsem.c:217 up_read_non_owner+0x5d/0x70 [ 78.913141] Modules linked in: fuse [ 78.916633] CPU: 0 PID: 33 Comm: kworker/0:1 Not tainted 4.18.0-rc1 #1 [ 78.923150] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS 2.0b 07/27/2017 [ 78.930623] Workqueue: nfsiod rpc_async_release [ 78.935154] RIP: 0010:up_read_non_owner+0x5d/0x70 [ 78.939851] Code: 00 5b 5d c3 e8 54 9f 3a 00 85 c0 74 de 8b 05 ba 40 6a 02 85 c0 75 d4 48 c7 c6 c8 ce 8f 9c 48 c7 c7 93 8e 8e 9c e8 33 6b fa ff <0f> 0b eb bd 0f 1f 44 00 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 [ 78.958717] RSP: 0018:b46b419ebdd0 EFLAGS: 00010282 [ 78.963935] RAX: RBX: 882d1c390130 RCX: 0006 [ 78.971059] RDX: 0007 RSI: 0001 RDI: 882d2fc15730 [ 78.978184] RBP: b46b419ebdd8 R08: R09: [ 78.985318] R10: b46b419ebdd0 R11: R12: 882d1ad56800 [ 78.985319] R13: 882d1d133000 R14: R15: 882d1f047840 [ 78.985319] FS: () GS:882d2fc0() knlGS: [ 78.985320] CS: 0010 DS: ES: CR0: 80050033 [ 78.985321] CR2: 56385d962ae0 CR3: 21e1e002 CR4: 003606f0 [ 78.985321] DR0: DR1: DR2: [ 78.985322] DR3: DR6: fffe0ff0 DR7: 0400 [ 78.985322] Call Trace: [[0;32m OK [[ 78.985327] nfs_async_unlink_release+0x32/0x80 [ 78.985329] rpc_free_task+0x30/0x50 [ 78.985330] rpc_async_release+0x12/0x20 [ 78.985332] process_one_work+0x278/0x670 0m] [ 78.985335] worker_thread+0x4d/0x410 [ 78.985337] kthread+0x10d/0x140 [ 78.985338] ? rescuer_thread+0x3a0/0x3a0 [ 78.985339] ? kthread_create_worker_on_cpu+0x70/0x70 [ 78.985341] ret_from_fork+0x3a/0x50 [ 78.985344] irq event stamp: 36545 [ 78.985346] hardirqs last enabled at (36545): [] _raw_spin_unlock_irq+0x2c/0x40 [ 78.985347] hardirqs last disabled at (36544): [] _raw_spin_lock_irq+0x13/0x50 [ 78.985348] softirqs last enabled at (34878): [] reg_todo+0x260/0x2f0 [ 78.985350] softirqs last disabled at (34876): [] reg_todo+0x171/0x2f0 [ 79.111787] ---[ end trace bed1f41cfea3a4c5 ]--- [2] https://lkft.validation.linaro.org/scheduler/job/290429#L893 > Summary: > passed 0 > failed 0 > skipped 0 > warnings 0 > > Test comments, > /* Regression test for CVE-2017-17053, original reproducer can be found > * here: > * > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ccd5b3235180eef3cfec337df1c8554ab151b5cc > * > * Be careful! This test may crash your kernel! > */ > > Full test log: > https://lkft.validation.linaro.org/scheduler/job/290429#L9581 > https://lkft.validation.linaro.org/scheduler/job/259217#L10308 > > History of the test case shows failed on > Linux next kernel, > https://qa-reports.linaro.org/lkft/linux-next-oe/tests/ltp-cve-tests/cve-2017-17053 > > Linux mainline kernel, > https://qa-reports.linaro.org/lkft/linux-mainline-oe/tests/ltp-cve-tests \ > /cve-2017-17053 > ^ Please join link > > Linux stable rc 4.17, > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/tests/ltp-cve-tests > \ > /cve-2017-17053 > ^ Please join link > > Test PASS on 4.16 kernel. > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.16-oe/tests/ltp-cve-tests > \ > /cve-2017-17053 > ^ Please join link > > Best regards > Naresh Kamboju >
Re: [LTP] [PATCH 4.4 00/24] 4.4.137-stable review
- Original Message - > On Thu, Jun 14, 2018 at 05:49:52AM -0400, Jan Stancek wrote: > > > > - Original Message - > > > On Thu, Jun 14, 2018 at 02:24:25PM +0530, Naresh Kamboju wrote: > > > > On 14 June 2018 at 12:04, Greg Kroah-Hartman > > > > > > > > wrote: > > > > > On Wed, Jun 13, 2018 at 10:48:50PM -0300, Rafael Tinoco wrote: > > > > >> On 13 June 2018 at 18:08, Rafael David Tinoco > > > > >> wrote: > > > > >> > On Wed, Jun 13, 2018 at 6:00 PM, Greg Kroah-Hartman > > > > >> > wrote: > > > > >> >> On Wed, Jun 13, 2018 at 05:47:49PM -0300, Rafael Tinoco wrote: > > > > >> >>> Results from Linaro’s test farm. > > > > >> >>> Regressions detected. > > > > >> >>> > > > > >> >>> NOTE: > > > > >> >>> > > > > >> >>> 1) LTP vma03 test (cve-2011-2496) broken on v4.4-137-rc1 because > > > > >> >>> of: > > > > >> >>> > > > > >> >>> 6ea1dc96a03a mmap: relax file size limit for regular files > > > > >> >>> bd2f9ce5bacb mmap: introduce sane default mmap limits > > > > >> >>> > > > > >> >>>discussion: > > > > >> >>> > > > > >> >>> https://github.com/linux-test-project/ltp/issues/341 > > > > >> >>> > > > > >> >>>mainline commit (v4.13-rc7): > > > > >> >>> > > > > >> >>> 0cc3b0ec23ce Clarify (and fix) MAX_LFS_FILESIZE macros > > > > >> >>> > > > > >> >>>should be backported to 4.4.138-rc2 and fixes the issue. > > > > >> >> > > > > >> >> Really? That commit says it fixes c2a9737f45e2 ("vfs,mm: fix a > > > > >> >> dead > > > > >> >> loop in truncate_inode_pages_range()") which is not in 4.4.y at > > > > >> >> all. > > > > >> >> > > > > >> >> Did you test this out? > > > > >> > > > > > >> > Yes, the LTP contains the tests (last comment is the final test > > > > >> > for > > > > >> > arm32, right before Jan tests i686). > > > > >> > > > > > >> > Fixing MAX_LFS_FILESIZE fixes the new limit for mmap() brought by > > > > >> > those 2 commits (file_mmap_size_max()). > > > > >> > offset tested by the LTP test is 0xfffe000. > > > > >> > file_mmap_size_max gives: 0x000 as max value, but only > > > > >> > after > > > > >> > the mentioned patch. > > > > >> > > > > > >> > Original intent for this fix was other though. > > > > >> > > > > >> To clarify this a bit further. > > > > >> > > > > >> The LTP CVE test is breaking in the first call to mmap(), even > > > > >> before > > > > >> trying to remap and test the security issue. That start happening in > > > > >> this round because of those mmap() changes and the offset used in > > > > >> the > > > > >> LTP test. Linus changed limit checks and made them to be related to > > > > >> MAX_LFS_FILESIZE. Unfortunately, in 4.4 stable, we were missing the > > > > >> fix for MAX_LFS_FILESIZE (which before commit 0cc3b0ec23ce was less > > > > >> than the REAL 32 bit limit). > > > > >> > > > > >> Commit 0cc3b0ec23ce was made because an user noticed the FS limit > > > > >> not > > > > >> being what it should be. In our case, the 4.4 stable kernel, we are > > > > >> facing this 32 bit lower limit (than the real 32 bit real limit), > > > > >> because of the LTP CVE test, so we need this fix to have the real 32 > > > > >> bit limit set for that macro (mmap limits did not use that macro > > > > >> before). > > > > >> > > > > >> I have tested in arm32 and Jan Stancek, who first responded to LTP > > > > >> issue, has tested this in i686 and both worked after that patch was > >
Re: [LTP] [PATCH 4.4 00/24] 4.4.137-stable review
- Original Message - > On Thu, Jun 14, 2018 at 05:49:52AM -0400, Jan Stancek wrote: > > > > - Original Message - > > > On Thu, Jun 14, 2018 at 02:24:25PM +0530, Naresh Kamboju wrote: > > > > On 14 June 2018 at 12:04, Greg Kroah-Hartman > > > > > > > > wrote: > > > > > On Wed, Jun 13, 2018 at 10:48:50PM -0300, Rafael Tinoco wrote: > > > > >> On 13 June 2018 at 18:08, Rafael David Tinoco > > > > >> wrote: > > > > >> > On Wed, Jun 13, 2018 at 6:00 PM, Greg Kroah-Hartman > > > > >> > wrote: > > > > >> >> On Wed, Jun 13, 2018 at 05:47:49PM -0300, Rafael Tinoco wrote: > > > > >> >>> Results from Linaro’s test farm. > > > > >> >>> Regressions detected. > > > > >> >>> > > > > >> >>> NOTE: > > > > >> >>> > > > > >> >>> 1) LTP vma03 test (cve-2011-2496) broken on v4.4-137-rc1 because > > > > >> >>> of: > > > > >> >>> > > > > >> >>> 6ea1dc96a03a mmap: relax file size limit for regular files > > > > >> >>> bd2f9ce5bacb mmap: introduce sane default mmap limits > > > > >> >>> > > > > >> >>>discussion: > > > > >> >>> > > > > >> >>> https://github.com/linux-test-project/ltp/issues/341 > > > > >> >>> > > > > >> >>>mainline commit (v4.13-rc7): > > > > >> >>> > > > > >> >>> 0cc3b0ec23ce Clarify (and fix) MAX_LFS_FILESIZE macros > > > > >> >>> > > > > >> >>>should be backported to 4.4.138-rc2 and fixes the issue. > > > > >> >> > > > > >> >> Really? That commit says it fixes c2a9737f45e2 ("vfs,mm: fix a > > > > >> >> dead > > > > >> >> loop in truncate_inode_pages_range()") which is not in 4.4.y at > > > > >> >> all. > > > > >> >> > > > > >> >> Did you test this out? > > > > >> > > > > > >> > Yes, the LTP contains the tests (last comment is the final test > > > > >> > for > > > > >> > arm32, right before Jan tests i686). > > > > >> > > > > > >> > Fixing MAX_LFS_FILESIZE fixes the new limit for mmap() brought by > > > > >> > those 2 commits (file_mmap_size_max()). > > > > >> > offset tested by the LTP test is 0xfffe000. > > > > >> > file_mmap_size_max gives: 0x000 as max value, but only > > > > >> > after > > > > >> > the mentioned patch. > > > > >> > > > > > >> > Original intent for this fix was other though. > > > > >> > > > > >> To clarify this a bit further. > > > > >> > > > > >> The LTP CVE test is breaking in the first call to mmap(), even > > > > >> before > > > > >> trying to remap and test the security issue. That start happening in > > > > >> this round because of those mmap() changes and the offset used in > > > > >> the > > > > >> LTP test. Linus changed limit checks and made them to be related to > > > > >> MAX_LFS_FILESIZE. Unfortunately, in 4.4 stable, we were missing the > > > > >> fix for MAX_LFS_FILESIZE (which before commit 0cc3b0ec23ce was less > > > > >> than the REAL 32 bit limit). > > > > >> > > > > >> Commit 0cc3b0ec23ce was made because an user noticed the FS limit > > > > >> not > > > > >> being what it should be. In our case, the 4.4 stable kernel, we are > > > > >> facing this 32 bit lower limit (than the real 32 bit real limit), > > > > >> because of the LTP CVE test, so we need this fix to have the real 32 > > > > >> bit limit set for that macro (mmap limits did not use that macro > > > > >> before). > > > > >> > > > > >> I have tested in arm32 and Jan Stancek, who first responded to LTP > > > > >> issue, has tested this in i686 and both worked after that patch was > >
Re: [LTP] [PATCH 4.4 00/24] 4.4.137-stable review
- Original Message - > On Thu, Jun 14, 2018 at 02:24:25PM +0530, Naresh Kamboju wrote: > > On 14 June 2018 at 12:04, Greg Kroah-Hartman > > wrote: > > > On Wed, Jun 13, 2018 at 10:48:50PM -0300, Rafael Tinoco wrote: > > >> On 13 June 2018 at 18:08, Rafael David Tinoco > > >> wrote: > > >> > On Wed, Jun 13, 2018 at 6:00 PM, Greg Kroah-Hartman > > >> > wrote: > > >> >> On Wed, Jun 13, 2018 at 05:47:49PM -0300, Rafael Tinoco wrote: > > >> >>> Results from Linaro’s test farm. > > >> >>> Regressions detected. > > >> >>> > > >> >>> NOTE: > > >> >>> > > >> >>> 1) LTP vma03 test (cve-2011-2496) broken on v4.4-137-rc1 because of: > > >> >>> > > >> >>> 6ea1dc96a03a mmap: relax file size limit for regular files > > >> >>> bd2f9ce5bacb mmap: introduce sane default mmap limits > > >> >>> > > >> >>>discussion: > > >> >>> > > >> >>> https://github.com/linux-test-project/ltp/issues/341 > > >> >>> > > >> >>>mainline commit (v4.13-rc7): > > >> >>> > > >> >>> 0cc3b0ec23ce Clarify (and fix) MAX_LFS_FILESIZE macros > > >> >>> > > >> >>>should be backported to 4.4.138-rc2 and fixes the issue. > > >> >> > > >> >> Really? That commit says it fixes c2a9737f45e2 ("vfs,mm: fix a dead > > >> >> loop in truncate_inode_pages_range()") which is not in 4.4.y at all. > > >> >> > > >> >> Did you test this out? > > >> > > > >> > Yes, the LTP contains the tests (last comment is the final test for > > >> > arm32, right before Jan tests i686). > > >> > > > >> > Fixing MAX_LFS_FILESIZE fixes the new limit for mmap() brought by > > >> > those 2 commits (file_mmap_size_max()). > > >> > offset tested by the LTP test is 0xfffe000. > > >> > file_mmap_size_max gives: 0x000 as max value, but only after > > >> > the mentioned patch. > > >> > > > >> > Original intent for this fix was other though. > > >> > > >> To clarify this a bit further. > > >> > > >> The LTP CVE test is breaking in the first call to mmap(), even before > > >> trying to remap and test the security issue. That start happening in > > >> this round because of those mmap() changes and the offset used in the > > >> LTP test. Linus changed limit checks and made them to be related to > > >> MAX_LFS_FILESIZE. Unfortunately, in 4.4 stable, we were missing the > > >> fix for MAX_LFS_FILESIZE (which before commit 0cc3b0ec23ce was less > > >> than the REAL 32 bit limit). > > >> > > >> Commit 0cc3b0ec23ce was made because an user noticed the FS limit not > > >> being what it should be. In our case, the 4.4 stable kernel, we are > > >> facing this 32 bit lower limit (than the real 32 bit real limit), > > >> because of the LTP CVE test, so we need this fix to have the real 32 > > >> bit limit set for that macro (mmap limits did not use that macro > > >> before). > > >> > > >> I have tested in arm32 and Jan Stancek, who first responded to LTP > > >> issue, has tested this in i686 and both worked after that patch was > > >> included to v4.4-137-rc1 (my last test was even with 4.4.138-rc1). > > >> > > >> Hope that helps a bit. > > > > > > Ok, thanks, it didn't apply cleanly but I've fixed it up now. > > > > On the latest 4.4.138-rc1, > > LTP "cve-2011-2496" test still fails on arm32 beagleboard x15 and qemu_arm. > > > > Summary > > > > kernel: 4.4.138-rc1 > > git repo: > > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git > > git branch: linux-4.4.y > > git commit: 7d690c56754ef7be647fbcf7bcdceebd59926b3f > > git describe: v4.4.137-15-g7d690c56754e > > Test details: > > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.137-15-g7d690c56754e > > Ok, but what does this mean? Is there a commit somewhere that I need to > pick up for 4.4.y that is already in newer kernels? Hi Greg, I think the expectations was that: 0cc3b0ec23ce Clarify (and fix) MAX_LFS_FILESIZE macros has been included to linux-4.4.y HEAD, so they re-ran the tests. Report from Naresh above looks like original report: LTP vma03 is cve-2011-2496 test. Regards, Jan > > I have no idea what that cve is >, as I never track them, and it's > something that was reported to predate the 4.4 kernel release :) > > thanks, > > greg k-h > > -- > Mailing list info: https://lists.linux.it/listinfo/ltp >
Re: [LTP] [PATCH 4.4 00/24] 4.4.137-stable review
- Original Message - > On Thu, Jun 14, 2018 at 02:24:25PM +0530, Naresh Kamboju wrote: > > On 14 June 2018 at 12:04, Greg Kroah-Hartman > > wrote: > > > On Wed, Jun 13, 2018 at 10:48:50PM -0300, Rafael Tinoco wrote: > > >> On 13 June 2018 at 18:08, Rafael David Tinoco > > >> wrote: > > >> > On Wed, Jun 13, 2018 at 6:00 PM, Greg Kroah-Hartman > > >> > wrote: > > >> >> On Wed, Jun 13, 2018 at 05:47:49PM -0300, Rafael Tinoco wrote: > > >> >>> Results from Linaro’s test farm. > > >> >>> Regressions detected. > > >> >>> > > >> >>> NOTE: > > >> >>> > > >> >>> 1) LTP vma03 test (cve-2011-2496) broken on v4.4-137-rc1 because of: > > >> >>> > > >> >>> 6ea1dc96a03a mmap: relax file size limit for regular files > > >> >>> bd2f9ce5bacb mmap: introduce sane default mmap limits > > >> >>> > > >> >>>discussion: > > >> >>> > > >> >>> https://github.com/linux-test-project/ltp/issues/341 > > >> >>> > > >> >>>mainline commit (v4.13-rc7): > > >> >>> > > >> >>> 0cc3b0ec23ce Clarify (and fix) MAX_LFS_FILESIZE macros > > >> >>> > > >> >>>should be backported to 4.4.138-rc2 and fixes the issue. > > >> >> > > >> >> Really? That commit says it fixes c2a9737f45e2 ("vfs,mm: fix a dead > > >> >> loop in truncate_inode_pages_range()") which is not in 4.4.y at all. > > >> >> > > >> >> Did you test this out? > > >> > > > >> > Yes, the LTP contains the tests (last comment is the final test for > > >> > arm32, right before Jan tests i686). > > >> > > > >> > Fixing MAX_LFS_FILESIZE fixes the new limit for mmap() brought by > > >> > those 2 commits (file_mmap_size_max()). > > >> > offset tested by the LTP test is 0xfffe000. > > >> > file_mmap_size_max gives: 0x000 as max value, but only after > > >> > the mentioned patch. > > >> > > > >> > Original intent for this fix was other though. > > >> > > >> To clarify this a bit further. > > >> > > >> The LTP CVE test is breaking in the first call to mmap(), even before > > >> trying to remap and test the security issue. That start happening in > > >> this round because of those mmap() changes and the offset used in the > > >> LTP test. Linus changed limit checks and made them to be related to > > >> MAX_LFS_FILESIZE. Unfortunately, in 4.4 stable, we were missing the > > >> fix for MAX_LFS_FILESIZE (which before commit 0cc3b0ec23ce was less > > >> than the REAL 32 bit limit). > > >> > > >> Commit 0cc3b0ec23ce was made because an user noticed the FS limit not > > >> being what it should be. In our case, the 4.4 stable kernel, we are > > >> facing this 32 bit lower limit (than the real 32 bit real limit), > > >> because of the LTP CVE test, so we need this fix to have the real 32 > > >> bit limit set for that macro (mmap limits did not use that macro > > >> before). > > >> > > >> I have tested in arm32 and Jan Stancek, who first responded to LTP > > >> issue, has tested this in i686 and both worked after that patch was > > >> included to v4.4-137-rc1 (my last test was even with 4.4.138-rc1). > > >> > > >> Hope that helps a bit. > > > > > > Ok, thanks, it didn't apply cleanly but I've fixed it up now. > > > > On the latest 4.4.138-rc1, > > LTP "cve-2011-2496" test still fails on arm32 beagleboard x15 and qemu_arm. > > > > Summary > > > > kernel: 4.4.138-rc1 > > git repo: > > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git > > git branch: linux-4.4.y > > git commit: 7d690c56754ef7be647fbcf7bcdceebd59926b3f > > git describe: v4.4.137-15-g7d690c56754e > > Test details: > > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.137-15-g7d690c56754e > > Ok, but what does this mean? Is there a commit somewhere that I need to > pick up for 4.4.y that is already in newer kernels? Hi Greg, I think the expectations was that: 0cc3b0ec23ce Clarify (and fix) MAX_LFS_FILESIZE macros has been included to linux-4.4.y HEAD, so they re-ran the tests. Report from Naresh above looks like original report: LTP vma03 is cve-2011-2496 test. Regards, Jan > > I have no idea what that cve is >, as I never track them, and it's > something that was reported to predate the 4.4 kernel release :) > > thanks, > > greg k-h > > -- > Mailing list info: https://lists.linux.it/listinfo/ltp >
[PATCH] virtio_balloon: fix increment of vb->num_pfns in fill_balloon()
commit c7cdff0e8647 ("virtio_balloon: fix deadlock on OOM") changed code to increment vb->num_pfns before call to set_page_pfns(), which used to happen only after. This patch fixes boot hang for me on ppc64le KVM guests. Fixes: c7cdff0e8647 ("virtio_balloon: fix deadlock on OOM") Cc: Michael S. Tsirkin <m...@redhat.com> Cc: Tetsuo Handa <penguin-ker...@i-love.sakura.ne.jp> Cc: Michal Hocko <mho...@suse.com> Cc: Wei Wang <wei.w.w...@intel.com> Signed-off-by: Jan Stancek <jstan...@redhat.com> --- drivers/virtio/virtio_balloon.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 7960746f7597..a1fb52cb3f0a 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -174,13 +174,12 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num) while ((page = balloon_page_pop())) { balloon_page_enqueue(>vb_dev_info, page); - vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE; - set_page_pfns(vb, vb->pfns + vb->num_pfns, page); vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE; if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) adjust_managed_page_count(page, -1); + vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE; } num_allocated_pages = vb->num_pfns; -- 1.8.3.1
[PATCH] virtio_balloon: fix increment of vb->num_pfns in fill_balloon()
commit c7cdff0e8647 ("virtio_balloon: fix deadlock on OOM") changed code to increment vb->num_pfns before call to set_page_pfns(), which used to happen only after. This patch fixes boot hang for me on ppc64le KVM guests. Fixes: c7cdff0e8647 ("virtio_balloon: fix deadlock on OOM") Cc: Michael S. Tsirkin Cc: Tetsuo Handa Cc: Michal Hocko Cc: Wei Wang Signed-off-by: Jan Stancek --- drivers/virtio/virtio_balloon.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 7960746f7597..a1fb52cb3f0a 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -174,13 +174,12 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num) while ((page = balloon_page_pop())) { balloon_page_enqueue(>vb_dev_info, page); - vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE; - set_page_pfns(vb, vb->pfns + vb->num_pfns, page); vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE; if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) adjust_managed_page_count(page, -1); + vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE; } num_allocated_pages = vb->num_pfns; -- 1.8.3.1
Re: [Patch V3] crypto: x86/sha1 : Fix reads beyond the number of blocks passed
- Original Message - > On Wed, 2017-08-02 at 10:13 -0700, Megha Dey wrote: > > It was reported that the sha1 AVX2 function(sha1_transform_avx2) is > > reading ahead beyond its intended data, and causing a crash if the next > > block is beyond page boundary: > > http://marc.info/?l=linux-crypto-vger=149373371023377 > > > > This patch makes sure that there is no overflow for any buffer length. > > > > It passes the tests written by Jan Stancek that revealed this problem: > > https://github.com/jstancek/sha1-avx2-crash > > Hi Jan, > > Is it ok to add your Tested-by? Yes, v3 patch is exactly the diff I was testing. Regards, Jan
Re: [Patch V3] crypto: x86/sha1 : Fix reads beyond the number of blocks passed
- Original Message - > On Wed, 2017-08-02 at 10:13 -0700, Megha Dey wrote: > > It was reported that the sha1 AVX2 function(sha1_transform_avx2) is > > reading ahead beyond its intended data, and causing a crash if the next > > block is beyond page boundary: > > http://marc.info/?l=linux-crypto-vger=149373371023377 > > > > This patch makes sure that there is no overflow for any buffer length. > > > > It passes the tests written by Jan Stancek that revealed this problem: > > https://github.com/jstancek/sha1-avx2-crash > > Hi Jan, > > Is it ok to add your Tested-by? Yes, v3 patch is exactly the diff I was testing. Regards, Jan
Re: [Patch V2] crypto: x86/sha1 : Fix reads beyond the number of blocks passed
On 08/02/2017 02:38 AM, Megha Dey wrote: > It was reported that the sha1 AVX2 function(sha1_transform_avx2) is > reading ahead beyond its intended data, and causing a crash if the next > block is beyond page boundary: > http://marc.info/?l=linux-crypto-vger=149373371023377 > > This patch makes sure that there is no overflow for any buffer length. > > It passes the tests written by Jan Stancek that revealed this problem: > https://github.com/jstancek/sha1-avx2-crash > > Jan, can you verify this fix? Hi, Looks good, my tests (below) PASS-ed. I updated reproducer at [1] to try more than just single scenario. It now tries thousands of combinations with different starting offset within page and sizes of data chunks. (without patch) v4.13-rc3-102-g26c5ceb with avx2 enabled: [ 106.455175] sha_test module loaded [ 106.460458] data is at 0x8c88e58f8000, page_after_data: 0x8c88e58fa000 [ 106.468625] sha_test testing hash: sha1-generic, maxlen: 8192 [ 127.091237] sha_test hash: sha1-generic calculated 1764032 hashes [ 127.098038] sha_test testing hash: sha1-ni, maxlen: 8192 [ 127.108805] failed to alloc sha1-ni [ 127.112703] sha_test testing hash: sha1-avx, maxlen: 8192 [ 141.166611] sha_test hash: sha1-avx calculated 1764032 hashes [ 141.173023] sha_test testing hash: sha1-avx2, maxlen: 8192 [ 141.180046] BUG: unable to handle kernel paging request at 8c88e58fa000 [ 141.187817] IP: _begin+0x28/0x187 [ 141.191512] PGD 89c578067 [ 141.191513] P4D 89c578067 [ 141.194529] PUD c61f0a063 [ 141.197545] PMD c64cb1063 [ 141.200561] PTE 800c658fa062 [ 141.203577] [ 141.208832] Oops: [#1] SMP ... With patch "[Patch V2] crypto: x86/sha1 : Fix reads beyond the number of blocks passed": [ 406.323781] sha_test module loaded [ 406.329062] data is at 0x93ab97108000, page_after_data: 0x93ab9710a000 [ 406.337185] sha_test testing hash: sha1-generic, maxlen: 8192 [ 426.157242] sha_test hash: sha1-generic calculated 1764032 hashes [ 426.164043] sha_test testing hash: sha1-ni, maxlen: 8192 [ 426.174244] failed to alloc sha1-ni [ 426.178139] sha_test testing hash: sha1-avx, maxlen: 8192 [ 440.226240] sha_test hash: sha1-avx calculated 1764032 hashes [ 440.232651] sha_test testing hash: sha1-avx2, maxlen: 8192 [ 452.497988] sha_test hash: sha1-avx2 calculated 1764032 hashes [ 452.504495] sha_test testing hash: sha1-ssse3, maxlen: 8192 [ 467.319316] sha_test hash: sha1-ssse3 calculated 1764032 hashes [ 467.325922] sha_test done [ 471.827083] sha_test module unloaded I also ran a test at [2], which is comparing hashes produced by kernel with hashes produced by user-space utilities (e.g. 'sha1sum'). This test also PASS-ed. Regards, Jan [1] https://github.com/jstancek/sha1-avx2-crash [2] https://github.com/jstancek/kernel_sha_test/tree/sha1-avx2-4.13 > Herbert, can you re-enable sha1-avx2 once Jan has checked it out and > revert commit b82ce24426a4071da9529d726057e4e642948667 ? > > Originally-by: Ilya Albrekht <ilya.albre...@intel.com> > Signed-off-by: Megha Dey <megha@linux.intel.com> > Reported-by: Jan Stancek <jstan...@redhat.com>
Re: [Patch V2] crypto: x86/sha1 : Fix reads beyond the number of blocks passed
On 08/02/2017 02:38 AM, Megha Dey wrote: > It was reported that the sha1 AVX2 function(sha1_transform_avx2) is > reading ahead beyond its intended data, and causing a crash if the next > block is beyond page boundary: > http://marc.info/?l=linux-crypto-vger=149373371023377 > > This patch makes sure that there is no overflow for any buffer length. > > It passes the tests written by Jan Stancek that revealed this problem: > https://github.com/jstancek/sha1-avx2-crash > > Jan, can you verify this fix? Hi, Looks good, my tests (below) PASS-ed. I updated reproducer at [1] to try more than just single scenario. It now tries thousands of combinations with different starting offset within page and sizes of data chunks. (without patch) v4.13-rc3-102-g26c5ceb with avx2 enabled: [ 106.455175] sha_test module loaded [ 106.460458] data is at 0x8c88e58f8000, page_after_data: 0x8c88e58fa000 [ 106.468625] sha_test testing hash: sha1-generic, maxlen: 8192 [ 127.091237] sha_test hash: sha1-generic calculated 1764032 hashes [ 127.098038] sha_test testing hash: sha1-ni, maxlen: 8192 [ 127.108805] failed to alloc sha1-ni [ 127.112703] sha_test testing hash: sha1-avx, maxlen: 8192 [ 141.166611] sha_test hash: sha1-avx calculated 1764032 hashes [ 141.173023] sha_test testing hash: sha1-avx2, maxlen: 8192 [ 141.180046] BUG: unable to handle kernel paging request at 8c88e58fa000 [ 141.187817] IP: _begin+0x28/0x187 [ 141.191512] PGD 89c578067 [ 141.191513] P4D 89c578067 [ 141.194529] PUD c61f0a063 [ 141.197545] PMD c64cb1063 [ 141.200561] PTE 800c658fa062 [ 141.203577] [ 141.208832] Oops: [#1] SMP ... With patch "[Patch V2] crypto: x86/sha1 : Fix reads beyond the number of blocks passed": [ 406.323781] sha_test module loaded [ 406.329062] data is at 0x93ab97108000, page_after_data: 0x93ab9710a000 [ 406.337185] sha_test testing hash: sha1-generic, maxlen: 8192 [ 426.157242] sha_test hash: sha1-generic calculated 1764032 hashes [ 426.164043] sha_test testing hash: sha1-ni, maxlen: 8192 [ 426.174244] failed to alloc sha1-ni [ 426.178139] sha_test testing hash: sha1-avx, maxlen: 8192 [ 440.226240] sha_test hash: sha1-avx calculated 1764032 hashes [ 440.232651] sha_test testing hash: sha1-avx2, maxlen: 8192 [ 452.497988] sha_test hash: sha1-avx2 calculated 1764032 hashes [ 452.504495] sha_test testing hash: sha1-ssse3, maxlen: 8192 [ 467.319316] sha_test hash: sha1-ssse3 calculated 1764032 hashes [ 467.325922] sha_test done [ 471.827083] sha_test module unloaded I also ran a test at [2], which is comparing hashes produced by kernel with hashes produced by user-space utilities (e.g. 'sha1sum'). This test also PASS-ed. Regards, Jan [1] https://github.com/jstancek/sha1-avx2-crash [2] https://github.com/jstancek/kernel_sha_test/tree/sha1-avx2-4.13 > Herbert, can you re-enable sha1-avx2 once Jan has checked it out and > revert commit b82ce24426a4071da9529d726057e4e642948667 ? > > Originally-by: Ilya Albrekht > Signed-off-by: Megha Dey > Reported-by: Jan Stancek
Re: [PATCH 0/2] key payload access with just rcu_read_lock()
- Original Message - > From: "David Howells" <dhowe...@redhat.com> > To: "Jan Stancek" <jstan...@redhat.com> > Cc: dhowe...@redhat.com, linux-kernel@vger.kernel.org, > linux-...@vger.kernel.org, bcodd...@redhat.com, > asav...@redhat.com > Sent: Wednesday, 1 March, 2017 10:40:13 AM > Subject: Re: [PATCH 0/2] key payload access with just rcu_read_lock() > > Jan Stancek <jstan...@redhat.com> wrote: > > > That problem didn't show up with my NFS based reproducer. > > I re-run it again with latest version of your patch, plus also > > keyutils testsuite. Both completed OK for me, dmesg looks clean. > > Can I put you down as a Tested-by? Yes. Thanks for having a look at this. Regards, Jan
Re: [PATCH 0/2] key payload access with just rcu_read_lock()
- Original Message - > From: "David Howells" > To: "Jan Stancek" > Cc: dhowe...@redhat.com, linux-kernel@vger.kernel.org, > linux-...@vger.kernel.org, bcodd...@redhat.com, > asav...@redhat.com > Sent: Wednesday, 1 March, 2017 10:40:13 AM > Subject: Re: [PATCH 0/2] key payload access with just rcu_read_lock() > > Jan Stancek wrote: > > > That problem didn't show up with my NFS based reproducer. > > I re-run it again with latest version of your patch, plus also > > keyutils testsuite. Both completed OK for me, dmesg looks clean. > > Can I put you down as a Tested-by? Yes. Thanks for having a look at this. Regards, Jan
Re: [PATCH 0/2] key payload access with just rcu_read_lock()
- Original Message - > Here's an updated patch with fixed user_key_payload_locked() and user_read(). > That problem didn't show up with my NFS based reproducer. I re-run it again with latest version of your patch, plus also keyutils testsuite. Both completed OK for me, dmesg looks clean. Regards, Jan
Re: [PATCH 0/2] key payload access with just rcu_read_lock()
- Original Message - > Here's an updated patch with fixed user_key_payload_locked() and user_read(). > That problem didn't show up with my NFS based reproducer. I re-run it again with latest version of your patch, plus also keyutils testsuite. Both completed OK for me, dmesg looks clean. Regards, Jan
Re: [PATCH 0/2] key payload access with just rcu_read_lock()
- Original Message - > Jan Stancek <jstan...@redhat.com> wrote: > > > Looks like there are still couple users that need updating, > > I'm hitting following compilation error: > > Aargh - I remembered to grep for rcu_dereference_key() but not > user_key_payload(). > > How about the attached? v2 builds OK (on top of 4.10), "suspicious RCU usage" warning is gone and NFS connectathon is PASS-ing cleanly. Regards, Jan
Re: [PATCH 0/2] key payload access with just rcu_read_lock()
- Original Message - > Jan Stancek wrote: > > > Looks like there are still couple users that need updating, > > I'm hitting following compilation error: > > Aargh - I remembered to grep for rcu_dereference_key() but not > user_key_payload(). > > How about the attached? v2 builds OK (on top of 4.10), "suspicious RCU usage" warning is gone and NFS connectathon is PASS-ing cleanly. Regards, Jan
Re: [PATCH 0/2] key payload access with just rcu_read_lock()
- Original Message - > From: "David Howells" <dhowe...@redhat.com> > To: "Jan Stancek" <jstan...@redhat.com> > Cc: dhowe...@redhat.com, linux-kernel@vger.kernel.org, > linux-...@vger.kernel.org, bcodd...@redhat.com, > asav...@redhat.com > Sent: Monday, 27 February, 2017 11:04:21 PM > Subject: Re: [PATCH 0/2] key payload access with just rcu_read_lock() > > Jan Stancek <jstan...@redhat.com> wrote: > > > this is a follow-up for "suspicious RCU usage" warning described > > in these 2 linux-nfs threads: > > http://marc.info/?t=14755883033=1=2 > > http://marc.info/?t=14877677051=1=2 > > > > Did you have something like in mind? > > How about the attached? It's similar to what you did, but I made the split > functions different from the original so that all users have to reconsider > what they actually want. > > David Looks like there are still couple users that need updating, I'm hitting following compilation error: ... CC [M] fs/cifs/connect.o fs/cifs/connect.c: In function ‘cifs_set_cifscreds’: fs/cifs/connect.c:2442:2: error: implicit declaration of function ‘user_key_payload’ [-Werror=implicit-function-declaration] upayload = user_key_payload(key); ^ fs/cifs/connect.c:2442:11: error: assignment makes pointer from integer without a cast [-Werror] upayload = user_key_payload(key); ^ # grep "user_key_payload(" -r . --include=*.c ./drivers/md/dm-crypt.c:ukp = user_key_payload(key); ./fs/cifs/connect.c:upayload = user_key_payload(key); ./fs/crypto/keyinfo.c: ukp = user_key_payload(keyring_key); ./lib/digsig.c: ukp = user_key_payload(key); # grep "user_key_payload(" -r . --include=*.h ./fs/ecryptfs/ecryptfs_kernel.h:return (struct ecryptfs_auth_tok *)user_key_payload(key)->data; Regards, Jan
Re: [PATCH 0/2] key payload access with just rcu_read_lock()
- Original Message - > From: "David Howells" > To: "Jan Stancek" > Cc: dhowe...@redhat.com, linux-kernel@vger.kernel.org, > linux-...@vger.kernel.org, bcodd...@redhat.com, > asav...@redhat.com > Sent: Monday, 27 February, 2017 11:04:21 PM > Subject: Re: [PATCH 0/2] key payload access with just rcu_read_lock() > > Jan Stancek wrote: > > > this is a follow-up for "suspicious RCU usage" warning described > > in these 2 linux-nfs threads: > > http://marc.info/?t=14755883033=1=2 > > http://marc.info/?t=14877677051=1=2 > > > > Did you have something like in mind? > > How about the attached? It's similar to what you did, but I made the split > functions different from the original so that all users have to reconsider > what they actually want. > > David Looks like there are still couple users that need updating, I'm hitting following compilation error: ... CC [M] fs/cifs/connect.o fs/cifs/connect.c: In function ‘cifs_set_cifscreds’: fs/cifs/connect.c:2442:2: error: implicit declaration of function ‘user_key_payload’ [-Werror=implicit-function-declaration] upayload = user_key_payload(key); ^ fs/cifs/connect.c:2442:11: error: assignment makes pointer from integer without a cast [-Werror] upayload = user_key_payload(key); ^ # grep "user_key_payload(" -r . --include=*.c ./drivers/md/dm-crypt.c:ukp = user_key_payload(key); ./fs/cifs/connect.c:upayload = user_key_payload(key); ./fs/crypto/keyinfo.c: ukp = user_key_payload(keyring_key); ./lib/digsig.c: ukp = user_key_payload(key); # grep "user_key_payload(" -r . --include=*.h ./fs/ecryptfs/ecryptfs_kernel.h:return (struct ecryptfs_auth_tok *)user_key_payload(key)->data; Regards, Jan
[PATCH 1/2] KEYS: add user_key_payload_rcu()
user_key_payload() is wrapper for rcu_dereference_protected(), and can't be used with just rcu_read_lock() held. This patch adds user_key_payload_rcu() for accessing key payload in RCU read-side section, without the need to hold key semaphore. Signed-off-by: Jan Stancek <jstan...@redhat.com> --- Documentation/security/keys.txt | 5 + include/keys/user-type.h| 5 + include/linux/key.h | 3 +++ 3 files changed, 13 insertions(+) diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt index 3849814bfe6d..da89a854edd2 100644 --- a/Documentation/security/keys.txt +++ b/Documentation/security/keys.txt @@ -1152,8 +1152,13 @@ access the data: wrap the RCU calls to this element: rcu_assign_keypointer(struct key *key, void *data); + + /* access payload with semaphore held */ void *rcu_dereference_key(struct key *key); + /* access payload in RCU read-side section*/ + void *rcu_read_dereference_key(struct key *key); + === DEFINING A KEY TYPE diff --git a/include/keys/user-type.h b/include/keys/user-type.h index c56fef40f53e..521bf3369904 100644 --- a/include/keys/user-type.h +++ b/include/keys/user-type.h @@ -53,6 +53,11 @@ static inline const struct user_key_payload *user_key_payload(const struct key * return (struct user_key_payload *)rcu_dereference_key(key); } +static inline const struct user_key_payload *user_key_payload_rcu(const struct key *key) +{ + return (struct user_key_payload *)rcu_read_dereference_key(key); +} + #endif /* CONFIG_KEYS */ #endif /* _KEYS_USER_TYPE_H */ diff --git a/include/linux/key.h b/include/linux/key.h index 722914798f37..b6a8c5896761 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -358,6 +358,9 @@ static inline bool key_is_instantiated(const struct key *key) (rcu_dereference_protected((KEY)->payload.rcu_data0,\ rwsem_is_locked(&((struct key *)(KEY))->sem))) +#define rcu_read_dereference_key(KEY) \ + (rcu_dereference((KEY)->payload.rcu_data0)) + #define rcu_assign_keypointer(KEY, PAYLOAD)\ do { \ rcu_assign_pointer((KEY)->payload.rcu_data0, (PAYLOAD));\ -- 1.8.3.1
[PATCH 1/2] KEYS: add user_key_payload_rcu()
user_key_payload() is wrapper for rcu_dereference_protected(), and can't be used with just rcu_read_lock() held. This patch adds user_key_payload_rcu() for accessing key payload in RCU read-side section, without the need to hold key semaphore. Signed-off-by: Jan Stancek --- Documentation/security/keys.txt | 5 + include/keys/user-type.h| 5 + include/linux/key.h | 3 +++ 3 files changed, 13 insertions(+) diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt index 3849814bfe6d..da89a854edd2 100644 --- a/Documentation/security/keys.txt +++ b/Documentation/security/keys.txt @@ -1152,8 +1152,13 @@ access the data: wrap the RCU calls to this element: rcu_assign_keypointer(struct key *key, void *data); + + /* access payload with semaphore held */ void *rcu_dereference_key(struct key *key); + /* access payload in RCU read-side section*/ + void *rcu_read_dereference_key(struct key *key); + === DEFINING A KEY TYPE diff --git a/include/keys/user-type.h b/include/keys/user-type.h index c56fef40f53e..521bf3369904 100644 --- a/include/keys/user-type.h +++ b/include/keys/user-type.h @@ -53,6 +53,11 @@ static inline const struct user_key_payload *user_key_payload(const struct key * return (struct user_key_payload *)rcu_dereference_key(key); } +static inline const struct user_key_payload *user_key_payload_rcu(const struct key *key) +{ + return (struct user_key_payload *)rcu_read_dereference_key(key); +} + #endif /* CONFIG_KEYS */ #endif /* _KEYS_USER_TYPE_H */ diff --git a/include/linux/key.h b/include/linux/key.h index 722914798f37..b6a8c5896761 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -358,6 +358,9 @@ static inline bool key_is_instantiated(const struct key *key) (rcu_dereference_protected((KEY)->payload.rcu_data0,\ rwsem_is_locked(&((struct key *)(KEY))->sem))) +#define rcu_read_dereference_key(KEY) \ + (rcu_dereference((KEY)->payload.rcu_data0)) + #define rcu_assign_keypointer(KEY, PAYLOAD)\ do { \ rcu_assign_pointer((KEY)->payload.rcu_data0, (PAYLOAD));\ -- 1.8.3.1
[PATCH 2/2] NFS: use user_key_payload_rcu() in RCU read-side section
Use RCU read-side accessor for key payload, as calling user_key_payload() without semaphore held leads to following warning: === [ INFO: suspicious RCU usage. ] 4.10.0 #1 Tainted: GW --- ./include/keys/user-type.h:53 suspicious rcu_dereference_protected() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 0 1 lock held by mount.nfs/5987: #0: (rcu_read_lock){..}, at: [] nfs_idmap_get_key+0x15c/0x420 [nfsv4] stack backtrace: CPU: 1 PID: 5987 Comm: mount.nfs Tainted: GW 4.10.0 #1 Call Trace: dump_stack+0xe8/0x154 (unreliable) lockdep_rcu_suspicious+0x140/0x190 nfs_idmap_get_key+0x380/0x420 [nfsv4] nfs_map_name_to_uid+0x2a0/0x3b0 [nfsv4] decode_getfattr_attrs+0xfac/0x16b0 [nfsv4] decode_getfattr_generic.constprop.106+0xbc/0x150 [nfsv4] nfs4_xdr_dec_lookup_root+0xac/0xb0 [nfsv4] rpcauth_unwrap_resp+0xe8/0x140 [sunrpc] call_decode+0x29c/0x910 [sunrpc] __rpc_execute+0x140/0x8f0 [sunrpc] rpc_run_task+0x170/0x200 [sunrpc] nfs4_call_sync_sequence+0x68/0xa0 [nfsv4] _nfs4_lookup_root.isra.44+0xd0/0xf0 [nfsv4] nfs4_lookup_root+0xe0/0x350 [nfsv4] nfs4_lookup_root_sec+0x70/0xa0 [nfsv4] nfs4_find_root_sec+0xc4/0x100 [nfsv4] nfs4_proc_get_rootfh+0x5c/0xf0 [nfsv4] nfs4_get_rootfh+0x6c/0x190 [nfsv4] nfs4_server_common_setup+0xc4/0x260 [nfsv4] nfs4_create_server+0x278/0x3c0 [nfsv4] nfs4_remote_mount+0x50/0xb0 [nfsv4] mount_fs+0x74/0x210 vfs_kern_mount+0x78/0x220 nfs_do_root_mount+0xb0/0x140 [nfsv4] nfs4_try_mount+0x60/0x100 [nfsv4] nfs_fs_mount+0x5ec/0xda0 [nfs] mount_fs+0x74/0x210 vfs_kern_mount+0x78/0x220 do_mount+0x254/0xf70 SyS_mount+0x94/0x100 system_call+0x38/0xe0 Signed-off-by: Jan Stancek <jstan...@redhat.com> --- fs/nfs/nfs4idmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index c444285bb1b1..835c163f61af 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -316,7 +316,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, if (ret < 0) goto out_up; - payload = user_key_payload(rkey); + payload = user_key_payload_rcu(rkey); if (IS_ERR_OR_NULL(payload)) { ret = PTR_ERR(payload); goto out_up; -- 1.8.3.1
[PATCH 2/2] NFS: use user_key_payload_rcu() in RCU read-side section
Use RCU read-side accessor for key payload, as calling user_key_payload() without semaphore held leads to following warning: === [ INFO: suspicious RCU usage. ] 4.10.0 #1 Tainted: GW --- ./include/keys/user-type.h:53 suspicious rcu_dereference_protected() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 0 1 lock held by mount.nfs/5987: #0: (rcu_read_lock){..}, at: [] nfs_idmap_get_key+0x15c/0x420 [nfsv4] stack backtrace: CPU: 1 PID: 5987 Comm: mount.nfs Tainted: GW 4.10.0 #1 Call Trace: dump_stack+0xe8/0x154 (unreliable) lockdep_rcu_suspicious+0x140/0x190 nfs_idmap_get_key+0x380/0x420 [nfsv4] nfs_map_name_to_uid+0x2a0/0x3b0 [nfsv4] decode_getfattr_attrs+0xfac/0x16b0 [nfsv4] decode_getfattr_generic.constprop.106+0xbc/0x150 [nfsv4] nfs4_xdr_dec_lookup_root+0xac/0xb0 [nfsv4] rpcauth_unwrap_resp+0xe8/0x140 [sunrpc] call_decode+0x29c/0x910 [sunrpc] __rpc_execute+0x140/0x8f0 [sunrpc] rpc_run_task+0x170/0x200 [sunrpc] nfs4_call_sync_sequence+0x68/0xa0 [nfsv4] _nfs4_lookup_root.isra.44+0xd0/0xf0 [nfsv4] nfs4_lookup_root+0xe0/0x350 [nfsv4] nfs4_lookup_root_sec+0x70/0xa0 [nfsv4] nfs4_find_root_sec+0xc4/0x100 [nfsv4] nfs4_proc_get_rootfh+0x5c/0xf0 [nfsv4] nfs4_get_rootfh+0x6c/0x190 [nfsv4] nfs4_server_common_setup+0xc4/0x260 [nfsv4] nfs4_create_server+0x278/0x3c0 [nfsv4] nfs4_remote_mount+0x50/0xb0 [nfsv4] mount_fs+0x74/0x210 vfs_kern_mount+0x78/0x220 nfs_do_root_mount+0xb0/0x140 [nfsv4] nfs4_try_mount+0x60/0x100 [nfsv4] nfs_fs_mount+0x5ec/0xda0 [nfs] mount_fs+0x74/0x210 vfs_kern_mount+0x78/0x220 do_mount+0x254/0xf70 SyS_mount+0x94/0x100 system_call+0x38/0xe0 Signed-off-by: Jan Stancek --- fs/nfs/nfs4idmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index c444285bb1b1..835c163f61af 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -316,7 +316,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, if (ret < 0) goto out_up; - payload = user_key_payload(rkey); + payload = user_key_payload_rcu(rkey); if (IS_ERR_OR_NULL(payload)) { ret = PTR_ERR(payload); goto out_up; -- 1.8.3.1
[PATCH 0/2] key payload access with just rcu_read_lock()
Hi David, this is a follow-up for "suspicious RCU usage" warning described in these 2 linux-nfs threads: http://marc.info/?t=14755883033=1=2 http://marc.info/?t=14877677051=1=2 Did you have something like in mind? Thanks, Jan Jan Stancek (2): KEYS: add user_key_payload_rcu() NFS: use user_key_payload_rcu() in RCU read-side section Documentation/security/keys.txt | 5 + fs/nfs/nfs4idmap.c | 2 +- include/keys/user-type.h| 5 + include/linux/key.h | 3 +++ 4 files changed, 14 insertions(+), 1 deletion(-) -- 1.8.3.1
[PATCH 0/2] key payload access with just rcu_read_lock()
Hi David, this is a follow-up for "suspicious RCU usage" warning described in these 2 linux-nfs threads: http://marc.info/?t=14755883033=1=2 http://marc.info/?t=14877677051=1=2 Did you have something like in mind? Thanks, Jan Jan Stancek (2): KEYS: add user_key_payload_rcu() NFS: use user_key_payload_rcu() in RCU read-side section Documentation/security/keys.txt | 5 + fs/nfs/nfs4idmap.c | 2 +- include/keys/user-type.h| 5 + include/linux/key.h | 3 +++ 4 files changed, 14 insertions(+), 1 deletion(-) -- 1.8.3.1
[tip:perf/urgent] perf header: Make build_cpu_topology skip offline/absent CPUs
Commit-ID: 43db2843a4a41cc8cdb6ab696639aeee1f4d5062 Gitweb: http://git.kernel.org/tip/43db2843a4a41cc8cdb6ab696639aeee1f4d5062 Author: Jan Stancek <jstan...@redhat.com> AuthorDate: Fri, 17 Feb 2017 12:10:25 +0100 Committer: Arnaldo Carvalho de Melo <a...@redhat.com> CommitDate: Fri, 17 Feb 2017 12:37:04 -0300 perf header: Make build_cpu_topology skip offline/absent CPUs When build_cpu_topo() encounters offline/absent CPUs, it fails to find any sysfs entries and returns failure. This leads to build_cpu_topology() and write_cpu_topology() failing as well. Because HEADER_CPU_TOPOLOGY has not been written, read leaves cpu_topology_map NULL and we get NULL ptr deref at: ... cmd_test __cmd_test test_and_print run_test test_session_topology check_cpu_topology 36: Session topology : --- start --- test child forked, pid 14902 templ file: /tmp/perf-test-4CKocW failed to write feature HEADER_CPU_TOPOLOGY perf: Segmentation fault Obtained 9 stack frames. ./perf(sighandler_dump_stack+0x41) [0x5095f1] /lib64/libc.so.6(+0x35250) [0x7f4b7c3c9250] ./perf(test_session_topology+0x1db) [0x490ceb] ./perf() [0x475b68] ./perf(cmd_test+0x5b9) [0x4763c9] ./perf() [0x4945a3] ./perf(main+0x69f) [0x427e8f] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f4b7c3b5b35] ./perf() [0x427fb9] test child interrupted end Session topology: FAILED! This patch makes build_cpu_topology() skip offline/absent CPUs, by checking their presence against cpu_map built from online CPUs. Signed-off-by: Jan Stancek <jstan...@redhat.com> Acked-by: Jiri Olsa <jo...@kernel.org> Cc: Alexander Shishkin <alexander.shish...@linux.intel.com> Cc: Masami Hiramatsu <mhira...@kernel.org> Cc: Peter Zijlstra <pet...@infradead.org> Link: http://lkml.kernel.org/r/a271b770175524f4961d4903af33798358a4a518.1487146877.git.jstan...@redhat.com Signed-off-by: Arnaldo Carvalho de Melo <a...@redhat.com> --- tools/perf/util/header.c | 19 --- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 3d12c16..1222f6c 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -505,24 +505,31 @@ static void free_cpu_topo(struct cpu_topo *tp) static struct cpu_topo *build_cpu_topology(void) { - struct cpu_topo *tp; + struct cpu_topo *tp = NULL; void *addr; u32 nr, i; size_t sz; long ncpus; int ret = -1; + struct cpu_map *map; ncpus = sysconf(_SC_NPROCESSORS_CONF); if (ncpus < 0) return NULL; + /* build online CPU map */ + map = cpu_map__new(NULL); + if (map == NULL) { + pr_debug("failed to get system cpumap\n"); + return NULL; + } + nr = (u32)(ncpus & UINT_MAX); sz = nr * sizeof(char *); - addr = calloc(1, sizeof(*tp) + 2 * sz); if (!addr) - return NULL; + goto out_free; tp = addr; tp->cpu_nr = nr; @@ -532,10 +539,16 @@ static struct cpu_topo *build_cpu_topology(void) tp->thread_siblings = addr; for (i = 0; i < nr; i++) { + if (!cpu_map__has(map, i)) + continue; + ret = build_cpu_topo(tp, i); if (ret < 0) break; } + +out_free: + cpu_map__put(map); if (ret) { free_cpu_topo(tp); tp = NULL;
[tip:perf/urgent] perf cpumap: Add cpu__max_present_cpu()
Commit-ID: 92a7e1278005b6bb3459affc50b2b6e2464e7e7c Gitweb: http://git.kernel.org/tip/92a7e1278005b6bb3459affc50b2b6e2464e7e7c Author: Jan Stancek <jstan...@redhat.com> AuthorDate: Fri, 17 Feb 2017 12:10:24 +0100 Committer: Arnaldo Carvalho de Melo <a...@redhat.com> CommitDate: Fri, 17 Feb 2017 12:33:05 -0300 perf cpumap: Add cpu__max_present_cpu() Similar to cpu__max_cpu() (which returns the max possible CPU), returns the max present CPU. Signed-off-by: Jan Stancek <jstan...@redhat.com> Acked-by: Jiri Olsa <jo...@kernel.org> Cc: Alexander Shishkin <alexander.shish...@linux.intel.com> Cc: Masami Hiramatsu <mhira...@kernel.org> Cc: Peter Zijlstra <pet...@infradead.org> Link: http://lkml.kernel.org/r/8ea4601b5cacc49927235b4ebac424bd6eeccb06.1487146877.git.jstan...@redhat.com Signed-off-by: Arnaldo Carvalho de Melo <a...@redhat.com> --- tools/perf/util/cpumap.c | 22 ++ tools/perf/util/cpumap.h | 1 + 2 files changed, 23 insertions(+) diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 2c0b522..8c75049 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -9,6 +9,7 @@ #include "asm/bug.h" static int max_cpu_num; +static int max_present_cpu_num; static int max_node_num; static int *cpunode_map; @@ -442,6 +443,7 @@ static void set_max_cpu_num(void) /* set up default */ max_cpu_num = 4096; + max_present_cpu_num = 4096; mnt = sysfs__mountpoint(); if (!mnt) @@ -455,6 +457,17 @@ static void set_max_cpu_num(void) } ret = get_max_num(path, _cpu_num); + if (ret) + goto out; + + /* get the highest present cpu number for a sparse allocation */ + ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt); + if (ret == PATH_MAX) { + pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX); + goto out; + } + + ret = get_max_num(path, _present_cpu_num); out: if (ret) @@ -505,6 +518,15 @@ int cpu__max_cpu(void) return max_cpu_num; } +int cpu__max_present_cpu(void) +{ + if (unlikely(!max_present_cpu_num)) + set_max_cpu_num(); + + return max_present_cpu_num; +} + + int cpu__get_node(int cpu) { if (unlikely(cpunode_map == NULL)) { diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 06bd689..1a0549a 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -62,6 +62,7 @@ int cpu__setup_cpunode_map(void); int cpu__max_node(void); int cpu__max_cpu(void); +int cpu__max_present_cpu(void); int cpu__get_node(int cpu); int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
[tip:perf/urgent] perf tools: Replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map
Commit-ID: da8a58b56c661681f9b2fd2fa59c6da3a5bac8d1 Gitweb: http://git.kernel.org/tip/da8a58b56c661681f9b2fd2fa59c6da3a5bac8d1 Author: Jan Stancek <jstan...@redhat.com> AuthorDate: Fri, 17 Feb 2017 12:10:26 +0100 Committer: Arnaldo Carvalho de Melo <a...@redhat.com> CommitDate: Fri, 17 Feb 2017 12:56:35 -0300 perf tools: Replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs: 1. offline/absent CPUs will have their socket_id and core_id set to -1 which triggers: "socket_id number is too big.You may need to upgrade the perf tool." 2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above. Users of perf_env.cpu[] are using CPU id as index. This can lead to read beyond what was allocated: ==19991== Invalid read of size 4 ==19991==at 0x490CEB: check_cpu_topology (topology.c:69) ==19991==by 0x490CEB: test_session_topology (topology.c:106) ... For example: _SC_NPROCESSORS_CONF == 16 available: 2 nodes (0-1) node 0 cpus: 0 6 8 10 16 22 24 26 node 0 size: 12004 MB node 0 free: 9470 MB node 1 cpus: 1 7 9 11 23 25 27 node 1 size: 12093 MB node 1 free: 9406 MB node distances: node 0 1 0: 10 20 1: 20 10 This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF to max_present_cpu and updates any user of cpu_topology_map to iterate with nr_cpus_avail. As a consequence HEADER_CPU_TOPOLOGY core_id and socket_id lists get longer, but maintain compatibility with pre-patch state - index to cpu_topology_map is CPU id. perf test 36 -v 36: Session topology : --- start --- test child forked, pid 22211 templ file: /tmp/perf-test-gmdX5i CPU 0, core 0, socket 0 CPU 1, core 0, socket 1 CPU 6, core 10, socket 0 CPU 7, core 10, socket 1 CPU 8, core 1, socket 0 CPU 9, core 1, socket 1 CPU 10, core 9, socket 0 CPU 11, core 9, socket 1 CPU 16, core 0, socket 0 CPU 22, core 10, socket 0 CPU 23, core 10, socket 1 CPU 24, core 1, socket 0 CPU 25, core 1, socket 1 CPU 26, core 9, socket 0 CPU 27, core 9, socket 1 test child finished with 0 end Session topology: Ok Signed-off-by: Jan Stancek <jstan...@redhat.com> Cc: Alexander Shishkin <alexander.shish...@linux.intel.com> Cc: Jiri Olsa <jo...@kernel.org> Cc: Masami Hiramatsu <mhira...@kernel.org> Cc: Peter Zijlstra <pet...@infradead.org> Link: http://lkml.kernel.org/r/d7c05c6445fca74a8442c2c73cfffd349c52c44f.1487146877.git.jstan...@redhat.com Signed-off-by: Arnaldo Carvalho de Melo <a...@redhat.com> --- tools/perf/builtin-stat.c | 2 +- tools/perf/tests/topology.c | 4 +++- tools/perf/util/env.c | 2 +- tools/perf/util/header.c| 16 +--- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index f287191..ca27a8a 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, struct cpu_map *map, i cpu = map->map[idx]; - if (cpu >= env->nr_cpus_online) + if (cpu >= env->nr_cpus_avail) return -1; return cpu; diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 98fe69a..803f893 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map) session = perf_session__new(, false, NULL); TEST_ASSERT_VAL("can't get session", session); - for (i = 0; i < session->header.env.nr_cpus_online; i++) { + for (i = 0; i < session->header.env.nr_cpus_avail; i++) { + if (!cpu_map__has(map, i)) + continue; pr_debug("CPU %d, core %d, socket %d\n", i, session->header.env.cpu[i].core_id, session->header.env.cpu[i].socket_id); diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index bb964e8..075fc77 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env) return 0; if (env->nr_cpus_avail == 0) - env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF); + env->nr_cpus_avail = cpu__max_present_cpu(); nr_cpus = env->nr_cpus_avail; if (nr_cpus == -1) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 1222f6c..05714d5 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -295,11 +295,7 @@ static int write_nrcpus(int fd, struct perf_header *h __maybe_unused, u32 nrc, nra; int
[tip:perf/urgent] perf header: Make build_cpu_topology skip offline/absent CPUs
Commit-ID: 43db2843a4a41cc8cdb6ab696639aeee1f4d5062 Gitweb: http://git.kernel.org/tip/43db2843a4a41cc8cdb6ab696639aeee1f4d5062 Author: Jan Stancek AuthorDate: Fri, 17 Feb 2017 12:10:25 +0100 Committer: Arnaldo Carvalho de Melo CommitDate: Fri, 17 Feb 2017 12:37:04 -0300 perf header: Make build_cpu_topology skip offline/absent CPUs When build_cpu_topo() encounters offline/absent CPUs, it fails to find any sysfs entries and returns failure. This leads to build_cpu_topology() and write_cpu_topology() failing as well. Because HEADER_CPU_TOPOLOGY has not been written, read leaves cpu_topology_map NULL and we get NULL ptr deref at: ... cmd_test __cmd_test test_and_print run_test test_session_topology check_cpu_topology 36: Session topology : --- start --- test child forked, pid 14902 templ file: /tmp/perf-test-4CKocW failed to write feature HEADER_CPU_TOPOLOGY perf: Segmentation fault Obtained 9 stack frames. ./perf(sighandler_dump_stack+0x41) [0x5095f1] /lib64/libc.so.6(+0x35250) [0x7f4b7c3c9250] ./perf(test_session_topology+0x1db) [0x490ceb] ./perf() [0x475b68] ./perf(cmd_test+0x5b9) [0x4763c9] ./perf() [0x4945a3] ./perf(main+0x69f) [0x427e8f] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f4b7c3b5b35] ./perf() [0x427fb9] test child interrupted end Session topology: FAILED! This patch makes build_cpu_topology() skip offline/absent CPUs, by checking their presence against cpu_map built from online CPUs. Signed-off-by: Jan Stancek Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/a271b770175524f4961d4903af33798358a4a518.1487146877.git.jstan...@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 19 --- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 3d12c16..1222f6c 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -505,24 +505,31 @@ static void free_cpu_topo(struct cpu_topo *tp) static struct cpu_topo *build_cpu_topology(void) { - struct cpu_topo *tp; + struct cpu_topo *tp = NULL; void *addr; u32 nr, i; size_t sz; long ncpus; int ret = -1; + struct cpu_map *map; ncpus = sysconf(_SC_NPROCESSORS_CONF); if (ncpus < 0) return NULL; + /* build online CPU map */ + map = cpu_map__new(NULL); + if (map == NULL) { + pr_debug("failed to get system cpumap\n"); + return NULL; + } + nr = (u32)(ncpus & UINT_MAX); sz = nr * sizeof(char *); - addr = calloc(1, sizeof(*tp) + 2 * sz); if (!addr) - return NULL; + goto out_free; tp = addr; tp->cpu_nr = nr; @@ -532,10 +539,16 @@ static struct cpu_topo *build_cpu_topology(void) tp->thread_siblings = addr; for (i = 0; i < nr; i++) { + if (!cpu_map__has(map, i)) + continue; + ret = build_cpu_topo(tp, i); if (ret < 0) break; } + +out_free: + cpu_map__put(map); if (ret) { free_cpu_topo(tp); tp = NULL;
[tip:perf/urgent] perf cpumap: Add cpu__max_present_cpu()
Commit-ID: 92a7e1278005b6bb3459affc50b2b6e2464e7e7c Gitweb: http://git.kernel.org/tip/92a7e1278005b6bb3459affc50b2b6e2464e7e7c Author: Jan Stancek AuthorDate: Fri, 17 Feb 2017 12:10:24 +0100 Committer: Arnaldo Carvalho de Melo CommitDate: Fri, 17 Feb 2017 12:33:05 -0300 perf cpumap: Add cpu__max_present_cpu() Similar to cpu__max_cpu() (which returns the max possible CPU), returns the max present CPU. Signed-off-by: Jan Stancek Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/8ea4601b5cacc49927235b4ebac424bd6eeccb06.1487146877.git.jstan...@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cpumap.c | 22 ++ tools/perf/util/cpumap.h | 1 + 2 files changed, 23 insertions(+) diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 2c0b522..8c75049 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -9,6 +9,7 @@ #include "asm/bug.h" static int max_cpu_num; +static int max_present_cpu_num; static int max_node_num; static int *cpunode_map; @@ -442,6 +443,7 @@ static void set_max_cpu_num(void) /* set up default */ max_cpu_num = 4096; + max_present_cpu_num = 4096; mnt = sysfs__mountpoint(); if (!mnt) @@ -455,6 +457,17 @@ static void set_max_cpu_num(void) } ret = get_max_num(path, _cpu_num); + if (ret) + goto out; + + /* get the highest present cpu number for a sparse allocation */ + ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt); + if (ret == PATH_MAX) { + pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX); + goto out; + } + + ret = get_max_num(path, _present_cpu_num); out: if (ret) @@ -505,6 +518,15 @@ int cpu__max_cpu(void) return max_cpu_num; } +int cpu__max_present_cpu(void) +{ + if (unlikely(!max_present_cpu_num)) + set_max_cpu_num(); + + return max_present_cpu_num; +} + + int cpu__get_node(int cpu) { if (unlikely(cpunode_map == NULL)) { diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 06bd689..1a0549a 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -62,6 +62,7 @@ int cpu__setup_cpunode_map(void); int cpu__max_node(void); int cpu__max_cpu(void); +int cpu__max_present_cpu(void); int cpu__get_node(int cpu); int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
[tip:perf/urgent] perf tools: Replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map
Commit-ID: da8a58b56c661681f9b2fd2fa59c6da3a5bac8d1 Gitweb: http://git.kernel.org/tip/da8a58b56c661681f9b2fd2fa59c6da3a5bac8d1 Author: Jan Stancek AuthorDate: Fri, 17 Feb 2017 12:10:26 +0100 Committer: Arnaldo Carvalho de Melo CommitDate: Fri, 17 Feb 2017 12:56:35 -0300 perf tools: Replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs: 1. offline/absent CPUs will have their socket_id and core_id set to -1 which triggers: "socket_id number is too big.You may need to upgrade the perf tool." 2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above. Users of perf_env.cpu[] are using CPU id as index. This can lead to read beyond what was allocated: ==19991== Invalid read of size 4 ==19991==at 0x490CEB: check_cpu_topology (topology.c:69) ==19991==by 0x490CEB: test_session_topology (topology.c:106) ... For example: _SC_NPROCESSORS_CONF == 16 available: 2 nodes (0-1) node 0 cpus: 0 6 8 10 16 22 24 26 node 0 size: 12004 MB node 0 free: 9470 MB node 1 cpus: 1 7 9 11 23 25 27 node 1 size: 12093 MB node 1 free: 9406 MB node distances: node 0 1 0: 10 20 1: 20 10 This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF to max_present_cpu and updates any user of cpu_topology_map to iterate with nr_cpus_avail. As a consequence HEADER_CPU_TOPOLOGY core_id and socket_id lists get longer, but maintain compatibility with pre-patch state - index to cpu_topology_map is CPU id. perf test 36 -v 36: Session topology : --- start --- test child forked, pid 22211 templ file: /tmp/perf-test-gmdX5i CPU 0, core 0, socket 0 CPU 1, core 0, socket 1 CPU 6, core 10, socket 0 CPU 7, core 10, socket 1 CPU 8, core 1, socket 0 CPU 9, core 1, socket 1 CPU 10, core 9, socket 0 CPU 11, core 9, socket 1 CPU 16, core 0, socket 0 CPU 22, core 10, socket 0 CPU 23, core 10, socket 1 CPU 24, core 1, socket 0 CPU 25, core 1, socket 1 CPU 26, core 9, socket 0 CPU 27, core 9, socket 1 test child finished with 0 end Session topology: Ok Signed-off-by: Jan Stancek Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/d7c05c6445fca74a8442c2c73cfffd349c52c44f.1487146877.git.jstan...@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 2 +- tools/perf/tests/topology.c | 4 +++- tools/perf/util/env.c | 2 +- tools/perf/util/header.c| 16 +--- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index f287191..ca27a8a 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, struct cpu_map *map, i cpu = map->map[idx]; - if (cpu >= env->nr_cpus_online) + if (cpu >= env->nr_cpus_avail) return -1; return cpu; diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 98fe69a..803f893 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map) session = perf_session__new(, false, NULL); TEST_ASSERT_VAL("can't get session", session); - for (i = 0; i < session->header.env.nr_cpus_online; i++) { + for (i = 0; i < session->header.env.nr_cpus_avail; i++) { + if (!cpu_map__has(map, i)) + continue; pr_debug("CPU %d, core %d, socket %d\n", i, session->header.env.cpu[i].core_id, session->header.env.cpu[i].socket_id); diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index bb964e8..075fc77 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env) return 0; if (env->nr_cpus_avail == 0) - env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF); + env->nr_cpus_avail = cpu__max_present_cpu(); nr_cpus = env->nr_cpus_avail; if (nr_cpus == -1) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 1222f6c..05714d5 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -295,11 +295,7 @@ static int write_nrcpus(int fd, struct perf_header *h __maybe_unused, u32 nrc, nra; int ret; - nr = sysconf(_SC_NPROCESSORS_CONF); - if (nr < 0) - return -1; - - nrc = (u32)(nr & UINT_MAX); + nrc = cpu__max_present_cpu(); nr = sysconf(_SC
[PATCH v4 3/3] perf: Replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map
There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs: 1. offline/absent CPUs will have their socket_id and core_id set to -1 which triggers: "socket_id number is too big.You may need to upgrade the perf tool." 2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above. Users of perf_env.cpu[] are using CPU id as index. This can lead to read beyond what was allocated: ==19991== Invalid read of size 4 ==19991==at 0x490CEB: check_cpu_topology (topology.c:69) ==19991==by 0x490CEB: test_session_topology (topology.c:106) ... For example: _SC_NPROCESSORS_CONF == 16 available: 2 nodes (0-1) node 0 cpus: 0 6 8 10 16 22 24 26 node 0 size: 12004 MB node 0 free: 9470 MB node 1 cpus: 1 7 9 11 23 25 27 node 1 size: 12093 MB node 1 free: 9406 MB node distances: node 0 1 0: 10 20 1: 20 10 This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF to max_present_cpu and updates any user of cpu_topology_map to iterate with nr_cpus_avail. As consequence HEADER_CPU_TOPOLOGY core_id and socket_id lists get longer, but maintain compatibility with pre-patch state - index to cpu_topology_map is CPU id. # ./perf test 36 -v 36: Session topology : --- start --- test child forked, pid 22211 templ file: /tmp/perf-test-gmdX5i CPU 0, core 0, socket 0 CPU 1, core 0, socket 1 CPU 6, core 10, socket 0 CPU 7, core 10, socket 1 CPU 8, core 1, socket 0 CPU 9, core 1, socket 1 CPU 10, core 9, socket 0 CPU 11, core 9, socket 1 CPU 16, core 0, socket 0 CPU 22, core 10, socket 0 CPU 23, core 10, socket 1 CPU 24, core 1, socket 0 CPU 25, core 1, socket 1 CPU 26, core 9, socket 0 CPU 27, core 9, socket 1 test child finished with 0 end Session topology: Ok Signed-off-by: Jan Stancek <jstan...@redhat.com> --- tools/perf/builtin-stat.c | 2 +- tools/perf/tests/topology.c | 4 +++- tools/perf/util/env.c | 2 +- tools/perf/util/header.c| 20 +--- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a02f2e965628..9e2be1c52fbd 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, struct cpu_map *map, i cpu = map->map[idx]; - if (cpu >= env->nr_cpus_online) + if (cpu >= env->nr_cpus_avail) return -1; return cpu; diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 98fe69ac553c..803f893550d6 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map) session = perf_session__new(, false, NULL); TEST_ASSERT_VAL("can't get session", session); - for (i = 0; i < session->header.env.nr_cpus_online; i++) { + for (i = 0; i < session->header.env.nr_cpus_avail; i++) { + if (!cpu_map__has(map, i)) + continue; pr_debug("CPU %d, core %d, socket %d\n", i, session->header.env.cpu[i].core_id, session->header.env.cpu[i].socket_id); diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index bb964e86b09d..075fc77286bf 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env) return 0; if (env->nr_cpus_avail == 0) - env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF); + env->nr_cpus_avail = cpu__max_present_cpu(); nr_cpus = env->nr_cpus_avail; if (nr_cpus == -1) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index ca0f12fb5fd3..e97cbf78ef21 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -293,11 +293,7 @@ static int write_nrcpus(int fd, struct perf_header *h __maybe_unused, u32 nrc, nra; int ret; - nr = sysconf(_SC_NPROCESSORS_CONF); - if (nr < 0) - return -1; - - nrc = (u32)(nr & UINT_MAX); + nrc = cpu__max_present_cpu(); nr = sysconf(_SC_NPROCESSORS_ONLN); if (nr < 0) @@ -511,9 +507,7 @@ static struct cpu_topo *build_cpu_topology(void) int ret = -1; struct cpu_map *map; - ncpus = sysconf(_SC_NPROCESSORS_CONF); - if (ncpus < 0) - return NULL; + ncpus = cpu__max_present_cpu(); /* build online CPU map */ map = cpu_map__new(NULL); @@ -1137,7 +1131,7 @@ static void print_cpu_topology(struct perf_header *ph, int fd __maybe_unused, { int nr, i; char *str; - int
[PATCH v4 2/3] perf header: Make build_cpu_topology skip offline/absent CPUs
When build_cpu_topo() encounters offline/absent CPUs, it fails to find any sysfs entries and returns failure. This leads to build_cpu_topology() and write_cpu_topology() failing as well. Because HEADER_CPU_TOPOLOGY has not been written, read leaves cpu_topology_map NULL and we get NULL ptr deref at: ... cmd_test __cmd_test test_and_print run_test test_session_topology check_cpu_topology # perf test 36 36: Session topology : --- start --- test child forked, pid 14902 templ file: /tmp/perf-test-4CKocW failed to write feature HEADER_CPU_TOPOLOGY perf: Segmentation fault Obtained 9 stack frames. ./perf(sighandler_dump_stack+0x41) [0x5095f1] /lib64/libc.so.6(+0x35250) [0x7f4b7c3c9250] ./perf(test_session_topology+0x1db) [0x490ceb] ./perf() [0x475b68] ./perf(cmd_test+0x5b9) [0x4763c9] ./perf() [0x4945a3] ./perf(main+0x69f) [0x427e8f] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f4b7c3b5b35] ./perf() [0x427fb9] test child interrupted end Session topology: FAILED! This patch makes build_cpu_topology() skip offline/absent CPUs, by checking their presence against cpu_map built from online CPUs. Signed-off-by: Jan Stancek <jstan...@redhat.com> --- tools/perf/util/header.c | 19 --- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index d89c9c7ef4e5..ca0f12fb5fd3 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -503,24 +503,31 @@ static void free_cpu_topo(struct cpu_topo *tp) static struct cpu_topo *build_cpu_topology(void) { - struct cpu_topo *tp; + struct cpu_topo *tp = NULL; void *addr; u32 nr, i; size_t sz; long ncpus; int ret = -1; + struct cpu_map *map; ncpus = sysconf(_SC_NPROCESSORS_CONF); if (ncpus < 0) return NULL; + /* build online CPU map */ + map = cpu_map__new(NULL); + if (map == NULL) { + pr_debug("failed to get system cpumap\n"); + return NULL; + } + nr = (u32)(ncpus & UINT_MAX); sz = nr * sizeof(char *); - addr = calloc(1, sizeof(*tp) + 2 * sz); if (!addr) - return NULL; + goto out_free; tp = addr; tp->cpu_nr = nr; @@ -530,10 +537,16 @@ static struct cpu_topo *build_cpu_topology(void) tp->thread_siblings = addr; for (i = 0; i < nr; i++) { + if (!cpu_map__has(map, i)) + continue; + ret = build_cpu_topo(tp, i); if (ret < 0) break; } + +out_free: + cpu_map__put(map); if (ret) { free_cpu_topo(tp); tp = NULL; -- 1.8.3.1
[PATCH v4 1/3] perf cpu_map: Add cpu__max_present_cpu()
Similar to cpu__max_cpu() (which returns max possible CPU), returns max present CPU. Signed-off-by: Jan Stancek <jstan...@redhat.com> --- tools/perf/util/cpumap.c | 22 ++ tools/perf/util/cpumap.h | 1 + 2 files changed, 23 insertions(+) diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 2c0b52264a46..8c7504939113 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -9,6 +9,7 @@ #include "asm/bug.h" static int max_cpu_num; +static int max_present_cpu_num; static int max_node_num; static int *cpunode_map; @@ -442,6 +443,7 @@ static void set_max_cpu_num(void) /* set up default */ max_cpu_num = 4096; + max_present_cpu_num = 4096; mnt = sysfs__mountpoint(); if (!mnt) @@ -455,6 +457,17 @@ static void set_max_cpu_num(void) } ret = get_max_num(path, _cpu_num); + if (ret) + goto out; + + /* get the highest present cpu number for a sparse allocation */ + ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt); + if (ret == PATH_MAX) { + pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX); + goto out; + } + + ret = get_max_num(path, _present_cpu_num); out: if (ret) @@ -505,6 +518,15 @@ int cpu__max_cpu(void) return max_cpu_num; } +int cpu__max_present_cpu(void) +{ + if (unlikely(!max_present_cpu_num)) + set_max_cpu_num(); + + return max_present_cpu_num; +} + + int cpu__get_node(int cpu) { if (unlikely(cpunode_map == NULL)) { diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 06bd689f5989..1a0549af8f5c 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -62,6 +62,7 @@ static inline bool cpu_map__empty(const struct cpu_map *map) int cpu__max_node(void); int cpu__max_cpu(void); +int cpu__max_present_cpu(void); int cpu__get_node(int cpu); int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res, -- 1.8.3.1
[PATCH v4 3/3] perf: Replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map
There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs: 1. offline/absent CPUs will have their socket_id and core_id set to -1 which triggers: "socket_id number is too big.You may need to upgrade the perf tool." 2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above. Users of perf_env.cpu[] are using CPU id as index. This can lead to read beyond what was allocated: ==19991== Invalid read of size 4 ==19991==at 0x490CEB: check_cpu_topology (topology.c:69) ==19991==by 0x490CEB: test_session_topology (topology.c:106) ... For example: _SC_NPROCESSORS_CONF == 16 available: 2 nodes (0-1) node 0 cpus: 0 6 8 10 16 22 24 26 node 0 size: 12004 MB node 0 free: 9470 MB node 1 cpus: 1 7 9 11 23 25 27 node 1 size: 12093 MB node 1 free: 9406 MB node distances: node 0 1 0: 10 20 1: 20 10 This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF to max_present_cpu and updates any user of cpu_topology_map to iterate with nr_cpus_avail. As consequence HEADER_CPU_TOPOLOGY core_id and socket_id lists get longer, but maintain compatibility with pre-patch state - index to cpu_topology_map is CPU id. # ./perf test 36 -v 36: Session topology : --- start --- test child forked, pid 22211 templ file: /tmp/perf-test-gmdX5i CPU 0, core 0, socket 0 CPU 1, core 0, socket 1 CPU 6, core 10, socket 0 CPU 7, core 10, socket 1 CPU 8, core 1, socket 0 CPU 9, core 1, socket 1 CPU 10, core 9, socket 0 CPU 11, core 9, socket 1 CPU 16, core 0, socket 0 CPU 22, core 10, socket 0 CPU 23, core 10, socket 1 CPU 24, core 1, socket 0 CPU 25, core 1, socket 1 CPU 26, core 9, socket 0 CPU 27, core 9, socket 1 test child finished with 0 end Session topology: Ok Signed-off-by: Jan Stancek --- tools/perf/builtin-stat.c | 2 +- tools/perf/tests/topology.c | 4 +++- tools/perf/util/env.c | 2 +- tools/perf/util/header.c| 20 +--- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a02f2e965628..9e2be1c52fbd 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, struct cpu_map *map, i cpu = map->map[idx]; - if (cpu >= env->nr_cpus_online) + if (cpu >= env->nr_cpus_avail) return -1; return cpu; diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 98fe69ac553c..803f893550d6 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map) session = perf_session__new(, false, NULL); TEST_ASSERT_VAL("can't get session", session); - for (i = 0; i < session->header.env.nr_cpus_online; i++) { + for (i = 0; i < session->header.env.nr_cpus_avail; i++) { + if (!cpu_map__has(map, i)) + continue; pr_debug("CPU %d, core %d, socket %d\n", i, session->header.env.cpu[i].core_id, session->header.env.cpu[i].socket_id); diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index bb964e86b09d..075fc77286bf 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env) return 0; if (env->nr_cpus_avail == 0) - env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF); + env->nr_cpus_avail = cpu__max_present_cpu(); nr_cpus = env->nr_cpus_avail; if (nr_cpus == -1) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index ca0f12fb5fd3..e97cbf78ef21 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -293,11 +293,7 @@ static int write_nrcpus(int fd, struct perf_header *h __maybe_unused, u32 nrc, nra; int ret; - nr = sysconf(_SC_NPROCESSORS_CONF); - if (nr < 0) - return -1; - - nrc = (u32)(nr & UINT_MAX); + nrc = cpu__max_present_cpu(); nr = sysconf(_SC_NPROCESSORS_ONLN); if (nr < 0) @@ -511,9 +507,7 @@ static struct cpu_topo *build_cpu_topology(void) int ret = -1; struct cpu_map *map; - ncpus = sysconf(_SC_NPROCESSORS_CONF); - if (ncpus < 0) - return NULL; + ncpus = cpu__max_present_cpu(); /* build online CPU map */ map = cpu_map__new(NULL); @@ -1137,7 +1131,7 @@ static void print_cpu_topology(struct perf_header *ph, int fd __maybe_unused, { int nr, i; char *str; - int cpu_nr = ph->env.nr_cpus_
[PATCH v4 2/3] perf header: Make build_cpu_topology skip offline/absent CPUs
When build_cpu_topo() encounters offline/absent CPUs, it fails to find any sysfs entries and returns failure. This leads to build_cpu_topology() and write_cpu_topology() failing as well. Because HEADER_CPU_TOPOLOGY has not been written, read leaves cpu_topology_map NULL and we get NULL ptr deref at: ... cmd_test __cmd_test test_and_print run_test test_session_topology check_cpu_topology # perf test 36 36: Session topology : --- start --- test child forked, pid 14902 templ file: /tmp/perf-test-4CKocW failed to write feature HEADER_CPU_TOPOLOGY perf: Segmentation fault Obtained 9 stack frames. ./perf(sighandler_dump_stack+0x41) [0x5095f1] /lib64/libc.so.6(+0x35250) [0x7f4b7c3c9250] ./perf(test_session_topology+0x1db) [0x490ceb] ./perf() [0x475b68] ./perf(cmd_test+0x5b9) [0x4763c9] ./perf() [0x4945a3] ./perf(main+0x69f) [0x427e8f] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f4b7c3b5b35] ./perf() [0x427fb9] test child interrupted end Session topology: FAILED! This patch makes build_cpu_topology() skip offline/absent CPUs, by checking their presence against cpu_map built from online CPUs. Signed-off-by: Jan Stancek --- tools/perf/util/header.c | 19 --- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index d89c9c7ef4e5..ca0f12fb5fd3 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -503,24 +503,31 @@ static void free_cpu_topo(struct cpu_topo *tp) static struct cpu_topo *build_cpu_topology(void) { - struct cpu_topo *tp; + struct cpu_topo *tp = NULL; void *addr; u32 nr, i; size_t sz; long ncpus; int ret = -1; + struct cpu_map *map; ncpus = sysconf(_SC_NPROCESSORS_CONF); if (ncpus < 0) return NULL; + /* build online CPU map */ + map = cpu_map__new(NULL); + if (map == NULL) { + pr_debug("failed to get system cpumap\n"); + return NULL; + } + nr = (u32)(ncpus & UINT_MAX); sz = nr * sizeof(char *); - addr = calloc(1, sizeof(*tp) + 2 * sz); if (!addr) - return NULL; + goto out_free; tp = addr; tp->cpu_nr = nr; @@ -530,10 +537,16 @@ static struct cpu_topo *build_cpu_topology(void) tp->thread_siblings = addr; for (i = 0; i < nr; i++) { + if (!cpu_map__has(map, i)) + continue; + ret = build_cpu_topo(tp, i); if (ret < 0) break; } + +out_free: + cpu_map__put(map); if (ret) { free_cpu_topo(tp); tp = NULL; -- 1.8.3.1
[PATCH v4 1/3] perf cpu_map: Add cpu__max_present_cpu()
Similar to cpu__max_cpu() (which returns max possible CPU), returns max present CPU. Signed-off-by: Jan Stancek --- tools/perf/util/cpumap.c | 22 ++ tools/perf/util/cpumap.h | 1 + 2 files changed, 23 insertions(+) diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 2c0b52264a46..8c7504939113 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -9,6 +9,7 @@ #include "asm/bug.h" static int max_cpu_num; +static int max_present_cpu_num; static int max_node_num; static int *cpunode_map; @@ -442,6 +443,7 @@ static void set_max_cpu_num(void) /* set up default */ max_cpu_num = 4096; + max_present_cpu_num = 4096; mnt = sysfs__mountpoint(); if (!mnt) @@ -455,6 +457,17 @@ static void set_max_cpu_num(void) } ret = get_max_num(path, _cpu_num); + if (ret) + goto out; + + /* get the highest present cpu number for a sparse allocation */ + ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt); + if (ret == PATH_MAX) { + pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX); + goto out; + } + + ret = get_max_num(path, _present_cpu_num); out: if (ret) @@ -505,6 +518,15 @@ int cpu__max_cpu(void) return max_cpu_num; } +int cpu__max_present_cpu(void) +{ + if (unlikely(!max_present_cpu_num)) + set_max_cpu_num(); + + return max_present_cpu_num; +} + + int cpu__get_node(int cpu) { if (unlikely(cpunode_map == NULL)) { diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 06bd689f5989..1a0549af8f5c 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -62,6 +62,7 @@ static inline bool cpu_map__empty(const struct cpu_map *map) int cpu__max_node(void); int cpu__max_cpu(void); +int cpu__max_present_cpu(void); int cpu__get_node(int cpu); int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res, -- 1.8.3.1
[PATCH v4 0/3] perf: topology on systems with offline/absent CPUs
Changes in v4: - rebase on top of 4.10 - fix style of commit logs - add comment to process_cpu_topology() This series aims to address issues on systems with offline/absent CPUs: 1. cpu_topology_map size Allocation based on _SC_NPROCESSORS_CONF is too small, there can be CPU IDs that are higher, which causes invalid reads. 2. build_cpu_topology fails for offline/absent CPUs It is trying to parse sysfs entries, that do not exist. 3. perf topology test fails Changes affect HEADER_NRCPUS and HEADER_CPU_TOPOLOGY: For example on a system like this: _SC_NPROCESSORS_CONF == 16 available: 2 nodes (0-1) node 0 cpus: 0 6 8 10 16 22 24 26 node 0 size: 12004 MB node 0 free: 9470 MB node 1 cpus: 1 7 9 11 23 25 27 node 1 size: 12093 MB node 1 free: 9406 MB node distances: node 0 1 0: 10 20 1: 20 10 HEADER_NRCPUS before this series: nr_cpus_online = 15 nr_cpus_available = 16 HEADER_CPU_TOPOLOGY before this series: core_sib_nr: 2 core_siblings: 0,6,8,10,16,22,24,26 core_siblings: 1,7,9,11,23,25,27 thread_sib_nr: 8 thread_siblings: 0,16 thread_siblings: 1 thread_siblings: 6,22 thread_siblings: 7,23 thread_siblings: 8,24 thread_siblings: 9,25 thread_siblings: 10,26 thread_siblings: 11,27 core_id: 0, socket_id: 0 core_id: 0, socket_id: 1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: 10, socket_id: 0 core_id: 10, socket_id: 1 core_id: 1, socket_id: 0 core_id: 1, socket_id: 1 core_id: 9, socket_id: 0 core_id: 9, socket_id: 1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 HEADER_NRCPUS with this series applied: nr_cpus_online = 15 nr_cpus_available = 28 HEADER_CPU_TOPOLOGY with this series applied: core_sib_nr: 2 core_siblings: 0,6,8,10,16,22,24,26 core_siblings: 1,7,9,11,23,25,27 thread_sib_nr: 8 thread_siblings: 0,16 thread_siblings: 1 thread_siblings: 6,22 thread_siblings: 7,23 thread_siblings: 8,24 thread_siblings: 9,25 thread_siblings: 10,26 thread_siblings: 11,27 core_id: 0, socket_id: 0 core_id: 0, socket_id: 1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: 10, socket_id: 0 core_id: 10, socket_id: 1 core_id: 1, socket_id: 0 core_id: 1, socket_id: 1 core_id: 9, socket_id: 0 core_id: 9, socket_id: 1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: 0, socket_id: 0 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: 10, socket_id: 0 core_id: 10, socket_id: 1 core_id: 1, socket_id: 0 core_id: 1, socket_id: 1 core_id: 9, socket_id: 0 core_id: 9, socket_id: 1 Jan Stancek (3): perf cpu_map: Add cpu__max_present_cpu() perf header: Make build_cpu_topology skip offline/absent CPUs perf: Replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map tools/perf/builtin-stat.c | 2 +- tools/perf/tests/topology.c | 4 +++- tools/perf/util/cpumap.c| 22 ++ tools/perf/util/cpumap.h| 1 + tools/perf/util/env.c | 2 +- tools/perf/util/header.c| 37 - 6 files changed, 52 insertions(+), 16 deletions(-) -- 1.8.3.1
[PATCH v4 0/3] perf: topology on systems with offline/absent CPUs
Changes in v4: - rebase on top of 4.10 - fix style of commit logs - add comment to process_cpu_topology() This series aims to address issues on systems with offline/absent CPUs: 1. cpu_topology_map size Allocation based on _SC_NPROCESSORS_CONF is too small, there can be CPU IDs that are higher, which causes invalid reads. 2. build_cpu_topology fails for offline/absent CPUs It is trying to parse sysfs entries, that do not exist. 3. perf topology test fails Changes affect HEADER_NRCPUS and HEADER_CPU_TOPOLOGY: For example on a system like this: _SC_NPROCESSORS_CONF == 16 available: 2 nodes (0-1) node 0 cpus: 0 6 8 10 16 22 24 26 node 0 size: 12004 MB node 0 free: 9470 MB node 1 cpus: 1 7 9 11 23 25 27 node 1 size: 12093 MB node 1 free: 9406 MB node distances: node 0 1 0: 10 20 1: 20 10 HEADER_NRCPUS before this series: nr_cpus_online = 15 nr_cpus_available = 16 HEADER_CPU_TOPOLOGY before this series: core_sib_nr: 2 core_siblings: 0,6,8,10,16,22,24,26 core_siblings: 1,7,9,11,23,25,27 thread_sib_nr: 8 thread_siblings: 0,16 thread_siblings: 1 thread_siblings: 6,22 thread_siblings: 7,23 thread_siblings: 8,24 thread_siblings: 9,25 thread_siblings: 10,26 thread_siblings: 11,27 core_id: 0, socket_id: 0 core_id: 0, socket_id: 1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: 10, socket_id: 0 core_id: 10, socket_id: 1 core_id: 1, socket_id: 0 core_id: 1, socket_id: 1 core_id: 9, socket_id: 0 core_id: 9, socket_id: 1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 HEADER_NRCPUS with this series applied: nr_cpus_online = 15 nr_cpus_available = 28 HEADER_CPU_TOPOLOGY with this series applied: core_sib_nr: 2 core_siblings: 0,6,8,10,16,22,24,26 core_siblings: 1,7,9,11,23,25,27 thread_sib_nr: 8 thread_siblings: 0,16 thread_siblings: 1 thread_siblings: 6,22 thread_siblings: 7,23 thread_siblings: 8,24 thread_siblings: 9,25 thread_siblings: 10,26 thread_siblings: 11,27 core_id: 0, socket_id: 0 core_id: 0, socket_id: 1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: 10, socket_id: 0 core_id: 10, socket_id: 1 core_id: 1, socket_id: 0 core_id: 1, socket_id: 1 core_id: 9, socket_id: 0 core_id: 9, socket_id: 1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: 0, socket_id: 0 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: 10, socket_id: 0 core_id: 10, socket_id: 1 core_id: 1, socket_id: 0 core_id: 1, socket_id: 1 core_id: 9, socket_id: 0 core_id: 9, socket_id: 1 Jan Stancek (3): perf cpu_map: Add cpu__max_present_cpu() perf header: Make build_cpu_topology skip offline/absent CPUs perf: Replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map tools/perf/builtin-stat.c | 2 +- tools/perf/tests/topology.c | 4 +++- tools/perf/util/cpumap.c| 22 ++ tools/perf/util/cpumap.h| 1 + tools/perf/util/env.c | 2 +- tools/perf/util/header.c| 37 - 6 files changed, 52 insertions(+), 16 deletions(-) -- 1.8.3.1
[PATCH v3 2/3] perf: make build_cpu_topology skip offline/absent CPUs
When build_cpu_topo() encounters offline/absent CPUs, it fails to find any sysfs entries and returns failure. This leads to build_cpu_topology() and write_cpu_topology() failing as well. Because HEADER_CPU_TOPOLOGY has not been written, read leaves cpu_topology_map NULL and we get NULL ptr deref at: ... cmd_test __cmd_test test_and_print run_test test_session_topology check_cpu_topology 36: Session topology : --- start --- test child forked, pid 14902 templ file: /tmp/perf-test-4CKocW failed to write feature HEADER_CPU_TOPOLOGY perf: Segmentation fault Obtained 9 stack frames. ./perf(sighandler_dump_stack+0x41) [0x5095f1] /lib64/libc.so.6(+0x35250) [0x7f4b7c3c9250] ./perf(test_session_topology+0x1db) [0x490ceb] ./perf() [0x475b68] ./perf(cmd_test+0x5b9) [0x4763c9] ./perf() [0x4945a3] ./perf(main+0x69f) [0x427e8f] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f4b7c3b5b35] ./perf() [0x427fb9] test child interrupted end Session topology: FAILED! This patch makes build_cpu_topology() skip offline/absent CPUs, by checking their presence against cpu_map built from online CPUs. Signed-off-by: Jan Stancek <jstan...@redhat.com> --- tools/perf/util/header.c | 19 --- 1 file changed, 16 insertions(+), 3 deletions(-) Changes in v3: - make ret = -1 by default, so we fail in case build_cpu_topo doesn't run at all diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index d89c9c7ef4e5..ca0f12fb5fd3 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -503,24 +503,31 @@ static void free_cpu_topo(struct cpu_topo *tp) static struct cpu_topo *build_cpu_topology(void) { - struct cpu_topo *tp; + struct cpu_topo *tp = NULL; void *addr; u32 nr, i; size_t sz; long ncpus; int ret = -1; + struct cpu_map *map; ncpus = sysconf(_SC_NPROCESSORS_CONF); if (ncpus < 0) return NULL; + /* build online CPU map */ + map = cpu_map__new(NULL); + if (map == NULL) { + pr_debug("failed to get system cpumap\n"); + return NULL; + } + nr = (u32)(ncpus & UINT_MAX); sz = nr * sizeof(char *); - addr = calloc(1, sizeof(*tp) + 2 * sz); if (!addr) - return NULL; + goto out_free; tp = addr; tp->cpu_nr = nr; @@ -530,10 +537,16 @@ static struct cpu_topo *build_cpu_topology(void) tp->thread_siblings = addr; for (i = 0; i < nr; i++) { + if (!cpu_map__has(map, i)) + continue; + ret = build_cpu_topo(tp, i); if (ret < 0) break; } + +out_free: + cpu_map__put(map); if (ret) { free_cpu_topo(tp); tp = NULL; -- 1.8.3.1
[PATCH v3 1/3] perf: add cpu__max_present_cpu()
Similar to cpu__max_cpu() (which returns max possible CPU), returns max present CPU. Signed-off-by: Jan Stancek <jstan...@redhat.com> --- tools/perf/util/cpumap.c | 22 ++ tools/perf/util/cpumap.h | 1 + 2 files changed, 23 insertions(+) diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 2c0b52264a46..8c7504939113 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -9,6 +9,7 @@ #include "asm/bug.h" static int max_cpu_num; +static int max_present_cpu_num; static int max_node_num; static int *cpunode_map; @@ -442,6 +443,7 @@ static void set_max_cpu_num(void) /* set up default */ max_cpu_num = 4096; + max_present_cpu_num = 4096; mnt = sysfs__mountpoint(); if (!mnt) @@ -455,6 +457,17 @@ static void set_max_cpu_num(void) } ret = get_max_num(path, _cpu_num); + if (ret) + goto out; + + /* get the highest present cpu number for a sparse allocation */ + ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt); + if (ret == PATH_MAX) { + pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX); + goto out; + } + + ret = get_max_num(path, _present_cpu_num); out: if (ret) @@ -505,6 +518,15 @@ int cpu__max_cpu(void) return max_cpu_num; } +int cpu__max_present_cpu(void) +{ + if (unlikely(!max_present_cpu_num)) + set_max_cpu_num(); + + return max_present_cpu_num; +} + + int cpu__get_node(int cpu) { if (unlikely(cpunode_map == NULL)) { diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 06bd689f5989..1a0549af8f5c 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -62,6 +62,7 @@ static inline bool cpu_map__empty(const struct cpu_map *map) int cpu__max_node(void); int cpu__max_cpu(void); +int cpu__max_present_cpu(void); int cpu__get_node(int cpu); int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res, -- 1.8.3.1
[PATCH v3 3/3] perf: replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map
There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs: 1. offline/absent CPUs will have their socket_id and core_id set to -1 which triggers: "socket_id number is too big.You may need to upgrade the perf tool." 2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above. Users of perf_env.cpu[] are using CPU id as index. This can lead to read beyond what was allocated: ==19991== Invalid read of size 4 ==19991==at 0x490CEB: check_cpu_topology (topology.c:69) ==19991==by 0x490CEB: test_session_topology (topology.c:106) ... For example: _SC_NPROCESSORS_CONF == 16 available: 2 nodes (0-1) node 0 cpus: 0 6 8 10 16 22 24 26 node 0 size: 12004 MB node 0 free: 9470 MB node 1 cpus: 1 7 9 11 23 25 27 node 1 size: 12093 MB node 1 free: 9406 MB node distances: node 0 1 0: 10 20 1: 20 10 This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF to max_present_cpu and updates any user of cpu_topology_map to iterate with nr_cpus_avail. As consequence HEADER_CPU_TOPOLOGY core_id and socket_id lists get longer, but maintain compatibility with pre-patch state - index to cpu_topology_map is CPU id. ./perf test 36 -v 36: Session topology : --- start --- test child forked, pid 22211 templ file: /tmp/perf-test-gmdX5i CPU 0, core 0, socket 0 CPU 1, core 0, socket 1 CPU 6, core 10, socket 0 CPU 7, core 10, socket 1 CPU 8, core 1, socket 0 CPU 9, core 1, socket 1 CPU 10, core 9, socket 0 CPU 11, core 9, socket 1 CPU 16, core 0, socket 0 CPU 22, core 10, socket 0 CPU 23, core 10, socket 1 CPU 24, core 1, socket 0 CPU 25, core 1, socket 1 CPU 26, core 9, socket 0 CPU 27, core 9, socket 1 test child finished with 0 end Session topology: Ok Signed-off-by: Jan Stancek <jstan...@redhat.com> --- tools/perf/builtin-stat.c | 2 +- tools/perf/tests/topology.c | 4 +++- tools/perf/util/env.c | 2 +- tools/perf/util/header.c| 16 +--- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a02f2e965628..9e2be1c52fbd 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, struct cpu_map *map, i cpu = map->map[idx]; - if (cpu >= env->nr_cpus_online) + if (cpu >= env->nr_cpus_avail) return -1; return cpu; diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 98fe69ac553c..803f893550d6 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map) session = perf_session__new(, false, NULL); TEST_ASSERT_VAL("can't get session", session); - for (i = 0; i < session->header.env.nr_cpus_online; i++) { + for (i = 0; i < session->header.env.nr_cpus_avail; i++) { + if (!cpu_map__has(map, i)) + continue; pr_debug("CPU %d, core %d, socket %d\n", i, session->header.env.cpu[i].core_id, session->header.env.cpu[i].socket_id); diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index bb964e86b09d..075fc77286bf 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env) return 0; if (env->nr_cpus_avail == 0) - env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF); + env->nr_cpus_avail = cpu__max_present_cpu(); nr_cpus = env->nr_cpus_avail; if (nr_cpus == -1) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index ca0f12fb5fd3..3a79a0f10bf6 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -293,11 +293,7 @@ static int write_nrcpus(int fd, struct perf_header *h __maybe_unused, u32 nrc, nra; int ret; - nr = sysconf(_SC_NPROCESSORS_CONF); - if (nr < 0) - return -1; - - nrc = (u32)(nr & UINT_MAX); + nrc = cpu__max_present_cpu(); nr = sysconf(_SC_NPROCESSORS_ONLN); if (nr < 0) @@ -511,9 +507,7 @@ static struct cpu_topo *build_cpu_topology(void) int ret = -1; struct cpu_map *map; - ncpus = sysconf(_SC_NPROCESSORS_CONF); - if (ncpus < 0) - return NULL; + ncpus = cpu__max_present_cpu(); /* build online CPU map */ map = cpu_map__new(NULL); @@ -1137,7 +1131,7 @@ static void print_cpu_topology(struct perf_header *ph, int fd __maybe_unused, { int nr, i; char *str; - int cpu_nr = ph->env.nr_cpus_online; + int cpu_nr = ph
[PATCH v3 2/3] perf: make build_cpu_topology skip offline/absent CPUs
When build_cpu_topo() encounters offline/absent CPUs, it fails to find any sysfs entries and returns failure. This leads to build_cpu_topology() and write_cpu_topology() failing as well. Because HEADER_CPU_TOPOLOGY has not been written, read leaves cpu_topology_map NULL and we get NULL ptr deref at: ... cmd_test __cmd_test test_and_print run_test test_session_topology check_cpu_topology 36: Session topology : --- start --- test child forked, pid 14902 templ file: /tmp/perf-test-4CKocW failed to write feature HEADER_CPU_TOPOLOGY perf: Segmentation fault Obtained 9 stack frames. ./perf(sighandler_dump_stack+0x41) [0x5095f1] /lib64/libc.so.6(+0x35250) [0x7f4b7c3c9250] ./perf(test_session_topology+0x1db) [0x490ceb] ./perf() [0x475b68] ./perf(cmd_test+0x5b9) [0x4763c9] ./perf() [0x4945a3] ./perf(main+0x69f) [0x427e8f] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f4b7c3b5b35] ./perf() [0x427fb9] test child interrupted end Session topology: FAILED! This patch makes build_cpu_topology() skip offline/absent CPUs, by checking their presence against cpu_map built from online CPUs. Signed-off-by: Jan Stancek --- tools/perf/util/header.c | 19 --- 1 file changed, 16 insertions(+), 3 deletions(-) Changes in v3: - make ret = -1 by default, so we fail in case build_cpu_topo doesn't run at all diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index d89c9c7ef4e5..ca0f12fb5fd3 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -503,24 +503,31 @@ static void free_cpu_topo(struct cpu_topo *tp) static struct cpu_topo *build_cpu_topology(void) { - struct cpu_topo *tp; + struct cpu_topo *tp = NULL; void *addr; u32 nr, i; size_t sz; long ncpus; int ret = -1; + struct cpu_map *map; ncpus = sysconf(_SC_NPROCESSORS_CONF); if (ncpus < 0) return NULL; + /* build online CPU map */ + map = cpu_map__new(NULL); + if (map == NULL) { + pr_debug("failed to get system cpumap\n"); + return NULL; + } + nr = (u32)(ncpus & UINT_MAX); sz = nr * sizeof(char *); - addr = calloc(1, sizeof(*tp) + 2 * sz); if (!addr) - return NULL; + goto out_free; tp = addr; tp->cpu_nr = nr; @@ -530,10 +537,16 @@ static struct cpu_topo *build_cpu_topology(void) tp->thread_siblings = addr; for (i = 0; i < nr; i++) { + if (!cpu_map__has(map, i)) + continue; + ret = build_cpu_topo(tp, i); if (ret < 0) break; } + +out_free: + cpu_map__put(map); if (ret) { free_cpu_topo(tp); tp = NULL; -- 1.8.3.1
[PATCH v3 1/3] perf: add cpu__max_present_cpu()
Similar to cpu__max_cpu() (which returns max possible CPU), returns max present CPU. Signed-off-by: Jan Stancek --- tools/perf/util/cpumap.c | 22 ++ tools/perf/util/cpumap.h | 1 + 2 files changed, 23 insertions(+) diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 2c0b52264a46..8c7504939113 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -9,6 +9,7 @@ #include "asm/bug.h" static int max_cpu_num; +static int max_present_cpu_num; static int max_node_num; static int *cpunode_map; @@ -442,6 +443,7 @@ static void set_max_cpu_num(void) /* set up default */ max_cpu_num = 4096; + max_present_cpu_num = 4096; mnt = sysfs__mountpoint(); if (!mnt) @@ -455,6 +457,17 @@ static void set_max_cpu_num(void) } ret = get_max_num(path, _cpu_num); + if (ret) + goto out; + + /* get the highest present cpu number for a sparse allocation */ + ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt); + if (ret == PATH_MAX) { + pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX); + goto out; + } + + ret = get_max_num(path, _present_cpu_num); out: if (ret) @@ -505,6 +518,15 @@ int cpu__max_cpu(void) return max_cpu_num; } +int cpu__max_present_cpu(void) +{ + if (unlikely(!max_present_cpu_num)) + set_max_cpu_num(); + + return max_present_cpu_num; +} + + int cpu__get_node(int cpu) { if (unlikely(cpunode_map == NULL)) { diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 06bd689f5989..1a0549af8f5c 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -62,6 +62,7 @@ static inline bool cpu_map__empty(const struct cpu_map *map) int cpu__max_node(void); int cpu__max_cpu(void); +int cpu__max_present_cpu(void); int cpu__get_node(int cpu); int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res, -- 1.8.3.1
[PATCH v3 3/3] perf: replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map
There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs: 1. offline/absent CPUs will have their socket_id and core_id set to -1 which triggers: "socket_id number is too big.You may need to upgrade the perf tool." 2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above. Users of perf_env.cpu[] are using CPU id as index. This can lead to read beyond what was allocated: ==19991== Invalid read of size 4 ==19991==at 0x490CEB: check_cpu_topology (topology.c:69) ==19991==by 0x490CEB: test_session_topology (topology.c:106) ... For example: _SC_NPROCESSORS_CONF == 16 available: 2 nodes (0-1) node 0 cpus: 0 6 8 10 16 22 24 26 node 0 size: 12004 MB node 0 free: 9470 MB node 1 cpus: 1 7 9 11 23 25 27 node 1 size: 12093 MB node 1 free: 9406 MB node distances: node 0 1 0: 10 20 1: 20 10 This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF to max_present_cpu and updates any user of cpu_topology_map to iterate with nr_cpus_avail. As consequence HEADER_CPU_TOPOLOGY core_id and socket_id lists get longer, but maintain compatibility with pre-patch state - index to cpu_topology_map is CPU id. ./perf test 36 -v 36: Session topology : --- start --- test child forked, pid 22211 templ file: /tmp/perf-test-gmdX5i CPU 0, core 0, socket 0 CPU 1, core 0, socket 1 CPU 6, core 10, socket 0 CPU 7, core 10, socket 1 CPU 8, core 1, socket 0 CPU 9, core 1, socket 1 CPU 10, core 9, socket 0 CPU 11, core 9, socket 1 CPU 16, core 0, socket 0 CPU 22, core 10, socket 0 CPU 23, core 10, socket 1 CPU 24, core 1, socket 0 CPU 25, core 1, socket 1 CPU 26, core 9, socket 0 CPU 27, core 9, socket 1 test child finished with 0 end Session topology: Ok Signed-off-by: Jan Stancek --- tools/perf/builtin-stat.c | 2 +- tools/perf/tests/topology.c | 4 +++- tools/perf/util/env.c | 2 +- tools/perf/util/header.c| 16 +--- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a02f2e965628..9e2be1c52fbd 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, struct cpu_map *map, i cpu = map->map[idx]; - if (cpu >= env->nr_cpus_online) + if (cpu >= env->nr_cpus_avail) return -1; return cpu; diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 98fe69ac553c..803f893550d6 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map) session = perf_session__new(, false, NULL); TEST_ASSERT_VAL("can't get session", session); - for (i = 0; i < session->header.env.nr_cpus_online; i++) { + for (i = 0; i < session->header.env.nr_cpus_avail; i++) { + if (!cpu_map__has(map, i)) + continue; pr_debug("CPU %d, core %d, socket %d\n", i, session->header.env.cpu[i].core_id, session->header.env.cpu[i].socket_id); diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index bb964e86b09d..075fc77286bf 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env) return 0; if (env->nr_cpus_avail == 0) - env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF); + env->nr_cpus_avail = cpu__max_present_cpu(); nr_cpus = env->nr_cpus_avail; if (nr_cpus == -1) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index ca0f12fb5fd3..3a79a0f10bf6 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -293,11 +293,7 @@ static int write_nrcpus(int fd, struct perf_header *h __maybe_unused, u32 nrc, nra; int ret; - nr = sysconf(_SC_NPROCESSORS_CONF); - if (nr < 0) - return -1; - - nrc = (u32)(nr & UINT_MAX); + nrc = cpu__max_present_cpu(); nr = sysconf(_SC_NPROCESSORS_ONLN); if (nr < 0) @@ -511,9 +507,7 @@ static struct cpu_topo *build_cpu_topology(void) int ret = -1; struct cpu_map *map; - ncpus = sysconf(_SC_NPROCESSORS_CONF); - if (ncpus < 0) - return NULL; + ncpus = cpu__max_present_cpu(); /* build online CPU map */ map = cpu_map__new(NULL); @@ -1137,7 +1131,7 @@ static void print_cpu_topology(struct perf_header *ph, int fd __maybe_unused, { int nr, i; char *str; - int cpu_nr = ph->env.nr_cpus_online; + int cpu_nr = ph->env.nr_cpus_avail;
Re: [PATCH v2 2/3] perf: make build_cpu_topology skip offline/absent CPUs
- Original Message - > From: "Jiri Olsa" <jo...@redhat.com> > To: "Jan Stancek" <jstan...@redhat.com> > Cc: linux-kernel@vger.kernel.org, pet...@infradead.org, mi...@redhat.com, > a...@kernel.org, "alexander shishkin" > <alexander.shish...@linux.intel.com>, jo...@kernel.org, mhira...@kernel.org > Sent: Tuesday, 14 February, 2017 12:01:10 PM > Subject: Re: [PATCH v2 2/3] perf: make build_cpu_topology skip offline/absent > CPUs > > On Mon, Feb 13, 2017 at 04:34:35PM +0100, Jan Stancek wrote: > > SNIP > > > This patch makes build_cpu_topology() skip offline/absent CPUs, > > by checking their presence against cpu_map built from online CPUs. > > > > Signed-off-by: Jan Stancek <jstan...@redhat.com> > > --- > > tools/perf/util/header.c | 21 + > > 1 file changed, 17 insertions(+), 4 deletions(-) > > > > Changes in v2: > > - drop out label, use return NULL where possible > > > > diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c > > index d89c9c7ef4e5..4b0ea4e92e9d 100644 > > --- a/tools/perf/util/header.c > > +++ b/tools/perf/util/header.c > > @@ -503,24 +503,31 @@ static void free_cpu_topo(struct cpu_topo *tp) > > > > static struct cpu_topo *build_cpu_topology(void) > > { > > - struct cpu_topo *tp; > > + struct cpu_topo *tp = NULL; > > void *addr; > > u32 nr, i; > > size_t sz; > > long ncpus; > > - int ret = -1; > > + int ret = 0; > > hum, shoudn't we fail if we dont find any online cpu? > > jirka You're right, there's no reason to change this. I'll send v3.
Re: [PATCH v2 2/3] perf: make build_cpu_topology skip offline/absent CPUs
- Original Message - > From: "Jiri Olsa" > To: "Jan Stancek" > Cc: linux-kernel@vger.kernel.org, pet...@infradead.org, mi...@redhat.com, > a...@kernel.org, "alexander shishkin" > , jo...@kernel.org, mhira...@kernel.org > Sent: Tuesday, 14 February, 2017 12:01:10 PM > Subject: Re: [PATCH v2 2/3] perf: make build_cpu_topology skip offline/absent > CPUs > > On Mon, Feb 13, 2017 at 04:34:35PM +0100, Jan Stancek wrote: > > SNIP > > > This patch makes build_cpu_topology() skip offline/absent CPUs, > > by checking their presence against cpu_map built from online CPUs. > > > > Signed-off-by: Jan Stancek > > --- > > tools/perf/util/header.c | 21 + > > 1 file changed, 17 insertions(+), 4 deletions(-) > > > > Changes in v2: > > - drop out label, use return NULL where possible > > > > diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c > > index d89c9c7ef4e5..4b0ea4e92e9d 100644 > > --- a/tools/perf/util/header.c > > +++ b/tools/perf/util/header.c > > @@ -503,24 +503,31 @@ static void free_cpu_topo(struct cpu_topo *tp) > > > > static struct cpu_topo *build_cpu_topology(void) > > { > > - struct cpu_topo *tp; > > + struct cpu_topo *tp = NULL; > > void *addr; > > u32 nr, i; > > size_t sz; > > long ncpus; > > - int ret = -1; > > + int ret = 0; > > hum, shoudn't we fail if we dont find any online cpu? > > jirka You're right, there's no reason to change this. I'll send v3.
[PATCH v2 1/3] perf: add cpu__max_present_cpu()
Similar to cpu__max_cpu() (which returns max possible CPU), returns max present CPU. Signed-off-by: Jan Stancek <jstan...@redhat.com> --- tools/perf/util/cpumap.c | 22 ++ tools/perf/util/cpumap.h | 1 + 2 files changed, 23 insertions(+) diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 2c0b52264a46..8c7504939113 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -9,6 +9,7 @@ #include "asm/bug.h" static int max_cpu_num; +static int max_present_cpu_num; static int max_node_num; static int *cpunode_map; @@ -442,6 +443,7 @@ static void set_max_cpu_num(void) /* set up default */ max_cpu_num = 4096; + max_present_cpu_num = 4096; mnt = sysfs__mountpoint(); if (!mnt) @@ -455,6 +457,17 @@ static void set_max_cpu_num(void) } ret = get_max_num(path, _cpu_num); + if (ret) + goto out; + + /* get the highest present cpu number for a sparse allocation */ + ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt); + if (ret == PATH_MAX) { + pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX); + goto out; + } + + ret = get_max_num(path, _present_cpu_num); out: if (ret) @@ -505,6 +518,15 @@ int cpu__max_cpu(void) return max_cpu_num; } +int cpu__max_present_cpu(void) +{ + if (unlikely(!max_present_cpu_num)) + set_max_cpu_num(); + + return max_present_cpu_num; +} + + int cpu__get_node(int cpu) { if (unlikely(cpunode_map == NULL)) { diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 06bd689f5989..1a0549af8f5c 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -62,6 +62,7 @@ static inline bool cpu_map__empty(const struct cpu_map *map) int cpu__max_node(void); int cpu__max_cpu(void); +int cpu__max_present_cpu(void); int cpu__get_node(int cpu); int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res, -- 1.8.3.1
[PATCH v2 1/3] perf: add cpu__max_present_cpu()
Similar to cpu__max_cpu() (which returns max possible CPU), returns max present CPU. Signed-off-by: Jan Stancek --- tools/perf/util/cpumap.c | 22 ++ tools/perf/util/cpumap.h | 1 + 2 files changed, 23 insertions(+) diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 2c0b52264a46..8c7504939113 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -9,6 +9,7 @@ #include "asm/bug.h" static int max_cpu_num; +static int max_present_cpu_num; static int max_node_num; static int *cpunode_map; @@ -442,6 +443,7 @@ static void set_max_cpu_num(void) /* set up default */ max_cpu_num = 4096; + max_present_cpu_num = 4096; mnt = sysfs__mountpoint(); if (!mnt) @@ -455,6 +457,17 @@ static void set_max_cpu_num(void) } ret = get_max_num(path, _cpu_num); + if (ret) + goto out; + + /* get the highest present cpu number for a sparse allocation */ + ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt); + if (ret == PATH_MAX) { + pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX); + goto out; + } + + ret = get_max_num(path, _present_cpu_num); out: if (ret) @@ -505,6 +518,15 @@ int cpu__max_cpu(void) return max_cpu_num; } +int cpu__max_present_cpu(void) +{ + if (unlikely(!max_present_cpu_num)) + set_max_cpu_num(); + + return max_present_cpu_num; +} + + int cpu__get_node(int cpu) { if (unlikely(cpunode_map == NULL)) { diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 06bd689f5989..1a0549af8f5c 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -62,6 +62,7 @@ static inline bool cpu_map__empty(const struct cpu_map *map) int cpu__max_node(void); int cpu__max_cpu(void); +int cpu__max_present_cpu(void); int cpu__get_node(int cpu); int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res, -- 1.8.3.1
[PATCH v2 3/3] perf: replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map
There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs: 1. offline/absent CPUs will have their socket_id and core_id set to -1 which triggers: "socket_id number is too big.You may need to upgrade the perf tool." 2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above. Users of perf_env.cpu[] are using CPU id as index. This can lead to read beyond what was allocated: ==19991== Invalid read of size 4 ==19991==at 0x490CEB: check_cpu_topology (topology.c:69) ==19991==by 0x490CEB: test_session_topology (topology.c:106) ... For example: _SC_NPROCESSORS_CONF == 16 available: 2 nodes (0-1) node 0 cpus: 0 6 8 10 16 22 24 26 node 0 size: 12004 MB node 0 free: 9470 MB node 1 cpus: 1 7 9 11 23 25 27 node 1 size: 12093 MB node 1 free: 9406 MB node distances: node 0 1 0: 10 20 1: 20 10 This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF to max_present_cpu and updates any user of cpu_topology_map to iterate with nr_cpus_avail. As consequence HEADER_CPU_TOPOLOGY core_id and socket_id lists get longer, but maintain compatibility with pre-patch state - index to cpu_topology_map is CPU id. ./perf test 36 -v 36: Session topology : --- start --- test child forked, pid 22211 templ file: /tmp/perf-test-gmdX5i CPU 0, core 0, socket 0 CPU 1, core 0, socket 1 CPU 6, core 10, socket 0 CPU 7, core 10, socket 1 CPU 8, core 1, socket 0 CPU 9, core 1, socket 1 CPU 10, core 9, socket 0 CPU 11, core 9, socket 1 CPU 16, core 0, socket 0 CPU 22, core 10, socket 0 CPU 23, core 10, socket 1 CPU 24, core 1, socket 0 CPU 25, core 1, socket 1 CPU 26, core 9, socket 0 CPU 27, core 9, socket 1 test child finished with 0 end Session topology: Ok Signed-off-by: Jan Stancek <jstan...@redhat.com> --- tools/perf/builtin-stat.c | 2 +- tools/perf/tests/topology.c | 4 +++- tools/perf/util/env.c | 2 +- tools/perf/util/header.c| 16 +--- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a02f2e965628..9e2be1c52fbd 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, struct cpu_map *map, i cpu = map->map[idx]; - if (cpu >= env->nr_cpus_online) + if (cpu >= env->nr_cpus_avail) return -1; return cpu; diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 98fe69ac553c..803f893550d6 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map) session = perf_session__new(, false, NULL); TEST_ASSERT_VAL("can't get session", session); - for (i = 0; i < session->header.env.nr_cpus_online; i++) { + for (i = 0; i < session->header.env.nr_cpus_avail; i++) { + if (!cpu_map__has(map, i)) + continue; pr_debug("CPU %d, core %d, socket %d\n", i, session->header.env.cpu[i].core_id, session->header.env.cpu[i].socket_id); diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index bb964e86b09d..075fc77286bf 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env) return 0; if (env->nr_cpus_avail == 0) - env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF); + env->nr_cpus_avail = cpu__max_present_cpu(); nr_cpus = env->nr_cpus_avail; if (nr_cpus == -1) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 4b0ea4e92e9d..e9384d17ae7d 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -293,11 +293,7 @@ static int write_nrcpus(int fd, struct perf_header *h __maybe_unused, u32 nrc, nra; int ret; - nr = sysconf(_SC_NPROCESSORS_CONF); - if (nr < 0) - return -1; - - nrc = (u32)(nr & UINT_MAX); + nrc = cpu__max_present_cpu(); nr = sysconf(_SC_NPROCESSORS_ONLN); if (nr < 0) @@ -511,9 +507,7 @@ static struct cpu_topo *build_cpu_topology(void) int ret = 0; struct cpu_map *map; - ncpus = sysconf(_SC_NPROCESSORS_CONF); - if (ncpus < 0) - return NULL; + ncpus = cpu__max_present_cpu(); /* build online CPU map */ map = cpu_map__new(NULL); @@ -1137,7 +1131,7 @@ static void print_cpu_topology(struct perf_header *ph, int fd __maybe_unused, { int nr, i; char *str; - int cpu_nr = ph->env.nr_cpus_online; + int cpu_nr = ph
[PATCH v2 2/3] perf: make build_cpu_topology skip offline/absent CPUs
When build_cpu_topo() encounters offline/absent CPUs, it fails to find any sysfs entries and returns failure. This leads to build_cpu_topology() and write_cpu_topology() failing as well. Because HEADER_CPU_TOPOLOGY has not been written, read leaves cpu_topology_map NULL and we get NULL ptr deref at: ... cmd_test __cmd_test test_and_print run_test test_session_topology check_cpu_topology 36: Session topology : --- start --- test child forked, pid 14902 templ file: /tmp/perf-test-4CKocW failed to write feature HEADER_CPU_TOPOLOGY perf: Segmentation fault Obtained 9 stack frames. ./perf(sighandler_dump_stack+0x41) [0x5095f1] /lib64/libc.so.6(+0x35250) [0x7f4b7c3c9250] ./perf(test_session_topology+0x1db) [0x490ceb] ./perf() [0x475b68] ./perf(cmd_test+0x5b9) [0x4763c9] ./perf() [0x4945a3] ./perf(main+0x69f) [0x427e8f] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f4b7c3b5b35] ./perf() [0x427fb9] test child interrupted end Session topology: FAILED! This patch makes build_cpu_topology() skip offline/absent CPUs, by checking their presence against cpu_map built from online CPUs. Signed-off-by: Jan Stancek <jstan...@redhat.com> --- tools/perf/util/header.c | 21 + 1 file changed, 17 insertions(+), 4 deletions(-) Changes in v2: - drop out label, use return NULL where possible diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index d89c9c7ef4e5..4b0ea4e92e9d 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -503,24 +503,31 @@ static void free_cpu_topo(struct cpu_topo *tp) static struct cpu_topo *build_cpu_topology(void) { - struct cpu_topo *tp; + struct cpu_topo *tp = NULL; void *addr; u32 nr, i; size_t sz; long ncpus; - int ret = -1; + int ret = 0; + struct cpu_map *map; ncpus = sysconf(_SC_NPROCESSORS_CONF); if (ncpus < 0) return NULL; + /* build online CPU map */ + map = cpu_map__new(NULL); + if (map == NULL) { + pr_debug("failed to get system cpumap\n"); + return NULL; + } + nr = (u32)(ncpus & UINT_MAX); sz = nr * sizeof(char *); - addr = calloc(1, sizeof(*tp) + 2 * sz); if (!addr) - return NULL; + goto out_free; tp = addr; tp->cpu_nr = nr; @@ -530,10 +537,16 @@ static struct cpu_topo *build_cpu_topology(void) tp->thread_siblings = addr; for (i = 0; i < nr; i++) { + if (!cpu_map__has(map, i)) + continue; + ret = build_cpu_topo(tp, i); if (ret < 0) break; } + +out_free: + cpu_map__put(map); if (ret) { free_cpu_topo(tp); tp = NULL; -- 1.8.3.1
[PATCH v2 3/3] perf: replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map
There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs: 1. offline/absent CPUs will have their socket_id and core_id set to -1 which triggers: "socket_id number is too big.You may need to upgrade the perf tool." 2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above. Users of perf_env.cpu[] are using CPU id as index. This can lead to read beyond what was allocated: ==19991== Invalid read of size 4 ==19991==at 0x490CEB: check_cpu_topology (topology.c:69) ==19991==by 0x490CEB: test_session_topology (topology.c:106) ... For example: _SC_NPROCESSORS_CONF == 16 available: 2 nodes (0-1) node 0 cpus: 0 6 8 10 16 22 24 26 node 0 size: 12004 MB node 0 free: 9470 MB node 1 cpus: 1 7 9 11 23 25 27 node 1 size: 12093 MB node 1 free: 9406 MB node distances: node 0 1 0: 10 20 1: 20 10 This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF to max_present_cpu and updates any user of cpu_topology_map to iterate with nr_cpus_avail. As consequence HEADER_CPU_TOPOLOGY core_id and socket_id lists get longer, but maintain compatibility with pre-patch state - index to cpu_topology_map is CPU id. ./perf test 36 -v 36: Session topology : --- start --- test child forked, pid 22211 templ file: /tmp/perf-test-gmdX5i CPU 0, core 0, socket 0 CPU 1, core 0, socket 1 CPU 6, core 10, socket 0 CPU 7, core 10, socket 1 CPU 8, core 1, socket 0 CPU 9, core 1, socket 1 CPU 10, core 9, socket 0 CPU 11, core 9, socket 1 CPU 16, core 0, socket 0 CPU 22, core 10, socket 0 CPU 23, core 10, socket 1 CPU 24, core 1, socket 0 CPU 25, core 1, socket 1 CPU 26, core 9, socket 0 CPU 27, core 9, socket 1 test child finished with 0 end Session topology: Ok Signed-off-by: Jan Stancek --- tools/perf/builtin-stat.c | 2 +- tools/perf/tests/topology.c | 4 +++- tools/perf/util/env.c | 2 +- tools/perf/util/header.c| 16 +--- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a02f2e965628..9e2be1c52fbd 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, struct cpu_map *map, i cpu = map->map[idx]; - if (cpu >= env->nr_cpus_online) + if (cpu >= env->nr_cpus_avail) return -1; return cpu; diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 98fe69ac553c..803f893550d6 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map) session = perf_session__new(, false, NULL); TEST_ASSERT_VAL("can't get session", session); - for (i = 0; i < session->header.env.nr_cpus_online; i++) { + for (i = 0; i < session->header.env.nr_cpus_avail; i++) { + if (!cpu_map__has(map, i)) + continue; pr_debug("CPU %d, core %d, socket %d\n", i, session->header.env.cpu[i].core_id, session->header.env.cpu[i].socket_id); diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index bb964e86b09d..075fc77286bf 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env) return 0; if (env->nr_cpus_avail == 0) - env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF); + env->nr_cpus_avail = cpu__max_present_cpu(); nr_cpus = env->nr_cpus_avail; if (nr_cpus == -1) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 4b0ea4e92e9d..e9384d17ae7d 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -293,11 +293,7 @@ static int write_nrcpus(int fd, struct perf_header *h __maybe_unused, u32 nrc, nra; int ret; - nr = sysconf(_SC_NPROCESSORS_CONF); - if (nr < 0) - return -1; - - nrc = (u32)(nr & UINT_MAX); + nrc = cpu__max_present_cpu(); nr = sysconf(_SC_NPROCESSORS_ONLN); if (nr < 0) @@ -511,9 +507,7 @@ static struct cpu_topo *build_cpu_topology(void) int ret = 0; struct cpu_map *map; - ncpus = sysconf(_SC_NPROCESSORS_CONF); - if (ncpus < 0) - return NULL; + ncpus = cpu__max_present_cpu(); /* build online CPU map */ map = cpu_map__new(NULL); @@ -1137,7 +1131,7 @@ static void print_cpu_topology(struct perf_header *ph, int fd __maybe_unused, { int nr, i; char *str; - int cpu_nr = ph->env.nr_cpus_online; + int cpu_nr = ph->env.nr_cpus_avail;