[PATCH] mm: migrate: initialize err in do_migrate_pages
After commit 236c32eb1096 ("mm: migrate: clean up migrate_prep{_local}"), do_migrate_pages can return uninitialized variable 'err' (which is propagated to user-space as error) when 'from' and 'to' nodesets are identical. This can be reproduced with LTP migrate_pages01, which calls migrate_pages() with same set for both old/new_nodes. Add 'err' initialization back. Fixes: 236c32eb1096 ("mm: migrate: clean up migrate_prep{_local}") Cc: Zi Yan Cc: Yang Shi Cc: Jan Kara Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Song Liu Cc: Andrew Morton Signed-off-by: Jan Stancek --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8cf96bd21341..2c3a86502053 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -,7 +,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { int busy = 0; - int err; + int err = 0; nodemask_t tmp; migrate_prep(); -- 2.18.1
Re: [LTP] [PATCH] selftests: vdso: hash entry size on alpha, s390x is 8 bytes
- Original Message - > > - Original Message - > > Hi! > > As much as it's worth the changes looks good to me. > > > > @Jan: I guess that we can as well fix this in LTP first then we can try > > to get the kernel version fixed... > > Fine by me, I'll give it couple more days then push it. Pushed.
Re: [LTP] [PATCH] selftests: vdso: hash entry size on alpha, s390x is 8 bytes
- Original Message - > Hi! > As much as it's worth the changes looks good to me. > > @Jan: I guess that we can as well fix this in LTP first then we can try > to get the kernel version fixed... Fine by me, I'll give it couple more days then push it. I did repost it with original author CC-ed: https://lore.kernel.org/lkml/93a07b1808e61babef3a20542cbeb4565d3c8410.1596458924.git.jstan...@redhat.com/ but haven't heard back yet.
[PATCH RESEND] selftests: vdso: hash entry size on alpha,s390x is 8 bytes
parse_vdso.c is crashing on 5.8-rc5 s390x, because it wrongly reads nbucket as 0: Program received signal SIGFPE, Arithmetic exception. 0x01000f3e in vdso_sym (version=0x1001280 "LINUX_2.6", name=0x100128a "__vdso_getcpu") at parse_vdso.c:207 207 ELF(Word) chain = vdso_info.bucket[elf_hash(name) % vdso_info.nbucket]; (gdb) p vdso_info.nbucket $1 = 0 Per readelf source: https://sourceware.org/git/?p=binutils-gdb.git;a=blob;f=binutils/readelf.c;h=2406304fe35a832ac53aa7b1a367f3f7afed4264;hb=HEAD#l10027 and objdump output hash entries are double size on 64bit s390 and alpha: 0120 <.hash>: 120: 00 00 00 00 .long 0x 124: 00 00 00 03 .long 0x0003 128: 00 00 00 00 .long 0x 12c: 00 00 00 07 .long 0x0007 130: 00 00 00 00 .long 0x 134: 00 00 00 01 .long 0x0001 138: 00 00 00 00 .long 0x 13c: 00 00 00 05 .long 0x0005 140: 00 00 00 00 .long 0x 144: 00 00 00 06 .long 0x0006 ... 16c: 00 00 00 02 .long 0x0002 170: 00 00 00 00 .long 0x 174: 00 00 00 03 .long 0x0003 178: 00 00 00 00 .long 0x 17c: 00 00 00 04 .long 0x0004 Do similar check in parse_vdso.c and treat hash entries as double word. Signed-off-by: Jan Stancek --- tools/testing/selftests/vDSO/parse_vdso.c | 48 +++ 1 file changed, 40 insertions(+), 8 deletions(-) Resend with original author CC-ed. 
diff --git a/tools/testing/selftests/vDSO/parse_vdso.c b/tools/testing/selftests/vDSO/parse_vdso.c index 413f75620a35..e23efcbd1c88 100644 --- a/tools/testing/selftests/vDSO/parse_vdso.c +++ b/tools/testing/selftests/vDSO/parse_vdso.c @@ -47,8 +47,9 @@ static struct vdso_info /* Symbol table */ ELF(Sym) *symtab; const char *symstrings; - ELF(Word) *bucket, *chain; + void *bucket, *chain; ELF(Word) nbucket, nchain; + bool hash_ent_is_dword; /* Version table */ ELF(Versym) *versym; @@ -69,6 +70,28 @@ static unsigned long elf_hash(const unsigned char *name) return h; } +/* return value of hash table entry */ +ELF(Word) get_hash_val(void *ptr, ELF(Word) idx) +{ + if (vdso_info.hash_ent_is_dword) { + ELF(Xword) *table = ptr; + /* for vdso assume all values fit in Elf Word */ + return (ELF(Word)) table[idx]; + } + + ELF(Word) *table = ptr; + return table[idx]; +} + +/* return pointer to hash table entry */ +void *get_hash_ptr(void *ptr, ELF(Word) idx) +{ + if (vdso_info.hash_ent_is_dword) + return &((ELF(Xword) *) ptr)[idx]; + + return &((ELF(Word) *) ptr)[idx]; +} + void vdso_init_from_sysinfo_ehdr(uintptr_t base) { size_t i; @@ -84,6 +107,14 @@ void vdso_init_from_sysinfo_ehdr(uintptr_t base) return; /* Wrong ELF class -- check ELF_BITS */ } + /* 64bit s390 and alpha have hash entry size of 8 bytes */ + if ((hdr->e_machine == EM_ALPHA + || hdr->e_machine == EM_S390) + && hdr->e_ident[EI_CLASS] == ELFCLASS64) + vdso_info.hash_ent_is_dword = true; + else + vdso_info.hash_ent_is_dword = false; + ELF(Phdr) *pt = (ELF(Phdr)*)(vdso_info.load_addr + hdr->e_phoff); ELF(Dyn) *dyn = 0; @@ -149,11 +180,11 @@ void vdso_init_from_sysinfo_ehdr(uintptr_t base) if (!vdso_info.verdef) vdso_info.versym = 0; - /* Parse the hash table header. 
*/ - vdso_info.nbucket = hash[0]; - vdso_info.nchain = hash[1]; - vdso_info.bucket = &hash[2]; - vdso_info.chain = &hash[vdso_info.nbucket + 2]; + + vdso_info.nbucket = get_hash_val(hash, 0); + vdso_info.nchain = get_hash_val(hash, 1); + vdso_info.bucket = get_hash_ptr(hash, 2); + vdso_info.chain = get_hash_ptr(hash, vdso_info.nbucket + 2); /* That's all we need. */ vdso_info.valid = true; @@ -204,9 +235,10 @@ void *vdso_sym(const char *version, const char *name) return 0; ver_hash = elf_hash(version); - ELF(Word) chain = vdso_info.bucket[elf_hash(name) % vdso_info.nbucket]; + ELF(Word) chain = get_hash_val(vdso_info.bucket, + elf_hash(name) % vdso_info.nbucket); - for (; chain != STN_UNDEF; chain = vdso_info.chain[chain]) { + for (; chain != STN_UNDEF; chain = get_hash_val(vdso_info.chain, chain)) { ELF(Sym) *sym = &vdso_info.symtab[chain]; /* Check for a defined global or weak function w/ right name. */ -- 2.18.1
Re: [PATCH] selftests: vdso: hash entry size on alpha,s390x is 8 bytes
- Original Message - > parse_vdso.c is crashing on 5.8-rc5 s390x, because it wrongly reads > nbucket as 0: > Program received signal SIGFPE, Arithmetic exception. > 0x01000f3e in vdso_sym (version=0x1001280 "LINUX_2.6", > name=0x100128a "__vdso_getcpu") at parse_vdso.c:207 > 207 ELF(Word) chain = vdso_info.bucket[elf_hash(name) % > vdso_info.nbucket]; > (gdb) p vdso_info.nbucket > $1 = 0 > > Per readelf source: > > https://sourceware.org/git/?p=binutils-gdb.git;a=blob;f=binutils/readelf.c;h=2406304fe35a832ac53aa7b1a367f3f7afed4264;hb=HEAD#l10027 > and objdump output hash entries are double size on 64bit s390 and alpha: > 0120 <.hash>: >120: 00 00 00 00 .long 0x >124: 00 00 00 03 .long 0x0003 >128: 00 00 00 00 .long 0x >12c: 00 00 00 07 .long 0x0007 >130: 00 00 00 00 .long 0x >134: 00 00 00 01 .long 0x0001 >138: 00 00 00 00 .long 0x >13c: 00 00 00 05 .long 0x0005 >140: 00 00 00 00 .long 0x >144: 00 00 00 06 .long 0x0006 > ... >16c: 00 00 00 02 .long 0x0002 >170: 00 00 00 00 .long 0x >174: 00 00 00 03 .long 0x0003 >178: 00 00 00 00 .long 0x >17c: 00 00 00 04 .long 0x0004 > > Do similar check in parse_vdso.c and treat hash entries as double word. Ping, any thoughts about the issue or patch?
[PATCH] selftests: vdso: hash entry size on alpha,s390x is 8 bytes
parse_vdso.c is crashing on 5.8-rc5 s390x, because it wrongly reads nbucket as 0: Program received signal SIGFPE, Arithmetic exception. 0x01000f3e in vdso_sym (version=0x1001280 "LINUX_2.6", name=0x100128a "__vdso_getcpu") at parse_vdso.c:207 207 ELF(Word) chain = vdso_info.bucket[elf_hash(name) % vdso_info.nbucket]; (gdb) p vdso_info.nbucket $1 = 0 Per readelf source: https://sourceware.org/git/?p=binutils-gdb.git;a=blob;f=binutils/readelf.c;h=2406304fe35a832ac53aa7b1a367f3f7afed4264;hb=HEAD#l10027 and objdump output hash entries are double size on 64bit s390 and alpha: 0120 <.hash>: 120: 00 00 00 00 .long 0x 124: 00 00 00 03 .long 0x0003 128: 00 00 00 00 .long 0x 12c: 00 00 00 07 .long 0x0007 130: 00 00 00 00 .long 0x 134: 00 00 00 01 .long 0x0001 138: 00 00 00 00 .long 0x 13c: 00 00 00 05 .long 0x0005 140: 00 00 00 00 .long 0x 144: 00 00 00 06 .long 0x0006 ... 16c: 00 00 00 02 .long 0x0002 170: 00 00 00 00 .long 0x 174: 00 00 00 03 .long 0x0003 178: 00 00 00 00 .long 0x 17c: 00 00 00 04 .long 0x0004 Do similar check in parse_vdso.c and treat hash entries as double word. 
Signed-off-by: Jan Stancek --- tools/testing/selftests/vDSO/parse_vdso.c | 48 +++ 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/vDSO/parse_vdso.c b/tools/testing/selftests/vDSO/parse_vdso.c index 413f75620a35..e23efcbd1c88 100644 --- a/tools/testing/selftests/vDSO/parse_vdso.c +++ b/tools/testing/selftests/vDSO/parse_vdso.c @@ -47,8 +47,9 @@ static struct vdso_info /* Symbol table */ ELF(Sym) *symtab; const char *symstrings; - ELF(Word) *bucket, *chain; + void *bucket, *chain; ELF(Word) nbucket, nchain; + bool hash_ent_is_dword; /* Version table */ ELF(Versym) *versym; @@ -69,6 +70,28 @@ static unsigned long elf_hash(const unsigned char *name) return h; } +/* return value of hash table entry */ +ELF(Word) get_hash_val(void *ptr, ELF(Word) idx) +{ + if (vdso_info.hash_ent_is_dword) { + ELF(Xword) *table = ptr; + /* for vdso assume all values fit in Elf Word */ + return (ELF(Word)) table[idx]; + } + + ELF(Word) *table = ptr; + return table[idx]; +} + +/* return pointer to hash table entry */ +void *get_hash_ptr(void *ptr, ELF(Word) idx) +{ + if (vdso_info.hash_ent_is_dword) + return &((ELF(Xword) *) ptr)[idx]; + + return &((ELF(Word) *) ptr)[idx]; +} + void vdso_init_from_sysinfo_ehdr(uintptr_t base) { size_t i; @@ -84,6 +107,14 @@ void vdso_init_from_sysinfo_ehdr(uintptr_t base) return; /* Wrong ELF class -- check ELF_BITS */ } + /* 64bit s390 and alpha have hash entry size of 8 bytes */ + if ((hdr->e_machine == EM_ALPHA + || hdr->e_machine == EM_S390) + && hdr->e_ident[EI_CLASS] == ELFCLASS64) + vdso_info.hash_ent_is_dword = true; + else + vdso_info.hash_ent_is_dword = false; + ELF(Phdr) *pt = (ELF(Phdr)*)(vdso_info.load_addr + hdr->e_phoff); ELF(Dyn) *dyn = 0; @@ -149,11 +180,11 @@ void vdso_init_from_sysinfo_ehdr(uintptr_t base) if (!vdso_info.verdef) vdso_info.versym = 0; - /* Parse the hash table header. 
*/ - vdso_info.nbucket = hash[0]; - vdso_info.nchain = hash[1]; - vdso_info.bucket = &hash[2]; - vdso_info.chain = &hash[vdso_info.nbucket + 2]; + + vdso_info.nbucket = get_hash_val(hash, 0); + vdso_info.nchain = get_hash_val(hash, 1); + vdso_info.bucket = get_hash_ptr(hash, 2); + vdso_info.chain = get_hash_ptr(hash, vdso_info.nbucket + 2); /* That's all we need. */ vdso_info.valid = true; @@ -204,9 +235,10 @@ void *vdso_sym(const char *version, const char *name) return 0; ver_hash = elf_hash(version); - ELF(Word) chain = vdso_info.bucket[elf_hash(name) % vdso_info.nbucket]; + ELF(Word) chain = get_hash_val(vdso_info.bucket, + elf_hash(name) % vdso_info.nbucket); - for (; chain != STN_UNDEF; chain = vdso_info.chain[chain]) { + for (; chain != STN_UNDEF; chain = get_hash_val(vdso_info.chain, chain)) { ELF(Sym) *sym = &vdso_info.symtab[chain]; /* Check for a defined global or weak function w/ right name. */ -- 2.18.1
Re: [LTP] 303cc571d1: ltp.setns01.fail
- Original Message - > I'll send a pr for this to Linus this week (or next since I'm on > vacation this week) and get this fixed. Thanks for spotting this. What's > the Reported-by: line format that LTP uses? I'm not sure we ever used one, it's usually various CI systems that reference LTP. This thread has been started by kernel test robot: Reported-by: kernel test robot
Re: [LTP] 303cc571d1: ltp.setns01.fail
- Original Message - > Hi! > > setns01 6 TFAIL : setns01.c:176: regular file fd exp_errno=22: > > errno=EBADF(9): Bad file descriptor > > setns01 0 TINFO : setns(12, 0x2) > > setns01 7 TFAIL : setns01.c:176: regular file fd exp_errno=22: > > errno=EBADF(9): Bad file descriptor > > setns01 0 TINFO : setns(12, 0x4000) > > setns01 8 TFAIL : setns01.c:176: regular file fd exp_errno=22: > > errno=EBADF(9): Bad file descriptor > > setns01 0 TINFO : setns(12, 0x2000) > > setns01 9 TFAIL : setns01.c:176: regular file fd exp_errno=22: > > errno=EBADF(9): Bad file descriptor > > setns01 0 TINFO : setns(12, 0x400) > > setns0110 TFAIL : setns01.c:176: regular file fd exp_errno=22: > > errno=EBADF(9): Bad file descriptor > > The messages here are a bit cryptic, I will fix that later on, but what > it means here is that the errno has changed from EINVAL to EBADF in a > case we pass file descriptor to a regular file to setns(). I posted a series that accepts both errnos about week ago: https://lists.linux.it/pipermail/ltp/2020-June/017467.html
Re: LTP: syscalls: regression on mainline - ioctl_loop01 mknod07 setns01
- Original Message - > Following three test cases reported as regression on Linux mainline kernel > on x86_64, arm64, arm and i386 > > ltp-syscalls-tests: > * ioctl_loop01 > * mknod07 Test updated: https://github.com/linux-test-project/ltp/commit/13fcfa2d6bdd1fb71c4528b47170e8e8fb3a8a32 > * setns01 commit 303cc571d107 ("nsproxy: attach to namespaces via pidfds") changed errno that is returned for regular file from EINVAL to EBADF. This appears to fit more current man page, so I think we need to fix test to accept both. (I'm looking into that)
[bug?] LTP pt_test failing after 38bb8d77d0b9 "perf/x86/intel/pt: Split ToPA metadata and page layout"
Hi, All variants of pt_test: https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/tracing/pt_test/pt_test.c started failing after: 38bb8d77d0b9 ("perf/x86/intel/pt: Split ToPA metadata and page layout") with following error on console/dmesg: pt: ToPA ERROR encountered, trying to recover on Broadwell-EP: vendor_id : GenuineIntel cpu family : 6 model : 79 model name : Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz stepping: 1 Regards, Jan
Re: [PATCH] fat: fix corruption in fat_alloc_new_dir()
- Original Message - > Jan Stancek writes: > > > sb_getblk does not guarantee that buffer_head is uptodate. If there is > > async read running in parallel for same buffer_head, it can overwrite > > just initialized msdos_dir_entry, leading to corruption: > > FAT-fs (loop0): error, corrupted directory (invalid entries) > > FAT-fs (loop0): Filesystem has been set read-only > > > > This can happen for example during LTP statx04, which creates loop > > device, formats it (mkfs.vfat), mounts it and immediately creates > > a new directory. In parallel, systemd-udevd is probing new block > > device, which leads to async read. > > > > do_mkdirat ksys_read > >vfs_mkdir vfs_read > > vfat_mkdir __vfs_read > > fat_alloc_new_dir new_sync_read > >/* init de[0], de[1] */blkdev_read_iter > >generic_file_read_iter > > generic_file_buffered_read > > blkdev_readpage > > block_read_full_page > > > > Faster reproducer (based on LTP statx04): > > > > int main(void) > > { > > int i, j, ret, fd, loop_fd, ctrl_fd; > > int loop_num; > > char loopdev[256], tmp[256], testfile[256]; > > > > mkdir("/tmp/mntpoint", 0777); > > for (i = 0; ; i++) { > > printf("Iteration: %d\n", i); > > sprintf(testfile, "/tmp/test.img.%d", getpid()); > > > > ctrl_fd = open("/dev/loop-control", O_RDWR); > > loop_num = ioctl(ctrl_fd, LOOP_CTL_GET_FREE); > > close(ctrl_fd); > > sprintf(loopdev, "/dev/loop%d", loop_num); > > > > fd = open(testfile, O_WRONLY|O_CREAT|O_TRUNC, 0600); > > fallocate(fd, 0, 0, 256*1024*1024); > > close(fd); > > > > fd = open(testfile, O_RDWR); > > loop_fd = open(loopdev, O_RDWR); > > ioctl(loop_fd, LOOP_SET_FD, fd); > > close(loop_fd); > > close(fd); > > > > sprintf(tmp, "mkfs.vfat %s", loopdev); > > system(tmp); > > mount(loopdev, "/tmp/mntpoint", "vfat", 0, NULL); > > > > for (j = 0; j < 200; j++) { > > sprintf(tmp, "/tmp/mntpoint/testdir%d", j); > > ret = mkdir(tmp, 0777); > > if (ret) { > > perror("mkdir"); > > break; > > } > > } > > > > umount("/tmp/mntpoint"); > > loop_fd = 
open(loopdev, O_RDWR); > > ioctl(loop_fd, LOOP_CLR_FD, fd); > > close(loop_fd); > > unlink(testfile); > > > > if (ret) > > break; > > } > > > > return 0; > > } > > > > Issue triggers within minute on HPE Apollo 70 (arm64, 64GB RAM, 224 CPUs). > > Using the device while mounting same device doesn't work reliably like > this race. (getblk() is intentionally used to get the buffer to write > new data.) Are you saying this is expected even if 'usage' is just read? > > mount(2) internally opens the device by EXCL mode, so I guess udev opens > without EXCL (I dont know if it is intent or not). I gave this a try and added O_EXCL to udev-builtin-blkid.c. My system had trouble booting, it was getting stuck on mounting LVM volumes. So, I'm not sure how to move forward here. Regards, Jan
[tip: x86/urgent] x86/timer: Force PIT initialization when !X86_FEATURE_ARAT
The following commit has been merged into the x86/urgent branch of tip: Commit-ID: afa8b475c1aec185a8e106c48b3832e0b88bc2de Gitweb: https://git.kernel.org/tip/afa8b475c1aec185a8e106c48b3832e0b88bc2de Author:Jan Stancek AuthorDate:Sun, 08 Sep 2019 00:50:40 +02:00 Committer: Thomas Gleixner CommitterDate: Sun, 08 Sep 2019 09:01:15 +02:00 x86/timer: Force PIT initialization when !X86_FEATURE_ARAT KVM guests with commit c8c4076723da ("x86/timer: Skip PIT initialization on modern chipsets") applied to guest kernel have been observed to have unusually higher CPU usage with symptoms of increase in vm exits for HLT and MSW_WRITE (MSR_IA32_TSCDEADLINE). This is caused by older QEMUs lacking support for X86_FEATURE_ARAT. lapic clock retains CLOCK_EVT_FEAT_C3STOP and nohz stays inactive. There's no usable broadcast device either. Do the PIT initialization if guest CPU lacks X86_FEATURE_ARAT. On real hardware it shouldn't matter as ARAT and DEADLINE come together. Fixes: c8c4076723da ("x86/timer: Skip PIT initialization on modern chipsets") Signed-off-by: Jan Stancek Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/apic.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index dba2828..f91b3ff 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -834,6 +834,10 @@ bool __init apic_needs_pit(void) if (!boot_cpu_has(X86_FEATURE_APIC)) return true; + /* Virt guests may lack ARAT, but still have DEADLINE */ + if (!boot_cpu_has(X86_FEATURE_ARAT)) + return true; + /* Deadline timer is based on TSC so no further PIT action required */ if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) return false;
[tip: x86/urgent] x86/timer: Force PIT initialization when !X86_FEATURE_ARAT
The following commit has been merged into the x86/urgent branch of tip: Commit-ID: afa8b475c1aec185a8e106c48b3832e0b88bc2de Gitweb: https://git.kernel.org/tip/afa8b475c1aec185a8e106c48b3832e0b88bc2de Author:Jan Stancek AuthorDate:Sun, 08 Sep 2019 00:50:40 +02:00 Committer: Thomas Gleixner CommitterDate: Sun, 08 Sep 2019 09:01:15 +02:00 x86/timer: Force PIT initialization when !X86_FEATURE_ARAT KVM guests with commit c8c4076723da ("x86/timer: Skip PIT initialization on modern chipsets") applied to guest kernel have been observed to have unusually higher CPU usage with symptoms of increase in vm exits for HLT and MSW_WRITE (MSR_IA32_TSCDEADLINE). This is caused by older QEMUs lacking support for X86_FEATURE_ARAT. lapic clock retains CLOCK_EVT_FEAT_C3STOP and nohz stays inactive. There's no usable broadcast device either. Do the PIT initialization if guest CPU lacks X86_FEATURE_ARAT. On real hardware it shouldn't matter as ARAT and DEADLINE come together. Fixes: c8c4076723da ("x86/timer: Skip PIT initialization on modern chipsets") Signed-off-by: Jan Stancek Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/apic.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index dba2828..f91b3ff 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -834,6 +834,10 @@ bool __init apic_needs_pit(void) if (!boot_cpu_has(X86_FEATURE_APIC)) return true; + /* Virt guests may lack ARAT, but still have DEADLINE */ + if (!boot_cpu_has(X86_FEATURE_ARAT)) + return true; + /* Deadline timer is based on TSC so no further PIT action required */ if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) return false;
[PATCH] fat: fix corruption in fat_alloc_new_dir()
sb_getblk does not guarantee that buffer_head is uptodate. If there is async read running in parallel for same buffer_head, it can overwrite just initialized msdos_dir_entry, leading to corruption: FAT-fs (loop0): error, corrupted directory (invalid entries) FAT-fs (loop0): Filesystem has been set read-only This can happen for example during LTP statx04, which creates loop device, formats it (mkfs.vfat), mounts it and immediately creates a new directory. In parallel, systemd-udevd is probing new block device, which leads to async read. do_mkdirat ksys_read vfs_mkdir vfs_read vfat_mkdir __vfs_read fat_alloc_new_dir new_sync_read /* init de[0], de[1] */blkdev_read_iter generic_file_read_iter generic_file_buffered_read blkdev_readpage block_read_full_page Faster reproducer (based on LTP statx04): - 8< - int main(void) { int i, j, ret, fd, loop_fd, ctrl_fd; int loop_num; char loopdev[256], tmp[256], testfile[256]; mkdir("/tmp/mntpoint", 0777); for (i = 0; ; i++) { printf("Iteration: %d\n", i); sprintf(testfile, "/tmp/test.img.%d", getpid()); ctrl_fd = open("/dev/loop-control", O_RDWR); loop_num = ioctl(ctrl_fd, LOOP_CTL_GET_FREE); close(ctrl_fd); sprintf(loopdev, "/dev/loop%d", loop_num); fd = open(testfile, O_WRONLY|O_CREAT|O_TRUNC, 0600); fallocate(fd, 0, 0, 256*1024*1024); close(fd); fd = open(testfile, O_RDWR); loop_fd = open(loopdev, O_RDWR); ioctl(loop_fd, LOOP_SET_FD, fd); close(loop_fd); close(fd); sprintf(tmp, "mkfs.vfat %s", loopdev); system(tmp); mount(loopdev, "/tmp/mntpoint", "vfat", 0, NULL); for (j = 0; j < 200; j++) { sprintf(tmp, "/tmp/mntpoint/testdir%d", j); ret = mkdir(tmp, 0777); if (ret) { perror("mkdir"); break; } } umount("/tmp/mntpoint"); loop_fd = open(loopdev, O_RDWR); ioctl(loop_fd, LOOP_CLR_FD, fd); close(loop_fd); unlink(testfile); if (ret) break; } return 0; } - 8< ------------- Issue triggers within minute on HPE Apollo 70 (arm64, 64GB RAM, 224 CPUs). 
Signed-off-by: Jan Stancek Cc: OGAWA Hirofumi --- fs/fat/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 1bda2ab6745b..474fd6873ec8 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -1149,7 +1149,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts) goto error; blknr = fat_clus_to_blknr(sbi, cluster); - bhs[0] = sb_getblk(sb, blknr); + bhs[0] = sb_bread(sb, blknr); if (!bhs[0]) { err = -ENOMEM; goto error_free; -- 1.8.3.1
Re: [PATCH v5.2 1/2] locking/rwsem: Add missing ACQUIRE to read_slowpath exit when queue is empty
- Original Message - > From: Jan Stancek > > [ Upstream commit e1b98fa316648420d0434d9ff5b92ad6609ba6c3 ] > > LTP mtest06 has been observed to occasionally hit "still mapped when > deleted" and following BUG_ON on arm64. > > The extra mapcount originated from pagefault handler, which handled > pagefault for vma that has already been detached. vma is detached > under mmap_sem write lock by detach_vmas_to_be_unmapped(), which > also invalidates vmacache. > > When the pagefault handler (under mmap_sem read lock) calls > find_vma(), vmacache_valid() wrongly reports vmacache as valid. > > After rwsem down_read() returns via 'queue empty' path (as of v5.2), > it does so without an ACQUIRE on sem->count: > > down_read() > __down_read() > rwsem_down_read_failed() > __rwsem_down_read_failed_common() > raw_spin_lock_irq(&sem->wait_lock); > if (list_empty(&sem->wait_list)) { > if (atomic_long_read(&sem->count) >= 0) { > raw_spin_unlock_irq(&sem->wait_lock); > return sem; > > The problem can be reproduced by running LTP mtest06 in a loop and > building the kernel (-j $NCPUS) in parallel. It does reproduces since > v4.20 on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It > triggers reliably in about an hour. > > The patched kernel ran fine for 10+ hours. > > Signed-off-by: Jan Stancek > Signed-off-by: Peter Zijlstra (Intel) > Reviewed-by: Will Deacon > Acked-by: Waiman Long > Cc: Linus Torvalds > Cc: Peter Zijlstra > Cc: Thomas Gleixner > Cc: dbu...@suse.de > Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty & > no writer") > Link: > https://lkml.kernel.org/r/50b8914e20d1d62bb2dee42d342836c2c16ebee7.1563438048.git.jstan...@redhat.com > Signed-off-by: Ingo Molnar > Signed-off-by: Sasha Levin > --- > > This is a backport for the v5.2 stable tree. There were multiple reports > of this issue being hit. > > Given that there were a few changes to the code around this, I'd > appreciate an ack before pulling it in. ACK, both look good to me. 
I also re-ran reproducer with this series applied on top of 5.2.10, it PASS-ed. Thanks, Jan
Re: Linux-next-20190823: x86_64/i386: prot_hsymlinks.c:325: Failed to run cmd: useradd hsym
- Original Message - > On Tue, 2019-08-27 at 06:25 -0400, Jan Stancek wrote: > > That theory is probably not correct for this case, since EIO I see > > appears > > to originate from write and nfs_writeback_result(). This function > > also > > produces message we saw in logs from Naresh. > > > > I can't find where/how is resp->count updated on WRITE reply in > > NFSv2. > > Issue also goes away with patch below, though I can't speak about its > > correctness: > > > > NFS version TypeTestReturn code > > nfsvers=2 tcp -b:base 0 > > nfsvers=2 tcp -g:general 0 > > nfsvers=2 tcp -s:special 0 > > nfsvers=2 tcp -l:lock 0 > > Total time: 141 > > > > diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c > > index cbc17a203248..4913c6da270b 100644 > > --- a/fs/nfs/nfs2xdr.c > > +++ b/fs/nfs/nfs2xdr.c > > @@ -897,6 +897,16 @@ static int nfs2_xdr_dec_writeres(struct rpc_rqst > > *req, struct xdr_stream *xdr, > > void *data) > > { > > struct nfs_pgio_res *result = data; > > + struct rpc_task *rq_task = req->rq_task; > > + > > + if (rq_task) { > > + struct nfs_pgio_args *args = rq_task- > > >tk_msg.rpc_argp; > > + > > + if (args) { > > + result->count = args->count; > > + } > > + } > > > > /* All NFSv2 writes are "file sync" writes */ > > result->verf->committed = NFS_FILE_SYNC; > > Thanks! 
I've moved the above to nfs_write_done() so that we do it only > on success (see > http://git.linux-nfs.org/?p=trondmy/linux-nfs.git;a=commitdiff;h=3ba5688da709dd0f7d917029c206bc1848a6ae74 > ) Thanks, retested with 3ba5688da, all PASS: NFS version TypeTestReturn code nfsvers=2 tcp -b:base 0 nfsvers=2 tcp -g:general 0 nfsvers=2 tcp -s:special 0 nfsvers=2 tcp -l:lock 0 NFS version TypeTestReturn code nfsvers=3 tcp -b:base 0 nfsvers=3 tcp -g:general 0 nfsvers=3 tcp -s:special 0 nfsvers=3 tcp -l:lock 0 nfsvers=3 tcp6-b:base 0 nfsvers=3 tcp6-g:general 0 nfsvers=3 tcp6-s:special 0 nfsvers=3 tcp6-l:lock 0 NFS version TypeTestReturn code nfsvers=4 tcp -b:base 0 nfsvers=4 tcp -g:general 0 nfsvers=4 tcp -s:special 0 nfsvers=4 tcp -l:lock 0 nfsvers=4 tcp6-b:base 0 nfsvers=4 tcp6-g:general 0 nfsvers=4 tcp6-s:special 0 nfsvers=4 tcp6-l:lock 0 Feel free to add also: Reported-by: Naresh Kamboju Tested-by: Jan Stancek
Re: Linux-next-20190823: x86_64/i386: prot_hsymlinks.c:325: Failed to run cmd: useradd hsym
- Original Message - > On Mon, 2019-08-26 at 19:12 -0400, Jan Stancek wrote: > > - Original Message - > > > On Mon, 2019-08-26 at 10:38 -0400, Jan Stancek wrote: > > > > - Original Message - > > > > > Hi Jan and Cyril, > > > > > > > > > > On Mon, 26 Aug 2019 at 16:35, Jan Stancek > > > > > wrote: > > > > > > > > > > > > - Original Message - > > > > > > > Hi! > > > > > > > > Do you see this LTP prot_hsymlinks failure on linux next > > > > > > > > 20190823 on > > > > > > > > x86_64 and i386 devices? > > > > > > > > > > > > > > > > test output log, > > > > > > > > useradd: failure while writing changes to /etc/passwd > > > > > > > > useradd: /home/hsym was created, but could not be removed > > > > > > > > > > > > > > This looks like an unrelated problem, failure to write to > > > > > > > /etc/passwd > > > > > > > probably means that filesystem is full or some problem > > > > > > > happend > > > > > > > and how > > > > > > > is remounted RO. > > > > > > > > > > > > In Naresh' example, root is on NFS: > > > > > > root=/dev/nfs rw > > > > > > > > > > > > nfsroot=10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extr > > > > > > act- > > > > > > nfsrootfs-tyuevoxm,tcp,hard,intr > > > > > > > > > > Right ! > > > > > root is mounted on NFS. 
> > > > > > > > > > > 10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract- > > > > > > nfsrootfs-tyuevoxm > > > > > > on / type nfs > > > > > > (rw,relatime,vers=2,rsize=4096,wsize=4096,namlen=255,hard,nol > > > > > > ock, > > > > > > proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.66.16.123, > > > > > > moun > > > > > > tvers=1,mountproto=tcp,local_lock=all,addr=10.66.16.123) > > > > > > devtmpfs on /dev type devtmpfs > > > > > > (rw,relatime,size=3977640k,nr_inodes=994410,mode=755) > > > > > > > > > > > > The only thing I can think of that might cause an EIO on NFSv2 > > > would be > > > this patch > > > http://git.linux-nfs.org/?p=trondmy/linux-nfs.git;a=commitdiff;h=627d48e597ec5993c4abb3b81dc75e554a07c7c0 > > > assuming that a bind-related error is leaking through. > > > > > > I'd suggest something like the following to fix it up: > > > > No change with that patch, > > but following one fixes it for me: > > > > diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c > > index 20b3717cd7ca..56cefa0ab804 100644 > > --- a/fs/nfs/pagelist.c > > +++ b/fs/nfs/pagelist.c > > @@ -590,7 +590,7 @@ static void nfs_pgio_rpcsetup(struct > > nfs_pgio_header *hdr, > > } > > > > hdr->res.fattr = &hdr->fattr; > > - hdr->res.count = 0; > > + hdr->res.count = count; > > hdr->res.eof = 0; > > hdr->res.verf= &hdr->verf; > > nfs_fattr_init(&hdr->fattr); > > > > which is functionally revert of "NFS: Fix initialisation of I/O > > result struct in nfs_pgio_rpcsetup". > > > > This hunk caught my eye, could res.eof == 0 explain those I/O errors? > > Interesting hypothesis. It could if res.count ends up being 0. So does > the following also fix the problem? It didn't fix it. That theory is probably not correct for this case, since EIO I see appears to originate from write and nfs_writeback_result(). This function also produces message we saw in logs from Naresh. I can't find where/how is resp->count updated on WRITE reply in NFSv2. 
Issue also goes away with patch below, though I can't speak about its correctness: NFS version TypeTestReturn code nfsvers=2 tcp -b:base 0 nfsvers=2 tcp -g:general 0 nfsvers=2 tcp -s:special 0 nfsvers=2 tcp -l:lock 0 Total time: 141 diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index cbc17a203248..4913c6da270b 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -897,6 +897,16 @@ static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr, void *data) { struct nfs_pgio_res *result = data; + struct rpc_task *rq_task = req->rq_task; + + if (rq_task) { + struct nfs_pgio_args *args = rq_task->tk_msg.rpc_argp; + + if (args) { + result->count = args->count; + } + } /* All NFSv2 writes are "file sync" writes */ result->verf->committed = NFS_FILE_SYNC;
Re: Linux-next-20190823: x86_64/i386: prot_hsymlinks.c:325: Failed to run cmd: useradd hsym
- Original Message - > On Mon, 2019-08-26 at 10:38 -0400, Jan Stancek wrote: > > - Original Message - > > > Hi Jan and Cyril, > > > > > > On Mon, 26 Aug 2019 at 16:35, Jan Stancek > > > wrote: > > > > > > > > > > > > - Original Message - > > > > > Hi! > > > > > > Do you see this LTP prot_hsymlinks failure on linux next > > > > > > 20190823 on > > > > > > x86_64 and i386 devices? > > > > > > > > > > > > test output log, > > > > > > useradd: failure while writing changes to /etc/passwd > > > > > > useradd: /home/hsym was created, but could not be removed > > > > > > > > > > This looks like an unrelated problem, failure to write to > > > > > /etc/passwd > > > > > probably means that filesystem is full or some problem happend > > > > > and how > > > > > is remounted RO. > > > > > > > > In Naresh' example, root is on NFS: > > > > root=/dev/nfs rw > > > > > > > > nfsroot=10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract- > > > > nfsrootfs-tyuevoxm,tcp,hard,intr > > > > > > Right ! > > > root is mounted on NFS. > > > > > > > 10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract- > > > > nfsrootfs-tyuevoxm > > > > on / type nfs > > > > (rw,relatime,vers=2,rsize=4096,wsize=4096,namlen=255,hard,nolock, > > > > proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.66.16.123,moun > > > > tvers=1,mountproto=tcp,local_lock=all,addr=10.66.16.123) > > > > devtmpfs on /dev type devtmpfs > > > > (rw,relatime,size=3977640k,nr_inodes=994410,mode=755) > > > > > > The only thing I can think of that might cause an EIO on NFSv2 would be > this patch > http://git.linux-nfs.org/?p=trondmy/linux-nfs.git;a=commitdiff;h=627d48e597ec5993c4abb3b81dc75e554a07c7c0 > assuming that a bind-related error is leaking through. 
> > I'd suggest something like the following to fix it up: No change with that patch, but following one fixes it for me: diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 20b3717cd7ca..56cefa0ab804 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -590,7 +590,7 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr, } hdr->res.fattr = &hdr->fattr; - hdr->res.count = 0; + hdr->res.count = count; hdr->res.eof = 0; hdr->res.verf= &hdr->verf; nfs_fattr_init(&hdr->fattr); which is functionally revert of "NFS: Fix initialisation of I/O result struct in nfs_pgio_rpcsetup". This hunk caught my eye, could res.eof == 0 explain those I/O errors? /* Emulate the eof flag, which isn't normally needed in NFSv2 * as it is guaranteed to always return the file attributes */ if (hdr->args.offset + hdr->res.count >= hdr->res.fattr->size) hdr->res.eof = 1;
Re: Linux-next-20190823: x86_64/i386: prot_hsymlinks.c:325: Failed to run cmd: useradd hsym
- Original Message - > Hi Jan and Cyril, > > On Mon, 26 Aug 2019 at 16:35, Jan Stancek wrote: > > > > > > > > - Original Message - > > > Hi! > > > > Do you see this LTP prot_hsymlinks failure on linux next 20190823 on > > > > x86_64 and i386 devices? > > > > > > > > test output log, > > > > useradd: failure while writing changes to /etc/passwd > > > > useradd: /home/hsym was created, but could not be removed > > > > > > This looks like an unrelated problem, failure to write to /etc/passwd > > > probably means that filesystem is full or some problem happend and how > > > is remounted RO. > > > > In Naresh' example, root is on NFS: > > root=/dev/nfs rw > > > > nfsroot=10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract-nfsrootfs-tyuevoxm,tcp,hard,intr > > Right ! > root is mounted on NFS. > > > > > 10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract-nfsrootfs-tyuevoxm > > on / type nfs > > (rw,relatime,vers=2,rsize=4096,wsize=4096,namlen=255,hard,nolock,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.66.16.123,mountvers=1,mountproto=tcp,local_lock=all,addr=10.66.16.123) > > devtmpfs on /dev type devtmpfs > > (rw,relatime,size=3977640k,nr_inodes=994410,mode=755) > > > > Following message repeats couple times in logs: > > NFS: Server wrote zero bytes, expected XXX > > > > Naresh, can you check if there are any errors on NFS server side? > > I have re-tested the failed tests on next-20190822 and all get pass > which is also > using same NFS server [1] [2]. Thanks, that suggests some client side change between next-20190822 and next-20190823 might introduced it. > > > Maybe run NFS cthon against that server with client running next-20190822 > > and next-20190823. > > Thanks for the pointers. > I will setup and run NFS cthon on next-20190822 and next-20190823. I'll try to reproduce too. > > > > > > > > > I do not see the kernel messages from this job anywhere at the job > > > pages, is it stored somewhere? 
> > > > It appears to be mixed in same log file: > > > > https://qa-reports.linaro.org/lkft/linux-next-oe/build/next-20190823/testrun/886412/log > > For the record the following tests failed on linux -next-20190823 on x86_64 > and i386. The filesystem is mounted on NFS and tests are using > locally mounted hard drive ( with -d /scratch ). > > Juno-r2 device filesystem mounted on NFS and did not see these errors > and test getting pass on -next-20190823. > > These failures are reproducible all time on next-20190823 kernel on x86_64 > and i386 device with root mounted on NFS [3] [4] [5] [6]. > > I will git bisect to find out which is bad commit. > > prot_hsymlinks: [3] > -- > useradd: failure while writing changes to /etc/passwd > useradd: /home/hsym was created, but could not be removed > userdel: user 'hsym' does not exist > prot_hsymlinks1 TBROK : prot_hsymlinks.c:325: Failed to run > cmd: useradd hsym > prot_hsymlinks2 TBROK : prot_hsymlinks.c:325: Remaining cases broken > prot_hsymlinks3 TBROK : prot_hsymlinks.c:325: Failed to run > cmd: userdel -r hsym > prot_hsymlinks4 TBROK : tst_sig.c:234: unexpected signal > SIGIOT/SIGABRT(6) received (pid = 8324). > > logrotate01: [4] > - > compressing log with: /bin/gzip > error: error creating temp state file /var/lib/logrotate.status.tmp: > Input/output error > logrotate011 TFAIL : ltpapicmd.c:154: Test #1: logrotate > command exited with 1 return code. Output: > > sem_unlink_2-2: [5] > -- > make[3]: Entering directory > '/opt/ltp/testcases/open_posix_testsuite/conformance/interfaces/sem_unlink' > cat: write error: Input/output error > conformance/interfaces/sem_unlink/sem_unlink_2-2: execution: FAILED > > syslog{01 ...10} [6] > --- > cp: failed to close '/etc/syslog.conf.ltpback': Input/output error > syslog011 TBROK : ltpapicmd.c:188: failed to backup /etc/syslog.conf > > cp: failed to close '/etc/syslog.conf.ltpback': Input/output error > syslog021 TBROK : ltpapicmd.c:188: failed to backup /etc/syslog.conf > > ... 
> cp: failed to close '/etc/syslog.conf.ltpback': Input/output error > syslog101 TBROK : ltpapicmd.c:188: failed to backup /etc/syslog.conf > > ref: > PASS on 20190822: > [1] https://lkft.validation.linaro.org/scheduler/job/890446#L1232 > [2] https://lkft.validation.linaro.org/scheduler/job/890454 > > FAILED on 20190823: > [3] https://lkft.validation.linaro.org/scheduler/job/890404#L1245 > [4] https://lkft.validation.linaro.org/scheduler/job/886408#L2544 > [5] https://lkft.validation.linaro.org/scheduler/job/886409#L3088 > [6] https://lkft.validation.linaro.org/scheduler/job/890400#L1234 > > - Naresh >
Re: Linux-next-20190823: x86_64/i386: prot_hsymlinks.c:325: Failed to run cmd: useradd hsym
- Original Message - > Hi! > > Do you see this LTP prot_hsymlinks failure on linux next 20190823 on > > x86_64 and i386 devices? > > > > test output log, > > useradd: failure while writing changes to /etc/passwd > > useradd: /home/hsym was created, but could not be removed > > This looks like an unrelated problem, failure to write to /etc/passwd > probably means that filesystem is full or some problem happened and now > is remounted RO. In Naresh' example, root is on NFS: root=/dev/nfs rw nfsroot=10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract-nfsrootfs-tyuevoxm,tcp,hard,intr 10.66.16.123:/var/lib/lava/dispatcher/tmp/886412/extract-nfsrootfs-tyuevoxm on / type nfs (rw,relatime,vers=2,rsize=4096,wsize=4096,namlen=255,hard,nolock,proto=tcp,timeo=600,retrans=2,sec=sys,mountaddr=10.66.16.123,mountvers=1,mountproto=tcp,local_lock=all,addr=10.66.16.123) devtmpfs on /dev type devtmpfs (rw,relatime,size=3977640k,nr_inodes=994410,mode=755) Following message repeats a couple of times in logs: NFS: Server wrote zero bytes, expected XXX Naresh, can you check if there are any errors on NFS server side? Maybe run NFS cthon against that server with client running next-20190822 and next-20190823. > > I do not see the kernel messages from this job anywhere at the job > pages, is it stored somewhere? It appears to be mixed in same log file: https://qa-reports.linaro.org/lkft/linux-next-oe/build/next-20190823/testrun/886412/log
[tip:locking/core] locking/rwsem: Add missing ACQUIRE to read_slowpath exit when queue is empty
Commit-ID: e1b98fa316648420d0434d9ff5b92ad6609ba6c3 Gitweb: https://git.kernel.org/tip/e1b98fa316648420d0434d9ff5b92ad6609ba6c3 Author: Jan Stancek AuthorDate: Thu, 18 Jul 2019 10:51:25 +0200 Committer: Ingo Molnar CommitDate: Thu, 25 Jul 2019 15:39:23 +0200 locking/rwsem: Add missing ACQUIRE to read_slowpath exit when queue is empty LTP mtest06 has been observed to occasionally hit "still mapped when deleted" and following BUG_ON on arm64. The extra mapcount originated from pagefault handler, which handled pagefault for vma that has already been detached. vma is detached under mmap_sem write lock by detach_vmas_to_be_unmapped(), which also invalidates vmacache. When the pagefault handler (under mmap_sem read lock) calls find_vma(), vmacache_valid() wrongly reports vmacache as valid. After rwsem down_read() returns via 'queue empty' path (as of v5.2), it does so without an ACQUIRE on sem->count: down_read() __down_read() rwsem_down_read_failed() __rwsem_down_read_failed_common() raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) { if (atomic_long_read(&sem->count) >= 0) { raw_spin_unlock_irq(&sem->wait_lock); return sem; The problem can be reproduced by running LTP mtest06 in a loop and building the kernel (-j $NCPUS) in parallel. It reproduces since v4.20 on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It triggers reliably in about an hour. The patched kernel ran fine for 10+ hours.
Signed-off-by: Jan Stancek Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Will Deacon Acked-by: Waiman Long Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dbu...@suse.de Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty & no writer") Link: https://lkml.kernel.org/r/50b8914e20d1d62bb2dee42d342836c2c16ebee7.1563438048.git.jstan...@redhat.com Signed-off-by: Ingo Molnar --- kernel/locking/rwsem.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index bc91aacaab58..d3ce7c6c42a6 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1036,6 +1036,8 @@ queue: */ if (adjustment && !(atomic_long_read(&sem->count) & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { + /* Provide lock ACQUIRE */ + smp_acquire__after_ctrl_dep(); raw_spin_unlock_irq(&sem->wait_lock); rwsem_set_reader_owned(sem); lockevent_inc(rwsem_rlock_fast);
Re: [PATCH v2] locking/rwsem: add acquire barrier to read_slowpath exit when queue is empty
- Original Message - > > It's simpler like so: > > On Thu, Jul 18, 2019 at 01:04:46PM +0200, Peter Zijlstra wrote: > > X = 0; > > > > rwsem_down_read() > > for (;;) { > > set_current_state(TASK_UNINTERRUPTIBLE); > > > > X = 1; > > > > rwsem_up_write(); > > > > rwsem_mark_wake() > > > > atomic_long_add(adjustment, &sem->count); > > > > smp_store_release(&waiter->task, NULL); > > > > if (!waiter.task) > > break; > > > > ... > > } > > > > r = X; > > I see - it looks possible. Thank you for this example.
Re: [PATCH v2] locking/rwsem: add acquire barrier to read_slowpath exit when queue is empty
- Original Message - > Hi Jan, Waiman, [+Jade and Paul for the litmus test at the end] > > On Wed, Jul 17, 2019 at 09:22:00PM +0200, Jan Stancek wrote: > > On Wed, Jul 17, 2019 at 10:19:04AM -0400, Waiman Long wrote: > > > > If you add a comment to the code outlining the issue (preferably as a > > > > litmus > > > > test involving sem->count and some shared data which happens to be > > > > vmacache_seqnum in your test)), then: > > > > > > > > Reviewed-by: Will Deacon > > > > > > > > Thanks, > > > > > > > > Will > > > > > > Agreed. A comment just above smp_acquire__after_ctrl_dep() on why this > > > is needed will be great. > > > > > > Other than that, > > > > > > Acked-by: Waiman Long > > > > > > > litmus test looks a bit long, would following be acceptable? > > > > diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c > > index 37524a47f002..d9c96651bfc7 100644 > > --- a/kernel/locking/rwsem.c > > +++ b/kernel/locking/rwsem.c > > @@ -1032,6 +1032,13 @@ static inline bool rwsem_reader_phase_trylock(struct > > rw_semaphore *sem, > > */ > > if (adjustment && !(atomic_long_read(&sem->count) & > > (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { > > + /* > > +* down_read() issued ACQUIRE on enter, but we can race > > +* with writer who did RELEASE only after us. > > +* ACQUIRE here makes sure reader operations happen only > > +* after all writer ones. > > +*/ > > How about an abridged form of the litmus test here, just to show the cod > flow? e.g.: > > /* > * We need to ensure ACQUIRE semantics when reading sem->count so that > * we pair with the RELEASE store performed by an unlocking/downgrading > * writer. > * > * P0 (writer)P1 (reader) > * > * down_write(sem); > * > * downgrade_write(sem); > * -> fetch_add_release(&sem->count) > * > *down_read_slowpath(sem); > *-> atomic_read(&sem->count) > * > * smp_acquire__after_ctrl_dep() > * > */ Works for me. The code is at 3 level of indentation, but I can try to squeeze it in for v4. 
> > In writing this, I also noticed that we don't have any explicit ordering > at the end of the reader slowpath when we wait on the queue but get woken > immediately: > > if (!waiter.task) > break; > > Am I missing something? I'm assuming this isn't problem, because set_current_state() on line above is using smp_store_mb(). > > > + smp_acquire__after_ctrl_dep(); > > raw_spin_unlock_irq(&sem->wait_lock); > > rwsem_set_reader_owned(sem); > > lockevent_inc(rwsem_rlock_fast); > > > > > > with litmus test in commit log: > > --- 8< > > C rwsem > > > > { > > atomic_t rwsem_count = ATOMIC_INIT(1); > > int vmacache_seqnum = 10; > > } > > > > P0(int *vmacache_seqnum, atomic_t *rwsem_count) > > { > > r0 = READ_ONCE(*vmacache_seqnum); > > WRITE_ONCE(*vmacache_seqnum, r0 + 1); > > /* downgrade_write */ > > r1 = atomic_fetch_add_release(-1+256, rwsem_count); > > } > > > > P1(int *vmacache_seqnum, atomic_t *rwsem_count, spinlock_t *sem_wait_lock) > > { > > /* rwsem_read_trylock */ > > r0 = atomic_add_return_acquire(256, rwsem_count); > > /* rwsem_down_read_slowpath */ > > spin_lock(sem_wait_lock); > > r0 = atomic_read(rwsem_count); > > if ((r0 & 1) == 0) { > > // BUG: needs barrier > > spin_unlock(sem_wait_lock); > > r1 = READ_ONCE(*vmacache_seqnum); > > } > > } > > exists (1:r1=10) > > --- 8< > > Thanks for writing this! It's definitely worth sticking it in the commit > log, but Paul and Jade might also like to include it as part of their litmus > test repository too. > > Will >
[PATCH v3] locking/rwsem: add acquire barrier to read_slowpath exit when queue is empty
LTP mtest06 has been observed to rarely hit "still mapped when deleted" and following BUG_ON on arm64: page:7e02fa37e480 refcount:3 mapcount:1 mapping:80be3d678ab0 index:0x0 xfs_address_space_operations [xfs] flags: 0xbfffe00037(locked|referenced|uptodate|lru|active) page dumped because: VM_BUG_ON_PAGE(page_mapped(page)) [ cut here ] kernel BUG at mm/filemap.c:171! Internal error: Oops - BUG: 0 [#1] SMP CPU: 220 PID: 154292 Comm: mmap1 Not tainted 5.2.0-0ecfebd.cki #1 Hardware name: HPE Apollo 70 /C01_APACHE_MB , BIOS L50_5.13_1.10 05/17/2019 pstate: 40400089 (nZcv daIf +PAN -UAO) pc : unaccount_page_cache_page+0x17c/0x1a0 lr : unaccount_page_cache_page+0x17c/0x1a0 Call trace: unaccount_page_cache_page+0x17c/0x1a0 delete_from_page_cache_batch+0xa0/0x300 truncate_inode_pages_range+0x1b8/0x640 truncate_inode_pages_final+0x88/0xa8 evict+0x1a0/0x1d8 iput+0x150/0x240 dentry_unlink_inode+0x120/0x130 __dentry_kill+0xd8/0x1d0 dentry_kill+0x88/0x248 dput+0x168/0x1b8 __fput+0xe8/0x208 fput+0x20/0x30 task_work_run+0xc0/0xf0 do_notify_resume+0x2b0/0x328 work_pending+0x8/0x10 The extra mapcount originated from pagefault handler, which handled pagefault for vma that has already been detached. vma is detached under mmap_sem write lock by detach_vmas_to_be_unmapped(), which also invalidates vmacache. When pagefault handler (under mmap_sem read lock) called find_vma(), vmacache_valid() wrongly reported vmacache as valid. After rwsem down_read() returns via 'queue empty' path (as of v5.2), it does so without issuing read_acquire on sem->count: down_read __down_read rwsem_down_read_failed __rwsem_down_read_failed_common raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) { if (atomic_long_read(&sem->count) >= 0) { raw_spin_unlock_irq(&sem->wait_lock); return sem; Suspected problem here is that last *_acquire on down_read() side happens before write side issues *_release: 1. writer: has the lock 2. reader: down_read() issues *read_acquire on entry 3. 
writer: mm->vmacache_seqnum++; downgrades lock (*fetch_add_release) 4. reader: __rwsem_down_read_failed_common() finds it can take lock and returns 5. reader: observes stale mm->vmacache_seqnum --- 8< C rwsem { atomic_t rwsem_count = ATOMIC_INIT(1); int vmacache_seqnum = 10; } P0(int *vmacache_seqnum, atomic_t *rwsem_count) { r0 = READ_ONCE(*vmacache_seqnum); WRITE_ONCE(*vmacache_seqnum, r0 + 1); /* downgrade_write */ r1 = atomic_fetch_add_release(-1+256, rwsem_count); } P1(int *vmacache_seqnum, atomic_t *rwsem_count, spinlock_t *sem_wait_lock) { /* rwsem_read_trylock */ r0 = atomic_add_return_acquire(256, rwsem_count); /* rwsem_down_read_slowpath */ spin_lock(sem_wait_lock); r0 = atomic_read(rwsem_count); if ((r0 & 1) == 0) { // BUG: needs barrier spin_unlock(sem_wait_lock); r1 = READ_ONCE(*vmacache_seqnum); } } exists (1:r1=10) --- >8 I can reproduce the problem by running LTP mtest06 in a loop and building kernel (-j $NCPUS) in parallel. It does reproduce since v4.20 up to v5.2 on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It triggers reliably within ~hour. Patched kernel ran fine for 10+ hours with clean dmesg. Tests were done against v5.2, since commit cf69482d62d9 ("locking/rwsem: Enable readers spinning on writer") makes it much harder to reproduce. 
v2: Move barrier after test (Waiman Long) Use smp_acquire__after_ctrl_dep() (Peter Zijlstra) v3: Add comment to barrier (Waiman Long, Will Deacon) Add litmus test Related: https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c Related: commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap") Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty & no writer") Signed-off-by: Jan Stancek Reviewed-by: Will Deacon Acked-by: Waiman Long Cc: sta...@vger.kernel.org # v4.20+ Cc: Waiman Long Cc: Davidlohr Bueso Cc: Will Deacon Cc: Peter Zijlstra Cc: Ingo Molnar --- kernel/locking/rwsem.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 37524a47f002..fe02aef39e9d 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1032,6 +1032,13 @@ static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, */ if (adjustment && !(atomic_long_read(&sem->count) & (RWSEM_WRITER_MASK | RWSE
Re: [PATCH v2] locking/rwsem: add acquire barrier to read_slowpath exit when queue is empty
On Wed, Jul 17, 2019 at 10:19:04AM -0400, Waiman Long wrote: If you add a comment to the code outlining the issue (preferably as a litmus test involving sem->count and some shared data which happens to be vmacache_seqnum in your test)), then: Reviewed-by: Will Deacon Thanks, Will Agreed. A comment just above smp_acquire__after_ctrl_dep() on why this is needed will be great. Other than that, Acked-by: Waiman Long litmus test looks a bit long, would following be acceptable? diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 37524a47f002..d9c96651bfc7 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1032,6 +1032,13 @@ static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, */ if (adjustment && !(atomic_long_read(&sem->count) & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { + /* +* down_read() issued ACQUIRE on enter, but we can race +* with writer who did RELEASE only after us. +* ACQUIRE here makes sure reader operations happen only +* after all writer ones. +*/ + smp_acquire__after_ctrl_dep(); raw_spin_unlock_irq(&sem->wait_lock); rwsem_set_reader_owned(sem); lockevent_inc(rwsem_rlock_fast); with litmus test in commit log: --- 8< C rwsem { atomic_t rwsem_count = ATOMIC_INIT(1); int vmacache_seqnum = 10; } P0(int *vmacache_seqnum, atomic_t *rwsem_count) { r0 = READ_ONCE(*vmacache_seqnum); WRITE_ONCE(*vmacache_seqnum, r0 + 1); /* downgrade_write */ r1 = atomic_fetch_add_release(-1+256, rwsem_count); } P1(int *vmacache_seqnum, atomic_t *rwsem_count, spinlock_t *sem_wait_lock) { /* rwsem_read_trylock */ r0 = atomic_add_return_acquire(256, rwsem_count); /* rwsem_down_read_slowpath */ spin_lock(sem_wait_lock); r0 = atomic_read(rwsem_count); if ((r0 & 1) == 0) { // BUG: needs barrier spin_unlock(sem_wait_lock); r1 = READ_ONCE(*vmacache_seqnum); } } exists (1:r1=10) --- 8< Thanks, Jan
[PATCH v2] locking/rwsem: add acquire barrier to read_slowpath exit when queue is empty
LTP mtest06 has been observed to rarely hit "still mapped when deleted" and following BUG_ON on arm64: page:7e02fa37e480 refcount:3 mapcount:1 mapping:80be3d678ab0 index:0x0 xfs_address_space_operations [xfs] flags: 0xbfffe00037(locked|referenced|uptodate|lru|active) page dumped because: VM_BUG_ON_PAGE(page_mapped(page)) [ cut here ] kernel BUG at mm/filemap.c:171! Internal error: Oops - BUG: 0 [#1] SMP CPU: 220 PID: 154292 Comm: mmap1 Not tainted 5.2.0-0ecfebd.cki #1 Hardware name: HPE Apollo 70 /C01_APACHE_MB , BIOS L50_5.13_1.10 05/17/2019 pstate: 40400089 (nZcv daIf +PAN -UAO) pc : unaccount_page_cache_page+0x17c/0x1a0 lr : unaccount_page_cache_page+0x17c/0x1a0 Call trace: unaccount_page_cache_page+0x17c/0x1a0 delete_from_page_cache_batch+0xa0/0x300 truncate_inode_pages_range+0x1b8/0x640 truncate_inode_pages_final+0x88/0xa8 evict+0x1a0/0x1d8 iput+0x150/0x240 dentry_unlink_inode+0x120/0x130 __dentry_kill+0xd8/0x1d0 dentry_kill+0x88/0x248 dput+0x168/0x1b8 __fput+0xe8/0x208 fput+0x20/0x30 task_work_run+0xc0/0xf0 do_notify_resume+0x2b0/0x328 work_pending+0x8/0x10 The extra mapcount originated from pagefault handler, which handled pagefault for vma that has already been detached. vma is detached under mmap_sem write lock by detach_vmas_to_be_unmapped(), which also invalidates vmacache. When pagefault handler (under mmap_sem read lock) called find_vma(), vmacache_valid() wrongly reported vmacache as valid. After rwsem down_read() returns via 'queue empty' path (as of v5.2), it does so without issuing read_acquire on sem->count: down_read __down_read rwsem_down_read_failed __rwsem_down_read_failed_common raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) { if (atomic_long_read(&sem->count) >= 0) { raw_spin_unlock_irq(&sem->wait_lock); return sem; Suspected problem here is that last *_acquire on down_read() side happens before write side issues *_release: 1. writer: has the lock 2. reader: down_read() issues *read_acquire on entry 3. 
writer: mm->vmacache_seqnum++; downgrades lock (*fetch_add_release) 4. reader: __rwsem_down_read_failed_common() finds it can take lock and returns 5. reader: observes stale mm->vmacache_seqnum I can reproduce the problem by running LTP mtest06 in a loop and building kernel (-j $NCPUS) in parallel. It does reproduce since v4.20 up to v5.2 on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It triggers reliably within ~hour. Patched kernel ran fine for 10+ hours with clean dmesg. Tests were done against v5.2, since commit cf69482d62d9 ("locking/rwsem: Enable readers spinning on writer") makes it much harder to reproduce. v2: Move barrier after test (Waiman Long) Use smp_acquire__after_ctrl_dep() (Peter Zijlstra) Related: https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c Related: commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap") Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty & no writer") Signed-off-by: Jan Stancek Cc: sta...@vger.kernel.org # v4.20+ Cc: Waiman Long Cc: Davidlohr Bueso Cc: Will Deacon Cc: Peter Zijlstra Cc: Ingo Molnar --- kernel/locking/rwsem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 37524a47f002..5ac72b60608b 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1032,6 +1032,7 @@ static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, */ if (adjustment && !(atomic_long_read(&sem->count) & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { + smp_acquire__after_ctrl_dep(); raw_spin_unlock_irq(&sem->wait_lock); rwsem_set_reader_owned(sem); lockevent_inc(rwsem_rlock_fast); -- 1.8.3.1
Re: [PATCH] locking/rwsem: use read_acquire in read_slowpath exit when queue is empty
- Original Message - > On 7/16/19 12:04 PM, Jan Stancek wrote: > > LTP mtest06 has been observed to rarely hit "still mapped when deleted" > > and following BUG_ON on arm64: > > page:7e02fa37e480 refcount:3 mapcount:1 mapping:80be3d678ab0 > > index:0x0 > > xfs_address_space_operations [xfs] > > flags: 0xbfffe00037(locked|referenced|uptodate|lru|active) > > page dumped because: VM_BUG_ON_PAGE(page_mapped(page)) > > [ cut here ] > > kernel BUG at mm/filemap.c:171! > > Internal error: Oops - BUG: 0 [#1] SMP > > CPU: 220 PID: 154292 Comm: mmap1 Not tainted 5.2.0-0ecfebd.cki #1 > > Hardware name: HPE Apollo 70 /C01_APACHE_MB , BIOS L50_5.13_1.10 > > 05/17/2019 > > pstate: 40400089 (nZcv daIf +PAN -UAO) > > pc : unaccount_page_cache_page+0x17c/0x1a0 > > lr : unaccount_page_cache_page+0x17c/0x1a0 > > Call trace: > > unaccount_page_cache_page+0x17c/0x1a0 > > delete_from_page_cache_batch+0xa0/0x300 > > truncate_inode_pages_range+0x1b8/0x640 > > truncate_inode_pages_final+0x88/0xa8 > > evict+0x1a0/0x1d8 > > iput+0x150/0x240 > > dentry_unlink_inode+0x120/0x130 > > __dentry_kill+0xd8/0x1d0 > > dentry_kill+0x88/0x248 > > dput+0x168/0x1b8 > > __fput+0xe8/0x208 > > fput+0x20/0x30 > > task_work_run+0xc0/0xf0 > > do_notify_resume+0x2b0/0x328 > > work_pending+0x8/0x10 > > > > The extra mapcount originated from pagefault handler, which handled > > pagefault for vma that has already been detached. vma is detached > > under mmap_sem write lock by detach_vmas_to_be_unmapped(), which > > also invalidates vmacache. > > > > When pagefault handler (under mmap_sem read lock) called find_vma(), > > vmacache_valid() wrongly reported vmacache as valid. 
> > > > After rwsem down_read() returns via 'queue empty' path (as of v5.2), > > it does so without issuing read_acquire on sem->count: > > down_read > > __down_read > > rwsem_down_read_failed > > __rwsem_down_read_failed_common > > raw_spin_lock_irq(&sem->wait_lock); > > if (list_empty(&sem->wait_list)) { > > if (atomic_long_read(&sem->count) >= 0) { > > raw_spin_unlock_irq(&sem->wait_lock); > > return sem; > > > > Suspected problem here is that last *_acquire on down_read() side > > happens before write side issues *_release: > > 1. writer: has the lock > > 2. reader: down_read() issues *read_acquire on entry > > 3. writer: mm->vmacache_seqnum++; downgrades lock (*fetch_add_release) > > 4. reader: __rwsem_down_read_failed_common() finds it can take lock and > > returns > > 5. reader: observes stale mm->vmacache_seqnum > > > > I can reproduce the problem by running LTP mtest06 in a loop and building > > kernel (-j $NCPUS) in parallel. It does reproduce since v4.20 up to v5.2 > > on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It triggers reliably > > within ~hour. Patched kernel ran fine for 5+ hours with clean dmesg. > > Tests were done against v5.2, since commit cf69482d62d9 ("locking/rwsem: > > Enable readers spinning on writer") makes it much harder to reproduce. > The explanation makes sense to me. Thanks for the investigation. 
> > > > Related: > > https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c > > Related: commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in > > munmap") > > Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty > > & no writer") > > > > Signed-off-by: Jan Stancek > > Cc: Waiman Long > > Cc: Davidlohr Bueso > > Cc: Will Deacon > > Cc: Peter Zijlstra > > Cc: Ingo Molnar > > --- > > kernel/locking/rwsem.c | 2 +- > > 1 file changed, 1 insertion(+), 1 deletion(-) > > > > diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c > > index 37524a47f002..757b198d7a5b 100644 > > --- a/kernel/locking/rwsem.c > > +++ b/kernel/locking/rwsem.c > > @@ -1030,7 +1030,7 @@ static inline bool rwsem_reader_phase_trylock(struct > > rw_semaphore *sem, > > * exit the slowpath and return immediately as its > > * RWSEM_READER_BIAS has already been set in the count. > > */ > > - if (adjustment && !(atomic_long_read(&sem->count) & > > + if (adjustment && !(atomic_long_read_acquire(&sem->count) & > > (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { > > raw_spin_unlock_irq(&sem->wait_lock); > > rwsem_set_reader_owned(sem); > > The chance of taking this path is not that high. So instead of > increasing the cost of the test by adding an acquire barrier, Yes, that is good point. > how about > just adding smp_mb__after_spinlock() before spin_unlock_irq(). This > should have the same effect of making sure that no stale data will be > used in the read-lock critical section. I'll redo the tests with smp_mb__after_spinlock(). Thanks, Jan > > -Longman > >
[PATCH] locking/rwsem: use read_acquire in read_slowpath exit when queue is empty
LTP mtest06 has been observed to rarely hit "still mapped when deleted" and following BUG_ON on arm64: page:7e02fa37e480 refcount:3 mapcount:1 mapping:80be3d678ab0 index:0x0 xfs_address_space_operations [xfs] flags: 0xbfffe00037(locked|referenced|uptodate|lru|active) page dumped because: VM_BUG_ON_PAGE(page_mapped(page)) [ cut here ] kernel BUG at mm/filemap.c:171! Internal error: Oops - BUG: 0 [#1] SMP CPU: 220 PID: 154292 Comm: mmap1 Not tainted 5.2.0-0ecfebd.cki #1 Hardware name: HPE Apollo 70 /C01_APACHE_MB , BIOS L50_5.13_1.10 05/17/2019 pstate: 40400089 (nZcv daIf +PAN -UAO) pc : unaccount_page_cache_page+0x17c/0x1a0 lr : unaccount_page_cache_page+0x17c/0x1a0 Call trace: unaccount_page_cache_page+0x17c/0x1a0 delete_from_page_cache_batch+0xa0/0x300 truncate_inode_pages_range+0x1b8/0x640 truncate_inode_pages_final+0x88/0xa8 evict+0x1a0/0x1d8 iput+0x150/0x240 dentry_unlink_inode+0x120/0x130 __dentry_kill+0xd8/0x1d0 dentry_kill+0x88/0x248 dput+0x168/0x1b8 __fput+0xe8/0x208 fput+0x20/0x30 task_work_run+0xc0/0xf0 do_notify_resume+0x2b0/0x328 work_pending+0x8/0x10 The extra mapcount originated from pagefault handler, which handled pagefault for vma that has already been detached. vma is detached under mmap_sem write lock by detach_vmas_to_be_unmapped(), which also invalidates vmacache. When pagefault handler (under mmap_sem read lock) called find_vma(), vmacache_valid() wrongly reported vmacache as valid. After rwsem down_read() returns via 'queue empty' path (as of v5.2), it does so without issuing read_acquire on sem->count: down_read __down_read rwsem_down_read_failed __rwsem_down_read_failed_common raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) { if (atomic_long_read(&sem->count) >= 0) { raw_spin_unlock_irq(&sem->wait_lock); return sem; Suspected problem here is that last *_acquire on down_read() side happens before write side issues *_release: 1. writer: has the lock 2. reader: down_read() issues *read_acquire on entry 3. 
writer: mm->vmacache_seqnum++; downgrades lock (*fetch_add_release) 4. reader: __rwsem_down_read_failed_common() finds it can take lock and returns 5. reader: observes stale mm->vmacache_seqnum I can reproduce the problem by running LTP mtest06 in a loop and building kernel (-j $NCPUS) in parallel. It does reproduce since v4.20 up to v5.2 on arm64 HPE Apollo 70 (224 CPUs, 256GB RAM, 2 nodes). It triggers reliably within ~hour. Patched kernel ran fine for 5+ hours with clean dmesg. Tests were done against v5.2, since commit cf69482d62d9 ("locking/rwsem: Enable readers spinning on writer") makes it much harder to reproduce. Related: https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c Related: commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap") Fixes: 4b486b535c33 ("locking/rwsem: Exit read lock slowpath if queue empty & no writer") Signed-off-by: Jan Stancek Cc: Waiman Long Cc: Davidlohr Bueso Cc: Will Deacon Cc: Peter Zijlstra Cc: Ingo Molnar --- kernel/locking/rwsem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 37524a47f002..757b198d7a5b 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1030,7 +1030,7 @@ static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, * exit the slowpath and return immediately as its * RWSEM_READER_BIAS has already been set in the count. */ - if (adjustment && !(atomic_long_read(&sem->count) & + if (adjustment && !(atomic_long_read_acquire(&sem->count) & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { raw_spin_unlock_irq(&sem->wait_lock); rwsem_set_reader_owned(sem); -- 1.8.3.1
Re: [LTP] [mm] 8c7829b04c: ltp.overcommit_memory01.fail
- Original Message - > FYI, we noticed the following commit (built with gcc-7): > > commit: 8c7829b04c523cdc732cb77f59f03320e09f3386 ("mm: fix false-positive > OVERCOMMIT_GUESS failures") This matches report from Naresh: http://lists.linux.it/pipermail/ltp/2019-May/011962.html > https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git master > > in testcase: ltp > with following parameters: > > disk: 1HDD > test: mm-01 > > test-description: The LTP testsuite contains a collection of tools for > testing the Linux kernel and related features. > test-url: http://linux-test-project.github.io/ > > > on test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 4G > > caused below changes (please refer to attached dmesg/kmsg for entire > log/backtrace): > > > If you fix the issue, kindly add following tag > Reported-by: kernel test robot > > > [ 554.112267 ] <<>> > [ 554.112269 ] > [ 554.115806 ] tag=overcommit_memory01 stime=1558074982 > [ 554.115809 ] > [ 554.119303 ] cmdline="overcommit_memory" > [ 554.119306 ] > [ 554.121962 ] contacts="" > [ 554.121965 ] > [ 554.124459 ] analysis=exit > [ 554.124463 ] > [ 554.127140 ] <<>> > [ 554.127144 ] > [ 554.131401 ] tst_test.c:1096: INFO: Timeout per run is 0h 05m 00s > [ 554.131404 ] > [ 554.136248 ] overcommit_memory.c:117: INFO: MemTotal is 4033124 kB > [ 554.136251 ] > [ 554.141365 ] overcommit_memory.c:119: INFO: SwapTotal is 268435452 kB > [ 554.141368 ] > [ 554.146664 ] overcommit_memory.c:123: INFO: CommitLimit is 270452012 kB > [ 554.14 ] > [ 554.151375 ] mem.c:814: INFO: set overcommit_ratio to 50 > [ 554.151378 ] > [ 554.155796 ] mem.c:814: INFO: set overcommit_memory to 2 > [ 554.155799 ] > [ 554.159951 ] overcommit_memory.c:190: INFO: malloc 538608648 kB failed > [ 554.159953 ] > [ 554.164801 ] overcommit_memory.c:211: PASS: alloc failed as expected > [ 554.164804 ] > [ 554.170196 ] overcommit_memory.c:190: INFO: malloc 270452012 kB failed > [ 554.170199 ] > [ 554.175344 ] 
overcommit_memory.c:211: PASS: alloc failed as expected > [ 554.175347 ] > [ 554.180932 ] overcommit_memory.c:186: INFO: malloc 134651878 kB > successfully > [ 554.180935 ] > [ 554.186522 ] overcommit_memory.c:205: PASS: alloc passed as expected > [ 554.186524 ] > [ 554.191234 ] mem.c:814: INFO: set overcommit_memory to 0 > [ 554.191237 ] > [ 554.196930 ] overcommit_memory.c:186: INFO: malloc 134734122 kB > successfully > [ 554.196933 ] > [ 554.202738 ] overcommit_memory.c:205: PASS: alloc passed as expected > [ 554.202742 ] > [ 554.208563 ] overcommit_memory.c:190: INFO: malloc 538936488 kB failed > [ 554.208566 ] > [ 554.214355 ] overcommit_memory.c:211: PASS: alloc failed as expected > [ 554.214357 ] > [ 554.220506 ] overcommit_memory.c:186: INFO: malloc 272468576 kB > successfully > [ 554.220509 ] > [ 554.226564 ] overcommit_memory.c:213: FAIL: alloc passed, expected to fail > [ 554.226568 ] > [ 554.231870 ] mem.c:814: INFO: set overcommit_memory to 1 > [ 554.231873 ] > [ 554.237819 ] overcommit_memory.c:186: INFO: malloc 136234288 kB > successfully > [ 554.237821 ] > [ 554.243711 ] overcommit_memory.c:205: PASS: alloc passed as expected > [ 554.243713 ] > [ 554.249791 ] overcommit_memory.c:186: INFO: malloc 272468576 kB > successfully > [ 554.249794 ] > [ 554.255069 ] overcommit_memory.c:205: PASS: alloc passed as expected > [ 554.255073 ] > [ 554.261594 ] overcommit_memory.c:186: INFO: malloc 544937152 kB > successfully > [ 554.261597 ] > [ 554.267855 ] overcommit_memory.c:205: PASS: alloc passed as expected > [ 554.267858 ] > [ 554.273445 ] mem.c:814: INFO: set overcommit_memory to 0 > [ 554.273448 ] > [ 554.278882 ] mem.c:814: INFO: set overcommit_ratio to 50 > [ 554.278885 ] > [ 554.282419 ] > [ 554.284170 ] Summary: > [ 554.284173 ] > [ 554.287476 ] passed 8 > [ 554.287480 ] > [ 554.290740 ] failed 1 > [ 554.290743 ] > [ 554.293971 ] skipped 0 > [ 554.293974 ] > [ 554.297123 ] warnings 0 > [ 554.297126 ] > [ 554.300746 ] <<>> > [ 554.300749 ] > [ 
554.304489 ] initiation_status="ok" > [ 554.304492 ] > [ 554.309762 ] duration=0 termination_type=exited termination_id=1 > corefile=no > [ 554.309765 ] > [ 554.313681 ] cutime=0 cstime=0 > [ 554.313685 ] > [ 554.316880 ] <<>> > > > > To reproduce: > > # build kernel > cd linux > cp config-5.1.0-10246-g8c7829b .config > make HOSTCC=gcc-7 CC=gcc-7 ARCH=x86_64 olddefconfig > make HOSTCC=gcc-7 CC=gcc-7 ARCH=x86_64 prepare > make HOSTCC=gcc-7 CC=gcc-7 ARCH=x86_64 modules_prepare > make HOSTCC=gcc-7 CC=gcc-7 ARCH=x86_64 SHELL=/bin/bash > make HOSTCC=gcc-7 CC=gcc-7 ARCH=x86_64 bzImage > > > git clone https://github.com/intel/lkp-tests.git > cd lkp-tests > find lib/ | cpio -o -H newc --quiet | gzip > modules.cgz > bin/lkp qemu -k -m modules.cgz job-script # job-script is > attached > in this ema
Re: [v2 PATCH] mm: mmu_gather: remove __tlb_reset_range() for force flush
- Original Message - > On Mon, May 13, 2019 at 04:01:09PM -0700, Yang Shi wrote: > > > > > > On 5/13/19 9:38 AM, Will Deacon wrote: > > > On Fri, May 10, 2019 at 07:26:54AM +0800, Yang Shi wrote: > > > > diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c > > > > index 99740e1..469492d 100644 > > > > --- a/mm/mmu_gather.c > > > > +++ b/mm/mmu_gather.c > > > > @@ -245,14 +245,39 @@ void tlb_finish_mmu(struct mmu_gather *tlb, > > > > { > > > > /* > > > > * If there are parallel threads are doing PTE changes on same > > > > range > > > > -* under non-exclusive lock(e.g., mmap_sem read-side) but defer > > > > TLB > > > > -* flush by batching, a thread has stable TLB entry can fail to > > > > flush > > > > -* the TLB by observing pte_none|!pte_dirty, for example so > > > > flush TLB > > > > -* forcefully if we detect parallel PTE batching threads. > > > > +* under non-exclusive lock (e.g., mmap_sem read-side) but > > > > defer TLB > > > > +* flush by batching, one thread may end up seeing inconsistent > > > > PTEs > > > > +* and result in having stale TLB entries. So flush TLB > > > > forcefully > > > > +* if we detect parallel PTE batching threads. > > > > +* > > > > +* However, some syscalls, e.g. munmap(), may free page tables, > > > > this > > > > +* needs force flush everything in the given range. Otherwise > > > > this > > > > +* may result in having stale TLB entries for some > > > > architectures, > > > > +* e.g. aarch64, that could specify flush what level TLB. > > > > */ > > > > - if (mm_tlb_flush_nested(tlb->mm)) { > > > > - __tlb_reset_range(tlb); > > > > - __tlb_adjust_range(tlb, start, end - start); > > > > + if (mm_tlb_flush_nested(tlb->mm) && !tlb->fullmm) { > > > > + /* > > > > +* Since we can't tell what we actually should have > > > > +* flushed, flush everything in the given range. 
> > > > +*/ > > > > + tlb->freed_tables = 1; > > > > + tlb->cleared_ptes = 1; > > > > + tlb->cleared_pmds = 1; > > > > + tlb->cleared_puds = 1; > > > > + tlb->cleared_p4ds = 1; > > > > + > > > > + /* > > > > +* Some architectures, e.g. ARM, that have range > > > > invalidation > > > > +* and care about VM_EXEC for I-Cache invalidation, > > > > need force > > > > +* vma_exec set. > > > > +*/ > > > > + tlb->vma_exec = 1; > > > > + > > > > + /* Force vma_huge clear to guarantee safer flush */ > > > > + tlb->vma_huge = 0; > > > > + > > > > + tlb->start = start; > > > > + tlb->end = end; > > > > } > > > Whilst I think this is correct, it would be interesting to see whether > > > or not it's actually faster than just nuking the whole mm, as I mentioned > > > before. > > > > > > At least in terms of getting a short-term fix, I'd prefer the diff below > > > if it's not measurably worse. > > > > I did a quick test with ebizzy (96 threads with 5 iterations) on my x86 VM, > > it shows slightly slowdown on records/s but much more sys time spent with > > fullmm flush, the below is the data. > > > > nofullmm fullmm > > ops (records/s) 225606 225119 > > sys (s) 0.69 1.14 > > > > It looks the slight reduction of records/s is caused by the increase of sys > > time. > > That's not what I expected, and I'm unable to explain why moving to fullmm > would /increase/ the system time. I would've thought the time spent doing > the invalidation would decrease, with the downside that the TLB is cold > when returning back to userspace. > I tried ebizzy with various parameters (malloc vs mmap, ran it for hour), but performance was very similar for both patches. So, I was looking for workload that would demonstrate the largest difference. 
Inspired by python xml-rpc, which can handle each request in new thread, I tried following [1]: 16 threads, each looping 100k times over: mmap(16M) touch 1 page madvise(DONTNEED) munmap(16M) This yields quite significant difference for 2 patches when running on my 46 CPU arm host. I checked it twice - applied patch, recompiled, rebooted, but numbers stayed +- couple seconds the same. Does it somewhat match your expectation? v2 patch - real2m33.460s user0m3.359s sys 15m32.307s real2m33.895s user0m2.749s sys 16m34.500s real2m35.666s user0m3.528s sys 15m23.377s real2m32.898s user0m2.789s sys 16m18.801s real2m33.087s user0m3.565s sys 16m23.815s fullmm version --- real0m46.811s user0m1.596s sys 1m
Re: LTP: mm: overcommit_memory01, 03...06 failed
- Original Message - > ltp-mm-tests failed on Linux mainline kernel 5.1.0, > * overcommit_memory01 overcommit_memory > * overcommit_memory03 overcommit_memory -R 30 > * overcommit_memory04 overcommit_memory -R 80 > * overcommit_memory05 overcommit_memory -R 100 > * overcommit_memory06 overcommit_memory -R 200 > > mem.c:814: INFO: set overcommit_memory to 0 > overcommit_memory.c:185: INFO: malloc 8094844 kB successfully > overcommit_memory.c:204: PASS: alloc passed as expected > overcommit_memory.c:189: INFO: malloc 32379376 kB failed > overcommit_memory.c:210: PASS: alloc failed as expected > overcommit_memory.c:185: INFO: malloc 16360216 kB successfully > overcommit_memory.c:212: FAIL: alloc passed, expected to fail > > Failed test log, > https://lkft.validation.linaro.org/scheduler/job/726417#L22852 > > LTP version 20190115 > > Test case link, > https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/tunable/overcommit_memory.c#L212 > > First bad commit: > git branch master > git commit e0654264c4806dc436b291294a0fbf9be7571ab6 > git describe v5.1-10706-ge0654264c480 > git repo https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git > > Last good commit: > git branch master > git commit 7e9890a3500d95c01511a4c45b7e7192dfa47ae2 > git describe v5.1-10326-g7e9890a3500d > git repo https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git Heuristic changed a bit in: commit 8c7829b04c523cdc732cb77f59f03320e09f3386 Author: Johannes Weiner Date: Mon May 13 17:21:50 2019 -0700 mm: fix false-positive OVERCOMMIT_GUESS failures LTP tries to allocate "mem_total + swap_total": alloc_and_check(sum_total, EXPECT_FAIL); which now presumably falls short to trigger failure. > > Best regards > Naresh Kamboju >
Re: [v2 PATCH] mm: mmu_gather: remove __tlb_reset_range() for force flush
- Original Message - > > > On May 13, 2019 4:01 PM, Yang Shi wrote: > > > On 5/13/19 9:38 AM, Will Deacon wrote: > > On Fri, May 10, 2019 at 07:26:54AM +0800, Yang Shi wrote: > >> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c > >> index 99740e1..469492d 100644 > >> --- a/mm/mmu_gather.c > >> +++ b/mm/mmu_gather.c > >> @@ -245,14 +245,39 @@ void tlb_finish_mmu(struct mmu_gather *tlb, > >> { > >> /* > >>* If there are parallel threads are doing PTE changes on same range > >> - * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB > >> - * flush by batching, a thread has stable TLB entry can fail to flush > >> - * the TLB by observing pte_none|!pte_dirty, for example so flush TLB > >> - * forcefully if we detect parallel PTE batching threads. > >> + * under non-exclusive lock (e.g., mmap_sem read-side) but defer TLB > >> + * flush by batching, one thread may end up seeing inconsistent PTEs > >> + * and result in having stale TLB entries. So flush TLB forcefully > >> + * if we detect parallel PTE batching threads. > >> + * > >> + * However, some syscalls, e.g. munmap(), may free page tables, this > >> + * needs force flush everything in the given range. Otherwise this > >> + * may result in having stale TLB entries for some architectures, > >> + * e.g. aarch64, that could specify flush what level TLB. > >>*/ > >> -if (mm_tlb_flush_nested(tlb->mm)) { > >> -__tlb_reset_range(tlb); > >> -__tlb_adjust_range(tlb, start, end - start); > >> +if (mm_tlb_flush_nested(tlb->mm) && !tlb->fullmm) { > >> +/* > >> + * Since we can't tell what we actually should have > >> + * flushed, flush everything in the given range. > >> + */ > >> +tlb->freed_tables = 1; > >> +tlb->cleared_ptes = 1; > >> +tlb->cleared_pmds = 1; > >> +tlb->cleared_puds = 1; > >> +tlb->cleared_p4ds = 1; > >> + > >> +/* > >> + * Some architectures, e.g. ARM, that have range invalidation > >> + * and care about VM_EXEC for I-Cache invalidation, need > >> force > >> + * vma_exec set. 
> >> + */ > >> +tlb->vma_exec = 1; > >> + > >> +/* Force vma_huge clear to guarantee safer flush */ > >> +tlb->vma_huge = 0; > >> + > >> +tlb->start = start; > >> +tlb->end = end; > >> } > > Whilst I think this is correct, it would be interesting to see whether > > or not it's actually faster than just nuking the whole mm, as I mentioned > > before. > > > > At least in terms of getting a short-term fix, I'd prefer the diff below > > if it's not measurably worse. > > I did a quick test with ebizzy (96 threads with 5 iterations) on my x86 > VM, it shows slightly slowdown on records/s but much more sys time spent > with fullmm flush, the below is the data. > > nofullmm fullmm > ops (records/s) 225606 225119 > sys (s)0.691.14 > > It looks the slight reduction of records/s is caused by the increase of > sys time. > > > > > Will > > > > --->8 > > > > diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c > > index 99740e1dd273..cc251422d307 100644 > > --- a/mm/mmu_gather.c > > +++ b/mm/mmu_gather.c > > @@ -251,8 +251,9 @@ void tlb_finish_mmu(struct mmu_gather *tlb, > > * forcefully if we detect parallel PTE batching threads. > > */ > >if (mm_tlb_flush_nested(tlb->mm)) { > > + tlb->fullmm = 1; > >__tlb_reset_range(tlb); > > - __tlb_adjust_range(tlb, start, end - start); > > + tlb->freed_tables = 1; > >} > > > >tlb_flush_mmu(tlb); > > > I think that this should have set need_flush_all and not fullmm. > Wouldn't that skip the flush? If fulmm == 0, then __tlb_reset_range() sets tlb->end = 0. tlb_flush_mmu tlb_flush_mmu_tlbonly if (!tlb->end) return Replacing fullmm with need_flush_all, brings the problem back / reproducer hangs.
Re: [PATCH] mm: mmu_gather: remove __tlb_reset_range() for force flush
- Original Message - > > > On 5/9/19 2:06 PM, Jan Stancek wrote: > > - Original Message - > >> > >> On 5/9/19 11:24 AM, Peter Zijlstra wrote: > >>> On Thu, May 09, 2019 at 05:36:29PM +, Nadav Amit wrote: > >>>>> On May 9, 2019, at 3:38 AM, Peter Zijlstra > >>>>> wrote: > >>>>> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c > >>>>> index 99740e1dd273..fe768f8d612e 100644 > >>>>> --- a/mm/mmu_gather.c > >>>>> +++ b/mm/mmu_gather.c > >>>>> @@ -244,15 +244,20 @@ void tlb_finish_mmu(struct mmu_gather *tlb, > >>>>> unsigned long start, unsigned long end) > >>>>> { > >>>>> /* > >>>>> -* If there are parallel threads are doing PTE changes on same > >>>>> range > >>>>> -* under non-exclusive lock(e.g., mmap_sem read-side) but defer > >>>>> TLB > >>>>> -* flush by batching, a thread has stable TLB entry can fail to > >>>>> flush > >>>>> -* the TLB by observing pte_none|!pte_dirty, for example so > >>>>> flush TLB > >>>>> -* forcefully if we detect parallel PTE batching threads. > >>>>> +* Sensible comment goes here.. > >>>>> */ > >>>>> - if (mm_tlb_flush_nested(tlb->mm)) { > >>>>> - __tlb_reset_range(tlb); > >>>>> - __tlb_adjust_range(tlb, start, end - start); > >>>>> + if (mm_tlb_flush_nested(tlb->mm) && !tlb->full_mm) { > >>>>> + /* > >>>>> +* Since we're can't tell what we actually should have > >>>>> +* flushed flush everything in the given range. > >>>>> +*/ > >>>>> + tlb->start = start; > >>>>> + tlb->end = end; > >>>>> + tlb->freed_tables = 1; > >>>>> + tlb->cleared_ptes = 1; > >>>>> + tlb->cleared_pmds = 1; > >>>>> + tlb->cleared_puds = 1; > >>>>> + tlb->cleared_p4ds = 1; > >>>>> } > >>>>> > >>>>> tlb_flush_mmu(tlb); > >>>> As a simple optimization, I think it is possible to hold multiple > >>>> nesting > >>>> counters in the mm, similar to tlb_flush_pending, for freed_tables, > >>>> cleared_ptes, etc. > >>>> > >>>> The first time you set tlb->freed_tables, you also atomically increase > >>>> mm->tlb_flush_freed_tables. 
Then, in tlb_flush_mmu(), you just use > >>>> mm->tlb_flush_freed_tables instead of tlb->freed_tables. > >>> That sounds fraught with races and expensive; I would much prefer to not > >>> go there for this arguably rare case. > >>> > >>> Consider such fun cases as where CPU-0 sees and clears a PTE, CPU-1 > >>> races and doesn't see that PTE. Therefore CPU-0 sets and counts > >>> cleared_ptes. Then if CPU-1 flushes while CPU-0 is still in mmu_gather, > >>> it will see cleared_ptes count increased and flush that granularity, > >>> OTOH if CPU-1 flushes after CPU-0 completes, it will not and potentiall > >>> miss an invalidate it should have had. > >>> > >>> This whole concurrent mmu_gather stuff is horrible. > >>> > >>> /me ponders more > >>> > >>> So I think the fundamental race here is this: > >>> > >>> CPU-0 CPU-1 > >>> > >>> tlb_gather_mmu(.start=1,tlb_gather_mmu(.start=2, > >>> .end=3);.end=4); > >>> > >>> ptep_get_and_clear_full(2) > >>> tlb_remove_tlb_entry(2); > >>> __tlb_remove_page(); > >>> if (pte_present(2)) // nope > >>> > >>> tlb_finish_mmu(); > >>> > >>> // continue without TLBI(2) > >>> // whoopsie > >>> > >>> tlb_finish_mmu(); > >&g
Re: [PATCH] mm: mmu_gather: remove __tlb_reset_range() for force flush
- Original Message - > > > On 5/9/19 11:24 AM, Peter Zijlstra wrote: > > On Thu, May 09, 2019 at 05:36:29PM +, Nadav Amit wrote: > >>> On May 9, 2019, at 3:38 AM, Peter Zijlstra wrote: > >>> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c > >>> index 99740e1dd273..fe768f8d612e 100644 > >>> --- a/mm/mmu_gather.c > >>> +++ b/mm/mmu_gather.c > >>> @@ -244,15 +244,20 @@ void tlb_finish_mmu(struct mmu_gather *tlb, > >>> unsigned long start, unsigned long end) > >>> { > >>> /* > >>> - * If there are parallel threads are doing PTE changes on same range > >>> - * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB > >>> - * flush by batching, a thread has stable TLB entry can fail to flush > >>> - * the TLB by observing pte_none|!pte_dirty, for example so flush TLB > >>> - * forcefully if we detect parallel PTE batching threads. > >>> + * Sensible comment goes here.. > >>>*/ > >>> - if (mm_tlb_flush_nested(tlb->mm)) { > >>> - __tlb_reset_range(tlb); > >>> - __tlb_adjust_range(tlb, start, end - start); > >>> + if (mm_tlb_flush_nested(tlb->mm) && !tlb->full_mm) { > >>> + /* > >>> + * Since we're can't tell what we actually should have > >>> + * flushed flush everything in the given range. > >>> + */ > >>> + tlb->start = start; > >>> + tlb->end = end; > >>> + tlb->freed_tables = 1; > >>> + tlb->cleared_ptes = 1; > >>> + tlb->cleared_pmds = 1; > >>> + tlb->cleared_puds = 1; > >>> + tlb->cleared_p4ds = 1; > >>> } > >>> > >>> tlb_flush_mmu(tlb); > >> As a simple optimization, I think it is possible to hold multiple nesting > >> counters in the mm, similar to tlb_flush_pending, for freed_tables, > >> cleared_ptes, etc. > >> > >> The first time you set tlb->freed_tables, you also atomically increase > >> mm->tlb_flush_freed_tables. Then, in tlb_flush_mmu(), you just use > >> mm->tlb_flush_freed_tables instead of tlb->freed_tables. > > That sounds fraught with races and expensive; I would much prefer to not > > go there for this arguably rare case. 
> > > > Consider such fun cases as where CPU-0 sees and clears a PTE, CPU-1 > > races and doesn't see that PTE. Therefore CPU-0 sets and counts > > cleared_ptes. Then if CPU-1 flushes while CPU-0 is still in mmu_gather, > > it will see cleared_ptes count increased and flush that granularity, > > OTOH if CPU-1 flushes after CPU-0 completes, it will not and potentiall > > miss an invalidate it should have had. > > > > This whole concurrent mmu_gather stuff is horrible. > > > >/me ponders more > > > > So I think the fundamental race here is this: > > > > CPU-0 CPU-1 > > > > tlb_gather_mmu(.start=1,tlb_gather_mmu(.start=2, > >.end=3);.end=4); > > > > ptep_get_and_clear_full(2) > > tlb_remove_tlb_entry(2); > > __tlb_remove_page(); > > if (pte_present(2)) // nope > > > > tlb_finish_mmu(); > > > > // continue without TLBI(2) > > // whoopsie > > > > tlb_finish_mmu(); > > tlb_flush() -> TLBI(2) > > I'm not quite sure if this is the case Jan really met. But, according to > his test, once correct tlb->freed_tables and tlb->cleared_* are set, his > test works well. My theory was following sequence: t1: map_write_unmap() t2: dummy() map_address = mmap() map_address[i] = 'b' munmap(map_address) downgrade_write(&mm->mmap_sem); unmap_region() tlb_gather_mmu() inc_tlb_flush_pending(tlb->mm); free_pgtables() tlb->freed_tables = 1 tlb->cleared_pmds = 1 pthread_exit() madvise(thread_stack, 8M, MADV_DONTNEED) zap_page_range() tlb_gather_mmu() inc_tlb_flush_pending(tlb->mm); tlb_finish_mmu() if (mm_tlb_flush_nested(tlb->mm)) __tlb_reset_range() tlb->freed_tables = 0 tlb->cleared_pmds = 0 __flush_tlb_range(last_level = 0) ... map_address = mmap() map_address[i] = 'b' # PTE appeared valid to me, # so I suspected stale TLB entry at higher level as result of "freed_tables = 0" I'm happy to apply/run any debug patches to get more data that would help. > > > > > > > And we can fix that by having tlb_finish_mmu() sync up. 
Never let a > > concurrent tlb_finish_mmu() complete until all concurrenct mmu_gathers > > have completed. > > Not sure if this will scale well. > > > > > This should not be too hard to make happen. > >
Re: [PATCH] mm: mmu_gather: remove __tlb_reset_range() for force flush
> > I don't think we can elide the call __tlb_reset_range() entirely, since I > > think we do want to clear the freed_pXX bits to ensure that we walk the > > range with the smallest mapping granule that we have. Otherwise couldn't we > > have a problem if we hit a PMD that had been cleared, but the TLB > > invalidation for the PTEs that used to be linked below it was still > > pending? > > That's tlb->cleared_p*, and yes agreed. That is, right until some > architecture has level dependent TLBI instructions, at which point we'll > need to have them all set instead of cleared. > > > Perhaps we should just set fullmm if we see that there's a concurrent > > unmapper rather than do a worst-case range invalidation. Do you have a > > feeling > > for how often the mm_tlb_flush_nested() triggers in practice? > > Quite a bit for certain workloads I imagine, that was the whole point of > doing it. > > Anyway; am I correct in understanding that the actual problem is that > we've cleared freed_tables and the ARM64 tlb_flush() will then not > invalidate the cache and badness happens? That is my understanding, only last level is flushed, which is not enough. > > Because so far nobody has actually provided a coherent description of > the actual problem we're trying to solve. But I'm thinking something > like the below ought to do. I applied it (and fixed small typo: s/tlb->full_mm/tlb->fullmm/). It fixes the problem for me. 
> > > diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c > index 99740e1dd273..fe768f8d612e 100644 > --- a/mm/mmu_gather.c > +++ b/mm/mmu_gather.c > @@ -244,15 +244,20 @@ void tlb_finish_mmu(struct mmu_gather *tlb, > unsigned long start, unsigned long end) > { > /* > - * If there are parallel threads are doing PTE changes on same range > - * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB > - * flush by batching, a thread has stable TLB entry can fail to flush > - * the TLB by observing pte_none|!pte_dirty, for example so flush TLB > - * forcefully if we detect parallel PTE batching threads. > + * Sensible comment goes here.. >*/ > - if (mm_tlb_flush_nested(tlb->mm)) { > - __tlb_reset_range(tlb); > - __tlb_adjust_range(tlb, start, end - start); > + if (mm_tlb_flush_nested(tlb->mm) && !tlb->full_mm) { > + /* > + * Since we're can't tell what we actually should have > + * flushed flush everything in the given range. > + */ > + tlb->start = start; > + tlb->end = end; > + tlb->freed_tables = 1; > + tlb->cleared_ptes = 1; > + tlb->cleared_pmds = 1; > + tlb->cleared_puds = 1; > + tlb->cleared_p4ds = 1; > } > > tlb_flush_mmu(tlb); >
[PATCH v3] mm/memory.c: do_fault: avoid usage of stale vm_area_struct
LTP testcase mtest06 [1] can trigger a crash on s390x running 5.0.0-rc8. This is a stress test, where one thread mmaps/writes/munmaps memory area and other thread is trying to read from it: CPU: 0 PID: 2611 Comm: mmap1 Not tainted 5.0.0-rc8+ #51 Hardware name: IBM 2964 N63 400 (z/VM 6.4.0) Krnl PSW : 0404e0018000 001ac8d8 (__lock_acquire+0x7/0x7a8) Call Trace: ([<>] (null)) [<001adae4>] lock_acquire+0xec/0x258 [<0080d1ac>] _raw_spin_lock_bh+0x5c/0x98 [<0012a780>] page_table_free+0x48/0x1a8 [<002f6e54>] do_fault+0xdc/0x670 [<002fadae>] __handle_mm_fault+0x416/0x5f0 [<002fb138>] handle_mm_fault+0x1b0/0x320 [<001248cc>] do_dat_exception+0x19c/0x2c8 [<0080e5ee>] pgm_check_handler+0x19e/0x200 page_table_free() is called with NULL mm parameter, but because "0" is a valid address on s390 (see S390_lowcore), it keeps going until it eventually crashes in lockdep's lock_acquire. This crash is reproducible at least since 4.14. Problem is that "vmf->vma" used in do_fault() can become stale. Because mmap_sem may be released, other threads can come in, call munmap() and cause "vma" be returned to kmem cache, and get zeroed/re-initialized and re-used: handle_mm_fault | __handle_mm_fault | do_fault | vma = vmf->vma | do_read_fault | __do_fault| vma->vm_ops->fault(vmf);| mmap_sem is released | | | do_munmap() | remove_vma_list() | remove_vma() | vm_area_free() | # vma is released | ... | # same vma is allocated | # from kmem cache | do_mmap() | vm_area_alloc() | memset(vma, 0, ...) | pte_free(vma->vm_mm, ...); | page_table_free | spin_lock_bh(&mm->context.lock);| | Cache mm_struct to avoid using potentially stale "vma". 
[1] https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c Signed-off-by: Jan Stancek Reviewed-by: Andrea Arcangeli --- mm/memory.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index e11ca9dd823f..e8d69ade5acc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3517,10 +3517,13 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) * but allow concurrent faults). * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). + * If mmap_sem is released, vma may become invalid (for example + * by other thread calling munmap()). */ static vm_fault_t do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + struct mm_struct *vm_mm = vma->vm_mm; vm_fault_t ret; /* @@ -3561,7 +3564,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf) /* preallocated pagetable is unused: free it */ if (vmf->prealloc_pte) { - pte_free(vma->vm_mm, vmf->prealloc_pte); + pte_free(vm_mm, vmf->prealloc_pte); vmf->prealloc_pte = NULL; } return ret; -- 1.8.3.1
Re: [PATCH v2] mm/memory.c: do_fault: avoid usage of stale vm_area_struct
- Original Message - > Hello Jan, > > On Sat, Mar 02, 2019 at 07:19:39PM +0100, Jan Stancek wrote: > > + struct mm_struct *vm_mm = READ_ONCE(vma->vm_mm); > > The vma->vm_mm cannot change under gcc there, so no need of > READ_ONCE. The release of mmap_sem has release semantics so the > vma->vm_mm access cannot be reordered after up_read(mmap_sem) either. > > Other than the above detail: > > Reviewed-by: Andrea Arcangeli Thank you for review, I dropped READ_ONCE and sent v3 with your Reviewed-by included. I also successfully re-ran tests over-night. > Would this not need a corresponding WRITE_ONCE() in vma_init() ? There's at least 2 context switches between, so I think it wouldn't matter. My concern was gcc optimizing out vm_mm, and vma->vm_mm access happening only after do_read_fault().
[PATCH v2] mm/memory.c: do_fault: avoid usage of stale vm_area_struct
LTP testcase mtest06 [1] can trigger a crash on s390x running 5.0.0-rc8. This is a stress test, where one thread mmaps/writes/munmaps memory area and other thread is trying to read from it: CPU: 0 PID: 2611 Comm: mmap1 Not tainted 5.0.0-rc8+ #51 Hardware name: IBM 2964 N63 400 (z/VM 6.4.0) Krnl PSW : 0404e0018000 001ac8d8 (__lock_acquire+0x7/0x7a8) Call Trace: ([<>] (null)) [<001adae4>] lock_acquire+0xec/0x258 [<0080d1ac>] _raw_spin_lock_bh+0x5c/0x98 [<0012a780>] page_table_free+0x48/0x1a8 [<002f6e54>] do_fault+0xdc/0x670 [<002fadae>] __handle_mm_fault+0x416/0x5f0 [<002fb138>] handle_mm_fault+0x1b0/0x320 [<001248cc>] do_dat_exception+0x19c/0x2c8 [<0080e5ee>] pgm_check_handler+0x19e/0x200 page_table_free() is called with NULL mm parameter, but because "0" is a valid address on s390 (see S390_lowcore), it keeps going until it eventually crashes in lockdep's lock_acquire. This crash is reproducible at least since 4.14. Problem is that "vmf->vma" used in do_fault() can become stale. Because mmap_sem may be released, other threads can come in, call munmap() and cause "vma" be returned to kmem cache, and get zeroed/re-initialized and re-used: handle_mm_fault | __handle_mm_fault | do_fault | vma = vmf->vma | do_read_fault | __do_fault| vma->vm_ops->fault(vmf);| mmap_sem is released | | | do_munmap() | remove_vma_list() | remove_vma() | vm_area_free() | # vma is released | ... | # same vma is allocated | # from kmem cache | do_mmap() | vm_area_alloc() | memset(vma, 0, ...) | pte_free(vma->vm_mm, ...); | page_table_free | spin_lock_bh(&mm->context.lock);| | Cache mm_struct to avoid using potentially stale "vma". 
[1] https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c Signed-off-by: Jan Stancek --- mm/memory.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index e11ca9dd823f..6c1afc1ece50 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3517,10 +3517,13 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) * but allow concurrent faults). * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). + * If mmap_sem is released, vma may become invalid (for example + * by other thread calling munmap()). */ static vm_fault_t do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + struct mm_struct *vm_mm = READ_ONCE(vma->vm_mm); vm_fault_t ret; /* @@ -3561,7 +3564,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf) /* preallocated pagetable is unused: free it */ if (vmf->prealloc_pte) { - pte_free(vma->vm_mm, vmf->prealloc_pte); + pte_free(vm_mm, vmf->prealloc_pte); vmf->prealloc_pte = NULL; } return ret; -- 1.8.3.1
Re: [PATCH] mm/memory.c: do_fault: avoid usage of stale vm_area_struct
- Original Message - > On Sat, Mar 02, 2019 at 04:11:26PM +0100, Jan Stancek wrote: > > Problem is that "vmf->vma" used in do_fault() can become stale. > > Because mmap_sem may be released, other threads can come in, > > call munmap() and cause "vma" be returned to kmem cache, and > > get zeroed/re-initialized and re-used: > > > This patch pins mm_struct and stores its value, to avoid using > > potentially stale "vma" when calling pte_free(). > > OK, we need to cache the mm_struct, but why do we need the extra atomic op? > There's surely no way the mm can be freed while the thread is in the middle > of handling a fault. You're right, I was needlessly paranoid. > > ie I would drop these lines: I'll send v2. Thanks, Jan > > > + mmgrab(vm_mm); > > + > ... > > + > > + mmdrop(vm_mm); > > + >
[PATCH] mm/memory.c: do_fault: avoid usage of stale vm_area_struct
LTP testcase mtest06 [1] can trigger a crash on s390x running 5.0.0-rc8. This is a stress test, where one thread mmaps/writes/munmaps memory area and other thread is trying to read from it: CPU: 0 PID: 2611 Comm: mmap1 Not tainted 5.0.0-rc8+ #51 Hardware name: IBM 2964 N63 400 (z/VM 6.4.0) Krnl PSW : 0404e0018000 001ac8d8 (__lock_acquire+0x7/0x7a8) Call Trace: ([<>] (null)) [<001adae4>] lock_acquire+0xec/0x258 [<0080d1ac>] _raw_spin_lock_bh+0x5c/0x98 [<0012a780>] page_table_free+0x48/0x1a8 [<002f6e54>] do_fault+0xdc/0x670 [<002fadae>] __handle_mm_fault+0x416/0x5f0 [<002fb138>] handle_mm_fault+0x1b0/0x320 [<001248cc>] do_dat_exception+0x19c/0x2c8 [<0080e5ee>] pgm_check_handler+0x19e/0x200 page_table_free() is called with NULL mm parameter, but because "0" is a valid address on s390 (see S390_lowcore), it keeps going until it eventually crashes in lockdep's lock_acquire. This crash is reproducible at least since 4.14. Problem is that "vmf->vma" used in do_fault() can become stale. Because mmap_sem may be released, other threads can come in, call munmap() and cause "vma" be returned to kmem cache, and get zeroed/re-initialized and re-used: handle_mm_fault | __handle_mm_fault | do_fault | vma = vmf->vma | do_read_fault | __do_fault| vma->vm_ops->fault(vmf);| mmap_sem is released | | | do_munmap() | remove_vma_list() | remove_vma() | vm_area_free() | # vma is released | ... | # same vma is allocated | # from kmem cache | do_mmap() | vm_area_alloc() | memset(vma, 0, ...) | pte_free(vma->vm_mm, ...); | page_table_free | spin_lock_bh(&mm->context.lock);| | This patch pins mm_struct and stores its value, to avoid using potentially stale "vma" when calling pte_free(). 
[1] https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/mem/mtest06/mmap1.c Signed-off-by: Jan Stancek --- mm/memory.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index e11ca9dd823f..1287ee9acbdc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3517,12 +3517,17 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) * but allow concurrent faults). * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). + * If mmap_sem is released, vma may become invalid (for example + * by other thread calling munmap()). */ static vm_fault_t do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + struct mm_struct *vm_mm = READ_ONCE(vma->vm_mm); vm_fault_t ret; + mmgrab(vm_mm); + /* * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ @@ -3561,9 +3566,12 @@ static vm_fault_t do_fault(struct vm_fault *vmf) /* preallocated pagetable is unused: free it */ if (vmf->prealloc_pte) { - pte_free(vma->vm_mm, vmf->prealloc_pte); + pte_free(vm_mm, vmf->prealloc_pte); vmf->prealloc_pte = NULL; } + + mmdrop(vm_mm); + return ret; } -- 1.8.3.1
Re: [PATCH v2] mm: page_mapped: don't assume compound page is huge or THP
- Original Message - > On Fri, Nov 30, 2018 at 1:07 PM Jan Stancek wrote: > > > > LTP proc01 testcase has been observed to rarely trigger crashes > > on arm64: > > page_mapped+0x78/0xb4 > > stable_page_flags+0x27c/0x338 > > kpageflags_read+0xfc/0x164 > > proc_reg_read+0x7c/0xb8 > > __vfs_read+0x58/0x178 > > vfs_read+0x90/0x14c > > SyS_read+0x60/0xc0 > > > > Issue is that page_mapped() assumes that if compound page is not > > huge, then it must be THP. But if this is 'normal' compound page > > (COMPOUND_PAGE_DTOR), then following loop can keep running > > (for HPAGE_PMD_NR iterations) until it tries to read from memory > > that isn't mapped and triggers a panic: > > for (i = 0; i < hpage_nr_pages(page); i++) { > > if (atomic_read(&page[i]._mapcount) >= 0) > > return true; > > } > > > > I could replicate this on x86 (v4.20-rc4-98-g60b548237fed) only > > with a custom kernel module [1] which: > > - allocates compound page (PAGEC) of order 1 > > - allocates 2 normal pages (COPY), which are initialized to 0xff > > (to satisfy _mapcount >= 0) > > - 2 PAGEC page structs are copied to address of first COPY page > > - second page of COPY is marked as not present > > - call to page_mapped(COPY) now triggers fault on access to 2nd > > COPY page at offset 0x30 (_mapcount) > > > > [1] > > https://github.com/jstancek/reproducers/blob/master/kernel/page_mapped_crash/repro.c > > > > Fix the loop to iterate for "1 << compound_order" pages. > > > > Debugged-by: Laszlo Ersek > > Suggested-by: "Kirill A. 
Shutemov" > > Signed-off-by: Jan Stancek > > --- > > mm/util.c | 2 +- > > 1 file changed, 1 insertion(+), 1 deletion(-) > > > > Changes in v2: > > - change the loop instead so we check also mapcount of subpages > > > > diff --git a/mm/util.c b/mm/util.c > > index 8bf08b5b5760..5c9c7359ee8a 100644 > > --- a/mm/util.c > > +++ b/mm/util.c > > @@ -478,7 +478,7 @@ bool page_mapped(struct page *page) > > return true; > > if (PageHuge(page)) > > return false; > > - for (i = 0; i < hpage_nr_pages(page); i++) { > > + for (i = 0; i < (1 << compound_order(page)); i++) { > > if (atomic_read(&page[i]._mapcount) >= 0) > > return true; > > } > > -- > > 1.8.3.1 > > Hi all > > This patch landed in the 4.9-stable tree starting from 4.9.151 and it > broke our MIPS1004kc system with CONFIG_HIGHMEM=y. Hi, are you using THP (CONFIG_TRANSPARENT_HUGEPAGE)? The changed line should affect only THP and normal compound pages, so a test with THP disabled might be interesting. > > The breakage consists of random processes dying with SIGILL or SIGSEGV > when we stress test the system with high memory pressure and explicit > memory compaction requested through /proc/sys/vm/compact_memory. > Reverting this patch fixes the crashes. > > We can put some effort on debugging if there are no obvious > explanations for this. Keep in mind that this is 32-bit system with > HIGHMEM. Nothing obvious that I can see. I've been trying to reproduce on 32-bit x86 Fedora with no luck so far. Regards, Jan
[PATCH v2] mm: page_mapped: don't assume compound page is huge or THP
LTP proc01 testcase has been observed to rarely trigger crashes on arm64: page_mapped+0x78/0xb4 stable_page_flags+0x27c/0x338 kpageflags_read+0xfc/0x164 proc_reg_read+0x7c/0xb8 __vfs_read+0x58/0x178 vfs_read+0x90/0x14c SyS_read+0x60/0xc0 Issue is that page_mapped() assumes that if compound page is not huge, then it must be THP. But if this is 'normal' compound page (COMPOUND_PAGE_DTOR), then following loop can keep running (for HPAGE_PMD_NR iterations) until it tries to read from memory that isn't mapped and triggers a panic: for (i = 0; i < hpage_nr_pages(page); i++) { if (atomic_read(&page[i]._mapcount) >= 0) return true; } I could replicate this on x86 (v4.20-rc4-98-g60b548237fed) only with a custom kernel module [1] which: - allocates compound page (PAGEC) of order 1 - allocates 2 normal pages (COPY), which are initialized to 0xff (to satisfy _mapcount >= 0) - 2 PAGEC page structs are copied to address of first COPY page - second page of COPY is marked as not present - call to page_mapped(COPY) now triggers fault on access to 2nd COPY page at offset 0x30 (_mapcount) [1] https://github.com/jstancek/reproducers/blob/master/kernel/page_mapped_crash/repro.c Fix the loop to iterate for "1 << compound_order" pages. Debugged-by: Laszlo Ersek Suggested-by: "Kirill A. Shutemov" Signed-off-by: Jan Stancek --- mm/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Changes in v2: - change the loop instead so we check also mapcount of subpages diff --git a/mm/util.c b/mm/util.c index 8bf08b5b5760..5c9c7359ee8a 100644 --- a/mm/util.c +++ b/mm/util.c @@ -478,7 +478,7 @@ bool page_mapped(struct page *page) return true; if (PageHuge(page)) return false; - for (i = 0; i < hpage_nr_pages(page); i++) { + for (i = 0; i < (1 << compound_order(page)); i++) { if (atomic_read(&page[i]._mapcount) >= 0) return true; } -- 1.8.3.1
[PATCH] mm: page_mapped: don't assume compound page is huge or THP
LTP proc01 testcase has been observed to rarely trigger crashes on arm64: page_mapped+0x78/0xb4 stable_page_flags+0x27c/0x338 kpageflags_read+0xfc/0x164 proc_reg_read+0x7c/0xb8 __vfs_read+0x58/0x178 vfs_read+0x90/0x14c SyS_read+0x60/0xc0 Issue is that page_mapped() assumes that if compound page is not huge, then it must be THP. But if this is 'normal' compound page (COMPOUND_PAGE_DTOR), then following loop can keep running until it tries to read from memory that isn't mapped and triggers a panic: for (i = 0; i < hpage_nr_pages(page); i++) { if (atomic_read(&page[i]._mapcount) >= 0) return true; } I could replicate this on x86 (v4.20-rc4-98-g60b548237fed) only with a custom kernel module [1] which: - allocates compound page (PAGEC) of order 1 - allocates 2 normal pages (COPY), which are initialized to 0xff (to satisfy _mapcount >= 0) - 2 PAGEC page structs are copied to address of first COPY page - second page of COPY is marked as not present - call to page_mapped(COPY) now triggers fault on access to 2nd COPY page at offset 0x30 (_mapcount) [1] https://github.com/jstancek/reproducers/blob/master/kernel/page_mapped_crash/repro.c This patch modifies page_mapped() to check for 'normal' compound pages (COMPOUND_PAGE_DTOR). 
Debugged-by: Laszlo Ersek Signed-off-by: Jan Stancek --- include/linux/mm.h | 9 + mm/util.c | 2 ++ 2 files changed, 11 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5411de93a363..18b0bb953f92 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -700,6 +700,15 @@ static inline compound_page_dtor *get_compound_page_dtor(struct page *page) return compound_page_dtors[page[1].compound_dtor]; } +static inline int PageNormalCompound(struct page *page) +{ + if (!PageCompound(page)) + return 0; + + page = compound_head(page); + return page[1].compound_dtor == COMPOUND_PAGE_DTOR; +} + static inline unsigned int compound_order(struct page *page) { if (!PageHead(page)) diff --git a/mm/util.c b/mm/util.c index 8bf08b5b5760..06c1640cb7b3 100644 --- a/mm/util.c +++ b/mm/util.c @@ -478,6 +478,8 @@ bool page_mapped(struct page *page) return true; if (PageHuge(page)) return false; + if (PageNormalCompound(page)) + return false; for (i = 0; i < hpage_nr_pages(page); i++) { if (atomic_read(&page[i]._mapcount) >= 0) return true; -- 1.8.3.1
Re: [LTP] [PATCH 4.17 00/67] 4.17.7-stable review
- Original Message - > On 16 July 2018 at 13:04, Greg Kroah-Hartman > wrote: > > This is the start of the stable review cycle for the 4.17.7 release. > > There are 67 patches in this series, all will be posted as a response > > to this one. If anyone has any issues with these being applied, please > > let me know. > > > > Responses should be made by Wed Jul 18 07:34:11 UTC 2018. > > Anything received after that time might be too late. > > > > The whole patch series can be found in one patch at: > > > > https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.17.7-rc1.gz > > or in the git tree and branch at: > > > > git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git > > linux-4.17.y > > and the diffstat can be found below. > > > > thanks, > > > > greg k-h > > > Results from Linaro’s test farm. > Regressions detected. > > LTP syscalls failed test cases on all devices arm64, arm32 and x86_64, > - creat08 > - open10 This is consequence of: 0fa3ecd87848 ("Fix up non-directory creation in SGID directories") So likely cause of action here will be modifying both tests to not expect S_ISGID in scenario described in commit above. [adding Dave Chinner] @Dave: Does XFS need separate patch for problem described in commit 0fa3ecd87848? XFS doesn't appear to use inode_init_owner() function, both tests currently PASS the check for S_ISGID on file created by non-group member as of v4.18-rc5. Regards, Jan > > Reported this bug internally on upstream Linux mainline week back. > Now this bug happening on 4.17, 4.14, 4.9 and 4.4 > > creat08 and open10 failed with this error, > TFAIL : testdir.B.3132/setgid: Incorrect modes, setgid bit should be set > > Test case description: > /* > * NAME > * creat08.c - Verifies that the group ID and setgid bit are > *set correctly when a new file is created. 
> *(ported from SPIE, section2/iosuite/creat5.c, > * by Airong Zhang ) > * CALLS > * creat > * > * ALGORITHM > * Create two directories, one with the group ID of this process > * and the setgid bit not set, and the other with a group ID > * other than that of this process and with the setgid bit set. > * In each directory, create a file with and without the setgid > * bit set in the creation modes. Verify that the modes and group > * ID are correct on each of the 4 files. > * As root, create a file with the setgid bit on in the > * directory with the setgid bit. > * This tests the SVID3 create group semantics. > */ > > Block2 testing: > /*--*/ > /* Block2: Create two files in testdir.B, one with the setgid */ > /* bit set in the creation modes and the other without. */ > /*Both should inherit the group ID of the parent */ > /*directory, group2. */ > /*--*/ > > > Test results comparison on mainline versions, > https://qa-reports.linaro.org/lkft/linux-mainline-oe/tests/ltp-syscalls-tests/creat08 > https://qa-reports.linaro.org/lkft/linux-mainline-oe/tests/ltp-syscalls-tests/open10 > > Test results comparison on 4.17 versions, > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/tests/ltp-syscalls-tests/creat08 > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/tests/ltp-syscalls-tests/open10 > > Bug report link, > https://bugs.linaro.org/show_bug.cgi?id=3940 > > Summary > > > kernel: 4.17.7-rc1 > git repo: > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git > git branch: linux-4.17.y > git commit: bc0bd9e05fa1e213c689620eb4cba825c03dcc4a > git describe: v4.17.6-68-gbc0bd9e05fa1 > Test details: > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/build/v4.17.6-68-gbc0bd9e05fa1 > > Regressions (compared to build v4.17.6-67-g3b02b1fd1975) > > > LTP syscalls failed test cases on all devices, > > - creat08 > - open10 > > -- > Linaro LKFT > https://lkft.linaro.org > > -- > Mailing list info: 
https://lists.linux.it/listinfo/ltp >
Re: LTP CVE cve-2017-17053 test failed on x86_64 device
- Original Message - > LTP CVE cve-2017-17053 test failed on x86_64 device. > FAIL on linux-next, mainline, and stable-rc-4.17. > PASS on stable-rc 4.16, 4.14, 4.9 and 4.4 kernel. > > Test FAIL case output, > tst_test.c:1015: INFO: Timeout per run is 0h 15m 00s > tst_taint.c:88: BROK: Kernel is already tainted: 512 You seem to be running into some nfs warning early during boot [1][2], that taints the kernel. cve-2017-17053 test doesn't do much, it fails in setup because it finds kernel already tainted. Looks similar to: https://www.spinics.net/lists/linux-nfs/msg68064.html Regards, Jan [1] [ 78.886529] [ cut here ] [ 78.891155] DEBUG_LOCKS_WARN_ON(sem->owner != ((struct task_struct *)(1UL << 0))) [ 78.891161] WARNING: CPU: 0 PID: 33 at /srv/oe/build/tmp-rpb-glibc/work-shared/intel-core2-32/kernel-source/kernel/locking/rwsem.c:217 up_read_non_owner+0x5d/0x70 [ 78.913141] Modules linked in: fuse [ 78.916633] CPU: 0 PID: 33 Comm: kworker/0:1 Not tainted 4.18.0-rc1 #1 [ 78.923150] Hardware name: Supermicro SYS-5019S-ML/X11SSH-F, BIOS 2.0b 07/27/2017 [ 78.930623] Workqueue: nfsiod rpc_async_release [ 78.935154] RIP: 0010:up_read_non_owner+0x5d/0x70 [ 78.939851] Code: 00 5b 5d c3 e8 54 9f 3a 00 85 c0 74 de 8b 05 ba 40 6a 02 85 c0 75 d4 48 c7 c6 c8 ce 8f 9c 48 c7 c7 93 8e 8e 9c e8 33 6b fa ff <0f> 0b eb bd 0f 1f 44 00 00 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 [ 78.958717] RSP: 0018:b46b419ebdd0 EFLAGS: 00010282 [ 78.963935] RAX: RBX: 882d1c390130 RCX: 0006 [ 78.971059] RDX: 0007 RSI: 0001 RDI: 882d2fc15730 [ 78.978184] RBP: b46b419ebdd8 R08: R09: [ 78.985318] R10: b46b419ebdd0 R11: R12: 882d1ad56800 [ 78.985319] R13: 882d1d133000 R14: R15: 882d1f047840 [ 78.985319] FS: () GS:882d2fc0() knlGS: [ 78.985320] CS: 0010 DS: ES: CR0: 80050033 [ 78.985321] CR2: 56385d962ae0 CR3: 21e1e002 CR4: 003606f0 [ 78.985321] DR0: DR1: DR2: [ 78.985322] DR3: DR6: fffe0ff0 DR7: 0400 [ 78.985322] Call Trace: [[0;32m OK [[ 78.985327] nfs_async_unlink_release+0x32/0x80 [ 78.985329] 
rpc_free_task+0x30/0x50 [ 78.985330] rpc_async_release+0x12/0x20 [ 78.985332] process_one_work+0x278/0x670 0m] [ 78.985335] worker_thread+0x4d/0x410 [ 78.985337] kthread+0x10d/0x140 [ 78.985338] ? rescuer_thread+0x3a0/0x3a0 [ 78.985339] ? kthread_create_worker_on_cpu+0x70/0x70 [ 78.985341] ret_from_fork+0x3a/0x50 [ 78.985344] irq event stamp: 36545 [ 78.985346] hardirqs last enabled at (36545): [] _raw_spin_unlock_irq+0x2c/0x40 [ 78.985347] hardirqs last disabled at (36544): [] _raw_spin_lock_irq+0x13/0x50 [ 78.985348] softirqs last enabled at (34878): [] reg_todo+0x260/0x2f0 [ 78.985350] softirqs last disabled at (34876): [] reg_todo+0x171/0x2f0 [ 79.111787] ---[ end trace bed1f41cfea3a4c5 ]--- [2] https://lkft.validation.linaro.org/scheduler/job/290429#L893 > Summary: > passed 0 > failed 0 > skipped 0 > warnings 0 > > Test comments, > /* Regression test for CVE-2017-17053, original reproducer can be found > * here: > * > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ccd5b3235180eef3cfec337df1c8554ab151b5cc > * > * Be careful! This test may crash your kernel! > */ > > Full test log: > https://lkft.validation.linaro.org/scheduler/job/290429#L9581 > https://lkft.validation.linaro.org/scheduler/job/259217#L10308 > > History of the test case shows failed on > Linux next kernel, > https://qa-reports.linaro.org/lkft/linux-next-oe/tests/ltp-cve-tests/cve-2017-17053 > > Linux mainline kernel, > https://qa-reports.linaro.org/lkft/linux-mainline-oe/tests/ltp-cve-tests \ > /cve-2017-17053 > ^ Please join link > > Linux stable rc 4.17, > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.17-oe/tests/ltp-cve-tests > \ > /cve-2017-17053 > ^ Please join link > > Test PASS on 4.16 kernel. > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.16-oe/tests/ltp-cve-tests > \ > /cve-2017-17053 > ^ Please join link > > Best regards > Naresh Kamboju >
Re: [LTP] [PATCH 4.4 00/24] 4.4.137-stable review
- Original Message - > On Thu, Jun 14, 2018 at 05:49:52AM -0400, Jan Stancek wrote: > > > > - Original Message - > > > On Thu, Jun 14, 2018 at 02:24:25PM +0530, Naresh Kamboju wrote: > > > > On 14 June 2018 at 12:04, Greg Kroah-Hartman > > > > > > > > wrote: > > > > > On Wed, Jun 13, 2018 at 10:48:50PM -0300, Rafael Tinoco wrote: > > > > >> On 13 June 2018 at 18:08, Rafael David Tinoco > > > > >> wrote: > > > > >> > On Wed, Jun 13, 2018 at 6:00 PM, Greg Kroah-Hartman > > > > >> > wrote: > > > > >> >> On Wed, Jun 13, 2018 at 05:47:49PM -0300, Rafael Tinoco wrote: > > > > >> >>> Results from Linaro’s test farm. > > > > >> >>> Regressions detected. > > > > >> >>> > > > > >> >>> NOTE: > > > > >> >>> > > > > >> >>> 1) LTP vma03 test (cve-2011-2496) broken on v4.4-137-rc1 because > > > > >> >>> of: > > > > >> >>> > > > > >> >>> 6ea1dc96a03a mmap: relax file size limit for regular files > > > > >> >>> bd2f9ce5bacb mmap: introduce sane default mmap limits > > > > >> >>> > > > > >> >>>discussion: > > > > >> >>> > > > > >> >>> https://github.com/linux-test-project/ltp/issues/341 > > > > >> >>> > > > > >> >>>mainline commit (v4.13-rc7): > > > > >> >>> > > > > >> >>> 0cc3b0ec23ce Clarify (and fix) MAX_LFS_FILESIZE macros > > > > >> >>> > > > > >> >>>should be backported to 4.4.138-rc2 and fixes the issue. > > > > >> >> > > > > >> >> Really? That commit says it fixes c2a9737f45e2 ("vfs,mm: fix a > > > > >> >> dead > > > > >> >> loop in truncate_inode_pages_range()") which is not in 4.4.y at > > > > >> >> all. > > > > >> >> > > > > >> >> Did you test this out? > > > > >> > > > > > >> > Yes, the LTP contains the tests (last comment is the final test > > > > >> > for > > > > >> > arm32, right before Jan tests i686). > > > > >> > > > > > >> > Fixing MAX_LFS_FILESIZE fixes the new limit for mmap() brought by > > > > >> > those 2 commits (file_mmap_size_max()). > > > > >> > offset tested by the LTP test is 0xfffe000. 
> > > > >> > file_mmap_size_max gives: 0x000 as max value, but only > > > > >> > after > > > > >> > the mentioned patch. > > > > >> > > > > > >> > Original intent for this fix was other though. > > > > >> > > > > >> To clarify this a bit further. > > > > >> > > > > >> The LTP CVE test is breaking in the first call to mmap(), even > > > > >> before > > > > >> trying to remap and test the security issue. That start happening in > > > > >> this round because of those mmap() changes and the offset used in > > > > >> the > > > > >> LTP test. Linus changed limit checks and made them to be related to > > > > >> MAX_LFS_FILESIZE. Unfortunately, in 4.4 stable, we were missing the > > > > >> fix for MAX_LFS_FILESIZE (which before commit 0cc3b0ec23ce was less > > > > >> than the REAL 32 bit limit). > > > > >> > > > > >> Commit 0cc3b0ec23ce was made because an user noticed the FS limit > > > > >> not > > > > >> being what it should be. In our case, the 4.4 stable kernel, we are > > > > >> facing this 32 bit lower limit (than the real 32 bit real limit), > > > > >> because of the LTP CVE test, so we need this fix to have the real 32 > > > > >> bit limit set for that macro (mmap limits did not use that macro > > > > >> before). > > > > >> > > > > >> I have tested in arm32 and Jan Stancek, who first responded to LTP > > > > >> issue, has tested this in i686 and both worked after that patch was > >
Re: [LTP] [PATCH 4.4 00/24] 4.4.137-stable review
- Original Message - > On Thu, Jun 14, 2018 at 02:24:25PM +0530, Naresh Kamboju wrote: > > On 14 June 2018 at 12:04, Greg Kroah-Hartman > > wrote: > > > On Wed, Jun 13, 2018 at 10:48:50PM -0300, Rafael Tinoco wrote: > > >> On 13 June 2018 at 18:08, Rafael David Tinoco > > >> wrote: > > >> > On Wed, Jun 13, 2018 at 6:00 PM, Greg Kroah-Hartman > > >> > wrote: > > >> >> On Wed, Jun 13, 2018 at 05:47:49PM -0300, Rafael Tinoco wrote: > > >> >>> Results from Linaro’s test farm. > > >> >>> Regressions detected. > > >> >>> > > >> >>> NOTE: > > >> >>> > > >> >>> 1) LTP vma03 test (cve-2011-2496) broken on v4.4-137-rc1 because of: > > >> >>> > > >> >>> 6ea1dc96a03a mmap: relax file size limit for regular files > > >> >>> bd2f9ce5bacb mmap: introduce sane default mmap limits > > >> >>> > > >> >>>discussion: > > >> >>> > > >> >>> https://github.com/linux-test-project/ltp/issues/341 > > >> >>> > > >> >>>mainline commit (v4.13-rc7): > > >> >>> > > >> >>> 0cc3b0ec23ce Clarify (and fix) MAX_LFS_FILESIZE macros > > >> >>> > > >> >>>should be backported to 4.4.138-rc2 and fixes the issue. > > >> >> > > >> >> Really? That commit says it fixes c2a9737f45e2 ("vfs,mm: fix a dead > > >> >> loop in truncate_inode_pages_range()") which is not in 4.4.y at all. > > >> >> > > >> >> Did you test this out? > > >> > > > >> > Yes, the LTP contains the tests (last comment is the final test for > > >> > arm32, right before Jan tests i686). > > >> > > > >> > Fixing MAX_LFS_FILESIZE fixes the new limit for mmap() brought by > > >> > those 2 commits (file_mmap_size_max()). > > >> > offset tested by the LTP test is 0xfffe000. > > >> > file_mmap_size_max gives: 0x000 as max value, but only after > > >> > the mentioned patch. > > >> > > > >> > Original intent for this fix was other though. > > >> > > >> To clarify this a bit further. > > >> > > >> The LTP CVE test is breaking in the first call to mmap(), even before > > >> trying to remap and test the security issue. 
That start happening in > > >> this round because of those mmap() changes and the offset used in the > > >> LTP test. Linus changed limit checks and made them to be related to > > >> MAX_LFS_FILESIZE. Unfortunately, in 4.4 stable, we were missing the > > >> fix for MAX_LFS_FILESIZE (which before commit 0cc3b0ec23ce was less > > >> than the REAL 32 bit limit). > > >> > > >> Commit 0cc3b0ec23ce was made because an user noticed the FS limit not > > >> being what it should be. In our case, the 4.4 stable kernel, we are > > >> facing this 32 bit lower limit (than the real 32 bit real limit), > > >> because of the LTP CVE test, so we need this fix to have the real 32 > > >> bit limit set for that macro (mmap limits did not use that macro > > >> before). > > >> > > >> I have tested in arm32 and Jan Stancek, who first responded to LTP > > >> issue, has tested this in i686 and both worked after that patch was > > >> included to v4.4-137-rc1 (my last test was even with 4.4.138-rc1). > > >> > > >> Hope that helps a bit. > > > > > > Ok, thanks, it didn't apply cleanly but I've fixed it up now. > > > > On the latest 4.4.138-rc1, > > LTP "cve-2011-2496" test still fails on arm32 beagleboard x15 and qemu_arm. > > > > Summary > > > > kernel: 4.4.138-rc1 > > git repo: > > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git > > git branch: linux-4.4.y > > git commit: 7d690c56754ef7be647fbcf7bcdceebd59926b3f > > git describe: v4.4.137-15-g7d690c56754e > > Test details: > > https://qa-reports.linaro.org/lkft/linux-stable-rc-4.4-oe/build/v4.4.137-15-g7d690c56754e > > Ok, but what does this mean? Is there a commit somewhere that I need to > pick up for 4.4.y that is already in newer kernels? Hi Greg, I think the expectations was that: 0cc3b0ec23ce Clarify (and fix) MAX_LFS_FILESIZE macros has been included to linux-4.4.y HEAD, so they re-ran the tests. Report from Naresh above looks like original report: LTP vma03 is cve-2011-2496 test. 
Regards, Jan > > I have no idea what that cve is, as I never track them, and it's > something that was reported to predate the 4.4 kernel release :) > > thanks, > > greg k-h > > -- > Mailing list info: https://lists.linux.it/listinfo/ltp >
[PATCH] virtio_balloon: fix increment of vb->num_pfns in fill_balloon()
commit c7cdff0e8647 ("virtio_balloon: fix deadlock on OOM") changed code to increment vb->num_pfns before call to set_page_pfns(), which used to happen only after. This patch fixes boot hang for me on ppc64le KVM guests. Fixes: c7cdff0e8647 ("virtio_balloon: fix deadlock on OOM") Cc: Michael S. Tsirkin Cc: Tetsuo Handa Cc: Michal Hocko Cc: Wei Wang Signed-off-by: Jan Stancek --- drivers/virtio/virtio_balloon.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 7960746f7597..a1fb52cb3f0a 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -174,13 +174,12 @@ static unsigned fill_balloon(struct virtio_balloon *vb, size_t num) while ((page = balloon_page_pop(&pages))) { balloon_page_enqueue(&vb->vb_dev_info, page); - vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE; - set_page_pfns(vb, vb->pfns + vb->num_pfns, page); vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE; if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) adjust_managed_page_count(page, -1); + vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE; } num_allocated_pages = vb->num_pfns; -- 1.8.3.1
Re: [Patch V3] crypto: x86/sha1 : Fix reads beyond the number of blocks passed
- Original Message - > On Wed, 2017-08-02 at 10:13 -0700, Megha Dey wrote: > > It was reported that the sha1 AVX2 function(sha1_transform_avx2) is > > reading ahead beyond its intended data, and causing a crash if the next > > block is beyond page boundary: > > http://marc.info/?l=linux-crypto-vger&m=149373371023377 > > > > This patch makes sure that there is no overflow for any buffer length. > > > > It passes the tests written by Jan Stancek that revealed this problem: > > https://github.com/jstancek/sha1-avx2-crash > > Hi Jan, > > Is it ok to add your Tested-by? Yes, v3 patch is exactly the diff I was testing. Regards, Jan
Re: [Patch V2] crypto: x86/sha1 : Fix reads beyond the number of blocks passed
On 08/02/2017 02:38 AM, Megha Dey wrote: > It was reported that the sha1 AVX2 function(sha1_transform_avx2) is > reading ahead beyond its intended data, and causing a crash if the next > block is beyond page boundary: > http://marc.info/?l=linux-crypto-vger&m=149373371023377 > > This patch makes sure that there is no overflow for any buffer length. > > It passes the tests written by Jan Stancek that revealed this problem: > https://github.com/jstancek/sha1-avx2-crash > > Jan, can you verify this fix? Hi, Looks good, my tests (below) PASS-ed. I updated reproducer at [1] to try more than just single scenario. It now tries thousands of combinations with different starting offset within page and sizes of data chunks. (without patch) v4.13-rc3-102-g26c5ceb with avx2 enabled: [ 106.455175] sha_test module loaded [ 106.460458] data is at 0x8c88e58f8000, page_after_data: 0x8c88e58fa000 [ 106.468625] sha_test testing hash: sha1-generic, maxlen: 8192 [ 127.091237] sha_test hash: sha1-generic calculated 1764032 hashes [ 127.098038] sha_test testing hash: sha1-ni, maxlen: 8192 [ 127.108805] failed to alloc sha1-ni [ 127.112703] sha_test testing hash: sha1-avx, maxlen: 8192 [ 141.166611] sha_test hash: sha1-avx calculated 1764032 hashes [ 141.173023] sha_test testing hash: sha1-avx2, maxlen: 8192 [ 141.180046] BUG: unable to handle kernel paging request at 8c88e58fa000 [ 141.187817] IP: _begin+0x28/0x187 [ 141.191512] PGD 89c578067 [ 141.191513] P4D 89c578067 [ 141.194529] PUD c61f0a063 [ 141.197545] PMD c64cb1063 [ 141.200561] PTE 800c658fa062 [ 141.203577] [ 141.208832] Oops: [#1] SMP ... 
With patch "[Patch V2] crypto: x86/sha1 : Fix reads beyond the number of blocks passed": [ 406.323781] sha_test module loaded [ 406.329062] data is at 0x93ab97108000, page_after_data: 0x93ab9710a000 [ 406.337185] sha_test testing hash: sha1-generic, maxlen: 8192 [ 426.157242] sha_test hash: sha1-generic calculated 1764032 hashes [ 426.164043] sha_test testing hash: sha1-ni, maxlen: 8192 [ 426.174244] failed to alloc sha1-ni [ 426.178139] sha_test testing hash: sha1-avx, maxlen: 8192 [ 440.226240] sha_test hash: sha1-avx calculated 1764032 hashes [ 440.232651] sha_test testing hash: sha1-avx2, maxlen: 8192 [ 452.497988] sha_test hash: sha1-avx2 calculated 1764032 hashes [ 452.504495] sha_test testing hash: sha1-ssse3, maxlen: 8192 [ 467.319316] sha_test hash: sha1-ssse3 calculated 1764032 hashes [ 467.325922] sha_test done [ 471.827083] sha_test module unloaded I also ran a test at [2], which is comparing hashes produced by kernel with hashes produced by user-space utilities (e.g. 'sha1sum'). This test also PASS-ed. Regards, Jan [1] https://github.com/jstancek/sha1-avx2-crash [2] https://github.com/jstancek/kernel_sha_test/tree/sha1-avx2-4.13 > Herbert, can you re-enable sha1-avx2 once Jan has checked it out and > revert commit b82ce24426a4071da9529d726057e4e642948667 ? > > Originally-by: Ilya Albrekht > Signed-off-by: Megha Dey > Reported-by: Jan Stancek
Re: [PATCH 0/2] key payload access with just rcu_read_lock()
- Original Message - > From: "David Howells" > To: "Jan Stancek" > Cc: dhowe...@redhat.com, linux-kernel@vger.kernel.org, > linux-...@vger.kernel.org, bcodd...@redhat.com, > asav...@redhat.com > Sent: Wednesday, 1 March, 2017 10:40:13 AM > Subject: Re: [PATCH 0/2] key payload access with just rcu_read_lock() > > Jan Stancek wrote: > > > That problem didn't show up with my NFS based reproducer. > > I re-run it again with latest version of your patch, plus also > > keyutils testsuite. Both completed OK for me, dmesg looks clean. > > Can I put you down as a Tested-by? Yes. Thanks for having a look at this. Regards, Jan
Re: [PATCH 0/2] key payload access with just rcu_read_lock()
- Original Message - > Here's an updated patch with fixed user_key_payload_locked() and user_read(). > That problem didn't show up with my NFS based reproducer. I re-ran it again with the latest version of your patch, plus also the keyutils testsuite. Both completed OK for me, dmesg looks clean. Regards, Jan
Re: [PATCH 0/2] key payload access with just rcu_read_lock()
- Original Message - > Jan Stancek wrote: > > > Looks like there are still couple users that need updating, > > I'm hitting following compilation error: > > Aargh - I remembered to grep for rcu_dereference_key() but not > user_key_payload(). > > How about the attached? v2 builds OK (on top of 4.10), "suspicious RCU usage" warning is gone and NFS connectathon is PASS-ing cleanly. Regards, Jan
Re: [PATCH 0/2] key payload access with just rcu_read_lock()
- Original Message - > From: "David Howells" > To: "Jan Stancek" > Cc: dhowe...@redhat.com, linux-kernel@vger.kernel.org, > linux-...@vger.kernel.org, bcodd...@redhat.com, > asav...@redhat.com > Sent: Monday, 27 February, 2017 11:04:21 PM > Subject: Re: [PATCH 0/2] key payload access with just rcu_read_lock() > > Jan Stancek wrote: > > > this is a follow-up for "suspicious RCU usage" warning described > > in these 2 linux-nfs threads: > > http://marc.info/?t=14755883033&r=1&w=2 > > http://marc.info/?t=14877677051&r=1&w=2 > > > > Did you have something like in mind? > > How about the attached? It's similar to what you did, but I made the split > functions different from the original so that all users have to reconsider > what they actually want. > > David Looks like there are still couple users that need updating, I'm hitting following compilation error: ... CC [M] fs/cifs/connect.o fs/cifs/connect.c: In function ‘cifs_set_cifscreds’: fs/cifs/connect.c:2442:2: error: implicit declaration of function ‘user_key_payload’ [-Werror=implicit-function-declaration] upayload = user_key_payload(key); ^ fs/cifs/connect.c:2442:11: error: assignment makes pointer from integer without a cast [-Werror] upayload = user_key_payload(key); ^ # grep "user_key_payload(" -r . --include=*.c ./drivers/md/dm-crypt.c:ukp = user_key_payload(key); ./fs/cifs/connect.c:upayload = user_key_payload(key); ./fs/crypto/keyinfo.c: ukp = user_key_payload(keyring_key); ./lib/digsig.c: ukp = user_key_payload(key); # grep "user_key_payload(" -r . --include=*.h ./fs/ecryptfs/ecryptfs_kernel.h:return (struct ecryptfs_auth_tok *)user_key_payload(key)->data; Regards, Jan
[PATCH 1/2] KEYS: add user_key_payload_rcu()
user_key_payload() is wrapper for rcu_dereference_protected(), and can't be used with just rcu_read_lock() held. This patch adds user_key_payload_rcu() for accessing key payload in RCU read-side section, without the need to hold key semaphore. Signed-off-by: Jan Stancek --- Documentation/security/keys.txt | 5 + include/keys/user-type.h| 5 + include/linux/key.h | 3 +++ 3 files changed, 13 insertions(+) diff --git a/Documentation/security/keys.txt b/Documentation/security/keys.txt index 3849814bfe6d..da89a854edd2 100644 --- a/Documentation/security/keys.txt +++ b/Documentation/security/keys.txt @@ -1152,8 +1152,13 @@ access the data: wrap the RCU calls to this element: rcu_assign_keypointer(struct key *key, void *data); + + /* access payload with semaphore held */ void *rcu_dereference_key(struct key *key); + /* access payload in RCU read-side section*/ + void *rcu_read_dereference_key(struct key *key); + === DEFINING A KEY TYPE diff --git a/include/keys/user-type.h b/include/keys/user-type.h index c56fef40f53e..521bf3369904 100644 --- a/include/keys/user-type.h +++ b/include/keys/user-type.h @@ -53,6 +53,11 @@ static inline const struct user_key_payload *user_key_payload(const struct key * return (struct user_key_payload *)rcu_dereference_key(key); } +static inline const struct user_key_payload *user_key_payload_rcu(const struct key *key) +{ + return (struct user_key_payload *)rcu_read_dereference_key(key); +} + #endif /* CONFIG_KEYS */ #endif /* _KEYS_USER_TYPE_H */ diff --git a/include/linux/key.h b/include/linux/key.h index 722914798f37..b6a8c5896761 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -358,6 +358,9 @@ static inline bool key_is_instantiated(const struct key *key) (rcu_dereference_protected((KEY)->payload.rcu_data0,\ rwsem_is_locked(&((struct key *)(KEY))->sem))) +#define rcu_read_dereference_key(KEY) \ + (rcu_dereference((KEY)->payload.rcu_data0)) + #define rcu_assign_keypointer(KEY, PAYLOAD)\ do { \ 
rcu_assign_pointer((KEY)->payload.rcu_data0, (PAYLOAD));\ -- 1.8.3.1
[PATCH 2/2] NFS: use user_key_payload_rcu() in RCU read-side section
Use RCU read-side accessor for key payload, as calling user_key_payload() without semaphore held leads to following warning: === [ INFO: suspicious RCU usage. ] 4.10.0 #1 Tainted: GW --- ./include/keys/user-type.h:53 suspicious rcu_dereference_protected() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 0 1 lock held by mount.nfs/5987: #0: (rcu_read_lock){..}, at: [] nfs_idmap_get_key+0x15c/0x420 [nfsv4] stack backtrace: CPU: 1 PID: 5987 Comm: mount.nfs Tainted: GW 4.10.0 #1 Call Trace: dump_stack+0xe8/0x154 (unreliable) lockdep_rcu_suspicious+0x140/0x190 nfs_idmap_get_key+0x380/0x420 [nfsv4] nfs_map_name_to_uid+0x2a0/0x3b0 [nfsv4] decode_getfattr_attrs+0xfac/0x16b0 [nfsv4] decode_getfattr_generic.constprop.106+0xbc/0x150 [nfsv4] nfs4_xdr_dec_lookup_root+0xac/0xb0 [nfsv4] rpcauth_unwrap_resp+0xe8/0x140 [sunrpc] call_decode+0x29c/0x910 [sunrpc] __rpc_execute+0x140/0x8f0 [sunrpc] rpc_run_task+0x170/0x200 [sunrpc] nfs4_call_sync_sequence+0x68/0xa0 [nfsv4] _nfs4_lookup_root.isra.44+0xd0/0xf0 [nfsv4] nfs4_lookup_root+0xe0/0x350 [nfsv4] nfs4_lookup_root_sec+0x70/0xa0 [nfsv4] nfs4_find_root_sec+0xc4/0x100 [nfsv4] nfs4_proc_get_rootfh+0x5c/0xf0 [nfsv4] nfs4_get_rootfh+0x6c/0x190 [nfsv4] nfs4_server_common_setup+0xc4/0x260 [nfsv4] nfs4_create_server+0x278/0x3c0 [nfsv4] nfs4_remote_mount+0x50/0xb0 [nfsv4] mount_fs+0x74/0x210 vfs_kern_mount+0x78/0x220 nfs_do_root_mount+0xb0/0x140 [nfsv4] nfs4_try_mount+0x60/0x100 [nfsv4] nfs_fs_mount+0x5ec/0xda0 [nfs] mount_fs+0x74/0x210 vfs_kern_mount+0x78/0x220 do_mount+0x254/0xf70 SyS_mount+0x94/0x100 system_call+0x38/0xe0 Signed-off-by: Jan Stancek --- fs/nfs/nfs4idmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index c444285bb1b1..835c163f61af 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -316,7 +316,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, if (ret < 0) goto out_up; - payload = 
user_key_payload(rkey); + payload = user_key_payload_rcu(rkey); if (IS_ERR_OR_NULL(payload)) { ret = PTR_ERR(payload); goto out_up; -- 1.8.3.1
[PATCH 0/2] key payload access with just rcu_read_lock()
Hi David, this is a follow-up to the "suspicious RCU usage" warning described in these 2 linux-nfs threads: http://marc.info/?t=14755883033&r=1&w=2 http://marc.info/?t=14877677051&r=1&w=2 Did you have something like this in mind? Thanks, Jan Jan Stancek (2): KEYS: add user_key_payload_rcu() NFS: use user_key_payload_rcu() in RCU read-side section Documentation/security/keys.txt | 5 + fs/nfs/nfs4idmap.c | 2 +- include/keys/user-type.h| 5 + include/linux/key.h | 3 +++ 4 files changed, 14 insertions(+), 1 deletion(-) -- 1.8.3.1
[tip:perf/urgent] perf header: Make build_cpu_topology skip offline/absent CPUs
Commit-ID: 43db2843a4a41cc8cdb6ab696639aeee1f4d5062 Gitweb: http://git.kernel.org/tip/43db2843a4a41cc8cdb6ab696639aeee1f4d5062 Author: Jan Stancek AuthorDate: Fri, 17 Feb 2017 12:10:25 +0100 Committer: Arnaldo Carvalho de Melo CommitDate: Fri, 17 Feb 2017 12:37:04 -0300 perf header: Make build_cpu_topology skip offline/absent CPUs When build_cpu_topo() encounters offline/absent CPUs, it fails to find any sysfs entries and returns failure. This leads to build_cpu_topology() and write_cpu_topology() failing as well. Because HEADER_CPU_TOPOLOGY has not been written, read leaves cpu_topology_map NULL and we get NULL ptr deref at: ... cmd_test __cmd_test test_and_print run_test test_session_topology check_cpu_topology 36: Session topology : --- start --- test child forked, pid 14902 templ file: /tmp/perf-test-4CKocW failed to write feature HEADER_CPU_TOPOLOGY perf: Segmentation fault Obtained 9 stack frames. ./perf(sighandler_dump_stack+0x41) [0x5095f1] /lib64/libc.so.6(+0x35250) [0x7f4b7c3c9250] ./perf(test_session_topology+0x1db) [0x490ceb] ./perf() [0x475b68] ./perf(cmd_test+0x5b9) [0x4763c9] ./perf() [0x4945a3] ./perf(main+0x69f) [0x427e8f] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f4b7c3b5b35] ./perf() [0x427fb9] test child interrupted end Session topology: FAILED! This patch makes build_cpu_topology() skip offline/absent CPUs, by checking their presence against cpu_map built from online CPUs. 
Signed-off-by: Jan Stancek Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/a271b770175524f4961d4903af33798358a4a518.1487146877.git.jstan...@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 19 --- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 3d12c16..1222f6c 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -505,24 +505,31 @@ static void free_cpu_topo(struct cpu_topo *tp) static struct cpu_topo *build_cpu_topology(void) { - struct cpu_topo *tp; + struct cpu_topo *tp = NULL; void *addr; u32 nr, i; size_t sz; long ncpus; int ret = -1; + struct cpu_map *map; ncpus = sysconf(_SC_NPROCESSORS_CONF); if (ncpus < 0) return NULL; + /* build online CPU map */ + map = cpu_map__new(NULL); + if (map == NULL) { + pr_debug("failed to get system cpumap\n"); + return NULL; + } + nr = (u32)(ncpus & UINT_MAX); sz = nr * sizeof(char *); - addr = calloc(1, sizeof(*tp) + 2 * sz); if (!addr) - return NULL; + goto out_free; tp = addr; tp->cpu_nr = nr; @@ -532,10 +539,16 @@ static struct cpu_topo *build_cpu_topology(void) tp->thread_siblings = addr; for (i = 0; i < nr; i++) { + if (!cpu_map__has(map, i)) + continue; + ret = build_cpu_topo(tp, i); if (ret < 0) break; } + +out_free: + cpu_map__put(map); if (ret) { free_cpu_topo(tp); tp = NULL;
[tip:perf/urgent] perf cpumap: Add cpu__max_present_cpu()
Commit-ID: 92a7e1278005b6bb3459affc50b2b6e2464e7e7c Gitweb: http://git.kernel.org/tip/92a7e1278005b6bb3459affc50b2b6e2464e7e7c Author: Jan Stancek AuthorDate: Fri, 17 Feb 2017 12:10:24 +0100 Committer: Arnaldo Carvalho de Melo CommitDate: Fri, 17 Feb 2017 12:33:05 -0300 perf cpumap: Add cpu__max_present_cpu() Similar to cpu__max_cpu() (which returns the max possible CPU), returns the max present CPU. Signed-off-by: Jan Stancek Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/8ea4601b5cacc49927235b4ebac424bd6eeccb06.1487146877.git.jstan...@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/cpumap.c | 22 ++ tools/perf/util/cpumap.h | 1 + 2 files changed, 23 insertions(+) diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 2c0b522..8c75049 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -9,6 +9,7 @@ #include "asm/bug.h" static int max_cpu_num; +static int max_present_cpu_num; static int max_node_num; static int *cpunode_map; @@ -442,6 +443,7 @@ static void set_max_cpu_num(void) /* set up default */ max_cpu_num = 4096; + max_present_cpu_num = 4096; mnt = sysfs__mountpoint(); if (!mnt) @@ -455,6 +457,17 @@ static void set_max_cpu_num(void) } ret = get_max_num(path, &max_cpu_num); + if (ret) + goto out; + + /* get the highest present cpu number for a sparse allocation */ + ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt); + if (ret == PATH_MAX) { + pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX); + goto out; + } + + ret = get_max_num(path, &max_present_cpu_num); out: if (ret) @@ -505,6 +518,15 @@ int cpu__max_cpu(void) return max_cpu_num; } +int cpu__max_present_cpu(void) +{ + if (unlikely(!max_present_cpu_num)) + set_max_cpu_num(); + + return max_present_cpu_num; +} + + int cpu__get_node(int cpu) { if (unlikely(cpunode_map == NULL)) { diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 
06bd689..1a0549a 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -62,6 +62,7 @@ int cpu__setup_cpunode_map(void); int cpu__max_node(void); int cpu__max_cpu(void); +int cpu__max_present_cpu(void); int cpu__get_node(int cpu); int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res,
[tip:perf/urgent] perf tools: Replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map
Commit-ID: da8a58b56c661681f9b2fd2fa59c6da3a5bac8d1 Gitweb: http://git.kernel.org/tip/da8a58b56c661681f9b2fd2fa59c6da3a5bac8d1 Author: Jan Stancek AuthorDate: Fri, 17 Feb 2017 12:10:26 +0100 Committer: Arnaldo Carvalho de Melo CommitDate: Fri, 17 Feb 2017 12:56:35 -0300 perf tools: Replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs: 1. offline/absent CPUs will have their socket_id and core_id set to -1 which triggers: "socket_id number is too big.You may need to upgrade the perf tool." 2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above. Users of perf_env.cpu[] are using CPU id as index. This can lead to read beyond what was allocated: ==19991== Invalid read of size 4 ==19991==at 0x490CEB: check_cpu_topology (topology.c:69) ==19991==by 0x490CEB: test_session_topology (topology.c:106) ... For example: _SC_NPROCESSORS_CONF == 16 available: 2 nodes (0-1) node 0 cpus: 0 6 8 10 16 22 24 26 node 0 size: 12004 MB node 0 free: 9470 MB node 1 cpus: 1 7 9 11 23 25 27 node 1 size: 12093 MB node 1 free: 9406 MB node distances: node 0 1 0: 10 20 1: 20 10 This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF to max_present_cpu and updates any user of cpu_topology_map to iterate with nr_cpus_avail. As a consequence HEADER_CPU_TOPOLOGY core_id and socket_id lists get longer, but maintain compatibility with pre-patch state - index to cpu_topology_map is CPU id. 
perf test 36 -v 36: Session topology : --- start --- test child forked, pid 22211 templ file: /tmp/perf-test-gmdX5i CPU 0, core 0, socket 0 CPU 1, core 0, socket 1 CPU 6, core 10, socket 0 CPU 7, core 10, socket 1 CPU 8, core 1, socket 0 CPU 9, core 1, socket 1 CPU 10, core 9, socket 0 CPU 11, core 9, socket 1 CPU 16, core 0, socket 0 CPU 22, core 10, socket 0 CPU 23, core 10, socket 1 CPU 24, core 1, socket 0 CPU 25, core 1, socket 1 CPU 26, core 9, socket 0 CPU 27, core 9, socket 1 test child finished with 0 end Session topology: Ok Signed-off-by: Jan Stancek Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Masami Hiramatsu Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/d7c05c6445fca74a8442c2c73cfffd349c52c44f.1487146877.git.jstan...@redhat.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 2 +- tools/perf/tests/topology.c | 4 +++- tools/perf/util/env.c | 2 +- tools/perf/util/header.c| 16 +--- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index f287191..ca27a8a 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, struct cpu_map *map, i cpu = map->map[idx]; - if (cpu >= env->nr_cpus_online) + if (cpu >= env->nr_cpus_avail) return -1; return cpu; diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 98fe69a..803f893 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map) session = perf_session__new(&file, false, NULL); TEST_ASSERT_VAL("can't get session", session); - for (i = 0; i < session->header.env.nr_cpus_online; i++) { + for (i = 0; i < session->header.env.nr_cpus_avail; i++) { + if (!cpu_map__has(map, i)) + continue; pr_debug("CPU %d, core %d, socket %d\n", i, session->header.env.cpu[i].core_id, session->header.env.cpu[i].socket_id); diff --git 
a/tools/perf/util/env.c b/tools/perf/util/env.c index bb964e8..075fc77 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct perf_env *env) return 0; if (env->nr_cpus_avail == 0) - env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF); + env->nr_cpus_avail = cpu__max_present_cpu(); nr_cpus = env->nr_cpus_avail; if (nr_cpus == -1) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 1222f6c..05714d5 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -295,11 +295,7 @@ static int write_nrcpus(int fd, struct perf_header *h __maybe_unused, u32 nrc, nra; int ret; - nr = sysconf(_SC_NPROCESSORS_CONF); - if (nr < 0) - return -1; - - nrc = (u32)(nr & UINT_MAX); + nrc = cpu__max_present_cpu(); nr = sy
[PATCH v4 3/3] perf: Replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map
There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs: 1. offline/absent CPUs will have their socket_id and core_id set to -1 which triggers: "socket_id number is too big.You may need to upgrade the perf tool." 2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above. Users of perf_env.cpu[] are using CPU id as index. This can lead to read beyond what was allocated: ==19991== Invalid read of size 4 ==19991==at 0x490CEB: check_cpu_topology (topology.c:69) ==19991==by 0x490CEB: test_session_topology (topology.c:106) ... For example: _SC_NPROCESSORS_CONF == 16 available: 2 nodes (0-1) node 0 cpus: 0 6 8 10 16 22 24 26 node 0 size: 12004 MB node 0 free: 9470 MB node 1 cpus: 1 7 9 11 23 25 27 node 1 size: 12093 MB node 1 free: 9406 MB node distances: node 0 1 0: 10 20 1: 20 10 This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF to max_present_cpu and updates any user of cpu_topology_map to iterate with nr_cpus_avail. As consequence HEADER_CPU_TOPOLOGY core_id and socket_id lists get longer, but maintain compatibility with pre-patch state - index to cpu_topology_map is CPU id. 
# ./perf test 36 -v 36: Session topology : --- start --- test child forked, pid 22211 templ file: /tmp/perf-test-gmdX5i CPU 0, core 0, socket 0 CPU 1, core 0, socket 1 CPU 6, core 10, socket 0 CPU 7, core 10, socket 1 CPU 8, core 1, socket 0 CPU 9, core 1, socket 1 CPU 10, core 9, socket 0 CPU 11, core 9, socket 1 CPU 16, core 0, socket 0 CPU 22, core 10, socket 0 CPU 23, core 10, socket 1 CPU 24, core 1, socket 0 CPU 25, core 1, socket 1 CPU 26, core 9, socket 0 CPU 27, core 9, socket 1 test child finished with 0 end Session topology: Ok Signed-off-by: Jan Stancek --- tools/perf/builtin-stat.c | 2 +- tools/perf/tests/topology.c | 4 +++- tools/perf/util/env.c | 2 +- tools/perf/util/header.c| 20 +--- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a02f2e965628..9e2be1c52fbd 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, struct cpu_map *map, i cpu = map->map[idx]; - if (cpu >= env->nr_cpus_online) + if (cpu >= env->nr_cpus_avail) return -1; return cpu; diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 98fe69ac553c..803f893550d6 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map) session = perf_session__new(&file, false, NULL); TEST_ASSERT_VAL("can't get session", session); - for (i = 0; i < session->header.env.nr_cpus_online; i++) { + for (i = 0; i < session->header.env.nr_cpus_avail; i++) { + if (!cpu_map__has(map, i)) + continue; pr_debug("CPU %d, core %d, socket %d\n", i, session->header.env.cpu[i].core_id, session->header.env.cpu[i].socket_id); diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index bb964e86b09d..075fc77286bf 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct 
perf_env *env) return 0; if (env->nr_cpus_avail == 0) - env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF); + env->nr_cpus_avail = cpu__max_present_cpu(); nr_cpus = env->nr_cpus_avail; if (nr_cpus == -1) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index ca0f12fb5fd3..e97cbf78ef21 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -293,11 +293,7 @@ static int write_nrcpus(int fd, struct perf_header *h __maybe_unused, u32 nrc, nra; int ret; - nr = sysconf(_SC_NPROCESSORS_CONF); - if (nr < 0) - return -1; - - nrc = (u32)(nr & UINT_MAX); + nrc = cpu__max_present_cpu(); nr = sysconf(_SC_NPROCESSORS_ONLN); if (nr < 0) @@ -511,9 +507,7 @@ static struct cpu_topo *build_cpu_topology(void) int ret = -1; struct cpu_map *map; - ncpus = sysconf(_SC_NPROCESSORS_CONF); - if (ncpus < 0) - return NULL; + ncpus = cpu__max_present_cpu(); /* build online CPU map */ map = cpu_map__new(NULL); @@ -1137,7 +1131,7 @@ static void print_cpu_topology(struct perf_header *ph, int fd __maybe_unused, { int nr, i; char *str; - int cpu_nr = ph->env
[PATCH v4 2/3] perf header: Make build_cpu_topology skip offline/absent CPUs
When build_cpu_topo() encounters offline/absent CPUs, it fails to find any sysfs entries and returns failure. This leads to build_cpu_topology() and write_cpu_topology() failing as well. Because HEADER_CPU_TOPOLOGY has not been written, read leaves cpu_topology_map NULL and we get NULL ptr deref at: ... cmd_test __cmd_test test_and_print run_test test_session_topology check_cpu_topology # perf test 36 36: Session topology : --- start --- test child forked, pid 14902 templ file: /tmp/perf-test-4CKocW failed to write feature HEADER_CPU_TOPOLOGY perf: Segmentation fault Obtained 9 stack frames. ./perf(sighandler_dump_stack+0x41) [0x5095f1] /lib64/libc.so.6(+0x35250) [0x7f4b7c3c9250] ./perf(test_session_topology+0x1db) [0x490ceb] ./perf() [0x475b68] ./perf(cmd_test+0x5b9) [0x4763c9] ./perf() [0x4945a3] ./perf(main+0x69f) [0x427e8f] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f4b7c3b5b35] ./perf() [0x427fb9] test child interrupted end Session topology: FAILED! This patch makes build_cpu_topology() skip offline/absent CPUs, by checking their presence against cpu_map built from online CPUs. 
Signed-off-by: Jan Stancek --- tools/perf/util/header.c | 19 --- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index d89c9c7ef4e5..ca0f12fb5fd3 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -503,24 +503,31 @@ static void free_cpu_topo(struct cpu_topo *tp) static struct cpu_topo *build_cpu_topology(void) { - struct cpu_topo *tp; + struct cpu_topo *tp = NULL; void *addr; u32 nr, i; size_t sz; long ncpus; int ret = -1; + struct cpu_map *map; ncpus = sysconf(_SC_NPROCESSORS_CONF); if (ncpus < 0) return NULL; + /* build online CPU map */ + map = cpu_map__new(NULL); + if (map == NULL) { + pr_debug("failed to get system cpumap\n"); + return NULL; + } + nr = (u32)(ncpus & UINT_MAX); sz = nr * sizeof(char *); - addr = calloc(1, sizeof(*tp) + 2 * sz); if (!addr) - return NULL; + goto out_free; tp = addr; tp->cpu_nr = nr; @@ -530,10 +537,16 @@ static struct cpu_topo *build_cpu_topology(void) tp->thread_siblings = addr; for (i = 0; i < nr; i++) { + if (!cpu_map__has(map, i)) + continue; + ret = build_cpu_topo(tp, i); if (ret < 0) break; } + +out_free: + cpu_map__put(map); if (ret) { free_cpu_topo(tp); tp = NULL; -- 1.8.3.1
[PATCH v4 1/3] perf cpu_map: Add cpu__max_present_cpu()
Similar to cpu__max_cpu() (which returns max possible CPU), returns max present CPU. Signed-off-by: Jan Stancek --- tools/perf/util/cpumap.c | 22 ++ tools/perf/util/cpumap.h | 1 + 2 files changed, 23 insertions(+) diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 2c0b52264a46..8c7504939113 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -9,6 +9,7 @@ #include "asm/bug.h" static int max_cpu_num; +static int max_present_cpu_num; static int max_node_num; static int *cpunode_map; @@ -442,6 +443,7 @@ static void set_max_cpu_num(void) /* set up default */ max_cpu_num = 4096; + max_present_cpu_num = 4096; mnt = sysfs__mountpoint(); if (!mnt) @@ -455,6 +457,17 @@ static void set_max_cpu_num(void) } ret = get_max_num(path, &max_cpu_num); + if (ret) + goto out; + + /* get the highest present cpu number for a sparse allocation */ + ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt); + if (ret == PATH_MAX) { + pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX); + goto out; + } + + ret = get_max_num(path, &max_present_cpu_num); out: if (ret) @@ -505,6 +518,15 @@ int cpu__max_cpu(void) return max_cpu_num; } +int cpu__max_present_cpu(void) +{ + if (unlikely(!max_present_cpu_num)) + set_max_cpu_num(); + + return max_present_cpu_num; +} + + int cpu__get_node(int cpu) { if (unlikely(cpunode_map == NULL)) { diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 06bd689f5989..1a0549af8f5c 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -62,6 +62,7 @@ static inline bool cpu_map__empty(const struct cpu_map *map) int cpu__max_node(void); int cpu__max_cpu(void); +int cpu__max_present_cpu(void); int cpu__get_node(int cpu); int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res, -- 1.8.3.1
[PATCH v4 0/3] perf: topology on systems with offline/absent CPUs
Changes in v4: - rebase on top of 4.10 - fix style of commit logs - add comment to process_cpu_topology() This series aims to address issues on systems with offline/absent CPUs: 1. cpu_topology_map size Allocation based on _SC_NPROCESSORS_CONF is too small, there can be CPU IDs that are higher, which causes invalid reads. 2. build_cpu_topology fails for offline/absent CPUs It is trying to parse sysfs entries, that do not exist. 3. perf topology test fails Changes affect HEADER_NRCPUS and HEADER_CPU_TOPOLOGY: For example on a system like this: _SC_NPROCESSORS_CONF == 16 available: 2 nodes (0-1) node 0 cpus: 0 6 8 10 16 22 24 26 node 0 size: 12004 MB node 0 free: 9470 MB node 1 cpus: 1 7 9 11 23 25 27 node 1 size: 12093 MB node 1 free: 9406 MB node distances: node 0 1 0: 10 20 1: 20 10 HEADER_NRCPUS before this series: nr_cpus_online = 15 nr_cpus_available = 16 HEADER_CPU_TOPOLOGY before this series: core_sib_nr: 2 core_siblings: 0,6,8,10,16,22,24,26 core_siblings: 1,7,9,11,23,25,27 thread_sib_nr: 8 thread_siblings: 0,16 thread_siblings: 1 thread_siblings: 6,22 thread_siblings: 7,23 thread_siblings: 8,24 thread_siblings: 9,25 thread_siblings: 10,26 thread_siblings: 11,27 core_id: 0, socket_id: 0 core_id: 0, socket_id: 1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: 10, socket_id: 0 core_id: 10, socket_id: 1 core_id: 1, socket_id: 0 core_id: 1, socket_id: 1 core_id: 9, socket_id: 0 core_id: 9, socket_id: 1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 HEADER_NRCPUS with this series applied: nr_cpus_online = 15 nr_cpus_available = 28 HEADER_CPU_TOPOLOGY with this series applied: core_sib_nr: 2 core_siblings: 0,6,8,10,16,22,24,26 core_siblings: 1,7,9,11,23,25,27 thread_sib_nr: 8 thread_siblings: 0,16 thread_siblings: 1 thread_siblings: 6,22 thread_siblings: 7,23 thread_siblings: 8,24 thread_siblings: 9,25 thread_siblings: 10,26 
thread_siblings: 11,27 core_id: 0, socket_id: 0 core_id: 0, socket_id: 1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: 10, socket_id: 0 core_id: 10, socket_id: 1 core_id: 1, socket_id: 0 core_id: 1, socket_id: 1 core_id: 9, socket_id: 0 core_id: 9, socket_id: 1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: 0, socket_id: 0 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: 10, socket_id: 0 core_id: 10, socket_id: 1 core_id: 1, socket_id: 0 core_id: 1, socket_id: 1 core_id: 9, socket_id: 0 core_id: 9, socket_id: 1 Jan Stancek (3): perf cpu_map: Add cpu__max_present_cpu() perf header: Make build_cpu_topology skip offline/absent CPUs perf: Replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map tools/perf/builtin-stat.c | 2 +- tools/perf/tests/topology.c | 4 +++- tools/perf/util/cpumap.c| 22 ++ tools/perf/util/cpumap.h| 1 + tools/perf/util/env.c | 2 +- tools/perf/util/header.c| 37 - 6 files changed, 52 insertions(+), 16 deletions(-) -- 1.8.3.1
[PATCH v3 2/3] perf: make build_cpu_topology skip offline/absent CPUs
When build_cpu_topo() encounters offline/absent CPUs, it fails to find any sysfs entries and returns failure. This leads to build_cpu_topology() and write_cpu_topology() failing as well. Because HEADER_CPU_TOPOLOGY has not been written, read leaves cpu_topology_map NULL and we get NULL ptr deref at: ... cmd_test __cmd_test test_and_print run_test test_session_topology check_cpu_topology 36: Session topology : --- start --- test child forked, pid 14902 templ file: /tmp/perf-test-4CKocW failed to write feature HEADER_CPU_TOPOLOGY perf: Segmentation fault Obtained 9 stack frames. ./perf(sighandler_dump_stack+0x41) [0x5095f1] /lib64/libc.so.6(+0x35250) [0x7f4b7c3c9250] ./perf(test_session_topology+0x1db) [0x490ceb] ./perf() [0x475b68] ./perf(cmd_test+0x5b9) [0x4763c9] ./perf() [0x4945a3] ./perf(main+0x69f) [0x427e8f] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f4b7c3b5b35] ./perf() [0x427fb9] test child interrupted end Session topology: FAILED! This patch makes build_cpu_topology() skip offline/absent CPUs, by checking their presence against cpu_map built from online CPUs. 
Signed-off-by: Jan Stancek --- tools/perf/util/header.c | 19 --- 1 file changed, 16 insertions(+), 3 deletions(-) Changes in v3: - make ret = -1 by default, so we fail in case build_cpu_topo doesn't run at all diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index d89c9c7ef4e5..ca0f12fb5fd3 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -503,24 +503,31 @@ static void free_cpu_topo(struct cpu_topo *tp) static struct cpu_topo *build_cpu_topology(void) { - struct cpu_topo *tp; + struct cpu_topo *tp = NULL; void *addr; u32 nr, i; size_t sz; long ncpus; int ret = -1; + struct cpu_map *map; ncpus = sysconf(_SC_NPROCESSORS_CONF); if (ncpus < 0) return NULL; + /* build online CPU map */ + map = cpu_map__new(NULL); + if (map == NULL) { + pr_debug("failed to get system cpumap\n"); + return NULL; + } + nr = (u32)(ncpus & UINT_MAX); sz = nr * sizeof(char *); - addr = calloc(1, sizeof(*tp) + 2 * sz); if (!addr) - return NULL; + goto out_free; tp = addr; tp->cpu_nr = nr; @@ -530,10 +537,16 @@ static struct cpu_topo *build_cpu_topology(void) tp->thread_siblings = addr; for (i = 0; i < nr; i++) { + if (!cpu_map__has(map, i)) + continue; + ret = build_cpu_topo(tp, i); if (ret < 0) break; } + +out_free: + cpu_map__put(map); if (ret) { free_cpu_topo(tp); tp = NULL; -- 1.8.3.1
[PATCH v3 1/3] perf: add cpu__max_present_cpu()
Similar to cpu__max_cpu() (which returns max possible CPU), returns max present CPU. Signed-off-by: Jan Stancek --- tools/perf/util/cpumap.c | 22 ++ tools/perf/util/cpumap.h | 1 + 2 files changed, 23 insertions(+) diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 2c0b52264a46..8c7504939113 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -9,6 +9,7 @@ #include "asm/bug.h" static int max_cpu_num; +static int max_present_cpu_num; static int max_node_num; static int *cpunode_map; @@ -442,6 +443,7 @@ static void set_max_cpu_num(void) /* set up default */ max_cpu_num = 4096; + max_present_cpu_num = 4096; mnt = sysfs__mountpoint(); if (!mnt) @@ -455,6 +457,17 @@ static void set_max_cpu_num(void) } ret = get_max_num(path, &max_cpu_num); + if (ret) + goto out; + + /* get the highest present cpu number for a sparse allocation */ + ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt); + if (ret == PATH_MAX) { + pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX); + goto out; + } + + ret = get_max_num(path, &max_present_cpu_num); out: if (ret) @@ -505,6 +518,15 @@ int cpu__max_cpu(void) return max_cpu_num; } +int cpu__max_present_cpu(void) +{ + if (unlikely(!max_present_cpu_num)) + set_max_cpu_num(); + + return max_present_cpu_num; +} + + int cpu__get_node(int cpu) { if (unlikely(cpunode_map == NULL)) { diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 06bd689f5989..1a0549af8f5c 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -62,6 +62,7 @@ static inline bool cpu_map__empty(const struct cpu_map *map) int cpu__max_node(void); int cpu__max_cpu(void); +int cpu__max_present_cpu(void); int cpu__get_node(int cpu); int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res, -- 1.8.3.1
[PATCH v3 3/3] perf: replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map
There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs: 1. offline/absent CPUs will have their socket_id and core_id set to -1 which triggers: "socket_id number is too big.You may need to upgrade the perf tool." 2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above. Users of perf_env.cpu[] are using CPU id as index. This can lead to read beyond what was allocated: ==19991== Invalid read of size 4 ==19991==at 0x490CEB: check_cpu_topology (topology.c:69) ==19991==by 0x490CEB: test_session_topology (topology.c:106) ... For example: _SC_NPROCESSORS_CONF == 16 available: 2 nodes (0-1) node 0 cpus: 0 6 8 10 16 22 24 26 node 0 size: 12004 MB node 0 free: 9470 MB node 1 cpus: 1 7 9 11 23 25 27 node 1 size: 12093 MB node 1 free: 9406 MB node distances: node 0 1 0: 10 20 1: 20 10 This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF to max_present_cpu and updates any user of cpu_topology_map to iterate with nr_cpus_avail. As consequence HEADER_CPU_TOPOLOGY core_id and socket_id lists get longer, but maintain compatibility with pre-patch state - index to cpu_topology_map is CPU id. 
./perf test 36 -v 36: Session topology : --- start --- test child forked, pid 22211 templ file: /tmp/perf-test-gmdX5i CPU 0, core 0, socket 0 CPU 1, core 0, socket 1 CPU 6, core 10, socket 0 CPU 7, core 10, socket 1 CPU 8, core 1, socket 0 CPU 9, core 1, socket 1 CPU 10, core 9, socket 0 CPU 11, core 9, socket 1 CPU 16, core 0, socket 0 CPU 22, core 10, socket 0 CPU 23, core 10, socket 1 CPU 24, core 1, socket 0 CPU 25, core 1, socket 1 CPU 26, core 9, socket 0 CPU 27, core 9, socket 1 test child finished with 0 end Session topology: Ok Signed-off-by: Jan Stancek --- tools/perf/builtin-stat.c | 2 +- tools/perf/tests/topology.c | 4 +++- tools/perf/util/env.c | 2 +- tools/perf/util/header.c| 16 +--- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a02f2e965628..9e2be1c52fbd 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, struct cpu_map *map, i cpu = map->map[idx]; - if (cpu >= env->nr_cpus_online) + if (cpu >= env->nr_cpus_avail) return -1; return cpu; diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 98fe69ac553c..803f893550d6 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map) session = perf_session__new(&file, false, NULL); TEST_ASSERT_VAL("can't get session", session); - for (i = 0; i < session->header.env.nr_cpus_online; i++) { + for (i = 0; i < session->header.env.nr_cpus_avail; i++) { + if (!cpu_map__has(map, i)) + continue; pr_debug("CPU %d, core %d, socket %d\n", i, session->header.env.cpu[i].core_id, session->header.env.cpu[i].socket_id); diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index bb964e86b09d..075fc77286bf 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct 
perf_env *env) return 0; if (env->nr_cpus_avail == 0) - env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF); + env->nr_cpus_avail = cpu__max_present_cpu(); nr_cpus = env->nr_cpus_avail; if (nr_cpus == -1) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index ca0f12fb5fd3..3a79a0f10bf6 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -293,11 +293,7 @@ static int write_nrcpus(int fd, struct perf_header *h __maybe_unused, u32 nrc, nra; int ret; - nr = sysconf(_SC_NPROCESSORS_CONF); - if (nr < 0) - return -1; - - nrc = (u32)(nr & UINT_MAX); + nrc = cpu__max_present_cpu(); nr = sysconf(_SC_NPROCESSORS_ONLN); if (nr < 0) @@ -511,9 +507,7 @@ static struct cpu_topo *build_cpu_topology(void) int ret = -1; struct cpu_map *map; - ncpus = sysconf(_SC_NPROCESSORS_CONF); - if (ncpus < 0) - return NULL; + ncpus = cpu__max_present_cpu(); /* build online CPU map */ map = cpu_map__new(NULL); @@ -1137,7 +1131,7 @@ static void print_cpu_topology(struct perf_header *ph, int fd __maybe_unused, { int nr, i; char *str; - int cpu_nr = ph->env.nr_cpus_online; + int cpu_nr = ph->env.
Re: [PATCH v2 2/3] perf: make build_cpu_topology skip offline/absent CPUs
- Original Message - > From: "Jiri Olsa" > To: "Jan Stancek" > Cc: linux-kernel@vger.kernel.org, pet...@infradead.org, mi...@redhat.com, > a...@kernel.org, "alexander shishkin" > , jo...@kernel.org, mhira...@kernel.org > Sent: Tuesday, 14 February, 2017 12:01:10 PM > Subject: Re: [PATCH v2 2/3] perf: make build_cpu_topology skip offline/absent > CPUs > > On Mon, Feb 13, 2017 at 04:34:35PM +0100, Jan Stancek wrote: > > SNIP > > > This patch makes build_cpu_topology() skip offline/absent CPUs, > > by checking their presence against cpu_map built from online CPUs. > > > > Signed-off-by: Jan Stancek > > --- > > tools/perf/util/header.c | 21 + > > 1 file changed, 17 insertions(+), 4 deletions(-) > > > > Changes in v2: > > - drop out label, use return NULL where possible > > > > diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c > > index d89c9c7ef4e5..4b0ea4e92e9d 100644 > > --- a/tools/perf/util/header.c > > +++ b/tools/perf/util/header.c > > @@ -503,24 +503,31 @@ static void free_cpu_topo(struct cpu_topo *tp) > > > > static struct cpu_topo *build_cpu_topology(void) > > { > > - struct cpu_topo *tp; > > + struct cpu_topo *tp = NULL; > > void *addr; > > u32 nr, i; > > size_t sz; > > long ncpus; > > - int ret = -1; > > + int ret = 0; > > hum, shoudn't we fail if we dont find any online cpu? > > jirka You're right, there's no reason to change this. I'll send v3.
[PATCH v2 1/3] perf: add cpu__max_present_cpu()
Similar to cpu__max_cpu() (which returns max possible CPU), returns max present CPU. Signed-off-by: Jan Stancek --- tools/perf/util/cpumap.c | 22 ++ tools/perf/util/cpumap.h | 1 + 2 files changed, 23 insertions(+) diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 2c0b52264a46..8c7504939113 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -9,6 +9,7 @@ #include "asm/bug.h" static int max_cpu_num; +static int max_present_cpu_num; static int max_node_num; static int *cpunode_map; @@ -442,6 +443,7 @@ static void set_max_cpu_num(void) /* set up default */ max_cpu_num = 4096; + max_present_cpu_num = 4096; mnt = sysfs__mountpoint(); if (!mnt) @@ -455,6 +457,17 @@ static void set_max_cpu_num(void) } ret = get_max_num(path, &max_cpu_num); + if (ret) + goto out; + + /* get the highest present cpu number for a sparse allocation */ + ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt); + if (ret == PATH_MAX) { + pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX); + goto out; + } + + ret = get_max_num(path, &max_present_cpu_num); out: if (ret) @@ -505,6 +518,15 @@ int cpu__max_cpu(void) return max_cpu_num; } +int cpu__max_present_cpu(void) +{ + if (unlikely(!max_present_cpu_num)) + set_max_cpu_num(); + + return max_present_cpu_num; +} + + int cpu__get_node(int cpu) { if (unlikely(cpunode_map == NULL)) { diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 06bd689f5989..1a0549af8f5c 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -62,6 +62,7 @@ static inline bool cpu_map__empty(const struct cpu_map *map) int cpu__max_node(void); int cpu__max_cpu(void); +int cpu__max_present_cpu(void); int cpu__get_node(int cpu); int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res, -- 1.8.3.1
[PATCH v2 3/3] perf: replace _SC_NPROCESSORS_CONF with max_present_cpu in cpu_topology_map
There are 2 problems wrt. cpu_topology_map on systems with sparse CPUs: 1. offline/absent CPUs will have their socket_id and core_id set to -1 which triggers: "socket_id number is too big.You may need to upgrade the perf tool." 2. size of cpu_topology_map (perf_env.cpu[]) is allocated based on _SC_NPROCESSORS_CONF, but can be indexed with CPU ids going above. Users of perf_env.cpu[] are using CPU id as index. This can lead to read beyond what was allocated: ==19991== Invalid read of size 4 ==19991==at 0x490CEB: check_cpu_topology (topology.c:69) ==19991==by 0x490CEB: test_session_topology (topology.c:106) ... For example: _SC_NPROCESSORS_CONF == 16 available: 2 nodes (0-1) node 0 cpus: 0 6 8 10 16 22 24 26 node 0 size: 12004 MB node 0 free: 9470 MB node 1 cpus: 1 7 9 11 23 25 27 node 1 size: 12093 MB node 1 free: 9406 MB node distances: node 0 1 0: 10 20 1: 20 10 This patch changes HEADER_NRCPUS.nr_cpus_available from _SC_NPROCESSORS_CONF to max_present_cpu and updates any user of cpu_topology_map to iterate with nr_cpus_avail. As consequence HEADER_CPU_TOPOLOGY core_id and socket_id lists get longer, but maintain compatibility with pre-patch state - index to cpu_topology_map is CPU id. 
./perf test 36 -v 36: Session topology : --- start --- test child forked, pid 22211 templ file: /tmp/perf-test-gmdX5i CPU 0, core 0, socket 0 CPU 1, core 0, socket 1 CPU 6, core 10, socket 0 CPU 7, core 10, socket 1 CPU 8, core 1, socket 0 CPU 9, core 1, socket 1 CPU 10, core 9, socket 0 CPU 11, core 9, socket 1 CPU 16, core 0, socket 0 CPU 22, core 10, socket 0 CPU 23, core 10, socket 1 CPU 24, core 1, socket 0 CPU 25, core 1, socket 1 CPU 26, core 9, socket 0 CPU 27, core 9, socket 1 test child finished with 0 end Session topology: Ok Signed-off-by: Jan Stancek --- tools/perf/builtin-stat.c | 2 +- tools/perf/tests/topology.c | 4 +++- tools/perf/util/env.c | 2 +- tools/perf/util/header.c| 16 +--- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a02f2e965628..9e2be1c52fbd 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1765,7 +1765,7 @@ static inline int perf_env__get_cpu(struct perf_env *env, struct cpu_map *map, i cpu = map->map[idx]; - if (cpu >= env->nr_cpus_online) + if (cpu >= env->nr_cpus_avail) return -1; return cpu; diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 98fe69ac553c..803f893550d6 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -65,7 +65,9 @@ static int check_cpu_topology(char *path, struct cpu_map *map) session = perf_session__new(&file, false, NULL); TEST_ASSERT_VAL("can't get session", session); - for (i = 0; i < session->header.env.nr_cpus_online; i++) { + for (i = 0; i < session->header.env.nr_cpus_avail; i++) { + if (!cpu_map__has(map, i)) + continue; pr_debug("CPU %d, core %d, socket %d\n", i, session->header.env.cpu[i].core_id, session->header.env.cpu[i].socket_id); diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index bb964e86b09d..075fc77286bf 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -66,7 +66,7 @@ int perf_env__read_cpu_topology_map(struct 
perf_env *env) return 0; if (env->nr_cpus_avail == 0) - env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF); + env->nr_cpus_avail = cpu__max_present_cpu(); nr_cpus = env->nr_cpus_avail; if (nr_cpus == -1) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 4b0ea4e92e9d..e9384d17ae7d 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -293,11 +293,7 @@ static int write_nrcpus(int fd, struct perf_header *h __maybe_unused, u32 nrc, nra; int ret; - nr = sysconf(_SC_NPROCESSORS_CONF); - if (nr < 0) - return -1; - - nrc = (u32)(nr & UINT_MAX); + nrc = cpu__max_present_cpu(); nr = sysconf(_SC_NPROCESSORS_ONLN); if (nr < 0) @@ -511,9 +507,7 @@ static struct cpu_topo *build_cpu_topology(void) int ret = 0; struct cpu_map *map; - ncpus = sysconf(_SC_NPROCESSORS_CONF); - if (ncpus < 0) - return NULL; + ncpus = cpu__max_present_cpu(); /* build online CPU map */ map = cpu_map__new(NULL); @@ -1137,7 +1131,7 @@ static void print_cpu_topology(struct perf_header *ph, int fd __maybe_unused, { int nr, i; char *str; - int cpu_nr = ph->env.nr_cpus_online; + int cpu_nr = ph->env.
[PATCH v2 2/3] perf: make build_cpu_topology skip offline/absent CPUs
When build_cpu_topo() encounters offline/absent CPUs, it fails to find any sysfs entries and returns failure. This leads to build_cpu_topology() and write_cpu_topology() failing as well. Because HEADER_CPU_TOPOLOGY has not been written, read leaves cpu_topology_map NULL and we get NULL ptr deref at: ... cmd_test __cmd_test test_and_print run_test test_session_topology check_cpu_topology 36: Session topology : --- start --- test child forked, pid 14902 templ file: /tmp/perf-test-4CKocW failed to write feature HEADER_CPU_TOPOLOGY perf: Segmentation fault Obtained 9 stack frames. ./perf(sighandler_dump_stack+0x41) [0x5095f1] /lib64/libc.so.6(+0x35250) [0x7f4b7c3c9250] ./perf(test_session_topology+0x1db) [0x490ceb] ./perf() [0x475b68] ./perf(cmd_test+0x5b9) [0x4763c9] ./perf() [0x4945a3] ./perf(main+0x69f) [0x427e8f] /lib64/libc.so.6(__libc_start_main+0xf5) [0x7f4b7c3b5b35] ./perf() [0x427fb9] test child interrupted end Session topology: FAILED! This patch makes build_cpu_topology() skip offline/absent CPUs, by checking their presence against cpu_map built from online CPUs. 
Signed-off-by: Jan Stancek --- tools/perf/util/header.c | 21 + 1 file changed, 17 insertions(+), 4 deletions(-) Changes in v2: - drop out label, use return NULL where possible diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index d89c9c7ef4e5..4b0ea4e92e9d 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -503,24 +503,31 @@ static void free_cpu_topo(struct cpu_topo *tp) static struct cpu_topo *build_cpu_topology(void) { - struct cpu_topo *tp; + struct cpu_topo *tp = NULL; void *addr; u32 nr, i; size_t sz; long ncpus; - int ret = -1; + int ret = 0; + struct cpu_map *map; ncpus = sysconf(_SC_NPROCESSORS_CONF); if (ncpus < 0) return NULL; + /* build online CPU map */ + map = cpu_map__new(NULL); + if (map == NULL) { + pr_debug("failed to get system cpumap\n"); + return NULL; + } + nr = (u32)(ncpus & UINT_MAX); sz = nr * sizeof(char *); - addr = calloc(1, sizeof(*tp) + 2 * sz); if (!addr) - return NULL; + goto out_free; tp = addr; tp->cpu_nr = nr; @@ -530,10 +537,16 @@ static struct cpu_topo *build_cpu_topology(void) tp->thread_siblings = addr; for (i = 0; i < nr; i++) { + if (!cpu_map__has(map, i)) + continue; + ret = build_cpu_topo(tp, i); if (ret < 0) break; } + +out_free: + cpu_map__put(map); if (ret) { free_cpu_topo(tp); tp = NULL; -- 1.8.3.1
Re: [PATCH] perf: fix topology test on systems with sparse CPUs
> > > When build_cpu_topo() encounters offline/absent CPUs, > > it fails to find any sysfs entries and returns failure. > > This leads to build_cpu_topology() and write_cpu_topology() > > failing as well. > > > > Because HEADER_CPU_TOPOLOGY has not been written, read leaves > > cpu_topology_map NULL and we get NULL ptr deref at: > > > > ... > > cmd_test > >__cmd_test > > test_and_print > > run_test > > test_session_topology > >check_cpu_topology > > So IIUIC that's the key issue here.. write_cpu_topology that fails > to write the TOPO data and following readers crashing on processing > uncomplete data? if thats the case write_cpu_topology needs to > be fixed, instead of doing workarounds It's already late when you are in write_cpu_topology(), because build_cpu_topology() returned you NULL - there's nothing to write. That's why patch aims to fix this in build_cpu_topology(). > > SNIP > > > u32 nr, i; > > size_t sz; > > long ncpus; > > - int ret = -1; > > + int ret = 0; > > + struct cpu_map *map; > > > > ncpus = sysconf(_SC_NPROCESSORS_CONF); > > if (ncpus < 0) > > - return NULL; > > + goto out; > > can just return NULL > > > + > > + /* build online CPU map */ > > + map = cpu_map__new(NULL); > > + if (map == NULL) { > > + pr_debug("failed to get system cpumap\n"); > > + goto out; > > + } > > > > nr = (u32)(ncpus & UINT_MAX); > > > > sz = nr * sizeof(char *); > > - > > addr = calloc(1, sizeof(*tp) + 2 * sz); > > if (!addr) > > - return NULL; > > + goto out_free; > > > > tp = addr; > > tp->cpu_nr = nr; > > @@ -530,14 +537,21 @@ static struct cpu_topo *build_cpu_topology(void) > > tp->thread_siblings = addr; > > > > for (i = 0; i < nr; i++) { > > + if (!cpu_map__has(map, i)) > > + continue; > > + > > so this prevents build_cpu_topo to fail due to missing topology > info because cpu is offline.. can it fail for other reasons? It's unlikely, though I suppose if you couldn't open and read something from sysfs (say sysfs is not mounted) it can fail for online CPU too. 
> > > > ret = build_cpu_topo(tp, i); > > if (ret < 0) > > break; > SNIP > For example: > _SC_NPROCESSORS_CONF == 16 > available: 2 nodes (0-1) > node 0 cpus: 0 6 8 10 16 22 24 26 > node 0 size: 12004 MB > node 0 free: 9470 MB > node 1 cpus: 1 7 9 11 23 25 27 > node 1 size: 12093 MB > node 1 free: 9406 MB > node distances: > node 0 1 > 0: 10 20 > 1: 20 10 > so what's max_present_cpu in this example? It's 28, which is the number of core_id/socket_id entries, for CPUs 0 up to 27. Regards, Jan
Re: [PATCH] perf: fix topology test on systems with sparse CPUs
On 01/30/2017 07:49 PM, Jiri Olsa wrote: > so basically we're changing from avail to online cpus > > have you checked all the users of this FEATURE > if such change is ok? Jiri, It wasn't OK as there are other users who index cpu_topology_map by CPU id. I decided to give the alternative a try (attached): keep cpu_topology_map indexed by CPU id, but extend it to fit max present CPU. So for a system like this one: _SC_NPROCESSORS_CONF == 16 available: 2 nodes (0-1) node 0 cpus: 0 6 8 10 16 22 24 26 node 0 size: 12004 MB node 0 free: 9470 MB node 1 cpus: 1 7 9 11 23 25 27 node 1 size: 12093 MB node 1 free: 9406 MB node distances: node 0 1 0: 10 20 1: 20 10 HEADER_NRCPUS before: nr_cpus_online = 15 nr_cpus_available = 16 HEADER_CPU_TOPOLOGY before: core_sib_nr: 2 core_siblings: 0,6,8,10,16,22,24,26 core_siblings: 1,7,9,11,23,25,27 thread_sib_nr: 8 thread_siblings: 0,16 thread_siblings: 1 thread_siblings: 6,22 thread_siblings: 7,23 thread_siblings: 8,24 thread_siblings: 9,25 thread_siblings: 10,26 thread_siblings: 11,27 core_id: 0, socket_id: 0 core_id: 0, socket_id: 1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: 10, socket_id: 0 core_id: 10, socket_id: 1 core_id: 1, socket_id: 0 core_id: 1, socket_id: 1 core_id: 9, socket_id: 0 core_id: 9, socket_id: 1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 HEADER_NRCPUS after: nr_cpus_online = 15 nr_cpus_available = 28 HEADER_CPU_TOPOLOGY after: core_sib_nr: 2 core_siblings: 0,6,8,10,16,22,24,26 core_siblings: 1,7,9,11,23,25,27 thread_sib_nr: 8 thread_siblings: 0,16 thread_siblings: 1 thread_siblings: 6,22 thread_siblings: 7,23 thread_siblings: 8,24 thread_siblings: 9,25 thread_siblings: 10,26 thread_siblings: 11,27 core_id: 0, socket_id: 0 core_id: 0, socket_id: 1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: 10, socket_id: 0 
core_id: 10, socket_id: 1 core_id: 1, socket_id: 0 core_id: 1, socket_id: 1 core_id: 9, socket_id: 0 core_id: 9, socket_id: 1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: 0, socket_id: 0 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: -1, socket_id: -1 core_id: 10, socket_id: 0 core_id: 10, socket_id: 1 core_id: 1, socket_id: 0 core_id: 1, socket_id: 1 core_id: 9, socket_id: 0 core_id: 9, socket_id: 1 Regards, Jan >From 9bf8ece1e397b851beedaeceeb0cd07421ff6f43 Mon Sep 17 00:00:00 2001 Message-Id: <9bf8ece1e397b851beedaeceeb0cd07421ff6f43.1485877985.git.jstan...@redhat.com> From: Jan Stancek Date: Tue, 31 Jan 2017 14:41:46 +0100 Subject: [PATCH 1/3 v2] perf: add cpu__max_present_cpu() Similar to cpu__max_cpu() (which returns max possible CPU), returns max present CPU. Signed-off-by: Jan Stancek --- tools/perf/util/cpumap.c | 22 ++ tools/perf/util/cpumap.h | 1 + 2 files changed, 23 insertions(+) diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index 2c0b52264a46..8c7504939113 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -9,6 +9,7 @@ #include "asm/bug.h" static int max_cpu_num; +static int max_present_cpu_num; static int max_node_num; static int *cpunode_map; @@ -442,6 +443,7 @@ static void set_max_cpu_num(void) /* set up default */ max_cpu_num = 4096; + max_present_cpu_num = 4096; mnt = sysfs__mountpoint(); if (!mnt) @@ -455,6 +457,17 @@ static void set_max_cpu_num(void) } ret = get_max_num(path, &max_cpu_num); + if (ret) + goto out; + + /* get the highest present cpu number for a sparse allocation */ + ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt); + if (ret == PATH_MAX) { + pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX); + goto out; + } + + ret = get_max_num(path, &max_present_cpu_num); out: if (ret) @@ -505,6 +518,15 @@ int cpu__max_cpu(void) return 
max_cpu_num; } +int cpu__max_present_cpu(void) +{ + if (unlikely(!max_present_cpu_num)) + set_max_cpu_num(); + + return max_present_cpu_num; +} + + int cpu__get_node(int cpu) { if (unlikely(cpunode_map == NULL)) { diff --git a/tools/perf/util/cpumap.h b/tools/perf/util/cpumap.h index 06bd689f5989..1a0549af8f5c 100644 --- a/tools/perf/util/cpumap.h +++ b/tools/perf/util/cpumap.h @@ -62,6 +62,7 @@ static inline bool cpu_map__empty(const struct cpu_map *map) int cpu__max_node(void); int cpu__max_cpu(void); +int cpu__max_present_cpu(void); int cpu__get_node(int cpu); int cpu_map__build_map(struct cpu_map *cpus, struct cpu_map **res, -- 1.8.3.1 >From 8f23543a314f359ee0625345bbc6d54b0e278358 Mon Sep 17 00:00:00 2001 Message-Id: <8f23543a314f3
Re: [PATCH] perf: fix topology test on systems with sparse CPUs
- Original Message - > From: "Jiri Olsa" > To: "Jan Stancek" > Cc: linux-kernel@vger.kernel.org, pet...@infradead.org, mi...@redhat.com, > a...@kernel.org, "alexander shishkin" > , jo...@kernel.org, mhira...@kernel.org, > "rui teng" > , suka...@linux.vnet.ibm.com > Sent: Monday, 30 January, 2017 7:49:08 PM > Subject: Re: [PATCH] perf: fix topology test on systems with sparse CPUs > > On Mon, Jan 30, 2017 at 05:53:34PM +0100, Jan Stancek wrote: > > SNIP > > > + ret = build_cpu_topo(tp, cpu); > > if (ret < 0) > > break; > > } > > + > > +out_free: > > + cpu_map__put(map); > > if (ret) { > > free_cpu_topo(tp); > > tp = NULL; > > } > > +out: > > return tp; > > } > > > > @@ -575,7 +579,7 @@ static int write_cpu_topology(int fd, struct > > perf_header *h __maybe_unused, > > if (ret < 0) > > goto done; > > > > - for (j = 0; j < perf_env.nr_cpus_avail; j++) { > > + for (j = 0; j < perf_env.nr_cpus_online; j++) { > > so basically we're changing from avail to online cpus > > have you checked all the users of this FEATURE > if such change is ok? You're right, I missed some. Looking again, I see at least perf_env__get_core() could break. Regards, Jan
[PATCH] perf: fix topology test on systems with sparse CPUs
Topology test fails on systems with sparse CPUs, e.g. CPU not present or offline: 36: Test topology in session : --- start --- test child forked, pid 23703 templ file: /tmp/perf-test-i2rNki failed to write feature 13 perf: Segmentation fault available: 2 nodes (0-1) node 0 cpus: 0 6 8 10 16 22 24 26 node 0 size: 11797 MB node 0 free: 10526 MB node 1 cpus: 1 7 9 11 17 23 25 27 node 1 size: 12065 MB node 1 free: 10770 MB node distances: node 0 1 0: 10 20 1: 20 10 Enumerating CPU ids from 0 to _SC_NPROCESSORS_CONF-1 in header.env.cpu[] doesn't work on system like one above, because some ids are higher than number of CPUs, and there can be gaps. On top of that, if CPU is offline, we can't get topology info from sysfs entries, because they don't exist for offline CPUs. This patch stores topology data only for online CPUs in header.env.cpu[] list, regardless of their CPU ids, and then uses cpu_map to translate index to actual CPU id. Example: coreid socketid for CPU0 coreid socketid for CPU1 coreid socketid for CPU6 coreid socketid for CPU7 ... Alternative is we go from 0 to highest CPU id, but CPUs which are missing would contain some dummy values in topology data. Example: coreid socketid for CPU0 coreid socketid for CPU1 -1 -1 -1 -1 -1 -1 -1 -1 coreid socketid for CPU6 coreid socketid for CPU7 ... 
Signed-off-by: Jan Stancek --- tools/perf/tests/topology.c | 7 --- tools/perf/util/env.c | 40 tools/perf/util/header.c| 36 3 files changed, 52 insertions(+), 31 deletions(-) diff --git a/tools/perf/tests/topology.c b/tools/perf/tests/topology.c index 98fe69ac553c..7b0b621ea8c0 100644 --- a/tools/perf/tests/topology.c +++ b/tools/perf/tests/topology.c @@ -66,17 +66,18 @@ static int check_cpu_topology(char *path, struct cpu_map *map) TEST_ASSERT_VAL("can't get session", session); for (i = 0; i < session->header.env.nr_cpus_online; i++) { - pr_debug("CPU %d, core %d, socket %d\n", i, + pr_debug("CPU %d, core %d, socket %d\n", map->map[i], session->header.env.cpu[i].core_id, session->header.env.cpu[i].socket_id); } for (i = 0; i < map->nr; i++) { + int cpu = map->map[i]; TEST_ASSERT_VAL("Core ID doesn't match", - (session->header.env.cpu[map->map[i]].core_id == (cpu_map__get_core(map, i, NULL) & 0x))); + (session->header.env.cpu[i].core_id == (cpu_map__get_core_id(cpu) & 0x))); TEST_ASSERT_VAL("Socket ID doesn't match", - (session->header.env.cpu[map->map[i]].socket_id == cpu_map__get_socket(map, i, NULL))); + (session->header.env.cpu[i].socket_id == cpu_map__get_socket_id(cpu))); } perf_session__delete(session); diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index bb964e86b09d..0c2cae807a61 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -60,29 +60,45 @@ int perf_env__set_cmdline(struct perf_env *env, int argc, const char *argv[]) int perf_env__read_cpu_topology_map(struct perf_env *env) { - int cpu, nr_cpus; + int cpu, nr_cpus, i, err = 0; + struct cpu_map *map; if (env->cpu != NULL) return 0; - if (env->nr_cpus_avail == 0) - env->nr_cpus_avail = sysconf(_SC_NPROCESSORS_CONF); + map = cpu_map__new(NULL); + if (map == NULL) { + pr_debug("failed to get system cpumap\n"); + err = -ENOMEM; + goto out; + } + + if (env->nr_cpus_online == 0) + env->nr_cpus_online = map->nr; - nr_cpus = env->nr_cpus_avail; - if (nr_cpus == -1) - return -EINVAL; 
+ nr_cpus = env->nr_cpus_online; + if (nr_cpus == -1 || map->nr < nr_cpus) { + err = -EINVAL; + goto out_free; + } env->cpu = calloc(nr_cpus, sizeof(env->cpu[0])); - if (env->cpu == NULL) - return -ENOMEM; + if (env->cpu == NULL) { + err = -ENOMEM; + goto out_free; + } - for (cpu = 0; cpu < nr_cpus; ++cpu) { - env->cpu[cpu].core_id = cpu_map__get_core_id(cpu); - env->cpu[cpu].socket_id = cpu_map__get_socket_id(cpu); + for (i = 0; i < nr_cpus; i++) { + cpu = map->map[i]; + env->cpu[i].core_id = cpu_map__get_core_id(cpu); + env->cpu[i].socket_id = cpu_map__get_socket_id(cpu); } env->nr_cpus_avail = nr_cpus; - return 0; +out_
unable to load modules with CONFIG_MODVERSIONS=y after commit 8ab2ae655b
Hi, Starting with 4.9-rc8 / commit 8ab2ae655b ("default exported asm symbols to zero") I'm running into an issue with a kernel built with CONFIG_MODVERSIONS=y and (older) binutils (binutils-2.25.1-20.base.el7.ppc64le). Modules fail to load, for example: [3.163646] Found checksum 0 vs module 4829A47E [3.163787] dm_mod: disagrees about version of symbol memcpy [3.163862] dm_mod: Unknown symbol memcpy (err -22) Bisect led me to 8ab2ae655b; reverting it allows boot to progress as before. Regards, Jan
Re: [PATCH 0/1] mm/hugetlb: fix huge page reservation leak in private mapping error paths
- Original Message - > From: "Mike Kravetz" > To: linux...@kvack.org, linux-kernel@vger.kernel.org > Cc: "Aneesh Kumar K . V" , "Naoya Horiguchi" > , "Michal > Hocko" , "Kirill A . Shutemov" > , "Hillf Danton" > , "Dave Hansen" , "Jan > Stancek" , "Mike > Kravetz" > Sent: Thursday, 20 October, 2016 5:11:16 AM > Subject: [PATCH 0/1] mm/hugetlb: fix huge page reservation leak in private > mapping error paths > > This issue was discovered by Jan Stancek as described in > https://lkml.kernel.org/r/57ff7bb4.1070...@redhat.com > > Error paths in hugetlb_cow() and hugetlb_no_page() do not properly clean > up reservation entries when freeing a newly allocated huge page. This > issue was introduced with commit 67961f9db8c4 ("mm/hugetlb: fix huge page > reserve accounting for private mappings). That commit uses the information > in private mapping reserve maps to determine if a reservation was already > consumed. This is important in the case of hole punch and truncate as the > pages are released, but reservation entries are not restored. > > This patch restores the reserve entries in hugetlb_cow and hugetlb_no_page > such that reserve entries are consistent with the global reservation count. > > The huge page reservation code is quite hard to follow, and this patch > makes it even more complex. One thought I had was to change the way > hole punch and truncate work so that private mapping pages are not thrown > away. This would eliminate the need for this patch as well as 67961f9db8c4. > It would change the existing semantics (as seen by the user) in this area, > but I believe the documentation (man pages) say the behavior is unspecified. > This could be a future change as well as rewriting the existing reservation > code to make it easier to understand/maintain. Thoughts? > > In any case, this patch addresses the immediate issue. Mike, Just to confirm, I ran this patch on my setup (without the patch from Aneesh) with libhugetlbfs testsuite in loop for several hours. 
There were no ENOMEM/OOM failures, and I did not observe a resv leak after it finished. Regards, Jan > > Mike Kravetz (1): > mm/hugetlb: fix huge page reservation leak in private mapping error > paths > > mm/hugetlb.c | 66 > > 1 file changed, 66 insertions(+) > > -- > 2.7.4 > >
Re: [bug/regression] libhugetlbfs testsuite failures and OOMs eventually kill my system
- Original Message - > Jan Stancek writes: > > Hi Mike, > > > > Revert of 67961f9db8c4 helps, I let whole suite run for 100 iterations, > > there were no issues. > > > > I cut down reproducer and removed last mmap/write/munmap as that is enough > > to reproduce the problem. Then I started introducing some traces into > > kernel > > and noticed that on ppc I get 3 faults, while on x86 I get only 2. > > > > Interesting is the 2nd fault, that is first write after mapping as PRIVATE. > > Following condition fails on ppc first time: > > if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { > > but it's immediately followed by fault that looks identical > > and in that one it evaluates as true. > > > > Same with alloc_huge_page(), on x86_64 it's called twice, on ppc three > > times. > > In 2nd call vma_needs_reservation() returns 0, in 3rd it returns 1. > > > > ppc -> 2nd and 3rd fault --- > > mmap(MAP_PRIVATE) > > hugetlb_fault address: 3e00, flags: 55 > > hugetlb_cow old_page: f10fc000 > > alloc_huge_page ret: f110 > > hugetlb_cow ptep: c00455b27cf8, pte_same: 0 > > free_huge_page page: f110, restore_reserve: 1 > > hugetlb_fault address: 3e00, flags: 55 > > hugetlb_cow old_page: f10fc000 > > alloc_huge_page ret: f110 > > hugetlb_cow ptep: c00455b27cf8, pte_same: 1 > > > > --- x86_64 -> 2nd fault --- > > mmap(MAP_PRIVATE) > > hugetlb_fault address: 7f71a420, flags: 55 > > hugetlb_cow address 0x7f71a420, old_page: ea0008d2 > > alloc_huge_page ret: ea0008d38000 > > hugetlb_cow ptep: 8802314c7908, pte_same: 1 > > > > Regards, > > Jan > > > > Can you check with the below patch. I ran the corrupt-by-cow-opt test with > this patch > and resv count got correctly updated. 
I am running libhugetlbfs suite with patch below in loop for ~2 hours now and I don't see any problems/ENOMEMs/OOMs or leaked resv pages: 0 hugepages-16384kB/free_hugepages 0 hugepages-16384kB/nr_hugepages 0 hugepages-16384kB/nr_hugepages_mempolicy 0 hugepages-16384kB/nr_overcommit_hugepages 0 hugepages-16384kB/resv_hugepages 0 hugepages-16384kB/surplus_hugepages 0 hugepages-16777216kB/free_hugepages 0 hugepages-16777216kB/nr_hugepages 0 hugepages-16777216kB/nr_hugepages_mempolicy 0 hugepages-16777216kB/nr_overcommit_hugepages 0 hugepages-16777216kB/resv_hugepages 0 hugepages-16777216kB/surplus_hugepages Regards, Jan > > commit fb2e0c081d2922c8aaa49bbe166472aac68ef5e1 > Author: Aneesh Kumar K.V > Date: Tue Oct 18 11:23:11 2016 +0530 > > mm/hugetlb: Use the right pte val for compare in hugetlb_cow > > We cannot use the pte value used in set_pte_at for pte_same comparison, > because archs like ppc64, filter/add new pte flag in set_pte_at. Instead > fetch the pte value inside hugetlb_cow. We are comparing pte value to > make sure the pte didn't change since we dropped the page table lock. > hugetlb_cow get called with page table lock held, and we can take a copy > of the pte value before we drop the page table lock. > > Signed-off-by: Aneesh Kumar K.V > > diff --git a/mm/hugetlb.c b/mm/hugetlb.c > index ec49d9ef1eef..da8fbd02b92e 100644 > --- a/mm/hugetlb.c > +++ b/mm/hugetlb.c > @@ -3386,15 +3386,17 @@ static void unmap_ref_private(struct mm_struct *mm, > struct vm_area_struct *vma, > * Keep the pte_same checks anyway to make transition from the mutex easier. 
> */ > static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, > - unsigned long address, pte_t *ptep, pte_t pte, > - struct page *pagecache_page, spinlock_t *ptl) > +unsigned long address, pte_t *ptep, > +struct page *pagecache_page, spinlock_t *ptl) > { > + pte_t pte; > struct hstate *h = hstate_vma(vma); > struct page *old_page, *new_page; > int ret = 0, outside_reserve = 0; > unsigned long mmun_start; /* For mmu_notifiers */ > unsigned long mmun_end; /* For mmu_notifiers */ > > + pte = huge_ptep_get(ptep); > old_page = pte_page(pte); > > retry_avoidcopy: > @@ -3668,7 +3670,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct > vm_area_struct *vma, > hugetlb_count_add(pages_per_huge_page(h), mm); > if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { > /* Optimization, do the COW witho
Re: [bug/regression] libhugetlbfs testsuite failures and OOMs eventually kill my system
- Original Message - > From: "Mike Kravetz" > To: "Jan Stancek" , linux...@kvack.org, > linux-kernel@vger.kernel.org > Cc: "hillf zj" , "dave hansen" > , "kirill shutemov" > , mho...@suse.cz, n-horigu...@ah.jp.nec.com, > "aneesh kumar" > , "iamjoonsoo kim" > Sent: Saturday, 15 October, 2016 1:57:31 AM > Subject: Re: [bug/regression] libhugetlbfs testsuite failures and OOMs > eventually kill my system > > > It is pretty consistent that we leak a reserve page every time this > test is run. > > The interesting thing is that corrupt-by-cow-opt is a very simple > test case. commit 67961f9db8c4 potentially changes the return value > of the functions vma_has_reserves() and vma_needs/commit_reservation() > for the owner (HPAGE_RESV_OWNER) of private mappings. running the > test with and without the commit results in the same return values for > these routines on x86. And, no leaked reserve pages. > > Is it possible to revert this commit and run the libhugetlbs tests > (func and stress) again while monitoring the counts in /sys? The > counts should go to zero after cleanup as you describe above. I just > want to make sure that this commit is causing all the problems you > are seeing. If it is, then we can consider reverting and I can try > to think of another way to address the original issue. > > Thanks for your efforts on this. I can not reproduce on x86 or sparc > and do not see any similar symptoms on these architectures. > > -- > Mike Kravetz > Hi Mike, Revert of 67961f9db8c4 helps, I let whole suite run for 100 iterations, there were no issues. I cut down reproducer and removed last mmap/write/munmap as that is enough to reproduce the problem. Then I started introducing some traces into kernel and noticed that on ppc I get 3 faults, while on x86 I get only 2. Interesting is the 2nd fault, that is first write after mapping as PRIVATE. 
Following condition fails on ppc first time: if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { but it's immediately followed by fault that looks identical and in that one it evaluates as true. Same with alloc_huge_page(), on x86_64 it's called twice, on ppc three times. In 2nd call vma_needs_reservation() returns 0, in 3rd it returns 1. ppc -> 2nd and 3rd fault --- mmap(MAP_PRIVATE) hugetlb_fault address: 3e00, flags: 55 hugetlb_cow old_page: f10fc000 alloc_huge_page ret: f110 hugetlb_cow ptep: c00455b27cf8, pte_same: 0 free_huge_page page: f110, restore_reserve: 1 hugetlb_fault address: 3e00, flags: 55 hugetlb_cow old_page: f10fc000 alloc_huge_page ret: f110 hugetlb_cow ptep: c00455b27cf8, pte_same: 1 --- x86_64 -> 2nd fault --- mmap(MAP_PRIVATE) hugetlb_fault address: 7f71a420, flags: 55 hugetlb_cow address 0x7f71a420, old_page: ea0008d2 alloc_huge_page ret: ea0008d38000 hugetlb_cow ptep: 8802314c7908, pte_same: 1 Regards, Jan
Re: [bug/regression] libhugetlbfs testsuite failures and OOMs eventually kill my system
On 10/14/2016 01:26 AM, Mike Kravetz wrote: > > Hi Jan, > > Any chance you can get the contents of /sys/kernel/mm/hugepages > before and after the first run of libhugetlbfs testsuite on Power? > Perhaps a script like: > > cd /sys/kernel/mm/hugepages > for f in hugepages-*/*; do > n=`cat $f`; > echo -e "$n\t$f"; > done > > Just want to make sure the numbers look as they should. > Hi Mike, Numbers are below. I have also isolated a single testcase from "func" group of tests: corrupt-by-cow-opt [1]. This test stops working if I run it 19 times (with 20 hugepages). And if I disable this test, "func" group tests can all pass repeatedly. [1] https://github.com/libhugetlbfs/libhugetlbfs/blob/master/tests/corrupt-by-cow-opt.c Regards, Jan Kernel is v4.8-14230-gb67be92, with reboot between each run. 1) Only func tests System boot After setup: 20 hugepages-16384kB/free_hugepages 20 hugepages-16384kB/nr_hugepages 20 hugepages-16384kB/nr_hugepages_mempolicy 0 hugepages-16384kB/nr_overcommit_hugepages 0 hugepages-16384kB/resv_hugepages 0 hugepages-16384kB/surplus_hugepages 0 hugepages-16777216kB/free_hugepages 0 hugepages-16777216kB/nr_hugepages 0 hugepages-16777216kB/nr_hugepages_mempolicy 0 hugepages-16777216kB/nr_overcommit_hugepages 0 hugepages-16777216kB/resv_hugepages 0 hugepages-16777216kB/surplus_hugepages After func tests: ** TEST SUMMARY * 16M * 32-bit 64-bit * Total testcases: 0 85 * Skipped: 0 0 *PASS: 0 81 *FAIL: 0 4 *Killed by signal: 0 0 * Bad configuration: 0 0 * Expected FAIL: 0 0 * Unexpected PASS: 0 0 * Strange test result: 0 0 26 hugepages-16384kB/free_hugepages 26 hugepages-16384kB/nr_hugepages 26 hugepages-16384kB/nr_hugepages_mempolicy 0 hugepages-16384kB/nr_overcommit_hugepages 1 hugepages-16384kB/resv_hugepages 0 hugepages-16384kB/surplus_hugepages 0 hugepages-16777216kB/free_hugepages 0 hugepages-16777216kB/nr_hugepages 0 hugepages-16777216kB/nr_hugepages_mempolicy 0 hugepages-16777216kB/nr_overcommit_hugepages 0 hugepages-16777216kB/resv_hugepages 0 
hugepages-16777216kB/surplus_hugepages After test cleanup: umount -a -t hugetlbfs hugeadm --pool-pages-max ${HPSIZE}:0 1 hugepages-16384kB/free_hugepages 1 hugepages-16384kB/nr_hugepages 1 hugepages-16384kB/nr_hugepages_mempolicy 0 hugepages-16384kB/nr_overcommit_hugepages 1 hugepages-16384kB/resv_hugepages 1 hugepages-16384kB/surplus_hugepages 0 hugepages-16777216kB/free_hugepages 0 hugepages-16777216kB/nr_hugepages 0 hugepages-16777216kB/nr_hugepages_mempolicy 0 hugepages-16777216kB/nr_overcommit_hugepages 0 hugepages-16777216kB/resv_hugepages 0 hugepages-16777216kB/surplus_hugepages --- 2) Only stress tests System boot After setup: 20 hugepages-16384kB/free_hugepages 20 hugepages-16384kB/nr_hugepages 20 hugepages-16384kB/nr_hugepages_mempolicy 0 hugepages-16384kB/nr_overcommit_hugepages 0 hugepages-16384kB/resv_hugepages 0 hugepages-16384kB/surplus_hugepages 0 hugepages-16777216kB/free_hugepages 0 hugepages-16777216kB/nr_hugepages 0 hugepages-16777216kB/nr_hugepages_mempolicy 0 hugepages-16777216kB/nr_overcommit_hugepages 0 hugepages-16777216kB/resv_hugepages 0 hugepages-16777216kB/surplus_hugepages After stress tests: 20 hugepages-16384kB/free_hugepages 20 hugepages-16384kB/nr_hugepages 20 hugepages-16384kB/nr_hugepages_mempolicy 0 hugepages-16384kB/nr_overcommit_hugepages 17 hugepages-16384kB/resv_hugepages 0 hugepages-16384kB/surplus_hugepages 0 hugepages-16777216kB/free_hugepages 0 hugepages-16777216kB/nr_hugepages 0 hugepages-16777216kB/nr_hugepages_mempolicy 0 hugepages-16777216kB/nr_overcommit_hugepages 0 hugepages-16777216kB/resv_hugepages 0 hugepages-16777216kB/surplus_hugepages After cleanup: 17 hugepages-16384kB/free_hugepages 17 hugepages-16384kB/nr_hugepages 17 hugepages-16384kB/nr_hugepages_mempolicy 0 hugepages-16384kB/nr_overcommit_hugepages 17 hugepages-16384kB/resv_hugepages 17 hugepages-16384kB/surplus_hugepages 0 hugepages-16777216kB/free_hugepages 0 hugepages-16777216kB/nr_hugepages 0 hugepages-16777216kB/nr_hugepages_mempolicy 0 
hugepages-16777216kB/nr_overcommit_hugepages 0 hugepages-16777216kB/resv_hugepages 0 hugepages-16777216kB/surplus_hugepages --- 3) only corrupt-by-cow-opt System boot After setup: 20 hugepages-16384kB/free_hugepages 20 hugepages-16384kB/nr_hugepages 20 hugepages-16384kB/nr_hugepages_mempolicy 0 hugepages-16384kB/nr_overcommit_hugepages 0 huge
[bug/regression] libhugetlbfs testsuite failures and OOMs eventually kill my system
Hi, I'm running into ENOMEM failures with libhugetlbfs testsuite [1] on a power8 lpar system running 4.8 or latest git [2]. Repeated runs of this suite trigger multiple OOMs, that eventually kill entire system, it usually takes 3-5 runs: * Total System Memory..: 18024 MB * Shared Mem Max Mapping...:320 MB * System Huge Page Size: 16 MB * Available Huge Pages.: 20 * Total size of Huge Pages.:320 MB * Remaining System Memory..: 17704 MB * Huge Page User Group.: hugepages (1001) I see this only on ppc (BE/LE), x86_64 seems unaffected and successfully ran the tests for ~12 hours. Bisect has identified following patch as culprit: commit 67961f9db8c477026ea20ce05761bde6f8bf85b0 Author: Mike Kravetz Date: Wed Jun 8 15:33:42 2016 -0700 mm/hugetlb: fix huge page reserve accounting for private mappings Following patch (made with my limited insight) applied to latest git [2] fixes the problem for me: diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ec49d9e..7261583 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1876,7 +1876,7 @@ static long __vma_reservation_common(struct hstate *h, * return value of this routine is the opposite of the * value returned from reserve map manipulation routines above. */ - if (ret) + if (ret >= 0) return 0; else return 1; Regards, Jan [1] https://github.com/libhugetlbfs/libhugetlbfs [2] v4.8-14230-gb67be92
Re: [PATCH] perf powerpc: Don't call perf_event_disable from atomic context
- Original Message - > From: "Michael Ellerman" > To: "Jiri Olsa" , "Peter Zijlstra" > Cc: "lkml" , "Ingo Molnar" , > "Michael Neuling" , > "Paul Mackerras" , "Alexander Shishkin" > , "Jan Stancek" > > Sent: Tuesday, 4 October, 2016 6:08:27 AM > Subject: Re: [PATCH] perf powerpc: Don't call perf_event_disable from atomic > context > > Jiri Olsa writes: > > > The trinity syscall fuzzer triggered following WARN on powerpc: > > WARNING: CPU: 9 PID: 2998 at arch/powerpc/kernel/hw_breakpoint.c:278 > > ... > > NIP [c093aedc] .hw_breakpoint_handler+0x28c/0x2b0 > > LR [c093aed8] .hw_breakpoint_handler+0x288/0x2b0 > > Call Trace: > > [c002f7933580] [c093aed8] .hw_breakpoint_handler+0x288/0x2b0 > > (unreliable) > > [c002f7933630] [c00f671c] .notifier_call_chain+0x7c/0xf0 > > [c002f79336d0] [c00f6abc] > > .__atomic_notifier_call_chain+0xbc/0x1c0 > > [c002f7933780] [c00f6c40] .notify_die+0x70/0xd0 > > [c002f7933820] [c001a74c] .do_break+0x4c/0x100 > > [c002f7933920] [c00089fc] handle_dabr_fault+0x14/0x48 > > Is that the full stack trace? It doesn't look like it. > > And were you running trinity as root or regular user? As regular user: # adduser dummy # su dummy /mnt/testarea/trinity --children $proc_num -m --syslog -q -T DIE Regards, Jan
[PATCH] crypto: testmgr - add guard to dst buffer for ahash_export
Add a guard to 'state' buffer and warn if its consistency after call to crypto_ahash_export() changes, so that any write that goes beyond advertised statesize (and thus causing potential memory corruption [1]) is more visible. [1] https://marc.info/?l=linux-crypto-vger&m=147467656516085 Signed-off-by: Jan Stancek Cc: Herbert Xu Cc: Marcelo Cerri --- crypto/testmgr.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 5c9d5a5e7b65..96343bcae01e 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -209,16 +209,19 @@ static int ahash_partial_update(struct ahash_request **preq, char *state; struct ahash_request *req; int statesize, ret = -EINVAL; + const char guard[] = { 0x00, 0xba, 0xad, 0x00 }; req = *preq; statesize = crypto_ahash_statesize( crypto_ahash_reqtfm(req)); - state = kmalloc(statesize, GFP_KERNEL); + state = kmalloc(statesize + sizeof(guard), GFP_KERNEL); if (!state) { pr_err("alt: hash: Failed to alloc state for %s\n", algo); goto out_nostate; } + memcpy(state + statesize, guard, sizeof(guard)); ret = crypto_ahash_export(req, state); + WARN_ON(memcmp(state + statesize, guard, sizeof(guard))); if (ret) { pr_err("alt: hash: Failed to export() for %s\n", algo); goto out; -- 1.8.3.1
Re: [bug] crypto/vmx/p8_ghash memory corruption in 4.8-rc7
> Jan, > > Can you check if the problem occurs with this patch? No issues in over-night test with this patch. > --- a/drivers/crypto/vmx/vmx.c > +++ b/drivers/crypto/vmx/vmx.c > @@ -28,6 +28,8 @@ > #include > #include > > +int p8_ghash_fallback_descsize(void); > + > extern struct shash_alg p8_ghash_alg; > extern struct crypto_alg p8_aes_alg; > extern struct crypto_alg p8_aes_cbc_alg; > @@ -45,6 +47,7 @@ int __init p8_init(void) > { > int ret = 0; > struct crypto_alg **alg_it; > + int ghash_descsize; > > for (alg_it = algs; *alg_it; alg_it++) { > ret = crypto_register_alg(*alg_it); > @@ -59,6 +62,12 @@ int __init p8_init(void) > if (ret) > return ret; > > + ghash_descsize = p8_ghash_fallback_descsize(); > + if (ghash_descsize < 0) { > + printk(KERN_ERR "Cannot get descsize for p8_ghash fallback\n"); > + return ghash_descsize; > + } > + p8_ghash_alg.descsize += ghash_descsize; > ret = crypto_register_shash(&p8_ghash_alg); > if (ret) { > for (alg_it = algs; *alg_it; alg_it++) I'd suggest to move this inside vmx/ghash.c to a new function, so all p8_ghash related code is at single place. Then p8_init() would just call the new function: -ret = crypto_register_shash(&p8_ghash_alg); +ret = register_p8_ghash(); Regards, Jan
Re: [bug] crypto/vmx/p8_ghash memory corruption in 4.8-rc7
- Original Message - > From: "Herbert Xu" > To: "Marcelo Cerri" > Cc: "Jan Stancek" , "rui y wang" , > mhce...@linux.vnet.ibm.com, > leosi...@linux.vnet.ibm.com, pfsmor...@linux.vnet.ibm.com, > linux-cry...@vger.kernel.org, > linuxppc-...@lists.ozlabs.org, linux-kernel@vger.kernel.org > Sent: Wednesday, 28 September, 2016 4:45:49 AM > Subject: Re: [bug] crypto/vmx/p8_ghash memory corruption in 4.8-rc7 > > On Tue, Sep 27, 2016 at 04:46:44PM -0300, Marcelo Cerri wrote: > > > > Can you check if the problem occurs with this patch? > > In light of the fact that padlock-sha is the correct example > to follow, you only need to add one line to the init_tfm fucntion > to update the descsize based on that of the fallback. Thanks for clearing up how this works in padlock-sha, but we are not exactly in same situation with p8_ghash. p8_ghash_init_tfm() already updates descsize. Problem in original report is that without custom export/import/statesize p8_ghash_alg.statesize gets initialized by shash_prepare_alg() to alg->descsize: crash> p p8_ghash_alg.statesize $1 = 56 testmgr allocates space for export based on crypto_shash_statesize(), but shash_default_export() writes based on crypto_shash_descsize(): [8.297902] state: c004b873aa80, statesize: 56 [8.297932] shash_default_export memcpy c004b873aa80 c004b8607da0, len: 76 so I think we need either: 1) make sure p8_ghash_alg.descsize is correct before we register shash, this is what Marcelo's last patch is doing 2) provide custom export/import/statesize for p8_ghash_alg Regards, Jan > > Thanks, > -- > Email: Herbert Xu > Home Page: http://gondor.apana.org.au/~herbert/ > PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt >
Re: [bug] crypto/vmx/p8_ghash memory corruption in 4.8-rc7
- Original Message - > From: "Herbert Xu" > To: "Marcelo Cerri" > Cc: "Jan Stancek" , "rui y wang" , > mhce...@linux.vnet.ibm.com, > leosi...@linux.vnet.ibm.com, pfsmor...@linux.vnet.ibm.com, > linux-cry...@vger.kernel.org, > linuxppc-...@lists.ozlabs.org, linux-kernel@vger.kernel.org > Sent: Tuesday, 27 September, 2016 5:08:26 AM > Subject: Re: [bug] crypto/vmx/p8_ghash memory corruption in 4.8-rc7 > > On Mon, Sep 26, 2016 at 02:43:17PM -0300, Marcelo Cerri wrote: > > > > Wouldn't be enough to provide a pair of import/export functions as the > > padlock-sha driver does? > > I don't think that will help as ultimately you need to call the > export function on the fallback and that's what requires the extra > memory. In fact very operation involving the fallback will need > that extra memory too. So, if we extended p8_ghash_desc_ctx to accommodate fallback_desc's ctx and then provided statesize/import/export, would that be acceptable? struct p8_ghash_desc_ctx { ... struct shash_desc fallback_desc; + char fallback_ctx[sizeof(struct ghash_desc_ctx)]; Also, does that mean that padlock_sha has similar problem? It does not seem to reserve any space for fallback __ctx and it calls init()/update()/export() with padlock_sha_desc's fallback: struct padlock_sha_desc { struct shash_desc fallback; }; static struct shash_alg sha1_alg = { .descsize = sizeof(struct padlock_sha_desc), Regards, Jan > > Cheers, > -- > Email: Herbert Xu > Home Page: http://gondor.apana.org.au/~herbert/ > PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt >
Re: [bug] crypto/vmx/p8_ghash memory corruption in 4.8-rc7
- Original Message - > From: "Marcelo Cerri" > To: "Jan Stancek" > Cc: "rui y wang" , herb...@gondor.apana.org.au, > mhce...@linux.vnet.ibm.com, > leosi...@linux.vnet.ibm.com, pfsmor...@linux.vnet.ibm.com, > linux-cry...@vger.kernel.org, > linuxppc-...@lists.ozlabs.org, linux-kernel@vger.kernel.org > Sent: Monday, 26 September, 2016 4:15:10 PM > Subject: Re: [bug] crypto/vmx/p8_ghash memory corruption in 4.8-rc7 > > Hi Jan, > > Just out of curiosity, have you tried to use "76" on both values to > check if the problem still happens? I did, I haven't seen any panics with such patch: @@ -211,7 +212,7 @@ struct shash_alg p8_ghash_alg = { .update = p8_ghash_update, .final = p8_ghash_final, .setkey = p8_ghash_setkey, - .descsize = sizeof(struct p8_ghash_desc_ctx), + .descsize = sizeof(struct p8_ghash_desc_ctx) + 20, .base = { .cra_name = "ghash", .cra_driver_name = "p8_ghash",
[bug] crypto/vmx/p8_ghash memory corruption in 4.8-rc7
Hi, I'm chasing a memory corruption with 4.8-rc7 as I'm observing random Oopses on ppc BE/LE systems (lpars, KVM guests). About 30% of issues is that module list gets corrupted, and "cat /proc/modules" or "lsmod" triggers an Oops, for example: [ 88.486041] Unable to handle kernel paging request for data at address 0x0020 ... [ 88.487658] NIP [c020f820] m_show+0xa0/0x240 [ 88.487689] LR [c020f834] m_show+0xb4/0x240 [ 88.487719] Call Trace: [ 88.487736] [c004b605bbb0] [c020f834] m_show+0xb4/0x240 (unreliable) [ 88.487796] [c004b605bc50] [c045e73c] seq_read+0x36c/0x520 [ 88.487843] [c004b605bcf0] [c04e1014] proc_reg_read+0x84/0x120 [ 88.487889] [c004b605bd30] [c040df88] vfs_read+0xf8/0x380 [ 88.487934] [c004b605bde0] [c040fd40] SyS_read+0x60/0x110 [ 88.487981] [c004b605be30] [c0009590] system_call+0x38/0xec 0x20 offset is module_use->source, module_use is NULL because module.source_list gets corrupted. The source of corruption appears to originate from a 'ahash' test for p8_ghash: cryptomgr_test alg_test alg_test_hash test_hash __test_hash ahash_partial_update shash_async_export memcpy With some extra traces [1], I'm seeing that ahash_partial_update() allocates 56 bytes for 'state', and then crypto_ahash_export() writes 76 bytes into it: [5.970887] __test_hash alg name p8_ghash, result: c4333ac0, key: c004b860a500, req: c004b860a380 [5.970963] state: c4333f00, statesize: 56 [5.970995] shash_default_export memcpy c4333f00 c004b860a3e0, len: 76 This seems to directly correspond with: p8_ghash_alg.descsize = sizeof(struct p8_ghash_desc_ctx) == 56 shash_tfm->descsize = sizeof(struct p8_ghash_desc_ctx) + crypto_shash_descsize(fallback) == 56 + 20 where 20 is presumably coming from "ghash_alg.descsize". My gut feeling was that these 2 should match, but I'd love to hear what crypto people think. 
Thank you, Jan [1] diff --git a/crypto/shash.c b/crypto/shash.c index a051541..49fe182 100644 --- a/crypto/shash.c +++ b/crypto/shash.c @@ -188,6 +188,8 @@ EXPORT_SYMBOL_GPL(crypto_shash_digest); static int shash_default_export(struct shash_desc *desc, void *out) { + int len = crypto_shash_descsize(desc->tfm); + printk("shash_default_export memcpy %p %p, len: %d\n", out, shash_desc_ctx(desc), len); memcpy(out, shash_desc_ctx(desc), crypto_shash_descsize(desc->tfm)); return 0; } diff --git a/crypto/testmgr.c b/crypto/testmgr.c index 5c9d5a5..2e54579 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -218,6 +218,8 @@ static int ahash_partial_update(struct ahash_request **preq, pr_err("alt: hash: Failed to alloc state for %s\n", algo); goto out_nostate; } + printk("state: %p, statesize: %d\n", state, statesize); + ret = crypto_ahash_export(req, state); if (ret) { pr_err("alt: hash: Failed to export() for %s\n", algo); @@ -288,6 +290,7 @@ static int __test_hash(struct crypto_ahash *tfm, struct hash_testvec *template, "%s\n", algo); goto out_noreq; } + printk("__test_hash alg name %s, result: %p, key: %p, req: %p\n", algo, result, key, req); ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, tcrypt_complete, &tresult);
Re: [bug] pwritev02 hang on s390x with 4.8.0-rc7
- Original Message - > From: "Al Viro" > To: "Jan Stancek" > Cc: linux-kernel@vger.kernel.org > Sent: Tuesday, 20 September, 2016 5:06:57 PM > Subject: Re: [bug] pwritev02 hang on s390x with 4.8.0-rc7 > > On Tue, Sep 20, 2016 at 02:56:06PM +0200, Jan Stancek wrote: > > Hi, > > > > I'm hitting a regression with LTP's pwritev02 [1] on s390x with 4.8.0-rc7. > > Specifically the EFAULT case, which is passing an iovec with invalid base > > address: > > #define CHUNK 64 > > static struct iovec wr_iovec3[] = { > > {(char *)-1, CHUNK}, > > }; > > hangs with 100% cpu usage and not very helpful stack trace: > > # cat /proc/28544/stack > > [<1000>] 0x1000 > > [] 0x > > > > The problem starts with d4690f1e1cda "fix iov_iter_fault_in_readable()". > > > > Before this commit fault_in_pages_readable() called __get_user() on start > > address which presumably failed with -EFAULT immediately. > > > > With this commit applied fault_in_multipages_readable() appears to return 0 > > for the case when "start" is invalid but "end" overflows. Then according to > > my traces we keep spinning inside while loop in iomap_write_actor(). > > Cute. Let me see if I understand what's going on there: we have a wraparound > that would've been caught by most of access_ok(), but not on an architectures > where access_ok() is a no-op; in that case the loop is skipped and we > just check the last address, which passes and we get a false positive. > Bug is real and it's definitely -stable fodder. > > I'm not sure that the fix you propose is right, though. Note that ERR_PTR() > is not a valid address on any architecture, so any wraparound automatically > means -EFAULT and we can simply check unlikely(uaddr > end) and bugger off > if it holds. while() turns into do-while(), of course, and the same is > needed for the read side. > > Could you check if the following works for you? This fixes pwritev02 hang. 
I ran all syscalls tests from LTP, and I see a change in behaviour of couple other tests (writev01, writev03 and writev04 [1]) in 4.8.0-rc7. These call writev() with partially invalid iovecs, and now fail with EFAULT, while with previous -rc6 kernel they returned number of bytes written before they encountered invalid iovec record. This should be reproducible also on x86. Regards, Jan [1] https://github.com/linux-test-project/ltp/tree/master/testcases/kernel/syscalls/writev > > diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h > index 66a1260..7e3d537 100644 > --- a/include/linux/pagemap.h > +++ b/include/linux/pagemap.h > @@ -571,56 +571,56 @@ static inline int fault_in_pages_readable(const char > __user *uaddr, int size) > */ > static inline int fault_in_multipages_writeable(char __user *uaddr, int > size) > { > - int ret = 0; > char __user *end = uaddr + size - 1; > > if (unlikely(size == 0)) > - return ret; > + return 0; > > + if (unlikely(uaddr > end)) > + return -EFAULT; > /* >* Writing zeroes into userspace here is OK, because we know that if >* the zero gets there, we'll be overwriting it. >*/ > - while (uaddr <= end) { > - ret = __put_user(0, uaddr); > - if (ret != 0) > - return ret; > + do { > + if (unlikely(__put_user(0, uaddr) != 0)) > + return -EFAULT; > uaddr += PAGE_SIZE; > - } > + } while (uaddr <= end); > > /* Check whether the range spilled into the next page. 
*/ > if (((unsigned long)uaddr & PAGE_MASK) == > ((unsigned long)end & PAGE_MASK)) > - ret = __put_user(0, end); > + return __put_user(0, end); > > - return ret; > + return 0; > } > > static inline int fault_in_multipages_readable(const char __user *uaddr, > int size) > { > volatile char c; > - int ret = 0; > const char __user *end = uaddr + size - 1; > > if (unlikely(size == 0)) > - return ret; > + return 0; > > - while (uaddr <= end) { > - ret = __get_user(c, uaddr); > - if (ret != 0) > - return ret; > + if (unlikely(uaddr > end)) > + return -EFAULT; > + > + do { > + if (unlikely(__get_user(c, uaddr) != 0)) > + return
[bug] pwritev02 hang on s390x with 4.8.0-rc7
Hi, I'm hitting a regression with LTP's pwritev02 [1] on s390x with 4.8.0-rc7. Specifically the EFAULT case, which is passing an iovec with invalid base address: #define CHUNK 64 static struct iovec wr_iovec3[] = { {(char *)-1, CHUNK}, }; hangs with 100% cpu usage and not very helpful stack trace: # cat /proc/28544/stack [<1000>] 0x1000 [] 0x The problem starts with d4690f1e1cda "fix iov_iter_fault_in_readable()". Before this commit fault_in_pages_readable() called __get_user() on start address which presumably failed with -EFAULT immediately. With this commit applied fault_in_multipages_readable() appears to return 0 for the case when "start" is invalid but "end" overflows. Then according to my traces we keep spinning inside while loop in iomap_write_actor(). Patch below makes the issue go away for me: diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 66a1260b33de..e443dbd2b5df 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -600,11 +600,11 @@ static inline int fault_in_multipages_readable(const char __user *uaddr, int size) { volatile char c; - int ret = 0; + int ret = -EFAULT; const char __user *end = uaddr + size - 1; if (unlikely(size == 0)) - return ret; + return 0; while (uaddr <= end) { ret = __get_user(c, uaddr); Regards, Jan [1] https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/syscalls/pwritev/pwritev02.c
[tip:perf/urgent] perf tests: objdump output can contain multi byte chunks
Commit-ID: b2d0dbf09772d091368261ce95db3afce45d994d Gitweb: http://git.kernel.org/tip/b2d0dbf09772d091368261ce95db3afce45d994d Author: Jan Stancek AuthorDate: Tue, 12 Jan 2016 11:07:44 +0100 Committer: Arnaldo Carvalho de Melo CommitDate: Tue, 2 Aug 2016 16:42:51 -0300 perf tests: objdump output can contain multi byte chunks objdump's raw insn output can vary across architectures on the number of bytes per chunk (bpc) displayed and their endianness. The code-reading test relied on reading objdump output as 1 bpc. Kaixu Xia reported test failure on ARM64, where objdump displays 4 bpc: 70c48:f90027bf strxzr, [x29,#72] 70c4c:91224000 addx0, x0, #0x890 70c50:f90023a0 strx0, [x29,#64] This patch adds support to read raw insn output for any bpc length. In case of 2+ bpc it also guesses objdump's display endian. Reported-and-Tested-by: Kaixu Xia Signed-off-by: Jan Stancek Acked-by: Adrian Hunter Cc: Corey Ashford Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/07f0f7bcbda78deb423298708ef9b6a54d6b92bd.1452592712.git.jstan...@redhat.com [ Fix up pr_fmt() call to use %zd for size_t variables, fixing the build on Ubuntu cross-compiling to armhf and ppc64 ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/code-reading.c | 100 1 file changed, 71 insertions(+), 29 deletions(-) diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 68a69a1..2af156a 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -33,44 +33,86 @@ static unsigned int hex(char c) return c - 'A' + 10; } -static size_t read_objdump_line(const char *line, size_t line_len, void *buf, - size_t len) +static size_t read_objdump_chunk(const char **line, unsigned char **buf, +size_t *buf_len) { - const char *p; - size_t i, j = 0; - - /* Skip to a colon */ - p = strchr(line, ':'); - if (!p) - return 0; - i = p + 1 - line; + size_t bytes_read = 0; + unsigned 
char *chunk_start = *buf; /* Read bytes */ - while (j < len) { + while (*buf_len > 0) { char c1, c2; - /* Skip spaces */ - for (; i < line_len; i++) { - if (!isspace(line[i])) - break; - } /* Get 2 hex digits */ - if (i >= line_len || !isxdigit(line[i])) + c1 = *(*line)++; + if (!isxdigit(c1)) break; - c1 = line[i++]; - if (i >= line_len || !isxdigit(line[i])) + c2 = *(*line)++; + if (!isxdigit(c2)) break; - c2 = line[i++]; - /* Followed by a space */ - if (i < line_len && line[i] && !isspace(line[i])) + + /* Store byte and advance buf */ + **buf = (hex(c1) << 4) | hex(c2); + (*buf)++; + (*buf_len)--; + bytes_read++; + + /* End of chunk? */ + if (isspace(**line)) break; - /* Store byte */ - *(unsigned char *)buf = (hex(c1) << 4) | hex(c2); - buf += 1; - j++; } + + /* +* objdump will display raw insn as LE if code endian +* is LE and bytes_per_chunk > 1. In that case reverse +* the chunk we just read. +* +* see disassemble_bytes() at binutils/objdump.c for details +* how objdump chooses display endian) +*/ + if (bytes_read > 1 && !bigendian()) { + unsigned char *chunk_end = chunk_start + bytes_read - 1; + unsigned char tmp; + + while (chunk_start < chunk_end) { + tmp = *chunk_start; + *chunk_start = *chunk_end; + *chunk_end = tmp; + chunk_start++; + chunk_end--; + } + } + + return bytes_read; +} + +static size_t read_objdump_line(const char *line, unsigned char *buf, + size_t buf_len) +{ + const char *p; + size_t ret, bytes_read = 0; + + /* Skip to a colon */ + p = strchr(line, ':'); + if (!p) + return 0; + p++; + + /* Skip initial spaces */ + while (*p) { + if (!isspace(*p)) + break; + p++; + } + + do { + ret = read_objdump_chun
[PATCH] crypto: qat - make qat_asym_algs.o depend on asn1 headers
Parallel build can sporadically fail because asn1 headers may not be built yet by the time qat_asym_algs.o is compiled: drivers/crypto/qat/qat_common/qat_asym_algs.c:55:32: fatal error: qat_rsapubkey-asn1.h: No such file or directory #include "qat_rsapubkey-asn1.h" Signed-off-by: Jan Stancek Cc: Tadeusz Struk Cc: Herbert Xu --- drivers/crypto/qat/qat_common/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/crypto/qat/qat_common/Makefile b/drivers/crypto/qat/qat_common/Makefile index 6d74b91f2152..5fc3dbb9ada0 100644 --- a/drivers/crypto/qat/qat_common/Makefile +++ b/drivers/crypto/qat/qat_common/Makefile @@ -2,6 +2,7 @@ $(obj)/qat_rsapubkey-asn1.o: $(obj)/qat_rsapubkey-asn1.c \ $(obj)/qat_rsapubkey-asn1.h $(obj)/qat_rsaprivkey-asn1.o: $(obj)/qat_rsaprivkey-asn1.c \ $(obj)/qat_rsaprivkey-asn1.h +$(obj)/qat_asym_algs.o: $(obj)/qat_rsapubkey-asn1.h $(obj)/qat_rsaprivkey-asn1.h clean-files += qat_rsapubkey-asn1.c qat_rsapubkey-asn1.h clean-files += qat_rsaprivkey-asn1.c qat_rsaprivkey-asn1.h -- 1.8.3.1
Re: [PATCH] mm/hugetlb: use EOPNOTSUPP in hugetlb sysctl handlers
- Original Message - > From: "Andrew Morton" > To: "Jan Stancek" > Cc: linux...@kvack.org, linux-kernel@vger.kernel.org, > n-horigu...@ah.jp.nec.com, "mike kravetz" > , "hillf zj" , "kirill > shutemov" > , "dave hansen" > , "paul gortmaker" > > Sent: Friday, 4 March, 2016 10:38:07 PM > Subject: Re: [PATCH] mm/hugetlb: use EOPNOTSUPP in hugetlb sysctl handlers > > On Thu, 3 Mar 2016 11:02:51 +0100 Jan Stancek wrote: > > > Replace ENOTSUPP with EOPNOTSUPP. If hugepages are not supported, > > this value is propagated to userspace. EOPNOTSUPP is part of uapi > > and is widely supported by libc libraries. > > hm, what is the actual user-visible effect of this change? Does it fix > some misbehaviour? > It gives nicer message to user, rather than: # cat /proc/sys/vm/nr_hugepages cat: /proc/sys/vm/nr_hugepages: Unknown error 524 And also LTP's proc01 test was failing because this ret code (524) was unexpected: proc01 1 TFAIL : proc01.c:396: read failed: /proc/sys/vm/nr_hugepages: errno=???(524): Unknown error 524 proc01 2 TFAIL : proc01.c:396: read failed: /proc/sys/vm/nr_hugepages_mempolicy: errno=???(524): Unknown error 524 proc01 3 TFAIL : proc01.c:396: read failed: /proc/sys/vm/nr_overcommit_hugepages: errno=???(524): Unknown error 524 Regards, Jan
[PATCH] mm/hugetlb: use EOPNOTSUPP in hugetlb sysctl handlers
Replace ENOTSUPP with EOPNOTSUPP. If hugepages are not supported, this value is propagated to userspace. EOPNOTSUPP is part of uapi and is widely supported by libc libraries. Cc: Andrew Morton Cc: Naoya Horiguchi Cc: Mike Kravetz Cc: Hillf Danton Cc: "Kirill A. Shutemov" Cc: Dave Hansen Cc: Paul Gortmaker Signed-off-by: Jan Stancek --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 01f2b48c8618..851a29928a99 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2751,7 +2751,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, int ret; if (!hugepages_supported()) - return -ENOTSUPP; + return -EOPNOTSUPP; table->data = &tmp; table->maxlen = sizeof(unsigned long); @@ -2792,7 +2792,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, int ret; if (!hugepages_supported()) - return -ENOTSUPP; + return -EOPNOTSUPP; tmp = h->nr_overcommit_huge_pages; -- 1.8.3.1
Re: [BUG] scheduler doesn't balance thread to idle cpu for 3 seconds
On 01/29/2016 11:33 AM, Jan Stancek wrote: >> >> Also note that I don't think failing this test is a bug per se. >> Undesirable maybe, but within spec, since SIGALRM is process wide, so it >> being delivered to the SCHED_OTHER task is accepted, and SCHED_OTHER has >> no timeliness guarantees. >> >> That said; if I could reliably reproduce I'd have a go at fixing this, I >> suspect there's a 'fun' problem at the bottom of this. > > Thanks for trying, I'll see if I can find some more reliable way. I think I have found a more reliable way, however it requires an older stable kernel: 3.12.53 up to 4.1.17. Consider following scenario: - all tasks on system have RT sched class - main thread of reproducer becomes the only SCHED_OTHER task on system - when alarm(2) expires, main thread is woken up on cpu that is occupied by busy looping RT thread (low_priority_thread) - because main thread was sleeping for 2 seconds, its load has decayed to 0 - the only chance for main thread to run is if it gets balanced to idle CPU - task_tick_fair() doesn't run, there is RT task running on this CPU - main thread is on cfs run queue but its load stays 0 - load balancer never sees this CPU (group) as busy Attached is reproducer and script, which tries to trigger scenario above. I can reproduce it with 4.1.17 on baremetal 4 CPU x86_64 with about 1:50 chance. In this setup failure state persists for a long time, perhaps indefinitely. I tried extending RUNTIME to 10 minutes, main thread still wouldn't run. One more clue: I could work around this issue if I forced an update_entity_load_avg() on sched_entities that have not been updated for some time, as part of periodic rebalance_domains() call.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c7c1d28..1b5fe80 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5264,6 +5264,7 @@ static void update_blocked_averages(int cpu) struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq; unsigned long flags; + struct rb_node *rb; raw_spin_lock_irqsave(&rq->lock, flags); update_rq_clock(rq); @@ -5281,6 +5282,19 @@ static void update_blocked_averages(int cpu) } raw_spin_unlock_irqrestore(&rq->lock, flags); + + cfs_rq = &(cpu_rq(cpu)->cfs); + for (rb = rb_first_postorder(&cfs_rq->tasks_timeline); rb; rb = rb_next_postorder(rb)) { + struct sched_entity *se = rb_entry(rb, struct sched_entity, run_node); + + // Task on rq has not been updated for 500ms :-( + if ((cfs_rq_clock_task(cfs_rq) - se->avg.last_runnable_update) > 500L * (1 << 20)) + update_entity_load_avg(se, 1); + } } /* Regards, Jan /* * reproducer v3 for: * [BUG] scheduler doesn't balance thread to idle cpu for 3 seconds * * Based on LTP's pthread_cond_wait_1.c * */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #define ERROR_PREFIX "unexpected error: " #define HIGH_PRIORITY 10 #define LOW_PRIORITY 5 #define RUNTIME 5 #define POLICY SCHED_RR #define PTS_PASS 0 #define PTS_FAIL 1 #define PTS_UNRESOLVED 2 pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; pthread_cond_t cond = PTHREAD_COND_INITIALIZER; /* Flags that the threads use to indicate events */ volatile int woken_up = 0; volatile int low_done = 0; /* Signal handler that handle the ALRM and wakes up * the high priority thread */ void signal_handler(int sig) { (void) sig; if (pthread_cond_signal(&cond) != 0) { printf(ERROR_PREFIX "pthread_cond_signal\n"); exit(PTS_UNRESOLVED); } } /* Utility function to find difference between two time values */ float timediff(struct timespec t2, struct timespec t1) { float diff = t2.tv_sec - t1.tv_sec; diff += (t2.tv_nsec - t1.tv_nsec) / 1000000000.0; return diff; } void *hi_priority_thread(void
*tmp) { struct sched_param param; int policy; int rc = 0; (void) tmp; param.sched_priority = HIGH_PRIORITY; rc = pthread_setschedparam(pthread_self(), POLICY, &param); if (rc != 0) { printf(ERROR_PREFIX "pthread_setschedparam\n"); exit(PTS_UNRESOLVED); } rc = pthread_getschedparam(pthread_self(), &policy, &param); if (rc != 0) { printf(ERROR_PREFIX "pthread_getschedparam\n"); exit(PTS_UNRESOLVED); } if ((policy != POLICY) || (param.sched_priority != HIGH_PRIORITY)) { printf("Error: the policy or priority not correct\n"); exit(PTS_UNRESOLVED); } /* Install a signal handler for ALRM */ if
Re: [BUG] scheduler doesn't balance thread to idle cpu for 3 seconds
- Original Message - > From: "Peter Zijlstra" > To: "Jan Stancek" > Cc: "alex shi" , "guz fnst" , > mi...@redhat.com, jo...@redhat.com, > r...@redhat.com, linux-kernel@vger.kernel.org > Sent: Friday, 29 January, 2016 11:15:22 AM > Subject: Re: [BUG] scheduler doesn't balance thread to idle cpu for 3 seconds > > On Thu, Jan 28, 2016 at 01:43:13PM -0500, Jan Stancek wrote: > > > How long should I have to wait for a fail? > > > > It's about 1000-2000 iterations for me, which I think you covered > > by now in those 2 hours. > > So I've been running: > > while ! ./pthread_cond_wait_1 ; do sleep 1; done > > overnight on the machine, and have yet to hit a wobbly -- that is, its > still running. I have seen similar result. Then, instead of turning CPUs off, I spawned more low prio threads to scale with number of CPUs on system: @@ -213,10 +213,14 @@ printf(ERROR_PREFIX "pthread_attr_setschedparam\n"); exit(PTS_UNRESOLVED); } - rc = pthread_create(&low_id, &low_attr, low_priority_thread, NULL); - if (rc != 0) { - printf(ERROR_PREFIX "pthread_create\n"); - exit(PTS_UNRESOLVED); + + int i, ncpus = sysconf(_SC_NPROCESSORS_ONLN); + for (i = 0; i < ncpus - 1; i++) { + rc = pthread_create(&low_id, &low_attr, low_priority_thread, NULL); + if (rc != 0) { + printf(ERROR_PREFIX "pthread_create\n"); + exit(PTS_UNRESOLVED); + } and let this ran on 3 bare metal x86 systems over night (v4.5-rc1). It failed on 2 systems (12 and 24 CPUs) with 1:1000 chance, it never failed on 3rd one (4 CPUs). > > Also note that I don't think failing this test is a bug per se. > Undesirable maybe, but within spec, since SIGALRM is process wide, so it > being delivered to the SCHED_OTHER task is accepted, and SCHED_OTHER has > no timeliness guarantees. > > That said; if I could reliably reproduce I'd have a go at fixing this, I > suspect there's a 'fun' problem at the bottom of this. Thanks for trying, I'll see if I can find some more reliable way. Regards, Jan
Re: [BUG] scheduler doesn't balance thread to idle cpu for 3 seconds
- Original Message - > From: "Peter Zijlstra" > To: "Jan Stancek" > Cc: "alex shi" , "guz fnst" , > mi...@redhat.com, jo...@redhat.com, > r...@redhat.com, linux-kernel@vger.kernel.org > Sent: Thursday, 28 January, 2016 6:49:03 PM > Subject: Re: [BUG] scheduler doesn't balance thread to idle cpu for 3 seconds > > On Thu, Jan 28, 2016 at 04:55:02PM +0100, Jan Stancek wrote: > > On 01/27/2016 03:52 PM, Jan Stancek wrote: > > > Hello, > > > > > > pthread_cond_wait_1/2 [1] is rarely failing for me on 4.5.0-rc1, > > > on x86_64 KVM guest with 2 CPUs. > > > > > > This test [1]: > > > - spawns 2 SCHED_RR threads > > > - first thread with higher priority sets alarm for 2 seconds and blocks > > > on condition > > > - second thread with lower priority is busy looping for 5 seconds > > > - after 2 seconds alarm signal arrives and handler signals condition > > > - high priority thread should resume running > > > > I have slightly modified testcase, so it will finish immediately when high > > prio > > thread is done. And also to allow it to compile outside of openposix > > testsuite. > > Yeah, I 'fixed' the testcase too. > > So I've had it run for almost 2 hours without a single fail. I've > hot-plugged my cpu count down to 2. I can try that too. I'm mostly seeing this on s390 and x86_64 KVM guests, both have 2 CPUs. Have you noticed if iteration times vary or if they stay consistently at ~2 seconds? > > How long should I have to wait for a fail? It's about 1000-2000 iterations for me, which I think you covered by now in those 2 hours. Regards, Jan
Re: [BUG] scheduler doesn't balance thread to idle cpu for 3 seconds
On 01/27/2016 03:52 PM, Jan Stancek wrote: > Hello, > > pthread_cond_wait_1/2 [1] is rarely failing for me on 4.5.0-rc1, > on x86_64 KVM guest with 2 CPUs. > > This test [1]: > - spawns 2 SCHED_RR threads > - first thread with higher priority sets alarm for 2 seconds and blocks on > condition > - second thread with lower priority is busy looping for 5 seconds > - after 2 seconds alarm signal arrives and handler signals condition > - high priority thread should resume running I have slightly modified testcase, so it will finish immediately when high prio thread is done. And also to allow it to compile outside of openposix testsuite. Testcase is attached. I'm running it in following way: gcc -O2 -pthread pthread_cond_wait_1.c while [ True ]; do time ./a.out sleep 1 done for couple thousand iterations. About half of those are on system booted with init=/bin/bash. > > But rarely I see that high priority thread doesn't resume running until > low priority thread completes its 5 second busy loop. > > Looking at traces (short version attached, long version at [2]), > I see that after 2 seconds scheduler tries to wake up main thread, but it > appears to do that on same CPU where SCHED_RR low prio thread is running, > so nothing happens. Then scheduler makes numerous balance attempts, > but main thread is not balanced to idle CPU. > > My guess is this started with following commit, which changed > weighted_cpuload(): > commit b92486cbf2aa230d00f160664858495c81d2b37b > Author: Alex Shi > Date: Thu Jun 20 10:18:50 2013 +0800 > sched: Compute runnable load avg in cpu_load and cpu_avg_load_per_task Here are some numbers gathered from kernels with HEAD at b92486c and previous commit 83dfd52. System is 2 CPU KVM guest. Each iteration measures how long it took for testcase to finish. Ideally it should take about 2 seconds. 1. 
HEAD at 83dfd52 sched: Update cpu load after task_tick finish time [s] | iterations -- [2, 2.2] | 3134 [ 2.2, 2.5] | 18 [ 2.5, 3] | 0 [3, 4] | 0 [4, 5] | 0 [5, 999] | 0 2. HEAD at b92486c sched: Compute runnable load avg in cpu_load and cpu_avg_load_per_task finish time [s] | iterations -- [2, 2.2] | 1617 [ 2.2, 2.5] | 38 [ 2.5, 3] |727 [3, 4] |399 [4, 5] | 17 [5, 999] | 11 Regards, Jan > > I could reproduce it with HEAD set at above commit, I couldn't reproduce it > with 3.10 kernel so far. > > Regards, > Jan > > [1] > https://github.com/linux-test-project/ltp/blob/master/testcases/open_posix_testsuite/functional/threads/condvar/pthread_cond_wait_1.c > [2] http://jan.stancek.eu/tmp/pthread_cond_wait_failure/sched-trace1.tar.bz2 > /* * Copyright (c) 2004, QUALCOMM Inc. All rights reserved. * Created by: abisain REMOVE-THIS AT qualcomm DOT com * This file is licensed under the GPL license. For the full content * of this license, see the COPYING file at the top level of this * source tree. * Test that pthread_cond_signal() * shall wakeup a high priority thread even when a low priority thread * is running * Steps: * 1. Create a condition variable * 2. Create a high priority thread and make it wait on the cond * 3. Create a low priority thread and let it busy-loop * 4. 
Signal the cond in a signal handler and check that high *priority thread got woken up * */ #include #include #include #include #include #include #define TEST "5-1" #define AREA "scheduler" #define ERROR_PREFIX "unexpected error: " AREA " " TEST ": " #define HIGH_PRIORITY 10 #define LOW_PRIORITY 5 #define RUNTIME 5 #define POLICY SCHED_RR #define PTS_PASS 0 #define PTS_FAIL 1 #define PTS_UNRESOLVED 2 /* mutex required by the cond variable */ pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; /* condition variable that threads block on*/ pthread_cond_t cond = PTHREAD_COND_INITIALIZER; /* Flags that the threads use to indicate events */ volatile int woken_up = 0; volatile int low_done = 0; /* Signal handler that handle the ALRM and wakes up * the high priority thread */ void signal_handler(int sig) { (void) sig; if (pthread_cond_signal(&cond) != 0) { printf(ERROR_PREFIX "pthread_cond_signal\n"); exit(PTS_UNRESOLVED); } } /* Utility function to find difference between two time values */ float timediff(struct timespec t2, struct timespec t1) { float diff = t2.tv_sec - t1.tv_sec; diff += (t2.tv_nsec - t1.tv_nsec) / 1000000000.0; return diff; } void *hi_priority_thread(void *tmp) {
[BUG] scheduler doesn't balance thread to idle cpu for 3 seconds
Hello, pthread_cond_wait_1/2 [1] is rarely failing for me on 4.5.0-rc1, on x86_64 KVM guest with 2 CPUs. This test [1]: - spawns 2 SCHED_RR threads - first thread with higher priority sets alarm for 2 seconds and blocks on condition - second thread with lower priority is busy looping for 5 seconds - after 2 seconds alarm signal arrives and handler signals condition - high priority thread should resume running But rarely I see that high priority thread doesn't resume running until low priority thread completes its 5 second busy loop. Looking at traces (short version attached, long version at [2]), I see that after 2 seconds scheduler tries to wake up main thread, but it appears to do that on same CPU where SCHED_RR low prio thread is running, so nothing happens. Then scheduler makes numerous balance attempts, but main thread is not balanced to idle CPU. My guess is this started with following commit, which changed weighted_cpuload(): commit b92486cbf2aa230d00f160664858495c81d2b37b Author: Alex Shi Date: Thu Jun 20 10:18:50 2013 +0800 sched: Compute runnable load avg in cpu_load and cpu_avg_load_per_task I could reproduce it with HEAD set at above commit, I couldn't reproduce it with 3.10 kernel so far. Regards, Jan [1] https://github.com/linux-test-project/ltp/blob/master/testcases/open_posix_testsuite/functional/threads/condvar/pthread_cond_wait_1.c [2] http://jan.stancek.eu/tmp/pthread_cond_wait_failure/sched-trace1.tar.bz2 # test starts running bash-5626 [001] 294.609021: sched_process_fork: comm=bash pid=5626 child_comm=bash child_pid=10005 bash-5626 [001] d... 294.609023: sched_migrate_task: comm=bash pid=10005 prio=120 orig_cpu=1 dest_cpu=0 bash-5626 [001] d... 294.609029: sched_wakeup_new: comm=bash pid=10005 prio=120 target_cpu=000 -0 [000] d... 294.609073: sched_switch: prev_comm=swapper/0 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=bash next_pid=10005 next_prio=120 condvar_pthread-10005 [000] d.h. 
294.609355: sched_stat_runtime: comm=bash pid=10005 runtime=330015 [ns] vruntime=937586860 [ns] condvar_pthread-10005 [000] d... 294.610222: sched_stat_runtime: comm=condvar_pthread pid=10005 runtime=82 [ns] vruntime=938453522 [ns] condvar_pthread-10005 [000] d... 294.610224: sched_migrate_task: comm=condvar_pthread pid=10005 prio=120 orig_cpu=0 dest_cpu=0 condvar_pthread-10005 [000] 294.610231: sched_process_fork: comm=condvar_pthread pid=10005 child_comm=condvar_pthread child_pid=10006 # high prio thread (10006) is started condvar_pthread-10005 [000] d... 294.610232: sched_migrate_task: comm=condvar_pthread pid=10006 prio=120 orig_cpu=0 dest_cpu=0 condvar_pthread-10005 [000] d... 294.610233: sched_stat_runtime: comm=condvar_pthread pid=10005 runtime=11929 [ns] vruntime=938465451 [ns] condvar_pthread-10005 [000] d... 294.610238: sched_wakeup_new: comm=condvar_pthread pid=10006 prio=120 target_cpu=000 condvar_pthread-10005 [000] d... 294.610267: sched_stat_runtime: comm=condvar_pthread pid=10005 runtime=34190 [ns] vruntime=938499641 [ns] condvar_pthread-10005 [000] d... 294.610268: sched_migrate_task: comm=condvar_pthread pid=10005 prio=120 orig_cpu=0 dest_cpu=0 condvar_pthread-10005 [000] 294.610270: sched_process_fork: comm=condvar_pthread pid=10005 child_comm=condvar_pthread child_pid=10007 # low prio thread (10007) is started condvar_pthread-10005 [000] d... 294.610271: sched_migrate_task: comm=condvar_pthread pid=10007 prio=120 orig_cpu=0 dest_cpu=1 condvar_pthread-10005 [000] d... 294.610277: sched_wakeup_new: comm=condvar_pthread pid=10007 prio=120 target_cpu=001 condvar_pthread-10005 [000] d... 294.610298: sched_stat_runtime: comm=condvar_pthread pid=10005 runtime=31353 [ns] vruntime=938530994 [ns] condvar_pthread-10005 [000] d... 294.610302: sched_switch: prev_comm=condvar_pthread prev_pid=10005 prev_prio=120 prev_state=S ==> next_comm=condvar_pthread next_pid=10006 next_prio=120 condvar_pthread-10006 [000] d... 
294.610311: sched_stat_runtime: comm=condvar_pthread pid=10006 runtime=12355 [ns] vruntime=944465872 [ns] -0 [001] d... 294.610318: sched_switch: prev_comm=swapper/1 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=condvar_pthread next_pid=10007 next_prio=120 condvar_pthread-10007 [001] d... 294.610332: sched_stat_runtime: comm=condvar_pthread pid=10007 runtime=59910 [ns] vruntime=581696119 [ns] condvar_pthread-10006 [000] d... 294.610340: sched_switch: prev_comm=condvar_pthread prev_pid=10006 prev_prio=89 prev_state=S ==> next_comm=swapper/0 next_pid=0 next_prio=120 condvar_pthread-10007 [001] d... 296.571389: sched_switch: prev_comm=condvar_pthread prev_pid=10007 prev_prio=94 prev_state=R ==> next_comm=watchdog/1 next_pid=13 next_prio=0 watchdog/1-13[001] d... 296.571393: sched_migrate_task: comm=condvar_pthread pid=10007 prio=94 orig_cpu=1 dest_cpu=0 -0
Re: [BUG] perf test 21("Test object code reading") failure on ARM64
On Sat, Dec 19, 2015 at 11:04:21AM +0800, xiakaixu wrote: > > >>>... > > > > Hi, > > > > What is your objdump version? > > Hi, > > Sorry for the late reply. > > # objdump --version > GNU objdump (GNU Binutils) 2.25. > > I am sure that the system is Little endian. > > I have attached a patch if you care to try it with your setup. If it still fails, output from -v and last objdump command output would be helpful. Regards, Jan diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index a767a6400c5c..faf726865fac 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -33,44 +33,83 @@ static unsigned int hex(char c) return c - 'A' + 10; } -static size_t read_objdump_line(const char *line, size_t line_len, void *buf, - size_t len) +static size_t read_objdump_chunk(const char **line, unsigned char **buf, +size_t *buf_len) +{ + size_t bytes_read = 0; + unsigned char *chunk_start = *buf; + + /* Read bytes */ + while (*buf_len > 0) { + char c1, c2; + + /* Get 2 hex digits */ + c1 = *(*line)++; + if (!isxdigit(c1)) + break; + c2 = *(*line)++; + if (!isxdigit(c2)) + break; + + /* Store byte and advance buf */ + **buf = (hex(c1) << 4) | hex(c2); + (*buf)++; + (*buf_len)--; + bytes_read++; + + /* End of chunk? */ + if (isspace(**line)) + break; + } + + /* +* objdump will display raw insn as LE if code endian +* is LE and bytes_per_chunk > 1. In that case reverse +* the chunk we just read. 
+*/ + if (bytes_read > 1 && !bigendian()) { + unsigned char *chunk_end = chunk_start + bytes_read - 1; + unsigned char tmp; + + while (chunk_start < chunk_end) { + tmp = *chunk_start; + *chunk_start = *chunk_end; + *chunk_end = tmp; + chunk_start++; + chunk_end--; + } + } + + return bytes_read; +} + +static size_t read_objdump_line(const char *line, unsigned char *buf, + size_t buf_len) { const char *p; - size_t i, j = 0; + size_t ret, bytes_read = 0; /* Skip to a colon */ p = strchr(line, ':'); if (!p) return 0; - i = p + 1 - line; + p++; - /* Read bytes */ - while (j < len) { - char c1, c2; - - /* Skip spaces */ - for (; i < line_len; i++) { - if (!isspace(line[i])) - break; - } - /* Get 2 hex digits */ - if (i >= line_len || !isxdigit(line[i])) - break; - c1 = line[i++]; - if (i >= line_len || !isxdigit(line[i])) - break; - c2 = line[i++]; - /* Followed by a space */ - if (i < line_len && line[i] && !isspace(line[i])) + /* Skip initial spaces */ + while (*p) { + if (!isspace(*p)) break; - /* Store byte */ - *(unsigned char *)buf = (hex(c1) << 4) | hex(c2); - buf += 1; - j++; + p++; } + + do { + ret = read_objdump_chunk(&p, &buf, &buf_len); + bytes_read += ret; + p++; + } while (ret > 0); + /* return number of successfully read bytes */ - return j; + return bytes_read; } static int read_objdump_output(FILE *f, void *buf, size_t *len, u64 start_addr) @@ -95,7 +134,7 @@ static int read_objdump_output(FILE *f, void *buf, size_t *len, u64 start_addr) } /* read objdump data into temporary buffer */ - read_bytes = read_objdump_line(line, ret, tmp, sizeof(tmp)); + read_bytes = read_objdump_line(line, tmp, sizeof(tmp)); if (!read_bytes) continue; @@ -152,7 +191,7 @@ static int read_via_objdump(const char *filename, u64 addr, void *buf, ret = read_objdump_output(f, buf, &len, addr); if (len) { - pr_debug("objdump read too few bytes\n"); + pr_debug("objdump read too few bytes: %lu\n", len); if (!ret) ret = len; }