[PATCH] selftests: use "$(MAKE)" instead of "make" for headers_install
If top make invocation uses -j4 or larger, this patch reduces "make headers_install" subtask run time from 30 to 7 seconds. CC: Shuah Khan CC: Shuah Khan CC: linux-kselft...@vger.kernel.org CC: linux-kernel@vger.kernel.org Signed-off-by: Denys Vlasenko --- tools/testing/selftests/lib.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 7a17ea815736..51124b962d56 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -47,9 +47,9 @@ ARCH ?= $(SUBARCH) khdr: ifndef KSFT_KHDR_INSTALL_DONE ifeq (1,$(DEFAULT_INSTALL_HDR_PATH)) - make --no-builtin-rules ARCH=$(ARCH) -C $(top_srcdir) headers_install + $(MAKE) --no-builtin-rules ARCH=$(ARCH) -C $(top_srcdir) headers_install else - make --no-builtin-rules INSTALL_HDR_PATH=$$OUTPUT/usr \ + $(MAKE) --no-builtin-rules INSTALL_HDR_PATH=$$OUTPUT/usr \ ARCH=$(ARCH) -C $(top_srcdir) headers_install endif endif -- 2.25.0
Re: CBL issue #431: lld: x86_64: sysfs: cannot create duplicate filename $module/.rodata.cst{16,32}
On 4/8/19 4:57 PM, Sedat Dilek wrote: We have arch/x86/crypto/chacha-avx2-x86_64.S and arch/x86/crypto/chacha-avx512vl-x86_64.S: .rodata.cst32.CTR2BL .rodata.cst32.CTR4BL .rodata.cst32.CTR2BL .rodata.cst32.CTR4BL ...and in arch/x86/crypto/sha256-avx2-asm.S and arch/x86/crypto/sha512-avx2-asm.S: .rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK Correct? You mean, we have duplicate section names. Well, this brings me to my initial response - "Not sure how exactly this causes the error". Duplicate section names are allowed by the linker. There is nothing wrong with that. The warnings you see come from some other tooling, which does not handle correctly object files with more than one section with the same name. (Having unique names is still preferable, it helps humans to more easily find where sections come from, and for potential future --gc-sections optimization).
Re: CBL issue #431: lld: x86_64: sysfs: cannot create duplicate filename $module/.rodata.cst{16,32}
On 4/8/19 4:34 PM, Sedat Dilek wrote: v2: sdi@iniza:~/src/linux-kernel/linux$ git --no-pager diff diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index a14af6eb09cb..712d6a7e8b8f 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -573,8 +573,12 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) vmovdqu y7, 15 * 16(rio); -/* NB: section is mergeable, all elements must be aligned 16-byte blocks */ -.section .rodata.cst16, "aM", @progbits, 16 +/* + * NB: section is mergeable, all elements must be aligned 16-byte blocks + * There is more than one object in this section, let's use module name + * instead of object name as unique suffix + */ +.section.rodata.cst16.camellia_aesni_avx_asm_64, "aM", @progbits, 16 .align 16 #define SHUFB_BYTES(idx) \ diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index b66bbfa62f50..34f6b0c4196d 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -626,8 +626,12 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 -/* NB: section is mergeable, all elements must be aligned 16-byte blocks */ -.section .rodata.cst16, "aM", @progbits, 16 +/* + * NB: section is mergeable, all elements must be aligned 16-byte blocks + * There is more than one object in this section, let's use module name + * instead of object name as unique suffix +*/ +.section.rodata.cst16.ccamellia_aesni_avx2_asm_64, "aM", @progbits, 16 why "ccamellia", not "camellia"? I tried to check for the .rodata.cst32 case, how do I identify the *.S files? ? Looks like all .rodata.cst32 sections have suffixes, nothing to fix.
Re: CBL issue #431: lld: x86_64: sysfs: cannot create duplicate filename $module/.rodata.cst{16,32}
On 4/8/19 4:23 PM, Sedat Dilek wrote: For the .rodata.cst16 part you mean sth. like this? yes, see below --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -573,8 +573,12 @@ ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) vmovdqu y7, 15 * 16(rio); -/* NB: section is mergeable, all elements must be aligned 16-byte blocks */ -.section .rodata.cst16, "aM", @progbits, 16 +/* + * NB: section is mergeable, all elements must be aligned 16-byte blocks + * There is more than one object in this section, let's use module name + * instead of object name as unique suffix + */ +.section.rodata.cst16.camellia-aesni-avx-asm_64, "aM", @progbits, 16 dashes in the name may cause problems, replace with '_'. .align 16 #define SHUFB_BYTES(idx) \ diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index b66bbfa62f50..d6ce36e82a93 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -626,8 +626,12 @@ ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080 -/* NB: section is mergeable, all elements must be aligned 16-byte blocks */ -.section .rodata.cst16, "aM", @progbits, 16 +/* + * NB: section is mergeable, all elements must be aligned 16-byte blocks + * There is more than one object in this section, let's use module name + * instead of object name as unique suffix +*/ +.section.rodata.cst16.ccamellia-aesni-avx2-asm_64, "aM", @progbits, 16 dashes in the name may cause problems, replace with '_'.
Re: CBL issue #431: lld: x86_64: sysfs: cannot create duplicate filename $module/.rodata.cst{16,32}
On 4/8/19 3:56 PM, Denys Vlasenko wrote: I propose to change section name, append _module_ name and optionally a comment why this is done: /* NB: section is mergeable, all elements must be aligned 16-byte blocks */ +/* There is more than one object in this section, let's use module name + as unique suffix */ Probably not clear enough... maybe this is better: +/* There is more than one object in this section, let's use module name + instead of object name as unique suffix */ -.section .rodata.cst16, "aM", @progbits, 16 and do not use '-' in the name, replace with '_': +.section .rodata.cst16.cast6_avx_x86_64_asm_64, "aM", @progbits, 16
Re: CBL issue #431: lld: x86_64: sysfs: cannot create duplicate filename $module/.rodata.cst{16,32}
On 4/8/19 3:36 PM, Sedat Dilek wrote: I fell over your commit "crypto: x86 - make constants readonly, allow linker to merge them" [1] while digging into ClangBuiltLinux issue 431 [0]. I see the following in my dmesg-log: $ grep sysfs: dmesg_5.0.4-rc1-1-amd64-cbl-asmgoto.txt [Fri Mar 22 10:32:09 2019] sysfs: cannot create duplicate filename '/module/usbcore/sections/.rodata.cst16' [Fri Mar 22 10:32:18 2019] sysfs: cannot create duplicate filename '/module/nfsd/sections/.rodata.cst32' [Fri Mar 22 10:32:18 2019] sysfs: cannot create duplicate filename '/module/iwlwifi/sections/.rodata.cst16' [Fri Mar 22 10:32:18 2019] sysfs: cannot create duplicate filename '/module/i915/sections/.rodata.cst32' [Fri Mar 22 10:32:18 2019] sysfs: cannot create duplicate filename '/module/mac80211/sections/.rodata.cst32' [Fri Mar 22 10:32:18 2019] sysfs: cannot create duplicate filename '/module/iwlmvm/sections/.rodata.cst16' [Fri Mar 22 10:32:20 2019] sysfs: cannot create duplicate filename '/module/bluetooth/sections/.rodata.cst16' Above modules have dependencies to stuff from arch/x86/crypto (see below P.S.). Not sure how exactly this causes the error, but the cause seems to be having more than one section with the same name. This occurs in only three files (grep for ".rodata.cst16," string): cast6-avx-x86_64-asm_64.S camellia-aesni-avx2-asm_64.S camellia-aesni-avx-asm_64.S /* NB: section is mergeable, all elements must be aligned 16-byte blocks */ .section.rodata.cst16, "aM", @progbits, 16 In other places I used .rodata.cst16.OBJECTNAME, but in these three cases there are more than one object in the section, so I left it w/o OBJECTNAME. 
I propose to change section name, append _module_ name and optionally a comment why this is done: /* NB: section is mergeable, all elements must be aligned 16-byte blocks */ +/* There is more than one object in this section, let's use module name + as unique suffix */ -.section.rodata.cst16, "aM", @progbits, 16 +.section.rodata.cst16.cast6-avx-x86_64-asm_64, "aM", @progbits, 16 Looks like LLD defaults to -ffunction-sections and -fdata-sections. Do you happen to know what the defaults are for BFD linker? linker does not create section names. Compiler does for .c files, for .S files they are made by a human.
Re: [PATCH 01/25] x86: Make SMAP 64-bit only
On 3/18/19 7:10 PM, Linus Torvalds wrote: On Mon, Mar 18, 2019 at 10:51 AM Peter Zijlstra wrote: How about I do a patch that schedules EFLAGS for both 32bit and 64bit, mark this for backporting to infinity. And then at the end, after the objtool-ac bits land, I do a patch removing the EFLAGS scheduling for x86_64. Sounds sane to me. And we can make it AC-conditional if it's actually shown to be visible from a performance standpoint. But iirc pushf/popf isn't really that expensive - in fact I think it's pretty cheap when system flags don't change. I did not see evidence of this. In my testing, POPF is always ~20 cycles, even if popped flags are identical to current state of flags.
Re: preempt.h: some SOFTIRQ_OFFSET should be SOFTIRQ_MASK?
On Wed, Feb 13, 2019 at 5:05 AM Frederic Weisbecker wrote: > On Tue, Feb 05, 2019 at 07:34:31PM +0100, Denys Vlasenko wrote: > > SOFTIRQ is a counter. > > Why here: > > > > #define in_serving_softirq()(softirq_count() & SOFTIRQ_OFFSET) > > #define in_task() (!(preempt_count() & \ > >(NMI_MASK | HARDIRQ_MASK | > > SOFTIRQ_OFFSET))) > > > > we check only lowest bit? > > So we have SOFTIRQ_OFFSET that is used when serving softirqs. > And we have SOFTIRQ_DISABLE_OFFSET that is used when we disable > softirqs. > > I think the choice is right on both tests above, or am I missing something? >From my reading of the following: * We put the hardirq and softirq counter into the preemption * counter. The bitmask has the following meaning: * - bits 0-7 are the preemption count (max preemption depth: 256) * - bits 8-15 are the softirq count (max # of softirqs: 256) * The hardirq count could in theory be the same as the number of * interrupts in the system, but we run all interrupt handlers with * interrupts disabled, so we cannot have nesting interrupts. Though * there are a few palaeontologic drivers which reenable interrupts in * the handler, so we need more than one bit here. * PREEMPT_MASK:0x00ff * SOFTIRQ_MASK:0xff00 * HARDIRQ_MASK:0x000f * NMI_MASK:0x0010 * PREEMPT_NEED_RESCHED:0x8000 */ #define PREEMPT_BITS8 #define SOFTIRQ_BITS8 #define HARDIRQ_BITS4 #define NMI_BITS1 it seems that 8-bit SOFTIRQ_MASK contains depth count of nested softirqs. Therefore, to test whether we are in softirq, you need: define in_serving_softirq()(softirq_count() & SOFTIRQ_MASK) But existing code uses SOFTIRQ_OFFSET, not SOFTIRQ_MASK, for the mask. This means that if nest count is e.g. 2, in_serving_softirq() will return "false".
preempt.h: some SOFTIRQ_OFFSET should be SOFTIRQ_MASK?
SOFTIRQ is a counter. Why here: #define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) #define in_task() (!(preempt_count() & \ (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) we check only lowest bit?
Re: [PATCH] x86: only use ERMS for user copies for larger sizes
On 11/21/2018 02:32 PM, Jens Axboe wrote: On 11/20/18 11:36 PM, Ingo Molnar wrote: * Jens Axboe wrote: So this is a fun one... While I was doing the aio polled work, I noticed that the submitting process spent a substantial amount of time copying data to/from userspace. For aio, that's iocb and io_event, which are 64 and 32 bytes respectively. Looking closer at this, and it seems that ERMS rep movsb is SLOWER for smaller copies, due to a higher startup cost. I came up with this hack to test it out, and low and behold, we now cut the time spent in copying in half. 50% less. Since these kinds of patches tend to lend themselves to bike shedding, I also ran a string of kernel compilations out of RAM. Results are as follows: Patched : 62.86s avg, stddev 0.65s Stock : 63.73s avg, stddev 0.67s which would also seem to indicate that we're faster punting smaller (< 128 byte) copies. CPU: Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz Interestingly, text size is smaller with the patch as well?! I'm sure there are smarter ways to do this, but results look fairly conclusive. FWIW, the behaviorial change was introduced by: commit 954e482bde20b0e208fd4d34ef26e10afd194600 Author: Fenghua Yu Date: Thu May 24 18:19:45 2012 -0700 x86/copy_user_generic: Optimize copy_user_generic with CPU erms feature which contains nothing in terms of benchmarking or results, just claims that the new hotness is better. Signed-off-by: Jens Axboe --- diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index a9d637bc301d..7dbb78827e64 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -29,16 +29,27 @@ copy_user_generic(void *to, const void *from, unsigned len) { unsigned ret; + /* +* For smaller copies, don't use ERMS as it's slower. 
+*/ + if (len < 128) { + alternative_call(copy_user_generic_unrolled, +copy_user_generic_string, X86_FEATURE_REP_GOOD, +ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), +"=d" (len)), +"1" (to), "2" (from), "3" (len) +: "memory", "rcx", "r8", "r9", "r10", "r11"); + return ret; + } + /* * If CPU has ERMS feature, use copy_user_enhanced_fast_string. * Otherwise, if CPU has rep_good feature, use copy_user_generic_string. * Otherwise, use copy_user_generic_unrolled. */ alternative_call_2(copy_user_generic_unrolled, -copy_user_generic_string, -X86_FEATURE_REP_GOOD, -copy_user_enhanced_fast_string, -X86_FEATURE_ERMS, +copy_user_generic_string, X86_FEATURE_REP_GOOD, +copy_user_enhanced_fast_string, X86_FEATURE_ERMS, ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), "=d" (len)), "1" (to), "2" (from), "3" (len) So I'm inclined to do something like yours, because clearly the changelog of 954e482bde20 was at least partly false: Intel can say whatever they want, it's a fact that ERMS has high setup costs for low buffer sizes - ERMS is optimized for large size, cache-aligned copies mainly. I'm actually surprised that something like that was accepted, I guess 2012 was a simpler time :-) But the result is counter-intuitive in terms of kernel text footprint, plus the '128' is pretty arbitrary - we should at least try to come up with a break-even point where manual copy is about as fast as ERMS - on at least a single CPU ... I did some more investigation yesterday, and found this: commit 236222d39347e0e486010f10c1493e83dbbdfba8 Author: Paolo Abeni Date: Thu Jun 29 15:55:58 2017 +0200 x86/uaccess: Optimize copy_user_enhanced_fast_string() for short strings which does attempt to rectify it, but only using ERMS for >= 64 byte copies. At least for me, looks like the break even point is higher than that, which would mean that something like the below would be more appropriate. I also tested this while working for string ops code in musl. 
I think at least 128 bytes would be the minimum where "REP insns" are more efficient. In my testing, it's more like 256 bytes...
Re: [tip:x86/asm] x86/entry/64: Add two more instruction suffixes
On 07/03/2018 10:46 AM, David Laight wrote: From: Jan Beulich Sent: 03 July 2018 09:36 ... As said there, omitting suffixes from instructions in AT&T mode is bad practice when operand size cannot be determined by the assembler from register operands, and is likely going to be warned about by upstream gas in the future (mine does already). ... - bt $9, EFLAGS(%rsp)/* interrupts off? */ + btl $9, EFLAGS(%rsp)/* interrupts off? */ Hmmm Does the operand size make any difference at all for the bit instructions? I'm pretty sure that the cpus (386 onwards) have always done aligned 32bit transfers (the docs never actually said aligned). I can't remember whether 64bit mode allows immediates above 31. Immediates up to 63 are allowed in 64 bit mode (IOW: for REX-prefixed form) (run-tested). Keep in mind that this instruction is "special" with register bit offset: Register/memory form (BT REG,[MEM]) does not limit or mask the value of bit offset in REG, the instruction uses bit REG%8 in byte at address [MEM+REG/8]. This works correctly even for negative values: REG = -1 will access the most significant bit in the byte immediately before MEM. Thus, for accesses of standard RAM locations (not memory-mapped IO and such), the "operand size" concept for this instruction (and BTC, BTR, BTS) does not make much sense: it accesses one bit. The width of actual memory access is irrelevant. I'd say assembler should just use the "natural" width for current mode (16 or 32-bit), and warn when code tries to use immediate operand which will be truncated and thus needs a wider operand size. Intel documentation says that immediate operand in BT IMM,[MEM] is truncated to operand size. 
My experiment seems to confirm it: 254:1 <- BT 254,[MEM] actually accesses bit #30, not #254 255:0 254:0 255:0 #include <string.h> #include <stdio.h> int main(int argc, char **argv, char **envp) { char buf[256]; int result; memset(buf, 0x55, sizeof(buf)); /* bit pattern: 01010101 */ buf[255/8] = 0; asm("\n" " bt %1, %2\n" " sbb %0, %0\n" : "=r" (result) : "i" (254), "m" (buf) ); printf("254:%x\n", !!result); asm("\n" " bt %1, %2\n" " sbb %0, %0\n" : "=r" (result) : "i" (255), "m" (buf) ); printf("255:%x\n", !!result); buf[255/8] = 0x55; buf[31/8] = 0; asm("\n" " bt %1, %2\n" " sbb %0, %0\n" : "=r" (result) : "i" (254), "m" (buf) ); printf("254:%x\n", !!result); asm("\n" " bt %1, %2\n" " sbb %0, %0\n" : "=r" (result) : "i" (255), "m" (buf) ); printf("255:%x\n", !!result); return 0; } When I use "r" instead of "i" to generate REG,[MEM] form, the instruction does access bit #254: 254:0 255:0 254:1 255:0
Re: [PATCH] x86/entry/64/compat: Preserve r8-r11 in int $0x80
On 04/18/2018 06:53 PM, Andy Lutomirski wrote: On Tue, Apr 17, 2018 at 8:00 AM, Denys Vlasenko wrote: This means that the new behavior is there for some 8 years already. Whoever was impacted by it, probably already switched to the new ABI. Current ABI is "weaker", it allows kernel to save fewer registers. Which is generally a good thing, since saving/restoring things cost cycles, and sometimes painful on entry paths where you may desperately need a scratch register or two. (Recall this one? - ... movq%rsp, PER_CPU_VAR(rsp_scratch) movqPER_CPU_VAR(cpu_current_top_of_stack), %rsp /* Construct struct pt_regs on stack */ pushq $__USER_DS /* pt_regs->ss */ pushq PER_CPU_VAR(rsp_scratch)/* pt_regs->sp */ ... wouldn't it be _great_ if one of GPRs would be available here to hold userspace %rsp? ) But this is the int $0x80 entry, which uses the stack sanely and doesn't have this problem at all. It was a general point why not committing to save every register may help on the callee (in this case kernel) side.
Re: [PATCH] x86/entry/64/compat: Preserve r8-r11 in int $0x80
On 04/17/2018 04:36 PM, Andy Lutomirski wrote: 32-bit user code that uses int $80 doesn't care about r8-r11. There is, however, some 64-bit user code that intentionally uses int $0x80 to invoke 32-bit system calls. From what I've seen, basically all such code assumes that r8-r15 are all preserved, but the kernel clobbers r8-r11. Since I doubt that there's any code that depends on int $0x80 zeroing r8-r11, change the kernel to preserve them. I suspect that very little user code is broken by the old clobber, since r8-r11 are only rarely allocated by gcc, and they're clobbered by function calls, so they only way we'd see a problem is if the same function that invokes int $0x80 also spills something important to one of these registers. The current behavior seems to date back to the historical commit "[PATCH] x86-64 merge for 2.6.4". Before that, all regs were preserved. I can't find any explanation of why this change was made. This means that the new behavior is there for some 8 years already. Whoever was impacted by it, probably already switched to the new ABI. Current ABI is "weaker", it allows kernel to save fewer registers. Which is generally a good thing, since saving/restoring things cost cycles, and sometimes painful on entry paths where you may desperately need a scratch register or two. (Recall this one? - ... movq%rsp, PER_CPU_VAR(rsp_scratch) movqPER_CPU_VAR(cpu_current_top_of_stack), %rsp /* Construct struct pt_regs on stack */ pushq $__USER_DS /* pt_regs->ss */ pushq PER_CPU_VAR(rsp_scratch)/* pt_regs->sp */ ... wouldn't it be _great_ if one of GPRs would be available here to hold userspace %rsp? ) If userspace needs some registers saved, it's trivial for it to have: push reg1 push reg2 int 0x80 pop reg2 pop reg1 OTOH if userspace _does not_ need some registers saved, but they are defined as saved by the entrypoint ABI, then save/restore work is done every time, even when not needed. Thus, I propose to retain the current behavior.
Re: [PATCH 34/34] x86/mm/pti: Add Warning when booting on a PCIE capable CPU
On 03/05/2018 11:26 AM, Joerg Roedel wrote: From: Joerg Roedel Warn the user in case the performance can be significantly improved by switching to a 64-bit kernel. Suggested-by: Andy Lutomirski Signed-off-by: Joerg Roedel --- arch/x86/mm/pti.c | 16 1 file changed, 16 insertions(+) diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 3ffd923..8f5aa0d 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -385,6 +385,22 @@ void __init pti_init(void) pr_info("enabled\n"); +#ifdef CONFIG_X86_32 + if (boot_cpu_has(X86_FEATURE_PCID)) { + /* Use printk to work around pr_fmt() */ + printk(KERN_WARNING "\n"); + printk(KERN_WARNING "\n"); + printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n"); + printk(KERN_WARNING "** **\n"); + printk(KERN_WARNING "** You are using 32-bit PTI on a 64-bit PCID-capable CPU. **\n"); + printk(KERN_WARNING "** Your performance will increase dramatically if you **\n"); + printk(KERN_WARNING "** switch to a 64-bit kernel! **\n"); + printk(KERN_WARNING "** **\n"); + printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! **\n"); + printk(KERN_WARNING "\n"); Isn't it a bit too dramatic? Not one, but two lines of big fat warnings? There are people who run 32-bit kernels on purpose, not because they did not yet realize 64 bits are upon us. E.g. industrial setups with strict regulations and licensing requirements. In many such cases they already are more than satisfied with CPU speeds, thus not interested in 64-bit migration for performance reasons, and avoid it because it would incur mountains of paperwork with no tangible gains. The big fat warning on every boot would be irritating.
[PATCH v2] net: make getname() functions return length rather than use int* parameter
Changes since v1: Added changes in these files: drivers/infiniband/hw/usnic/usnic_transport.c drivers/staging/lustre/lnet/lnet/lib-socket.c drivers/target/iscsi/iscsi_target_login.c drivers/vhost/net.c fs/dlm/lowcomms.c fs/ocfs2/cluster/tcp.c security/tomoyo/network.c Before: All these functions either return a negative error indicator, or store length of sockaddr into "int *socklen" parameter and return zero on success. "int *socklen" parameter is awkward. For example, if caller does not care, it still needs to provide on-stack storage for the value it does not need. None of the many FOO_getname() functions of various protocols ever used old value of *socklen. They always just overwrite it. This change drops this parameter, and makes all these functions, on success, return length of sockaddr. It's always >= 0 and can be differentiated from an error. Tests in callers are changed from "if (err)" to "if (err < 0)", where needed. rpc_sockname() lost "int buflen" parameter, since its only use was to be passed to kernel_getsockname() as &buflen and subsequently not used in any way. Userspace API is not changed. textdata bss dec hex filename 30108430 2633624 873672 33615726 200ef6e vmlinux.before.o 30108109 2633612 873672 33615393 200ee21 vmlinux.o Signed-off-by: Denys Vlasenko CC: David S. 
Miller CC: linux-kernel@vger.kernel.org CC: net...@vger.kernel.org CC: linux-blueto...@vger.kernel.org CC: linux-decnet-u...@lists.sourceforge.net CC: linux-wirel...@vger.kernel.org CC: linux-r...@vger.kernel.org CC: linux-s...@vger.kernel.org CC: linux-...@vger.kernel.org CC: linux-...@vger.kernel.org --- drivers/infiniband/hw/usnic/usnic_transport.c | 5 ++-- drivers/isdn/mISDN/socket.c | 5 ++-- drivers/net/ppp/pppoe.c | 6 ++--- drivers/net/ppp/pptp.c| 6 ++--- drivers/scsi/iscsi_tcp.c | 14 +-- drivers/soc/qcom/qmi_interface.c | 3 +-- drivers/staging/ipx/af_ipx.c | 6 ++--- drivers/staging/irda/net/af_irda.c| 8 +++--- drivers/staging/lustre/lnet/lnet/lib-socket.c | 7 +++--- drivers/target/iscsi/iscsi_target_login.c | 18 +++--- drivers/vhost/net.c | 7 +++--- fs/dlm/lowcomms.c | 7 +++--- fs/ocfs2/cluster/tcp.c| 6 ++--- include/linux/net.h | 8 +++--- include/net/inet_common.h | 2 +- include/net/ipv6.h| 2 +- include/net/sock.h| 2 +- net/appletalk/ddp.c | 5 ++-- net/atm/pvc.c | 5 ++-- net/atm/svc.c | 5 ++-- net/ax25/af_ax25.c| 4 +-- net/bluetooth/hci_sock.c | 4 +-- net/bluetooth/l2cap_sock.c| 5 ++-- net/bluetooth/rfcomm/sock.c | 5 ++-- net/bluetooth/sco.c | 5 ++-- net/can/raw.c | 6 ++--- net/core/sock.c | 5 ++-- net/decnet/af_decnet.c| 6 ++--- net/ipv4/af_inet.c| 5 ++-- net/ipv6/af_inet6.c | 5 ++-- net/iucv/af_iucv.c| 5 ++-- net/l2tp/l2tp_ip.c| 5 ++-- net/l2tp/l2tp_ip6.c | 5 ++-- net/l2tp/l2tp_ppp.c | 5 ++-- net/llc/af_llc.c | 5 ++-- net/netlink/af_netlink.c | 5 ++-- net/netrom/af_netrom.c| 9 --- net/nfc/llcp_sock.c | 5 ++-- net/packet/af_packet.c| 10 +++- net/phonet/socket.c | 5 ++-- net/qrtr/qrtr.c | 5 ++-- net/rds/af_rds.c | 5 ++-- net/rds/tcp.c | 7 ++ net/rose/af_rose.c| 5 ++-- net/sctp/ipv6.c | 8 +++--- net/smc/af_smc.c | 11 - net/socket.c | 35 +-- net/sunrpc/clnt.c | 6 ++--- net/sunrpc/svcsock.c | 13 ++ net/sunrpc/xprtsock.c | 3 +-- net/tipc/socket.c | 5 ++-- net/unix/af_unix.c| 10 net/vmw_vsock/af_vsock.c | 4 +-- net/x25/af_x25.c | 4 +-- security/tomoyo/network.c 
| 5 ++-- 55 files changed, 159 insertions(+), 203 deletions(-)
Re: [PATCH] net: make getname() functions return length rather than use int* parameter
On 02/12/2018 06:47 PM, David Miller wrote: From: Denys Vlasenko Date: Mon, 12 Feb 2018 15:15:18 +0100 Before: All these functions either return a negative error indicator, or store length of sockaddr into "int *socklen" parameter and return zero on success. "int *socklen" parameter is awkward. For example, if caller does not care, it still needs to provide on-stack storage for the value it does not need. None of the many FOO_getname() functions of various protocols ever used old value of *socklen. They always just overwrite it. This change drops this parameter, and makes all these functions, on success, return length of sockaddr. It's always >= 0 and can be differentiated from an error. Tests in callers are changed from "if (err)" to "if (err < 0)", where needed. rpc_sockname() lost "int buflen" parameter, since its only use was to be passed to kernel_getsockname() as &buflen and subsequently not used in any way. Userspace API is not changed. textdata bss dec hex filename 30108430 2633624 873672 33615726 200ef6e vmlinux.before.o 30108109 2633612 873672 33615393 200ee21 vmlinux.o Signed-off-by: Denys Vlasenko Please do an allmodconfig build, there are still some conversions you missed: security/tomoyo/network.c: In function ‘tomoyo_socket_listen_permission’: security/tomoyo/network.c:658:19: warning: passing argument 3 of ‘sock->ops->getname’ makes integer from pointer without a cast [-Wint-conversion] &addr, &addr_len, 0); ^ security/tomoyo/network.c:658:19: note: expected ‘int’ but argument is of type ‘int *’ security/tomoyo/network.c:657:21: error: too many arguments to function ‘sock->ops->getname’ const int error = sock->ops->getname(sock, (struct sockaddr *) ^~~~ fs/dlm/lowcomms.c: In function ‘lowcomms_error_report’: fs/dlm/lowcomms.c:495:6: error: too many arguments to function ‘kernel_getpeername’ kernel_getpeername(con->sock, (struct sockaddr *)&saddr, &buflen)) { ^~ fs/dlm/lowcomms.c: In function ‘tcp_accept_from_sock’: fs/dlm/lowcomms.c:761:7: warning: 
passing argument 3 of ‘newsock->ops->getname’ makes integer from pointer without a cast [-Wint-conversion] &len, 2)) { ^ fs/dlm/lowcomms.c:761:7: note: expected ‘int’ but argument is of type ‘int *’ fs/dlm/lowcomms.c:760:6: error: too many arguments to function ‘newsock->ops->getname’ if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr, ^~~ Sorry. Will send updated patch.
[PATCH] net: make getname() functions return length rather than use int* parameter
Before: All these functions either return a negative error indicator, or store length of sockaddr into "int *socklen" parameter and return zero on success. "int *socklen" parameter is awkward. For example, if caller does not care, it still needs to provide on-stack storage for the value it does not need. None of the many FOO_getname() functions of various protocols ever used old value of *socklen. They always just overwrite it. This change drops this parameter, and makes all these functions, on success, return length of sockaddr. It's always >= 0 and can be differentiated from an error. Tests in callers are changed from "if (err)" to "if (err < 0)", where needed. rpc_sockname() lost "int buflen" parameter, since its only use was to be passed to kernel_getsockname() as &buflen and subsequently not used in any way. Userspace API is not changed. textdata bss dec hex filename 30108430 2633624 873672 33615726 200ef6e vmlinux.before.o 30108109 2633612 873672 33615393 200ee21 vmlinux.o Signed-off-by: Denys Vlasenko CC: David S. 
Miller CC: linux-kernel@vger.kernel.org CC: net...@vger.kernel.org CC: linux-blueto...@vger.kernel.org CC: linux-decnet-u...@lists.sourceforge.net CC: linux-wirel...@vger.kernel.org CC: linux-r...@vger.kernel.org CC: linux-s...@vger.kernel.org CC: linux-...@vger.kernel.org CC: linux-...@vger.kernel.org --- drivers/isdn/mISDN/socket.c| 5 ++--- drivers/net/ppp/pppoe.c| 6 ++ drivers/net/ppp/pptp.c | 6 ++ drivers/scsi/iscsi_tcp.c | 14 +++--- drivers/soc/qcom/qmi_interface.c | 3 +-- drivers/staging/ipx/af_ipx.c | 6 ++ drivers/staging/irda/net/af_irda.c | 8 +++- include/linux/net.h| 8 +++- include/net/inet_common.h | 2 +- include/net/ipv6.h | 2 +- include/net/sock.h | 2 +- net/appletalk/ddp.c| 5 ++--- net/atm/pvc.c | 5 ++--- net/atm/svc.c | 5 ++--- net/ax25/af_ax25.c | 4 ++-- net/bluetooth/hci_sock.c | 4 ++-- net/bluetooth/l2cap_sock.c | 5 ++--- net/bluetooth/rfcomm/sock.c| 5 ++--- net/bluetooth/sco.c| 5 ++--- net/can/raw.c | 6 ++ net/core/sock.c| 5 +++-- net/decnet/af_decnet.c | 6 ++ net/ipv4/af_inet.c | 5 ++--- net/ipv6/af_inet6.c| 5 ++--- net/iucv/af_iucv.c | 5 ++--- net/l2tp/l2tp_ip.c | 5 ++--- net/l2tp/l2tp_ip6.c| 5 ++--- net/l2tp/l2tp_ppp.c| 5 ++--- net/llc/af_llc.c | 5 ++--- net/netlink/af_netlink.c | 5 ++--- net/netrom/af_netrom.c | 9 + net/nfc/llcp_sock.c| 5 ++--- net/packet/af_packet.c | 10 -- net/phonet/socket.c| 5 ++--- net/qrtr/qrtr.c| 5 ++--- net/rds/af_rds.c | 5 ++--- net/rds/tcp.c | 7 ++- net/rose/af_rose.c | 5 ++--- net/sctp/ipv6.c| 8 net/smc/af_smc.c | 7 +++ net/socket.c | 35 +-- net/sunrpc/clnt.c | 6 +++--- net/sunrpc/svcsock.c | 13 - net/sunrpc/xprtsock.c | 3 +-- net/tipc/socket.c | 5 ++--- net/unix/af_unix.c | 10 +- net/vmw_vsock/af_vsock.c | 4 ++-- net/x25/af_x25.c | 4 ++-- 48 files changed, 132 insertions(+), 171 deletions(-) diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index c5603d1a07d6..1f8f489b4167 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -560,7 +560,7 @@ data_sock_bind(struct 
socket *sock, struct sockaddr *addr, int addr_len) static int data_sock_getname(struct socket *sock, struct sockaddr *addr, - int *addr_len, int peer) + int peer) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; @@ -570,14 +570,13 @@ data_sock_getname(struct socket *sock, struct sockaddr *addr, lock_sock(sk); - *addr_len = sizeof(*maddr); maddr->family = AF_ISDN; maddr->dev = _pms(sk)->dev->id; maddr->channel = _pms(sk)->ch.nr; maddr->sapi = _pms(sk)->ch.addr & 0xff; maddr->tei = (_pms(sk)->ch.addr >> 8) & 0xff; release_sock(sk); - return 0; + return sizeof(*maddr); } static const struct proto_ops data_sock_ops = { diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 5aa59f41bf8c
Re: [tip:x86/pti] x86/entry/64: Introduce the PUSH_AND_CLEAN_REGS macro
On 02/12/2018 02:36 PM, David Laight wrote: From: Denys Vlasenko Sent: 12 February 2018 13:29 ... x86/entry/64: Introduce the PUSH_AND_CLEAN_REGS macro Those instances where ALLOC_PT_GPREGS_ON_STACK is called just before SAVE_AND_CLEAR_REGS can trivially be replaced by PUSH_AND_CLEAN_REGS. This macro uses PUSH instead of MOV and should therefore be faster, at least on newer CPUs. ... Link: http://lkml.kernel.org/r/20180211104949.12992-5-li...@dominikbrodowski.net Signed-off-by: Ingo Molnar --- arch/x86/entry/calling.h | 36 arch/x86/entry/entry_64.S | 6 ++ 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index a05cbb8..57b1b87 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -137,6 +137,42 @@ For 32-bit we have the following conventions - kernel is built with UNWIND_HINT_REGS offset=\offset .endm + .macro PUSH_AND_CLEAR_REGS + /* +* Push registers and sanitize registers of values that a +* speculation attack might otherwise want to exploit. The +* lower registers are likely clobbered well before they +* could be put to use in a speculative execution gadget. +* Interleave XOR with PUSH for better uop scheduling: +*/ + pushq %rdi/* pt_regs->di */ + pushq %rsi/* pt_regs->si */ + pushq %rdx/* pt_regs->dx */ + pushq %rcx/* pt_regs->cx */ + pushq %rax/* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + xorq%r8, %r8/* nospec r8 */ xorq's are slower than xorl's on Silvermont/Knights Landing. I propose using xorl instead. Does using movq to copy the first zero to the other registers make the code any faster? ISTR mov reg-reg is often implemented as a register rename rather than an alu operation. xorl is implemented in register rename as well. Just, for some reason, xorq did not get the same treatment on those CPUs.
Re: [tip:x86/pti] x86/entry/64: Introduce the PUSH_AND_CLEAN_REGS macro
On 02/12/2018 11:17 AM, tip-bot for Dominik Brodowski wrote: Commit-ID: 7b7b09f110f06f3c006e9b3f4590f7d9ba91345b Gitweb: https://git.kernel.org/tip/7b7b09f110f06f3c006e9b3f4590f7d9ba91345b Author: Dominik Brodowski AuthorDate: Sun, 11 Feb 2018 11:49:45 +0100 Committer: Ingo Molnar CommitDate: Mon, 12 Feb 2018 08:06:36 +0100 x86/entry/64: Introduce the PUSH_AND_CLEAN_REGS macro Those instances where ALLOC_PT_GPREGS_ON_STACK is called just before SAVE_AND_CLEAR_REGS can trivially be replaced by PUSH_AND_CLEAN_REGS. This macro uses PUSH instead of MOV and should therefore be faster, at least on newer CPUs. Suggested-by: Linus Torvalds Signed-off-by: Dominik Brodowski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: dan.j.willi...@intel.com Link: http://lkml.kernel.org/r/20180211104949.12992-5-li...@dominikbrodowski.net Signed-off-by: Ingo Molnar --- arch/x86/entry/calling.h | 36 arch/x86/entry/entry_64.S | 6 ++ 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index a05cbb8..57b1b87 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -137,6 +137,42 @@ For 32-bit we have the following conventions - kernel is built with UNWIND_HINT_REGS offset=\offset .endm + .macro PUSH_AND_CLEAR_REGS + /* +* Push registers and sanitize registers of values that a +* speculation attack might otherwise want to exploit. The +* lower registers are likely clobbered well before they +* could be put to use in a speculative execution gadget. +* Interleave XOR with PUSH for better uop scheduling: +*/ + pushq %rdi/* pt_regs->di */ + pushq %rsi/* pt_regs->si */ + pushq %rdx/* pt_regs->dx */ + pushq %rcx/* pt_regs->cx */ + pushq %rax/* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 */ + xorq%r8, %r8/* nospec r8 */ xorq's are slower than xorl's on Silvermont/Knights Landing. I propose using xorl instead. 
+ pushq %r9 /* pt_regs->r9 */ + xorq%r9, %r9/* nospec r9 */ + pushq %r10/* pt_regs->r10 */ + xorq%r10, %r10 /* nospec r10 */ + pushq %r11/* pt_regs->r11 */ + xorq%r11, %r11 /* nospec r11*/ + pushq %rbx/* pt_regs->rbx */ + xorl%ebx, %ebx /* nospec rbx*/ + pushq %rbp/* pt_regs->rbp */ + xorl%ebp, %ebp /* nospec rbp*/ + pushq %r12/* pt_regs->r12 */ + xorq%r12, %r12 /* nospec r12*/ + pushq %r13/* pt_regs->r13 */ + xorq%r13, %r13 /* nospec r13*/ + pushq %r14/* pt_regs->r14 */ + xorq%r14, %r14 /* nospec r14*/ + pushq %r15/* pt_regs->r15 */ + xorq%r15, %r15 /* nospec r15*/
Re: [PATCH 09/31] x86/entry/32: Leave the kernel via trampoline stack
On 02/09/2018 08:02 PM, Joerg Roedel wrote: On Fri, Feb 09, 2018 at 09:05:02AM -0800, Linus Torvalds wrote: On Fri, Feb 9, 2018 at 1:25 AM, Joerg Roedel wrote: + + /* Copy over the stack-frame */ + cld + rep movsb Ugh. This is going to be horrendous. Maybe not noticeable on modern CPU's, but the whole 32-bit code is kind of pointless on a modern CPU. At least use "rep movsl". If the kernel stack isn't 4-byte aligned, you have issues. Okay, I used movsb because I remembered that being the recommendation for the most efficient memcpy, and it saves me an instruction. But that is probably only true on modern CPUs. It's fast (copies data with full-width loads and stores, up to 64-byte wide on latest Intel CPUs), but this kicks in only for largish blocks. In your case, you are copying less than 100 bytes.
Re: [PATCH 09/31] x86/entry/32: Leave the kernel via trampoline stack
On 02/09/2018 06:05 PM, Linus Torvalds wrote: On Fri, Feb 9, 2018 at 1:25 AM, Joerg Roedel wrote: + + /* Copy over the stack-frame */ + cld + rep movsb Ugh. This is going to be horrendous. Maybe not noticeable on modern CPU's, but the whole 32-bit code is kind of pointless on a modern CPU. At least use "rep movsl". If the kernel stack isn't 4-byte aligned, you have issues. Indeed, "rep movs" has some setup overhead that makes it undesirable for small sizes. In my testing, moving less than 128 bytes with "rep movs" is a loss.
Re: [PATCH v2 09/18] x86/asm: Move SYSENTER_stack to the beginning of struct tss_struct
On Wed, Nov 22, 2017 at 5:44 AM, Andy Lutomirski wrote: > I want SYSENTER_stack to have reliable overflow detection, which > means that it needs to be at the bottom of a page, not the top. > Move it to the beginning of struct tss_struct and page-align it. > > Also add an assertion to make sure that the fixed hardware TSS > doesn't cross a page boundary. > > Signed-off-by: Andy Lutomirski > --- > arch/x86/include/asm/processor.h | 21 - > arch/x86/kernel/cpu/common.c | 22 ++ > 2 files changed, 34 insertions(+), 9 deletions(-) > > diff --git a/arch/x86/include/asm/processor.h > b/arch/x86/include/asm/processor.h > index d32a3c88a968..8f5dac9dfbdc 100644 > --- a/arch/x86/include/asm/processor.h > +++ b/arch/x86/include/asm/processor.h > @@ -327,7 +327,16 @@ struct x86_hw_tss { > > struct tss_struct { > /* > -* The hardware state: > +* Space for the temporary SYSENTER stack, used for SYSENTER > +* and the entry trampoline as well. > +*/ > + unsigned long SYSENTER_stack_canary; > + unsigned long SYSENTER_stack[64]; > + > + /* > +* The fixed hardware portion. This must not cross a page boundary > +* at risk of violating the SDM's advice and potentially triggering > +* errata. > */ > struct x86_hw_tss x86_tss; > > @@ -338,15 +347,9 @@ struct tss_struct { > * be within the limit. > */ > unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; > +} __attribute__((__aligned__(PAGE_SIZE))); > > - /* > -* Space for the temporary SYSENTER stack. > -*/ > - unsigned long SYSENTER_stack_canary; > - unsigned long SYSENTER_stack[64]; > -} cacheline_aligned; You may also move this initializer in process.c: .SYSENTER_stack_canary = STACK_END_MAGIC,
Re: [tip:x86/asm] x86/umip: Add emulation code for UMIP instructions
On 11/08/2017 06:14 PM, Paolo Bonzini wrote: On 08/11/2017 18:09, Denys Vlasenko wrote: On 11/08/2017 05:57 PM, Linus Torvalds wrote: On Wed, Nov 8, 2017 at 8:53 AM, Denys Vlasenko wrote: We can postpone enabling UMIP by default by a year or so. By this time, new Wine will be on majority of users' machines. So you are suggesting we run unnecessarily insecure, only in order to not do the emulation that we already have the code for and that the patch implements? We ran insecure in this way for ~25 years. Why? To avoid having to maintain more obscure, rarely executed code. As a start, you could propose a patch to disable the emulation code through a sysctl or Kconfig symbol. This way, the emulation code will still be in the kernel, and still need to be maintained. In my opinion, if we do emulate these insns, then adding even more code to disable that emulation is not worth doing. I would be surprised if it takes more time than what you've spent writing emails in this thread. Yes, I'm not only f**ing retarded, I'm also lazy. Thanks guys...
Re: [tip:x86/asm] x86/umip: Add emulation code for UMIP instructions
On 11/08/2017 05:57 PM, Linus Torvalds wrote: On Wed, Nov 8, 2017 at 8:53 AM, Denys Vlasenko wrote: We can postpone enabling UMIP by default by a year or so. By this time, new Wine will be on majority of users' machines. So you are suggesting we run unnecessarily insecure, only in order to not do the emulation that we already have the code for and that the patch implements? We ran insecure in this way for ~25 years. Why? To avoid having to maintain more obscure, rarely executed code.
Re: [tip:x86/asm] x86/umip: Add emulation code for UMIP instructions
On 11/08/2017 05:34 PM, Linus Torvalds wrote: On Wed, Nov 8, 2017 at 8:14 AM, Denys Vlasenko wrote: Can we avoid maintaining emulation of these insns, by asking Wine to remove their use instead? If we ask the Wine people to remove the instruction use, that may mean that we can avoid the emulation in four or five _years_ once everybody has updated. But it wouldn't mean that we could avoid it today. We can postpone enabling UMIP by default by a year or so. By this time, new Wine will be on majority of users' machines. Then, when kernels switch to enable UMIP by default, umip=0 kernel flag can be used if somebody for some reason updates their kernel but not Wine. This is much less code, and simpler code, than implementing SIDT et al emulation. Keep in mind that our SIDT emulation itself can turn out to be buggy, in the worst case it may end up having worse holes than the SIDT insn itself had - what if it can be tricked into writing into an arbitrary kernel address?
Re: [tip:x86/asm] x86/umip: Add emulation code for UMIP instructions
On 11/08/2017 12:00 PM, tip-bot for Ricardo Neri wrote: Commit-ID: 1e5db223696afa55e6a038fac638f759e1fdcc01 Gitweb: https://git.kernel.org/tip/1e5db223696afa55e6a038fac638f759e1fdcc01 Author: Ricardo Neri AuthorDate: Sun, 5 Nov 2017 18:27:52 -0800 Committer: Ingo Molnar CommitDate: Wed, 8 Nov 2017 11:16:22 +0100 x86/umip: Add emulation code for UMIP instructions The feature User-Mode Instruction Prevention present in recent Intel processor prevents a group of instructions (sgdt, sidt, sldt, smsw, and str) from being executed with CPL > 0. Otherwise, a general protection fault is issued. This was arguably an oversight on Intel's part - these insns should have been protected from the start, as they leak a tiny bit of kernel data. Rather than relaying to the user space the general protection fault caused by the UMIP-protected instructions (in the form of a SIGSEGV signal), it can be trapped and the instruction emulated to provide a dummy result. This allows to both conserve the current kernel behavior and not reveal the system resources that UMIP intends to protect (i.e., the locations of the global descriptor and interrupt descriptor tables, the segment selectors of the local descriptor table, the value of the task state register and the contents of the CR0 register). This emulation is needed because certain applications (e.g., WineHQ and DOSEMU2) rely on this subset of instructions to function. I'm surprised. What in the world they need those insns for? Wine uses sidt like this, to emulate "mov from r/m to reg" insns: static LDT_ENTRY idt[256]; ... case 0x8a: /* mov Eb, Gb */ case 0x8b: /* mov Ev, Gv */ { BYTE *data = INSTR_GetOperandAddr(context, instr + 1, long_addr, segprefix, &len); unsigned int data_size = (*instr == 0x8b) ? (long_op ? 
4 : 2) : 1; struct idtr idtr = get_idtr(); <=== HERE unsigned int offset = data - idtr.base; if (offset <= idtr.limit + 1 - data_size) { idt[1].LimitLow = 0x100; /* FIXME */ idt[2].LimitLow = 0x11E; /* FIXME */ idt[3].LimitLow = 0x500; /* FIXME */ switch (*instr) { case 0x8a: store_reg_byte( context, instr[1], (BYTE *)idt + offset ); break; case 0x8b: store_reg_word( context, instr[1], (BYTE *)idt + offset, long_op ); break; } context->Eip += prefixlen + len + 1; return ExceptionContinueExecution; } break; /* Unable to emulate it */ } Looks baffling, to say the least... this supports someone who reads IDT bytes via those insns, and they need to ensure that the values read from idt[1/2/3].LimitLow are as expected. That's it? Pity git history doesn't go far enough in the past, and comments are not informative as well... I did not find smsw or sgdt in Wine git tree. I did not find smsw, sidt or sgdt in dosemu2-devel git tree. Can we avoid maintain emulation of these isns, by asking Wine to remove their use instead?
Is GET_CR0_INTO_EAX macro unused?
Hi Andy, From by git archaeology, looks like last use of GET_CR0_INTO_EAX was removed long ago, in 2008 (see commit below). Right now, I only grep it here in a comment in entry_32.S: /* * We use macros for low-level operations which need to be overridden * for paravirtualization. The following will never clobber any registers: * INTERRUPT_RETURN (aka. "iret") * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") To support GET_CR0_INTO_EAX, paravirt has PV_CPU_read_cr0 a.k.a. struct pv_cpu_ops::read_cr0 and a bunch of code to support it for each hypervisor. Can we delete it? Or am I missing a non-obvious place where this macro is still in use? commit 7643e9b936b4af31ba4851eb7d5b3a3bfad52502 Author: Alexander van Heukelum Date: Tue Sep 9 21:56:02 2008 +0200 i386: convert hardware exception 7 to an interrupt gate Handle no coprocessor exception with interrupt initially off. device_not_available in entry_32.S calls either math_state_restore or math_emulate. This patch adds an extra indirection to be able to re-enable interrupts explicitly in traps_32.c Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 109792b..5a88585 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -760,20 +760,9 @@ ENTRY(device_not_available) RING0_INT_FRAME pushl $-1 # mark this as an int CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - GET_CR0_INTO_EAX - testl $0x4, %eax# EM (math emulation bit) - jne device_not_available_emulate - preempt_stop(CLBR_ANY) - call math_state_restore - jmp ret_from_exception -device_not_available_emulate: - pushl $0# temporary storage for ORIG_EIP + pushl $do_device_not_available CFI_ADJUST_CFA_OFFSET 4 - call math_emulate - addl $4, %esp - CFI_ADJUST_CFA_OFFSET -4 - jmp ret_from_exception + jmp error_code CFI_ENDPROC END(device_not_available)
Re: [PATCH] KVM: SVM: refactor avic VM ID allocation
On 08/17/2017 04:33 PM, Paolo Bonzini wrote: On 15/08/2017 22:12, Radim Krčmář wrote: 2017-08-11 22:11+0200, Denys Vlasenko: With lightly tweaked defconfig: textdata bss dec hex filename 11259661 5109408 2981888 19350957 12745ad vmlinux.before 11259661 5109408 884736 17253805 10745ad vmlinux.after Only compile-tested. Signed-off-by: Denys Vlasenko Cc: Joerg Roedel Cc: pbonz...@redhat.com Cc: rkrc...@redhat.com Cc: t...@linutronix.de Cc: mi...@redhat.com Cc: h...@zytor.com Cc: x...@kernel.org Cc: k...@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- Ah, I suspected my todo wasn't this short; thanks for the patch! @@ -1468,6 +1433,22 @@ static int avic_vm_init(struct kvm *kvm) clear_page(page_address(l_page)); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); + again: + vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK; + if (vm_id == 0) { /* id is 1-based, zero is not okay */ Suravee, did the reserved zero mean something? + next_vm_id_wrapped = 1; + goto again; + } + /* Is it still in use? Only possible if wrapped at least once */ + if (next_vm_id_wrapped) { + hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) { + struct kvm *k2 = container_of(ka, struct kvm, arch); + struct kvm_arch *vd2 = &k2->arch; + if (vd2->avic_vm_id == vm_id) + goto again; Although hitting the case where all 2^24 ids are already used is practically impossible, I don't like the loose end ... I think even the case where 2^16 ids are used deserves a medal. Why don't we just make the bitmap 8 KiB and call it a day? :) Well, the RAM is cheap, but a 4-byte variable is still thousands of times smaller than a 8K bitmap. Since a 256 element hash table is used here, you need to have ~one hundred VMs to start seeing (very small) degradation in speed of creation of new VMs compared to bitmap approach, and that is only after 16777216 VMs were created since reboot. If you want to spend RAM on speeding this up, you can increase hash table size instead. 
That would speed up avic_ga_log_notifier() too.
[PATCH] lib/raid6: align AVX512 constants to 512 bits, not bytes
Signed-off-by: Denys Vlasenko Cc: H. Peter Anvin Cc: mi...@redhat.com Cc: Jim Kukunas Cc: Fenghua Yu Cc: Megha Dey Cc: Gayatri Kammela Cc: Shaohua Li Cc: x...@kernel.org Cc: linux-kernel@vger.kernel.org --- avx512.o before: Sections: Idx Name Size VMA LMA File off Algn 4 .rodata 0240 0c00 2**9 CONTENTS, ALLOC, LOAD, RELOC, READONLY, DATA ... Contents of section .rodata: 0010 0020 0100 0030 0040 0050 0060 0100 0070 0080 0090 00a0 0100 00b0 00c0 00d0 00e0 00f0 0100 0110 0120 0130 0140 0150 0160 0170 0180 0190 01a0 01b0 01c0 01d0 01e0 01f0 0200 1d1d1d1d 1d1d1d1d 1d1d1d1d 1d1d1d1d 0210 1d1d1d1d 1d1d1d1d 1d1d1d1d 1d1d1d1d 0220 1d1d1d1d 1d1d1d1d 1d1d1d1d 1d1d1d1d 0230 1d1d1d1d 1d1d1d1d 1d1d1d1d 1d1d1d1d avx512.o after: Sections: Idx Name Size VMA LMA File off Algn 4 .rodata 0100 0b40 2**6 CONTENTS, ALLOC, LOAD, RELOC, READONLY, DATA ... Contents of section .rodata: 0010 0020 0100 0030 0040 0050 0060 0100 0070 0080 0090 00a0 0100 00b0 00c0 1d1d1d1d 1d1d1d1d 1d1d1d1d 1d1d1d1d 00d0 1d1d1d1d 1d1d1d1d 1d1d1d1d 1d1d1d1d 00e0 1d1d1d1d 1d1d1d1d 1d1d1d1d 1d1d1d1d 00f0 1d1d1d1d 1d1d1d1d 1d1d1d1d 1d1d1d1d lib/raid6/avx512.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/raid6/avx512.c b/lib/raid6/avx512.c index f524a79..46df797 100644 --- a/lib/raid6/avx512.c +++ b/lib/raid6/avx512.c @@ -29,7 +29,7 @@ static const struct raid6_avx512_constants { u64 x1d[8]; -} raid6_avx512_constants __aligned(512) = { +} raid6_avx512_constants __aligned(512/8) = { { 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL, -- 2.9.2
Re: [PATCH] KVM/x86: Increase max vcpu number to 352
On Thu, Aug 10, 2017 at 12:00 PM, Lan Tianyu wrote: > Intel Xeon phi chip will support 352 logical threads. For HPC usage > case, it will create a huge VM with vcpu number as same as host cpus. This > patch is to increase max vcpu number to 352. This number was bumped in the past to 288 to accommodate Knights Landing, but KNL's max designed thread number is actually 304: the on-die interconnect mesh is 6*7, with four cells taken for interconnect and memory controllers, there are 38 CPU cells. Each CPU cell has two cores with shared L2. Each core is SMT4. 38*8 = 304. Intel fuses two cells (or more), so 288 is the largest number of threads on a KNL you can buy, but 304 thread KNLs most probably also exist (however they may be rather rare since they require completely defect-free die). I think it's better if Linux would support those too. What is the design maximum for these new Xeon Phis which are "nominally 352 thread"? 360? (If the mesh is 7*7 and the same 4 cells are taken for non-CPU needs)
[PATCH] KVM: SVM: delete avic_vm_id_bitmap (2 megabyte static array)
With lightly tweaked defconfig: textdata bss dec hex filename 11259661 5109408 2981888 19350957 12745ad vmlinux.before 11259661 5109408 884736 17253805 10745ad vmlinux.after Only compile-tested. Signed-off-by: Denys Vlasenko Cc: Joerg Roedel Cc: pbonz...@redhat.com Cc: rkrc...@redhat.com Cc: t...@linutronix.de Cc: mi...@redhat.com Cc: h...@zytor.com Cc: x...@kernel.org Cc: k...@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- arch/x86/kvm/svm.c | 61 +++--- 1 file changed, 21 insertions(+), 40 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1107626..f575089 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -280,10 +280,6 @@ module_param(avic, int, S_IRUGO); static int vls = true; module_param(vls, int, 0444); -/* AVIC VM ID bit masks and lock */ -static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR); -static DEFINE_SPINLOCK(avic_vm_id_lock); - static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); static void svm_flush_tlb(struct kvm_vcpu *vcpu); static void svm_complete_interrupts(struct vcpu_svm *svm); @@ -989,6 +985,8 @@ static void disable_nmi_singlestep(struct vcpu_svm *svm) */ #define SVM_VM_DATA_HASH_BITS 8 static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); +static u32 next_vm_id = 0; +static bool next_vm_id_wrapped = 0; static DEFINE_SPINLOCK(svm_vm_data_hash_lock); /* Note: @@ -1387,34 +1385,6 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return 0; } -static inline int avic_get_next_vm_id(void) -{ - int id; - - spin_lock(&avic_vm_id_lock); - - /* AVIC VM ID is one-based. 
*/ - id = find_next_zero_bit(avic_vm_id_bitmap, AVIC_VM_ID_NR, 1); - if (id <= AVIC_VM_ID_MASK) - __set_bit(id, avic_vm_id_bitmap); - else - id = -EAGAIN; - - spin_unlock(&avic_vm_id_lock); - return id; -} - -static inline int avic_free_vm_id(int id) -{ - if (id <= 0 || id > AVIC_VM_ID_MASK) - return -EINVAL; - - spin_lock(&avic_vm_id_lock); - __clear_bit(id, avic_vm_id_bitmap); - spin_unlock(&avic_vm_id_lock); - return 0; -} - static void avic_vm_destroy(struct kvm *kvm) { unsigned long flags; @@ -1423,8 +1393,6 @@ static void avic_vm_destroy(struct kvm *kvm) if (!avic) return; - avic_free_vm_id(vm_data->avic_vm_id); - if (vm_data->avic_logical_id_table_page) __free_page(vm_data->avic_logical_id_table_page); if (vm_data->avic_physical_id_table_page) @@ -1438,19 +1406,16 @@ static void avic_vm_destroy(struct kvm *kvm) static int avic_vm_init(struct kvm *kvm) { unsigned long flags; - int vm_id, err = -ENOMEM; + int err = -ENOMEM; struct kvm_arch *vm_data = &kvm->arch; struct page *p_page; struct page *l_page; + struct kvm_arch *ka; + u32 vm_id; if (!avic) return 0; - vm_id = avic_get_next_vm_id(); - if (vm_id < 0) - return vm_id; - vm_data->avic_vm_id = (u32)vm_id; - /* Allocating physical APIC ID table (4KB) */ p_page = alloc_page(GFP_KERNEL); if (!p_page) @@ -1468,6 +1433,22 @@ static int avic_vm_init(struct kvm *kvm) clear_page(page_address(l_page)); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); + again: + vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK; + if (vm_id == 0) { /* id is 1-based, zero is not okay */ + next_vm_id_wrapped = 1; + goto again; + } + /* Is it still in use? 
Only possible if wrapped at least once */ + if (next_vm_id_wrapped) { + hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) { + struct kvm *k2 = container_of(ka, struct kvm, arch); + struct kvm_arch *vd2 = &k2->arch; + if (vd2->avic_vm_id == vm_id) + goto again; + } + } + vm_data->avic_vm_id = vm_id; hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id); spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); -- 2.9.2
Re: [PATCH] selftests: ftrace: Use md5sum to take less time of checking logs
On Tue, Jun 27, 2017 at 12:28 PM, Masami Hiramatsu wrote: > Use md5sum so that it takes less time of checking > trace logs update. Since busybox tail/cat takes too > long time to read the trace log, this uses md5sum > to check whether trace log is updated or not. > > Signed-off-by: Masami Hiramatsu > --- > .../test.d/ftrace/func_traceonoff_triggers.tc |6 +++--- > 1 file changed, 3 insertions(+), 3 deletions(-) > > diff --git > a/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc > b/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc > index 9cf3852..7a9ab4f 100644 > --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc > +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc > @@ -103,11 +103,11 @@ if [ $on != "0" ]; then > fail "Tracing is not off" > fi > > -line1=`cat trace | tail -1` > +csum1=`md5sum trace` > sleep $SLEEP_TIME > -line2=`cat trace | tail -1` > +csum2=`md5sum trace` If you replace that with "tail -1 0) { ... #if 1 /* This is technically incorrect for *LONG* strings, but very useful */ /* Optimizing count-lines case if the file is seekable. * We assume the lines are <64k. * (Users complain that tail takes too long * on multi-gigabyte files) */ off = (count | 0xf); /* for small counts, be more paranoid */ if (off > (INT_MAX / (64*1024))) off = (INT_MAX / (64*1024)); current -= off * (64*1024); if (current < 0) current = 0; xlseek(fd, current, SEEK_SET); #endif Example: "tail -1 10GB_FILE" seeks back ~1 mbyte and looks for the last line: lseek(3, 0, SEEK_END) = 10991787403 lseek(3, 10990804363, SEEK_SET) = 10990804363 read(3, "\0\0020d\340\326\0\0\0\0020e\6\364\0\0\0\0020eMI\0\0\0\0020el\210\0\0"..., 8192) = 8192 read(3, "\0\0021\"p\321\0\0\0\0021\"\216\31\0\0\0\0021\"\254q\0\0\0\0021\"\367u\0\0"..., 8192) = 8192 read(3, "\0\0021\342\216\302\0\0\0\0021\342\2243\0\0\0\0021\342\266T\0\0\0\0021\342\326\v\0\0"..., 8192) = 8192 ... 
read(3, "\0\0\2\211\3272\211\0\0\0\2\211\3277\235\0\0\0\2\211\327mx\0\0\0\2\211\327\224\22\0"..., 11841) = 11841 read(3, "\0\2\212\357\366\306\0\0\0\2\212\360`\232\0\0\0\2\212\360w\262\0\0\0\2\212\360\354\365\0\0"..., 12512) = 12512 read(3, "\0\2\214uL\3\0\0\0\2\214unc\0\0\0\2\214w_\373\0\0\0\2\214wn\6\0\0"..., 11176) = 8424 read(3, "", 2752) = 0
Re: [PATCH v2] net/sctp/ulpevent.c: Deinline sctp_ulpevent_set_owner, save 1616 bytes
On 06/21/2017 09:24 PM, Marcelo Ricardo Leitner wrote: On Wed, Jun 21, 2017 at 07:03:27PM +0200, Denys Vlasenko wrote: This function compiles to 147 bytes of machine code. 13 callsites. I'm no expert on SCTP events, but quick reading of SCTP docs tells me that SCTP events are not happening on every packet. They are ASSOC_CHANGE, PEER_ADDR_CHANGE, REMOTE_ERROR and such. Does not look performance critical. Signed-off-by: Denys Vlasenko CC: Vlad Yasevich CC: Neil Horman CC: David Miller CC: linux-s...@vger.kernel.org CC: net...@vger.kernel.org CC: linux-kernel@vger.kernel.org Acked-by: Marcelo Ricardo Leitner Just wondering, are you conducting further research on this topic? Because we probably could use such review on SCTP stack. Here is the list of sctp inlines which has any (however small) benefit when deinlined. filename:lineno:inline_name:lines_of_source_code:saved_bytes_by_deinlining:size_of_code_of_deinlined_function include/net/sctp/command.h:228:sctp_add_cmd_sf:7:8306:38 net/sctp/ulpevent.c:91:sctp_ulpevent_set_owner:13:1616:147 include/net/sctp/sctp.h:414:sctp_skb_set_owner_r:10:934:75 net/sctp/input.c:840:sctp_hash_key:13:896:359 net/sctp/input.c:823:sctp_hash_obj:13:704:409 include/net/sctp/checksum.h:60:sctp_compute_cksum:13:595:85 net/sctp/input.c:800:sctp_hash_cmp:18:320:124 net/sctp/socket.c:117:sctp_wspace:19:256:76 include/net/sctp/sctp.h:272:sctp_max_rto:7:204:68 net/sctp/socket.c:173:sctp_verify_addr:15:192:72 include/net/sctp/checksum.h:46:sctp_csum_update:4:147:21 include/net/sctp/sctp.h:519:param_type2af:8:134:35 include/net/sctp/sctp.h:399:sctp_list_dequeue:7:123:59 include/net/sctp/sctp.h:596:sctp_transport_dst_check:4:120:60 include/net/sctp/sctp.h:435:sctp_frag_point:12:65:65 net/sctp/outqueue.c:82:sctp_outq_dequeue_data:10:64:87 include/net/sctp/command.h:243:sctp_next_cmd:4:64:37 include/net/sctp/sm.h:347:sctp_data_size:6:19:19 For .c files, the patches are trivial and I have them auto-generated, I'll send them to you privately to save you 
the mechanical work. Unfortunately, for inlines in .h files this does not work (a human is needed to decide where to move the function). Of course, not every deinlining makes sense.
Can avic_vm_id_bitmap be made smaller? 2 mbytes feels too large
#define AVIC_VM_ID_BITS24 #define AVIC_VM_ID_NR (1 << AVIC_VM_ID_BITS) ... static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR); The above results in a data object which is 2 megabytes large. avic_vm_init() -> avic_get_next_vm_id() allocates a new vm_id by looking for a free bit there. That's the whole purpose for that bitmap existing. Is there a way to do this less wastefully? Say, such as iterating over all existing VMs and picking an id which is not taken. I imagine VM creation operation is not that frequent, no need to make new vm_id selection super fast.
[PATCH] minix: Deinline get_block, save 2691 bytes
This function compiles to 1402 bytes of machine code. It has 2 callsites, and also a not-inlined copy gets created by compiler anyway since its address gets passed as a parameter to block_truncate_page(). Signed-off-by: Denys Vlasenko CC: Al Viro CC: linux-fsde...@vger.kernel.org CC: linux-kernel@vger.kernel.org --- fs/minix/itree_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c index 4c57c9a..2325777 100644 --- a/fs/minix/itree_common.c +++ b/fs/minix/itree_common.c @@ -142,7 +142,7 @@ static inline int splice_branch(struct inode *inode, return -EAGAIN; } -static inline int get_block(struct inode * inode, sector_t block, +static int get_block(struct inode * inode, sector_t block, struct buffer_head *bh, int create) { int err = -EIO; -- 1.8.3.1
[PATCH] kernel/sched/fair.c: Deinline update_load_avg, save 8720 bytes
This function compiles to 1378 bytes of machine code. 8 callsites. Signed-off-by: Denys Vlasenko CC: Peter Zijlstra CC: linux-kernel@vger.kernel.org --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d711093..c4d6603 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3312,7 +3312,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) #define SKIP_AGE_LOAD 0x2 /* Update task and its cfs_rq load average */ -static inline void update_load_avg(struct sched_entity *se, int flags) +static void update_load_avg(struct sched_entity *se, int flags) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); -- 1.8.3.1
[PATCH v2] net/sctp/ulpevent.c: Deinline sctp_ulpevent_set_owner, save 1616 bytes
This function compiles to 147 bytes of machine code. 13 callsites. I'm no expert on SCTP events, but quick reading of SCTP docs tells me that SCTP events are not happening on every packet. They are ASSOC_CHANGE, PEER_ADDR_CHANGE, REMOTE_ERROR and such. Does not look performance critical. Signed-off-by: Denys Vlasenko CC: Vlad Yasevich CC: Neil Horman CC: David Miller CC: linux-s...@vger.kernel.org CC: net...@vger.kernel.org CC: linux-kernel@vger.kernel.org --- Changed since v1: * reindented function argument list net/sctp/ulpevent.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index ec2b3e0..bc3f495 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -88,8 +88,8 @@ int sctp_ulpevent_is_notification(const struct sctp_ulpevent *event) /* Hold the association in case the msg_name needs read out of * the association. */ -static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event, - const struct sctp_association *asoc) +static void sctp_ulpevent_set_owner(struct sctp_ulpevent *event, + const struct sctp_association *asoc) { struct sctp_chunk *chunk = event->chunk; struct sk_buff *skb; -- 1.8.3.1
[PATCH] net/sctp/ulpevent.c: Deinline sctp_ulpevent_set_owner, save 1616 bytes
This function compiles to 147 bytes of machine code. 13 callsites. I'm no expert on SCTP events, but quick reading of SCTP docs tells me that SCTP events are not happening on every packet. They are ASSOC_CHANGE, PEER_ADDR_CHANGE, REMOTE_ERROR and such. Does not look performance critical. Signed-off-by: Denys Vlasenko CC: Vlad Yasevich CC: Neil Horman CC: David Miller CC: linux-s...@vger.kernel.org CC: net...@vger.kernel.org CC: linux-kernel@vger.kernel.org --- net/sctp/ulpevent.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index ec2b3e0..049aa5a 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -88,7 +88,7 @@ int sctp_ulpevent_is_notification(const struct sctp_ulpevent *event) /* Hold the association in case the msg_name needs read out of * the association. */ -static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event, +static void sctp_ulpevent_set_owner(struct sctp_ulpevent *event, const struct sctp_association *asoc) { struct sctp_chunk *chunk = event->chunk; -- 1.8.3.1
[PATCH] liquidio: stop using huge static buffer, save 4096k in .data
Only compile-tested - I don't have the hardware. From code inspection, octeon_pci_write_core_mem() appears to be safe wrt unaligned source. In any case, u8 fbuf[] was not guaranteed to be aligned anyway. Signed-off-by: Denys Vlasenko CC: Felix Manlunas CC: Prasad Kanneganti CC: Derek Chickles CC: David Miller CC: net...@vger.kernel.org CC: linux-kernel@vger.kernel.org --- drivers/net/ethernet/cavium/liquidio/octeon_console.c | 6 +- drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c | 4 ++-- drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.h | 2 +- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_console.c b/drivers/net/ethernet/cavium/liquidio/octeon_console.c index 53f38d0..e08f760 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_console.c +++ b/drivers/net/ethernet/cavium/liquidio/octeon_console.c @@ -724,13 +724,11 @@ static int octeon_console_read(struct octeon_device *oct, u32 console_num, } #define FBUF_SIZE (4 * 1024 * 1024) -u8 fbuf[FBUF_SIZE]; int octeon_download_firmware(struct octeon_device *oct, const u8 *data, size_t size) { int ret = 0; - u8 *p = fbuf; u32 crc32_result; u64 load_addr; u32 image_len; @@ -805,10 +803,8 @@ int octeon_download_firmware(struct octeon_device *oct, const u8 *data, else size = FBUF_SIZE; - memcpy(p, data, size); - /* download the image */ - octeon_pci_write_core_mem(oct, load_addr, p, (u32)size); + octeon_pci_write_core_mem(oct, load_addr, data, (u32)size); data += size; rem -= (u32)size; diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c b/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c index 5cd96e7..4c85ae6 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c +++ b/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c @@ -167,10 +167,10 @@ octeon_pci_read_core_mem(struct octeon_device *oct, void octeon_pci_write_core_mem(struct octeon_device *oct, u64 coreaddr, - u8 *buf, + const u8 *buf, u32 len) { - 
__octeon_pci_rw_core_mem(oct, coreaddr, buf, len, 0); + __octeon_pci_rw_core_mem(oct, coreaddr, (u8 *)buf, len, 0); } u64 octeon_read_device_mem64(struct octeon_device *oct, u64 coreaddr) diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.h b/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.h index bae2fdd..47a3ff5 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.h @@ -66,7 +66,7 @@ octeon_pci_read_core_mem(struct octeon_device *oct, void octeon_pci_write_core_mem(struct octeon_device *oct, u64 coreaddr, - u8 *buf, + const u8 *buf, u32 len); #endif -- 2.9.2
Re: [PATCH v2] x86, msr: Document AMD "tweak MSRs", use MSR_FnnH_NAME scheme for them
On 04/25/2017 06:23 PM, Borislav Petkov wrote: On Tue, Apr 25, 2017 at 06:15:23PM +0200, Denys Vlasenko wrote: On 04/25/2017 06:06 PM, Borislav Petkov wrote: Pls no. Not every MSR for every family. Only the 4 which are actually being used. We can't hold in here the full 32-bit MSR space. The replacement of four define names is not the purpose of the proposed patch. The patch was prompted by the realization that these particular MSRs are so badly and inconsistently documented that it takes many hours of work and requires reading of literally a dozen PDFs to figure out what are their names, which CPUs have them, and what bits are known. They're all documented in the respective BKDGs or revision guides. Yes. For some definition of "documented". Let's say you are looking at all available documentation for Fam10h CPUs - BKDG, Revision Guide, five volumes of APM, software optimization guide. Eight documents. If you read all of them, you can find exactly one mention that MSR 0xC0011029 exists. It is mentioned by number. As a reader of this documentation, can you find out what is it? Does it have a name, at least? You are right that kernel is not exactly the best place to store more info about such things, but AMD probably won't accept my edits to their documentation.
Re: [PATCH v2] x86, msr: Document AMD "tweak MSRs", use MSR_FnnH_NAME scheme for them
On 04/25/2017 06:06 PM, Borislav Petkov wrote: Pls no. Not every MSR for every family. Only the 4 which are actually being used. We can't hold in here the full 32-bit MSR space. The replacement of four define names is not the purpose of the proposed patch. The patch was prompted by the realization that these particular MSRs are so badly and inconsistently documented that it takes many hours of work and requires reading of literally a dozen PDFs to figure out what are their names, which CPUs have them, and what bits are known. Anyone who looks at only one document won't see the full picture. Patch does not document bits, but at least documents all MSR names and explains why documentation is so sparse. If you think it's not useful, so be it.
[PATCH v2] x86, msr: Document AMD "tweak MSRs", use MSR_FnnH_NAME scheme for them
_1022 Data Cache Configuration (DC_CFG) MSRC001_1023 Bus Unit Configuration Register (BU_CFG) MSRC001_102A Bus Unit Configuration 2 (BU_CFG2) 11h_BKDG documents: MSRC001_1022 Data Cache Configuration (DC_CFG) MSRC001_1023 Bus Unit Configuration Register (BU_CFG) 12h_BKDG documents: MSRC001_1020 Load-Store Configuration (LS_CFG) MSRC001_1021 Instruction Cache Configuration (IC_CFG) MSRC001_1022 Data Cache Configuration (DC_CFG) MSRC001_1029 Decode Configuration (DE_CFG) MSRC001_102A Combined Unit Configuration 2 (CU_CFG2) - name change since 10h 14h_Mod_00h-0Fh_BKDG documents only: MSRC001_1020 Load-Store Configuration (LS_CFG) MSRC001_1021 Instruction Cache Configuration (IC_CFG) MSRC001_1022 Data Cache Configuration (DC_CFG) 15h_Mod_00h-0Fh_BKDG documents more: MSRC001_1020 Load-Store Configuration (LS_CFG) MSRC001_1021 Instruction Cache Configuration (IC_CFG) MSRC001_1022 Data Cache Configuration (DC_CFG) MSRC001_1023 Combined Unit Configuration (CU_CFG) - name change since 11h MSRC001_1028 Floating Point Configuration (FP_CFG) MSRC001_1029 Decode Configuration (DE_CFG) MSRC001_102A Combined Unit Configuration 2 (CU_CFG2) MSRC001_102B Combined Unit Configuration 3 (CU_CFG3) MSRC001_102C Execution Unit Configuration (EX_CFG) MSRC001_102D Load-Store Configuration 2 (LS_CFG2) 15h_Mod_10h-1Fh_BKDG: does not mention MSRC001_1029 and MSRC001_102C. 15h_Mod_30h-3Fh_BKDG: does not mention MSRC001_1029, MSRC001_102C and MSRC001_102D, adds new one: MSRC001_102F Prefetch Throttling Configuration (CU_PFTCFG) 15h_Mod_60h-6Fh_BKDG: also fails to mention MSRC001_1029, MSRC001_102C and MSRC001_102D, but has new ones: MSRC001_101C Load-Store Configuration 3 (LS_CFG3) MSRC001_1090 Processor Feedback Constants 0 MSRC001_10A1 Contention Blocking Buffer Control (CU_CBBCFG) MSRC001_1000 is only mentioned in 15h erratas, name unknown. 
16h_Mod_00h-0Fh_BKDG: stuff disappeared or got renamed (1023 and 102A are "Bus Unit Configuration" again): MSRC001_1020 Load-Store Configuration (LS_CFG) MSRC001_1021 Instruction Cache Configuration (IC_CFG) MSRC001_1022 Data Cache Configuration (DC_CFG) MSRC001_1023 Bus Unit Configuration (BU_CFG) MSRC001_1028 Floating Point Configuration (FP_CFG) - all bits are "reserved" MSRC001_102A Bus Unit Configuration 2 (BU_CFG2) 16h_Mod_30h-3Fh_BKDG: FP_CFG now has one documented field CC: Ingo Molnar CC: Andy Lutomirski CC: Borislav Petkov CC: Brian Gerst CC: Peter Zijlstra CC: "H. Peter Anvin" CC: x...@kernel.org CC: linux-kernel@vger.kernel.org Signed-off-by: Denys Vlasenko --- arch/x86/include/asm/msr-index.h | 63 +--- arch/x86/kernel/cpu/amd.c| 10 +++ arch/x86/kvm/svm.c | 4 +-- arch/x86/kvm/x86.c | 4 +-- 4 files changed, 67 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index d8b5f8a..8f89dd3 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -293,9 +293,6 @@ #define MSR_AMD64_PATCH_LOADER 0xc0010020 #define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 #define MSR_AMD64_OSVW_STATUS 0xc0010141 -#define MSR_AMD64_LS_CFG 0xc0011020 -#define MSR_AMD64_DC_CFG 0xc0011022 -#define MSR_AMD64_BU_CFG2 0xc001102a #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD0xc0011031 #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 @@ -315,6 +312,65 @@ #define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX8 /* includes MSR_AMD64_IBSBRTARGET */ +/* + * MSRs in 0xc001101c-0xc001102f range are sparsely documented in BKDGs, + * but sometimes they can be found in errata documents. + * Registers 1020-1023 exist since K8 (mentioned in errata docs). + * Fam10h also has registers 1029, 102a (maybe more, not in docs). + * Fam15h BKDGs document registers 1028, 102b-102f, 101c, 1090, 10a1. 
+ * Registers 1023 and 102a are called "Combined Unit Cfg" or "Bus Unit Cfg", + * depending on the CPU family. + */ +#define MSR_K8_LS_CFG 0xc0011020 +#define MSR_K8_IC_CFG 0xc0011021 +#define MSR_K8_DC_CFG 0xc0011022 +#define MSR_K8_BU_CFG 0xc0011023 + +#define MSR_F10H_LS_CFG0xc0011020 +#define MSR_F10H_IC_CFG0xc0011021 +#define MSR_F10H_DC_CFG0xc0011022 +#define MSR_F10H_BU_CFG0xc0011023 +#define MSR_F10H_DE_CFG0xc0011029 +#define MSR_F10H_BU_CFG2 0xc001102a + +#define MSR_F12H_LS_CFG0xc0011020 +#define MSR_F12H_IC_CFG0xc0011021 +#define MSR_F12H_DC_CFG0xc0011022 +#define MSR_F12H_CU_CFG0xc0011023 +#define MSR_F12H_DE_CFG0xc0011029 +#define MSR
Re: [PATCH] x86, msr: Better document AMD "tweak MSRs", rename MSR_F15H_IC_CFG
On 04/25/2017 02:59 PM, Borislav Petkov wrote: On Tue, Apr 25, 2017 at 02:16:55PM +0200, Denys Vlasenko wrote: However, all IBS registers are in this range. I knew you were gonna say that. But IBS registers are architectural too in the sense that they are behind a CPUID bit. DRi_ADDR_MASK are in this range - and these are very useful, likely to stay. Those are too behind a CPUID bit. In the arch/x86/include/asm/msr-index.h file we already have three "tweak" MSRs defined with "AMD64": #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 #define MSR_AMD64_BU_CFG2 0xc001102a I just noticed that we have a fourth one in arch/x86/kernel/cpu/amd.c: #define MSR_AMD64_DE_CFG 0xC0011029 That's wrong. I think we should call those something else but not "AMD64". Okay. Propose a naming scheme for these which looks good to you. Perhaps the families for which the workaround is being applied. In the last case, MSR_F12H_DE_CFG, for example. And yes, I should've paid attention to that but ... A bit problematic: MSR C001_1020 is used (mentioned in Rev Guides as a possible way to work around an errata) by all Fams starting from K8, except Fam15h. MSR C001_1022 is used by K8, 10h, 15h. Etc...
Re: [PATCH] x86, msr: Better document AMD "tweak MSRs", rename MSR_F15H_IC_CFG
On 04/25/2017 01:59 PM, Borislav Petkov wrote: On Tue, Apr 25, 2017 at 01:45:41PM +0200, Denys Vlasenko wrote: Before this patch, we have a define for MSR 0xc0011021: MSR_F15H_IC_CFG. But it exists starting from K8, so it's not really a Fam15h MSR only. Lat's call it MSR_AMD64_IC_CFG. Except that we name only those MSRs with "AMD64" which are architectural. See "Appendix A MSR Cross-Reference" in APM vol 2. Yes, APM vol 2 has none of c001_1xxx MSRs. However, all IBS registers are in this range. DRi_ADDR_MASK are in this range - and these are very useful, likely to stay. In the arch/x86/include/asm/msr-index.h file we already have three "tweak" MSRs defined with "AMD64": #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 #define MSR_AMD64_BU_CFG2 0xc001102a I just noticed that we have a fourth one in arch/x86/kernel/cpu/amd.c: #define MSR_AMD64_DE_CFG 0xC0011029
[PATCH] x86, msr: Better document AMD "tweak MSRs", rename MSR_F15H_IC_CFG
G2) 11h_BKDG documents: MSRC001_1022 Data Cache Configuration (DC_CFG) MSRC001_1023 Bus Unit Configuration Register (BU_CFG) 12h_BKDG documents: MSRC001_1020 Load-Store Configuration (LS_CFG) MSRC001_1021 Instruction Cache Configuration (IC_CFG) MSRC001_1022 Data Cache Configuration (DC_CFG) MSRC001_1029 Decode Configuration (DE_CFG) MSRC001_102A Combined Unit Configuration 2 (CU_CFG2) - name change since 10h 14h_Mod_00h-0Fh_BKDG documents only: MSRC001_1020 Load-Store Configuration (LS_CFG) MSRC001_1021 Instruction Cache Configuration (IC_CFG) MSRC001_1022 Data Cache Configuration (DC_CFG) 15h_Mod_00h-0Fh_BKDG documents more: MSRC001_1020 Load-Store Configuration (LS_CFG) MSRC001_1021 Instruction Cache Configuration (IC_CFG) MSRC001_1022 Data Cache Configuration (DC_CFG) MSRC001_1023 Combined Unit Configuration (CU_CFG) - name change since 11h MSRC001_1028 Floating Point Configuration (FP_CFG) MSRC001_1029 Decode Configuration (DE_CFG) MSRC001_102A Combined Unit Configuration 2 (CU_CFG2) MSRC001_102B Combined Unit Configuration 3 (CU_CFG3) MSRC001_102C Execution Unit Configuration (EX_CFG) MSRC001_102D Load-Store Configuration 2 (LS_CFG2) 15h_Mod_10h-1Fh_BKDG: does not mention MSRC001_1029 and MSRC001_102C. 
15h_Mod_30h-3Fh_BKDG: does not mention MSRC001_1029, MSRC001_102C and MSRC001_102D, adds new one: MSRC001_102F Prefetch Throttling Configuration (CU_PFTCFG) 15h_Mod_60h-6Fh_BKDG: also fails to mention MSRC001_1029, MSRC001_102C and MSRC001_102D, but has new ones: MSRC001_101C Load-Store Configuration 3 (LS_CFG3) MSRC001_1090 Processor Feedback Constants 0 MSRC001_10A1 Contention Blocking Buffer Control (CU_CBBCFG) 16h_Mod_00h-0Fh_BKDG: stuff disappeared or got renamed (1023 and 102A are "Bus Unit Configuration" again): MSRC001_1020 Load-Store Configuration (LS_CFG) MSRC001_1021 Instruction Cache Configuration (IC_CFG) MSRC001_1022 Data Cache Configuration (DC_CFG) MSRC001_1023 Bus Unit Configuration (BU_CFG) MSRC001_1028 Floating Point Configuration (FP_CFG) - all bits are "reserved" MSRC001_102A Bus Unit Configuration 2 (BU_CFG2) 16h_Mod_30h-3Fh_BKDG: FP_CFG now has one documented field CC: Ingo Molnar CC: Andy Lutomirski CC: Borislav Petkov CC: Brian Gerst CC: Peter Zijlstra CC: "H. Peter Anvin" CC: x...@kernel.org CC: linux-kernel@vger.kernel.org Signed-off-by: Denys Vlasenko --- arch/x86/include/asm/msr-index.h | 20 +++- arch/x86/kernel/cpu/amd.c| 4 ++-- arch/x86/kvm/svm.c | 2 +- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index d8b5f8a..4195681 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -293,9 +293,28 @@ #define MSR_AMD64_PATCH_LOADER 0xc0010020 #define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 #define MSR_AMD64_OSVW_STATUS 0xc0010141 +/* + * MSRs in 0xc001101c-0xc001102f range are sparsely documented in BKDGs, + * but sometimes they can be found in errata documents. + * Registers 1020..1023 exist since K8. Fam10h and later have regs 1028..102d. + * Fam15h BKDGs document registers 102f, 101c, 1090, 10a1. + * Registers 1023 and 102a are called "Combined Unit Cfg" in Fam12h-15h, + * but "Bus Unit Cfg" in K8, Fam10h, 11h and 16h. 
Other "CU_" registers + * might also be "BU_" (did not find them in docs for these families). + */ +#define MSR_AMD64_LS_CFG3 0xc001101c #define MSR_AMD64_LS_CFG 0xc0011020 +#define MSR_AMD64_IC_CFG 0xc0011021 #define MSR_AMD64_DC_CFG 0xc0011022 +#define MSR_AMD64_BU_CFG 0xc0011023 +#define MSR_AMD64_FP_CFG 0xc0011028 +#define MSR_AMD64_DE_CFG 0xc0011029 #define MSR_AMD64_BU_CFG2 0xc001102a +#define MSR_AMD64_CU_CFG3 0xc001102b +#define MSR_AMD64_EX_CFG 0xc001102c +#define MSR_AMD64_LS_CFG2 0xc001102d +#define MSR_AMD64_CU_PFTCFG0xc001102f + #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD0xc0011031 #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 @@ -332,7 +351,6 @@ #define MSR_F15H_NB_PERF_CTL 0xc0010240 #define MSR_F15H_NB_PERF_CTR 0xc0010241 #define MSR_F15H_PTSC 0xc0010280 -#define MSR_F15H_IC_CFG0xc0011021 /* Fam 10h MSRs */ #define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058 diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index c36140d..7d4a5bf 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -720,9 +720,9 @@ static void init_amd_bd(struct cpuinfo_x86 *c) * Disable it on the affected CPUs. */ if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) { - if (!rdmsrl_safe(MSR_F15H_IC_CFG, &value) && !(value
Re: [tip:x86/mm] x86/asm: Remove __VIRTUAL_MASK_SHIFT==47 assert
On 04/05/2017 01:12 PM, Kirill A. Shutemov wrote: On Tue, Apr 04, 2017 at 05:36:33PM +0200, Denys Vlasenko wrote: diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 044d18e..f07b4ef 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -265,12 +265,9 @@ return_from_SYSCALL_64: * * If width of "canonical tail" ever becomes variable, this will need * to be updated to remain correct on both old and new CPUs. +* +* Change top 16 bits to be the sign-extension of 47th bit The comment above stops being correct: it's not necessary 16 top bits we sign-extend now. With larger __VIRTUAL_MASK_SHIFT for 5-level translation, it will become 7 bits (if I do the math right). Does the patch below look okay to you? */ - .ifne __VIRTUAL_MASK_SHIFT - 47 - .error "virtual address width changed -- SYSRET checks need update" - .endif - - /* Change top 16 bits to be the sign-extension of 47th bit */ shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx The bigger problem here would be the future boot-time choice of 4/5-level page tables: __VIRTUAL_MASK_SHIFT will need to depend on that choice, but in this location it is preferable to not use any variables (memory references). Yeah. Will see what I will be able to come up with. Not sure yet. ---8<-- From 2433cf4f8847bbc41cc2b02d6af4f191b3b5a0c5 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 5 Apr 2017 14:06:15 +0300 Subject: [PATCH] x86/asm: Fix comment in return_from_SYSCALL_64 On x86-64 __VIRTUAL_MASK_SHIFT depends on paging mode now. Signed-off-by: Kirill A. 
Shutemov --- arch/x86/entry/entry_64.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 607d72c4a485..c70e064d9592 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -266,7 +266,8 @@ return_from_SYSCALL_64: * If width of "canonical tail" ever becomes variable, this will need * to be updated to remain correct on both old and new CPUs. * -* Change top 16 bits to be the sign-extension of 47th bit +* Change top bits to match most significant valuable bit (47 or 56 +* depending on paging mode) in the address. Er "Change top bits ... ((47 or 56 [bits] depending on paging mode)"? I know that's wrong and that's not what you meant to say, but it can be read this way too. "47th" instead of "47" would eliminate this reading, but you removed "th". Spell it out to eliminate any chance of confusion: Change top bits to match most significant bit (47th or 56th bit depending on paging mode) in the address. */ shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
Re: [tip:x86/mm] x86/asm: Remove __VIRTUAL_MASK_SHIFT==47 assert
On 04/04/2017 10:29 AM, tip-bot for Kirill A. Shutemov wrote: Commit-ID: 361b4b58ec4cf123e12a773909c6454dbd5e6dbc Gitweb: http://git.kernel.org/tip/361b4b58ec4cf123e12a773909c6454dbd5e6dbc Author: Kirill A. Shutemov AuthorDate: Thu, 30 Mar 2017 11:07:26 +0300 Committer: Ingo Molnar CommitDate: Tue, 4 Apr 2017 08:22:33 +0200 x86/asm: Remove __VIRTUAL_MASK_SHIFT==47 assert We don't need the assert anymore, as: 17be0aec74fb ("x86/asm/entry/64: Implement better check for canonical addresses") made canonical address checks generic wrt. address width. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-a...@vger.kernel.org Cc: linux...@kvack.org Link: http://lkml.kernel.org/r/20170330080731.65421-3-kirill.shute...@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 7 ++- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 044d18e..f07b4ef 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -265,12 +265,9 @@ return_from_SYSCALL_64: * * If width of "canonical tail" ever becomes variable, this will need * to be updated to remain correct on both old and new CPUs. +* +* Change top 16 bits to be the sign-extension of 47th bit The comment above stops being correct: it's not necessary 16 top bits we sign-extend now. With larger __VIRTUAL_MASK_SHIFT for 5-level translation, it will become 7 bits (if I do the math right). 
*/ - .ifne __VIRTUAL_MASK_SHIFT - 47 - .error "virtual address width changed -- SYSRET checks need update" - .endif - - /* Change top 16 bits to be the sign-extension of 47th bit */ shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx The bigger problem here would be the future boot-time choice of 4/5-level page tables: __VIRTUAL_MASK_SHIFT will need to depend on that choice, but in this location it is preferable to not use any variables (memory references). -- vda
Re: random: /dev/random often returns short reads
On Wed, Jan 18, 2017 at 7:07 PM, Theodore Ts'o wrote: > On Wed, Jan 18, 2017 at 04:44:54PM +0100, Denys Vlasenko wrote: >> In this case, what code does is it returns fewer bytes, >> even though *it has enough random bytes to return the full request*. >> >> This happens because the patch which added more conservative >> accounting, while containing technically correct accounting per se, >> it forgot to take in the account another part of the code >> which was relying on the previous, simplistic logic >> "if we add N random bytes to a pool, now it has +N random bytes". >> >> In my opinion, it should have changed that part of code simultaneously >> with introducing more conservative accounting. > > In the ideal world, yes. I've acknowledged this is a bug, in the "be > conservative in what you send, liberal in what you receive" sense.. > But no one complained for three year, and userspace needs to be able > to retry short reads instead of immediately erroring out. > > The problem is changing that code to figure out exactly how many bytes > you need to get in order to have N random bytes is non-trivial. So > our choices are: > > 1) Transfer more bytes than might be needed to the secondary pool ... > 2) Transfer bytes without using the conservative entropy "overwrite" > calculation if the blocking pool is mostly empty. This means we will > be over-estimating the entropy in that pool, which is unfortunate. > One could argue that all of the cases where people have been whining > about this, they are using HAVEGED which is providing pretend entropy > based on the presumed unpredictability of Intel cahce timing, so > careful entropy calculations is kind of silly anyway. However, there > might be some people who really are trying to do carefule entropy > measurement, so compromising this isn't really a great idea.> > I'm leaning a bit towards 1 if we have to do something (which is my > proposed, untested patch). 
I spent quite some time looking at both your patch which implements #1 and at the commit 30e37ec516ae5a6957596de7661673c615c82ea4 which introduced "conservative accounting" code, and the same thought returns to me: this complexity and problems are self-inflicted by commit 30e37ec516ae5a6957596de7661673c615c82ea4. The code is already conservative elsewhere, adding more conservative code - and more complex code, especially considering that it should be even more complex than it is, since it should be further fixed up in "xfer_secondary_pool(r, nbytes)" location - it does not look like a worthy improvement to me. I propose to simply revert 30e37ec516ae5a6957596de7661673c615c82ea4. If you worry about this accounting more than I do, how about dealing with it in a simpler way, a-la - xfer_secondary_pool(r, nbytes); + xfer_secondary_pool(r, nbytes * 5/4); /* be a bit paranoid */
Re: [PATCH] x86/crypto: make constants readonly, allow linker to merge them
On 01/20/2017 12:09 AM, Thomas Gleixner wrote: On Thu, 19 Jan 2017, Denys Vlasenko wrote: A lot of asm-optimized routines in arch/x86/crypto/ keep their constants in .data. This is wrong, they should be in .rodata. Many of these constants are the same in different modules. For example, 128-bit shuffle mask 0x000102030405060708090A0B0C0D0E0F exists in at least half a dozen places. There is a way to let linker merge them and use just one copy. The rules are as follows: mergeable objects of different sizes should not share sections. You can't put them all in one .rodata section, they will lose "mergeability". GCC puts its mergeable constants in ".rodata.cstSIZE" sections, or ".rodata.cstSIZE." if -fdata-sections is used. This patch does the same: .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 It is important that all data in such section consists of 16-byte elements, not larger ones, and there are no implicit use of one element from another. When this is not the case, use non-mergeable section: .section .rodata[.VAR_NAME], "a", @progbits This reduces .data by ~15 kbytes: textdata bss dec hex filename 11097415 2705840 2630712 16433967 fac32f vmlinux-prev.o 2095 2690672 2630712 16433479 fac147 vmlinux.o And at the same time it increases text size by ~15k. The overall change in total size is 488 byte reduction. Weird. Of course: the constants do need to go somewhere. They migrate to .rodata.blabla sections, which eventually got appended to read-only .text Without merging, if I would just move constants to .rodata, there would be no net size win at all. I did not look deeper into why the overall change is smaller than expected. It may be affected by changed padding. The linker is not too clever about it, and also IIRC we aren't actually giving it the best options to sort sections during link time to pack them better.
Re: random: /dev/random often returns short reads
On Wed, Jan 18, 2017 at 7:07 PM, Theodore Ts'o wrote: > In the ideal world, yes. I've acknowledged this is a bug, in the "be > conservative in what you send, liberal in what you receive" sense.. > But no one complained for three year, and userspace needs to be able > to retry short reads instead of immediately erroring out. > > The problem is changing that code to figure out exactly how many bytes > you need to get in order to have N random bytes is non-trivial. So > our choices are: > > 1) Transfer more bytes than might be needed to the secondary pool, > which results in resource stranding --- since entropy in the secondary > pool isn't available for reseeding the CRNG. OTOH, given that we're > now using the CRNG solution, and we're only reseeding every five > minutes, I'm not actually all that worried about stranding some extra > entropy bits in the blocking pool, since that's only going to happen > if we have people *using* the /dev/random pool, and so that entropy > will likely be used eventually anyway ... ... > I'm leaning a bit towards 1 if we have to do something (which is my > proposed, untested patch). Thanks, this solution is okay for me.
[PATCH] x86/crypto: make constants readonly, allow linker to merge them
A lot of asm-optimized routines in arch/x86/crypto/ keep its constants in .data. This is wrong, they should be on .rodata. Mnay of these constants are the same in different modules. For example, 128-bit shuffle mask 0x000102030405060708090A0B0C0D0E0F exists in at least half a dozen places. There is a way to let linker merge them and use just one copy. The rules are as follows: mergeable objects of different sizes should not share sections. You can't put them all in one .rodata section, they will lose "mergeability". GCC puts its mergeable constants in ".rodata.cstSIZE" sections, or ".rodata.cstSIZE." if -fdata-sections is used. This patch does the same: .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 It is important that all data in such section consists of 16-byte elements, not larger ones, and there are no implicit use of one element from another. When this is not the case, use non-mergeable section: .section .rodata[.VAR_NAME], "a", @progbits This reduces .data by ~15 kbytes: textdata bss dec hex filename 11097415 2705840 2630712 16433967 fac32f vmlinux-prev.o 2095 2690672 2630712 16433479 fac147 vmlinux.o Merged objects are visible in System.map: 81a28810 r POLY 81a28810 r POLY 81a28820 r TWOONE 81a28820 r TWOONE 81a28830 r PSHUFFLE_BYTE_FLIP_MASK <- merged regardless of 81a28830 r SHUF_MASK <- the name difference 81a28830 r SHUF_MASK 81a28830 r SHUF_MASK .. 81a28d00 r K512 <- merged three identical 640-byte tables 81a28d00 r K512 81a28d00 r K512 Use of object names in section name suffixes is not strictly necessary, but might help if someday link stage will use garbage collection to eliminate unused sections (ld --gc-sections). 
Signed-off-by: Denys Vlasenko CC: Herbert Xu CC: Josh Poimboeuf CC: Xiaodong Liu CC: Megha Dey CC: linux-cry...@vger.kernel.org CC: x...@kernel.org CC: linux-kernel@vger.kernel.org --- arch/x86/crypto/aesni-intel_asm.S | 37 +- arch/x86/crypto/aesni-intel_avx-x86_64.S | 32 ++- arch/x86/crypto/camellia-aesni-avx-asm_64.S| 5 ++- arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 12 +-- arch/x86/crypto/cast5-avx-x86_64-asm_64.S | 14 ++-- arch/x86/crypto/cast6-avx-x86_64-asm_64.S | 12 +-- arch/x86/crypto/chacha20-avx2-x86_64.S | 9 -- arch/x86/crypto/chacha20-ssse3-x86_64.S| 7 ++-- arch/x86/crypto/crct10dif-pcl-asm_64.S | 14 ++-- arch/x86/crypto/des3_ede-asm_64.S | 2 +- arch/x86/crypto/ghash-clmulni-intel_asm.S | 3 +- arch/x86/crypto/poly1305-avx2-x86_64.S | 6 ++-- arch/x86/crypto/poly1305-sse2-x86_64.S | 6 ++-- arch/x86/crypto/serpent-avx-x86_64-asm_64.S| 5 +-- arch/x86/crypto/serpent-avx2-asm_64.S | 9 -- arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S | 6 ++-- arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S | 3 +- arch/x86/crypto/sha1-mb/sha1_x8_avx2.S | 15 +++-- arch/x86/crypto/sha1_ni_asm.S | 8 +++-- arch/x86/crypto/sha256-avx-asm.S | 9 +- arch/x86/crypto/sha256-avx2-asm.S | 9 +- .../crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S| 6 ++-- .../crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S | 3 +- arch/x86/crypto/sha256-mb/sha256_x8_avx2.S | 7 +++- arch/x86/crypto/sha256-ssse3-asm.S | 8 - arch/x86/crypto/sha256_ni_asm.S| 4 ++- arch/x86/crypto/sha512-avx-asm.S | 9 -- arch/x86/crypto/sha512-avx2-asm.S | 10 -- .../crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S| 10 -- .../crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S | 4 ++- arch/x86/crypto/sha512-mb/sha512_x4_avx2.S | 4 ++- arch/x86/crypto/sha512-ssse3-asm.S | 9 -- arch/x86/crypto/twofish-avx-x86_64-asm_64.S| 6 ++-- 33 files changed, 229 insertions(+), 74 deletions(-) diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 383a6f8..3c46518 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ 
b/arch/x86/crypto/aesni-intel_asm.S @@ -46,28 +46,49 @@ #ifdef __x86_64__ -.data +# constants in mergeable sections, linker can reorder and merge +.section .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16 .align 16 .Lgf128mul_x_ble_mask: .octa 0x00010087 +.section .rodata.cst16.POLY, "aM", @progbits, 16 +.align 16 POLY: .octa 0xC201 +.section .rodata.cst16.TWOONE, "aM", @progbits, 16 +.align 16 TWOONE: .octa 0x00010001 -# order of these constan
[PATCH] x86/crypto: fix %progbits -> @progbits
%progbits form is used on ARM (where @ is a comment char). x86 consistently uses @progbits everywhere else. Signed-off-by: Denys Vlasenko CC: Herbert Xu CC: Josh Poimboeuf CC: Xiaodong Liu CC: Megha Dey CC: George Spelvin CC: linux-cry...@vger.kernel.org CC: x...@kernel.org CC: linux-kernel@vger.kernel.org --- arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index dc05f01..7a7de27 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -312,7 +312,7 @@ do_return: ret ENDPROC(crc_pcl) -.section .rodata, "a", %progbits +.section .rodata, "a", @progbits ## jump tableTable is 129 entries x 2 bytes each -- 2.9.2
Re: random: /dev/random often returns short reads
On 01/17/2017 11:29 PM, H. Peter Anvin wrote: On 01/17/17 09:34, Denys Vlasenko wrote: On 01/17/2017 06:15 PM, Theodore Ts'o wrote: On Tue, Jan 17, 2017 at 09:21:31AM +0100, Denys Vlasenko wrote: If someone wants to send me a patch, I'll happily take a look at it, Will something along these lines be accepted? The problem is that this won't work. In the cases that we're talking about, the entropy counter in the secondary pool is not zero, but close to zero, we'll still have short reads. And that's going to happen a fair amount of the time. Perhaps the best *hacky* solution would be to say, ok if the entropy count is less than some threshold, don't use the correct entropy calculation, but rather assume that all of the new bits won't land on top of existing entropy bits. IOW, something like this: --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -653,6 +653,9 @@ static void credit_entropy_bits(struct entropy_store *r, int nbits) if (nfrac < 0) { /* Debit */ entropy_count += nfrac; + } else if (entropy_count < ((8 * 8) << ENTROPY_SHIFT)) { + /* Credit, and the pool is almost empty */ + entropy_count += nfrac; } else { /* * Credit: we have to account for the possibility of * overwriting already present entropy. Even in the Want the patch? If yes, what name of the constant you prefer? How about This seems very wrong. The whole point is that we keep it conservative -- always less than or equal to the correct number. In this case, what code does is it returns fewer bytes, even though *it has enough random bytes to return the full request*. This happens because the patch which added more conservative accounting, while containing technically correct accounting per se, it forgot to take in the account another part of the code which was relying on the previous, simplistic logic "if we add N random bytes to a pool, now it has +N random bytes". In my opinion, it should have changed that part of code simultaneously with introducing more conservative accounting.
Re: random: /dev/random often returns short reads
On 01/17/2017 06:15 PM, Theodore Ts'o wrote: On Tue, Jan 17, 2017 at 09:21:31AM +0100, Denys Vlasenko wrote: If someone wants to send me a patch, I'll happily take a look at it, Will something along these lines be accepted? The problem is that this won't work. In the cases that we're talking about, the entropy counter in the secondary pool is not zero, but close to zero, we'll still have short reads. And that's going to happen a fair amount of the time. Perhaps the best *hacky* solution would be to say, ok if the entropy count is less than some threshold, don't use the correct entropy calculation, but rather assume that all of the new bits won't land on top of existing entropy bits. IOW, something like this: --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -653,6 +653,9 @@ static void credit_entropy_bits(struct entropy_store *r, int nbits) if (nfrac < 0) { /* Debit */ entropy_count += nfrac; + } else if (entropy_count < ((8 * 8) << ENTROPY_SHIFT)) { + /* Credit, and the pool is almost empty */ + entropy_count += nfrac; } else { /* * Credit: we have to account for the possibility of * overwriting already present entropy. Even in the Want the patch? If yes, what name of the constant you prefer? How about /* Has less than 8 bytes */ #define ALMOST_EMPTY_POOL_frac ((8 * 8) << ENTROPY_SHIFT)
Re: random: /dev/random often returns short reads
On Tue, Jan 17, 2017 at 5:36 AM, Theodore Ts'o wrote: > On Mon, Jan 16, 2017 at 07:50:55PM +0100, Denys Vlasenko wrote: >> >> /dev/random can legitimately returns short reads >> when there is not enough entropy for the full request. > > Yes, but callers of /dev/random should be able to handle short reads. > So it's a bug in the application as well. I absolutely agree, whoever stumbled over it has a bug in their app. >> The code looks like it effectively credits the pool only for ~3/4 >> of the amount, i.e. 24 bytes, not 32. > > How much it credits the pools varies depending on how many bits of > entropy are being transferred and how full the pool happens to be > beforehand. I think the problem is that even if the target pool has no entropy at all, current algorithm thinks that transferring N random bytes to it gives it N*3/4 bytes of randomness. > Reversing the calculation so that we transfer exactly the > right number of bits is tricky, and if we transfer too many bits, we > risk "wasting" entropy bits. Of course, it doesn't matter if we're > transfering pretend entropy only for the purposes of getting FIPS > certification, but getting it Right(tm) is non-trivial. > > If someone wants to send me a patch, I'll happily take a look at it, Will something along these lines be accepted? --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -653,6 +653,9 @@ static void credit_entropy_bits(struct entropy_store *r, int nbits) if (nfrac < 0) { /* Debit */ entropy_count += nfrac; + } else if (entropy_count == 0) { + /* Credit, and the pool is empty */ + entropy_count += nfrac; } else { /* * Credit: we have to account for the possibility of * overwriting already present entropy. Even in the > but given that fixing userspace is something you really should do > anyway I agree. It's just not in my (or my company's, IIUC) userspace code. I wouldn't even know about this thing since *my* programs do handle short reads correctly.
Re: random: /dev/random often returns short reads
> # while ./eat_dev_random; do ./eat_dev_random; done; ./eat_dev_random > read of 32 returns 32 > read of 32 returns 32 > read of 32 returns 28 > read of 32 returns 24 > > Just the first two requests worked, and then ouch... > > I think this is what happens here: > we transfer 4 bytes of entropy to /dev/random pool: erm... 32 bytes, of course.
random: /dev/random often returns short reads
Hi, /dev/random can legitimately returns short reads when there is not enough entropy for the full request. However, now it does so far too often, and it appears to be a bug: #include #include #include #include #include #include int main(int argc, char **argv) { int fd, ret, len; char buf[16 * 1024]; len = argv[1] ? atoi(argv[1]) : 32; fd = open("/dev/random", O_RDONLY); ret = read(fd, buf, len); printf("read of %d returns %d\n", len, ret); if (ret != len) return 1; return 0; } # gcc -Os -Wall eat_dev_random.c -o eat_dev_random # while ./eat_dev_random; do ./eat_dev_random; done; ./eat_dev_random read of 32 returns 32 read of 32 returns 32 read of 32 returns 28 read of 32 returns 24 Just two few first requests worked, and then ouch... I think this is what happens here: we transfer 4 bytes of entrophy to /dev/random pool: _xfer_secondary_pool(struct entropy_store *r, size_t nbytes) int bytes = nbytes; /* pull at least as much as a wakeup */ bytes = max_t(int, bytes, random_read_wakeup_bits / 8); /* but never more than the buffer size */ bytes = min_t(int, bytes, sizeof(tmp)); bytes = extract_entropy(r->pull, tmp, bytes, random_read_wakeup_bits / 8, rsvd_bytes); mix_pool_bytes(r, tmp, bytes); credit_entropy_bits(r, bytes*8); but when we enter credit_entropy_bits(), there is a defensive code which slightly underestimates the amount of entropy! It was added by this commit: commit 30e37ec516ae5a6957596de7661673c615c82ea4 Author: H. Peter Anvin Date: Tue Sep 10 23:16:17 2013 -0400 random: account for entropy loss due to overwrites When we write entropy into a non-empty pool, we currently don't account at all for the fact that we will probabilistically overwrite some of the entropy in that pool. This means that unless the pool is fully empty, we are currently *guaranteed* to overestimate the amount of entropy in the pool! The code looks like it effectively credits the pool only for ~3/4 of the amount, i.e. 24 bytes, not 32. 
If /dev/random pool was empty or nearly so, this further results in a short read. This is wrong because _xfer_secondary_pool() could well have had lots and lots of entropy to supply, it just did not give enough.
[PATCH v8] powerpc: Do not make the entire heap executable
On 32-bit powerpc the ELF PLT sections of binaries (built with --bss-plt, or with a toolchain which defaults to it) look like this: [17] .sbss NOBITS 0002aff8 01aff8 14 00 WA 0 0 4 [18] .plt NOBITS 0002b00c 01aff8 84 00 WAX 0 0 4 [19] .bss NOBITS 0002b090 01aff8 a4 00 WA 0 0 4 Which results in an ELF load header: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align LOAD 0x019c70 0x00029c70 0x00029c70 0x01388 0x014c4 RWE 0x1 This is all correct, the load region containing the PLT is marked as executable. Note that the PLT starts at 0002b00c but the file mapping ends at 0002aff8, so the PLT falls in the 0 fill section described by the load header, and after a page boundary. Unfortunately the generic ELF loader ignores the X bit in the load headers when it creates the 0 filled non-file backed mappings. It assumes all of these mappings are RW BSS sections, which is not the case for PPC. gcc/ld has an option (--secure-plt) to not do this, this is said to incur a small performance penalty. Currently, to support 32-bit binaries with PLT in BSS kernel maps *entire brk area* with executable rights for all binaries, even --secure-plt ones. Stop doing that. Teach the ELF loader to check the X bit in the relevant load header and create 0 filled anonymous mappings that are executable if the load header requests that. 
Test program showing the difference in /proc/$PID/maps: int main() { char buf[16*1024]; char *p = malloc(123); /* make "[heap]" mapping appear */ int fd = open("/proc/self/maps", O_RDONLY); int len = read(fd, buf, sizeof(buf)); write(1, buf, len); printf("%p\n", p); return 0; } Compiled using: gcc -mbss-plt -m32 -Os test.c -otest Unpatched ppc64 kernel: 0010-0012 r-xp 00:00 0 [vdso] 0fe1-0ffd r-xp fd:00 67898094 /usr/lib/libc-2.17.so 0ffd-0ffe r--p 001b fd:00 67898094 /usr/lib/libc-2.17.so 0ffe-0fff rw-p 001c fd:00 67898094 /usr/lib/libc-2.17.so 1000-1001 r-xp fd:00 100674505 /home/user/test 1001-1002 r--p fd:00 100674505 /home/user/test 1002-1003 rw-p 0001 fd:00 100674505 /home/user/test 1069-106c rwxp 00:00 0 [heap] f7f7-f7fa r-xp fd:00 67898089 /usr/lib/ld-2.17.so f7fa-f7fb r--p 0002 fd:00 67898089 /usr/lib/ld-2.17.so f7fb-f7fc rw-p 0003 fd:00 67898089 /usr/lib/ld-2.17.so ffa9-ffac rw-p 00:00 0 [stack] 0x10690008 Patched ppc64 kernel: 0010-0012 r-xp 00:00 0 [vdso] 0fe1-0ffd r-xp fd:00 67898094 /usr/lib/libc-2.17.so 0ffd-0ffe r--p 001b fd:00 67898094 /usr/lib/libc-2.17.so 0ffe-0fff rw-p 001c fd:00 67898094 /usr/lib/libc-2.17.so 1000-1001 r-xp fd:00 100674505 /home/user/test 1001-1002 r--p fd:00 100674505 /home/user/test 1002-1003 rw-p 0001 fd:00 100674505 /home/user/test 1018-101b rw-p 00:00 0 [heap] this has changed f7c6-f7c9 r-xp fd:00 67898089 /usr/lib/ld-2.17.so f7c9-f7ca r--p 0002 fd:00 67898089 /usr/lib/ld-2.17.so f7ca-f7cb rw-p 0003 fd:00 67898089 /usr/lib/ld-2.17.so ff86-ff89 rw-p 00:00 0 [stack] 0x10180008 The patch was originally posted in 2012 by Jason Gunthorpe and apparently ignored: https://lkml.org/lkml/2012/9/30/138 Lightly run-tested. 
Signed-off-by: Jason Gunthorpe Signed-off-by: Denys Vlasenko Acked-by: Kees Cook Acked-by: Michael Ellerman Tested-by: Jason Gunthorpe CC: Andrew Morton CC: Benjamin Herrenschmidt CC: Paul Mackerras CC: "Aneesh Kumar K.V" CC: Kees Cook CC: Oleg Nesterov CC: Michael Ellerman CC: Florian Weimer CC: linux...@kvack.org CC: linuxppc-...@lists.ozlabs.org CC: linux-kernel@vger.kernel.org --- Changes since v7: * added /proc/$PID/maps example in the commit message Changes since v6: * rebased to current Linus tree * sending to akpm Changes since v5: * made do_brk_flags() error out if any bits other than VM_EXEC are set. (Kees Cook: "With this, I'd be happy
[PATCH v7] powerpc: Do not make the entire heap executable
On 32-bit powerpc the ELF PLT sections of binaries (built with --bss-plt, or with a toolchain which defaults to it) look like this: [17] .sbss NOBITS 0002aff8 01aff8 14 00 WA 0 0 4 [18] .plt NOBITS 0002b00c 01aff8 84 00 WAX 0 0 4 [19] .bss NOBITS 0002b090 01aff8 a4 00 WA 0 0 4 Which results in an ELF load header: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align LOAD 0x019c70 0x00029c70 0x00029c70 0x01388 0x014c4 RWE 0x1 This is all correct, the load region containing the PLT is marked as executable. Note that the PLT starts at 0002b00c but the file mapping ends at 0002aff8, so the PLT falls in the 0 fill section described by the load header, and after a page boundary. Unfortunately the generic ELF loader ignores the X bit in the load headers when it creates the 0 filled non-file backed mappings. It assumes all of these mappings are RW BSS sections, which is not the case for PPC. gcc/ld has an option (--secure-plt) to not do this, this is said to incur a small performance penalty. Currently, to support 32-bit binaries with PLT in BSS kernel maps *entire brk area* with executable rights for all binaries, even --secure-plt ones. Stop doing that. Teach the ELF loader to check the X bit in the relevant load header and create 0 filled anonymous mappings that are executable if the load header requests that. The patch was originally posted in 2012 by Jason Gunthorpe and apparently ignored: https://lkml.org/lkml/2012/9/30/138 Lightly run-tested. 
Signed-off-by: Jason Gunthorpe Signed-off-by: Denys Vlasenko Acked-by: Kees Cook Acked-by: Michael Ellerman Tested-by: Jason Gunthorpe CC: Andrew Morton CC: Benjamin Herrenschmidt CC: Paul Mackerras CC: "Aneesh Kumar K.V" CC: Kees Cook CC: Oleg Nesterov CC: Michael Ellerman CC: Florian Weimer CC: linux...@kvack.org CC: linuxppc-...@lists.ozlabs.org CC: linux-kernel@vger.kernel.org --- Changes since v6: * rebased to current Linus tree * sending to akpm Changes since v5: * made do_brk_flags() error out if any bits other than VM_EXEC are set. (Kees Cook: "With this, I'd be happy to Ack.") See https://patchwork.ozlabs.org/patch/661595/ Changes since v4: * if (current->personality & READ_IMPLIES_EXEC), still use VM_EXEC for 32-bit executables. Changes since v3: * typo fix in commit message * rebased to current Linus tree Changes since v2: * moved capability to map with VM_EXEC into vm_brk_flags() Changes since v1: * wrapped lines to not exceed 79 chars * improved comment * expanded CC list arch/powerpc/include/asm/page.h | 4 +++- fs/binfmt_elf.c | 30 ++ include/linux/mm.h | 1 + mm/mmap.c | 24 +++- 4 files changed, 45 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 56398e7..17d3d2c 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -230,7 +230,9 @@ extern long long virt_phys_offset; * and needs to be executable. This means the whole heap ends * up being executable. */ -#define VM_DATA_DEFAULT_FLAGS32(VM_READ | VM_WRITE | VM_EXEC | \ +#define VM_DATA_DEFAULT_FLAGS32 \ + (((current->personality & READ_IMPLIES_EXEC) ? 
VM_EXEC : 0) | \ +VM_READ | VM_WRITE | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #define VM_DATA_DEFAULT_FLAGS64(VM_READ | VM_WRITE | \ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 2472af2..065134b 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -91,12 +91,18 @@ static struct linux_binfmt elf_format = { #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) -static int set_brk(unsigned long start, unsigned long end) +static int set_brk(unsigned long start, unsigned long end, int prot) { start = ELF_PAGEALIGN(start); end = ELF_PAGEALIGN(end); if (end > start) { - int error = vm_brk(start, end - start); + /* +* Map the last of the bss segment. +* If the header is requesting these pages to be +* executable, honour that (ppc32 needs this). +*/ + int error = vm_brk_flags(start, end - start, + prot & PROT_EXEC ? VM_EXEC : 0); if (error) return error; } @@ -524,6 +530,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, unsigned long load_addr = 0; int load_addr_set = 0; unsigned long last_bss = 0, elf_bss = 0; + int bss_prot = 0; unsigned long error = ~0UL; unsigned long total_size; int i; @@ -606,8 +613,10 @@ static unsigned l
[PATCH] scsi: aic7xxx: fix ahc_delay and ahd_delay
They are buggy: while (usec > 0) udelay(usec % 1024); usec -= 1024; For example, for usec = 100*1024 + 1, old code will udelay(1) 101 times, i.e. it will be approximately equivalent to udelay(101), not the expected udelay(102400). This did not bite because all callers use values far from "pathological" ones, such as 500 and 1000 - these work fine with buggy code. This was reported in 2006 but was missed. Signed-off-by: Denys Vlasenko CC: James Bottomley CC: Hannes Reinicke CC: linux-s...@vger.kernel.org CC: linux-kernel@vger.kernel.org --- drivers/scsi/aic7xxx/aic79xx_osm.c | 7 --- drivers/scsi/aic7xxx/aic7xxx_osm.c | 7 --- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.c b/drivers/scsi/aic7xxx/aic79xx_osm.c index 2588b8f..e7a7838 100644 --- a/drivers/scsi/aic7xxx/aic79xx_osm.c +++ b/drivers/scsi/aic7xxx/aic79xx_osm.c @@ -380,9 +380,10 @@ ahd_delay(long usec) * multi-millisecond waits. Wait at most * 1024us per call. */ - while (usec > 0) { - udelay(usec % 1024); - usec -= 1024; + udelay(usec & 1023); + usec >>= 10; + while (--usec >= 0) { + udelay(1024); } } diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm.c b/drivers/scsi/aic7xxx/aic7xxx_osm.c index fc6a831..c81798e 100644 --- a/drivers/scsi/aic7xxx/aic7xxx_osm.c +++ b/drivers/scsi/aic7xxx/aic7xxx_osm.c @@ -388,9 +388,10 @@ ahc_delay(long usec) * multi-millisecond waits. Wait at most * 1024us per call. */ - while (usec > 0) { - udelay(usec % 1024); - usec -= 1024; + udelay(usec & 1023); + usec >>= 10; + while (--usec >= 0) { + udelay(1024); } } -- 2.9.2
Re: Another gcc corruption bug (was Re: [PATCH] [RFC] x86: avoid -mtune=atom for objtool warnings)
On 10/13/2016 02:46 PM, Josh Poimboeuf wrote: On Tue, Oct 11, 2016 at 10:38:42PM +0200, Arnd Bergmann wrote: On Tuesday, October 11, 2016 10:51:46 AM CEST Josh Poimboeuf wrote: Notice how it just falls off the end of the function. We had a similar bug before: https://lkml.kernel.org/r/20160413033649.7r3msnmo3trtq47z@treble I remember that nightmare :( https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70646 I'm not sure yet if this is the same gcc bug or a different one. Maybe it's related to the new GCC_PLUGIN_SANCOV? I've reduced one of the test cases to this now: /* gcc-6 -O2 -fno-strict-aliasing -fno-reorder-blocks -fno-omit-frame-pointer -Wno-pointer-sign -fsanitize-coverage=trace-pc -Wall -Werror -c snic_res.c -o snic_res.o */ typedef int spinlock_t; extern unsigned int ioread32(void *); struct vnic_wq_ctrl { unsigned int error_status; }; struct vnic_wq { struct vnic_wq_ctrl *ctrl; } mempool_t; struct snic { unsigned int wq_count; __attribute__ ((__aligned__)) struct vnic_wq wq[1]; spinlock_t wq_lock[1]; }; unsigned int snic_log_q_error_err_status; void snic_log_q_error(struct snic *snic) { unsigned int i; for (i = 0; i < snic->wq_count; i++) snic_log_q_error_err_status = ioread32(&snic->wq[i].ctrl->error_status); } which gets compiled into : 0: 55 push %rbp 1: 48 89 e5mov%rsp,%rbp 4: 53 push %rbx 5: 48 89 fbmov%rdi,%rbx 8: 48 83 ec 08 sub$0x8,%rsp c: e8 00 00 00 00 callq 11 d: R_X86_64_PC32__sanitizer_cov_trace_pc-0x4 11: 8b 03 mov(%rbx),%eax 13: 85 c0 test %eax,%eax 15: 75 11 jne28 17: 48 83 c4 08 add$0x8,%rsp 1b: 5b pop%rbx 1c: 5d pop%rbp 1d: e9 00 00 00 00 jmpq 22 1e: R_X86_64_PC32 __sanitizer_cov_trace_pc-0x4 22: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1) 28: e8 00 00 00 00 callq 2d 29: R_X86_64_PC32 __sanitizer_cov_trace_pc-0x4 2d: 48 8b 7b 10 mov0x10(%rbx),%rdi 31: e8 00 00 00 00 callq 36 32: R_X86_64_PC32 ioread32-0x4 36: 89 05 00 00 00 00 mov%eax,0x0(%rip)# 3c 38: R_X86_64_PC32 snic_log_q_error_err_status-0x4 3c: 83 3b 01cmpl $0x1,(%rbx) 3f: 76 d6 jbe17 41: 
e8 00 00 00 00 callq 46 42: R_X86_64_PC32 __sanitizer_cov_trace_pc-0x4 I opened a bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77966 Surprisingly, it's really "not a bug". The only way you can end up in this branch is if you have a bug and run off the end of wq[1] array member: i.e. if snic->wq_count >= 2. (See gcc BZ for smaller example) It's debatable whether it's okay for gcc to just let buggy code to run off and execute something random. It is surely surprising, and not debug-friendly. An option to emit a crashing instruction (HLT, INT3, that sort of thing) instead of just stopping code generation might be useful.
Re: [PATCH][v10] PM / hibernate: Verify the consistent of e820 memory map by md5 digest
On Fri, Sep 9, 2016 at 2:21 PM, Chen Yu wrote: > On some platforms, there is occasional panic triggered when trying to > resume from hibernation, a typical panic looks like: > > "BUG: unable to handle kernel paging request at 880085894000 > IP: [] load_image_lzo+0x8c2/0xe70" > > Investigation carried out by Lee Chun-Yi show that this is because > e820 map has been changed by BIOS across hibernation, and one > of the page frames from suspend kernel is right located in restore > kernel's unmapped region, so panic comes out when accessing unmapped > kernel address. > > In order to expose this issue earlier, the md5 hash of e820 map > is passed from suspend kernel to restore kernel, and the restore > kernel will terminate the resume process once it finds the md5 > hash are not the same. > > As the format of image header has been modified, the magic number > should also be adjusted as kernels with the same RESTORE_MAGIC have > to use the same header format and interpret all of the fields in > it in the same way. > > If the suspend kernel is built without md5 support, and the restore > kernel has md5 support, then the latter will bypass the check process. > Vice versa the restore kernel will bypass the check if it does not > support md5 operation. > > Note: > 1. Without this patch applied, it is possible that BIOS has >provided an inconsistent memory map, but the resume kernel is still >able to restore the image anyway(e.g, E820_RAM region is the superset >of the previous one), although the system might be unstable. So this >patch tries to treat any inconsistent e820 as illegal. > > 2. Another case is, this patch replies on comparing the e820_saved, but >currently the e820_save might not be strictly the same across >hibernation, even if BIOS has provided consistent e820 map - In >theory mptable might modify the BIOS-provided e820_saved dynamically >in early_reserve_e820_mpc_new, which would allocate a buffer from >E820_RAM, and marks it from E820_RAM to E820_RESERVED). 
>This is a potential and rare case we need to deal with in OS in >the future. > > Suggested-by: Pavel Machek > Suggested-by: Rafael J. Wysocki > Cc: Rafael J. Wysocki > Cc: Pavel Machek > Cc: Lee Chun-Yi > Cc: Borislav Petkov > Acked-by: Pavel Machek > Signed-off-by: Chen Yu > --- > +static int get_e820_md5(struct e820map *map, void *buf) > +{ > + struct scatterlist sg; > + struct crypto_ahash *tfm; > + struct ahash_request *req; > + int ret = 0; > + > + tfm = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC); > + if (IS_ERR(tfm)) > + return -ENOMEM; > + > + req = ahash_request_alloc(tfm, GFP_KERNEL); > + if (!req) { > + ret = -ENOMEM; > + goto free_ahash; > + } I looked elsewhere in kernel, and there is this idiom for placing struct ahash_request on stack. Instead of the ahash_request_alloc() and never-actually-tirggering-error handling, you can do: { AHASH_REQUEST_ON_STACK(req, tfm); > + > + sg_init_one(&sg, (u8 *)map, sizeof(struct e820map)); > + ahash_request_set_callback(req, 0, NULL, NULL); > + ahash_request_set_crypt(req, &sg, buf, sizeof(struct e820map)); > + > + if (crypto_ahash_digest(req)) > + ret = -EINVAL; > + > + ahash_request_free(req); > + free_ahash: and, naturally, the free() and the label would not be needed too, just close the one extra brace: > + crypto_free_ahash(tfm); > + > + return ret; } > +}
[PATCH v6] powerpc: Do not make the entire heap executable
On 32-bit powerpc the ELF PLT sections of binaries (built with --bss-plt, or with a toolchain which defaults to it) look like this: [17] .sbss NOBITS 0002aff8 01aff8 14 00 WA 0 0 4 [18] .plt NOBITS 0002b00c 01aff8 84 00 WAX 0 0 4 [19] .bss NOBITS 0002b090 01aff8 a4 00 WA 0 0 4 Which results in an ELF load header: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align LOAD 0x019c70 0x00029c70 0x00029c70 0x01388 0x014c4 RWE 0x1 This is all correct, the load region containing the PLT is marked as executable. Note that the PLT starts at 0002b00c but the file mapping ends at 0002aff8, so the PLT falls in the 0 fill section described by the load header, and after a page boundary. Unfortunately the generic ELF loader ignores the X bit in the load headers when it creates the 0 filled non-file backed mappings. It assumes all of these mappings are RW BSS sections, which is not the case for PPC. gcc/ld has an option (--secure-plt) to not do this, this is said to incur a small performance penalty. Currently, to support 32-bit binaries with PLT in BSS kernel maps *entire brk area* with executable rights for all binaries, even --secure-plt ones. Stop doing that. Teach the ELF loader to check the X bit in the relevant load header and create 0 filled anonymous mappings that are executable if the load header requests that. The patch was originally posted in 2012 by Jason Gunthorpe and apparently ignored: https://lkml.org/lkml/2012/9/30/138 Lightly run-tested. Signed-off-by: Jason Gunthorpe Signed-off-by: Denys Vlasenko Acked-by: Kees Cook Acked-by: Michael Ellerman CC: Benjamin Herrenschmidt CC: Paul Mackerras CC: "Aneesh Kumar K.V" CC: Kees Cook CC: Oleg Nesterov CC: Michael Ellerman CC: Florian Weimer CC: linux...@kvack.org CC: linuxppc-...@lists.ozlabs.org CC: linux-kernel@vger.kernel.org --- Changes since v5: * made do_brk_flags() error out if any bits other than VM_EXEC are set. 
(Kees Cook: "With this, I'd be happy to Ack.") See https://patchwork.ozlabs.org/patch/661595/ Changes since v4: * if (current->personality & READ_IMPLIES_EXEC), still use VM_EXEC for 32-bit executables. Changes since v3: * typo fix in commit message * rebased to current Linus tree Changes since v2: * moved capability to map with VM_EXEC into vm_brk_flags() Changes since v1: * wrapped lines to not exceed 79 chars * improved comment * expanded CC list arch/powerpc/include/asm/page.h | 4 +++- fs/binfmt_elf.c | 30 ++ include/linux/mm.h | 1 + mm/mmap.c | 24 +++- 4 files changed, 45 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 56398e7..17d3d2c 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -230,7 +230,9 @@ extern long long virt_phys_offset; * and needs to be executable. This means the whole heap ends * up being executable. */ -#define VM_DATA_DEFAULT_FLAGS32(VM_READ | VM_WRITE | VM_EXEC | \ +#define VM_DATA_DEFAULT_FLAGS32 \ + (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) | \ +VM_READ | VM_WRITE | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #define VM_DATA_DEFAULT_FLAGS64(VM_READ | VM_WRITE | \ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e5495f3..12b0d19 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -91,12 +91,18 @@ static struct linux_binfmt elf_format = { #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) -static int set_brk(unsigned long start, unsigned long end) +static int set_brk(unsigned long start, unsigned long end, int prot) { start = ELF_PAGEALIGN(start); end = ELF_PAGEALIGN(end); if (end > start) { - int error = vm_brk(start, end - start); + /* +* Map the last of the bss segment. +* If the header is requesting these pages to be +* executable, honour that (ppc32 needs this). +*/ + int error = vm_brk_flags(start, end - start, + prot & PROT_EXEC ? 
VM_EXEC : 0); if (error) return error; } @@ -524,6 +530,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, unsigned long load_addr = 0; int load_addr_set = 0; unsigned long last_bss = 0, elf_bss = 0; + int bss_prot = 0; unsigned long error = ~0UL; unsigned long total_size; int i; @@ -606,8 +613,10 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, * elf_bss and last_bss is the bss section.
Huge static buffer in liquidio
Hello, I would like to discuss this commit: commit d3d7e6c65f75de18ced10a98595a847f9f95f0ce Author: Raghu Vatsavayi Date: Tue Jun 21 22:53:07 2016 -0700 liquidio: Firmware image download This patch has firmware image related changes for: firmware release upon failure, support latest firmware version and firmware download in 4MB chunks. Here is a part of it: +u8 fbuf[4 * 1024 * 1024]; ^^^ what?? [also, why it is not static?] + int octeon_download_firmware(struct octeon_device *oct, const u8 *data, size_t size) { int ret = 0; - u8 *p; - u8 *buffer; + u8 *p = fbuf; Don't you think that using 4 megabytes of static buffer *just for the firmware upload* is not a good practice? Further down the patch it's obvious that the buffer is not even needed, because the firmware is already in memory, the "data" and "size" parameters of this function point to it. The meat of the change of the FW download is this (deleted some debug messages code): - buffer = kmemdup(data, size, GFP_KERNEL); - if (!buffer) - return -ENOMEM; - - p = buffer + sizeof(struct octeon_firmware_file_header); + data += sizeof(struct octeon_firmware_file_header); /* load all images */ for (i = 0; i < be32_to_cpu(h->num_images); i++) { load_addr = be64_to_cpu(h->desc[i].addr); image_len = be32_to_cpu(h->desc[i].len); - /* download the image */ - octeon_pci_write_core_mem(oct, load_addr, p, image_len); + /* Write in 4MB chunks*/ + rem = image_len; - p += image_len; + while (rem) { + if (rem < (4 * 1024 * 1024)) + size = rem; + else + size = 4 * 1024 * 1024; + + memcpy(p, data, size); + + /* download the image */ + octeon_pci_write_core_mem(oct, load_addr, p, (u32)size); + + data += size; + rem -= (u32)size; + load_addr += size; + } } + dev_info(&oct->pci_dev->dev, "Writing boot command: %s\n", +h->bootcmd); /* Invoke the bootcmd */ ret = octeon_console_send_cmd(oct, h->bootcmd, 50); -done_downloading: - kfree(buffer); octeon_pci_write_core_mem() takes spinlock around copy op, I take this was the reason for this 
change: reduce IRQ-disabled time. Do you actually need an intermediate fbuf[] buffer here? (IOW: can't you just write data to PCI from the memory area pointed to by "data" ptr?) If there is indeed a reason for an intermediate buffer, why did you drop the approach of having a temporary kmalloc'ed buffer and switched to a static (and *huge*) buffer? In my opinion, a 4 Mbyte buffer is overkill in any case. A buffer of ~4..16 Kbyte would work just fine - it's not like you need to squeeze the last 0.5% of speed when you upload firmware.
[tip:x86/boot] x86/e820: Use much less memory for e820/e820_saved, save up to 120k
Commit-ID: 1827822902cf659d60d3413fd42c7e6cbd18df4d Gitweb: http://git.kernel.org/tip/1827822902cf659d60d3413fd42c7e6cbd18df4d Author: Denys Vlasenko AuthorDate: Sun, 18 Sep 2016 20:21:25 +0200 Committer: Ingo Molnar CommitDate: Wed, 21 Sep 2016 15:02:12 +0200 x86/e820: Use much less memory for e820/e820_saved, save up to 120k The maximum size of e820 map array for EFI systems is defined as E820_X_MAX (E820MAX + 3 * MAX_NUMNODES). In x86_64 defconfig, this ends up with E820_X_MAX = 320, e820 and e820_saved are 6404 bytes each. With larger configs, for example Fedora kernels, E820_X_MAX = 3200, e820 and e820_saved are 64004 bytes each. Most of this space is wasted. Typical machines have some 20-30 e820 areas at most. After previous patch, e820 and e820_saved are pointers to e280 maps. Change them to initially point to maps which are __initdata. At the very end of kernel init, just before __init[data] sections are freed in free_initmem(), allocate smaller blocks, copy maps there, and change pointers. The late switch makes sure that all functions which can be used to change e820 maps are no longer accessible (they are all __init functions). Run-tested. Signed-off-by: Denys Vlasenko Acked-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Yinghai Lu Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160918182125.21000-1-dvlas...@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 8 arch/x86/mm/init.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 585000c9..bb8c690 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -40,10 +40,10 @@ * user can e.g. boot the original kernel with mem=1G while still booting the * next kernel with full memory. 
*/ -static struct e820map initial_e820; -static struct e820map initial_e820_saved; -struct e820map *e820 = &initial_e820; -struct e820map *e820_saved = &initial_e820_saved; +static struct e820map initial_e820 __initdata; +static struct e820map initial_e820_saved __initdata; +struct e820map *e820 __refdata = &initial_e820; +struct e820map *e820_saved __refdata = &initial_e820_saved; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0xaeedbabe; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 167deae..22af912 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -699,9 +699,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) } } -void free_initmem(void) +void __ref free_initmem(void) { - /* e820_reallocate_tables(); - disabled for now */ + e820_reallocate_tables(); free_init_pages("unused kernel", (unsigned long)(&__init_begin),
[tip:x86/boot] x86/e820: Prepare e820 code for switch to dynamic storage
Commit-ID: 475339684ef19e46f4702e2d185a869a5c454688 Gitweb: http://git.kernel.org/tip/475339684ef19e46f4702e2d185a869a5c454688 Author: Denys Vlasenko AuthorDate: Sat, 17 Sep 2016 23:39:26 +0200 Committer: Ingo Molnar CommitDate: Wed, 21 Sep 2016 15:02:12 +0200 x86/e820: Prepare e280 code for switch to dynamic storage This patch turns e820 and e820_saved into pointers to e820 tables, of the same size as before. Signed-off-by: Denys Vlasenko Acked-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Yinghai Lu Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160917213927.1787-2-dvlas...@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/e820.h | 6 +- arch/x86/kernel/e820.c| 125 +++--- arch/x86/kernel/early-quirks.c| 2 +- arch/x86/kernel/kexec-bzimage64.c | 4 +- arch/x86/kernel/resource.c| 4 +- arch/x86/kernel/setup.c | 8 +-- arch/x86/kernel/tboot.c | 8 +-- arch/x86/mm/init.c| 2 + arch/x86/platform/efi/efi.c | 2 +- arch/x86/xen/setup.c | 2 +- 10 files changed, 98 insertions(+), 65 deletions(-) diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 3ab0537..476b574 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -10,8 +10,8 @@ #include #ifndef __ASSEMBLY__ /* see comment in arch/x86/kernel/e820.c */ -extern struct e820map e820; -extern struct e820map e820_saved; +extern struct e820map *e820; +extern struct e820map *e820_saved; extern unsigned long pci_mem_start; extern int e820_any_mapped(u64 start, u64 end, unsigned type); @@ -53,6 +53,8 @@ extern void e820_reserve_resources_late(void); extern void setup_memory_map(void); extern char *default_machine_specific_memory_setup(void); +extern void e820_reallocate_tables(void); + /* * Returns true iff the specified range [s,e) is completely contained inside * the ISA region. 
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 4d3dd9a..585000c9 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -40,8 +40,10 @@ * user can e.g. boot the original kernel with mem=1G while still booting the * next kernel with full memory. */ -struct e820map e820; -struct e820map e820_saved; +static struct e820map initial_e820; +static struct e820map initial_e820_saved; +struct e820map *e820 = &initial_e820; +struct e820map *e820_saved = &initial_e820_saved; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0xaeedbabe; @@ -58,8 +60,8 @@ e820_any_mapped(u64 start, u64 end, unsigned type) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (type && ei->type != type) continue; @@ -81,8 +83,8 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (type && ei->type != type) continue; @@ -128,7 +130,7 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, void __init e820_add_region(u64 start, u64 size, int type) { - __e820_add_region(&e820, start, size, type); + __e820_add_region(e820, start, size, type); } static void __init e820_print_type(u32 type) @@ -164,12 +166,12 @@ void __init e820_print_map(char *who) { int i; - for (i = 0; i < e820.nr_map; i++) { + for (i = 0; i < e820->nr_map; i++) { printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who, - (unsigned long long) e820.map[i].addr, + (unsigned long long) e820->map[i].addr, (unsigned long long) - (e820.map[i].addr + e820.map[i].size - 1)); - e820_print_type(e820.map[i].type); + (e820->map[i].addr + e820->map[i].size - 1)); + e820_print_type(e820->map[i].type); printk(KERN_CONT "\n"); } } @@ -493,13 +495,13 @@ static u64 __init 
__e820_update_range(struct e820map *e820x, u64 start, u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, unsigned new_type) { - return __e820_update_range(&e820, start, size, old_type, new_type); + return __e820_update_range(e820, start, size, old_type, new_type);
[tip:x86/boot] x86/e820: Mark some static functions __init
Commit-ID: 8c2103f224216a45c1a4d7aebbc13f3e007cde34 Gitweb: http://git.kernel.org/tip/8c2103f224216a45c1a4d7aebbc13f3e007cde34 Author: Denys Vlasenko AuthorDate: Sat, 17 Sep 2016 23:39:25 +0200 Committer: Ingo Molnar CommitDate: Wed, 21 Sep 2016 15:02:11 +0200 x86/e820: Mark some static functions __init They are all called only from other __init functions in e820.c Signed-off-by: Denys Vlasenko Acked-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Yinghai Lu Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20160917213927.1787-1-dvlas...@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 871f186..4d3dd9a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -802,7 +802,7 @@ unsigned long __init e820_end_of_low_ram_pfn(void) return e820_end_pfn(1UL << (32-PAGE_SHIFT)); } -static void early_panic(char *msg) +static void __init early_panic(char *msg) { early_printk(msg); panic(msg); @@ -912,7 +912,7 @@ void __init finish_e820_parsing(void) } } -static const char *e820_type_to_string(int e820_type) +static const char *__init e820_type_to_string(int e820_type) { switch (e820_type) { case E820_RESERVED_KERN: @@ -926,7 +926,7 @@ static const char *e820_type_to_string(int e820_type) } } -static unsigned long e820_type_to_iomem_type(int e820_type) +static unsigned long __init e820_type_to_iomem_type(int e820_type) { switch (e820_type) { case E820_RESERVED_KERN: @@ -942,7 +942,7 @@ static unsigned long e820_type_to_iomem_type(int e820_type) } } -static unsigned long e820_type_to_iores_desc(int e820_type) +static unsigned long __init e820_type_to_iores_desc(int e820_type) { switch (e820_type) { case E820_ACPI: @@ -961,7 +961,7 @@ static unsigned long e820_type_to_iores_desc(int 
e820_type) } } -static bool do_mark_busy(u32 type, struct resource *res) +static bool __init do_mark_busy(u32 type, struct resource *res) { /* this is the legacy bios/dos rom-shadow + mmio region */ if (res->start < (1ULL<<20)) @@ -1027,7 +1027,7 @@ void __init e820_reserve_resources(void) } /* How much should we pad RAM ending depending on where it is? */ -static unsigned long ram_alignment(resource_size_t pos) +static unsigned long __init ram_alignment(resource_size_t pos) { unsigned long mb = pos >> 20;
[tip:x86/apic] x86/apic: Get rid of apic_version[] array
Commit-ID: cff9ab2b291e64259d97add48fe073c081afe4e2 Gitweb: http://git.kernel.org/tip/cff9ab2b291e64259d97add48fe073c081afe4e2 Author: Denys Vlasenko AuthorDate: Tue, 13 Sep 2016 20:12:32 +0200 Committer: Thomas Gleixner CommitDate: Tue, 20 Sep 2016 00:31:19 +0200 x86/apic: Get rid of apic_version[] array The array has a size of MAX_LOCAL_APIC, which can be as large as 32k, so it can consume up to 128k. The array has been there forever and was never used for anything useful other than a version mismatch check which was introduced in 2009. There is no reason to store the version in an array. The kernel is not prepared to handle different APIC versions anyway, so the real important part is to detect a version mismatch and warn about it, which can be done with a single variable as well. [ tglx: Massaged changelog ] Signed-off-by: Denys Vlasenko CC: Andy Lutomirski CC: Borislav Petkov CC: Brian Gerst CC: Mike Travis Link: http://lkml.kernel.org/r/20160913181232.30815-1-dvlas...@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/mpspec.h | 2 +- arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/kernel/apic/apic.c | 17 +++-- arch/x86/kernel/apic/io_apic.c | 4 ++-- arch/x86/kernel/apic/probe_32.c | 2 +- arch/x86/kernel/smpboot.c | 10 +- 6 files changed, 17 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index b07233b..c2f94dc 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -6,7 +6,6 @@ #include #include -extern int apic_version[]; extern int pic_mode; #ifdef CONFIG_X86_32 @@ -40,6 +39,7 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES]; extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); extern unsigned int boot_cpu_physical_apicid; +extern u8 boot_cpu_apic_version; extern unsigned long mp_lapic_addr; #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 1ad5fe2..0447e31 100644 --- a/arch/x86/kernel/acpi/boot.c +++ 
b/arch/x86/kernel/acpi/boot.c @@ -182,7 +182,7 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) } if (boot_cpu_physical_apicid != -1U) - ver = apic_version[boot_cpu_physical_apicid]; + ver = boot_cpu_apic_version; cpu = generic_processor_info(id, ver); if (cpu >= 0) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 1cbae30..779dae5 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -64,6 +64,8 @@ unsigned disabled_cpus; unsigned int boot_cpu_physical_apicid = -1U; EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); +u8 boot_cpu_apic_version; + /* * The highest APIC ID seen during enumeration. */ @@ -1812,8 +1814,7 @@ void __init init_apic_mappings(void) * since smp_sanity_check is prepared for such a case * and disable smp mode */ - apic_version[new_apicid] = -GET_APIC_VERSION(apic_read(APIC_LVR)); + boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } } @@ -1828,13 +1829,10 @@ void __init register_lapic_address(unsigned long address) } if (boot_cpu_physical_apicid == -1U) { boot_cpu_physical_apicid = read_apic_id(); - apic_version[boot_cpu_physical_apicid] = -GET_APIC_VERSION(apic_read(APIC_LVR)); + boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } } -int apic_version[MAX_LOCAL_APIC]; - /* * Local APIC interrupts */ @@ -2124,11 +2122,10 @@ int generic_processor_info(int apicid, int version) cpu, apicid); version = 0x10; } - apic_version[apicid] = version; - if (version != apic_version[boot_cpu_physical_apicid]) { + if (version != boot_cpu_apic_version) { pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", - apic_version[boot_cpu_physical_apicid], cpu, version); + boot_cpu_apic_version, cpu, version); } physid_set(apicid, phys_cpu_present_map); @@ -2271,7 +2268,7 @@ int __init APIC_init_uniprocessor(void) * Complain if the BIOS pretends there is one. 
*/ if (!boot_cpu_has(X86_FEATURE_APIC) && - APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + APIC_INTEGRATED(boot_cpu_apic_version)) { pr_err("BIOS bug, local APIC 0x%x not detected!...\n", boot_cpu_physical_apicid); return -1; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7491f41..48e6d84 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@
[PATCH 3/3 v2] x86/e820: Use much less memory for e820/e820_saved, save up to 120k
The maximum size of e820 map array for EFI systems is defined as E820_X_MAX (E820MAX + 3 * MAX_NUMNODES). In x86_64 defconfig, this ends up with E820_X_MAX = 320, e820 and e820_saved are 6404 bytes each. With larger configs, for example Fedora kernels, E820_X_MAX = 3200, e820 and e820_saved are 64004 bytes each. Most of this space is wasted. Typical machines have some 20-30 e820 areas at most. After previous patch, e820 and e820_saved are pointers to e280 maps. Change them to initially point to maps which are __initdata. At the very end of kernel init, just before __init[data] sections are freed in free_initmem(), allocate smaller blocks, copy maps there, and change pointers. The late switch makes sure that all functions which can be used to change e820 maps are no longer accessible (they are all __init functions). Run-tested. Signed-off-by: Denys Vlasenko CC: Ingo Molnar CC: Andy Lutomirski CC: "H. Peter Anvin" CC: Borislav Petkov CC: Brian Gerst CC: x...@kernel.org CC: linux-kernel@vger.kernel.org --- Changes since v1: free_initmem() needs __ref annotation. arch/x86/kernel/e820.c | 8 arch/x86/mm/init.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index cf7c9ad..7e9249c 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -40,10 +40,10 @@ * user can e.g. boot the original kernel with mem=1G while still booting the * next kernel with full memory. 
*/ -static struct e820map initial_e820; -static struct e820map initial_e820_saved; -struct e820map *e820 = &initial_e820; -struct e820map *e820_saved = &initial_e820_saved; +static struct e820map initial_e820 __initdata; +static struct e820map initial_e820_saved __initdata; +struct e820map *e820 __refdata = &initial_e820; +struct e820map *e820_saved __refdata = &initial_e820_saved; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0xaeedbabe; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 167deae..22af912 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -699,9 +699,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) } } -void free_initmem(void) +void __ref free_initmem(void) { - /* e820_reallocate_tables(); - disabled for now */ + e820_reallocate_tables(); free_init_pages("unused kernel", (unsigned long)(&__init_begin), -- 2.9.2
Re: [PATCH 2/2 v2] x86/e820: Use much less memory for e820/e820_saved, save up to 120k
On 09/18/2016 10:31 AM, Ingo Molnar wrote: * Denys Vlasenko wrote: On 09/15/2016 09:04 AM, Ingo Molnar wrote: * Denys Vlasenko wrote: The maximum size of e820 map array for EFI systems is defined as E820_X_MAX (E820MAX + 3 * MAX_NUMNODES). In x86_64 defconfig, this ends up with E820_X_MAX = 320, e820 and e820_saved are 6404 bytes each. With larger configs, for example Fedora kernels, E820_X_MAX = 3200, e820 and e820_saved are 64004 bytes each. Most of this space is wasted. Typical machines have some 20-30 e820 areas at most. This patch turns e820 and e820_saved to pointers which initially point to __initdata tables, of the same size as before. At the very end of setup_arch(), when we are done fiddling with these maps, allocate smaller alloc_bootmem blocks, copy maps there, and change pointers. Run-tested. +/* + * Initial e820 and e820_saved are largish __initdata arrays. + * Copy them to (usually much smaller) dynamically allocated area. + * This is done after all tweaks we ever do to them. + */ +__init void e820_reallocate_tables(void) +{ + struct e820map *n; + int size; + + size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820->nr_map; + n = alloc_bootmem(size); + memcpy(n, e820, size); + e820 = n; + + size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820_saved->nr_map; + n = alloc_bootmem(size); + memcpy(n, e820_saved, size); + e820_saved = n; +} Ok, this makes me quite nervous, could you please split this into two patches so that any fails can be nicely bisected to? No problem. First patch only does the pointerization changes with a trivial placeholder structure (full size, static allocated), second patch does all the dangerous bits such as changing it to __initdata, allocating and copying over bits. Also, could we please also add some minimal debugging facility to make sure the memory table does not get extended after it's been reallocated? 
I have another idea: run e820_reallocate_tables() later, just before we free __init and __initdata. Then e820 tables _can't_ be_ changed - all functions which do that are __init functions. Will test this now, and send a patchset. Could we also mark it __ro_after_init? __ro_after_init makes sense only for statically allocated objects. e820_reallocate_tables() copies e280 maps to kmalloced memory.
[PATCH 3/3] x86/e820: Use much less memory for e820/e820_saved, save up to 120k
The maximum size of e820 map array for EFI systems is defined as E820_X_MAX (E820MAX + 3 * MAX_NUMNODES). In x86_64 defconfig, this ends up with E820_X_MAX = 320, e820 and e820_saved are 6404 bytes each. With larger configs, for example Fedora kernels, E820_X_MAX = 3200, e820 and e820_saved are 64004 bytes each. Most of this space is wasted. Typical machines have some 20-30 e820 areas at most. After previous patch, e820 and e820_saved are pointers to e280 maps. Change them to initially point to maps which are __initdata. At the very end of kernel init, just before __init[data] sections are freed in free_initmem(), allocate smaller blocks, copy maps there, and change pointers. The late switch makes sure that all functions which can be used to change e820 maps are no longer accessible (they are all __init functions). Run-tested. Signed-off-by: Denys Vlasenko CC: Ingo Molnar CC: Andy Lutomirski CC: "H. Peter Anvin" CC: Borislav Petkov CC: Brian Gerst CC: x...@kernel.org CC: linux-kernel@vger.kernel.org --- arch/x86/kernel/e820.c | 8 arch/x86/mm/init.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index cf7c9ad..7e9249c 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -40,10 +40,10 @@ * user can e.g. boot the original kernel with mem=1G while still booting the * next kernel with full memory. 
*/ -static struct e820map initial_e820; -static struct e820map initial_e820_saved; -struct e820map *e820 = &initial_e820; -struct e820map *e820_saved = &initial_e820_saved; +static struct e820map initial_e820 __initdata; +static struct e820map initial_e820_saved __initdata; +struct e820map *e820 __refdata = &initial_e820; +struct e820map *e820_saved __refdata = &initial_e820_saved; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0xaeedbabe; diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 167deae..7be8dbf 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -701,7 +701,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) void free_initmem(void) { - /* e820_reallocate_tables(); - disabled for now */ + e820_reallocate_tables(); free_init_pages("unused kernel", (unsigned long)(&__init_begin), -- 2.9.2
[PATCH 1/3] x86/e820: Mark some static functions __init
They are all called only from other __init functions in e820.c Signed-off-by: Denys Vlasenko CC: Ingo Molnar CC: Andy Lutomirski CC: "H. Peter Anvin" CC: Borislav Petkov CC: Brian Gerst CC: x...@kernel.org CC: linux-kernel@vger.kernel.org --- arch/x86/kernel/e820.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 621b501..a6d4f10 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -802,7 +802,7 @@ unsigned long __init e820_end_of_low_ram_pfn(void) return e820_end_pfn(1UL << (32-PAGE_SHIFT)); } -static void early_panic(char *msg) +static void __init early_panic(char *msg) { early_printk(msg); panic(msg); @@ -912,7 +912,7 @@ void __init finish_e820_parsing(void) } } -static const char *e820_type_to_string(int e820_type) +static const char *__init e820_type_to_string(int e820_type) { switch (e820_type) { case E820_RESERVED_KERN: @@ -926,7 +926,7 @@ static const char *e820_type_to_string(int e820_type) } } -static unsigned long e820_type_to_iomem_type(int e820_type) +static unsigned long __init e820_type_to_iomem_type(int e820_type) { switch (e820_type) { case E820_RESERVED_KERN: @@ -942,7 +942,7 @@ static unsigned long e820_type_to_iomem_type(int e820_type) } } -static unsigned long e820_type_to_iores_desc(int e820_type) +static unsigned long __init e820_type_to_iores_desc(int e820_type) { switch (e820_type) { case E820_ACPI: @@ -961,7 +961,7 @@ static unsigned long e820_type_to_iores_desc(int e820_type) } } -static bool do_mark_busy(u32 type, struct resource *res) +static bool __init do_mark_busy(u32 type, struct resource *res) { /* this is the legacy bios/dos rom-shadow + mmio region */ if (res->start < (1ULL<<20)) @@ -1027,7 +1027,7 @@ void __init e820_reserve_resources(void) } /* How much should we pad RAM ending depending on where it is? 
*/ -static unsigned long ram_alignment(resource_size_t pos) +static unsigned long __init ram_alignment(resource_size_t pos) { unsigned long mb = pos >> 20; -- 2.9.2
[PATCH 2/3] x86/e820: Prepare e820 code for switch to dynamic storage
This patch turns e820 and e820_saved into pointers to e820 tables, of the same size as before. Signed-off-by: Denys Vlasenko CC: Ingo Molnar CC: Andy Lutomirski CC: "H. Peter Anvin" CC: Borislav Petkov CC: Brian Gerst CC: x...@kernel.org CC: linux-kernel@vger.kernel.org Signed-off-by: Denys Vlasenko --- arch/x86/include/asm/e820.h | 6 +- arch/x86/kernel/e820.c| 125 +++--- arch/x86/kernel/early-quirks.c| 2 +- arch/x86/kernel/kexec-bzimage64.c | 4 +- arch/x86/kernel/resource.c| 4 +- arch/x86/kernel/setup.c | 8 +-- arch/x86/kernel/tboot.c | 8 +-- arch/x86/mm/init.c| 2 + arch/x86/platform/efi/efi.c | 2 +- arch/x86/xen/setup.c | 2 +- 10 files changed, 98 insertions(+), 65 deletions(-) diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 3ab0537..476b574 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -10,8 +10,8 @@ #include #ifndef __ASSEMBLY__ /* see comment in arch/x86/kernel/e820.c */ -extern struct e820map e820; -extern struct e820map e820_saved; +extern struct e820map *e820; +extern struct e820map *e820_saved; extern unsigned long pci_mem_start; extern int e820_any_mapped(u64 start, u64 end, unsigned type); @@ -53,6 +53,8 @@ extern void e820_reserve_resources_late(void); extern void setup_memory_map(void); extern char *default_machine_specific_memory_setup(void); +extern void e820_reallocate_tables(void); + /* * Returns true iff the specified range [s,e) is completely contained inside * the ISA region. diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index a6d4f10..cf7c9ad 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -40,8 +40,10 @@ * user can e.g. boot the original kernel with mem=1G while still booting the * next kernel with full memory. 
*/ -struct e820map e820; -struct e820map e820_saved; +static struct e820map initial_e820; +static struct e820map initial_e820_saved; +struct e820map *e820 = &initial_e820; +struct e820map *e820_saved = &initial_e820_saved; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0xaeedbabe; @@ -58,8 +60,8 @@ e820_any_mapped(u64 start, u64 end, unsigned type) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (type && ei->type != type) continue; @@ -81,8 +83,8 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (type && ei->type != type) continue; @@ -128,7 +130,7 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, void __init e820_add_region(u64 start, u64 size, int type) { - __e820_add_region(&e820, start, size, type); + __e820_add_region(e820, start, size, type); } static void __init e820_print_type(u32 type) @@ -164,12 +166,12 @@ void __init e820_print_map(char *who) { int i; - for (i = 0; i < e820.nr_map; i++) { + for (i = 0; i < e820->nr_map; i++) { printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who, - (unsigned long long) e820.map[i].addr, + (unsigned long long) e820->map[i].addr, (unsigned long long) - (e820.map[i].addr + e820.map[i].size - 1)); - e820_print_type(e820.map[i].type); + (e820->map[i].addr + e820->map[i].size - 1)); + e820_print_type(e820->map[i].type); printk(KERN_CONT "\n"); } } @@ -493,13 +495,13 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, u64 __init e820_update_range(u64 start, u64 size, unsigned old_type, unsigned new_type) { - return __e820_update_range(&e820, start, size, old_type, new_type); + return __e820_update_range(e820, start, size, old_type, new_type); } 
static u64 __init e820_update_range_saved(u64 start, u64 size, unsigned old_type, unsigned new_type) { - return __e820_update_range(&e820_saved, start, size, old_type, + return __e820_update_range(e820_saved, start, size, old_type, new_type); } @@ -521,8 +523,8 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, e820_print_type(old_ty
Re: [PATCH 2/2 v2] x86/e820: Use much less memory for e820/e820_saved, save up to 120k
On 09/15/2016 09:04 AM, Ingo Molnar wrote: * Denys Vlasenko wrote: The maximum size of e820 map array for EFI systems is defined as E820_X_MAX (E820MAX + 3 * MAX_NUMNODES). In x86_64 defconfig, this ends up with E820_X_MAX = 320, e820 and e820_saved are 6404 bytes each. With larger configs, for example Fedora kernels, E820_X_MAX = 3200, e820 and e820_saved are 64004 bytes each. Most of this space is wasted. Typical machines have some 20-30 e820 areas at most. This patch turns e820 and e820_saved to pointers which initially point to __initdata tables, of the same size as before. At the very end of setup_arch(), when we are done fiddling with these maps, allocate smaller alloc_bootmem blocks, copy maps there, and change pointers. Run-tested. +/* + * Initial e820 and e820_saved are largish __initdata arrays. + * Copy them to (usually much smaller) dynamically allocated area. + * This is done after all tweaks we ever do to them. + */ +__init void e820_reallocate_tables(void) +{ + struct e820map *n; + int size; + + size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820->nr_map; + n = alloc_bootmem(size); + memcpy(n, e820, size); + e820 = n; + + size = offsetof(struct e820map, map) + sizeof(struct e820entry) * e820_saved->nr_map; + n = alloc_bootmem(size); + memcpy(n, e820_saved, size); + e820_saved = n; +} Ok, this makes me quite nervous, could you please split this into two patches so that any fails can be nicely bisected to? No problem. First patch only does the pointerization changes with a trivial placeholder structure (full size, static allocated), second patch does all the dangerous bits such as changing it to __initdata, allocating and copying over bits. Also, could we please also add some minimal debugging facility to make sure the memory table does not get extended after it's been reallocated? I have another idea: run e820_reallocate_tables() later, just before we free __init and __initdata. 
Then e820 tables _can't_ be changed - all functions which do that are __init functions. Will test this now, and send a patchset.
[PATCH] x86/apic: Get rid of apic_version[] array. Save up to 128k
This array is [MAX_LOCAL_APIC], and MAX_LOCAL_APIC can easily be up to 32k. The "APIC version mismatch" message, which should trigger when two different APIC versions in different CPUs are seen, is in kernel since 2009 and no one is hitting it. Replace the array with a single variable, u8 boot_cpu_apic_version. (Why u8? APIC version values as of year 2016 are no larger than 0x1f on all known CPUs. Version field in the APIC register is 8 bit wide - not likely to overflow byte range in foreseeable future.) "APIC version mismatch" check is not removed. Signed-off-by: Denys Vlasenko CC: Ingo Molnar CC: Andy Lutomirski CC: "H. Peter Anvin" CC: Borislav Petkov CC: Brian Gerst CC: Mike Travis CC: x...@kernel.org CC: linux-kernel@vger.kernel.org --- This is the alternative patch for "x86/apic: Use byte array apic_version[], not int array" one. arch/x86/include/asm/mpspec.h | 2 +- arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/kernel/apic/apic.c | 19 --- arch/x86/kernel/apic/io_apic.c | 4 ++-- arch/x86/kernel/apic/probe_32.c | 2 +- arch/x86/kernel/smpboot.c | 10 +- 6 files changed, 18 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index b07233b..c2f94dc 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -6,7 +6,6 @@ #include #include -extern int apic_version[]; extern int pic_mode; #ifdef CONFIG_X86_32 @@ -40,6 +39,7 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES]; extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); extern unsigned int boot_cpu_physical_apicid; +extern u8 boot_cpu_apic_version; extern unsigned long mp_lapic_addr; #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 90d84c3..fbd1944 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -182,7 +182,7 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) } if (boot_cpu_physical_apicid != -1U) - ver = 
apic_version[boot_cpu_physical_apicid]; + ver = boot_cpu_apic_version; cpu = generic_processor_info(id, ver); if (cpu >= 0) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 50c95af..8f9be74 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -63,6 +63,8 @@ unsigned disabled_cpus; /* Processor that is doing the boot up */ unsigned int boot_cpu_physical_apicid = -1U; EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); + +u8 boot_cpu_apic_version; /* * The highest APIC ID seen during enumeration. @@ -1816,8 +1818,7 @@ void __init init_apic_mappings(void) * since smp_sanity_check is prepared for such a case * and disable smp mode */ - apic_version[new_apicid] = -GET_APIC_VERSION(apic_read(APIC_LVR)); + boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } } @@ -1831,14 +1832,11 @@ void __init register_lapic_address(unsigned long address) APIC_BASE, mp_lapic_addr); } if (boot_cpu_physical_apicid == -1U) { - boot_cpu_physical_apicid = read_apic_id(); - apic_version[boot_cpu_physical_apicid] = -GET_APIC_VERSION(apic_read(APIC_LVR)); + boot_cpu_physical_apicid = read_apic_id(); + boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } } -int apic_version[MAX_LOCAL_APIC]; - /* * Local APIC interrupts */ @@ -2128,11 +2126,10 @@ int generic_processor_info(int apicid, int version) cpu, apicid); version = 0x10; } - apic_version[apicid] = version; - if (version != apic_version[boot_cpu_physical_apicid]) { + if (version != boot_cpu_apic_version) { pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", - apic_version[boot_cpu_physical_apicid], cpu, version); + boot_cpu_apic_version, cpu, version); } physid_set(apicid, phys_cpu_present_map); @@ -2275,7 +2272,7 @@ int __init APIC_init_uniprocessor(void) * Complain if the BIOS pretends there is one. 
*/ if (!boot_cpu_has(X86_FEATURE_APIC) && - APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + APIC_INTEGRATED(boot_cpu_apic_version)) { pr_err("BIOS bug, local APIC 0x%x not detected!...\n", boot_cpu_physical_apicid); return -1; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 7491f41..48e6d84 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1593,7 +1593
Re: [PATCH] x86/apic: Use byte array apic_version[], not int array. Saves up to 96k
On 09/13/2016 05:33 PM, Thomas Gleixner wrote: On Sun, 11 Sep 2016, Borislav Petkov wrote: On Fri, Sep 09, 2016 at 10:32:04AM +0200, Denys Vlasenko wrote: This array is [MAX_LOCAL_APIC], and MAX_LOCAL_APIC can easily be up to 32k. This patch changes apic_version[] array elements from int to u8 - APIC version values as of year 2016 are no larger than 0x1f on all known CPUs. Version field in the APIC register is 8 bit wide - not likely to overflow byte range in foreseeable future. The "ver" argument of generic_processor_info(id,ver), which goes into apic_version[id], is also changed from int to u8: make it obvious that assignment can't overflow. generic_processor_info() has four callsites, none of them can put an out-of-range value into this argument. Right, so I dug a bit into this and found: http://marc.info/?l=linux-kernel&m=123230551709711 and b2b815d80a5c ("x86: put trigger in to detect mismatched apic versions") It is from 2009 and I don't know how relevant 16-bit APIC IDs are anymore... I guess you probably want to run this by SGI folk first. Otherwise I was going to propose to kill that apic_version array altogether and cache only the version of the previous CPU and compare it to the current one to catch mismatches... Yeah, the idea was back then to eliminate the array, but we wanted to make sure that we don't have systems out in the wild which have different apic versions. I really doubt that we can deal with that proper, so having a single version entry and yelling loudly when we detect a mismatch is good enough. Makes sense. I'll send a patch
[PATCH] x86/apic: Use byte array apic_version[], not int array. Saves up to 96k
This array is [MAX_LOCAL_APIC], and MAX_LOCAL_APIC can easily be up to 32k. This patch changes apic_version[] array elements from int to u8 - APIC version values as of year 2016 are no larger than 0x1f on all known CPUs. Version field in the APIC register is 8 bit wide - not likely to overflow byte range in foreseeable future. The "ver" argument of generic_processor_info(id,ver), which goes into apic_version[id], is also changed from int to u8: make it obvious that assignment can't overflow. generic_processor_info() has four callsites, none of them can put an out-of-range value into this argument. Signed-off-by: Denys Vlasenko CC: Ingo Molnar CC: Andy Lutomirski CC: "H. Peter Anvin" CC: Borislav Petkov CC: Brian Gerst CC: x...@kernel.org CC: linux-kernel@vger.kernel.org --- arch/x86/include/asm/mpspec.h | 4 ++-- arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/kernel/apic/apic.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index b07233b..a0bc349 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -6,7 +6,7 @@ #include #include -extern int apic_version[]; +extern u8 apic_version[]; extern int pic_mode; #ifdef CONFIG_X86_32 @@ -85,7 +85,7 @@ static inline void early_reserve_e820_mpc_new(void) { } #define default_get_smp_config x86_init_uint_noop #endif -int generic_processor_info(int apicid, int version); +int generic_processor_info(int apicid, u8 version); #define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 90d84c3..fde236f 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -168,7 +168,7 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) */ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) { - unsigned int ver = 0; + u8 ver = 0; int cpu; if (id >= MAX_LOCAL_APIC) { diff --git a/arch/x86/kernel/apic/apic.c 
b/arch/x86/kernel/apic/apic.c index 50c95af..d5cc7c6 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1837,7 +1837,7 @@ void __init register_lapic_address(unsigned long address) } } -int apic_version[MAX_LOCAL_APIC]; +u8 apic_version[MAX_LOCAL_APIC]; /* * Local APIC interrupts @@ -2027,7 +2027,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) apic_write(APIC_LVT1, value); } -int generic_processor_info(int apicid, int version) +int generic_processor_info(int apicid, u8 version) { int cpu, max = nr_cpu_ids; bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, -- 2.9.2
[PATCH 2/2 v2] x86/e820: Use much less memory for e820/e820_saved, save up to 120k
The maximum size of e820 map array for EFI systems is defined as E820_X_MAX (E820MAX + 3 * MAX_NUMNODES). In x86_64 defconfig, this ends up with E820_X_MAX = 320, e820 and e820_saved are 6404 bytes each. With larger configs, for example Fedora kernels, E820_X_MAX = 3200, e820 and e820_saved are 64004 bytes each. Most of this space is wasted. Typical machines have some 20-30 e820 areas at most. This patch turns e820 and e820_saved to pointers which initially point to __initdata tables, of the same size as before. At the very end of setup_arch(), when we are done fiddling with these maps, allocate smaller alloc_bootmem blocks, copy maps there, and change pointers. Run-tested. Signed-off-by: Denys Vlasenko CC: Ingo Molnar CC: Andy Lutomirski CC: "H. Peter Anvin" CC: Borislav Petkov CC: Brian Gerst CC: x...@kernel.org CC: linux-kernel@vger.kernel.org --- Changes since v1: added __refdata annotations. arch/x86/include/asm/e820.h | 6 +- arch/x86/kernel/e820.c| 123 +++--- arch/x86/kernel/early-quirks.c| 2 +- arch/x86/kernel/kexec-bzimage64.c | 4 +- arch/x86/kernel/resource.c| 4 +- arch/x86/kernel/setup.c | 10 ++-- arch/x86/kernel/tboot.c | 8 +-- arch/x86/platform/efi/efi.c | 2 +- arch/x86/xen/setup.c | 2 +- 9 files changed, 96 insertions(+), 65 deletions(-) diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 3ab0537..476b574 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -10,8 +10,8 @@ #include #ifndef __ASSEMBLY__ /* see comment in arch/x86/kernel/e820.c */ -extern struct e820map e820; -extern struct e820map e820_saved; +extern struct e820map *e820; +extern struct e820map *e820_saved; extern unsigned long pci_mem_start; extern int e820_any_mapped(u64 start, u64 end, unsigned type); @@ -53,6 +53,8 @@ extern void e820_reserve_resources_late(void); extern void setup_memory_map(void); extern char *default_machine_specific_memory_setup(void); +extern void e820_reallocate_tables(void); + /* * Returns true iff the 
specified range [s,e) is completely contained inside * the ISA region. diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index a6d4f10..4e52859 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -40,8 +40,14 @@ * user can e.g. boot the original kernel with mem=1G while still booting the * next kernel with full memory. */ -struct e820map e820; -struct e820map e820_saved; +static struct e820map initial_e820 __initdata; +static struct e820map initial_e820_saved __initdata; +/* + * After we are done fiddling with maps at boot, we reallocate (shrink) them. + * Otherwise each map uses up to ~64k, but typically only ~1k is needed. + */ +struct e820map *e820 __refdata = &initial_e820; +struct e820map *e820_saved __refdata = &initial_e820_saved; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0xaeedbabe; @@ -58,8 +64,8 @@ e820_any_mapped(u64 start, u64 end, unsigned type) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (type && ei->type != type) continue; @@ -81,8 +87,8 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (type && ei->type != type) continue; @@ -128,7 +134,7 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, void __init e820_add_region(u64 start, u64 size, int type) { - __e820_add_region(&e820, start, size, type); + __e820_add_region(e820, start, size, type); } static void __init e820_print_type(u32 type) @@ -164,12 +170,12 @@ void __init e820_print_map(char *who) { int i; - for (i = 0; i < e820.nr_map; i++) { + for (i = 0; i < e820->nr_map; i++) { printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who, - (unsigned long long) e820.map[i].addr, + (unsigned long long) 
e820->map[i].addr, (unsigned long long) - (e820.map[i].addr + e820.map[i].size - 1)); - e820_print_type(e820.map[i].type); + (e820->map[i].addr + e820->map[i].size - 1)); + e820_print_type(e820->map[i].type); printk(KERN_CONT "\n"); } } @@ -493,13 +499,13 @@ static u64 __init __e820_update_range(struct e820map *
[PATCH 2/2] x86/e820: Use much less memory for e820/e820_saved, save up to 120k
The maximum size of e820 map array for EFI systems is defined as E820_X_MAX (E820MAX + 3 * MAX_NUMNODES). In x86_64 defconfig, this ends up with E820_X_MAX = 320, e820 and e820_saved are 6404 bytes each. With larger configs, for example Fedora kernels, E820_X_MAX = 3200, e820 and e820_saved are 64004 bytes each. Most of this space is wasted. Typical machines have some 20-30 e820 areas at most. This patch turns e820 and e820_saved to pointers which initially point to __initdata tables, of the same size as before. At the very end of setup_arch(), when we are done fiddling with these maps, allocate smaller alloc_bootmem blocks, copy maps there, and change pointers. Run-tested. Signed-off-by: Denys Vlasenko CC: Ingo Molnar CC: Andy Lutomirski CC: "H. Peter Anvin" CC: Borislav Petkov CC: Brian Gerst CC: x...@kernel.org CC: linux-kernel@vger.kernel.org --- arch/x86/include/asm/e820.h | 6 +- arch/x86/kernel/e820.c| 123 +++--- arch/x86/kernel/early-quirks.c| 2 +- arch/x86/kernel/kexec-bzimage64.c | 4 +- arch/x86/kernel/resource.c| 4 +- arch/x86/kernel/setup.c | 10 ++-- arch/x86/kernel/tboot.c | 8 +-- arch/x86/platform/efi/efi.c | 2 +- arch/x86/xen/setup.c | 2 +- 9 files changed, 96 insertions(+), 65 deletions(-) diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 3ab0537..476b574 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -10,8 +10,8 @@ #include #ifndef __ASSEMBLY__ /* see comment in arch/x86/kernel/e820.c */ -extern struct e820map e820; -extern struct e820map e820_saved; +extern struct e820map *e820; +extern struct e820map *e820_saved; extern unsigned long pci_mem_start; extern int e820_any_mapped(u64 start, u64 end, unsigned type); @@ -53,6 +53,8 @@ extern void e820_reserve_resources_late(void); extern void setup_memory_map(void); extern char *default_machine_specific_memory_setup(void); +extern void e820_reallocate_tables(void); + /* * Returns true iff the specified range [s,e) is completely contained inside * 
the ISA region. diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index a6d4f10..95fd1ec 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -40,8 +40,14 @@ * user can e.g. boot the original kernel with mem=1G while still booting the * next kernel with full memory. */ -struct e820map e820; -struct e820map e820_saved; +static struct e820map initial_e820 __initdata; +static struct e820map initial_e820_saved __initdata; +/* + * After we are done fiddling with maps at boot, we reallocate (shrink) them. + * Otherwise each map uses up to ~64k, but typically only ~1k is needed. + */ +struct e820map *e820 = &initial_e820; +struct e820map *e820_saved = &initial_e820_saved; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0xaeedbabe; @@ -58,8 +64,8 @@ e820_any_mapped(u64 start, u64 end, unsigned type) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (type && ei->type != type) continue; @@ -81,8 +87,8 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type) { int i; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; + for (i = 0; i < e820->nr_map; i++) { + struct e820entry *ei = &e820->map[i]; if (type && ei->type != type) continue; @@ -128,7 +134,7 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size, void __init e820_add_region(u64 start, u64 size, int type) { - __e820_add_region(&e820, start, size, type); + __e820_add_region(e820, start, size, type); } static void __init e820_print_type(u32 type) @@ -164,12 +170,12 @@ void __init e820_print_map(char *who) { int i; - for (i = 0; i < e820.nr_map; i++) { + for (i = 0; i < e820->nr_map; i++) { printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who, - (unsigned long long) e820.map[i].addr, + (unsigned long long) e820->map[i].addr, (unsigned long long) - (e820.map[i].addr + e820.map[i].size - 1)); - 
e820_print_type(e820.map[i].type); + (e820->map[i].addr + e820->map[i].size - 1)); + e820_print_type(e820->map[i].type); printk(KERN_CONT "\n"); } } @@ -493,13 +499,13 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start, u64 __init e820_update_range(u64 start, u64 size
[PATCH 1/2] x86/e820: mark some static functions __init
They are all called only from other __init functions. Signed-off-by: Denys Vlasenko CC: Ingo Molnar CC: Andy Lutomirski CC: "H. Peter Anvin" CC: Borislav Petkov CC: Brian Gerst CC: x...@kernel.org CC: linux-kernel@vger.kernel.org --- arch/x86/kernel/e820.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 621b501..a6d4f10 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -802,7 +802,7 @@ unsigned long __init e820_end_of_low_ram_pfn(void) return e820_end_pfn(1UL << (32-PAGE_SHIFT)); } -static void early_panic(char *msg) +static void __init early_panic(char *msg) { early_printk(msg); panic(msg); @@ -912,7 +912,7 @@ void __init finish_e820_parsing(void) } } -static const char *e820_type_to_string(int e820_type) +static const char *__init e820_type_to_string(int e820_type) { switch (e820_type) { case E820_RESERVED_KERN: @@ -926,7 +926,7 @@ static const char *e820_type_to_string(int e820_type) } } -static unsigned long e820_type_to_iomem_type(int e820_type) +static unsigned long __init e820_type_to_iomem_type(int e820_type) { switch (e820_type) { case E820_RESERVED_KERN: @@ -942,7 +942,7 @@ static unsigned long e820_type_to_iomem_type(int e820_type) } } -static unsigned long e820_type_to_iores_desc(int e820_type) +static unsigned long __init e820_type_to_iores_desc(int e820_type) { switch (e820_type) { case E820_ACPI: @@ -961,7 +961,7 @@ static unsigned long e820_type_to_iores_desc(int e820_type) } } -static bool do_mark_busy(u32 type, struct resource *res) +static bool __init do_mark_busy(u32 type, struct resource *res) { /* this is the legacy bios/dos rom-shadow + mmio region */ if (res->start < (1ULL<<20)) @@ -1027,7 +1027,7 @@ void __init e820_reserve_resources(void) } /* How much should we pad RAM ending depending on where it is? 
*/ -static unsigned long ram_alignment(resource_size_t pos) +static unsigned long __init ram_alignment(resource_size_t pos) { unsigned long mb = pos >> 20; -- 2.9.2
Re: RFC: Petition Intel/AMD to add POPF_IF insn
On 08/19/2016 12:54 PM, Paolo Bonzini wrote: On 18/08/2016 19:24, Linus Torvalds wrote: I didn't do CPL0 tests yet. Realized that cli/sti can be tested in userspace if we set iopl(3) first. Yes, but it might not be the same. So the timings could be very different from a cpl0 case. FWIW I recently measured around 20 cycles for a popf as well on Haswell-EP and CPL=0 (that was for commit f2485b3e0c6c, "KVM: x86: use guest_exit_irqoff", 2016-07-01). Thanks for confirmation. I revisited benchmarking of the if (flags & X86_EFLAGS_IF) native_irq_enable(); patch. In "make -j20" kernel compiles on an 8-way (HT) CPU, it shows some ~5 second improvement during ~16 minute compile. That's 0.5% speedup. It's ok, but not something to be too excited about. 80 e6 02and$0x2,%dh 74 01 je 810101ae fb sti 41 f6 86 91 00 00 00 02 testb $0x2,0x91(%r14) 74 01 je 81013ce7 fb sti f6 83 91 00 00 00 02testb $0x2,0x91(%rbx) 74 01 je 81013efa fb sti 41 f7 c4 00 02 00 00test $0x200,%r12d 74 01 je 8101615d fb sti Here we trade 20-cycle POPF for either 4-cycle STI, or a branch (which is either ~1 cycle if predicted, or ~20 cycles if mispredicted). The disassembly of vmlinux shows that gcc generates these asm patterns: I still think a dedicated instruction for a conditional STI is worth asking for. Along the lines of "If bit 9 in the r/m argument is set, then STI, else nothing". What do people from CPU companies say?
[PATCH v5] powerpc: Do not make the entire heap executable
On 32-bit powerpc the ELF PLT sections of binaries (built with --bss-plt, or with a toolchain which defaults to it) look like this: [17] .sbss NOBITS 0002aff8 01aff8 14 00 WA 0 0 4 [18] .plt NOBITS 0002b00c 01aff8 84 00 WAX 0 0 4 [19] .bss NOBITS 0002b090 01aff8 a4 00 WA 0 0 4 Which results in an ELF load header: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align LOAD 0x019c70 0x00029c70 0x00029c70 0x01388 0x014c4 RWE 0x1 This is all correct, the load region containing the PLT is marked as executable. Note that the PLT starts at 0002b00c but the file mapping ends at 0002aff8, so the PLT falls in the 0 fill section described by the load header, and after a page boundary. Unfortunately the generic ELF loader ignores the X bit in the load headers when it creates the 0 filled non-file backed mappings. It assumes all of these mappings are RW BSS sections, which is not the case for PPC. gcc/ld has an option (--secure-plt) to not do this, this is said to incur a small performance penalty. Currently, to support 32-bit binaries with PLT in BSS kernel maps *entire brk area* with executable rights for all binaries, even --secure-plt ones. Stop doing that. Teach the ELF loader to check the X bit in the relevant load header and create 0 filled anonymous mappings that are executable if the load header requests that. The patch was originally posted in 2012 by Jason Gunthorpe and apparently ignored: https://lkml.org/lkml/2012/9/30/138 Lightly run-tested. Signed-off-by: Jason Gunthorpe Signed-off-by: Denys Vlasenko Reviewed-by: Kees Cook CC: Benjamin Herrenschmidt CC: Paul Mackerras CC: "Aneesh Kumar K.V" CC: Kees Cook CC: Oleg Nesterov CC: Michael Ellerman CC: Florian Weimer CC: linux...@kvack.org CC: linuxppc-...@lists.ozlabs.org CC: linux-kernel@vger.kernel.org --- Changes since v4: * if (current->personality & READ_IMPLIES_EXEC), still use VM_EXEC for 32-bit executables. 
Changes since v3: * typo fix in commit message * rebased to current Linus tree Changes since v2: * moved capability to map with VM_EXEC into vm_brk_flags() Changes since v1: * wrapped lines to not exceed 79 chars * improved comment * expanded CC list arch/powerpc/include/asm/page.h | 3 ++- fs/binfmt_elf.c | 30 ++ include/linux/mm.h | 1 + mm/mmap.c | 21 - 4 files changed, 41 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 56398e7..d188f51 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -230,7 +230,9 @@ extern long long virt_phys_offset; * and needs to be executable. This means the whole heap ends * up being executable. */ -#define VM_DATA_DEFAULT_FLAGS32(VM_READ | VM_WRITE | VM_EXEC | \ +#define VM_DATA_DEFAULT_FLAGS32 \ + (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) | \ +VM_READ | VM_WRITE | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #define VM_DATA_DEFAULT_FLAGS64(VM_READ | VM_WRITE | \ diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 7f6aff3f..2b57b5a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -91,12 +91,18 @@ static struct linux_binfmt elf_format = { #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) -static int set_brk(unsigned long start, unsigned long end) +static int set_brk(unsigned long start, unsigned long end, int prot) { start = ELF_PAGEALIGN(start); end = ELF_PAGEALIGN(end); if (end > start) { - int error = vm_brk(start, end - start); + /* +* Map the last of the bss segment. +* If the header is requesting these pages to be +* executable, honour that (ppc32 needs this). +*/ + int error = vm_brk_flags(start, end - start, + prot & PROT_EXEC ? 
VM_EXEC : 0); if (error) return error; } @@ -524,6 +530,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, unsigned long load_addr = 0; int load_addr_set = 0; unsigned long last_bss = 0, elf_bss = 0; + int bss_prot = 0; unsigned long error = ~0UL; unsigned long total_size; int i; @@ -606,8 +613,10 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, * elf_bss and last_bss is the bss section. */ k = load_addr + eppnt->p_vaddr + eppnt->p_memsz; - if (k > last_bss) + if (k > last_bss) { last_bss = k; +
Re: [PATCH v4] powerpc: Do not make the entire heap executable
On 08/21/2016 05:47 PM, Aneesh Kumar K.V wrote: Denys Vlasenko writes: On 32-bit powerpc the ELF PLT sections of binaries (built with --bss-plt, or with a toolchain which defaults to it) look like this: [17] .sbss NOBITS 0002aff8 01aff8 14 00 WA 0 0 4 [18] .plt NOBITS 0002b00c 01aff8 84 00 WAX 0 0 4 [19] .bss NOBITS 0002b090 01aff8 a4 00 WA 0 0 4 Which results in an ELF load header: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align LOAD 0x019c70 0x00029c70 0x00029c70 0x01388 0x014c4 RWE 0x1 This is all correct, the load region containing the PLT is marked as executable. Note that the PLT starts at 0002b00c but the file mapping ends at 0002aff8, so the PLT falls in the 0 fill section described by the load header, and after a page boundary. Unfortunately the generic ELF loader ignores the X bit in the load headers when it creates the 0 filled non-file backed mappings. It assumes all of these mappings are RW BSS sections, which is not the case for PPC. gcc/ld has an option (--secure-plt) to not do this, this is said to incur a small performance penalty. Currently, to support 32-bit binaries with PLT in BSS kernel maps *entire brk area* with executable rights for all binaries, even --secure-plt ones. Is this going to break any application ? I am asking because you mentioned the patch is lightly tested. I booted powerpc64 machine with RHEL7 installation, it did not catch fire. x86 do have a #define VM_DATA_DEFAULT_FLAGS \ (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \ VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) ie, it can force a read implies exec mode. Do we need that ? powerpc64 never had that. 32-bit mode may need it, since before this patch all 32-bit tasks were unconditionally getting VM_DATA_DEFAULT_FLAGS with VM_EXEC bit. I'll send an updated patch.
Re: [PATCH v4] powerpc: Do not make the entire heap executable
On 08/10/2016 03:00 PM, Denys Vlasenko wrote: On 32-bit powerpc the ELF PLT sections of binaries (built with --bss-plt, or with a toolchain which defaults to it) look like this: [17] .sbss NOBITS 0002aff8 01aff8 14 00 WA 0 0 4 [18] .plt NOBITS 0002b00c 01aff8 84 00 WAX 0 0 4 [19] .bss NOBITS 0002b090 01aff8 a4 00 WA 0 0 4 Which results in an ELF load header: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align LOAD 0x019c70 0x00029c70 0x00029c70 0x01388 0x014c4 RWE 0x1 This is all correct, the load region containing the PLT is marked as executable. Note that the PLT starts at 0002b00c but the file mapping ends at 0002aff8, so the PLT falls in the 0 fill section described by the load header, and after a page boundary. Unfortunately the generic ELF loader ignores the X bit in the load headers when it creates the 0 filled non-file backed mappings. It assumes all of these mappings are RW BSS sections, which is not the case for PPC. gcc/ld has an option (--secure-plt) to not do this, this is said to incur a small performance penalty. Currently, to support 32-bit binaries with PLT in BSS kernel maps *entire brk area* with executable rights for all binaries, even --secure-plt ones. Stop doing that. Teach the ELF loader to check the X bit in the relevant load header and create 0 filled anonymous mappings that are executable if the load header requests that. The patch was originally posted in 2012 by Jason Gunthorpe and apparently ignored: https://lkml.org/lkml/2012/9/30/138 Lightly run-tested. Ping powerpc/mm people. How does this patch look? Are you taking it? 
-static int do_brk(unsigned long addr, unsigned long request) +static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; - unsigned long flags, len; + unsigned long len; struct rb_node **rb_link, *rb_parent; pgoff_t pgoff = addr >> PAGE_SHIFT; int error; @@ -2668,7 +2668,7 @@ static int do_brk(unsigned long addr, unsigned long request) if (!len) return 0; - flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); Regarding "maybe VM_LOCKED needs to be masked out of flags?" in the fragment above. I agree. In a sense that "Yes, maybe. I don't really know whether mm people feel it is worth the cost." I'd be happy to send a new version if someone will express a definite request to add that.
Re: RFC: Petition Intel/AMD to add POPF_IF insn
On 08/18/2016 07:24 PM, Linus Torvalds wrote: That said, your numbers really aren't very convincing. If popf really is just 10 cycles on modern Intel hardware, it's already fast enough that I really don't think it matters. It's 20 cycles. I was wrong in my email, I forgot that the insn count also counts "push %ebx" insns. Since I already made a mistake, let me double-check. 200 million iterations of this loop execute under 17 seconds: 400100: b8 00 c2 eb 0b mov$0xbebc200,%eax # 1000*1000*1000 400105: 9c pushfq 400106: 5b pop%rbx 400107: 90 nop 00400140 : 400140: 53 push %rbx 400141: 9d popfq 400142: 53 push %rbx 400143: 9d popfq 400144: 53 push %rbx 400145: 9d popfq 400146: 53 push %rbx 400147: 9d popfq 400148: 53 push %rbx 400149: 9d popfq 40014a: 53 push %rbx 40014b: 9d popfq 40014c: 53 push %rbx 40014d: 9d popfq 40014e: 53 push %rbx 40014f: 9d popfq 400150: 53 push %rbx 400151: 9d popfq 400152: 53 push %rbx 400153: 9d popfq 400154: 53 push %rbx 400155: 9d popfq 400156: 53 push %rbx 400157: 9d popfq 400158: 53 push %rbx 400159: 9d popfq 40015a: 53 push %rbx 40015b: 9d popfq 40015c: ff c8 dec%eax 40015e: 75 e0 jne400140 The loop is exactly 32 bytes, aligned. There are 14 POPFs. Other insns are very fast. No perf, just "time taskset 1 ./test". My CPU frequency hovers around 3500 MHz when loaded. 17 seconds is 17*3500 million cycles. 17*3500 million cycles / 200*14 million cycles = 21.25 Thus, one POPF in CPL3 is ~20 cycles on Skylake.
Re: RFC: Petition Intel/AMD to add POPF_IF insn
On 08/18/2016 07:47 PM, Denys Vlasenko wrote: On 08/18/2016 07:24 PM, Linus Torvalds wrote: That said, your numbers really aren't very convincing. If popf really is just 10 cycles on modern Intel hardware, it's already fast enough that I really don't think it matters. It's 20 cycles. I was wrong in my email, I forgot that the insn count also counts "push %ebx" insns. Since I already made a mistake, let me double-check. 200 million iterations of this loop execute under 17 seconds: 400100:b8 00 c2 eb 0b mov$0xbebc200,%eax # 1000*1000*1000 Grr. It's 200*1000*1000, not one billion. Sorry.
Re: RFC: Petition Intel/AMD to add POPF_IF insn
Of course, somebody really should do timings on modern CPU's (in cpl0, comparing native_fl() that enables interrupts with a popf) I didn't do CPL0 tests yet. Realized that cli/sti can be tested in userspace if we set iopl(3) first. Surprisingly, STI is slower than CLI. A loop with 27 CLI's and one STI converges to about ~0.5 insn/cycle: # compile with: gcc -nostartfiles -nostdlib _start: .globl _start mov $172, %eax #iopl mov $3, %edi syscall mov $200*1000*1000, %eax .balign 64 loop: cli;cli;cli;cli cli;cli;cli;cli cli;cli;cli;cli cli;cli;cli;cli cli;cli;cli;cli cli;cli;cli;cli cli;cli;cli;sti dec %eax jnz loop mov $231, %eax #exit_group syscall perf stat: 6,015,787,968 instructions #0.52 insn per cycle 3.355474199 seconds time elapsed With all CLIs replaced by STIs, it's ~0.25 insn/cycle: 6,030,530,328 instructions #0.27 insn per cycle 6.547200322 seconds time elapsed POPF which needs to enable interrupts is not measurably faster than one which does not change .IF: Loop with: 400158: fa cli 400159: 53 push %rbx #saved eflags with if=1 40015a: 9d popfq shows: 8,908,857,324 instructions #0.11 insn per cycle ( +- 0.00% ) Loop with: 400140: fb sti 400141: 53 push %rbx 400142: 9d popfq shows: 8,920,243,701 instructions #0.10 insn per cycle ( +- 0.01% ) Even loop with neither CLI nor STI, only with POPF: 400140: 53 push %rbx 400141: 9d popfq shows: 6,079,936,714 instructions #0.10 insn per cycle ( +- 0.00% ) This is on a Skylake CPU. The gist of it: CLI is 2 cycles, STI is 4 cycles, POPF is 10 cycles seemingly regardless of prior value of EFLAGS.IF.
Re: RFC: Petition Intel/AMD to add POPF_IF insn
On 08/18/2016 11:21 AM, Denys Vlasenko wrote: Of course, if somebody uses native_restore_fl() to actually *disable* interrupts (when they weren't already disabled), then this untested patch will just not work. But why would you do somethign so stupid? Famous last words... Looking around, the vsmp code actually uses "native_restore_fl()" to not just set the interrupt flag, but AC as well. And the PV spinlock case has that "push;popf" sequence encoded in an alternate. So that trivial patch may (or may not - still not tested) work for some quick testing, but needs more effort for any *real* use. I'm going to test the attached patch. ... +# CONFIG_UPROBES is not set +# CONFIG_SCHED_OMIT_FRAME_POINTER is not set +# CONFIG_HYPERVISOR_GUEST is not set +# CONFIG_SYS_HYPERVISOR is not set +# CONFIG_FRAME_POINTER is not set +# CONFIG_KMEMCHECK is not set +# CONFIG_DEBUG_LOCK_ALLOC is not set +# CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set +# CONFIG_PROVE_RCU is not set +# CONFIG_LATENCYTOP is not set +# CONFIG_FTRACE is not set +# CONFIG_BINARY_PRINTF is not set Need also !CONFIG_DEBUG_SPINLOCK, then unpatched irqrestore is: 817115a0 <_raw_spin_unlock_irqrestore>: 817115a0: c6 07 00movb $0x0,(%rdi) 817115a3: 56 push %rsi 817115a4: 9d popfq 817115a5: 65 ff 0d e4 ad 8f 7edecl %gs:__preempt_count 817115ac: c3 retq 817115ad: 0f 1f 00nopl (%rax) patched one is 81711660 <_raw_spin_unlock_irqrestore>: 81711660: f7 c6 00 02 00 00 test $0x200,%esi 81711666: c6 07 00movb $0x0,(%rdi) 81711669: 74 01 je 8171166c <_raw_spin_unlock_irqrestore+0xc> 8171166b: fb sti 8171166c: 65 ff 0d 1d ad 8f 7edecl %gs:__preempt_count 81711673: c3 retq 81711674: 66 90 xchg %ax,%ax 81711676: 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:0x0(%rax,%rax,1) Ran the following twice on a quiesced machine: taskset 1 perf stat -r60 perf bench sched messaging taskset 1 perf stat -r60 perf bench sched pipe Out of these four runs, both "perf bench sched pipe" runs show improvements: - 2648.279829 
task-clock (msec) #1.000 CPUs utilized ( +- 0.24% ) + 2483.143469 task-clock (msec) #0.998 CPUs utilized ( +- 0.31% ) - 2,000,002 context-switches #0.755 M/sec ( +- 0.00% ) + 2,000,013 context-switches #0.805 M/sec ( +- 0.00% ) - 547 page-faults #0.206 K/sec ( +- 0.04% ) + 546 page-faults #0.220 K/sec ( +- 0.04% ) - 8,723,284,926 cycles#3.294 GHz ( +- 0.06% ) + 8,157,949,449 cycles#3.285 GHz ( +- 0.07% ) -12,286,937,344 instructions #1.41 insn per cycle ( +- 0.03% ) +12,255,616,405 instructions #1.50 insn per cycle ( +- 0.05% ) - 2,588,839,023 branches # 977.555 M/sec ( +- 0.02% ) + 2,599,319,615 branches # 1046.786 M/sec ( +- 0.04% ) - 3,620,273 branch-misses #0.14% of all branches ( +- 0.67% ) + 3,577,771 branch-misses #0.14% of all branches ( +- 0.69% ) - 2.648799072 seconds time elapsed ( +- 0.24% ) + 2.487452268 seconds time elapsed ( +- 0.31% ) Good, we run more insns/cycle, as expected. However, a bit more branches. But of two "perf bench sched messaging" run, one was slower on a patched kernel, and perf shows why: more branches, and also branch miss percentage is larger: -690.008697 task-clock (msec) #0.996 CPUs utilized ( +- 0.45% ) +699.526509 task-clock (msec) #0.994 CPUs utilized ( +- 0.28% ) -26,796 context-switches #0.039 M/sec ( +- 8.31% ) +29,088 context-switches #0.042 M/sec ( +- 6.62% ) -35,477 page-faults #0.051 M/sec ( +- 0.11% ) +35,504 page-faults #0.051 M/sec ( +- 0.14%
Re: RFC: Petition Intel/AMD to add POPF_IF insn
On 08/17/2016 11:35 PM, Linus Torvalds wrote: On Wed, Aug 17, 2016 at 2:26 PM, Linus Torvalds wrote: Of course, if somebody uses native_restore_fl() to actually *disable* interrupts (when they weren't already disabled), then this untested patch will just not work. But why would you do somethign so stupid? Famous last words... Looking around, the vsmp code actually uses "native_restore_fl()" to not just set the interrupt flag, but AC as well. And the PV spinlock case has that "push;popf" sequence encoded in an alternate. So that trivial patch may (or may not - still not tested) work for some quick testing, but needs more effort for any *real* use. I'm going to test the attached patch. PV and debug maze is daunting... I discovered that Fedora kernels compile with the set of .config options which result in spin_unlock_irqrestore which looks like this: 818d0f40 <_raw_spin_unlock_irqrestore>: 818d0f40: e8 1b 31 00 00 callq 818d4060 <__fentry__>818d0f41: R_X86_64_PC32 __fentry__-0x4 818d0f45: 55 push %rbp 818d0f46: 48 89 e5mov%rsp,%rbp 818d0f49: 41 54 push %r12 818d0f4b: 53 push %rbx 818d0f4c: 48 8b 55 08 mov0x8(%rbp),%rdx 818d0f50: 49 89 fcmov%rdi,%r12 818d0f53: 48 89 f3mov%rsi,%rbx 818d0f56: 48 83 c7 18 add$0x18,%rdi 818d0f5a: be 01 00 00 00 mov$0x1,%esi 818d0f5f: e8 8c b8 83 ff callq 8110c7f0 818d0f60: R_X86_64_PC32 lock_release-0x4 818d0f64: 4c 89 e7mov%r12,%rdi 818d0f67: e8 d4 fb 83 ff callq 81110b40 818d0f68: R_X86_64_PC32 do_raw_spin_unlock-0x4 818d0f6c: f6 c7 02test $0x2,%bh 818d0f6f: 74 1b je 818d0f8c <_raw_spin_unlock_irqrestore+0x4c> 818d0f71: e8 aa 9d 83 ff callq 8110ad20 818d0f72: R_X86_64_PC32 trace_hardirqs_on-0x4 818d0f76: 48 89 dfmov%rbx,%rdi 818d0f79: ff 14 25 48 32 e2 81callq *0x81e23248 818d0f7c: R_X86_64_32S pv_irq_ops+0x8 818d0f80: 65 ff 0d c9 c4 73 7edecl %gs:0x7e73c4c9(%rip)# d450 <__preempt_count> 818d0f83: R_X86_64_PC32 __preempt_count-0x4 818d0f87: 5b pop%rbx 818d0f88: 41 5c pop%r12 818d0f8a: 5d pop%rbp 818d0f8b: c3 retq 818d0f8c: 48 89 
dfmov%rbx,%rdi 818d0f8f: ff 14 25 48 32 e2 81callq *0x81e23248 818d0f92: R_X86_64_32S pv_irq_ops+0x8 818d0f96: e8 35 5b 83 ff callq 81106ad0 818d0f97: R_X86_64_PC32 trace_hardirqs_off-0x4 818d0f9b: eb e3 jmp818d0f80 <_raw_spin_unlock_irqrestore+0x40> 818d0f9d: 0f 1f 00nopl (%rax) Gawd... really Fedora? All this is needed? Testing with _this_ is not going to show any differences, I need to weed all the cruft out and test a fast, non-debug .config. Changed it like this: +# CONFIG_UPROBES is not set +# CONFIG_SCHED_OMIT_FRAME_POINTER is not set +# CONFIG_HYPERVISOR_GUEST is not set +# CONFIG_SYS_HYPERVISOR is not set +# CONFIG_FRAME_POINTER is not set +# CONFIG_KMEMCHECK is not set +# CONFIG_DEBUG_LOCK_ALLOC is not set +# CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set +# CONFIG_PROVE_RCU is not set +# CONFIG_LATENCYTOP is not set +# CONFIG_FTRACE is not set +# CONFIG_BINARY_PRINTF is not set Looks better (it's with the patch already): 00f0 <_raw_spin_unlock_irqrestore>: f0: 53 push %rbx f1: 48 89 f3mov%rsi,%rbx f4: e8 00 00 00 00 callq f9 <_raw_spin_unlock_irqrestore+0x9> f5: R_X86_64_PC32 do_raw_spin_unlock-0x4 f9: 80 e7 02and$0x2,%bh fc: 74 01 je ff <_raw_spin_unlock_irqrestore+0xf> fe: fb sti ff: 65 ff 0d 00 00 00 00decl %gs:0x0(%rip)# 106 <_raw_spin_unlock_irqrestore+0x16>102: R_X86_64_PC32 __preempt_count-0x4 106: 5b pop%rbx 107: c3 retq This also raises questions. Such as "why do_raw_spin_unlock() is not inlined here?" Anyway... on to more kernel builds and testing. Please take a look at the below patch. If it's obviously buggy, let me know. diff -urp linux-4.7.1.org/arch/x86/include/asm/irqflags.h linux-4.7.1.obj/arch/x86/include/a
Re: RFC: Petition Intel/AMD to add POPF_IF insn
On 08/17/2016 09:32 PM, Linus Torvalds wrote: On Wed, Aug 17, 2016 at 12:26 PM, Denys Vlasenko wrote: Exactly. And more: All of which is ENTIRELY IRRELEVANT. The 2-page pseudo-code is about behavior. It's trivial to short-circuit. There are obvious optimizations, which involve just making the rare cases go slow and have a trivial "if those bits didn't change, don't go through the expense of testing them". The problem is that IF is almost certainly right now in that rare case mask, and so popf is stupidly slow for IF. I ran the test, see the first email in the thread. Experimentally, POPF is stupidly slow _always_. 6 cycles even if none of the "scary" flags are changed. Either: * its microcode has no rare case mask or * its microcode is so slow that even fast path is slow, and slow path is worse
Re: RFC: Petition Intel/AMD to add POPF_IF insn
On 08/17/2016 09:13 PM, Andy Lutomirski wrote: On Wed, Aug 17, 2016 at 12:01 PM, Linus Torvalds wrote: On Aug 17, 2016 11:41 AM, "Denys Vlasenko" wrote: OTOH 5 years will inevitably pass. Yes. But in five years, maybe we'll have a popf that is faster anyway. I'd actually prefer that in the end. The problem with popf right now seems to be mainly that it's effectively serializing and does stupid things in microcode. It doesn't have to be that way. It could actually do much better, but it hasn't been a high enough priority for Intel. It wouldn't surprise me if that were easier said than done. popf potentially changes AC, and AC affects address translation. popf also potentially changes IOPL, and I don't know whether Intel chips track IOPL in a way that lets them find all the dependent instructions without serializing. But maybe their pipeline is fancy enough. Exactly. And more: POPF potentially changes TF (and it works even in CPL3). POPF changes DF - must serialize versus string insns. POPF changes NT - must serialize versus IRET insns. POPF changes VIF, from a different bit in a popped value, and under rather complex conditions. Intel's documentation has a pseudo-code for the instructions. For POPF, that pseudo-code is two pages long...
Re: RFC: Petition Intel/AMD to add POPF_IF insn
On 08/17/2016 07:32 PM, Linus Torvalds wrote: On Wed, Aug 17, 2016 at 10:20 AM, Denys Vlasenko wrote: Last year, a proposal was floated to avoid costly POPF. In particular, each spin_unlock_irqrestore() does it, a rather common operation. Quite frankly, it takes so long for hw features that I don't think it's worth it for something like this. True, it will take some ~5 years for new hardware to become widely used. OTOH 5 years will inevitably pass. Just like now 32-bit Linux userspace is commonly compiled to "i686" instruction set, because inevitably enough time has passed that you can safely assume most people run at least Pentium Pro-level CPU, with CMOV, CMPXCHG, etc. If the new instruction will be implemented with "REPZ POPF" encoding, no recompilation or alternatives will be needed for the new kernel to run on an old CPU. On an old CPU, entire EFLAGS will be restored. On a new one, only EFLAGS.IF.
Re: RFC: Petition Intel/AMD to add POPF_IF insn
On 08/17/2016 07:30 PM, Christian König wrote: But in my measurements POPF is not fast even in the case where restored flags are not changed at all: mov $200*1000*1000, %eax pushf pop %rbx .balign 64 loop: push%rbx popf dec %eax jnz loop # perf stat -r20 ./popf_1g 4,929,012,093 cycles#3.412 GHz ( +- 0.02% ) 835,721,371 instructions #0.17 insn per cycle ( +- 0.02% ) 1.446185359 seconds time elapsed ( +- 0.46% ) If I replace POPF with a pop into an unused register, I get this: You are comparing apples and bananas here. Yes, I know. Pop into a register here is basically free. I'd also add a STI and measure how much it takes to enable interrupts, but unfortunately STI throws a #GP in CPL 3.
RFC: Petition Intel/AMD to add POPF_IF insn
Last year, a proposal was floated to avoid costly POPF. In particular, each spin_unlock_irqrestore() does it, a rather common operation. https://lkml.org/lkml/2015/4/21/290 [RFC PATCH] x86/asm/irq: Don't use POPF but STI * Andy Lutomirski wrote: > Another different approach would be to formally state that > pv_irq_ops.save_fl() needs to return all the flags, which would > make local_irq_save() safe to use in this circumstance, but that > makes a hotpath longer for the sake of a single boot time check. ...which reminds me: Why does native_restore_fl restore anything other than IF? A branch and sti should be considerably faster than popf. Ingo agreed: Yes, this has come up in the past, something like the patch below? Totally untested and not signed off yet: because we'd first have to make sure (via irq flags debugging) that it's not used in reverse, to re-disable interrupts: local_irq_save(flags); local_irq_enable(); ... local_irq_restore(flags); /* effective local_irq_disable() */ I don't think we have many (any?) such patterns left, but it has to be checked first. If we have such cases then we'll have to use different primitives there. Linus replied: = "popf" is fast for the "no changes to IF" case, and is a smaller instruction anyway. This basically shot down the proposal. But in my measurements POPF is not fast even in the case where restored flags are not changed at all: mov $200*1000*1000, %eax pushf pop %rbx .balign 64 loop: push%rbx popf dec %eax jnz loop # perf stat -r20 ./popf_1g 4,929,012,093 cycles#3.412 GHz ( +- 0.02% ) 835,721,371 instructions #0.17 insn per cycle ( +- 0.02% ) 1.446185359 seconds time elapsed ( +- 0.46% ) If I replace POPF with a pop into an unused register, I get this: loop: push%rbx pop %rcx dec %eax jnz loop 209,247,645 cycles#3.209 GHz ( +- 0.11% ) 801,898,016 instructions #3.83 insn per cycle ( +- 0.00% ) 0.066210725 seconds time elapsed ( +- 0.59% ) IOW, POPF takes at least 6 cycles. 
Linus does have a point that a "test+branch+STI" may end up not a clear win because of the branch. But the need to restore IF flag exists, it is necessary not only for Linux, but for any OS running on x86: they all have some sort of spinlock. The addition of a POPF instruction variant which looks only at IF bit and changes only that bit in EFLAGS may be a good idea, for all OSes. I propose that we ask Intel / AMD to do that. Maybe by the "ignored prefix" trick which was used when LZCNT insn was introduced as REPZ-prefixed BSR? Currently, REPZ POPF (f3 9d) insn does execute. Redefine this opcode as POPF_IF. Then the same kernel will work on old and new CPUs. CC'ing some @intel and @amd emails...
Re: [PATCH] uprobes/x86: fix rip-relative handling of EVEX-encoded insns.
On 08/12/2016 07:39 AM, Srikar Dronamraju wrote: * Denys Vlasenko [2016-08-11 17:45:21]: Since instruction decoder now supports EVEX-encoded insns, two fixes are needed to correctly handle them in uprobes. Could you please add the commit that helps support the insns in the Changelog. Sorry, I don't understand the above. Can you rephrase it?
[tip:perf/urgent] uprobes/x86: Fix RIP-relative handling of EVEX-encoded instructions
Commit-ID: 68187872c76a96ed4db7bfb064272591f02e208b Gitweb: http://git.kernel.org/tip/68187872c76a96ed4db7bfb064272591f02e208b Author: Denys Vlasenko AuthorDate: Thu, 11 Aug 2016 17:45:21 +0200 Committer: Ingo Molnar CommitDate: Fri, 12 Aug 2016 08:29:24 +0200 uprobes/x86: Fix RIP-relative handling of EVEX-encoded instructions Since instruction decoder now supports EVEX-encoded instructions, two fixes are needed to correctly handle them in uprobes. Extended bits for MODRM.rm field need to be sanitized just like we do it for VEX3, to avoid encoding wrong register for register-relative access. EVEX has _two_ extended bits: b and x. Theoretically, EVEX.x should be ignored by the CPU (since GPRs go only up to 15, not 31), but let's be paranoid here: proper encoding for register-relative access should have EVEX.x = 1. Secondly, we should fetch vex. for EVEX too. This is now super easy because instruction decoder populates vex_prefix.bytes[2] for all flavors of (e)vex encodings, even for VEX2. Signed-off-by: Denys Vlasenko Acked-by: Masami Hiramatsu Acked-by: Srikar Dronamraju Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: H. 
Peter Anvin Cc: Jim Keniston Cc: Jiri Olsa Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-kernel@vger.kernel.org Cc: # v4.1+ Fixes: 8a764a875fe3 ("x86/asm/decoder: Create artificial 3rd byte for 2-byte VEX") Link: http://lkml.kernel.org/r/20160811154521.20469-1-dvlas...@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/uprobes.c | 22 +++--- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6c1ff31..495c776 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -357,20 +357,22 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) *cursor &= 0xfe; } /* -* Similar treatment for VEX3 prefix. -* TODO: add XOP/EVEX treatment when insn decoder supports them +* Similar treatment for VEX3/EVEX prefix. +* TODO: add XOP treatment when insn decoder supports them */ - if (insn->vex_prefix.nbytes == 3) { + if (insn->vex_prefix.nbytes >= 3) { /* * vex2: c5rLpp (has no b bit) * vex3/xop: c4/8f rxbm wLpp * evex: 62rxbR00mm w1pp zllBVaaa -* (evex will need setting of both b and x since -* in non-sib encoding evex.x is 4th bit of MODRM.rm) -* Setting VEX3.b (setting because it has inverted meaning): +* Setting VEX3.b (setting because it has inverted meaning). +* Setting EVEX.x since (in non-SIB encoding) EVEX.x +* is the 4th bit of MODRM.rm, and needs the same treatment. +* For VEX3-encoded insns, VEX3.x value has no effect in +* non-SIB encoding, the change is superfluous but harmless. */ cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1; - *cursor |= 0x20; + *cursor |= 0x60; } /* @@ -415,12 +417,10 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) reg = MODRM_REG(insn); /* Fetch modrm.reg */ reg2 = 0xff;/* Fetch vex. 
*/ - if (insn->vex_prefix.nbytes == 2) - reg2 = insn->vex_prefix.bytes[1]; - else if (insn->vex_prefix.nbytes == 3) + if (insn->vex_prefix.nbytes) reg2 = insn->vex_prefix.bytes[2]; /* -* TODO: add XOP, EXEV reading. +* TODO: add XOP reading. * * vex. field is in bits 6-3, bits are inverted. * But in 32-bit mode, high-order bit may be ignored.
[PATCH] uprobes/x86: fix rip-relative handling of EVEX-encoded insns.
Since instruction decoder now supports EVEX-encoded insns, two fixes are needed to correctly handle them in uprobes. Extended bits for MODRM.rm field need to be sanitized just like we do it for VEX3, to avoid encoding wrong register for register-relative access. EVEX has _two_ extended bits: b and x. Theoretically, EVEX.x should be ignored by CPU (since GPRs go only up to 15, not 31), but let's be paranoid here: proper encoding for register-relative access should have EVEX.x = 1. Secondly, we should fetch vex. for EVEX too. This is now super easy because insn decoder populates vex_prefix.bytes[2] for all flavors of (e)vex encodings, even for vex2. Signed-off-by: Denys Vlasenko CC: Jim Keniston CC: Masami Hiramatsu CC: Srikar Dronamraju CC: Ingo Molnar CC: Oleg Nesterov CC: x...@kernel.org CC: linux-kernel@vger.kernel.org --- arch/x86/kernel/uprobes.c | 22 +++--- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 6c1ff31..a6e6096 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -357,20 +357,22 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) *cursor &= 0xfe; } /* -* Similar treatment for VEX3 prefix. -* TODO: add XOP/EVEX treatment when insn decoder supports them +* Similar treatment for VEX3/EVEX prefix. +* TODO: add XOP treatment when insn decoder supports them */ - if (insn->vex_prefix.nbytes == 3) { + if (insn->vex_prefix.nbytes >= 3) { /* * vex2: c5rLpp (has no b bit) * vex3/xop: c4/8f rxbm wLpp * evex: 62rxbR00mm w1pp zllBVaaa -* (evex will need setting of both b and x since -* in non-sib encoding evex.x is 4th bit of MODRM.rm) -* Setting VEX3.b (setting because it has inverted meaning): +* Setting VEX3.b (setting because it has inverted meaning). +* Setting EVEX.x since (in non-SIB encoding) EVEX.x +* is the 4th bit of MODRM.rm, and needs the same treatment. 
+* For VEX3-encoded insns, VEX3.x value has no effect in +* non-SIB encoding, the change is superfluous but harmless. */ cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1; - *cursor |= 0x20; + *cursor |= 0x60; } /* @@ -415,12 +417,10 @@ static void riprel_analyze(struct arch_uprobe *auprobe, struct insn *insn) reg = MODRM_REG(insn); /* Fetch modrm.reg */ reg2 = 0xff;/* Fetch vex. */ - if (insn->vex_prefix.nbytes == 2) - reg2 = insn->vex_prefix.bytes[1]; - else if (insn->vex_prefix.nbytes == 3) + if (insn->vex_prefix.nbytes) reg2 = insn->vex_prefix.bytes[2]; /* -* TODO: add XOP, EXEV reading. +* TODO: add XOP reading. * * vex. field is in bits 6-3, bits are inverted. * But in 32-bit mode, high-order bit may be ignored. -- 2.9.2
Re: [PATCH v3] powerpc: Do not make the entire heap executable
On 08/10/2016 06:36 AM, Michael Ellerman wrote: Denys Vlasenko writes: On 32-bit powerps the ELF PLT sections of binaries (built with --bss-plt, or with a toolchain which defaults to it) look like this: ... arch/powerpc/include/asm/page.h| 10 +- arch/powerpc/include/asm/page_32.h | 2 -- arch/powerpc/include/asm/page_64.h | 4 fs/binfmt_elf.c| 34 ++ include/linux/mm.h | 1 + mm/mmap.c | 20 +++- 6 files changed, 43 insertions(+), 28 deletions(-) What tree is this against? Linus' tree from before August 2. The "mm: refuse wrapped vm_brk requests" commit collided with my changes I'll send patch v4 rebased to today's linus tree as soon as I finish testing it.
Re: [PATCH v3] powerpc: Do not make the entire heap executable
On 08/10/2016 12:43 AM, Kees Cook wrote: -static int do_brk(unsigned long addr, unsigned long len) +static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; - unsigned long flags; struct rb_node **rb_link, *rb_parent; pgoff_t pgoff = addr >> PAGE_SHIFT; int error; @@ -2666,7 +2665,7 @@ static int do_brk(unsigned long addr, unsigned long len) if (!len) return 0; - flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; + flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; For sanity's sake, should a mask be applied here? i.e. to be extra careful about what flags can get passed in? Maybe... I am leaving it to mm experts. Otherwise, this looks okay to me: Reviewed-by: Kees Cook -Kees
[PATCH v4] powerpc: Do not make the entire heap executable
On 32-bit powerpc the ELF PLT sections of binaries (built with --bss-plt, or with a toolchain which defaults to it) look like this: [17] .sbss NOBITS 0002aff8 01aff8 14 00 WA 0 0 4 [18] .plt NOBITS 0002b00c 01aff8 84 00 WAX 0 0 4 [19] .bss NOBITS 0002b090 01aff8 a4 00 WA 0 0 4 Which results in an ELF load header: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align LOAD 0x019c70 0x00029c70 0x00029c70 0x01388 0x014c4 RWE 0x1 This is all correct, the load region containing the PLT is marked as executable. Note that the PLT starts at 0002b00c but the file mapping ends at 0002aff8, so the PLT falls in the 0 fill section described by the load header, and after a page boundary. Unfortunately the generic ELF loader ignores the X bit in the load headers when it creates the 0 filled non-file backed mappings. It assumes all of these mappings are RW BSS sections, which is not the case for PPC. gcc/ld has an option (--secure-plt) to not do this, this is said to incur a small performance penalty. Currently, to support 32-bit binaries with PLT in BSS kernel maps *entire brk area* with executable rights for all binaries, even --secure-plt ones. Stop doing that. Teach the ELF loader to check the X bit in the relevant load header and create 0 filled anonymous mappings that are executable if the load header requests that. The patch was originally posted in 2012 by Jason Gunthorpe and apparently ignored: https://lkml.org/lkml/2012/9/30/138 Lightly run-tested. 
Signed-off-by: Jason Gunthorpe Signed-off-by: Denys Vlasenko Reviewed-by: Kees Cook CC: Benjamin Herrenschmidt CC: Paul Mackerras CC: Kees Cook CC: Oleg Nesterov CC: Michael Ellerman CC: Florian Weimer CC: linux...@kvack.org CC: linuxppc-...@lists.ozlabs.org CC: linux-kernel@vger.kernel.org --- Changes since v3: * typo fix in commit message * rebased to current Linus tree Changes since v2: * moved capability to map with VM_EXEC into vm_brk_flags() Changes since v1: * wrapped lines to not exceed 79 chars * improved comment * expanded CC list arch/powerpc/include/asm/page.h| 10 +- arch/powerpc/include/asm/page_32.h | 2 -- arch/powerpc/include/asm/page_64.h | 4 fs/binfmt_elf.c| 30 ++ include/linux/mm.h | 1 + mm/mmap.c | 21 - 6 files changed, 40 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index 56398e7..42d7ea1 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -225,15 +225,7 @@ extern long long virt_phys_offset; #endif #endif -/* - * Unfortunately the PLT is in the BSS in the PPC32 ELF ABI, - * and needs to be executable. This means the whole heap ends - * up being executable. 
- */ -#define VM_DATA_DEFAULT_FLAGS32(VM_READ | VM_WRITE | VM_EXEC | \ -VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) - -#define VM_DATA_DEFAULT_FLAGS64(VM_READ | VM_WRITE | \ +#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #ifdef __powerpc64__ diff --git a/arch/powerpc/include/asm/page_32.h b/arch/powerpc/include/asm/page_32.h index 6a8e179..6113fa8 100644 --- a/arch/powerpc/include/asm/page_32.h +++ b/arch/powerpc/include/asm/page_32.h @@ -9,8 +9,6 @@ #endif #endif -#define VM_DATA_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS32 - #ifdef CONFIG_NOT_COHERENT_CACHE #define ARCH_DMA_MINALIGN L1_CACHE_BYTES #endif diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index dd5f071..52d8e9c 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -159,10 +159,6 @@ do { \ #endif /* !CONFIG_HUGETLB_PAGE */ -#define VM_DATA_DEFAULT_FLAGS \ - (is_32bit_task() ? \ -VM_DATA_DEFAULT_FLAGS32 : VM_DATA_DEFAULT_FLAGS64) - /* * This is the default if a program doesn't have a PT_GNU_STACK * program header entry. The PPC64 ELF ABI has a non executable stack diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 7f6aff3f..2b57b5a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -91,12 +91,18 @@ static struct linux_binfmt elf_format = { #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) -static int set_brk(unsigned long start, unsigned long end) +static int set_brk(unsigned long start, unsigned long end, int prot) { start = ELF_PAGEALIGN(start); end = ELF_PAGEALIGN(end); if (end > start) { - int error = vm_brk(start, end - start); + /* +* Map the last of the bss segment. +* If the header is requesting these pages to be +* executable, honour