[PATCH] selftests: use "$(MAKE)" instead of "make" for headers_install

2020-08-17 Thread Denys Vlasenko
If the top-level make invocation uses -j4 or larger, this patch reduces the
"make headers_install" subtask run time from 30 to 7 seconds: sub-makes
invoked via "$(MAKE)" inherit the parent's jobserver and can run in parallel,
while sub-makes invoked with a literal "make" cannot.

CC: Shuah Khan 
CC: Shuah Khan 
CC: linux-kselft...@vger.kernel.org
CC: linux-kernel@vger.kernel.org
Signed-off-by: Denys Vlasenko 
---
 tools/testing/selftests/lib.mk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk
index 7a17ea815736..51124b962d56 100644
--- a/tools/testing/selftests/lib.mk
+++ b/tools/testing/selftests/lib.mk
@@ -47,9 +47,9 @@ ARCH  ?= $(SUBARCH)
 khdr:
 ifndef KSFT_KHDR_INSTALL_DONE
 ifeq (1,$(DEFAULT_INSTALL_HDR_PATH))
-   make --no-builtin-rules ARCH=$(ARCH) -C $(top_srcdir) headers_install
+   $(MAKE) --no-builtin-rules ARCH=$(ARCH) -C $(top_srcdir) headers_install
 else
-   make --no-builtin-rules INSTALL_HDR_PATH=$$OUTPUT/usr \
+   $(MAKE) --no-builtin-rules INSTALL_HDR_PATH=$$OUTPUT/usr \
ARCH=$(ARCH) -C $(top_srcdir) headers_install
 endif
 endif
-- 
2.25.0



Re: CBL issue #431: lld: x86_64: sysfs: cannot create duplicate filename $module/.rodata.cst{16,32}

2019-04-08 Thread Denys Vlasenko

On 4/8/19 4:57 PM, Sedat Dilek wrote:

We have arch/x86/crypto/chacha-avx2-x86_64.S and
arch/x86/crypto/chacha-avx512vl-x86_64.S:

.rodata.cst32.CTR2BL
.rodata.cst32.CTR4BL
.rodata.cst32.CTR2BL
.rodata.cst32.CTR4BL

...and in arch/x86/crypto/sha256-avx2-asm.S and
arch/x86/crypto/sha512-avx2-asm.S:

.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK

Correct?


You mean, we have duplicate section names.

Well, this brings me to my initial response -
"Not sure how exactly this causes the error".

Duplicate section names are allowed by the linker.
There is nothing wrong with that.

The warnings you see come from some other tooling,
which does not correctly handle object files that contain
more than one section with the same name.

(Having unique names is still preferable: it helps humans
more easily find where sections come from, and it helps
a potential future --gc-sections optimization.)


Re: CBL issue #431: lld: x86_64: sysfs: cannot create duplicate filename $module/.rodata.cst{16,32}

2019-04-08 Thread Denys Vlasenko

On 4/8/19 4:34 PM, Sedat Dilek wrote:

v2:

sdi@iniza:~/src/linux-kernel/linux$ git --no-pager diff
diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
index a14af6eb09cb..712d6a7e8b8f 100644
--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -573,8 +573,12 @@
ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 vmovdqu y7, 15 * 16(rio);


-/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
-.section   .rodata.cst16, "aM", @progbits, 16
+/*
+ * NB: section is mergeable, all elements must be aligned 16-byte blocks
+ * There is more than one object in this section, let's use module name
+ * instead of object name as unique suffix
+ */
+.section   .rodata.cst16.camellia_aesni_avx_asm_64, "aM", @progbits, 16
  .align 16

  #define SHUFB_BYTES(idx) \
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index b66bbfa62f50..34f6b0c4196d 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -626,8 +626,12 @@
ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080

-/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
-.section   .rodata.cst16, "aM", @progbits, 16
+/*
+ * NB: section is mergeable, all elements must be aligned 16-byte blocks
+ * There is more than one object in this section, let's use module name
+ * instead of object name as unique suffix
+*/
+.section   .rodata.cst16.ccamellia_aesni_avx2_asm_64, "aM", @progbits, 16


why "ccamellia", not "camellia"?



I tried to check for the .rodata.cst32 case, how do I identify the *.S files?


?

Looks like all .rodata.cst32 sections have suffixes, nothing to fix.


Re: CBL issue #431: lld: x86_64: sysfs: cannot create duplicate filename $module/.rodata.cst{16,32}

2019-04-08 Thread Denys Vlasenko

On 4/8/19 4:23 PM, Sedat Dilek wrote:

For the .rodata.cst16 part you mean sth. like this?


yes, see below


--- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S
@@ -573,8 +573,12 @@
ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 vmovdqu y7, 15 * 16(rio);


-/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
-.section   .rodata.cst16, "aM", @progbits, 16
+/*
+ * NB: section is mergeable, all elements must be aligned 16-byte blocks
+ * There is more than one object in this section, let's use module name
+ * instead of object name as unique suffix
+ */
+.section   .rodata.cst16.camellia-aesni-avx-asm_64, "aM", @progbits, 16


dashes in the name may cause problems, replace with '_'.


  .align 16

  #define SHUFB_BYTES(idx) \
diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
index b66bbfa62f50..d6ce36e82a93 100644
--- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
+++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S
@@ -626,8 +626,12 @@
ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)
 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080

-/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
-.section   .rodata.cst16, "aM", @progbits, 16
+/*
+ * NB: section is mergeable, all elements must be aligned 16-byte blocks
+ * There is more than one object in this section, let's use module name
+ * instead of object name as unique suffix
+*/
+.section   .rodata.cst16.ccamellia-aesni-avx2-asm_64, "aM", @progbits, 16


dashes in the name may cause problems, replace with '_'.


Re: CBL issue #431: lld: x86_64: sysfs: cannot create duplicate filename $module/.rodata.cst{16,32}

2019-04-08 Thread Denys Vlasenko

On 4/8/19 3:56 PM, Denys Vlasenko wrote:

I propose to change the section name, appending the _module_ name, and optionally
add a comment explaining why this is done:

  /* NB: section is mergeable, all elements must be aligned 16-byte blocks */
+/* There is more than one object in this section, let's use module name
+   as unique suffix */


Probably not clear enough... maybe this is better:

+/* There is more than one object in this section, let's use module name
+   instead of object name as unique suffix */
-.section    .rodata.cst16, "aM", @progbits, 16

and do not use '-' in the name, replace with '_':

+.section    .rodata.cst16.cast6_avx_x86_64_asm_64, "aM", @progbits, 16




Re: CBL issue #431: lld: x86_64: sysfs: cannot create duplicate filename $module/.rodata.cst{16,32}

2019-04-08 Thread Denys Vlasenko

On 4/8/19 3:36 PM, Sedat Dilek wrote:

I fell over your commit "crypto: x86 - make constants readonly, allow
linker to merge them" [1] while digging into ClangBuiltLinux issue 431
[0].

I see the following in my dmesg-log:

$ grep sysfs: dmesg_5.0.4-rc1-1-amd64-cbl-asmgoto.txt
[Fri Mar 22 10:32:09 2019] sysfs: cannot create duplicate filename
'/module/usbcore/sections/.rodata.cst16'
[Fri Mar 22 10:32:18 2019] sysfs: cannot create duplicate filename
'/module/nfsd/sections/.rodata.cst32'
[Fri Mar 22 10:32:18 2019] sysfs: cannot create duplicate filename
'/module/iwlwifi/sections/.rodata.cst16'
[Fri Mar 22 10:32:18 2019] sysfs: cannot create duplicate filename
'/module/i915/sections/.rodata.cst32'
[Fri Mar 22 10:32:18 2019] sysfs: cannot create duplicate filename
'/module/mac80211/sections/.rodata.cst32'
[Fri Mar 22 10:32:18 2019] sysfs: cannot create duplicate filename
'/module/iwlmvm/sections/.rodata.cst16'
[Fri Mar 22 10:32:20 2019] sysfs: cannot create duplicate filename
'/module/bluetooth/sections/.rodata.cst16'

Above modules have dependencies to stuff from arch/x86/crypto (see below P.S.).


Not sure how exactly this causes the error, but the cause seems to be
having more than one section with the same name. This occurs in only three files
(grep for ".rodata.cst16," string):

cast6-avx-x86_64-asm_64.S
camellia-aesni-avx2-asm_64.S
camellia-aesni-avx-asm_64.S

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section   .rodata.cst16, "aM", @progbits, 16

In other places I used .rodata.cst16.OBJECTNAME, but in these three cases
there is more than one object in the section, so I left it without OBJECTNAME.

I propose to change the section name, appending the _module_ name, and optionally
add a comment explaining why this is done:

 /* NB: section is mergeable, all elements must be aligned 16-byte blocks */
+/* There is more than one object in this section, let's use module name
+   as unique suffix */
-.section   .rodata.cst16, "aM", @progbits, 16
+.section   .rodata.cst16.cast6-avx-x86_64-asm_64, "aM", @progbits, 16



Looks like LLD defaults to -ffunction-sections and -fdata-sections.
Do you happen to know what the defaults are for BFD linker?


The linker does not create section names. The compiler does for .c files;
for .S files they are written by a human.


Re: [PATCH 01/25] x86: Make SMAP 64-bit only

2019-03-21 Thread Denys Vlasenko

On 3/18/19 7:10 PM, Linus Torvalds wrote:

On Mon, Mar 18, 2019 at 10:51 AM Peter Zijlstra  wrote:


How about I do a patch that schedules EFLAGS for both 32bit and 64bit,
mark this for backporting to infinity.

And then at the end, after the objtool-ac bits land, I do a patch
removing the EFLAGS scheduling for x86_64.


Sounds sane to me.

And we can make it AC-conditional if it's actually shown to be visible
from a performance standpoint.

But iirc pushf/popf isn't really that expensive - in fact I think it's
pretty cheap when system flags don't change.


I did not see evidence of this. In my testing,
POPF is always ~20 cycles, even if the popped flags are identical to the
current state of the flags.
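
For reference, a minimal user-space sketch of such a measurement (an
illustration, assuming x86-64 and GCC/Clang inline asm; loop overhead and the
lack of serializing instructions make it only a rough estimate):

#include <stdio.h>
#include <stdint.h>

static inline uint64_t rdtsc(void)
{
	uint32_t lo, hi;
	asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	enum { N = 1000000 };
	uint64_t t0, t1;
	int i;

	t0 = rdtsc();
	for (i = 0; i < N; i++)
		/* push current flags, then pop back identical flags */
		asm volatile("pushfq\n\tpopfq" ::: "memory", "cc");
	t1 = rdtsc();

	printf("~%.1f cycles per pushfq+popfq pair\n", (double)(t1 - t0) / N);
	return 0;
}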


Re: preempt.h: some SOFTIRQ_OFFSET should be SOFTIRQ_MASK?

2019-02-13 Thread Denys Vlasenko
On Wed, Feb 13, 2019 at 5:05 AM Frederic Weisbecker  wrote:
> On Tue, Feb 05, 2019 at 07:34:31PM +0100, Denys Vlasenko wrote:
> > SOFTIRQ is a counter.
> > Why here:
> >
> > #define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)
> > #define in_task()		(!(preempt_count() & \
> >				   (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
> >
> > we check only lowest bit?
>
> So we have SOFTIRQ_OFFSET that is used when serving softirqs.
> And we have SOFTIRQ_DISABLE_OFFSET that is used when we disable
> softirqs.
>
> I think the choice is right on both tests above, or am I missing something?

From my reading of the following:

 * We put the hardirq and softirq counter into the preemption
 * counter. The bitmask has the following meaning:
 * - bits 0-7 are the preemption count (max preemption depth: 256)
 * - bits 8-15 are the softirq count (max # of softirqs: 256)
 * The hardirq count could in theory be the same as the number of
 * interrupts in the system, but we run all interrupt handlers with
 * interrupts disabled, so we cannot have nesting interrupts. Though
 * there are a few palaeontologic drivers which reenable interrupts in
 * the handler, so we need more than one bit here.
 *
 * PREEMPT_MASK:	0x000000ff
 * SOFTIRQ_MASK:	0x0000ff00
 * HARDIRQ_MASK:	0x000f0000
 *     NMI_MASK:	0x00100000
 * PREEMPT_NEED_RESCHED:	0x80000000
 */
#define PREEMPT_BITS	8
#define SOFTIRQ_BITS	8
#define HARDIRQ_BITS	4
#define NMI_BITS	1

it seems that the 8-bit SOFTIRQ_MASK field contains the depth count of nested softirqs.
Therefore, to test whether we are in a softirq, you would need:

#define in_serving_softirq()	(softirq_count() & SOFTIRQ_MASK)

But the existing code uses SOFTIRQ_OFFSET, not SOFTIRQ_MASK, as the mask.
This means that if the nest count is e.g. 2, in_serving_softirq() will return
"false".

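To make the concern concrete, here is a small user-space illustration (my own
sketch; it uses the bit layout from the preempt.h comment quoted above and
mimics a softirq nest count of 2):

#include <stdio.h>

/* Values from the preempt.h comment quoted above. */
#define SOFTIRQ_SHIFT	8
#define SOFTIRQ_OFFSET	(1UL << SOFTIRQ_SHIFT)		/* 0x00000100 */
#define SOFTIRQ_MASK	(0xffUL << SOFTIRQ_SHIFT)	/* 0x0000ff00 */

int main(void)
{
	/* Pretend the softirq field of preempt_count holds a nest count of 2. */
	unsigned long softirq_count = 2UL << SOFTIRQ_SHIFT;	/* 0x200 */

	printf("count & SOFTIRQ_OFFSET = %#lx -> %s\n",
	       softirq_count & SOFTIRQ_OFFSET,
	       (softirq_count & SOFTIRQ_OFFSET) ? "in softirq" : "NOT in softirq");
	printf("count & SOFTIRQ_MASK   = %#lx -> %s\n",
	       softirq_count & SOFTIRQ_MASK,
	       (softirq_count & SOFTIRQ_MASK) ? "in softirq" : "NOT in softirq");
	return 0;
}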

preempt.h: some SOFTIRQ_OFFSET should be SOFTIRQ_MASK?

2019-02-05 Thread Denys Vlasenko
SOFTIRQ is a counter.
Why here:

#define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)
#define in_task()		(!(preempt_count() & \
				   (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))

we check only the lowest bit?


Re: [PATCH] x86: only use ERMS for user copies for larger sizes

2018-11-21 Thread Denys Vlasenko

On 11/21/2018 02:32 PM, Jens Axboe wrote:

On 11/20/18 11:36 PM, Ingo Molnar wrote:

* Jens Axboe  wrote:

So this is a fun one... While I was doing the aio polled work, I noticed
that the submitting process spent a substantial amount of time copying
data to/from userspace. For aio, that's iocb and io_event, which are 64
and 32 bytes respectively. Looking closer at this, and it seems that
ERMS rep movsb is SLOWER for smaller copies, due to a higher startup
cost.

I came up with this hack to test it out, and lo and behold, we now cut
the time spent in copying in half. 50% less.

Since these kinds of patches tend to lend themselves to bike shedding, I
also ran a string of kernel compilations out of RAM. Results are as
follows:

Patched : 62.86s avg, stddev 0.65s
Stock   : 63.73s avg, stddev 0.67s

which would also seem to indicate that we're faster punting smaller
(< 128 byte) copies.

CPU: Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz

Interestingly, text size is smaller with the patch as well?!

I'm sure there are smarter ways to do this, but results look fairly
conclusive. FWIW, the behaviorial change was introduced by:

commit 954e482bde20b0e208fd4d34ef26e10afd194600
Author: Fenghua Yu 
Date:   Thu May 24 18:19:45 2012 -0700

 x86/copy_user_generic: Optimize copy_user_generic with CPU erms feature

which contains nothing in terms of benchmarking or results, just claims
that the new hotness is better.

Signed-off-by: Jens Axboe 
---

diff --git a/arch/x86/include/asm/uaccess_64.h 
b/arch/x86/include/asm/uaccess_64.h
index a9d637bc301d..7dbb78827e64 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -29,16 +29,27 @@ copy_user_generic(void *to, const void *from, unsigned len)
  {
unsigned ret;
  
+	/*
+	 * For smaller copies, don't use ERMS as it's slower.
+	 */
+   if (len < 128) {
+   alternative_call(copy_user_generic_unrolled,
+copy_user_generic_string, X86_FEATURE_REP_GOOD,
+ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
+"=d" (len)),
+"1" (to), "2" (from), "3" (len)
+: "memory", "rcx", "r8", "r9", "r10", "r11");
+   return ret;
+   }
+
/*
 * If CPU has ERMS feature, use copy_user_enhanced_fast_string.
 * Otherwise, if CPU has rep_good feature, use copy_user_generic_string.
 * Otherwise, use copy_user_generic_unrolled.
 */
alternative_call_2(copy_user_generic_unrolled,
-copy_user_generic_string,
-X86_FEATURE_REP_GOOD,
-copy_user_enhanced_fast_string,
-X86_FEATURE_ERMS,
+copy_user_generic_string, X86_FEATURE_REP_GOOD,
+copy_user_enhanced_fast_string, X86_FEATURE_ERMS,
 ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
 "=d" (len)),
 "1" (to), "2" (from), "3" (len)


So I'm inclined to do something like yours, because clearly the changelog
of 954e482bde20 was at least partly false: Intel can say whatever they
want, it's a fact that ERMS has high setup costs for low buffer sizes -
ERMS is optimized for large size, cache-aligned copies mainly.


I'm actually surprised that something like that was accepted, I guess
2012 was a simpler time :-)


But the result is counter-intuitive in terms of kernel text footprint,
plus the '128' is pretty arbitrary - we should at least try to come up
with a break-even point where manual copy is about as fast as ERMS - on
at least a single CPU ...


I did some more investigation yesterday, and found this:

commit 236222d39347e0e486010f10c1493e83dbbdfba8
Author: Paolo Abeni 
Date:   Thu Jun 29 15:55:58 2017 +0200

 x86/uaccess: Optimize copy_user_enhanced_fast_string() for short strings

which does attempt to rectify it, but only using ERMS for >= 64 byte copies.
At least for me, it looks like the break-even point is higher than that, which
would mean that something like the below would be more appropriate.


I also tested this while working on string ops code in musl.

I think at least 128 bytes would be the minimum size at which "REP" insns
are more efficient. In my testing, it's more like 256 bytes...
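
For reference, a rough user-space sketch of this kind of measurement (an
illustration, assuming x86-64 and GCC/Clang inline asm; it compares "rep movsb"
against the C library memcpy rather than the kernel's copy_user routines, so
the exact crossover point will differ):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

static inline uint64_t rdtsc(void)
{
	uint32_t lo, hi;
	asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
	return ((uint64_t)hi << 32) | lo;
}

static inline void rep_movsb(void *dst, const void *src, size_t len)
{
	asm volatile("rep movsb"
		     : "+D"(dst), "+S"(src), "+c"(len)
		     : : "memory");
}

int main(void)
{
	static char src[1024], dst[1024];
	enum { ITERS = 200000 };
	size_t len;
	int i;

	for (len = 16; len <= 1024; len *= 2) {
		uint64_t t0, t1, t2, t3;

		t0 = rdtsc();
		for (i = 0; i < ITERS; i++)
			rep_movsb(dst, src, len);
		t1 = rdtsc();

		t2 = rdtsc();
		for (i = 0; i < ITERS; i++) {
			memcpy(dst, src, len);
			/* keep the compiler from eliding the copy */
			asm volatile("" : : "r"(dst) : "memory");
		}
		t3 = rdtsc();

		printf("len %4zu: rep movsb %6.1f cycles, memcpy %6.1f cycles\n",
		       len, (double)(t1 - t0) / ITERS, (double)(t3 - t2) / ITERS);
	}
	return 0;
}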


Re: [tip:x86/asm] x86/entry/64: Add two more instruction suffixes

2018-07-03 Thread Denys Vlasenko

On 07/03/2018 10:46 AM, David Laight wrote:

From: Jan Beulich

Sent: 03 July 2018 09:36

...

As said there, omitting suffixes from instructions in AT&T mode is bad
practice when operand size cannot be determined by the assembler from
register operands, and is likely going to be warned about by upstream
gas in the future (mine does already).

...

-   bt  $9, EFLAGS(%rsp)/* interrupts off? */
+   btl $9, EFLAGS(%rsp)/* interrupts off? */


Hmmm
Does the operand size make any difference at all for the bit instructions?
I'm pretty sure that the cpus (386 onwards) have always done aligned 32bit
transfers (the docs never actually said aligned).
I can't remember whether 64bit mode allows immediates above 31.


Immediates up to 63 are allowed in 64 bit mode (IOW: for REX-prefixed form)
(run-tested).

Keep in mind that this instruction is "special" with register bit offset:

The register/memory form (BT REG,[MEM]) does not limit or mask the value of the
bit offset in REG; the instruction uses bit REG%8 in the byte at address
[MEM+REG/8].

This works correctly even for negative values: REG = -1 will access
the most significant bit in the byte immediately before MEM.

Thus, for accesses of standard RAM locations (not memory-mapped IO and such),
the "operand size" concept for this instruction (and BTC, BTR, BTS)
does not make much sense: it accesses one bit. The width of actual memory
access is irrelevant.

I'd say the assembler should just use the "natural" width for the current mode
(16- or 32-bit), and warn when code tries to use an immediate operand which
would be truncated and thus needs a wider operand size.

Intel documentation says that the immediate operand in BT IMM,[MEM]
is truncated to the operand size. My experiment seems to confirm it:

254:1  <- BT 254,[MEM] actually accesses bit #30, not #254
255:0
254:0
255:0

#include <stdio.h>
#include <string.h>
int main(int argc, char **argv, char **envp)
{
char buf[256];
int result;

memset(buf, 0x55, sizeof(buf)); /* bit pattern: 01010101 */

buf[255/8] = 0;
asm("\n"
"   bt  %1, %2\n"
"   sbb %0, %0\n"
: "=r" (result)
: "i" (254), "m" (buf)
);
printf("254:%x\n", !!result);
asm("\n"
"   bt  %1, %2\n"
"   sbb %0, %0\n"
: "=r" (result)
: "i" (255), "m" (buf)
);
printf("255:%x\n", !!result);
buf[255/8] = 0x55;

buf[31/8] = 0;
asm("\n"
"   bt  %1, %2\n"
"   sbb %0, %0\n"
: "=r" (result)
: "i" (254), "m" (buf)
);
printf("254:%x\n", !!result);
asm("\n"
"   bt  %1, %2\n"
"   sbb %0, %0\n"
: "=r" (result)
: "i" (255), "m" (buf)
);
printf("255:%x\n", !!result);

return 0;
}

When I use "r" instead of "i" to generate REG,[MEM] form,
the instruction does access bit #254:

254:0
255:0
254:1
255:0



Re: [PATCH] x86/entry/64/compat: Preserve r8-r11 in int $0x80

2018-04-18 Thread Denys Vlasenko

On 04/18/2018 06:53 PM, Andy Lutomirski wrote:

On Tue, Apr 17, 2018 at 8:00 AM, Denys Vlasenko  wrote:

This means that the new behavior is there for some 8 years already.
Whoever was impacted by it, probably already switched to the new ABI.

Current ABI is "weaker", it allows kernel to save fewer registers.

Which is generally a good thing, since saving/restoring things costs
cycles, and is sometimes painful on entry paths where you may desperately
need a scratch register or two. (Recall this one? -
...
 movq%rsp, PER_CPU_VAR(rsp_scratch)
 movqPER_CPU_VAR(cpu_current_top_of_stack), %rsp
 /* Construct struct pt_regs on stack */
 pushq   $__USER_DS  /* pt_regs->ss */
 pushq   PER_CPU_VAR(rsp_scratch)/* pt_regs->sp */
...
wouldn't it be _great_ if one of the GPRs were available here
to hold the userspace %rsp?
)


But this is the int $0x80 entry, which uses the stack sanely and
doesn't have this problem at all.


It was a general point about why not committing to save every register
may help on the callee (in this case, the kernel) side.


Re: [PATCH] x86/entry/64/compat: Preserve r8-r11 in int $0x80

2018-04-17 Thread Denys Vlasenko

On 04/17/2018 04:36 PM, Andy Lutomirski wrote:

32-bit user code that uses int $80 doesn't care about r8-r11.  There is,
however, some 64-bit user code that intentionally uses int $0x80 to
invoke 32-bit system calls.  From what I've seen, basically all such
code assumes that r8-r15 are all preserved, but the kernel clobbers
r8-r11.  Since I doubt that there's any code that depends on int $0x80
zeroing r8-r11, change the kernel to preserve them.

I suspect that very little user code is broken by the old clobber,
since r8-r11 are only rarely allocated by gcc, and they're clobbered
by function calls, so the only way we'd see a problem is if the
same function that invokes int $0x80 also spills something important
to one of these registers.

The current behavior seems to date back to the historical commit
"[PATCH] x86-64 merge for 2.6.4".  Before that, all regs were
preserved.  I can't find any explanation of why this change was made.


This means that the new behavior is there for some 8 years already.
Whoever was impacted by it, probably already switched to the new ABI.

Current ABI is "weaker", it allows kernel to save fewer registers.

Which is generally a good thing, since saving/restoring things costs
cycles, and is sometimes painful on entry paths where you may desperately
need a scratch register or two. (Recall this one? -
...
movq%rsp, PER_CPU_VAR(rsp_scratch)
movqPER_CPU_VAR(cpu_current_top_of_stack), %rsp
/* Construct struct pt_regs on stack */
pushq   $__USER_DS  /* pt_regs->ss */
pushq   PER_CPU_VAR(rsp_scratch)/* pt_regs->sp */
...
wouldn't it be _great_ if one of the GPRs were available here
to hold the userspace %rsp?
)

If userspace needs some registers saved, it's trivial for it to have:

push reg1
push reg2
int  0x80
pop  reg2
pop  reg1

OTOH if userspace _does not_ need some registers saved,
but they are defined as saved by the entrypoint ABI, then save/restore work
is done every time, even when not needed.

Thus, I propose to retain the current behavior.


Re: [PATCH 34/34] x86/mm/pti: Add Warning when booting on a PCIE capable CPU

2018-03-05 Thread Denys Vlasenko

On 03/05/2018 11:26 AM, Joerg Roedel wrote:

From: Joerg Roedel 

Warn the user in case the performance can be significantly
improved by switching to a 64-bit kernel.

Suggested-by: Andy Lutomirski 
Signed-off-by: Joerg Roedel 
---
  arch/x86/mm/pti.c | 16 
  1 file changed, 16 insertions(+)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 3ffd923..8f5aa0d 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -385,6 +385,22 @@ void __init pti_init(void)
  
  	pr_info("enabled\n");
  
+#ifdef CONFIG_X86_32

+   if (boot_cpu_has(X86_FEATURE_PCID)) {
+   /* Use printk to work around pr_fmt() */
+   printk(KERN_WARNING "\n");
+   printk(KERN_WARNING 
"\n");
+   printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! 
WARNING!  **\n");
+   printk(KERN_WARNING "**  
  **\n");
+   printk(KERN_WARNING "** You are using 32-bit PTI on a 64-bit 
PCID-capable CPU. **\n");
+   printk(KERN_WARNING "** Your performance will increase dramatically 
if you **\n");
+   printk(KERN_WARNING "** switch to a 64-bit kernel!   
  **\n");
+   printk(KERN_WARNING "**  
  **\n");
+   printk(KERN_WARNING "** WARNING! WARNING! WARNING! WARNING! WARNING! 
WARNING!  **\n");
+   printk(KERN_WARNING 
"\n");


Isn't it a bit too dramatic? Not one, but two lines of big fat warnings?

There are people who run 32-bit kernels on purpose, not because they
did not yet realize 64 bits are upon us.

E.g. industrial setups with strict regulations and licensing requirements.
In many such cases they are already more than satisfied with CPU speeds,
are thus not interested in 64-bit migration for performance reasons,
and avoid it because it would incur mountains of paperwork
with no tangible gains.

The big fat warning on every boot would be irritating.


[PATCH v2] net: make getname() functions return length rather than use int* parameter

2018-02-12 Thread Denys Vlasenko
Changes since v1:
Added changes in these files:
drivers/infiniband/hw/usnic/usnic_transport.c
drivers/staging/lustre/lnet/lnet/lib-socket.c
drivers/target/iscsi/iscsi_target_login.c
drivers/vhost/net.c
fs/dlm/lowcomms.c
fs/ocfs2/cluster/tcp.c
security/tomoyo/network.c


Before:
All these functions either return a negative error indicator,
or store length of sockaddr into "int *socklen" parameter
and return zero on success.

"int *socklen" parameter is awkward. For example, if caller does not
care, it still needs to provide on-stack storage for the value
it does not need.

None of the many FOO_getname() functions of various protocols
ever used old value of *socklen. They always just overwrite it.

This change drops this parameter, and makes all these functions, on success,
return length of sockaddr. It's always >= 0 and can be differentiated
from an error.

Tests in callers are changed from "if (err)" to "if (err < 0)", where needed.

rpc_sockname() lost "int buflen" parameter, since its only use was
to be passed to kernel_getsockname() as &buflen and subsequently
not used in any way.

Userspace API is not changed.

   text    data     bss      dec     hex filename
30108430 2633624  873672 33615726 200ef6e vmlinux.before.o
30108109 2633612  873672 33615393 200ee21 vmlinux.o

Signed-off-by: Denys Vlasenko 
CC: David S. Miller 
CC: linux-kernel@vger.kernel.org
CC: net...@vger.kernel.org
CC: linux-blueto...@vger.kernel.org
CC: linux-decnet-u...@lists.sourceforge.net
CC: linux-wirel...@vger.kernel.org
CC: linux-r...@vger.kernel.org
CC: linux-s...@vger.kernel.org
CC: linux-...@vger.kernel.org
CC: linux-...@vger.kernel.org
---
 drivers/infiniband/hw/usnic/usnic_transport.c |  5 ++--
 drivers/isdn/mISDN/socket.c   |  5 ++--
 drivers/net/ppp/pppoe.c   |  6 ++---
 drivers/net/ppp/pptp.c|  6 ++---
 drivers/scsi/iscsi_tcp.c  | 14 +--
 drivers/soc/qcom/qmi_interface.c  |  3 +--
 drivers/staging/ipx/af_ipx.c  |  6 ++---
 drivers/staging/irda/net/af_irda.c|  8 +++---
 drivers/staging/lustre/lnet/lnet/lib-socket.c |  7 +++---
 drivers/target/iscsi/iscsi_target_login.c | 18 +++---
 drivers/vhost/net.c   |  7 +++---
 fs/dlm/lowcomms.c |  7 +++---
 fs/ocfs2/cluster/tcp.c|  6 ++---
 include/linux/net.h   |  8 +++---
 include/net/inet_common.h |  2 +-
 include/net/ipv6.h|  2 +-
 include/net/sock.h|  2 +-
 net/appletalk/ddp.c   |  5 ++--
 net/atm/pvc.c |  5 ++--
 net/atm/svc.c |  5 ++--
 net/ax25/af_ax25.c|  4 +--
 net/bluetooth/hci_sock.c  |  4 +--
 net/bluetooth/l2cap_sock.c|  5 ++--
 net/bluetooth/rfcomm/sock.c   |  5 ++--
 net/bluetooth/sco.c   |  5 ++--
 net/can/raw.c |  6 ++---
 net/core/sock.c   |  5 ++--
 net/decnet/af_decnet.c|  6 ++---
 net/ipv4/af_inet.c|  5 ++--
 net/ipv6/af_inet6.c   |  5 ++--
 net/iucv/af_iucv.c|  5 ++--
 net/l2tp/l2tp_ip.c|  5 ++--
 net/l2tp/l2tp_ip6.c   |  5 ++--
 net/l2tp/l2tp_ppp.c   |  5 ++--
 net/llc/af_llc.c  |  5 ++--
 net/netlink/af_netlink.c  |  5 ++--
 net/netrom/af_netrom.c|  9 ---
 net/nfc/llcp_sock.c   |  5 ++--
 net/packet/af_packet.c| 10 +++-
 net/phonet/socket.c   |  5 ++--
 net/qrtr/qrtr.c   |  5 ++--
 net/rds/af_rds.c  |  5 ++--
 net/rds/tcp.c |  7 ++
 net/rose/af_rose.c|  5 ++--
 net/sctp/ipv6.c   |  8 +++---
 net/smc/af_smc.c  | 11 -
 net/socket.c  | 35 +--
 net/sunrpc/clnt.c |  6 ++---
 net/sunrpc/svcsock.c  | 13 ++
 net/sunrpc/xprtsock.c |  3 +--
 net/tipc/socket.c |  5 ++--
 net/unix/af_unix.c| 10 
 net/vmw_vsock/af_vsock.c  |  4 +--
 net/x25/af_x25.c  |  4 +--
 security/tomoyo/network.c |  5 ++--
 55 files changed, 159 insertions(+), 203 deletions(-)
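
To illustrate the API shape described above, a self-contained user-space
sketch (the names are made up for illustration; these are not the kernel
functions):

#include <stdio.h>
#include <string.h>

struct addr { char bytes[16]; };

/* Old style: 0 on success, length returned through an out-parameter. */
static int getname_old(struct addr *out, int *len)
{
	memset(out, 0, sizeof(*out));
	*len = sizeof(*out);
	return 0;
}

/* New style: negative on error, otherwise the length itself (always >= 0). */
static int getname_new(struct addr *out)
{
	memset(out, 0, sizeof(*out));
	return sizeof(*out);
}

int main(void)
{
	struct addr a;
	int len, err;

	err = getname_old(&a, &len);	/* caller must pass &len even if unused */
	if (err)
		return 1;

	err = getname_new(&a);		/* length comes back as the return value */
	if (err < 0)
		return 1;
	len = err;

	printf("len = %d\n", len);
	return 0;
}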

Re: [PATCH] net: make getname() functions return length rather than use int* parameter

2018-02-12 Thread Denys Vlasenko

On 02/12/2018 06:47 PM, David Miller wrote:

From: Denys Vlasenko 
Date: Mon, 12 Feb 2018 15:15:18 +0100


Before:
All these functions either return a negative error indicator,
or store length of sockaddr into "int *socklen" parameter
and return zero on success.

"int *socklen" parameter is awkward. For example, if caller does not
care, it still needs to provide on-stack storage for the value
it does not need.

None of the many FOO_getname() functions of various protocols
ever used old value of *socklen. They always just overwrite it.

This change drops this parameter, and makes all these functions, on success,
return length of sockaddr. It's always >= 0 and can be differentiated
from an error.

Tests in callers are changed from "if (err)" to "if (err < 0)", where needed.

rpc_sockname() lost "int buflen" parameter, since its only use was
to be passed to kernel_getsockname() as &buflen and subsequently
not used in any way.

Userspace API is not changed.

    text    data     bss      dec     hex filename
30108430 2633624  873672 33615726 200ef6e vmlinux.before.o
30108109 2633612  873672 33615393 200ee21 vmlinux.o

Signed-off-by: Denys Vlasenko 


Please do an allmodconfig build, there are still some conversions you
missed:

security/tomoyo/network.c: In function ‘tomoyo_socket_listen_permission’:
security/tomoyo/network.c:658:19: warning: passing argument 3 of 
‘sock->ops->getname’ makes integer from pointer without a cast 
[-Wint-conversion]
 &addr, &addr_len, 0);
^
security/tomoyo/network.c:658:19: note: expected ‘int’ but argument is of type 
‘int *’
security/tomoyo/network.c:657:21: error: too many arguments to function 
‘sock->ops->getname’
const int error = sock->ops->getname(sock, (struct sockaddr *)
  ^~~~
fs/dlm/lowcomms.c: In function ‘lowcomms_error_report’:
fs/dlm/lowcomms.c:495:6: error: too many arguments to function 
‘kernel_getpeername’
   kernel_getpeername(con->sock, (struct sockaddr *)&saddr, &buflen)) {
   ^~
fs/dlm/lowcomms.c: In function ‘tcp_accept_from_sock’:
fs/dlm/lowcomms.c:761:7: warning: passing argument 3 of ‘newsock->ops->getname’ 
makes integer from pointer without a cast [-Wint-conversion]
&len, 2)) {
^
fs/dlm/lowcomms.c:761:7: note: expected ‘int’ but argument is of type ‘int *’
fs/dlm/lowcomms.c:760:6: error: too many arguments to function 
‘newsock->ops->getname’
   if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr,
   ^~~


Sorry. Will send updated patch.
 


[PATCH] net: make getname() functions return length rather than use int* parameter

2018-02-12 Thread Denys Vlasenko
Before:
All these functions either return a negative error indicator,
or store length of sockaddr into "int *socklen" parameter
and return zero on success.

"int *socklen" parameter is awkward. For example, if caller does not
care, it still needs to provide on-stack storage for the value
it does not need.

None of the many FOO_getname() functions of various protocols
ever used old value of *socklen. They always just overwrite it.

This change drops this parameter, and makes all these functions, on success,
return length of sockaddr. It's always >= 0 and can be differentiated
from an error.

Tests in callers are changed from "if (err)" to "if (err < 0)", where needed.

rpc_sockname() lost "int buflen" parameter, since its only use was
to be passed to kernel_getsockname() as &buflen and subsequently
not used in any way.

Userspace API is not changed.

   text    data     bss      dec     hex filename
30108430 2633624  873672 33615726 200ef6e vmlinux.before.o
30108109 2633612  873672 33615393 200ee21 vmlinux.o

Signed-off-by: Denys Vlasenko 
CC: David S. Miller 
CC: linux-kernel@vger.kernel.org
CC: net...@vger.kernel.org
CC: linux-blueto...@vger.kernel.org
CC: linux-decnet-u...@lists.sourceforge.net
CC: linux-wirel...@vger.kernel.org
CC: linux-r...@vger.kernel.org
CC: linux-s...@vger.kernel.org
CC: linux-...@vger.kernel.org
CC: linux-...@vger.kernel.org
---
 drivers/isdn/mISDN/socket.c|  5 ++---
 drivers/net/ppp/pppoe.c|  6 ++
 drivers/net/ppp/pptp.c |  6 ++
 drivers/scsi/iscsi_tcp.c   | 14 +++---
 drivers/soc/qcom/qmi_interface.c   |  3 +--
 drivers/staging/ipx/af_ipx.c   |  6 ++
 drivers/staging/irda/net/af_irda.c |  8 +++-
 include/linux/net.h|  8 +++-
 include/net/inet_common.h  |  2 +-
 include/net/ipv6.h |  2 +-
 include/net/sock.h |  2 +-
 net/appletalk/ddp.c|  5 ++---
 net/atm/pvc.c  |  5 ++---
 net/atm/svc.c  |  5 ++---
 net/ax25/af_ax25.c |  4 ++--
 net/bluetooth/hci_sock.c   |  4 ++--
 net/bluetooth/l2cap_sock.c |  5 ++---
 net/bluetooth/rfcomm/sock.c|  5 ++---
 net/bluetooth/sco.c|  5 ++---
 net/can/raw.c  |  6 ++
 net/core/sock.c|  5 +++--
 net/decnet/af_decnet.c |  6 ++
 net/ipv4/af_inet.c |  5 ++---
 net/ipv6/af_inet6.c|  5 ++---
 net/iucv/af_iucv.c |  5 ++---
 net/l2tp/l2tp_ip.c |  5 ++---
 net/l2tp/l2tp_ip6.c|  5 ++---
 net/l2tp/l2tp_ppp.c|  5 ++---
 net/llc/af_llc.c   |  5 ++---
 net/netlink/af_netlink.c   |  5 ++---
 net/netrom/af_netrom.c |  9 +
 net/nfc/llcp_sock.c|  5 ++---
 net/packet/af_packet.c | 10 --
 net/phonet/socket.c|  5 ++---
 net/qrtr/qrtr.c|  5 ++---
 net/rds/af_rds.c   |  5 ++---
 net/rds/tcp.c  |  7 ++-
 net/rose/af_rose.c |  5 ++---
 net/sctp/ipv6.c|  8 
 net/smc/af_smc.c   |  7 +++
 net/socket.c   | 35 +--
 net/sunrpc/clnt.c  |  6 +++---
 net/sunrpc/svcsock.c   | 13 -
 net/sunrpc/xprtsock.c  |  3 +--
 net/tipc/socket.c  |  5 ++---
 net/unix/af_unix.c | 10 +-
 net/vmw_vsock/af_vsock.c   |  4 ++--
 net/x25/af_x25.c   |  4 ++--
 48 files changed, 132 insertions(+), 171 deletions(-)

diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c
index c5603d1a07d6..1f8f489b4167 100644
--- a/drivers/isdn/mISDN/socket.c
+++ b/drivers/isdn/mISDN/socket.c
@@ -560,7 +560,7 @@ data_sock_bind(struct socket *sock, struct sockaddr *addr, 
int addr_len)
 
 static int
 data_sock_getname(struct socket *sock, struct sockaddr *addr,
- int *addr_len, int peer)
+ int peer)
 {
struct sockaddr_mISDN   *maddr = (struct sockaddr_mISDN *) addr;
struct sock *sk = sock->sk;
@@ -570,14 +570,13 @@ data_sock_getname(struct socket *sock, struct sockaddr 
*addr,
 
lock_sock(sk);
 
-   *addr_len = sizeof(*maddr);
maddr->family = AF_ISDN;
maddr->dev = _pms(sk)->dev->id;
maddr->channel = _pms(sk)->ch.nr;
maddr->sapi = _pms(sk)->ch.addr & 0xff;
maddr->tei = (_pms(sk)->ch.addr >> 8) & 0xff;
release_sock(sk);
-   return 0;
+   return sizeof(*maddr);
 }
 
 static const struct proto_ops data_sock_ops = {
diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index 5aa59f41bf8c

Re: [tip:x86/pti] x86/entry/64: Introduce the PUSH_AND_CLEAN_REGS macro

2018-02-12 Thread Denys Vlasenko

On 02/12/2018 02:36 PM, David Laight wrote:

From: Denys Vlasenko

Sent: 12 February 2018 13:29

...


x86/entry/64: Introduce the PUSH_AND_CLEAN_REGS macro

Those instances where ALLOC_PT_GPREGS_ON_STACK is called just before
SAVE_AND_CLEAR_REGS can trivially be replaced by PUSH_AND_CLEAN_REGS.
This macro uses PUSH instead of MOV and should therefore be faster, at
least on newer CPUs.

...

Link: http://lkml.kernel.org/r/20180211104949.12992-5-li...@dominikbrodowski.net
Signed-off-by: Ingo Molnar 
---
   arch/x86/entry/calling.h  | 36 
   arch/x86/entry/entry_64.S |  6 ++
   2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index a05cbb8..57b1b87 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -137,6 +137,42 @@ For 32-bit we have the following conventions - kernel is 
built with
UNWIND_HINT_REGS offset=\offset
.endm

+   .macro PUSH_AND_CLEAR_REGS
+   /*
+* Push registers and sanitize registers of values that a
+* speculation attack might otherwise want to exploit. The
+* lower registers are likely clobbered well before they
+* could be put to use in a speculative execution gadget.
+* Interleave XOR with PUSH for better uop scheduling:
+*/
+   pushq   %rdi/* pt_regs->di */
+   pushq   %rsi/* pt_regs->si */
+   pushq   %rdx/* pt_regs->dx */
+   pushq   %rcx/* pt_regs->cx */
+   pushq   %rax/* pt_regs->ax */
+   pushq   %r8 /* pt_regs->r8 */
+   xorq%r8, %r8/* nospec   r8 */


xorq's are slower than xorl's on Silvermont/Knights Landing.
I propose using xorl instead.


Does using movq to copy the first zero to the other registers make
the code any faster?

ISTR mov reg-reg is often implemented as a register rename rather than an
alu operation.


xorl zeroing is handled in register rename as well. Just, for some reason,
xorq did not get the same treatment on those CPUs.
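
(For reference, a small user-space check - an illustration, assuming x86-64
and GCC/Clang inline asm - that the 32-bit form still clears the whole 64-bit
register, since 32-bit register writes zero-extend:)

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t r8_after;

	asm volatile("movq	$-1, %%r8\n\t"		/* r8 = 0xffffffffffffffff */
		     "xorl	%%r8d, %%r8d\n\t"	/* 32-bit write zero-extends */
		     "movq	%%r8, %0"
		     : "=r"(r8_after)
		     :
		     : "r8");
	printf("r8 after xorl %%r8d,%%r8d: %#llx\n",
	       (unsigned long long)r8_after);
	return 0;
}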


Re: [tip:x86/pti] x86/entry/64: Introduce the PUSH_AND_CLEAN_REGS macro

2018-02-12 Thread Denys Vlasenko

On 02/12/2018 11:17 AM, tip-bot for Dominik Brodowski wrote:

Commit-ID:  7b7b09f110f06f3c006e9b3f4590f7d9ba91345b
Gitweb: https://git.kernel.org/tip/7b7b09f110f06f3c006e9b3f4590f7d9ba91345b
Author: Dominik Brodowski 
AuthorDate: Sun, 11 Feb 2018 11:49:45 +0100
Committer:  Ingo Molnar 
CommitDate: Mon, 12 Feb 2018 08:06:36 +0100

x86/entry/64: Introduce the PUSH_AND_CLEAN_REGS macro

Those instances where ALLOC_PT_GPREGS_ON_STACK is called just before
SAVE_AND_CLEAR_REGS can trivially be replaced by PUSH_AND_CLEAN_REGS.
This macro uses PUSH instead of MOV and should therefore be faster, at
least on newer CPUs.

Suggested-by: Linus Torvalds 
Signed-off-by: Dominik Brodowski 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: dan.j.willi...@intel.com
Link: http://lkml.kernel.org/r/20180211104949.12992-5-li...@dominikbrodowski.net
Signed-off-by: Ingo Molnar 
---
  arch/x86/entry/calling.h  | 36 
  arch/x86/entry/entry_64.S |  6 ++
  2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index a05cbb8..57b1b87 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -137,6 +137,42 @@ For 32-bit we have the following conventions - kernel is 
built with
UNWIND_HINT_REGS offset=\offset
.endm
  
+	.macro PUSH_AND_CLEAR_REGS

+   /*
+* Push registers and sanitize registers of values that a
+* speculation attack might otherwise want to exploit. The
+* lower registers are likely clobbered well before they
+* could be put to use in a speculative execution gadget.
+* Interleave XOR with PUSH for better uop scheduling:
+*/
+   pushq   %rdi/* pt_regs->di */
+   pushq   %rsi/* pt_regs->si */
+   pushq   %rdx/* pt_regs->dx */
+   pushq   %rcx/* pt_regs->cx */
+   pushq   %rax/* pt_regs->ax */
+   pushq   %r8 /* pt_regs->r8 */
+   xorq%r8, %r8/* nospec   r8 */


xorq's are slower than xorl's on Silvermont/Knights Landing.
I propose using xorl instead.


+   pushq   %r9 /* pt_regs->r9 */
+   xorq%r9, %r9/* nospec   r9 */
+   pushq   %r10/* pt_regs->r10 */
+   xorq%r10, %r10  /* nospec   r10 */
+   pushq   %r11/* pt_regs->r11 */
+   xorq%r11, %r11  /* nospec   r11*/
+   pushq   %rbx/* pt_regs->rbx */
+   xorl%ebx, %ebx  /* nospec   rbx*/
+   pushq   %rbp/* pt_regs->rbp */
+   xorl%ebp, %ebp  /* nospec   rbp*/
+   pushq   %r12/* pt_regs->r12 */
+   xorq%r12, %r12  /* nospec   r12*/
+   pushq   %r13/* pt_regs->r13 */
+   xorq%r13, %r13  /* nospec   r13*/
+   pushq   %r14/* pt_regs->r14 */
+   xorq%r14, %r14  /* nospec   r14*/
+   pushq   %r15/* pt_regs->r15 */
+   xorq%r15, %r15  /* nospec   r15*/


Re: [PATCH 09/31] x86/entry/32: Leave the kernel via trampoline stack

2018-02-09 Thread Denys Vlasenko

On 02/09/2018 08:02 PM, Joerg Roedel wrote:

On Fri, Feb 09, 2018 at 09:05:02AM -0800, Linus Torvalds wrote:

On Fri, Feb 9, 2018 at 1:25 AM, Joerg Roedel  wrote:

+
+   /* Copy over the stack-frame */
+   cld
+   rep movsb


Ugh. This is going to be horrendous. Maybe not noticeable on modern
CPU's, but the whole 32-bit code is kind of pointless on a modern CPU.

At least use "rep movsl". If the kernel stack isn't 4-byte aligned,
you have issues.


Okay, I used movsb because I remembered that being the recommendation
for the most efficient memcpy, and it saves me an instruction. But that
is probably only true on modern CPUs.


It's fast (copies data with full-width loads and stores,
up to 64-byte wide on latest Intel CPUs), but this kicks in only for
largish blocks. In your case, you are copying less than 100 bytes.


Re: [PATCH 09/31] x86/entry/32: Leave the kernel via trampoline stack

2018-02-09 Thread Denys Vlasenko

On 02/09/2018 06:05 PM, Linus Torvalds wrote:

On Fri, Feb 9, 2018 at 1:25 AM, Joerg Roedel  wrote:

+
+   /* Copy over the stack-frame */
+   cld
+   rep movsb


Ugh. This is going to be horrendous. Maybe not noticeable on modern
CPU's, but the whole 32-bit code is kind of pointless on a modern CPU.

At least use "rep movsl". If the kernel stack isn't 4-byte aligned,
you have issues.


Indeed, "rep movs" has some setup overhead that makes it undesirable
for small sizes. In my testing, moving less than 128 bytes with "rep movs"
is a loss.


Re: [PATCH v2 09/18] x86/asm: Move SYSENTER_stack to the beginning of struct tss_struct

2017-11-23 Thread Denys Vlasenko
On Wed, Nov 22, 2017 at 5:44 AM, Andy Lutomirski  wrote:
> I want SYSENTER_stack to have reliable overflow detection, which
> means that it needs to be at the bottom of a page, not the top.
> Move it to the beginning of struct tss_struct and page-align it.
>
> Also add an assertion to make sure that the fixed hardware TSS
> doesn't cross a page boundary.
>
> Signed-off-by: Andy Lutomirski 
> ---
>  arch/x86/include/asm/processor.h | 21 -
>  arch/x86/kernel/cpu/common.c | 22 ++
>  2 files changed, 34 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/include/asm/processor.h 
> b/arch/x86/include/asm/processor.h
> index d32a3c88a968..8f5dac9dfbdc 100644
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -327,7 +327,16 @@ struct x86_hw_tss {
>
>  struct tss_struct {
> /*
> -* The hardware state:
> +* Space for the temporary SYSENTER stack, used for SYSENTER
> +* and the entry trampoline as well.
> +*/
> +   unsigned long   SYSENTER_stack_canary;
> +   unsigned long   SYSENTER_stack[64];
> +
> +   /*
> +* The fixed hardware portion.  This must not cross a page boundary
> +* at risk of violating the SDM's advice and potentially triggering
> +* errata.
>  */
> struct x86_hw_tss   x86_tss;
>
> @@ -338,15 +347,9 @@ struct tss_struct {
>  * be within the limit.
>  */
> unsigned long   io_bitmap[IO_BITMAP_LONGS + 1];
> +} __attribute__((__aligned__(PAGE_SIZE)));
>
> -   /*
> -* Space for the temporary SYSENTER stack.
> -*/
> -   unsigned long   SYSENTER_stack_canary;
> -   unsigned long   SYSENTER_stack[64];
> -} cacheline_aligned;

You may also move this initializer in process.c:

.SYSENTER_stack_canary  = STACK_END_MAGIC,


Re: [tip:x86/asm] x86/umip: Add emulation code for UMIP instructions

2017-11-08 Thread Denys Vlasenko

On 11/08/2017 06:14 PM, Paolo Bonzini wrote:

On 08/11/2017 18:09, Denys Vlasenko wrote:

On 11/08/2017 05:57 PM, Linus Torvalds wrote:

On Wed, Nov 8, 2017 at 8:53 AM, Denys Vlasenko 
wrote:

We can postpone enabling UMIP by default by a year or so.
By this time, new Wine will be on majority of users' machines.


So you are suggesting we run unnecessarily insecure, only in order to
not do the emulation that we already have the code for and that the
patch implements?


We ran insecure in this way for ~25 years.


Why?


To avoid having to maintain more obscure, rarely executed code.


As a start, you could propose a patch to disable the emulation code
through a sysctl or Kconfig symbol.


This way, the emulation code will still be in the kernel, and will still
need to be maintained.

In my opinion, if we do emulate these insns, then adding even more code
to disable that emulation is not worth doing.


I would be surprised if it takes
more time than what you've spent writing emails in this thread.


Yes, I'm not only f**ing retarded, I'm also lazy. Thanks guys...


Re: [tip:x86/asm] x86/umip: Add emulation code for UMIP instructions

2017-11-08 Thread Denys Vlasenko

On 11/08/2017 05:57 PM, Linus Torvalds wrote:

On Wed, Nov 8, 2017 at 8:53 AM, Denys Vlasenko  wrote:

We can postpone enabling UMIP by default by a year or so.
By this time, new Wine will be on majority of users' machines.


So you are suggesting we run unnecessarily insecure, only in order to
not do the emulation that we already have the code for and that the
patch implements?


We ran insecure in this way for ~25 years.


Why?


To avoid having to maintain more obscure, rarely executed code.


Re: [tip:x86/asm] x86/umip: Add emulation code for UMIP instructions

2017-11-08 Thread Denys Vlasenko

On 11/08/2017 05:34 PM, Linus Torvalds wrote:

On Wed, Nov 8, 2017 at 8:14 AM, Denys Vlasenko  wrote:


Can we avoid maintaining emulation of these insns by asking Wine to remove
their use instead?


If we ask the Wine people to remove the instruction use, that may mean
that we can avoid the emulation in four or five _years_ once everybody
has updated.

But it wouldn't mean that we could avoid it today.


We can postpone enabling UMIP by default by a year or so.
By this time, new Wine will be on majority of users' machines.

Then, when kernels switch to enable UMIP by default, umip=0
kernel flag can be used if somebody for some reason updates
their kernel but not Wine.

This is much less code, and simpler code, than implementing SIDT et al.
emulation.
Keep in mind that our SIDT emulation itself can turn out to be buggy;
in the worst case it may end up having worse holes than the
SIDT insn had - what if it can be tricked into writing to an
arbitrary kernel address?


Re: [tip:x86/asm] x86/umip: Add emulation code for UMIP instructions

2017-11-08 Thread Denys Vlasenko

On 11/08/2017 12:00 PM, tip-bot for Ricardo Neri wrote:

Commit-ID:  1e5db223696afa55e6a038fac638f759e1fdcc01
Gitweb: https://git.kernel.org/tip/1e5db223696afa55e6a038fac638f759e1fdcc01
Author: Ricardo Neri 
AuthorDate: Sun, 5 Nov 2017 18:27:52 -0800
Committer:  Ingo Molnar 
CommitDate: Wed, 8 Nov 2017 11:16:22 +0100

x86/umip: Add emulation code for UMIP instructions

The feature User-Mode Instruction Prevention present in recent Intel
processor prevents a group of instructions (sgdt, sidt, sldt, smsw, and
str) from being executed with CPL > 0. Otherwise, a general protection
fault is issued.


This was arguably an oversight on Intel's part - these insns should have been
protected from the start, as they leak a tiny bit of kernel data.


Rather than relaying to the user space the general protection fault caused
by the UMIP-protected instructions (in the form of a SIGSEGV signal), it
can be trapped and the instruction emulated to provide a dummy result.
This allows to both conserve the current kernel behavior and not reveal the
system resources that UMIP intends to protect (i.e., the locations of the
global descriptor and interrupt descriptor tables, the segment selectors of
the local descriptor table, the value of the task state register and the
contents of the CR0 register).

This emulation is needed because certain applications (e.g., WineHQ and
DOSEMU2) rely on this subset of instructions to function.


I'm surprised. What in the world do they need those insns for?

Wine uses sidt like this, to emulate "mov from r/m to reg" insns:

static LDT_ENTRY idt[256];
...
case 0x8a: /* mov Eb, Gb */
case 0x8b: /* mov Ev, Gv */
{
BYTE *data = INSTR_GetOperandAddr(context, instr + 1, long_addr,
  segprefix, &len);
unsigned int data_size = (*instr == 0x8b) ? (long_op ? 4 : 2) : 1;
struct idtr idtr = get_idtr(); <=== HERE
unsigned int offset = data - idtr.base;

if (offset <= idtr.limit + 1 - data_size)
{
idt[1].LimitLow = 0x100; /* FIXME */
idt[2].LimitLow = 0x11E; /* FIXME */
idt[3].LimitLow = 0x500; /* FIXME */

switch (*instr)
{
case 0x8a: store_reg_byte( context, instr[1], (BYTE *)idt + offset 
); break;
case 0x8b: store_reg_word( context, instr[1], (BYTE *)idt + offset, 
long_op ); break;
}
context->Eip += prefixlen + len + 1;
return ExceptionContinueExecution;
}
break;  /* Unable to emulate it */
}

Looks baffling, to say the least... this supports someone who reads
IDT bytes via those insns, and they need to ensure that the values read
from idt[1/2/3].LimitLow are as expected. That's it? Pity the git history
doesn't go back far enough, and the comments are not informative either...

I did not find smsw or sgdt in Wine git tree.

I did not find smsw, sidt or sgdt in dosemu2-devel git tree.

Can we avoid maintaining emulation of these insns by asking Wine to remove their
use instead?


Is GET_CR0_INTO_EAX macro unused?

2017-10-02 Thread Denys Vlasenko

Hi Andy,

From my git archaeology, it looks like the last use of GET_CR0_INTO_EAX
was removed long ago, in 2008 (see the commit below).

Right now, the only place I can grep it up is a comment in entry_32.S:
/*
 * We use macros for low-level operations which need to be overridden
 * for paravirtualization.  The following will never clobber any registers:
 *   INTERRUPT_RETURN (aka. "iret")
 *   GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")

To support GET_CR0_INTO_EAX, paravirt has PV_CPU_read_cr0 a.k.a.
struct pv_cpu_ops::read_cr0 and a bunch of code to support it
for each hypervisor.

Can we delete it?
Or am I missing a non-obvious place where this macro is still in use?



commit 7643e9b936b4af31ba4851eb7d5b3a3bfad52502
Author: Alexander van Heukelum 
Date:   Tue Sep 9 21:56:02 2008 +0200

i386: convert hardware exception 7 to an interrupt gate

Handle no coprocessor exception with interrupt initially off.

device_not_available in entry_32.S calls either math_state_restore
or math_emulate. This patch adds an extra indirection to be
able to re-enable interrupts explicitly in traps_32.c

Signed-off-by: Alexander van Heukelum 
Signed-off-by: Ingo Molnar 

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 109792b..5a88585 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -760,20 +760,9 @@ ENTRY(device_not_available)
RING0_INT_FRAME
pushl $-1   # mark this as an int
CFI_ADJUST_CFA_OFFSET 4
-   SAVE_ALL
-   GET_CR0_INTO_EAX
-   testl $0x4, %eax# EM (math emulation bit)
-   jne device_not_available_emulate
-   preempt_stop(CLBR_ANY)
-   call math_state_restore
-   jmp ret_from_exception
-device_not_available_emulate:
-   pushl $0# temporary storage for ORIG_EIP
+   pushl $do_device_not_available
CFI_ADJUST_CFA_OFFSET 4
-   call math_emulate
-   addl $4, %esp
-   CFI_ADJUST_CFA_OFFSET -4
-   jmp ret_from_exception
+   jmp error_code
CFI_ENDPROC
 END(device_not_available)


Re: [PATCH] KVM: SVM: refactor avic VM ID allocation

2017-08-18 Thread Denys Vlasenko

On 08/17/2017 04:33 PM, Paolo Bonzini wrote:

On 15/08/2017 22:12, Radim Krčmář wrote:

2017-08-11 22:11+0200, Denys Vlasenko:

With lightly tweaked defconfig:

   text    data     bss      dec     hex filename
11259661 5109408 2981888 19350957 12745ad vmlinux.before
11259661 5109408  884736 17253805 10745ad vmlinux.after

Only compile-tested.

Signed-off-by: Denys Vlasenko 
Cc: Joerg Roedel 
Cc: pbonz...@redhat.com
Cc: rkrc...@redhat.com
Cc: t...@linutronix.de
Cc: mi...@redhat.com
Cc: h...@zytor.com
Cc: x...@kernel.org
Cc: k...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---


Ah, I suspected my todo wasn't this short;  thanks for the patch!


@@ -1468,6 +1433,22 @@ static int avic_vm_init(struct kvm *kvm)
clear_page(page_address(l_page));

spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ again:
+   vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
+   if (vm_id == 0) { /* id is 1-based, zero is not okay */


Suravee, did the reserved zero mean something?


+   next_vm_id_wrapped = 1;
+   goto again;
+   }
+   /* Is it still in use? Only possible if wrapped at least once */
+   if (next_vm_id_wrapped) {
+   hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) {
+   struct kvm *k2 = container_of(ka, struct kvm, arch);
+   struct kvm_arch *vd2 = &k2->arch;
+   if (vd2->avic_vm_id == vm_id)
+   goto again;


Although hitting the case where all 2^24 ids are already used is
practically impossible, I don't like the loose end ...


I think even the case where 2^16 ids are used deserves a medal.  Why
don't we just make the bitmap 8 KiB and call it a day? :)


Well, RAM is cheap, but a 4-byte variable is still thousands of times
smaller than an 8K bitmap.

Since a 256-element hash table is used here, you need ~one hundred
VMs before you start seeing a (very small) degradation in the speed of creating
new VMs compared to the bitmap approach, and that only after 16777216 VMs
were created since reboot.

If you want to spend RAM on speeding this up, you can increase the hash table
size instead. That would speed up avic_ga_log_notifier() too.


[PATCH] lib/raid6: align AVX512 constants to 512 bits, not bytes

2017-08-12 Thread Denys Vlasenko
Signed-off-by: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: mi...@redhat.com
Cc: Jim Kukunas 
Cc: Fenghua Yu 
Cc: Megha Dey 
Cc: Gayatri Kammela 
Cc: Shaohua Li 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
avx512.o before:
Sections:
Idx Name  Size  VMA   LMA   File off  Algn
  4 .rodata   0240      0c00  2**9
  CONTENTS, ALLOC, LOAD, RELOC, READONLY, DATA
...
Contents of section .rodata:
       
 0010      
 0020 0100     
 0030      
 0040      
 0050      
 0060 0100     
 0070      
 0080      
 0090      
 00a0 0100     
 00b0      
 00c0      
 00d0      
 00e0      
 00f0      
 0100      
 0110      
 0120      
 0130      
 0140      
 0150      
 0160      
 0170      
 0180      
 0190      
 01a0      
 01b0      
 01c0      
 01d0      
 01e0      
 01f0      
 0200 1d1d1d1d 1d1d1d1d 1d1d1d1d 1d1d1d1d  
 0210 1d1d1d1d 1d1d1d1d 1d1d1d1d 1d1d1d1d  
 0220 1d1d1d1d 1d1d1d1d 1d1d1d1d 1d1d1d1d  
 0230 1d1d1d1d 1d1d1d1d 1d1d1d1d 1d1d1d1d  

avx512.o after:
Sections:
Idx Name  Size  VMA   LMA   File off  Algn
  4 .rodata   0100      0b40  2**6
  CONTENTS, ALLOC, LOAD, RELOC, READONLY, DATA
...
Contents of section .rodata:
       
 0010      
 0020 0100     
 0030      
 0040      
 0050      
 0060 0100     
 0070      
 0080      
 0090      
 00a0 0100     
 00b0      
 00c0 1d1d1d1d 1d1d1d1d 1d1d1d1d 1d1d1d1d  
 00d0 1d1d1d1d 1d1d1d1d 1d1d1d1d 1d1d1d1d  
 00e0 1d1d1d1d 1d1d1d1d 1d1d1d1d 1d1d1d1d  
 00f0 1d1d1d1d 1d1d1d1d 1d1d1d1d 1d1d1d1d  


 lib/raid6/avx512.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/raid6/avx512.c b/lib/raid6/avx512.c
index f524a79..46df797 100644
--- a/lib/raid6/avx512.c
+++ b/lib/raid6/avx512.c
@@ -29,7 +29,7 @@
 
 static const struct raid6_avx512_constants {
u64 x1d[8];
-} raid6_avx512_constants __aligned(512) = {
+} raid6_avx512_constants __aligned(512/8) = {
{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
-- 
2.9.2
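
For reference, a small user-space demonstration of the bytes-vs-bits
distinction the patch fixes (an illustration using the underlying GCC aligned
attribute rather than the kernel's __aligned macro):

#include <stdio.h>
#include <stdalign.h>

/* aligned(n) takes a byte count: 64 bytes == 512 bits. */
struct cst_ok  { unsigned long long x1d[8]; } __attribute__((aligned(512 / 8)));
struct cst_fat { unsigned long long x1d[8]; } __attribute__((aligned(512)));

int main(void)
{
	printf("aligned(512/8): alignof = %zu, sizeof = %zu\n",
	       alignof(struct cst_ok), sizeof(struct cst_ok));
	printf("aligned(512)  : alignof = %zu, sizeof = %zu\n",
	       alignof(struct cst_fat), sizeof(struct cst_fat));
	return 0;
}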



Re: [PATCH] KVM/x86: Increase max vcpu number to 352

2017-08-11 Thread Denys Vlasenko
On Thu, Aug 10, 2017 at 12:00 PM, Lan Tianyu  wrote:
> Intel Xeon phi chip will support 352 logical threads. For HPC usage
> case, it will create a huge VM with vcpu number as same as host cpus. This
> patch is to increase max vcpu number to 352.

This number was bumped in the past to 288 to accommodate Knights Landing,
but KNL's maximum designed thread count is actually 304: the on-die
interconnect mesh is 6*7, and with four cells taken by the interconnect
and memory controllers, there are 38 CPU cells.

Each CPU cell has two cores with shared L2.
Each core is SMT4. 38*8 = 304.

Intel fuses off two cells (or more), so 288 is the largest number of threads
on a KNL you can buy, but 304-thread KNLs most probably also exist
(however, they may be rather rare since they require a completely
defect-free die).

I think it's better if Linux would support those too.

What is the design maximum for these new "nominally 352 thread" Xeon Phis?
360? (If the mesh is 7*7 and the same 4 cells are taken for non-CPU needs,
that leaves 45 CPU cells, and 45*8 = 360.)


[PATCH] KVM: SVM: delete avic_vm_id_bitmap (2 megabyte static array)

2017-08-11 Thread Denys Vlasenko
With lightly tweaked defconfig:

   text    data     bss      dec     hex filename
11259661 5109408 2981888 19350957 12745ad vmlinux.before
11259661 5109408  884736 17253805 10745ad vmlinux.after

Only compile-tested.

Signed-off-by: Denys Vlasenko 
Cc: Joerg Roedel 
Cc: pbonz...@redhat.com
Cc: rkrc...@redhat.com
Cc: t...@linutronix.de
Cc: mi...@redhat.com
Cc: h...@zytor.com
Cc: x...@kernel.org
Cc: k...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/kvm/svm.c | 61 +++---
 1 file changed, 21 insertions(+), 40 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1107626..f575089 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -280,10 +280,6 @@ module_param(avic, int, S_IRUGO);
 static int vls = true;
 module_param(vls, int, 0444);
 
-/* AVIC VM ID bit masks and lock */
-static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR);
-static DEFINE_SPINLOCK(avic_vm_id_lock);
-
 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 static void svm_complete_interrupts(struct vcpu_svm *svm);
@@ -989,6 +985,8 @@ static void disable_nmi_singlestep(struct vcpu_svm *svm)
  */
 #define SVM_VM_DATA_HASH_BITS  8
 static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
+static u32 next_vm_id = 0;
+static bool next_vm_id_wrapped = 0;
 static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
 
 /* Note:
@@ -1387,34 +1385,6 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return 0;
 }
 
-static inline int avic_get_next_vm_id(void)
-{
-   int id;
-
-   spin_lock(&avic_vm_id_lock);
-
-   /* AVIC VM ID is one-based. */
-   id = find_next_zero_bit(avic_vm_id_bitmap, AVIC_VM_ID_NR, 1);
-   if (id <= AVIC_VM_ID_MASK)
-   __set_bit(id, avic_vm_id_bitmap);
-   else
-   id = -EAGAIN;
-
-   spin_unlock(&avic_vm_id_lock);
-   return id;
-}
-
-static inline int avic_free_vm_id(int id)
-{
-   if (id <= 0 || id > AVIC_VM_ID_MASK)
-   return -EINVAL;
-
-   spin_lock(&avic_vm_id_lock);
-   __clear_bit(id, avic_vm_id_bitmap);
-   spin_unlock(&avic_vm_id_lock);
-   return 0;
-}
-
 static void avic_vm_destroy(struct kvm *kvm)
 {
unsigned long flags;
@@ -1423,8 +1393,6 @@ static void avic_vm_destroy(struct kvm *kvm)
if (!avic)
return;
 
-   avic_free_vm_id(vm_data->avic_vm_id);
-
if (vm_data->avic_logical_id_table_page)
__free_page(vm_data->avic_logical_id_table_page);
if (vm_data->avic_physical_id_table_page)
@@ -1438,19 +1406,16 @@ static void avic_vm_destroy(struct kvm *kvm)
 static int avic_vm_init(struct kvm *kvm)
 {
unsigned long flags;
-   int vm_id, err = -ENOMEM;
+   int err = -ENOMEM;
struct kvm_arch *vm_data = &kvm->arch;
struct page *p_page;
struct page *l_page;
+   struct kvm_arch *ka;
+   u32 vm_id;
 
if (!avic)
return 0;
 
-   vm_id = avic_get_next_vm_id();
-   if (vm_id < 0)
-   return vm_id;
-   vm_data->avic_vm_id = (u32)vm_id;
-
/* Allocating physical APIC ID table (4KB) */
p_page = alloc_page(GFP_KERNEL);
if (!p_page)
@@ -1468,6 +1433,22 @@ static int avic_vm_init(struct kvm *kvm)
clear_page(page_address(l_page));
 
spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
+ again:
+   vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
+   if (vm_id == 0) { /* id is 1-based, zero is not okay */
+   next_vm_id_wrapped = 1;
+   goto again;
+   }
+   /* Is it still in use? Only possible if wrapped at least once */
+   if (next_vm_id_wrapped) {
+   hash_for_each_possible(svm_vm_data_hash, ka, hnode, vm_id) {
+   struct kvm *k2 = container_of(ka, struct kvm, arch);
+   struct kvm_arch *vd2 = &k2->arch;
+   if (vd2->avic_vm_id == vm_id)
+   goto again;
+   }
+   }
+   vm_data->avic_vm_id = vm_id;
hash_add(svm_vm_data_hash, &vm_data->hnode, vm_data->avic_vm_id);
spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
 
-- 
2.9.2



Re: [PATCH] selftests: ftrace: Use md5sum to take less time of checking logs

2017-06-30 Thread Denys Vlasenko
On Tue, Jun 27, 2017 at 12:28 PM, Masami Hiramatsu  wrote:
> Use md5sum so that it takes less time of checking
> trace logs update. Since busybox tail/cat takes too
> long time to read the trace log, this uses md5sum
> to check whether trace log is updated or not.
>
> Signed-off-by: Masami Hiramatsu 
> ---
>  .../test.d/ftrace/func_traceonoff_triggers.tc  |6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git 
> a/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc 
> b/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc
> index 9cf3852..7a9ab4f 100644
> --- a/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc
> +++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc
> @@ -103,11 +103,11 @@ if [ $on != "0" ]; then
>  fail "Tracing is not off"
>  fi
>
> -line1=`cat trace | tail -1`
> +csum1=`md5sum trace`
>  sleep $SLEEP_TIME
> -line2=`cat trace | tail -1`
> +csum2=`md5sum trace`

If you replace that with "tail -1 <trace" (no cat pipe), busybox tail can take a
much faster, seek-based code path. From busybox tail.c:

        if (current > 0) {
...
#if 1 /* This is technically incorrect for *LONG* strings, but very useful */
                /* Optimizing count-lines case if the file is seekable.
                 * We assume the lines are <64k.
                 * (Users complain that tail takes too long
                 * on multi-gigabyte files) */
                off = (count | 0xf); /* for small counts, be more paranoid */
                if (off > (INT_MAX / (64*1024)))
                        off = (INT_MAX / (64*1024));
                current -= off * (64*1024);
                if (current < 0)
                        current = 0;
                xlseek(fd, current, SEEK_SET);
#endif

Example: "tail -1 10GB_FILE" seeks back ~1 mbyte and looks for the last line:

lseek(3, 0, SEEK_END)   = 10991787403
lseek(3, 10990804363, SEEK_SET) = 10990804363
read(3, 
"\0\0020d\340\326\0\0\0\0020e\6\364\0\0\0\0020eMI\0\0\0\0020el\210\0\0"...,
8192) = 8192
read(3, 
"\0\0021\"p\321\0\0\0\0021\"\216\31\0\0\0\0021\"\254q\0\0\0\0021\"\367u\0\0"...,
8192) = 8192
read(3, 
"\0\0021\342\216\302\0\0\0\0021\342\2243\0\0\0\0021\342\266T\0\0\0\0021\342\326\v\0\0"...,
8192) = 8192
...
read(3, 
"\0\0\2\211\3272\211\0\0\0\2\211\3277\235\0\0\0\2\211\327mx\0\0\0\2\211\327\224\22\0"...,
11841) = 11841
read(3, 
"\0\2\212\357\366\306\0\0\0\2\212\360`\232\0\0\0\2\212\360w\262\0\0\0\2\212\360\354\365\0\0"...,
12512) = 12512
read(3, "\0\2\214uL\3\0\0\0\2\214unc\0\0\0\2\214w_\373\0\0\0\2\214wn\6\0\0"...,
11176) = 8424
read(3, "", 2752)   = 0


Re: [PATCH v2] net/sctp/ulpevent.c: Deinline sctp_ulpevent_set_owner, save 1616 bytes

2017-06-21 Thread Denys Vlasenko

On 06/21/2017 09:24 PM, Marcelo Ricardo Leitner wrote:

On Wed, Jun 21, 2017 at 07:03:27PM +0200, Denys Vlasenko wrote:

This function compiles to 147 bytes of machine code. 13 callsites.

I'm no expert on SCTP events, but quick reading of SCTP docs tells me that
SCTP events are not happening on every packet.
They are ASSOC_CHANGE, PEER_ADDR_CHANGE, REMOTE_ERROR and such.
Does not look performance critical.

Signed-off-by: Denys Vlasenko 
CC: Vlad Yasevich 
CC: Neil Horman 
CC: David Miller 
CC: linux-s...@vger.kernel.org
CC: net...@vger.kernel.org
CC: linux-kernel@vger.kernel.org


Acked-by: Marcelo Ricardo Leitner 

Just wondering, are you conducting further research on this topic?
Because we probably could use such review on SCTP stack.


Here is the list of sctp inlines which have any (however small)
benefit when deinlined.

filename:lineno:inline_name:lines_of_source_code:saved_bytes_by_deinlining:size_of_code_of_deinlined_function

include/net/sctp/command.h:228:sctp_add_cmd_sf:7:8306:38
net/sctp/ulpevent.c:91:sctp_ulpevent_set_owner:13:1616:147
include/net/sctp/sctp.h:414:sctp_skb_set_owner_r:10:934:75
net/sctp/input.c:840:sctp_hash_key:13:896:359
net/sctp/input.c:823:sctp_hash_obj:13:704:409
include/net/sctp/checksum.h:60:sctp_compute_cksum:13:595:85
net/sctp/input.c:800:sctp_hash_cmp:18:320:124
net/sctp/socket.c:117:sctp_wspace:19:256:76
include/net/sctp/sctp.h:272:sctp_max_rto:7:204:68
net/sctp/socket.c:173:sctp_verify_addr:15:192:72
include/net/sctp/checksum.h:46:sctp_csum_update:4:147:21
include/net/sctp/sctp.h:519:param_type2af:8:134:35
include/net/sctp/sctp.h:399:sctp_list_dequeue:7:123:59
include/net/sctp/sctp.h:596:sctp_transport_dst_check:4:120:60
include/net/sctp/sctp.h:435:sctp_frag_point:12:65:65
net/sctp/outqueue.c:82:sctp_outq_dequeue_data:10:64:87
include/net/sctp/command.h:243:sctp_next_cmd:4:64:37
include/net/sctp/sm.h:347:sctp_data_size:6:19:19

For .c files, the patches are trivial and I have them auto-generated,
I'll send them to you privately to save you the mechanical work.

Unfortunately, for inlines in .h files this does not work
(a human is needed to decide where to move the function).

Of course, not every deinlining makes sense.


Can avic_vm_id_bitmap be made smaller? 2 mbytes feels too large

2017-06-21 Thread Denys Vlasenko

#define AVIC_VM_ID_BITS24
#define AVIC_VM_ID_NR  (1 << AVIC_VM_ID_BITS)
...
static DECLARE_BITMAP(avic_vm_id_bitmap, AVIC_VM_ID_NR);

The above results in a data object which is 2 megabytes large.
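
A quick sketch of that size calculation (mine, not from the original mail):

#include <stdio.h>

#define AVIC_VM_ID_BITS 24
#define AVIC_VM_ID_NR   (1UL << AVIC_VM_ID_BITS)   /* 16777216 possible vm_ids */

int main(void)
{
        /* DECLARE_BITMAP() reserves one bit per possible vm_id */
        printf("%lu bytes\n", AVIC_VM_ID_NR / 8);   /* 2097152 bytes = 2 MiB */
        return 0;
}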

avic_vm_init() -> avic_get_next_vm_id() allocates a new vm_id
by looking for a free bit there. That's the whole purpose
for that bitmap existing.

Is there a way to do this less wastefully? Say, such as
iterating over all existing VMs and picking an id which is
not taken.

I imagine VM creation operation is not that frequent, no need
to make new vm_id selection super fast.


[PATCH] minix: Deinline get_block, save 2691 bytes

2017-06-21 Thread Denys Vlasenko
This function compiles to 1402 bytes of machine code.

It has 2 callsites, and also a not-inlined copy gets created by compiler
anyway since its address gets passed as a parameter to block_truncate_page().

Signed-off-by: Denys Vlasenko 
CC: Al Viro 
CC: linux-fsde...@vger.kernel.org
CC: linux-kernel@vger.kernel.org
---
 fs/minix/itree_common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c
index 4c57c9a..2325777 100644
--- a/fs/minix/itree_common.c
+++ b/fs/minix/itree_common.c
@@ -142,7 +142,7 @@ static inline int splice_branch(struct inode *inode,
return -EAGAIN;
 }
 
-static inline int get_block(struct inode * inode, sector_t block,
+static int get_block(struct inode * inode, sector_t block,
struct buffer_head *bh, int create)
 {
int err = -EIO;
-- 
1.8.3.1



[PATCH] kernel/sched/fair.c: Deinline update_load_avg, save 8720 bytes

2017-06-21 Thread Denys Vlasenko
This function compiles to 1378 bytes of machine code. 8 callsites.

Signed-off-by: Denys Vlasenko 
CC: Peter Zijlstra 
CC: linux-kernel@vger.kernel.org
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d711093..c4d6603 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3312,7 +3312,7 @@ static inline void cfs_rq_util_change(struct cfs_rq 
*cfs_rq)
 #define SKIP_AGE_LOAD  0x2
 
 /* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int flags)
+static void update_load_avg(struct sched_entity *se, int flags)
 {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq);
-- 
1.8.3.1



[PATCH v2] net/sctp/ulpevent.c: Deinline sctp_ulpevent_set_owner, save 1616 bytes

2017-06-21 Thread Denys Vlasenko
This function compiles to 147 bytes of machine code. 13 callsites.

I'm no expert on SCTP events, but quick reading of SCTP docs tells me that
SCTP events are not happening on every packet.
They are ASSOC_CHANGE, PEER_ADDR_CHANGE, REMOTE_ERROR and such.
Does not look performance critical.

Signed-off-by: Denys Vlasenko 
CC: Vlad Yasevich 
CC: Neil Horman 
CC: David Miller 
CC: linux-s...@vger.kernel.org
CC: net...@vger.kernel.org
CC: linux-kernel@vger.kernel.org
---
Changed since v1:
* reindented function argument list

 net/sctp/ulpevent.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index ec2b3e0..bc3f495 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -88,8 +88,8 @@ int sctp_ulpevent_is_notification(const struct sctp_ulpevent 
*event)
 /* Hold the association in case the msg_name needs read out of
  * the association.
  */
-static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event,
-  const struct sctp_association *asoc)
+static void sctp_ulpevent_set_owner(struct sctp_ulpevent *event,
+   const struct sctp_association *asoc)
 {
struct sctp_chunk *chunk = event->chunk;
struct sk_buff *skb;
-- 
1.8.3.1



[PATCH] net/sctp/ulpevent.c: Deinline sctp_ulpevent_set_owner, save 1616 bytes

2017-06-21 Thread Denys Vlasenko
This function compiles to 147 bytes of machine code. 13 callsites.

I'm no expert on SCTP events, but quick reading of SCTP docs tells me that
SCTP events are not happening on every packet.
They are ASSOC_CHANGE, PEER_ADDR_CHANGE, REMOTE_ERROR and such.
Does not look performance critical.

Signed-off-by: Denys Vlasenko 
CC: Vlad Yasevich 
CC: Neil Horman 
CC: David Miller 
CC: linux-s...@vger.kernel.org
CC: net...@vger.kernel.org
CC: linux-kernel@vger.kernel.org
---
 net/sctp/ulpevent.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index ec2b3e0..049aa5a 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -88,7 +88,7 @@ int sctp_ulpevent_is_notification(const struct sctp_ulpevent 
*event)
 /* Hold the association in case the msg_name needs read out of
  * the association.
  */
-static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event,
+static void sctp_ulpevent_set_owner(struct sctp_ulpevent *event,
   const struct sctp_association *asoc)
 {
struct sctp_chunk *chunk = event->chunk;
-- 
1.8.3.1



[PATCH] liquidio: stop using huge static buffer, save 4096k in .data

2017-06-19 Thread Denys Vlasenko
Only compile-tested - I don't have the hardware.

From code inspection, octeon_pci_write_core_mem() appears to be safe wrt
unaligned source. In any case, u8 fbuf[] was not guaranteed to be aligned
anyway.

Signed-off-by: Denys Vlasenko 
CC: Felix Manlunas 
CC: Prasad Kanneganti 
CC: Derek Chickles 
CC: David Miller 
CC: net...@vger.kernel.org
CC: linux-kernel@vger.kernel.org
---
 drivers/net/ethernet/cavium/liquidio/octeon_console.c | 6 +-
 drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c | 4 ++--
 drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.h | 2 +-
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_console.c 
b/drivers/net/ethernet/cavium/liquidio/octeon_console.c
index 53f38d0..e08f760 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_console.c
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_console.c
@@ -724,13 +724,11 @@ static int octeon_console_read(struct octeon_device *oct, 
u32 console_num,
 }
 
 #define FBUF_SIZE  (4 * 1024 * 1024)
-u8 fbuf[FBUF_SIZE];
 
 int octeon_download_firmware(struct octeon_device *oct, const u8 *data,
 size_t size)
 {
int ret = 0;
-   u8 *p = fbuf;
u32 crc32_result;
u64 load_addr;
u32 image_len;
@@ -805,10 +803,8 @@ int octeon_download_firmware(struct octeon_device *oct, 
const u8 *data,
else
size = FBUF_SIZE;
 
-   memcpy(p, data, size);
-
/* download the image */
-   octeon_pci_write_core_mem(oct, load_addr, p, (u32)size);
+   octeon_pci_write_core_mem(oct, load_addr, data, 
(u32)size);
 
data += size;
rem -= (u32)size;
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c 
b/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c
index 5cd96e7..4c85ae6 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.c
@@ -167,10 +167,10 @@ octeon_pci_read_core_mem(struct octeon_device *oct,
 void
 octeon_pci_write_core_mem(struct octeon_device *oct,
  u64 coreaddr,
- u8 *buf,
+ const u8 *buf,
  u32 len)
 {
-   __octeon_pci_rw_core_mem(oct, coreaddr, buf, len, 0);
+   __octeon_pci_rw_core_mem(oct, coreaddr, (u8 *)buf, len, 0);
 }
 
 u64 octeon_read_device_mem64(struct octeon_device *oct, u64 coreaddr)
diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.h 
b/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.h
index bae2fdd..47a3ff5 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.h
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_mem_ops.h
@@ -66,7 +66,7 @@ octeon_pci_read_core_mem(struct octeon_device *oct,
 void
 octeon_pci_write_core_mem(struct octeon_device *oct,
  u64 coreaddr,
- u8 *buf,
+ const u8 *buf,
  u32 len);
 
 #endif
-- 
2.9.2



Re: [PATCH v2] x86, msr: Document AMD "tweak MSRs", use MSR_FnnH_NAME scheme for them

2017-04-25 Thread Denys Vlasenko

On 04/25/2017 06:23 PM, Borislav Petkov wrote:

On Tue, Apr 25, 2017 at 06:15:23PM +0200, Denys Vlasenko wrote:

On 04/25/2017 06:06 PM, Borislav Petkov wrote:

Pls no. Not every MSR for every family. Only the 4 which are actually
being used. We can't hold in here the full 32-bit MSR space.


The replacement of four define names is not the purpose
of the proposed patch.

The patch was prompted by the realization that these particular MSRs
are so badly and inconsistently documented that it takes many hours
of work and requires reading of literally a dozen PDFs to figure out
what are their names, which CPUs have them, and what bits are known.


They're all documented in the respective BKDGs or revision guides.


Yes. For some definition of "documented".

Let's say you are looking at all available documentation for Fam10h CPUs -
BKDG, Revision Guide, five volumes of APM, software optimization guide.
Eight documents.

If you read all of them, you can find exactly one mention that
MSR 0xC0011029 exists. It is mentioned by number.

As a reader of this documentation, can you find out what is it?
Does it have a name, at least?

You are right that kernel is not exactly the best place to store more info
about such things, but AMD probably won't accept my edits to their
documentation.


Re: [PATCH v2] x86, msr: Document AMD "tweak MSRs", use MSR_FnnH_NAME scheme for them

2017-04-25 Thread Denys Vlasenko

On 04/25/2017 06:06 PM, Borislav Petkov wrote:

Pls no. Not every MSR for every family. Only the 4 which are actually
being used. We can't hold in here the full 32-bit MSR space.


The replacement of four define names is not the purpose
of the proposed patch.

The patch was prompted by the realization that these particular MSRs
are so badly and inconsistently documented that it takes many hours
of work and requires reading of literally a dozen PDFs to figure out
what are their names, which CPUs have them, and what bits are known.

Anyone who looks at only one document won't see the full picture.

Patch does not document bits, but at least documents all MSR names
and explains why documentation is so sparse.

If you think it's not useful, so be it.


[PATCH v2] x86, msr: Document AMD "tweak MSRs", use MSR_FnnH_NAME scheme for them

2017-04-25 Thread Denys Vlasenko
_1022 Data Cache Configuration (DC_CFG)
MSRC001_1023 Bus Unit Configuration Register (BU_CFG)
MSRC001_102A Bus Unit Configuration 2 (BU_CFG2)

11h_BKDG documents:
MSRC001_1022 Data Cache Configuration (DC_CFG)
MSRC001_1023 Bus Unit Configuration Register (BU_CFG)

12h_BKDG documents:
MSRC001_1020 Load-Store Configuration (LS_CFG)
MSRC001_1021 Instruction Cache Configuration (IC_CFG)
MSRC001_1022 Data Cache Configuration (DC_CFG)
MSRC001_1029 Decode Configuration (DE_CFG)
MSRC001_102A Combined Unit Configuration 2 (CU_CFG2) - name change since 10h

14h_Mod_00h-0Fh_BKDG documents only:
MSRC001_1020 Load-Store Configuration (LS_CFG)
MSRC001_1021 Instruction Cache Configuration (IC_CFG)
MSRC001_1022 Data Cache Configuration (DC_CFG)

15h_Mod_00h-0Fh_BKDG documents more:
MSRC001_1020 Load-Store Configuration (LS_CFG)
MSRC001_1021 Instruction Cache Configuration (IC_CFG)
MSRC001_1022 Data Cache Configuration (DC_CFG)
MSRC001_1023 Combined Unit Configuration (CU_CFG) - name change since 11h
MSRC001_1028 Floating Point Configuration (FP_CFG)
MSRC001_1029 Decode Configuration (DE_CFG)
MSRC001_102A Combined Unit Configuration 2 (CU_CFG2)
MSRC001_102B Combined Unit Configuration 3 (CU_CFG3)
MSRC001_102C Execution Unit Configuration (EX_CFG)
MSRC001_102D Load-Store Configuration 2 (LS_CFG2)

15h_Mod_10h-1Fh_BKDG: does not mention MSRC001_1029 and MSRC001_102C.

15h_Mod_30h-3Fh_BKDG: does not mention MSRC001_1029, MSRC001_102C
and MSRC001_102D, adds new one:
MSRC001_102F Prefetch Throttling Configuration (CU_PFTCFG)

15h_Mod_60h-6Fh_BKDG: also fails to mention MSRC001_1029, MSRC001_102C
and MSRC001_102D, but has new ones:
MSRC001_101C Load-Store Configuration 3 (LS_CFG3)
MSRC001_1090 Processor Feedback Constants 0
MSRC001_10A1 Contention Blocking Buffer Control (CU_CBBCFG)

MSRC001_1000 is only mentioned in 15h erratas, name unknown.

16h_Mod_00h-0Fh_BKDG: stuff disappeared or got renamed
(1023 and 102A are "Bus Unit Configuration" again):
MSRC001_1020 Load-Store Configuration (LS_CFG)
MSRC001_1021 Instruction Cache Configuration (IC_CFG)
MSRC001_1022 Data Cache Configuration (DC_CFG)
MSRC001_1023 Bus Unit Configuration (BU_CFG)
MSRC001_1028 Floating Point Configuration (FP_CFG) - all bits are "reserved"
MSRC001_102A Bus Unit Configuration 2 (BU_CFG2)

16h_Mod_30h-3Fh_BKDG: FP_CFG now has one documented field

CC: Ingo Molnar 
CC: Andy Lutomirski 
CC: Borislav Petkov 
CC: Brian Gerst 
CC: Peter Zijlstra 
CC: "H. Peter Anvin" 
CC: x...@kernel.org
CC: linux-kernel@vger.kernel.org
Signed-off-by: Denys Vlasenko 
---
 arch/x86/include/asm/msr-index.h | 63 +---
 arch/x86/kernel/cpu/amd.c| 10 +++
 arch/x86/kvm/svm.c   |  4 +--
 arch/x86/kvm/x86.c   |  4 +--
 4 files changed, 67 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index d8b5f8a..8f89dd3 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -293,9 +293,6 @@
 #define MSR_AMD64_PATCH_LOADER 0xc0010020
 #define MSR_AMD64_OSVW_ID_LENGTH   0xc0010140
 #define MSR_AMD64_OSVW_STATUS  0xc0010141
-#define MSR_AMD64_LS_CFG   0xc0011020
-#define MSR_AMD64_DC_CFG   0xc0011022
-#define MSR_AMD64_BU_CFG2  0xc001102a
 #define MSR_AMD64_IBSFETCHCTL  0xc0011030
 #define MSR_AMD64_IBSFETCHLINAD0xc0011031
 #define MSR_AMD64_IBSFETCHPHYSAD   0xc0011032
@@ -315,6 +312,65 @@
 #define MSR_AMD64_IBSOPDATA4   0xc001103d
 #define MSR_AMD64_IBS_REG_COUNT_MAX8 /* includes MSR_AMD64_IBSBRTARGET */
 
+/*
+ * MSRs in 0xc001101c-0xc001102f range are sparsely documented in BKDGs,
+ * but sometimes they can be found in errata documents.
+ * Registers 1020-1023 exist since K8 (mentioned in errata docs).
+ * Fam10h also has registers 1029, 102a (maybe more, not in docs).
+ * Fam15h BKDGs document registers 1028, 102b-102f, 101c, 1090, 10a1.
+ * Registers 1023 and 102a are called "Combined Unit Cfg" or "Bus Unit Cfg",
+ * depending on the CPU family.
+ */
+#define MSR_K8_LS_CFG  0xc0011020
+#define MSR_K8_IC_CFG  0xc0011021
+#define MSR_K8_DC_CFG  0xc0011022
+#define MSR_K8_BU_CFG  0xc0011023
+
+#define MSR_F10H_LS_CFG0xc0011020
+#define MSR_F10H_IC_CFG0xc0011021
+#define MSR_F10H_DC_CFG0xc0011022
+#define MSR_F10H_BU_CFG0xc0011023
+#define MSR_F10H_DE_CFG0xc0011029
+#define MSR_F10H_BU_CFG2   0xc001102a
+
+#define MSR_F12H_LS_CFG0xc0011020
+#define MSR_F12H_IC_CFG0xc0011021
+#define MSR_F12H_DC_CFG0xc0011022
+#define MSR_F12H_CU_CFG0xc0011023
+#define MSR_F12H_DE_CFG0xc0011029
+#define MSR

Re: [PATCH] x86, msr: Better document AMD "tweak MSRs", rename MSR_F15H_IC_CFG

2017-04-25 Thread Denys Vlasenko

On 04/25/2017 02:59 PM, Borislav Petkov wrote:

On Tue, Apr 25, 2017 at 02:16:55PM +0200, Denys Vlasenko wrote:

However, all IBS registers are in this range.


I knew you were gonna say that. But IBS registers are architectural too
in the sense that they are behind a CPUID bit.


DRi_ADDR_MASK are in this range - and these are very useful, likely to
stay.


Those are too behind a CPUID bit.


In the arch/x86/include/asm/msr-index.h file we already have
three "tweak" MSRs defined with "AMD64":

#define MSR_AMD64_LS_CFG   0xc0011020
#define MSR_AMD64_DC_CFG   0xc0011022
#define MSR_AMD64_BU_CFG2  0xc001102a

I just noticed that we have a fourth one in
arch/x86/kernel/cpu/amd.c:

#define MSR_AMD64_DE_CFG   0xC0011029


That's wrong. I think we should call those something else but not
"AMD64".


Okay. Propose a naming scheme for these which looks good to you.



Perhaps the families for which the workaround is being applied.
In the last case, MSR_F12H_DE_CFG, for example. And yes, I should've
paid attention to that but ...


A bit problematic:

MSR C001_1020 is used (mentioned in Rev Guides as a possible way
to work around an errata) by all Fams starting from K8, except Fam15h.

MSR C001_1022 is used by K8, 10h, 15h.

Etc...


Re: [PATCH] x86, msr: Better document AMD "tweak MSRs", rename MSR_F15H_IC_CFG

2017-04-25 Thread Denys Vlasenko

On 04/25/2017 01:59 PM, Borislav Petkov wrote:

On Tue, Apr 25, 2017 at 01:45:41PM +0200, Denys Vlasenko wrote:

Before this patch, we have a define for MSR 0xc0011021: MSR_F15H_IC_CFG.
But it exists starting from K8, so it's not really a Fam15h MSR only.

Let's call it MSR_AMD64_IC_CFG.


Except that we name only those MSRs with "AMD64" which are
architectural. See "Appendix A MSR Cross-Reference" in APM vol 2.


Yes, APM vol 2 has none of c001_1xxx MSRs.

However, all IBS registers are in this range. DRi_ADDR_MASK
are in this range - and these are very useful, likely to stay.

In the arch/x86/include/asm/msr-index.h file we already have
three "tweak" MSRs defined with "AMD64":

#define MSR_AMD64_LS_CFG   0xc0011020
#define MSR_AMD64_DC_CFG   0xc0011022
#define MSR_AMD64_BU_CFG2  0xc001102a

I just noticed that we have a fourth one in
arch/x86/kernel/cpu/amd.c:

#define MSR_AMD64_DE_CFG   0xC0011029


[PATCH] x86, msr: Better document AMD "tweak MSRs", rename MSR_F15H_IC_CFG

2017-04-25 Thread Denys Vlasenko
G2)

11h_BKDG documents:
MSRC001_1022 Data Cache Configuration (DC_CFG)
MSRC001_1023 Bus Unit Configuration Register (BU_CFG)

12h_BKDG documents:
MSRC001_1020 Load-Store Configuration (LS_CFG)
MSRC001_1021 Instruction Cache Configuration (IC_CFG)
MSRC001_1022 Data Cache Configuration (DC_CFG)
MSRC001_1029 Decode Configuration (DE_CFG)
MSRC001_102A Combined Unit Configuration 2 (CU_CFG2) - name change since 10h

14h_Mod_00h-0Fh_BKDG documents only:
MSRC001_1020 Load-Store Configuration (LS_CFG)
MSRC001_1021 Instruction Cache Configuration (IC_CFG)
MSRC001_1022 Data Cache Configuration (DC_CFG)

15h_Mod_00h-0Fh_BKDG documents more:
MSRC001_1020 Load-Store Configuration (LS_CFG)
MSRC001_1021 Instruction Cache Configuration (IC_CFG)
MSRC001_1022 Data Cache Configuration (DC_CFG)
MSRC001_1023 Combined Unit Configuration (CU_CFG) - name change since 11h
MSRC001_1028 Floating Point Configuration (FP_CFG)
MSRC001_1029 Decode Configuration (DE_CFG)
MSRC001_102A Combined Unit Configuration 2 (CU_CFG2)
MSRC001_102B Combined Unit Configuration 3 (CU_CFG3)
MSRC001_102C Execution Unit Configuration (EX_CFG)
MSRC001_102D Load-Store Configuration 2 (LS_CFG2)

15h_Mod_10h-1Fh_BKDG: does not mention MSRC001_1029 and MSRC001_102C.

15h_Mod_30h-3Fh_BKDG: does not mention MSRC001_1029, MSRC001_102C
and MSRC001_102D, adds new one:
MSRC001_102F Prefetch Throttling Configuration (CU_PFTCFG)

15h_Mod_60h-6Fh_BKDG: also fails to mention MSRC001_1029, MSRC001_102C
and MSRC001_102D, but has new ones:
MSRC001_101C Load-Store Configuration 3 (LS_CFG3)
MSRC001_1090 Processor Feedback Constants 0
MSRC001_10A1 Contention Blocking Buffer Control (CU_CBBCFG)

16h_Mod_00h-0Fh_BKDG: stuff disappeared or got renamed
(1023 and 102A are "Bus Unit Configuration" again):
MSRC001_1020 Load-Store Configuration (LS_CFG)
MSRC001_1021 Instruction Cache Configuration (IC_CFG)
MSRC001_1022 Data Cache Configuration (DC_CFG)
MSRC001_1023 Bus Unit Configuration (BU_CFG)
MSRC001_1028 Floating Point Configuration (FP_CFG) - all bits are "reserved"
MSRC001_102A Bus Unit Configuration 2 (BU_CFG2)

16h_Mod_30h-3Fh_BKDG: FP_CFG now has one documented field

CC: Ingo Molnar 
CC: Andy Lutomirski 
CC: Borislav Petkov 
CC: Brian Gerst 
CC: Peter Zijlstra 
CC: "H. Peter Anvin" 
CC: x...@kernel.org
CC: linux-kernel@vger.kernel.org
Signed-off-by: Denys Vlasenko 
---
 arch/x86/include/asm/msr-index.h | 20 +++-
 arch/x86/kernel/cpu/amd.c|  4 ++--
 arch/x86/kvm/svm.c   |  2 +-
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index d8b5f8a..4195681 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -293,9 +293,28 @@
 #define MSR_AMD64_PATCH_LOADER 0xc0010020
 #define MSR_AMD64_OSVW_ID_LENGTH   0xc0010140
 #define MSR_AMD64_OSVW_STATUS  0xc0010141
+/*
+ * MSRs in 0xc001101c-0xc001102f range are sparsely documented in BKDGs,
+ * but sometimes they can be found in errata documents.
+ * Registers 1020..1023 exist since K8. Fam10h and later have regs 1028..102d.
+ * Fam15h BKDGs document registers 102f, 101c, 1090, 10a1.
+ * Registers 1023 and 102a are called "Combined Unit Cfg" in Fam12h-15h,
+ * but "Bus Unit Cfg" in K8, Fam10h, 11h and 16h. Other "CU_" registers
+ * might also be "BU_" (did not find them in docs for these families).
+ */
+#define MSR_AMD64_LS_CFG3  0xc001101c
 #define MSR_AMD64_LS_CFG   0xc0011020
+#define MSR_AMD64_IC_CFG   0xc0011021
 #define MSR_AMD64_DC_CFG   0xc0011022
+#define MSR_AMD64_BU_CFG   0xc0011023
+#define MSR_AMD64_FP_CFG   0xc0011028
+#define MSR_AMD64_DE_CFG   0xc0011029
 #define MSR_AMD64_BU_CFG2  0xc001102a
+#define MSR_AMD64_CU_CFG3  0xc001102b
+#define MSR_AMD64_EX_CFG   0xc001102c
+#define MSR_AMD64_LS_CFG2  0xc001102d
+#define MSR_AMD64_CU_PFTCFG0xc001102f
+
 #define MSR_AMD64_IBSFETCHCTL  0xc0011030
 #define MSR_AMD64_IBSFETCHLINAD0xc0011031
 #define MSR_AMD64_IBSFETCHPHYSAD   0xc0011032
@@ -332,7 +351,6 @@
 #define MSR_F15H_NB_PERF_CTL   0xc0010240
 #define MSR_F15H_NB_PERF_CTR   0xc0010241
 #define MSR_F15H_PTSC  0xc0010280
-#define MSR_F15H_IC_CFG0xc0011021
 
 /* Fam 10h MSRs */
 #define MSR_FAM10H_MMIO_CONF_BASE  0xc0010058
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index c36140d..7d4a5bf 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -720,9 +720,9 @@ static void init_amd_bd(struct cpuinfo_x86 *c)
 * Disable it on the affected CPUs.
 */
if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
-   if (!rdmsrl_safe(MSR_F15H_IC_CFG, &value) && !(value

Re: [tip:x86/mm] x86/asm: Remove __VIRTUAL_MASK_SHIFT==47 assert

2017-04-05 Thread Denys Vlasenko



On 04/05/2017 01:12 PM, Kirill A. Shutemov wrote:

On Tue, Apr 04, 2017 at 05:36:33PM +0200, Denys Vlasenko wrote:

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 044d18e..f07b4ef 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -265,12 +265,9 @@ return_from_SYSCALL_64:
 *
 * If width of "canonical tail" ever becomes variable, this will need
 * to be updated to remain correct on both old and new CPUs.
+*
+* Change top 16 bits to be the sign-extension of 47th bit


The comment above stops being correct: it's not necessarily the top 16 bits
we sign-extend now. With a larger __VIRTUAL_MASK_SHIFT for 5-level translation,
it will become 7 bits (if I do the math right).


Does the patch below look okay to you?


 */
-   .ifne __VIRTUAL_MASK_SHIFT - 47
-   .error "virtual address width changed -- SYSRET checks need update"
-   .endif
-
-   /* Change top 16 bits to be the sign-extension of 47th bit */
shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx


The bigger problem here would be the future boot-time choice of 4/5-level
page tables: __VIRTUAL_MASK_SHIFT will need to depend on that choice,
but in this location it is preferable to not use any variables
(memory references).


Yeah. Will see what I will be able to come up with. Not sure yet.

---8<--

From 2433cf4f8847bbc41cc2b02d6af4f191b3b5a0c5 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" 
Date: Wed, 5 Apr 2017 14:06:15 +0300
Subject: [PATCH] x86/asm: Fix comment in return_from_SYSCALL_64

On x86-64 __VIRTUAL_MASK_SHIFT depends on paging mode now.

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/entry/entry_64.S | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 607d72c4a485..c70e064d9592 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -266,7 +266,8 @@ return_from_SYSCALL_64:
 * If width of "canonical tail" ever becomes variable, this will need
 * to be updated to remain correct on both old and new CPUs.
 *
-* Change top 16 bits to be the sign-extension of 47th bit
+* Change top bits to match most significant valuable bit (47 or 56
+* depending on paging mode) in the address.


Er "Change top bits ... ((47 or 56 [bits] depending on paging mode)"?
I know that's wrong and that's not what you meant to say,
but it can be read this way too. "47th" instead of "47"
would eliminate this reading, but you removed "th".

Spell it out to eliminate any chance of confusion:

Change top bits to match most significant bit (47th or 56th bit
depending on paging mode) in the address.



 */
shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
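
For reference, a user-space rendition of the SHL/SAR trick (my sketch, not from
the thread) showing the shift widths for 47-bit and 56-bit virtual addresses:

#include <stdint.h>
#include <stdio.h>

/* same idea as the SHL/SAR pair above: propagate the top valid VA bit upward */
static uint64_t sign_extend_va(uint64_t addr, int va_bits)
{
        int shift = 64 - (va_bits + 1);   /* 16 for 47-bit VA, 7 for 56-bit VA */
        /* relies on arithmetic right shift of signed values, like SAR does */
        return (uint64_t)((int64_t)(addr << shift) >> shift);
}

int main(void)
{
        printf("shift: %d (4-level), %d (5-level)\n", 64 - (47 + 1), 64 - (56 + 1));
        printf("%#llx\n", (unsigned long long)sign_extend_va(0x0000800000000000ULL, 47));
        /* prints 0xffff800000000000: bit 47 was copied into the top 16 bits */
        return 0;
}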



Re: [tip:x86/mm] x86/asm: Remove __VIRTUAL_MASK_SHIFT==47 assert

2017-04-04 Thread Denys Vlasenko

On 04/04/2017 10:29 AM, tip-bot for Kirill A. Shutemov wrote:

Commit-ID:  361b4b58ec4cf123e12a773909c6454dbd5e6dbc
Gitweb: http://git.kernel.org/tip/361b4b58ec4cf123e12a773909c6454dbd5e6dbc
Author: Kirill A. Shutemov 
AuthorDate: Thu, 30 Mar 2017 11:07:26 +0300
Committer:  Ingo Molnar 
CommitDate: Tue, 4 Apr 2017 08:22:33 +0200

x86/asm: Remove __VIRTUAL_MASK_SHIFT==47 assert

We don't need the assert anymore, as:

  17be0aec74fb ("x86/asm/entry/64: Implement better check for canonical 
addresses")

made canonical address checks generic wrt. address width.

Signed-off-by: Kirill A. Shutemov 
Cc: Andrew Morton 
Cc: Andy Lutomirski 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Dave Hansen 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: linux-a...@vger.kernel.org
Cc: linux...@kvack.org
Link: 
http://lkml.kernel.org/r/20170330080731.65421-3-kirill.shute...@linux.intel.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/entry/entry_64.S | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 044d18e..f07b4ef 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -265,12 +265,9 @@ return_from_SYSCALL_64:
 *
 * If width of "canonical tail" ever becomes variable, this will need
 * to be updated to remain correct on both old and new CPUs.
+*
+* Change top 16 bits to be the sign-extension of 47th bit


The comment above stops being correct: it's not necessarily the top 16 bits
we sign-extend now. With a larger __VIRTUAL_MASK_SHIFT for 5-level translation,
it will become 7 bits (if I do the math right).


 */
-   .ifne __VIRTUAL_MASK_SHIFT - 47
-   .error "virtual address width changed -- SYSRET checks need update"
-   .endif
-
-   /* Change top 16 bits to be the sign-extension of 47th bit */
shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx


The bigger problem here would be the future boot-time choice of 4/5-level
page tables: __VIRTUAL_MASK_SHIFT will need to depend on that choice,
but in this location it is preferable to not use any variables
(memory references).

--
vda


Re: random: /dev/random often returns short reads

2017-02-15 Thread Denys Vlasenko
On Wed, Jan 18, 2017 at 7:07 PM, Theodore Ts'o  wrote:
> On Wed, Jan 18, 2017 at 04:44:54PM +0100, Denys Vlasenko wrote:
>> In this case, what the code does is return fewer bytes,
>> even though *it has enough random bytes to return the full request*.
>>
>> This happens because the patch which added the more conservative
>> accounting, while technically correct per se,
>> forgot to take into account another part of the code
>> which was relying on the previous, simplistic logic
>> "if we add N random bytes to a pool, now it has +N random bytes".
>>
>> In my opinion, it should have changed that part of the code simultaneously
>> with introducing the more conservative accounting.
>
> In the ideal world, yes.  I've acknowledged this is a bug, in the "be
> conservative in what you send, liberal in what you receive" sense..
> But no one complained for three year, and userspace needs to be able
> to retry short reads instead of immediately erroring out.
>
> The problem is changing that code to figure out exactly how many bytes
> you need to get in order to have N random bytes is non-trivial.  So
> our choices are:
>
> 1) Transfer more bytes than might be needed to the secondary pool
...
> 2) Transfer bytes without using the conservative entropy "overwrite"
> calculation if the blocking pool is mostly empty.  This means we will
> be over-estimating the entropy in that pool, which is unfortunate.
> One could argue that all of the cases where people have been whining
> about this, they are using HAVEGED which is providing pretend entropy
> based on the presumed unpredictability of Intel cache timing, so
> careful entropy calculations is kind of silly anyway.  However, there
> might be some people who really are trying to do careful entropy
> measurement, so compromising this isn't really a great idea.
>
> I'm leaning a bit towards 1 if we have to do something (which is my
> proposed, untested patch).

I spent quite some time looking both at your patch which implements #1
and at commit 30e37ec516ae5a6957596de7661673c615c82ea4, which introduced
the "conservative accounting" code, and the same thought keeps returning to me:
this complexity and these problems are self-inflicted by
commit 30e37ec516ae5a6957596de7661673c615c82ea4.

The code is already conservative elsewhere. Adding more conservative code -
and more complex code, especially considering that it should be even more
complex than it is, since it would also need fixing up at the
"xfer_secondary_pool(r, nbytes)" location -
does not look like a worthy improvement to me.

I propose to simply revert 30e37ec516ae5a6957596de7661673c615c82ea4.

If you worry about this accounting more than I do,
how about dealing with it in a simpler way, a-la

-   xfer_secondary_pool(r, nbytes);
+   xfer_secondary_pool(r, nbytes * 5/4); /* be a bit paranoid */


Re: [PATCH] x86/crypto: make constants readonly, allow linker to merge them

2017-01-23 Thread Denys Vlasenko

On 01/20/2017 12:09 AM, Thomas Gleixner wrote:

On Thu, 19 Jan 2017, Denys Vlasenko wrote:


A lot of asm-optimized routines in arch/x86/crypto/ keep their
constants in .data. This is wrong; they should be in .rodata.

Many of these constants are the same in different modules.
For example, 128-bit shuffle mask 0x000102030405060708090A0B0C0D0E0F
exists in at least half a dozen places.

There is a way to let linker merge them and use just one copy.
The rules are as follows: mergeable objects of different sizes
should not share sections. You can't put them all in one .rodata
section, they will lose "mergeability".

GCC puts its mergeable constants in ".rodata.cstSIZE" sections,
or ".rodata.cstSIZE." if -fdata-sections is used.
This patch does the same:

.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16

It is important that all data in such section consists of
16-byte elements, not larger ones, and there are no implicit
use of one element from another.

When this is not the case, use non-mergeable section:

.section .rodata[.VAR_NAME], "a", @progbits

This reduces .data by ~15 kbytes:

   text    data     bss      dec     hex filename
11097415 2705840 2630712 16433967  fac32f vmlinux-prev.o
11112095 2690672 2630712 16433479  fac147 vmlinux.o


And at the same time it increases text size by ~15k. The overall change in
total size is 488 byte reduction. Weird.


Of course: the constants do need to go somewhere. They migrate to
.rodata.blabla sections, which eventually get appended to the read-only .text.

Without merging, if I had just moved the constants to .rodata,
there would be no net size win at all.

I did not look deeper into why the overall change is smaller than expected.
It may be affected by changed padding. The linker is not too clever
about it, and also IIRC we aren't actually giving it the best options
to sort sections at link time to pack them better.


Re: random: /dev/random often returns short reads

2017-01-19 Thread Denys Vlasenko
On Wed, Jan 18, 2017 at 7:07 PM, Theodore Ts'o  wrote:
> In the ideal world, yes.  I've acknowledged this is a bug, in the "be
> conservative in what you send, liberal in what you receive" sense..
> But no one complained for three year, and userspace needs to be able
> to retry short reads instead of immediately erroring out.
>
> The problem is changing that code to figure out exactly how many bytes
> you need to get in order to have N random bytes is non-trivial.  So
> our choices are:
>
> 1) Transfer more bytes than might be needed to the secondary pool,
> which results in resource stranding --- since entropy in the secondary
> pool isn't available for reseeding the CRNG.  OTOH, given that we're
> now using the CRNG solution, and we're only reseeding every five
> minutes, I'm not actually all that worried about stranding some extra
> entropy bits in the blocking pool, since that's only going to happen
> if we have people *using* the /dev/random pool, and so that entropy
> will likely be used eventually anyway
...
...
> I'm leaning a bit towards 1 if we have to do something (which is my
> proposed, untested patch).

Thanks, this solution is okay for me.


[PATCH] x86/crypto: make constants readonly, allow linker to merge them

2017-01-19 Thread Denys Vlasenko
A lot of asm-optimized routines in arch/x86/crypto/ keep their
constants in .data. This is wrong; they should be in .rodata.

Many of these constants are the same in different modules.
For example, 128-bit shuffle mask 0x000102030405060708090A0B0C0D0E0F
exists in at least half a dozen places.

There is a way to let linker merge them and use just one copy.
The rules are as follows: mergeable objects of different sizes
should not share sections. You can't put them all in one .rodata
section, they will lose "mergeability".

GCC puts its mergeable constants in ".rodata.cstSIZE" sections,
or ".rodata.cstSIZE." if -fdata-sections is used.
This patch does the same:

.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16

It is important that all data in such section consists of
16-byte elements, not larger ones, and there are no implicit
use of one element from another.

When this is not the case, use non-mergeable section:

.section .rodata[.VAR_NAME], "a", @progbits

This reduces .data by ~15 kbytes:

   text    data     bss      dec     hex filename
11097415 2705840 2630712 16433967  fac32f vmlinux-prev.o
11112095 2690672 2630712 16433479  fac147 vmlinux.o

Merged objects are visible in System.map:

81a28810 r POLY
81a28810 r POLY
81a28820 r TWOONE
81a28820 r TWOONE
81a28830 r PSHUFFLE_BYTE_FLIP_MASK <- merged regardless of
81a28830 r SHUF_MASK   <- the name difference
81a28830 r SHUF_MASK
81a28830 r SHUF_MASK
..
81a28d00 r K512 <- merged three identical 640-byte tables
81a28d00 r K512
81a28d00 r K512

Use of object names in section name suffixes is not strictly necessary,
but might help if someday link stage will use garbage collection
to eliminate unused sections (ld --gc-sections).

Signed-off-by: Denys Vlasenko 
CC: Herbert Xu 
CC: Josh Poimboeuf 
CC: Xiaodong Liu 
CC: Megha Dey 
CC: linux-cry...@vger.kernel.org
CC: x...@kernel.org
CC: linux-kernel@vger.kernel.org
---
 arch/x86/crypto/aesni-intel_asm.S  | 37 +-
 arch/x86/crypto/aesni-intel_avx-x86_64.S   | 32 ++-
 arch/x86/crypto/camellia-aesni-avx-asm_64.S|  5 ++-
 arch/x86/crypto/camellia-aesni-avx2-asm_64.S   | 12 +--
 arch/x86/crypto/cast5-avx-x86_64-asm_64.S  | 14 ++--
 arch/x86/crypto/cast6-avx-x86_64-asm_64.S  | 12 +--
 arch/x86/crypto/chacha20-avx2-x86_64.S |  9 --
 arch/x86/crypto/chacha20-ssse3-x86_64.S|  7 ++--
 arch/x86/crypto/crct10dif-pcl-asm_64.S | 14 ++--
 arch/x86/crypto/des3_ede-asm_64.S  |  2 +-
 arch/x86/crypto/ghash-clmulni-intel_asm.S  |  3 +-
 arch/x86/crypto/poly1305-avx2-x86_64.S |  6 ++--
 arch/x86/crypto/poly1305-sse2-x86_64.S |  6 ++--
 arch/x86/crypto/serpent-avx-x86_64-asm_64.S|  5 +--
 arch/x86/crypto/serpent-avx2-asm_64.S  |  9 --
 arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S   |  6 ++--
 arch/x86/crypto/sha1-mb/sha1_mb_mgr_submit_avx2.S  |  3 +-
 arch/x86/crypto/sha1-mb/sha1_x8_avx2.S | 15 +++--
 arch/x86/crypto/sha1_ni_asm.S  |  8 +++--
 arch/x86/crypto/sha256-avx-asm.S   |  9 +-
 arch/x86/crypto/sha256-avx2-asm.S  |  9 +-
 .../crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S|  6 ++--
 .../crypto/sha256-mb/sha256_mb_mgr_submit_avx2.S   |  3 +-
 arch/x86/crypto/sha256-mb/sha256_x8_avx2.S |  7 +++-
 arch/x86/crypto/sha256-ssse3-asm.S |  8 -
 arch/x86/crypto/sha256_ni_asm.S|  4 ++-
 arch/x86/crypto/sha512-avx-asm.S   |  9 --
 arch/x86/crypto/sha512-avx2-asm.S  | 10 --
 .../crypto/sha512-mb/sha512_mb_mgr_flush_avx2.S| 10 --
 .../crypto/sha512-mb/sha512_mb_mgr_submit_avx2.S   |  4 ++-
 arch/x86/crypto/sha512-mb/sha512_x4_avx2.S |  4 ++-
 arch/x86/crypto/sha512-ssse3-asm.S |  9 --
 arch/x86/crypto/twofish-avx-x86_64-asm_64.S|  6 ++--
 33 files changed, 229 insertions(+), 74 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S 
b/arch/x86/crypto/aesni-intel_asm.S
index 383a6f8..3c46518 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -46,28 +46,49 @@
 
 #ifdef __x86_64__
 
-.data
+# constants in mergeable sections, linker can reorder and merge
+.section   .rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
 .align 16
 .Lgf128mul_x_ble_mask:
.octa 0x00010087
+.section   .rodata.cst16.POLY, "aM", @progbits, 16
+.align 16
 POLY:   .octa 0xC201
+.section   .rodata.cst16.TWOONE, "aM", @progbits, 16
+.align 16
 TWOONE: .octa 0x00010001
 
-# order of these constan

[PATCH] x86/crypto: fix %progbits -> @progbits

2017-01-19 Thread Denys Vlasenko
%progbits form is used on ARM (where @ is a comment char).

x86 consistently uses @progbits everywhere else.

Signed-off-by: Denys Vlasenko 
CC: Herbert Xu 
CC: Josh Poimboeuf 
CC: Xiaodong Liu 
CC: Megha Dey 
CC: George Spelvin 
CC: linux-cry...@vger.kernel.org
CC: x...@kernel.org
CC: linux-kernel@vger.kernel.org
---
 arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S 
b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index dc05f01..7a7de27 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -312,7 +312,7 @@ do_return:
 ret
 ENDPROC(crc_pcl)
 
-.section   .rodata, "a", %progbits
+.section   .rodata, "a", @progbits
 
 ## jump tableTable is 129 entries x 2 bytes each
 
-- 
2.9.2



Re: random: /dev/random often returns short reads

2017-01-18 Thread Denys Vlasenko

On 01/17/2017 11:29 PM, H. Peter Anvin wrote:

On 01/17/17 09:34, Denys Vlasenko wrote:



On 01/17/2017 06:15 PM, Theodore Ts'o wrote:

On Tue, Jan 17, 2017 at 09:21:31AM +0100, Denys Vlasenko wrote:

If someone wants to send me a patch, I'll happily take a look at it,


Will something along these lines be accepted?


The problem is that this won't work.  In the cases that we're talking
about, the entropy counter in the secondary pool is not zero, but
close to zero, we'll still have short reads.  And that's going to
happen a fair amount of the time.

Perhaps the best *hacky* solution would be to say, ok if the entropy
count is less than some threshold, don't use the correct entropy
calculation, but rather assume that all of the new bits won't land on
top of existing entropy bits.


IOW, something like this:

--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -653,6 +653,9 @@ static void credit_entropy_bits(struct
entropy_store *r, int nbits)
if (nfrac < 0) {
/* Debit */
entropy_count += nfrac;
+   } else if (entropy_count < ((8 * 8) << ENTROPY_SHIFT)) {
+   /* Credit, and the pool is almost empty */
+   entropy_count += nfrac;
} else {
/*
 * Credit: we have to account for the possibility of
 * overwriting already present entropy.  Even in the

Want the patch? If yes, what name do you prefer for the constant? How about



This seems very wrong.  The whole point is that we keep it conservative
-- always less than or equal to the correct number.


In this case, what the code does is return fewer bytes,
even though *it has enough random bytes to return the full request*.

This happens because the patch which added the more conservative
accounting, while technically correct per se,
forgot to take into account another part of the code
which was relying on the previous, simplistic logic
"if we add N random bytes to a pool, now it has +N random bytes".

In my opinion, it should have changed that part of the code simultaneously
with introducing the more conservative accounting.


Re: random: /dev/random often returns short reads

2017-01-17 Thread Denys Vlasenko



On 01/17/2017 06:15 PM, Theodore Ts'o wrote:

On Tue, Jan 17, 2017 at 09:21:31AM +0100, Denys Vlasenko wrote:

If someone wants to send me a patch, I'll happily take a look at it,


Will something along these lines be accepted?


The problem is that this won't work.  In the cases that we're talking
about, the entropy counter in the secondary pool is not zero, but
close to zero, we'll still have short reads.  And that's going to
happen a fair amount of the time.

Perhaps the best *hacky* solution would be to say, ok if the entropy
count is less than some threshold, don't use the correct entropy
calculation, but rather assume that all of the new bits won't land on
top of existing entropy bits.


IOW, something like this:

--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -653,6 +653,9 @@ static void credit_entropy_bits(struct
entropy_store *r, int nbits)
if (nfrac < 0) {
/* Debit */
entropy_count += nfrac;
+   } else if (entropy_count < ((8 * 8) << ENTROPY_SHIFT)) {
+   /* Credit, and the pool is almost empty */
+   entropy_count += nfrac;
} else {
/*
 * Credit: we have to account for the possibility of
 * overwriting already present entropy.  Even in the

Want the patch? If yes, what name do you prefer for the constant? How about

/* Has less than 8 bytes */
#define ALMOST_EMPTY_POOL_frac   ((8 * 8) << ENTROPY_SHIFT)


Re: random: /dev/random often returns short reads

2017-01-17 Thread Denys Vlasenko
On Tue, Jan 17, 2017 at 5:36 AM, Theodore Ts'o  wrote:
> On Mon, Jan 16, 2017 at 07:50:55PM +0100, Denys Vlasenko wrote:
>>
>> /dev/random can legitimately returns short reads
>> when there is not enough entropy for the full request.
>
> Yes, but callers of /dev/random should be able to handle short reads.
> So it's a bug in the application as well.

I absolutely agree, whoever stumbled over it has a bug
in their app.

>> The code looks like it effectively credits the pool only for ~3/4
>> of the amount, i.e. 24 bytes, not 32.
>
> How much it credits the pools varies depending on how many bits of
> entropy are being transferred and how full the pool happens to be
> beforehand.

I think the problem is that even if the target pool has no entropy
at all, current algorithm thinks that transferring N random bytes
to it gives it N*3/4 bytes of randomness.

> Reversing the calculation so that we transfer exactly the
> right number of bits is tricky, and if we transfer too many bits, we
> risk "wasting" entropy bits.  Of course, it doesn't matter if we're
> transfering pretend entropy only for the purposes of getting FIPS
> certification, but getting it Right(tm) is non-trivial.
>
> If someone wants to send me a patch, I'll happily take a look at it,

Will something along these lines be accepted?

--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -653,6 +653,9 @@ static void credit_entropy_bits(struct
entropy_store *r, int nbits)
if (nfrac < 0) {
/* Debit */
entropy_count += nfrac;
+   } else if (entropy_count == 0) {
+   /* Credit, and the pool is empty */
+   entropy_count += nfrac;
} else {
/*
 * Credit: we have to account for the possibility of
 * overwriting already present entropy.  Even in the


> but given that fixing userspace is something you really should do
> anyway

I agree. It's just not in my (or my company's, IIUC) userspace code.
I wouldn't even know about this thing since *my* programs do handle
short reads correctly.


Re: random: /dev/random often returns short reads

2017-01-16 Thread Denys Vlasenko
> # while ./eat_dev_random; do ./eat_dev_random; done; ./eat_dev_random
> read of 32 returns 32
> read of 32 returns 32
> read of 32 returns 28
> read of 32 returns 24
>
> Just the first two requests worked, and then ouch...
>
> I think this is what happens here:
> we transfer 4 bytes of entropy to /dev/random pool:


erm... 32 bytes, of course.


random: /dev/random often returns short reads

2017-01-16 Thread Denys Vlasenko
Hi,

/dev/random can legitimately returns short reads
when there is not enough entropy for the full request.
However, now it does so far too often,
and it appears to be a bug:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
int main(int argc, char **argv)
{
int fd, ret, len;
char buf[16 * 1024];

len = argv[1] ? atoi(argv[1]) : 32;
fd = open("/dev/random", O_RDONLY);
ret = read(fd, buf, len);
printf("read of %d returns %d\n", len, ret);
if (ret != len)
return 1;
return 0;
}

# gcc -Os -Wall eat_dev_random.c -o eat_dev_random

# while ./eat_dev_random; do ./eat_dev_random; done; ./eat_dev_random
read of 32 returns 32
read of 32 returns 32
read of 32 returns 28
read of 32 returns 24

Just the first two requests worked, and then ouch...

I think this is what happens here:
we transfer 4 bytes of entropy to /dev/random pool:

_xfer_secondary_pool(struct entropy_store *r, size_t nbytes)
int bytes = nbytes;
/* pull at least as much as a wakeup */
bytes = max_t(int, bytes, random_read_wakeup_bits / 8);
/* but never more than the buffer size */
bytes = min_t(int, bytes, sizeof(tmp));
bytes = extract_entropy(r->pull, tmp, bytes,
random_read_wakeup_bits / 8, rsvd_bytes);
mix_pool_bytes(r, tmp, bytes);
credit_entropy_bits(r, bytes*8);


but when we enter credit_entropy_bits(), there is a defensive code
which slightly underestimates the amount of entropy!
It was added by this commit:

commit 30e37ec516ae5a6957596de7661673c615c82ea4
Author: H. Peter Anvin 
Date:   Tue Sep 10 23:16:17 2013 -0400

random: account for entropy loss due to overwrites

When we write entropy into a non-empty pool, we currently don't
account at all for the fact that we will probabilistically overwrite
some of the entropy in that pool.  This means that unless the pool is
fully empty, we are currently *guaranteed* to overestimate the amount
of entropy in the pool!


The code looks like it effectively credits the pool only for ~3/4
of the amount, i.e. 24 bytes, not 32.

If /dev/random pool was empty or nearly so, further it results
in a short read.

This is wrong because _xfer_secondary_pool() could well have had
lots and lots of entropy to supply; it just did not give enough.
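
For illustration, a small sketch (mine, not from this mail; it assumes the
1024-bit blocking pool and uses the 3/4 approximation from that commit rather
than the exact random.c code) of how a 32-byte transfer into an empty pool ends
up credited as only ~24 bytes:

#include <stdio.h>

int main(void)
{
        int pool_size = 1024;    /* blocking pool size, in bits (assumed)  */
        int entropy   = 0;       /* pool starts empty                      */
        int add       = 32 * 8;  /* 32 bytes transferred from input pool   */

        /* credit ~= (pool_size - entropy) * 3/4 * add / pool_size */
        int credit = (pool_size - entropy) * 3 * add / (4 * pool_size);

        printf("credited %d of %d bits (%d bytes)\n",
               credit, add, credit / 8);   /* 192 of 256 bits = 24 bytes */
        return 0;
}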


[PATCH v8] powerpc: Do not make the entire heap executable

2016-12-15 Thread Denys Vlasenko
On 32-bit powerpc the ELF PLT sections of binaries (built with --bss-plt,
or with a toolchain which defaults to it) look like this:

  [17] .sbss NOBITS  0002aff8 01aff8 14 00  WA  0   0  4
  [18] .plt  NOBITS  0002b00c 01aff8 84 00 WAX  0   0  4
  [19] .bss  NOBITS  0002b090 01aff8 a4 00  WA  0   0  4

Which results in an ELF load header:

  Type   Offset   VirtAddr   PhysAddr   FileSiz MemSiz  Flg Align
  LOAD   0x019c70 0x00029c70 0x00029c70 0x01388 0x014c4 RWE 0x1

This is all correct, the load region containing the PLT is marked as
executable. Note that the PLT starts at 0002b00c but the file mapping ends at
0002aff8, so the PLT falls in the 0 fill section described by the load header,
and after a page boundary.

Unfortunately the generic ELF loader ignores the X bit in the load headers
when it creates the 0 filled non-file backed mappings. It assumes all of these
mappings are RW BSS sections, which is not the case for PPC.

gcc/ld has an option (--secure-plt) to not do this, this is said to incur
a small performance penalty.

Currently, to support 32-bit binaries with PLT in BSS kernel maps *entire
brk area* with executable rights for all binaries, even --secure-plt ones.

Stop doing that.

Teach the ELF loader to check the X bit in the relevant load header
and create 0 filled anonymous mappings that are executable
if the load header requests that.

Test program showing the difference in /proc/$PID/maps:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>

int main() {
        char buf[16*1024];
        char *p = malloc(123); /* make "[heap]" mapping appear */
        int fd = open("/proc/self/maps", O_RDONLY);
        int len = read(fd, buf, sizeof(buf));
        write(1, buf, len);
        printf("%p\n", p);
        return 0;
}

Compiled using: gcc -mbss-plt -m32 -Os test.c -otest

Unpatched ppc64 kernel:
0010-0012 r-xp  00:00 0  [vdso]
0fe1-0ffd r-xp  fd:00 67898094   
/usr/lib/libc-2.17.so
0ffd-0ffe r--p 001b fd:00 67898094   
/usr/lib/libc-2.17.so
0ffe-0fff rw-p 001c fd:00 67898094   
/usr/lib/libc-2.17.so
1000-1001 r-xp  fd:00 100674505  
/home/user/test
1001-1002 r--p  fd:00 100674505  
/home/user/test
1002-1003 rw-p 0001 fd:00 100674505  
/home/user/test
1069-106c rwxp  00:00 0  [heap]
f7f7-f7fa r-xp  fd:00 67898089   
/usr/lib/ld-2.17.so
f7fa-f7fb r--p 0002 fd:00 67898089   
/usr/lib/ld-2.17.so
f7fb-f7fc rw-p 0003 fd:00 67898089   
/usr/lib/ld-2.17.so
ffa9-ffac rw-p  00:00 0  [stack]
0x10690008

Patched ppc64 kernel:
0010-0012 r-xp  00:00 0  [vdso]
0fe1-0ffd r-xp  fd:00 67898094   
/usr/lib/libc-2.17.so
0ffd-0ffe r--p 001b fd:00 67898094   
/usr/lib/libc-2.17.so
0ffe-0fff rw-p 001c fd:00 67898094   
/usr/lib/libc-2.17.so
1000-1001 r-xp  fd:00 100674505  
/home/user/test
1001-1002 r--p  fd:00 100674505  
/home/user/test
1002-1003 rw-p 0001 fd:00 100674505  
/home/user/test
1018-101b rw-p  00:00 0  [heap]
         ^^^^ this has changed (the heap is no longer mapped executable)
f7c6-f7c9 r-xp  fd:00 67898089   
/usr/lib/ld-2.17.so
f7c9-f7ca r--p 0002 fd:00 67898089   
/usr/lib/ld-2.17.so
f7ca-f7cb rw-p 0003 fd:00 67898089   
/usr/lib/ld-2.17.so
ff86-ff89 rw-p  00:00 0  [stack]
0x10180008

The patch was originally posted in 2012 by Jason Gunthorpe
and apparently ignored:

https://lkml.org/lkml/2012/9/30/138

Lightly run-tested.

Signed-off-by: Jason Gunthorpe 
Signed-off-by: Denys Vlasenko 
Acked-by: Kees Cook 
Acked-by: Michael Ellerman 
Tested-by: Jason Gunthorpe 
CC: Andrew Morton 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: "Aneesh Kumar K.V" 
CC: Kees Cook 
CC: Oleg Nesterov 
CC: Michael Ellerman 
CC: Florian Weimer 
CC: linux...@kvack.org
CC: linuxppc-...@lists.ozlabs.org
CC: linux-kernel@vger.kernel.org
---
Changes since v7:
* added /proc/$PID/maps example in the commit message

Changes since v6:
* rebased to current Linus tree
* sending to akpm

Changes since v5:
* made do_brk_flags() error out if any bits other than VM_EXEC are set.
  (Kees Cook: "With this, I'd be happy to Ack.")

[PATCH v7] powerpc: Do not make the entire heap executable

2016-11-09 Thread Denys Vlasenko
On 32-bit powerpc the ELF PLT sections of binaries (built with --bss-plt,
or with a toolchain which defaults to it) look like this:

  [17] .sbss NOBITS  0002aff8 01aff8 14 00  WA  0   0  4
  [18] .plt  NOBITS  0002b00c 01aff8 84 00 WAX  0   0  4
  [19] .bss  NOBITS  0002b090 01aff8 a4 00  WA  0   0  4

Which results in an ELF load header:

  Type   Offset   VirtAddr   PhysAddr   FileSiz MemSiz  Flg Align
  LOAD   0x019c70 0x00029c70 0x00029c70 0x01388 0x014c4 RWE 0x1

This is all correct, the load region containing the PLT is marked as
executable. Note that the PLT starts at 0002b00c but the file mapping ends at
0002aff8, so the PLT falls in the 0 fill section described by the load header,
and after a page boundary.

Unfortunately the generic ELF loader ignores the X bit in the load headers
when it creates the 0 filled non-file backed mappings. It assumes all of these
mappings are RW BSS sections, which is not the case for PPC.

gcc/ld has an option (--secure-plt) to not do this; it is said to incur
a small performance penalty.

Currently, to support 32-bit binaries with PLT in BSS, the kernel maps the *entire
brk area* with executable rights for all binaries, even --secure-plt ones.

Stop doing that.

Teach the ELF loader to check the X bit in the relevant load header
and create 0 filled anonymous mappings that are executable
if the load header requests that.

The patch was originally posted in 2012 by Jason Gunthorpe
and apparently ignored:

https://lkml.org/lkml/2012/9/30/138

Lightly run-tested.

Signed-off-by: Jason Gunthorpe 
Signed-off-by: Denys Vlasenko 
Acked-by: Kees Cook 
Acked-by: Michael Ellerman 
Tested-by: Jason Gunthorpe 
CC: Andrew Morton 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: "Aneesh Kumar K.V" 
CC: Kees Cook 
CC: Oleg Nesterov 
CC: Michael Ellerman 
CC: Florian Weimer 
CC: linux...@kvack.org
CC: linuxppc-...@lists.ozlabs.org
CC: linux-kernel@vger.kernel.org
---
Changes since v6:
* rebased to current Linus tree
* sending to akpm

Changes since v5:
* made do_brk_flags() error out if any bits other than VM_EXEC are set.
  (Kees Cook: "With this, I'd be happy to Ack.")
  See https://patchwork.ozlabs.org/patch/661595/

Changes since v4:
* if (current->personality & READ_IMPLIES_EXEC), still use VM_EXEC
  for 32-bit executables.

Changes since v3:
* typo fix in commit message
* rebased to current Linus tree

Changes since v2:
* moved capability to map with VM_EXEC into vm_brk_flags()

Changes since v1:
* wrapped lines to not exceed 79 chars
* improved comment
* expanded CC list

 arch/powerpc/include/asm/page.h |  4 +++-
 fs/binfmt_elf.c | 30 ++
 include/linux/mm.h  |  1 +
 mm/mmap.c   | 24 +++-
 4 files changed, 45 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 56398e7..17d3d2c 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -230,7 +230,9 @@ extern long long virt_phys_offset;
  * and needs to be executable.  This means the whole heap ends
  * up being executable.
  */
-#define VM_DATA_DEFAULT_FLAGS32(VM_READ | VM_WRITE | VM_EXEC | \
+#define VM_DATA_DEFAULT_FLAGS32 \
+   (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) | \
+VM_READ | VM_WRITE | \
 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
 
 #define VM_DATA_DEFAULT_FLAGS64(VM_READ | VM_WRITE | \
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 2472af2..065134b 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -91,12 +91,18 @@ static struct linux_binfmt elf_format = {
 
 #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
 
-static int set_brk(unsigned long start, unsigned long end)
+static int set_brk(unsigned long start, unsigned long end, int prot)
 {
start = ELF_PAGEALIGN(start);
end = ELF_PAGEALIGN(end);
if (end > start) {
-   int error = vm_brk(start, end - start);
+   /*
+* Map the last of the bss segment.
+* If the header is requesting these pages to be
+* executable, honour that (ppc32 needs this).
+*/
+   int error = vm_brk_flags(start, end - start,
+   prot & PROT_EXEC ? VM_EXEC : 0);
if (error)
return error;
}
@@ -524,6 +530,7 @@ static unsigned long load_elf_interp(struct elfhdr 
*interp_elf_ex,
unsigned long load_addr = 0;
int load_addr_set = 0;
unsigned long last_bss = 0, elf_bss = 0;
+   int bss_prot = 0;
unsigned long error = ~0UL;
unsigned long total_size;
int i;
@@ -606,8 +613,10 @@ static unsigned l

[PATCH] scsi: aic7xxx: fix ahc_delay and ahd_delay

2016-10-14 Thread Denys Vlasenko
They are buggy:

while (usec > 0)
   udelay(usec % 1024);
   usec -= 1024;

For example, for usec = 100*1024 + 1, the old code will call udelay(1) 101 times,
i.e. it will be approximately equivalent to udelay(101),
not the expected udelay(102400).
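
A quick way to see the difference is to model both loops in userspace,
with udelay() replaced by a counter (a sketch, not part of the patch):

#include <stdio.h>

static long old_total(long usec)
{
	long total = 0;
	while (usec > 0) {
		total += usec % 1024;	/* old code: udelay(usec % 1024) */
		usec -= 1024;
	}
	return total;
}

static long new_total(long usec)
{
	long total = usec & 1023;	/* new code: udelay(usec & 1023) */
	usec >>= 10;
	while (--usec >= 0)
		total += 1024;		/* udelay(1024) */
	return total;
}

int main(void)
{
	long usec = 100 * 1024 + 1;
	/* prints: usec=102401 old=101 new=102401 */
	printf("usec=%ld old=%ld new=%ld\n",
	       usec, old_total(usec), new_total(usec));
	return 0;
}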

This did not bite because all callers use values far from "pathological" ones,
such as 500 and 1000 - these work fine even with the buggy code.

This was reported in 2006 but was missed.

Signed-off-by: Denys Vlasenko 
CC: James Bottomley 
CC: Hannes Reinicke 
CC: linux-s...@vger.kernel.org
CC: linux-kernel@vger.kernel.org
---
 drivers/scsi/aic7xxx/aic79xx_osm.c | 7 ---
 drivers/scsi/aic7xxx/aic7xxx_osm.c | 7 ---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.c 
b/drivers/scsi/aic7xxx/aic79xx_osm.c
index 2588b8f..e7a7838 100644
--- a/drivers/scsi/aic7xxx/aic79xx_osm.c
+++ b/drivers/scsi/aic7xxx/aic79xx_osm.c
@@ -380,9 +380,10 @@ ahd_delay(long usec)
 * multi-millisecond waits.  Wait at most
 * 1024us per call.
 */
-   while (usec > 0) {
-   udelay(usec % 1024);
-   usec -= 1024;
+   udelay(usec & 1023);
+   usec >>= 10;
+   while (--usec >= 0) {
+   udelay(1024);
}
 }
 
diff --git a/drivers/scsi/aic7xxx/aic7xxx_osm.c 
b/drivers/scsi/aic7xxx/aic7xxx_osm.c
index fc6a831..c81798e 100644
--- a/drivers/scsi/aic7xxx/aic7xxx_osm.c
+++ b/drivers/scsi/aic7xxx/aic7xxx_osm.c
@@ -388,9 +388,10 @@ ahc_delay(long usec)
 * multi-millisecond waits.  Wait at most
 * 1024us per call.
 */
-   while (usec > 0) {
-   udelay(usec % 1024);
-   usec -= 1024;
+   udelay(usec & 1023);
+   usec >>= 10;
+   while (--usec >= 0) {
+   udelay(1024);
}
 }
 
-- 
2.9.2



Re: Another gcc corruption bug (was Re: [PATCH] [RFC] x86: avoid -mtune=atom for objtool warnings)

2016-10-13 Thread Denys Vlasenko

On 10/13/2016 02:46 PM, Josh Poimboeuf wrote:

On Tue, Oct 11, 2016 at 10:38:42PM +0200, Arnd Bergmann wrote:

On Tuesday, October 11, 2016 10:51:46 AM CEST Josh Poimboeuf wrote:

Notice how it just falls off the end of the function.  We had a similar
bug before:

  https://lkml.kernel.org/r/20160413033649.7r3msnmo3trtq47z@treble


I remember that nightmare :(


  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70646

I'm not sure yet if this is the same gcc bug or a different one.  Maybe
it's related to the new GCC_PLUGIN_SANCOV?


I've reduced one of the test cases to this now:

/* gcc-6  -O2 -fno-strict-aliasing -fno-reorder-blocks -fno-omit-frame-pointer  
-Wno-pointer-sign -fsanitize-coverage=trace-pc -Wall -Werror -c snic_res.c -o 
snic_res.o */
typedef int spinlock_t;
extern unsigned int ioread32(void *);
struct vnic_wq_ctrl {
unsigned int error_status;
};
struct vnic_wq {
struct vnic_wq_ctrl *ctrl;
} mempool_t;
struct snic {
unsigned int wq_count;
__attribute__ ((__aligned__)) struct vnic_wq wq[1];
spinlock_t wq_lock[1];
};
unsigned int snic_log_q_error_err_status;
void snic_log_q_error(struct snic *snic)
{
unsigned int i;
for (i = 0; i < snic->wq_count; i++)
snic_log_q_error_err_status =
ioread32(&snic->wq[i].ctrl->error_status);
}

which gets compiled into

 :
   0:   55  push   %rbp
   1:   48 89 e5mov%rsp,%rbp
   4:   53  push   %rbx
   5:   48 89 fbmov%rdi,%rbx
   8:   48 83 ec 08 sub$0x8,%rsp
   c:   e8 00 00 00 00  callq  11 
d: R_X86_64_PC32__sanitizer_cov_trace_pc-0x4
  11:   8b 03   mov(%rbx),%eax
  13:   85 c0   test   %eax,%eax
  15:   75 11   jne28 
  17:   48 83 c4 08 add$0x8,%rsp
  1b:   5b  pop%rbx
  1c:   5d  pop%rbp
  1d:   e9 00 00 00 00  jmpq   22 
1e: R_X86_64_PC32   __sanitizer_cov_trace_pc-0x4
  22:   66 0f 1f 44 00 00   nopw   0x0(%rax,%rax,1)
  28:   e8 00 00 00 00  callq  2d 
29: R_X86_64_PC32   __sanitizer_cov_trace_pc-0x4
  2d:   48 8b 7b 10 mov0x10(%rbx),%rdi
  31:   e8 00 00 00 00  callq  36 
32: R_X86_64_PC32   ioread32-0x4
  36:   89 05 00 00 00 00   mov%eax,0x0(%rip)# 3c 

38: R_X86_64_PC32   snic_log_q_error_err_status-0x4
  3c:   83 3b 01cmpl   $0x1,(%rbx)
  3f:   76 d6   jbe17 
  41:   e8 00 00 00 00  callq  46 
42: R_X86_64_PC32   __sanitizer_cov_trace_pc-0x4


I opened a bug:

  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77966



Surprisingly, it's really "not a bug". The only way you can end up in this
branch is if you have a bug and run off the end of the wq[1] array member:
i.e. if snic->wq_count >= 2. (See the gcc BZ for a smaller example.)

It's debatable whether it's okay for gcc to just let buggy code run off
and execute something random. It is surely surprising, and not debug-friendly.

An option to emit a crashing instruction (HLT, INT3, that sort of thing)
instead of just stopping code generation might be useful.
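
For what it's worth, a trapping instruction can already be had at the source
level by making the impossible case explicit; gcc turns __builtin_trap() into
ud2 on x86. A sketch against the reduced test case above (not a suggested fix
for the driver):

	for (i = 0; i < snic->wq_count; i++) {
		if (i >= 1)	/* only wq[0] exists in this struct */
			__builtin_trap();
		snic_log_q_error_err_status =
			ioread32(&snic->wq[i].ctrl->error_status);
	}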


Re: [PATCH][v10] PM / hibernate: Verify the consistent of e820 memory map by md5 digest

2016-10-13 Thread Denys Vlasenko
On Fri, Sep 9, 2016 at 2:21 PM, Chen Yu  wrote:
> On some platforms, there is occasional panic triggered when trying to
> resume from hibernation, a typical panic looks like:
>
> "BUG: unable to handle kernel paging request at 880085894000
> IP: [] load_image_lzo+0x8c2/0xe70"
>
> Investigation carried out by Lee Chun-Yi show that this is because
> e820 map has been changed by BIOS across hibernation, and one
> of the page frames from suspend kernel is right located in restore
> kernel's unmapped region, so panic comes out when accessing unmapped
> kernel address.
>
> In order to expose this issue earlier, the md5 hash of e820 map
> is passed from suspend kernel to restore kernel, and the restore
> kernel will terminate the resume process once it finds the md5
> hash are not the same.
>
> As the format of image header has been modified, the magic number
> should also be adjusted as kernels with the same RESTORE_MAGIC have
> to use the same header format and interpret all of the fields in
> it in the same way.
>
> If the suspend kernel is built without md5 support, and the restore
> kernel has md5 support, then the latter will bypass the check process.
> Vice versa the restore kernel will bypass the check if it does not
> support md5 operation.
>
> Note:
> 1. Without this patch applied, it is possible that BIOS has
>provided an inconsistent memory map, but the resume kernel is still
>able to restore the image anyway(e.g, E820_RAM region is the superset
>of the previous one), although the system might be unstable. So this
>patch tries to treat any inconsistent e820 as illegal.
>
> 2. Another case is, this patch replies on comparing the e820_saved, but
>currently the e820_save might not be strictly the same across
>hibernation, even if BIOS has provided consistent e820 map - In
>theory mptable might modify the BIOS-provided e820_saved dynamically
>in early_reserve_e820_mpc_new, which would allocate a buffer from
>E820_RAM, and marks it from E820_RAM to E820_RESERVED).
>This is a potential and rare case we need to deal with in OS in
>the future.
>
> Suggested-by: Pavel Machek 
> Suggested-by: Rafael J. Wysocki 
> Cc: Rafael J. Wysocki 
> Cc: Pavel Machek 
> Cc: Lee Chun-Yi 
> Cc: Borislav Petkov 
> Acked-by: Pavel Machek 
> Signed-off-by: Chen Yu 
> ---

> +static int get_e820_md5(struct e820map *map, void *buf)
> +{
> +   struct scatterlist sg;
> +   struct crypto_ahash *tfm;
> +   struct ahash_request *req;
> +   int ret = 0;
> +
> +   tfm = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
> +   if (IS_ERR(tfm))
> +   return -ENOMEM;
> +
> +   req = ahash_request_alloc(tfm, GFP_KERNEL);
> +   if (!req) {
> +   ret = -ENOMEM;
> +   goto free_ahash;
> +   }

I looked elsewhere in the kernel, and there is an idiom for placing
struct ahash_request on the stack. Instead of the ahash_request_alloc()
and its never-actually-triggering error handling, you can do:

 {
  AHASH_REQUEST_ON_STACK(req, tfm);

> +
> +   sg_init_one(&sg, (u8 *)map, sizeof(struct e820map));
> +   ahash_request_set_callback(req, 0, NULL, NULL);
> +   ahash_request_set_crypt(req, &sg, buf, sizeof(struct e820map));
> +
> +   if (crypto_ahash_digest(req))
> +   ret = -EINVAL;
> +
> +   ahash_request_free(req);
> + free_ahash:

and, naturally, the free() and the label would not be needed either;
just close the one extra brace:

> +   crypto_free_ahash(tfm);
> +
> +   return ret;

  }

> +}
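
Putting the two suggestions together, the helper would end up looking
roughly like this (an untested sketch, only rearranging the calls already
used in the patch):

static int get_e820_md5(struct e820map *map, void *buf)
{
	struct scatterlist sg;
	struct crypto_ahash *tfm;
	int ret = 0;

	tfm = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
	if (IS_ERR(tfm))
		return -ENOMEM;

	{
		AHASH_REQUEST_ON_STACK(req, tfm);

		sg_init_one(&sg, (u8 *)map, sizeof(struct e820map));
		ahash_request_set_callback(req, 0, NULL, NULL);
		ahash_request_set_crypt(req, &sg, buf, sizeof(struct e820map));

		if (crypto_ahash_digest(req))
			ret = -EINVAL;
	}

	crypto_free_ahash(tfm);
	return ret;
}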


[PATCH v6] powerpc: Do not make the entire heap executable

2016-10-03 Thread Denys Vlasenko
On 32-bit powerpc the ELF PLT sections of binaries (built with --bss-plt,
or with a toolchain which defaults to it) look like this:

  [17] .sbss NOBITS  0002aff8 01aff8 14 00  WA  0   0  4
  [18] .plt  NOBITS  0002b00c 01aff8 84 00 WAX  0   0  4
  [19] .bss  NOBITS  0002b090 01aff8 a4 00  WA  0   0  4

Which results in an ELF load header:

  Type   Offset   VirtAddr   PhysAddr   FileSiz MemSiz  Flg Align
  LOAD   0x019c70 0x00029c70 0x00029c70 0x01388 0x014c4 RWE 0x1

This is all correct, the load region containing the PLT is marked as
executable. Note that the PLT starts at 0002b00c but the file mapping ends at
0002aff8, so the PLT falls in the 0 fill section described by the load header,
and after a page boundary.

Unfortunately the generic ELF loader ignores the X bit in the load headers
when it creates the 0 filled non-file backed mappings. It assumes all of these
mappings are RW BSS sections, which is not the case for PPC.

gcc/ld has an option (--secure-plt) to not do this; it is said to incur
a small performance penalty.

Currently, to support 32-bit binaries with PLT in BSS, the kernel maps the *entire
brk area* with executable rights for all binaries, even --secure-plt ones.

Stop doing that.

Teach the ELF loader to check the X bit in the relevant load header
and create 0 filled anonymous mappings that are executable
if the load header requests that.

The patch was originally posted in 2012 by Jason Gunthorpe
and apparently ignored:

https://lkml.org/lkml/2012/9/30/138

Lightly run-tested.

Signed-off-by: Jason Gunthorpe 
Signed-off-by: Denys Vlasenko 
Acked-by: Kees Cook 
Acked-by: Michael Ellerman 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: "Aneesh Kumar K.V" 
CC: Kees Cook 
CC: Oleg Nesterov 
CC: Michael Ellerman 
CC: Florian Weimer 
CC: linux...@kvack.org
CC: linuxppc-...@lists.ozlabs.org
CC: linux-kernel@vger.kernel.org
---
Changes since v5:
* made do_brk_flags() error out if any bits other than VM_EXEC are set.
  (Kees Cook: "With this, I'd be happy to Ack.")
  See https://patchwork.ozlabs.org/patch/661595/

Changes since v4:
* if (current->personality & READ_IMPLIES_EXEC), still use VM_EXEC
  for 32-bit executables.

Changes since v3:
* typo fix in commit message
* rebased to current Linus tree

Changes since v2:
* moved capability to map with VM_EXEC into vm_brk_flags()

Changes since v1:
* wrapped lines to not exceed 79 chars
* improved comment
* expanded CC list

 arch/powerpc/include/asm/page.h |  4 +++-
 fs/binfmt_elf.c | 30 ++
 include/linux/mm.h  |  1 +
 mm/mmap.c   | 24 +++-
 4 files changed, 45 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 56398e7..17d3d2c 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -230,7 +230,9 @@ extern long long virt_phys_offset;
  * and needs to be executable.  This means the whole heap ends
  * up being executable.
  */
-#define VM_DATA_DEFAULT_FLAGS32(VM_READ | VM_WRITE | VM_EXEC | \
+#define VM_DATA_DEFAULT_FLAGS32 \
+   (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) | \
+VM_READ | VM_WRITE | \
 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
 
 #define VM_DATA_DEFAULT_FLAGS64(VM_READ | VM_WRITE | \
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e5495f3..12b0d19 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -91,12 +91,18 @@ static struct linux_binfmt elf_format = {
 
 #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
 
-static int set_brk(unsigned long start, unsigned long end)
+static int set_brk(unsigned long start, unsigned long end, int prot)
 {
start = ELF_PAGEALIGN(start);
end = ELF_PAGEALIGN(end);
if (end > start) {
-   int error = vm_brk(start, end - start);
+   /*
+* Map the last of the bss segment.
+* If the header is requesting these pages to be
+* executable, honour that (ppc32 needs this).
+*/
+   int error = vm_brk_flags(start, end - start,
+   prot & PROT_EXEC ? VM_EXEC : 0);
if (error)
return error;
}
@@ -524,6 +530,7 @@ static unsigned long load_elf_interp(struct elfhdr 
*interp_elf_ex,
unsigned long load_addr = 0;
int load_addr_set = 0;
unsigned long last_bss = 0, elf_bss = 0;
+   int bss_prot = 0;
unsigned long error = ~0UL;
unsigned long total_size;
int i;
@@ -606,8 +613,10 @@ static unsigned long load_elf_interp(struct elfhdr 
*interp_elf_ex,
 * elf_bss and last_bss is the bss section.

Huge static buffer in liquidio

2016-09-24 Thread Denys Vlasenko
Hello,

I would like to discuss this commit:

commit d3d7e6c65f75de18ced10a98595a847f9f95f0ce
Author: Raghu Vatsavayi 
Date:   Tue Jun 21 22:53:07 2016 -0700

liquidio: Firmware image download

This patch has firmware image related changes for: firmware
release upon failure, support latest firmware version and
firmware download in 4MB chunks.


Here is a part of it:


+u8 fbuf[4 * 1024 * 1024];
 ^^^ what?? [also, why is it not static?]
+
 int octeon_download_firmware(struct octeon_device *oct, const u8 *data,
 size_t size)
 {
int ret = 0;
-   u8 *p;
-   u8 *buffer;
+   u8 *p = fbuf;


Don't you think that using a 4-megabyte static buffer
*just for the firmware upload* is not good practice?

Further down the patch it's obvious that the buffer is not even
needed, because the firmware is already in memory; the "data"
and "size" parameters of this function point to it.

The meat of the FW download change is this (some debug message
code has been deleted):

-   buffer = kmemdup(data, size, GFP_KERNEL);
-   if (!buffer)
-   return -ENOMEM;
-
-   p = buffer + sizeof(struct octeon_firmware_file_header);
+   data += sizeof(struct octeon_firmware_file_header);

/* load all images */
for (i = 0; i < be32_to_cpu(h->num_images); i++) {
load_addr = be64_to_cpu(h->desc[i].addr);
image_len = be32_to_cpu(h->desc[i].len);

-   /* download the image */
-   octeon_pci_write_core_mem(oct, load_addr, p, image_len);
+   /* Write in 4MB chunks*/
+   rem = image_len;

-   p += image_len;
+   while (rem) {
+   if (rem < (4 * 1024 * 1024))
+   size = rem;
+   else
+   size = 4 * 1024 * 1024;
+
+   memcpy(p, data, size);
+
+   /* download the image */
+   octeon_pci_write_core_mem(oct, load_addr, p, (u32)size);
+
+   data += size;
+   rem -= (u32)size;
+   load_addr += size;
+   }
}
+   dev_info(&oct->pci_dev->dev, "Writing boot command: %s\n",
+h->bootcmd);

/* Invoke the bootcmd */
ret = octeon_console_send_cmd(oct, h->bootcmd, 50);

-done_downloading:
-   kfree(buffer);


octeon_pci_write_core_mem() takes a spinlock around the copy op;
I take it this was the reason for this change: to reduce
IRQ-disabled time.

Do you actually need an intermediate fbuf[] buffer here?
(IOW: can't you just write data to PCI from the memory area pointed
to by the "data" ptr?)

If there is indeed a reason for an intermediate buffer,
why did you drop the approach of having a temporary kmalloc'ed
buffer and switch to a static (and *huge*) buffer?

In my opinion, a 4 MByte buffer is overkill in any case.
A buffer of ~4..16 KByte would work just fine - it's not like
you need to squeeze out the last 0.5% of speed when uploading firmware.
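
If a bounce buffer is really needed (e.g. the firmware memory cannot be
handed to octeon_pci_write_core_mem() directly), a small kmalloc'ed one
would do. Roughly (an untested sketch reusing the names from the patch;
FW_DL_CHUNK is a made-up constant):

#define FW_DL_CHUNK	(16 * 1024)

	u8 *p = kmalloc(FW_DL_CHUNK, GFP_KERNEL);

	if (!p)
		return -ENOMEM;

	rem = image_len;
	while (rem) {
		u32 size = min_t(u32, rem, FW_DL_CHUNK);

		memcpy(p, data, size);
		/* download this chunk */
		octeon_pci_write_core_mem(oct, load_addr, p, size);

		data += size;
		rem -= size;
		load_addr += size;
	}
	kfree(p);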


[tip:x86/boot] x86/e820: Use much less memory for e820/e820_saved, save up to 120k

2016-09-21 Thread tip-bot for Denys Vlasenko
Commit-ID:  1827822902cf659d60d3413fd42c7e6cbd18df4d
Gitweb: http://git.kernel.org/tip/1827822902cf659d60d3413fd42c7e6cbd18df4d
Author: Denys Vlasenko 
AuthorDate: Sun, 18 Sep 2016 20:21:25 +0200
Committer:  Ingo Molnar 
CommitDate: Wed, 21 Sep 2016 15:02:12 +0200

x86/e820: Use much less memory for e820/e820_saved, save up to 120k

The maximum size of e820 map array for EFI systems is defined as
E820_X_MAX (E820MAX + 3 * MAX_NUMNODES).

In x86_64 defconfig, this ends up with E820_X_MAX = 320, e820 and e820_saved
are 6404 bytes each.

With larger configs, for example Fedora kernels, E820_X_MAX = 3200, e820
and e820_saved are 64004 bytes each. Most of this space is wasted.
Typical machines have some 20-30 e820 areas at most.

After the previous patch, e820 and e820_saved are pointers to e820 maps.

Change them to initially point to maps which are __initdata.

At the very end of kernel init, just before __init[data] sections are freed
in free_initmem(), allocate smaller blocks, copy maps there,
and change pointers.

The late switch makes sure that all functions which can be used to change
e820 maps are no longer accessible (they are all __init functions).

Run-tested.
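
The reallocation itself is done by the e820_reallocate_tables() helper
introduced by the previous patch in this series; it boils down to (a sketch
matching the version posted in the v2 discussion):

__init void e820_reallocate_tables(void)
{
	struct e820map *n;
	int size;

	size = offsetof(struct e820map, map) +
		sizeof(struct e820entry) * e820->nr_map;
	n = alloc_bootmem(size);
	memcpy(n, e820, size);
	e820 = n;

	size = offsetof(struct e820map, map) +
		sizeof(struct e820entry) * e820_saved->nr_map;
	n = alloc_bootmem(size);
	memcpy(n, e820_saved, size);
	e820_saved = n;
}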

Signed-off-by: Denys Vlasenko 
Acked-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Yinghai Lu 
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20160918182125.21000-1-dvlas...@redhat.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/e820.c | 8 
 arch/x86/mm/init.c | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 585000c9..bb8c690 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -40,10 +40,10 @@
  * user can e.g. boot the original kernel with mem=1G while still booting the
  * next kernel with full memory.
  */
-static struct e820map initial_e820;
-static struct e820map initial_e820_saved;
-struct e820map *e820 = &initial_e820;
-struct e820map *e820_saved = &initial_e820_saved;
+static struct e820map initial_e820  __initdata;
+static struct e820map initial_e820_saved  __initdata;
+struct e820map *e820 __refdata = &initial_e820;
+struct e820map *e820_saved __refdata = &initial_e820_saved;
 
 /* For PCI or other memory-mapped resources */
 unsigned long pci_mem_start = 0xaeedbabe;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 167deae..22af912 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -699,9 +699,9 @@ void free_init_pages(char *what, unsigned long begin, 
unsigned long end)
}
 }
 
-void free_initmem(void)
+void __ref free_initmem(void)
 {
-   /* e820_reallocate_tables(); - disabled for now */
+   e820_reallocate_tables();
 
free_init_pages("unused kernel",
(unsigned long)(&__init_begin),


[tip:x86/boot] x86/e820: Prepare e280 code for switch to dynamic storage

2016-09-21 Thread tip-bot for Denys Vlasenko
Commit-ID:  475339684ef19e46f4702e2d185a869a5c454688
Gitweb: http://git.kernel.org/tip/475339684ef19e46f4702e2d185a869a5c454688
Author: Denys Vlasenko 
AuthorDate: Sat, 17 Sep 2016 23:39:26 +0200
Committer:  Ingo Molnar 
CommitDate: Wed, 21 Sep 2016 15:02:12 +0200

x86/e820: Prepare e280 code for switch to dynamic storage

This patch turns e820 and e820_saved into pointers to e820 tables,
of the same size as before.

Signed-off-by: Denys Vlasenko 
Acked-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Yinghai Lu 
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20160917213927.1787-2-dvlas...@redhat.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/e820.h   |   6 +-
 arch/x86/kernel/e820.c| 125 +++---
 arch/x86/kernel/early-quirks.c|   2 +-
 arch/x86/kernel/kexec-bzimage64.c |   4 +-
 arch/x86/kernel/resource.c|   4 +-
 arch/x86/kernel/setup.c   |   8 +--
 arch/x86/kernel/tboot.c   |   8 +--
 arch/x86/mm/init.c|   2 +
 arch/x86/platform/efi/efi.c   |   2 +-
 arch/x86/xen/setup.c  |   2 +-
 10 files changed, 98 insertions(+), 65 deletions(-)

diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 3ab0537..476b574 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -10,8 +10,8 @@
 #include 
 #ifndef __ASSEMBLY__
 /* see comment in arch/x86/kernel/e820.c */
-extern struct e820map e820;
-extern struct e820map e820_saved;
+extern struct e820map *e820;
+extern struct e820map *e820_saved;
 
 extern unsigned long pci_mem_start;
 extern int e820_any_mapped(u64 start, u64 end, unsigned type);
@@ -53,6 +53,8 @@ extern void e820_reserve_resources_late(void);
 extern void setup_memory_map(void);
 extern char *default_machine_specific_memory_setup(void);
 
+extern void e820_reallocate_tables(void);
+
 /*
  * Returns true iff the specified range [s,e) is completely contained inside
  * the ISA region.
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 4d3dd9a..585000c9 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -40,8 +40,10 @@
  * user can e.g. boot the original kernel with mem=1G while still booting the
  * next kernel with full memory.
  */
-struct e820map e820;
-struct e820map e820_saved;
+static struct e820map initial_e820;
+static struct e820map initial_e820_saved;
+struct e820map *e820 = &initial_e820;
+struct e820map *e820_saved = &initial_e820_saved;
 
 /* For PCI or other memory-mapped resources */
 unsigned long pci_mem_start = 0xaeedbabe;
@@ -58,8 +60,8 @@ e820_any_mapped(u64 start, u64 end, unsigned type)
 {
int i;
 
-   for (i = 0; i < e820.nr_map; i++) {
-   struct e820entry *ei = &e820.map[i];
+   for (i = 0; i < e820->nr_map; i++) {
+   struct e820entry *ei = &e820->map[i];
 
if (type && ei->type != type)
continue;
@@ -81,8 +83,8 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type)
 {
int i;
 
-   for (i = 0; i < e820.nr_map; i++) {
-   struct e820entry *ei = &e820.map[i];
+   for (i = 0; i < e820->nr_map; i++) {
+   struct e820entry *ei = &e820->map[i];
 
if (type && ei->type != type)
continue;
@@ -128,7 +130,7 @@ static void __init __e820_add_region(struct e820map *e820x, 
u64 start, u64 size,
 
 void __init e820_add_region(u64 start, u64 size, int type)
 {
-   __e820_add_region(&e820, start, size, type);
+   __e820_add_region(e820, start, size, type);
 }
 
 static void __init e820_print_type(u32 type)
@@ -164,12 +166,12 @@ void __init e820_print_map(char *who)
 {
int i;
 
-   for (i = 0; i < e820.nr_map; i++) {
+   for (i = 0; i < e820->nr_map; i++) {
printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who,
-  (unsigned long long) e820.map[i].addr,
+  (unsigned long long) e820->map[i].addr,
   (unsigned long long)
-  (e820.map[i].addr + e820.map[i].size - 1));
-   e820_print_type(e820.map[i].type);
+  (e820->map[i].addr + e820->map[i].size - 1));
+   e820_print_type(e820->map[i].type);
printk(KERN_CONT "\n");
}
 }
@@ -493,13 +495,13 @@ static u64 __init __e820_update_range(struct e820map 
*e820x, u64 start,
 u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
 unsigned new_type)
 {
-   return __e820_update_range(&e820, start, size, old_type, new_type);
+   return __e820_update_range(e820, start, size, old_type, new_type);

[tip:x86/boot] x86/e820: Mark some static functions __init

2016-09-21 Thread tip-bot for Denys Vlasenko
Commit-ID:  8c2103f224216a45c1a4d7aebbc13f3e007cde34
Gitweb: http://git.kernel.org/tip/8c2103f224216a45c1a4d7aebbc13f3e007cde34
Author: Denys Vlasenko 
AuthorDate: Sat, 17 Sep 2016 23:39:25 +0200
Committer:  Ingo Molnar 
CommitDate: Wed, 21 Sep 2016 15:02:11 +0200

x86/e820: Mark some static functions __init

They are all called only from other __init functions in e820.c

Signed-off-by: Denys Vlasenko 
Acked-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: H. Peter Anvin 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Yinghai Lu 
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20160917213927.1787-1-dvlas...@redhat.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/e820.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 871f186..4d3dd9a 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -802,7 +802,7 @@ unsigned long __init e820_end_of_low_ram_pfn(void)
return e820_end_pfn(1UL << (32-PAGE_SHIFT));
 }
 
-static void early_panic(char *msg)
+static void __init early_panic(char *msg)
 {
early_printk(msg);
panic(msg);
@@ -912,7 +912,7 @@ void __init finish_e820_parsing(void)
}
 }
 
-static const char *e820_type_to_string(int e820_type)
+static const char *__init e820_type_to_string(int e820_type)
 {
switch (e820_type) {
case E820_RESERVED_KERN:
@@ -926,7 +926,7 @@ static const char *e820_type_to_string(int e820_type)
}
 }
 
-static unsigned long e820_type_to_iomem_type(int e820_type)
+static unsigned long __init e820_type_to_iomem_type(int e820_type)
 {
switch (e820_type) {
case E820_RESERVED_KERN:
@@ -942,7 +942,7 @@ static unsigned long e820_type_to_iomem_type(int e820_type)
}
 }
 
-static unsigned long e820_type_to_iores_desc(int e820_type)
+static unsigned long __init e820_type_to_iores_desc(int e820_type)
 {
switch (e820_type) {
case E820_ACPI:
@@ -961,7 +961,7 @@ static unsigned long e820_type_to_iores_desc(int e820_type)
}
 }
 
-static bool do_mark_busy(u32 type, struct resource *res)
+static bool __init do_mark_busy(u32 type, struct resource *res)
 {
/* this is the legacy bios/dos rom-shadow + mmio region */
if (res->start < (1ULL<<20))
@@ -1027,7 +1027,7 @@ void __init e820_reserve_resources(void)
 }
 
 /* How much should we pad RAM ending depending on where it is? */
-static unsigned long ram_alignment(resource_size_t pos)
+static unsigned long __init ram_alignment(resource_size_t pos)
 {
unsigned long mb = pos >> 20;
 


[tip:x86/apic] x86/apic: Get rid of apic_version[] array

2016-09-19 Thread tip-bot for Denys Vlasenko
Commit-ID:  cff9ab2b291e64259d97add48fe073c081afe4e2
Gitweb: http://git.kernel.org/tip/cff9ab2b291e64259d97add48fe073c081afe4e2
Author: Denys Vlasenko 
AuthorDate: Tue, 13 Sep 2016 20:12:32 +0200
Committer:  Thomas Gleixner 
CommitDate: Tue, 20 Sep 2016 00:31:19 +0200

x86/apic: Get rid of apic_version[] array

The array has a size of MAX_LOCAL_APIC, which can be as large as 32k, so it
can consume up to 128k.

The array has been there forever and was never used for anything useful
other than a version mismatch check which was introduced in 2009.

There is no reason to store the version in an array. The kernel is not
prepared to handle different APIC versions anyway, so the real important
part is to detect a version mismatch and warn about it, which can be done
with a single variable as well.

[ tglx: Massaged changelog ]

Signed-off-by: Denys Vlasenko 
CC: Andy Lutomirski 
CC: Borislav Petkov 
CC: Brian Gerst 
CC: Mike Travis 
Link: http://lkml.kernel.org/r/20160913181232.30815-1-dvlas...@redhat.com
Signed-off-by: Thomas Gleixner 

---
 arch/x86/include/asm/mpspec.h   |  2 +-
 arch/x86/kernel/acpi/boot.c |  2 +-
 arch/x86/kernel/apic/apic.c | 17 +++--
 arch/x86/kernel/apic/io_apic.c  |  4 ++--
 arch/x86/kernel/apic/probe_32.c |  2 +-
 arch/x86/kernel/smpboot.c   | 10 +-
 6 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index b07233b..c2f94dc 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -6,7 +6,6 @@
 #include 
 #include 
 
-extern int apic_version[];
 extern int pic_mode;
 
 #ifdef CONFIG_X86_32
@@ -40,6 +39,7 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES];
 extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
 
 extern unsigned int boot_cpu_physical_apicid;
+extern u8 boot_cpu_apic_version;
 extern unsigned long mp_lapic_addr;
 
 #ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 1ad5fe2..0447e31 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -182,7 +182,7 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 
enabled)
}
 
if (boot_cpu_physical_apicid != -1U)
-   ver = apic_version[boot_cpu_physical_apicid];
+   ver = boot_cpu_apic_version;
 
cpu = generic_processor_info(id, ver);
if (cpu >= 0)
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 1cbae30..779dae5 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -64,6 +64,8 @@ unsigned disabled_cpus;
 unsigned int boot_cpu_physical_apicid = -1U;
 EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
 
+u8 boot_cpu_apic_version;
+
 /*
  * The highest APIC ID seen during enumeration.
  */
@@ -1812,8 +1814,7 @@ void __init init_apic_mappings(void)
 * since smp_sanity_check is prepared for such a case
 * and disable smp mode
 */
-   apic_version[new_apicid] =
-GET_APIC_VERSION(apic_read(APIC_LVR));
+   boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
}
 }
 
@@ -1828,13 +1829,10 @@ void __init register_lapic_address(unsigned long 
address)
}
if (boot_cpu_physical_apicid == -1U) {
boot_cpu_physical_apicid  = read_apic_id();
-   apic_version[boot_cpu_physical_apicid] =
-GET_APIC_VERSION(apic_read(APIC_LVR));
+   boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
}
 }
 
-int apic_version[MAX_LOCAL_APIC];
-
 /*
  * Local APIC interrupts
  */
@@ -2124,11 +2122,10 @@ int generic_processor_info(int apicid, int version)
   cpu, apicid);
version = 0x10;
}
-   apic_version[apicid] = version;
 
-   if (version != apic_version[boot_cpu_physical_apicid]) {
+   if (version != boot_cpu_apic_version) {
pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU 
%d: version %x\n",
-   apic_version[boot_cpu_physical_apicid], cpu, version);
+   boot_cpu_apic_version, cpu, version);
}
 
physid_set(apicid, phys_cpu_present_map);
@@ -2271,7 +2268,7 @@ int __init APIC_init_uniprocessor(void)
 * Complain if the BIOS pretends there is one.
 */
if (!boot_cpu_has(X86_FEATURE_APIC) &&
-   APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+   APIC_INTEGRATED(boot_cpu_apic_version)) {
pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
boot_cpu_physical_apicid);
return -1;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 7491f41..48e6d84 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@

[PATCH 3/3 v2] x86/e820: Use much less memory for e820/e820_saved, save up to 120k

2016-09-18 Thread Denys Vlasenko
The maximum size of e820 map array for EFI systems is defined as
E820_X_MAX (E820MAX + 3 * MAX_NUMNODES).

In x86_64 defconfig, this ends up with E820_X_MAX = 320, e820 and e820_saved
are 6404 bytes each.

With larger configs, for example Fedora kernels, E820_X_MAX = 3200, e820
and e820_saved are 64004 bytes each. Most of this space is wasted.
Typical machines have some 20-30 e820 areas at most.

After the previous patch, e820 and e820_saved are pointers to e820 maps.

Change them to initially point to maps which are __initdata.

At the very end of kernel init, just before __init[data] sections are freed
in free_initmem(), allocate smaller blocks, copy maps there,
and change pointers.

The late switch makes sure that all functions which can be used to change
e820 maps are no longer accessible (they are all __init functions).

Run-tested.

Signed-off-by: Denys Vlasenko 
CC: Ingo Molnar 
CC: Andy Lutomirski 
CC: "H. Peter Anvin" 
CC: Borislav Petkov 
CC: Brian Gerst 
CC: x...@kernel.org
CC: linux-kernel@vger.kernel.org
---
Changes since v1: free_initmem() needs __ref annotation.

 arch/x86/kernel/e820.c | 8 
 arch/x86/mm/init.c | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index cf7c9ad..7e9249c 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -40,10 +40,10 @@
  * user can e.g. boot the original kernel with mem=1G while still booting the
  * next kernel with full memory.
  */
-static struct e820map initial_e820;
-static struct e820map initial_e820_saved;
-struct e820map *e820 = &initial_e820;
-struct e820map *e820_saved = &initial_e820_saved;
+static struct e820map initial_e820  __initdata;
+static struct e820map initial_e820_saved  __initdata;
+struct e820map *e820 __refdata = &initial_e820;
+struct e820map *e820_saved __refdata = &initial_e820_saved;
 
 /* For PCI or other memory-mapped resources */
 unsigned long pci_mem_start = 0xaeedbabe;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 167deae..22af912 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -699,9 +699,9 @@ void free_init_pages(char *what, unsigned long begin, 
unsigned long end)
}
 }
 
-void free_initmem(void)
+void __ref free_initmem(void)
 {
-   /* e820_reallocate_tables(); - disabled for now */
+   e820_reallocate_tables();
 
free_init_pages("unused kernel",
(unsigned long)(&__init_begin),
-- 
2.9.2



Re: [PATCH 2/2 v2] x86/e820: Use much less memory for e820/e820_saved, save up to 120k

2016-09-18 Thread Denys Vlasenko

On 09/18/2016 10:31 AM, Ingo Molnar wrote:

* Denys Vlasenko  wrote:

On 09/15/2016 09:04 AM, Ingo Molnar wrote:

* Denys Vlasenko  wrote:


The maximum size of e820 map array for EFI systems is defined as
E820_X_MAX (E820MAX + 3 * MAX_NUMNODES).

In x86_64 defconfig, this ends up with E820_X_MAX = 320, e820 and e820_saved
are 6404 bytes each.

With larger configs, for example Fedora kernels, E820_X_MAX = 3200, e820 and 
e820_saved
are 64004 bytes each. Most of this space is wasted. Typical machines have some 
20-30
e820 areas at most.

This patch turns e820 and e820_saved to pointers which initially point to 
__initdata
tables, of the same size as before.

At the very end of setup_arch(), when we are done fiddling with these maps,
allocate smaller alloc_bootmem blocks, copy maps there, and change pointers.

Run-tested.



+/*
+ * Initial e820 and e820_saved are largish __initdata arrays.
+ * Copy them to (usually much smaller) dynamically allocated area.
+ * This is done after all tweaks we ever do to them.
+ */
+__init void e820_reallocate_tables(void)
+{
+   struct e820map *n;
+   int size;
+
+   size = offsetof(struct e820map, map) + sizeof(struct e820entry) * 
e820->nr_map;
+   n = alloc_bootmem(size);
+   memcpy(n, e820, size);
+   e820 = n;
+
+   size = offsetof(struct e820map, map) + sizeof(struct e820entry) * 
e820_saved->nr_map;
+   n = alloc_bootmem(size);
+   memcpy(n, e820_saved, size);
+   e820_saved = n;
+}


Ok, this makes me quite nervous, could you please split this into two patches so
that any fails can be nicely bisected to?


No problem.


First patch only does the pointerization changes with a trivial placeholder
structure (full size, static allocated), second patch does all the dangerous 
bits
such as changing it to __initdata, allocating and copying over bits.

Also, could we please also add some minimal debugging facility to make sure the
memory table does not get extended after it's been reallocated?


I have another idea: run e820_reallocate_tables() later, just before
we free __init and __initdata. Then the e820 tables _can't be_ changed -
all functions which do that are __init functions.

Will test this now, and send a patchset.


Could we also mark it __ro_after_init?


__ro_after_init makes sense only for statically allocated objects.
e820_reallocate_tables() copies the e820 maps to dynamically allocated memory.


[PATCH 3/3] x86/e820: Use much less memory for e820/e820_saved, save up to 120k

2016-09-17 Thread Denys Vlasenko
The maximum size of e820 map array for EFI systems is defined as
E820_X_MAX (E820MAX + 3 * MAX_NUMNODES).

In x86_64 defconfig, this ends up with E820_X_MAX = 320, e820 and e820_saved
are 6404 bytes each.

With larger configs, for example Fedora kernels, E820_X_MAX = 3200, e820
and e820_saved are 64004 bytes each. Most of this space is wasted.
Typical machines have some 20-30 e820 areas at most.

After the previous patch, e820 and e820_saved are pointers to e820 maps.

Change them to initially point to maps which are __initdata.

At the very end of kernel init, just before __init[data] sections are freed
in free_initmem(), allocate smaller blocks, copy maps there,
and change pointers.

The late switch makes sure that all functions which can be used to change
e820 maps are no longer accessible (they are all __init functions).

Run-tested.

Signed-off-by: Denys Vlasenko 
CC: Ingo Molnar 
CC: Andy Lutomirski 
CC: "H. Peter Anvin" 
CC: Borislav Petkov 
CC: Brian Gerst 
CC: x...@kernel.org
CC: linux-kernel@vger.kernel.org
---
 arch/x86/kernel/e820.c | 8 
 arch/x86/mm/init.c | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index cf7c9ad..7e9249c 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -40,10 +40,10 @@
  * user can e.g. boot the original kernel with mem=1G while still booting the
  * next kernel with full memory.
  */
-static struct e820map initial_e820;
-static struct e820map initial_e820_saved;
-struct e820map *e820 = &initial_e820;
-struct e820map *e820_saved = &initial_e820_saved;
+static struct e820map initial_e820  __initdata;
+static struct e820map initial_e820_saved  __initdata;
+struct e820map *e820 __refdata = &initial_e820;
+struct e820map *e820_saved __refdata = &initial_e820_saved;
 
 /* For PCI or other memory-mapped resources */
 unsigned long pci_mem_start = 0xaeedbabe;
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 167deae..7be8dbf 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -701,7 +701,7 @@ void free_init_pages(char *what, unsigned long begin, 
unsigned long end)
 
 void free_initmem(void)
 {
-   /* e820_reallocate_tables(); - disabled for now */
+   e820_reallocate_tables();
 
free_init_pages("unused kernel",
(unsigned long)(&__init_begin),
-- 
2.9.2



[PATCH 1/3] x86/e820: Mark some static functions __init

2016-09-17 Thread Denys Vlasenko
They are all called only from other __init functions in e820.c

Signed-off-by: Denys Vlasenko 
CC: Ingo Molnar 
CC: Andy Lutomirski 
CC: "H. Peter Anvin" 
CC: Borislav Petkov 
CC: Brian Gerst 
CC: x...@kernel.org
CC: linux-kernel@vger.kernel.org
---
 arch/x86/kernel/e820.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 621b501..a6d4f10 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -802,7 +802,7 @@ unsigned long __init e820_end_of_low_ram_pfn(void)
return e820_end_pfn(1UL << (32-PAGE_SHIFT));
 }
 
-static void early_panic(char *msg)
+static void __init early_panic(char *msg)
 {
early_printk(msg);
panic(msg);
@@ -912,7 +912,7 @@ void __init finish_e820_parsing(void)
}
 }
 
-static const char *e820_type_to_string(int e820_type)
+static const char *__init e820_type_to_string(int e820_type)
 {
switch (e820_type) {
case E820_RESERVED_KERN:
@@ -926,7 +926,7 @@ static const char *e820_type_to_string(int e820_type)
}
 }
 
-static unsigned long e820_type_to_iomem_type(int e820_type)
+static unsigned long __init e820_type_to_iomem_type(int e820_type)
 {
switch (e820_type) {
case E820_RESERVED_KERN:
@@ -942,7 +942,7 @@ static unsigned long e820_type_to_iomem_type(int e820_type)
}
 }
 
-static unsigned long e820_type_to_iores_desc(int e820_type)
+static unsigned long __init e820_type_to_iores_desc(int e820_type)
 {
switch (e820_type) {
case E820_ACPI:
@@ -961,7 +961,7 @@ static unsigned long e820_type_to_iores_desc(int e820_type)
}
 }
 
-static bool do_mark_busy(u32 type, struct resource *res)
+static bool __init do_mark_busy(u32 type, struct resource *res)
 {
/* this is the legacy bios/dos rom-shadow + mmio region */
if (res->start < (1ULL<<20))
@@ -1027,7 +1027,7 @@ void __init e820_reserve_resources(void)
 }
 
 /* How much should we pad RAM ending depending on where it is? */
-static unsigned long ram_alignment(resource_size_t pos)
+static unsigned long __init ram_alignment(resource_size_t pos)
 {
unsigned long mb = pos >> 20;
 
-- 
2.9.2



[PATCH 2/3] x86/e820: Prepare e280 code for switch to dynamic storage

2016-09-17 Thread Denys Vlasenko
This patch turns e820 and e820_saved into pointers to e820 tables,
of the same size as before.

Signed-off-by: Denys Vlasenko 
CC: Ingo Molnar 
CC: Andy Lutomirski 
CC: "H. Peter Anvin" 
CC: Borislav Petkov 
CC: Brian Gerst 
CC: x...@kernel.org
CC: linux-kernel@vger.kernel.org
Signed-off-by: Denys Vlasenko 
---
 arch/x86/include/asm/e820.h   |   6 +-
 arch/x86/kernel/e820.c| 125 +++---
 arch/x86/kernel/early-quirks.c|   2 +-
 arch/x86/kernel/kexec-bzimage64.c |   4 +-
 arch/x86/kernel/resource.c|   4 +-
 arch/x86/kernel/setup.c   |   8 +--
 arch/x86/kernel/tboot.c   |   8 +--
 arch/x86/mm/init.c|   2 +
 arch/x86/platform/efi/efi.c   |   2 +-
 arch/x86/xen/setup.c  |   2 +-
 10 files changed, 98 insertions(+), 65 deletions(-)

diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 3ab0537..476b574 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -10,8 +10,8 @@
 #include 
 #ifndef __ASSEMBLY__
 /* see comment in arch/x86/kernel/e820.c */
-extern struct e820map e820;
-extern struct e820map e820_saved;
+extern struct e820map *e820;
+extern struct e820map *e820_saved;
 
 extern unsigned long pci_mem_start;
 extern int e820_any_mapped(u64 start, u64 end, unsigned type);
@@ -53,6 +53,8 @@ extern void e820_reserve_resources_late(void);
 extern void setup_memory_map(void);
 extern char *default_machine_specific_memory_setup(void);
 
+extern void e820_reallocate_tables(void);
+
 /*
  * Returns true iff the specified range [s,e) is completely contained inside
  * the ISA region.
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index a6d4f10..cf7c9ad 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -40,8 +40,10 @@
  * user can e.g. boot the original kernel with mem=1G while still booting the
  * next kernel with full memory.
  */
-struct e820map e820;
-struct e820map e820_saved;
+static struct e820map initial_e820;
+static struct e820map initial_e820_saved;
+struct e820map *e820 = &initial_e820;
+struct e820map *e820_saved = &initial_e820_saved;
 
 /* For PCI or other memory-mapped resources */
 unsigned long pci_mem_start = 0xaeedbabe;
@@ -58,8 +60,8 @@ e820_any_mapped(u64 start, u64 end, unsigned type)
 {
int i;
 
-   for (i = 0; i < e820.nr_map; i++) {
-   struct e820entry *ei = &e820.map[i];
+   for (i = 0; i < e820->nr_map; i++) {
+   struct e820entry *ei = &e820->map[i];
 
if (type && ei->type != type)
continue;
@@ -81,8 +83,8 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type)
 {
int i;
 
-   for (i = 0; i < e820.nr_map; i++) {
-   struct e820entry *ei = &e820.map[i];
+   for (i = 0; i < e820->nr_map; i++) {
+   struct e820entry *ei = &e820->map[i];
 
if (type && ei->type != type)
continue;
@@ -128,7 +130,7 @@ static void __init __e820_add_region(struct e820map *e820x, 
u64 start, u64 size,
 
 void __init e820_add_region(u64 start, u64 size, int type)
 {
-   __e820_add_region(&e820, start, size, type);
+   __e820_add_region(e820, start, size, type);
 }
 
 static void __init e820_print_type(u32 type)
@@ -164,12 +166,12 @@ void __init e820_print_map(char *who)
 {
int i;
 
-   for (i = 0; i < e820.nr_map; i++) {
+   for (i = 0; i < e820->nr_map; i++) {
printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who,
-  (unsigned long long) e820.map[i].addr,
+  (unsigned long long) e820->map[i].addr,
   (unsigned long long)
-  (e820.map[i].addr + e820.map[i].size - 1));
-   e820_print_type(e820.map[i].type);
+  (e820->map[i].addr + e820->map[i].size - 1));
+   e820_print_type(e820->map[i].type);
printk(KERN_CONT "\n");
}
 }
@@ -493,13 +495,13 @@ static u64 __init __e820_update_range(struct e820map 
*e820x, u64 start,
 u64 __init e820_update_range(u64 start, u64 size, unsigned old_type,
 unsigned new_type)
 {
-   return __e820_update_range(&e820, start, size, old_type, new_type);
+   return __e820_update_range(e820, start, size, old_type, new_type);
 }
 
 static u64 __init e820_update_range_saved(u64 start, u64 size,
  unsigned old_type, unsigned new_type)
 {
-   return __e820_update_range(&e820_saved, start, size, old_type,
+   return __e820_update_range(e820_saved, start, size, old_type,
 new_type);
 }
 
@@ -521,8 +523,8 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned 
old_type,
e820_print_type(old_ty

Re: [PATCH 2/2 v2] x86/e820: Use much less memory for e820/e820_saved, save up to 120k

2016-09-17 Thread Denys Vlasenko

On 09/15/2016 09:04 AM, Ingo Molnar wrote:


* Denys Vlasenko  wrote:


The maximum size of e820 map array for EFI systems is defined as
E820_X_MAX (E820MAX + 3 * MAX_NUMNODES).

In x86_64 defconfig, this ends up with E820_X_MAX = 320, e820 and e820_saved
are 6404 bytes each.

With larger configs, for example Fedora kernels, E820_X_MAX = 3200, e820 and 
e820_saved
are 64004 bytes each. Most of this space is wasted. Typical machines have some 
20-30
e820 areas at most.

This patch turns e820 and e820_saved to pointers which initially point to 
__initdata
tables, of the same size as before.

At the very end of setup_arch(), when we are done fiddling with these maps,
allocate smaller alloc_bootmem blocks, copy maps there, and change pointers.

Run-tested.



+/*
+ * Initial e820 and e820_saved are largish __initdata arrays.
+ * Copy them to (usually much smaller) dynamically allocated area.
+ * This is done after all tweaks we ever do to them.
+ */
+__init void e820_reallocate_tables(void)
+{
+   struct e820map *n;
+   int size;
+
+   size = offsetof(struct e820map, map) + sizeof(struct e820entry) * 
e820->nr_map;
+   n = alloc_bootmem(size);
+   memcpy(n, e820, size);
+   e820 = n;
+
+   size = offsetof(struct e820map, map) + sizeof(struct e820entry) * 
e820_saved->nr_map;
+   n = alloc_bootmem(size);
+   memcpy(n, e820_saved, size);
+   e820_saved = n;
+}


Ok, this makes me quite nervous, could you please split this into two patches so
that any fails can be nicely bisected to?


No problem.


First patch only does the pointerization changes with a trivial placeholder
structure (full size, static allocated), second patch does all the dangerous 
bits
such as changing it to __initdata, allocating and copying over bits.

Also, could we please also add some minimal debugging facility to make sure the
memory table does not get extended after it's been reallocated?


I have another idea: run e820_reallocate_tables() later, just before
we free __init and __initdata. Then the e820 tables _can't be_ changed -
all functions which do that are __init functions.

Will test this now, and send a patchset.


[PATCH] x86/apic: Get rid of apic_version[] array. Save up to 128k

2016-09-13 Thread Denys Vlasenko
This array is [MAX_LOCAL_APIC], and MAX_LOCAL_APIC can easily be up to 32k.

The "APIC version mismatch" message, which should trigger when two different
APIC versions in different CPUs are seen, is in kernel since 2009 and no one
is hitting it.

Replace the array with a single variable, u8 boot_cpu_apic_version.

(Why u8?
APIC version values as of year 2016 are no larger than 0x1f on all known CPUs.
The version field in the APIC register is 8 bits wide - not likely
to overflow the byte range in the foreseeable future.)

"APIC version mismatch" check is not removed.

Signed-off-by: Denys Vlasenko 
CC: Ingo Molnar 
CC: Andy Lutomirski 
CC: "H. Peter Anvin" 
CC: Borislav Petkov 
CC: Brian Gerst 
CC: Mike Travis 
CC: x...@kernel.org
CC: linux-kernel@vger.kernel.org
---
This is the alternative patch for
"x86/apic: Use byte array apic_version[], not int array" one.

 arch/x86/include/asm/mpspec.h   |  2 +-
 arch/x86/kernel/acpi/boot.c |  2 +-
 arch/x86/kernel/apic/apic.c | 19 ---
 arch/x86/kernel/apic/io_apic.c  |  4 ++--
 arch/x86/kernel/apic/probe_32.c |  2 +-
 arch/x86/kernel/smpboot.c   | 10 +-
 6 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index b07233b..c2f94dc 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -6,7 +6,6 @@
 #include 
 #include 
 
-extern int apic_version[];
 extern int pic_mode;
 
 #ifdef CONFIG_X86_32
@@ -40,6 +39,7 @@ extern int mp_bus_id_to_type[MAX_MP_BUSSES];
 extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
 
 extern unsigned int boot_cpu_physical_apicid;
+extern u8 boot_cpu_apic_version;
 extern unsigned long mp_lapic_addr;
 
 #ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 90d84c3..fbd1944 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -182,7 +182,7 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 
enabled)
}
 
if (boot_cpu_physical_apicid != -1U)
-   ver = apic_version[boot_cpu_physical_apicid];
+   ver = boot_cpu_apic_version;
 
cpu = generic_processor_info(id, ver);
if (cpu >= 0)
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 50c95af..8f9be74 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -63,6 +63,8 @@ unsigned disabled_cpus;
 /* Processor that is doing the boot up */
 unsigned int boot_cpu_physical_apicid = -1U;
 EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid);
+
+u8 boot_cpu_apic_version;
 
 /*
  * The highest APIC ID seen during enumeration.
@@ -1816,8 +1818,7 @@ void __init init_apic_mappings(void)
 * since smp_sanity_check is prepared for such a case
 * and disable smp mode
 */
-   apic_version[new_apicid] =
-GET_APIC_VERSION(apic_read(APIC_LVR));
+   boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
}
 }
 
@@ -1831,14 +1832,11 @@ void __init register_lapic_address(unsigned long 
address)
APIC_BASE, mp_lapic_addr);
}
if (boot_cpu_physical_apicid == -1U) {
-   boot_cpu_physical_apicid  = read_apic_id();
-   apic_version[boot_cpu_physical_apicid] =
-GET_APIC_VERSION(apic_read(APIC_LVR));
+   boot_cpu_physical_apicid = read_apic_id();
+   boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR));
}
 }
 
-int apic_version[MAX_LOCAL_APIC];
-
 /*
  * Local APIC interrupts
  */
@@ -2128,11 +2126,10 @@ int generic_processor_info(int apicid, int version)
   cpu, apicid);
version = 0x10;
}
-   apic_version[apicid] = version;
 
-   if (version != apic_version[boot_cpu_physical_apicid]) {
+   if (version != boot_cpu_apic_version) {
pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU 
%d: version %x\n",
-   apic_version[boot_cpu_physical_apicid], cpu, version);
+   boot_cpu_apic_version, cpu, version);
}
 
physid_set(apicid, phys_cpu_present_map);
@@ -2275,7 +2272,7 @@ int __init APIC_init_uniprocessor(void)
 * Complain if the BIOS pretends there is one.
 */
if (!boot_cpu_has(X86_FEATURE_APIC) &&
-   APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+   APIC_INTEGRATED(boot_cpu_apic_version)) {
pr_err("BIOS bug, local APIC 0x%x not detected!...\n",
boot_cpu_physical_apicid);
return -1;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 7491f41..48e6d84 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1593,7 +1593

Re: [PATCH] x86/apic: Use byte array apic_version[], not int array. Saves up to 96k

2016-09-13 Thread Denys Vlasenko



On 09/13/2016 05:33 PM, Thomas Gleixner wrote:

On Sun, 11 Sep 2016, Borislav Petkov wrote:

On Fri, Sep 09, 2016 at 10:32:04AM +0200, Denys Vlasenko wrote:

This array is [MAX_LOCAL_APIC], and MAX_LOCAL_APIC can easily be up to 32k.

This patch changes apic_version[] array elements from int to u8 -
APIC version values as of year 2016 are no larger than 0x1f on all known CPUs.
Version field in the APIC register is 8 bit wide - not likely
to overflow byte range in foreseeable future.

The "ver" argument of generic_processor_info(id,ver), which goes into 
apic_version[id],
is also changed from int to u8: make it obvious that assignment can't overflow.

generic_processor_info() has four callsites, none of them can put an 
out-of-range value
into this argument.


Right, so I dug a bit into this and found:

http://marc.info/?l=linux-kernel&m=123230551709711

and

b2b815d80a5c ("x86: put trigger in to detect mismatched apic versions")

It is from 2009 and I don't know how relevant 16-bit APIC IDs are
anymore... I guess you probably want to run this by SGI folk first.

Otherwise I was going to propose to kill that apic_version array
altogether and cache only the version of the previous CPU and compare it
to the current one to catch mismatches...


Yeah, the idea was back then to eliminate the array, but we wanted to make
sure that we don't have systems out in the wild which have different apic
versions. I really doubt that we can deal with that proper, so having a
single version entry and yelling loudly when we detect a mismatch is good
enough.


Makes sense. I'll send a patch


[PATCH] x86/apic: Use byte array apic_version[], not int array. Saves up to 96k

2016-09-09 Thread Denys Vlasenko
This array is [MAX_LOCAL_APIC], and MAX_LOCAL_APIC can easily be up to 32k.

This patch changes apic_version[] array elements from int to u8 -
APIC version values as of year 2016 are no larger than 0x1f on all known CPUs.
Version field in the APIC register is 8 bit wide - not likely
to overflow byte range in foreseeable future.
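
For reference, a back-of-envelope check of the "saves up to 96k" figure
(a standalone sketch, assuming MAX_LOCAL_APIC at its 32k upper bound and
a 4-byte int; smaller configs save proportionally less):

#include <stdio.h>

int main(void)
{
	unsigned long n = 32768;                  /* assumed MAX_LOCAL_APIC */
	printf("int array: %lu bytes\n", n * 4);  /* 131072 = 128 KiB */
	printf("u8  array: %lu bytes\n", n * 1);  /*  32768 =  32 KiB */
	printf("saved:     %lu bytes\n", n * 3);  /*  98304 =  96 KiB */
	return 0;
}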

The "ver" argument of generic_processor_info(id,ver), which goes into 
apic_version[id],
is also changed from int to u8: make it obvious that assignment can't overflow.

generic_processor_info() has four callsites, none of them can put an 
out-of-range value
into this argument.

Signed-off-by: Denys Vlasenko 
CC: Ingo Molnar 
CC: Andy Lutomirski 
CC: "H. Peter Anvin" 
CC: Borislav Petkov 
CC: Brian Gerst 
CC: x...@kernel.org
CC: linux-kernel@vger.kernel.org
---
 arch/x86/include/asm/mpspec.h | 4 ++--
 arch/x86/kernel/acpi/boot.c   | 2 +-
 arch/x86/kernel/apic/apic.c   | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index b07233b..a0bc349 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -6,7 +6,7 @@
 #include 
 #include 
 
-extern int apic_version[];
+extern u8 apic_version[];
 extern int pic_mode;
 
 #ifdef CONFIG_X86_32
@@ -85,7 +85,7 @@ static inline void early_reserve_e820_mpc_new(void) { }
 #define default_get_smp_config x86_init_uint_noop
 #endif
 
-int generic_processor_info(int apicid, int version);
+int generic_processor_info(int apicid, u8 version);
 
 #define PHYSID_ARRAY_SIZE  BITS_TO_LONGS(MAX_LOCAL_APIC)
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 90d84c3..fde236f 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -168,7 +168,7 @@ static int __init acpi_parse_madt(struct acpi_table_header 
*table)
  */
 static int acpi_register_lapic(int id, u32 acpiid, u8 enabled)
 {
-   unsigned int ver = 0;
+   u8 ver = 0;
int cpu;
 
if (id >= MAX_LOCAL_APIC) {
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 50c95af..d5cc7c6 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1837,7 +1837,7 @@ void __init register_lapic_address(unsigned long address)
}
 }
 
-int apic_version[MAX_LOCAL_APIC];
+u8 apic_version[MAX_LOCAL_APIC];
 
 /*
  * Local APIC interrupts
@@ -2027,7 +2027,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
apic_write(APIC_LVT1, value);
 }
 
-int generic_processor_info(int apicid, int version)
+int generic_processor_info(int apicid, u8 version)
 {
int cpu, max = nr_cpu_ids;
bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid,
-- 
2.9.2



[PATCH 2/2 v2] x86/e820: Use much less memory for e820/e820_saved, save up to 120k

2016-09-09 Thread Denys Vlasenko
The maximum size of e820 map array for EFI systems is defined as
E820_X_MAX (E820MAX + 3 * MAX_NUMNODES).

In x86_64 defconfig, this ends up with E820_X_MAX = 320, e820 and e820_saved
are 6404 bytes each.

With larger configs, for example Fedora kernels, E820_X_MAX = 3200, e820 and 
e820_saved
are 64004 bytes each. Most of this space is wasted. Typical machines have some 
20-30
e820 areas at most.
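
A quick sketch of where the 6404 and 64004 byte figures come from. It assumes
the usual packed layout (a 4-byte nr_map followed by E820_X_MAX 20-byte
e820entry slots), E820MAX == 128, and NODES_SHIFT of 6 and 10 respectively;
the NODES_SHIFT values are inferred from the sizes, not read from the configs:

#include <stdio.h>

int main(void)
{
	unsigned long defconfig_max = 128 + 3 * (1UL << 6);   /* 320  */
	unsigned long fedora_max    = 128 + 3 * (1UL << 10);  /* 3200 */

	printf("defconfig: %lu bytes\n", 4 + defconfig_max * 20); /* 6404  */
	printf("fedora:    %lu bytes\n", 4 + fedora_max * 20);    /* 64004 */
	return 0;
}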

This patch turns e820 and e820_saved to pointers which initially point to 
__initdata
tables, of the same size as before.

At the very end of setup_arch(), when we are done fiddling with these maps,
allocate smaller alloc_bootmem blocks, copy maps there, and change pointers.
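
Roughly, the reallocation step amounts to the sketch below. The real
e820_reallocate_tables() body is not visible in the truncated diff, so the
exact size computation here is an assumption, not the patch itself:

void __init e820_reallocate_tables(void)
{
	struct e820map *n;
	size_t size;

	/* shrink "e820" to just the entries actually in use */
	size = offsetof(struct e820map, map) +
	       sizeof(struct e820entry) * e820->nr_map;
	n = alloc_bootmem(size);
	memcpy(n, e820, size);
	e820 = n;

	/* same for "e820_saved"; the __initdata originals are then discarded */
	size = offsetof(struct e820map, map) +
	       sizeof(struct e820entry) * e820_saved->nr_map;
	n = alloc_bootmem(size);
	memcpy(n, e820_saved, size);
	e820_saved = n;
}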

Run-tested.

Signed-off-by: Denys Vlasenko 
CC: Ingo Molnar 
CC: Andy Lutomirski 
CC: "H. Peter Anvin" 
CC: Borislav Petkov 
CC: Brian Gerst 
CC: x...@kernel.org
CC: linux-kernel@vger.kernel.org
---
Changes since v1: added __refdata annotations.

 arch/x86/include/asm/e820.h   |   6 +-
 arch/x86/kernel/e820.c| 123 +++---
 arch/x86/kernel/early-quirks.c|   2 +-
 arch/x86/kernel/kexec-bzimage64.c |   4 +-
 arch/x86/kernel/resource.c|   4 +-
 arch/x86/kernel/setup.c   |  10 ++--
 arch/x86/kernel/tboot.c   |   8 +--
 arch/x86/platform/efi/efi.c   |   2 +-
 arch/x86/xen/setup.c  |   2 +-
 9 files changed, 96 insertions(+), 65 deletions(-)

diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 3ab0537..476b574 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -10,8 +10,8 @@
 #include 
 #ifndef __ASSEMBLY__
 /* see comment in arch/x86/kernel/e820.c */
-extern struct e820map e820;
-extern struct e820map e820_saved;
+extern struct e820map *e820;
+extern struct e820map *e820_saved;
 
 extern unsigned long pci_mem_start;
 extern int e820_any_mapped(u64 start, u64 end, unsigned type);
@@ -53,6 +53,8 @@ extern void e820_reserve_resources_late(void);
 extern void setup_memory_map(void);
 extern char *default_machine_specific_memory_setup(void);
 
+extern void e820_reallocate_tables(void);
+
 /*
  * Returns true iff the specified range [s,e) is completely contained inside
  * the ISA region.
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index a6d4f10..4e52859 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -40,8 +40,14 @@
  * user can e.g. boot the original kernel with mem=1G while still booting the
  * next kernel with full memory.
  */
-struct e820map e820;
-struct e820map e820_saved;
+static struct e820map initial_e820 __initdata;
+static struct e820map initial_e820_saved __initdata;
+/*
+ * After we are done fiddling with maps at boot, we reallocate (shrink) them.
+ * Otherwise each map uses up to ~64k, but typically only ~1k is needed.
+ */
+struct e820map *e820 __refdata = &initial_e820;
+struct e820map *e820_saved __refdata = &initial_e820_saved;
 
 /* For PCI or other memory-mapped resources */
 unsigned long pci_mem_start = 0xaeedbabe;
@@ -58,8 +64,8 @@ e820_any_mapped(u64 start, u64 end, unsigned type)
 {
int i;
 
-   for (i = 0; i < e820.nr_map; i++) {
-   struct e820entry *ei = &e820.map[i];
+   for (i = 0; i < e820->nr_map; i++) {
+   struct e820entry *ei = &e820->map[i];
 
if (type && ei->type != type)
continue;
@@ -81,8 +87,8 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type)
 {
int i;
 
-   for (i = 0; i < e820.nr_map; i++) {
-   struct e820entry *ei = &e820.map[i];
+   for (i = 0; i < e820->nr_map; i++) {
+   struct e820entry *ei = &e820->map[i];
 
if (type && ei->type != type)
continue;
@@ -128,7 +134,7 @@ static void __init __e820_add_region(struct e820map *e820x, 
u64 start, u64 size,
 
 void __init e820_add_region(u64 start, u64 size, int type)
 {
-   __e820_add_region(&e820, start, size, type);
+   __e820_add_region(e820, start, size, type);
 }
 
 static void __init e820_print_type(u32 type)
@@ -164,12 +170,12 @@ void __init e820_print_map(char *who)
 {
int i;
 
-   for (i = 0; i < e820.nr_map; i++) {
+   for (i = 0; i < e820->nr_map; i++) {
printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who,
-  (unsigned long long) e820.map[i].addr,
+  (unsigned long long) e820->map[i].addr,
   (unsigned long long)
-  (e820.map[i].addr + e820.map[i].size - 1));
-   e820_print_type(e820.map[i].type);
+  (e820->map[i].addr + e820->map[i].size - 1));
+   e820_print_type(e820->map[i].type);
printk(KERN_CONT "\n");
}
 }
@@ -493,13 +499,13 @@ static u64 __init __e820_update_range(struct e820map 
*

[PATCH 2/2] x86/e820: Use much less memory for e820/e820_saved, save up to 120k

2016-09-08 Thread Denys Vlasenko
The maximum size of e820 map array for EFI systems is defined as
E820_X_MAX (E820MAX + 3 * MAX_NUMNODES).

In x86_64 defconfig, this ends up with E820_X_MAX = 320, e820 and e820_saved
are 6404 bytes each.

With larger configs, for example Fedora kernels, E820_X_MAX = 3200, e820 and 
e820_saved
are 64004 bytes each. Most of this space is wasted. Typical machines have some 
20-30
e820 areas at most.

This patch turns e820 and e820_saved to pointers which initially point to 
__initdata
tables, of the same size as before.

At the very end of setup_arch(), when we are done fiddling with these maps,
allocate smaller alloc_bootmem blocks, copy maps there, and change pointers.

Run-tested.

Signed-off-by: Denys Vlasenko 
CC: Ingo Molnar 
CC: Andy Lutomirski 
CC: "H. Peter Anvin" 
CC: Borislav Petkov 
CC: Brian Gerst 
CC: x...@kernel.org
CC: linux-kernel@vger.kernel.org
---
 arch/x86/include/asm/e820.h   |   6 +-
 arch/x86/kernel/e820.c| 123 +++---
 arch/x86/kernel/early-quirks.c|   2 +-
 arch/x86/kernel/kexec-bzimage64.c |   4 +-
 arch/x86/kernel/resource.c|   4 +-
 arch/x86/kernel/setup.c   |  10 ++--
 arch/x86/kernel/tboot.c   |   8 +--
 arch/x86/platform/efi/efi.c   |   2 +-
 arch/x86/xen/setup.c  |   2 +-
 9 files changed, 96 insertions(+), 65 deletions(-)

diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 3ab0537..476b574 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -10,8 +10,8 @@
 #include 
 #ifndef __ASSEMBLY__
 /* see comment in arch/x86/kernel/e820.c */
-extern struct e820map e820;
-extern struct e820map e820_saved;
+extern struct e820map *e820;
+extern struct e820map *e820_saved;
 
 extern unsigned long pci_mem_start;
 extern int e820_any_mapped(u64 start, u64 end, unsigned type);
@@ -53,6 +53,8 @@ extern void e820_reserve_resources_late(void);
 extern void setup_memory_map(void);
 extern char *default_machine_specific_memory_setup(void);
 
+extern void e820_reallocate_tables(void);
+
 /*
  * Returns true iff the specified range [s,e) is completely contained inside
  * the ISA region.
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index a6d4f10..95fd1ec 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -40,8 +40,14 @@
  * user can e.g. boot the original kernel with mem=1G while still booting the
  * next kernel with full memory.
  */
-struct e820map e820;
-struct e820map e820_saved;
+static struct e820map initial_e820 __initdata;
+static struct e820map initial_e820_saved __initdata;
+/*
+ * After we are done fiddling with maps at boot, we reallocate (shrink) them.
+ * Otherwise each map uses up to ~64k, but typically only ~1k is needed.
+ */
+struct e820map *e820 = &initial_e820;
+struct e820map *e820_saved = &initial_e820_saved;
 
 /* For PCI or other memory-mapped resources */
 unsigned long pci_mem_start = 0xaeedbabe;
@@ -58,8 +64,8 @@ e820_any_mapped(u64 start, u64 end, unsigned type)
 {
int i;
 
-   for (i = 0; i < e820.nr_map; i++) {
-   struct e820entry *ei = &e820.map[i];
+   for (i = 0; i < e820->nr_map; i++) {
+   struct e820entry *ei = &e820->map[i];
 
if (type && ei->type != type)
continue;
@@ -81,8 +87,8 @@ int __init e820_all_mapped(u64 start, u64 end, unsigned type)
 {
int i;
 
-   for (i = 0; i < e820.nr_map; i++) {
-   struct e820entry *ei = &e820.map[i];
+   for (i = 0; i < e820->nr_map; i++) {
+   struct e820entry *ei = &e820->map[i];
 
if (type && ei->type != type)
continue;
@@ -128,7 +134,7 @@ static void __init __e820_add_region(struct e820map *e820x, 
u64 start, u64 size,
 
 void __init e820_add_region(u64 start, u64 size, int type)
 {
-   __e820_add_region(&e820, start, size, type);
+   __e820_add_region(e820, start, size, type);
 }
 
 static void __init e820_print_type(u32 type)
@@ -164,12 +170,12 @@ void __init e820_print_map(char *who)
 {
int i;
 
-   for (i = 0; i < e820.nr_map; i++) {
+   for (i = 0; i < e820->nr_map; i++) {
printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who,
-  (unsigned long long) e820.map[i].addr,
+  (unsigned long long) e820->map[i].addr,
   (unsigned long long)
-  (e820.map[i].addr + e820.map[i].size - 1));
-   e820_print_type(e820.map[i].type);
+  (e820->map[i].addr + e820->map[i].size - 1));
+   e820_print_type(e820->map[i].type);
printk(KERN_CONT "\n");
}
 }
@@ -493,13 +499,13 @@ static u64 __init __e820_update_range(struct e820map 
*e820x, u64 start,
 u64 __init e820_update_range(u64 start, u64 size

[PATCH 1/2] x86/e820: mark some static functions __init

2016-09-08 Thread Denys Vlasenko
They are all called only from other __init functions.

Signed-off-by: Denys Vlasenko 
CC: Ingo Molnar 
CC: Andy Lutomirski 
CC: "H. Peter Anvin" 
CC: Borislav Petkov 
CC: Brian Gerst 
CC: x...@kernel.org
CC: linux-kernel@vger.kernel.org
---
 arch/x86/kernel/e820.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 621b501..a6d4f10 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -802,7 +802,7 @@ unsigned long __init e820_end_of_low_ram_pfn(void)
return e820_end_pfn(1UL << (32-PAGE_SHIFT));
 }
 
-static void early_panic(char *msg)
+static void __init early_panic(char *msg)
 {
early_printk(msg);
panic(msg);
@@ -912,7 +912,7 @@ void __init finish_e820_parsing(void)
}
 }
 
-static const char *e820_type_to_string(int e820_type)
+static const char *__init e820_type_to_string(int e820_type)
 {
switch (e820_type) {
case E820_RESERVED_KERN:
@@ -926,7 +926,7 @@ static const char *e820_type_to_string(int e820_type)
}
 }
 
-static unsigned long e820_type_to_iomem_type(int e820_type)
+static unsigned long __init e820_type_to_iomem_type(int e820_type)
 {
switch (e820_type) {
case E820_RESERVED_KERN:
@@ -942,7 +942,7 @@ static unsigned long e820_type_to_iomem_type(int e820_type)
}
 }
 
-static unsigned long e820_type_to_iores_desc(int e820_type)
+static unsigned long __init e820_type_to_iores_desc(int e820_type)
 {
switch (e820_type) {
case E820_ACPI:
@@ -961,7 +961,7 @@ static unsigned long e820_type_to_iores_desc(int e820_type)
}
 }
 
-static bool do_mark_busy(u32 type, struct resource *res)
+static bool __init do_mark_busy(u32 type, struct resource *res)
 {
/* this is the legacy bios/dos rom-shadow + mmio region */
if (res->start < (1ULL<<20))
@@ -1027,7 +1027,7 @@ void __init e820_reserve_resources(void)
 }
 
 /* How much should we pad RAM ending depending on where it is? */
-static unsigned long ram_alignment(resource_size_t pos)
+static unsigned long __init ram_alignment(resource_size_t pos)
 {
unsigned long mb = pos >> 20;
 
-- 
2.9.2



Re: RFC: Petition Intel/AMD to add POPF_IF insn

2016-08-31 Thread Denys Vlasenko

On 08/19/2016 12:54 PM, Paolo Bonzini wrote:

On 18/08/2016 19:24, Linus Torvalds wrote:

I didn't do CPL0 tests yet. Realized that cli/sti can be tested in userspace
if we set iopl(3) first.

Yes, but it might not be the same. So the timings could be very
different from a cpl0 case.


FWIW I recently measured around 20 cycles for a popf as well on
Haswell-EP and CPL=0 (that was for commit f2485b3e0c6c, "KVM: x86: use
guest_exit_irqoff", 2016-07-01).


Thanks for confirmation.

I revisited benchmarking of the

if (flags & X86_EFLAGS_IF)
native_irq_enable();

patch. In "make -j20" kernel compiles on a 8-way (HT) CPU, it shows some ~5 
second
improvement during ~16 minute compile. That's 0.5% speedup. It's ok, but not
something to bee too excited.

The disassembly of vmlinux shows that gcc generates these asm patterns:

80 e6 02                and    $0x2,%dh
74 01                   je     810101ae 
fb                      sti

41 f6 86 91 00 00 00 02 testb  $0x2,0x91(%r14)
74 01                   je     81013ce7 
fb                      sti

f6 83 91 00 00 00 02    testb  $0x2,0x91(%rbx)
74 01                   je     81013efa 
fb                      sti

41 f7 c4 00 02 00 00    test   $0x200,%r12d
74 01                   je     8101615d 
fb                      sti

Here we trade 20-cycle POPF for either 4-cycle STI, or a branch (which is either
~1 cycle if predicted, or ~20 cycles if mispredicted).

I still think a dedicated instruction for a conditional STI is worth asking for.

Along the lines of "If bit 9 in the r/m argument is set, then STI, else 
nothing".

What do people from CPU companies say?


[PATCH v5] powerpc: Do not make the entire heap executable

2016-08-22 Thread Denys Vlasenko
On 32-bit powerpc the ELF PLT sections of binaries (built with --bss-plt,
or with a toolchain which defaults to it) look like this:

  [17] .sbss NOBITS  0002aff8 01aff8 14 00  WA  0   0  4
  [18] .plt  NOBITS  0002b00c 01aff8 84 00 WAX  0   0  4
  [19] .bss  NOBITS  0002b090 01aff8 a4 00  WA  0   0  4

Which results in an ELF load header:

  Type   Offset   VirtAddr   PhysAddr   FileSiz MemSiz  Flg Align
  LOAD   0x019c70 0x00029c70 0x00029c70 0x01388 0x014c4 RWE 0x1

This is all correct, the load region containing the PLT is marked as
executable. Note that the PLT starts at 0002b00c but the file mapping ends at
0002aff8, so the PLT falls in the 0 fill section described by the load header,
and after a page boundary.
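
(Spelled out as a small standalone check, using the header values quoted
above and assuming 4K pages:)

#include <stdio.h>

int main(void)
{
	unsigned long vaddr = 0x29c70, filesz = 0x1388, memsz = 0x14c4;
	unsigned long plt = 0x2b00c, plt_size = 0x84, page = 0x1000;

	printf("file-backed data ends at 0x%lx\n", vaddr + filesz); /* 0x2aff8 */
	printf("zero-fill ends at        0x%lx\n", vaddr + memsz);  /* 0x2b134 */
	printf("next page boundary at    0x%lx\n",
	       (vaddr + filesz + page - 1) & ~(page - 1));          /* 0x2b000 */
	printf("PLT occupies             0x%lx-0x%lx\n", plt, plt + plt_size);
	return 0;
}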

Unfortunately the generic ELF loader ignores the X bit in the load headers
when it creates the 0 filled non-file backed mappings. It assumes all of these
mappings are RW BSS sections, which is not the case for PPC.

gcc/ld has an option (--secure-plt) to not do this, this is said to incur
a small performance penalty.

Currently, to support 32-bit binaries with PLT in BSS kernel maps *entire
brk area* with executable rights for all binaries, even --secure-plt ones.

Stop doing that.

Teach the ELF loader to check the X bit in the relevant load header
and create 0 filled anonymous mappings that are executable
if the load header requests that.

The patch was originally posted in 2012 by Jason Gunthorpe
and apparently ignored:

https://lkml.org/lkml/2012/9/30/138

Lightly run-tested.

Signed-off-by: Jason Gunthorpe 
Signed-off-by: Denys Vlasenko 
Reviewed-by: Kees Cook 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: "Aneesh Kumar K.V" 
CC: Kees Cook 
CC: Oleg Nesterov 
CC: Michael Ellerman 
CC: Florian Weimer 
CC: linux...@kvack.org
CC: linuxppc-...@lists.ozlabs.org
CC: linux-kernel@vger.kernel.org
---
Changes since v4:
* if (current->personality & READ_IMPLIES_EXEC), still use VM_EXEC
  for 32-bit executables.

Changes since v3:
* typo fix in commit message
* rebased to current Linus tree

Changes since v2:
* moved capability to map with VM_EXEC into vm_brk_flags()

Changes since v1:
* wrapped lines to not exceed 79 chars
* improved comment
* expanded CC list

 arch/powerpc/include/asm/page.h |  3 ++-
 fs/binfmt_elf.c | 30 ++
 include/linux/mm.h  |  1 +
 mm/mmap.c   | 21 -
 4 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 56398e7..d188f51 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -230,7 +230,9 @@ extern long long virt_phys_offset;
  * and needs to be executable.  This means the whole heap ends
  * up being executable.
  */
-#define VM_DATA_DEFAULT_FLAGS32(VM_READ | VM_WRITE | VM_EXEC | \
+#define VM_DATA_DEFAULT_FLAGS32 \
+   (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) | \
+VM_READ | VM_WRITE | \
 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
 
 #define VM_DATA_DEFAULT_FLAGS64(VM_READ | VM_WRITE | \
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7f6aff3f..2b57b5a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -91,12 +91,18 @@ static struct linux_binfmt elf_format = {
 
 #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
 
-static int set_brk(unsigned long start, unsigned long end)
+static int set_brk(unsigned long start, unsigned long end, int prot)
 {
start = ELF_PAGEALIGN(start);
end = ELF_PAGEALIGN(end);
if (end > start) {
-   int error = vm_brk(start, end - start);
+   /*
+* Map the last of the bss segment.
+* If the header is requesting these pages to be
+* executable, honour that (ppc32 needs this).
+*/
+   int error = vm_brk_flags(start, end - start,
+   prot & PROT_EXEC ? VM_EXEC : 0);
if (error)
return error;
}
@@ -524,6 +530,7 @@ static unsigned long load_elf_interp(struct elfhdr 
*interp_elf_ex,
unsigned long load_addr = 0;
int load_addr_set = 0;
unsigned long last_bss = 0, elf_bss = 0;
+   int bss_prot = 0;
unsigned long error = ~0UL;
unsigned long total_size;
int i;
@@ -606,8 +613,10 @@ static unsigned long load_elf_interp(struct elfhdr 
*interp_elf_ex,
 * elf_bss and last_bss is the bss section.
 */
k = load_addr + eppnt->p_vaddr + eppnt->p_memsz;
-   if (k > last_bss)
+   if (k > last_bss) {
last_bss = k;
+

Re: [PATCH v4] powerpc: Do not make the entire heap executable

2016-08-22 Thread Denys Vlasenko

On 08/21/2016 05:47 PM, Aneesh Kumar K.V wrote:

Denys Vlasenko  writes:


On 32-bit powerpc the ELF PLT sections of binaries (built with --bss-plt,
or with a toolchain which defaults to it) look like this:

  [17] .sbss NOBITS  0002aff8 01aff8 14 00  WA  0   0  4
  [18] .plt  NOBITS  0002b00c 01aff8 84 00 WAX  0   0  4
  [19] .bss  NOBITS  0002b090 01aff8 a4 00  WA  0   0  4

Which results in an ELF load header:

  Type   Offset   VirtAddr   PhysAddr   FileSiz MemSiz  Flg Align
  LOAD   0x019c70 0x00029c70 0x00029c70 0x01388 0x014c4 RWE 0x1

This is all correct, the load region containing the PLT is marked as
executable. Note that the PLT starts at 0002b00c but the file mapping ends at
0002aff8, so the PLT falls in the 0 fill section described by the load header,
and after a page boundary.

Unfortunately the generic ELF loader ignores the X bit in the load headers
when it creates the 0 filled non-file backed mappings. It assumes all of these
mappings are RW BSS sections, which is not the case for PPC.

gcc/ld has an option (--secure-plt) to not do this, this is said to incur
a small performance penalty.

Currently, to support 32-bit binaries with PLT in BSS kernel maps *entire
brk area* with executable rights for all binaries, even --secure-plt ones.



Is this going to break any application ? I am asking because you
mentioned the patch is lightly tested.


I booted powerpc64 machine with RHEL7 installation,
it did not catch fire.


x86 do have a

#define VM_DATA_DEFAULT_FLAGS \
(((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
 VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)

ie, it can force a read implies exec mode. Do we need that ?


powerpc64 never had that. 32-bit mode may need it, since before
this patch all 32-bit tasks were unconditionally getting
VM_DATA_DEFAULT_FLAGS with VM_EXEC bit.

I'll send an updated patch.



Re: [PATCH v4] powerpc: Do not make the entire heap executable

2016-08-19 Thread Denys Vlasenko

On 08/10/2016 03:00 PM, Denys Vlasenko wrote:

On 32-bit powerpc the ELF PLT sections of binaries (built with --bss-plt,
or with a toolchain which defaults to it) look like this:

  [17] .sbss NOBITS  0002aff8 01aff8 14 00  WA  0   0  4
  [18] .plt  NOBITS  0002b00c 01aff8 84 00 WAX  0   0  4
  [19] .bss  NOBITS  0002b090 01aff8 a4 00  WA  0   0  4

Which results in an ELF load header:

  Type   Offset   VirtAddr   PhysAddr   FileSiz MemSiz  Flg Align
  LOAD   0x019c70 0x00029c70 0x00029c70 0x01388 0x014c4 RWE 0x1

This is all correct, the load region containing the PLT is marked as
executable. Note that the PLT starts at 0002b00c but the file mapping ends at
0002aff8, so the PLT falls in the 0 fill section described by the load header,
and after a page boundary.

Unfortunately the generic ELF loader ignores the X bit in the load headers
when it creates the 0 filled non-file backed mappings. It assumes all of these
mappings are RW BSS sections, which is not the case for PPC.

gcc/ld has an option (--secure-plt) to not do this, this is said to incur
a small performance penalty.

Currently, to support 32-bit binaries with PLT in BSS kernel maps *entire
brk area* with executable rights for all binaries, even --secure-plt ones.

Stop doing that.

Teach the ELF loader to check the X bit in the relevant load header
and create 0 filled anonymous mappings that are executable
if the load header requests that.

The patch was originally posted in 2012 by Jason Gunthorpe
and apparently ignored:

https://lkml.org/lkml/2012/9/30/138

Lightly run-tested.



Ping powerpc/mm people.
How does this patch look? Are you taking it?


-static int do_brk(unsigned long addr, unsigned long request)
+static int do_brk_flags(unsigned long addr, unsigned long request, unsigned 
long flags)
 {
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
-   unsigned long flags, len;
+   unsigned long len;
struct rb_node **rb_link, *rb_parent;
pgoff_t pgoff = addr >> PAGE_SHIFT;
int error;
@@ -2668,7 +2668,7 @@ static int do_brk(unsigned long addr, unsigned long 
request)
if (!len)
return 0;

-   flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+   flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;

error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);


Regarding "maybe VM_LOCKED needs to be masked out of flags?"
in the fragment above.

I agree. In a sense that "Yes, maybe. I don't really know
whether mm people feel it is worth the cost."
I'd be happy to send a new version if someone will express
a definite request to add that.


Re: RFC: Petition Intel/AMD to add POPF_IF insn

2016-08-18 Thread Denys Vlasenko

On 08/18/2016 07:24 PM, Linus Torvalds wrote:

That said, your numbers really aren't very convincing. If popf really
is just 10 cycles on modern Intel hardware, it's already fast enough
that I really don't think it matters.


It's 20 cycles. I was wrong in my email, I forgot that the insn count
also counts "push %ebx" insns.

Since I already made a mistake, let me double-check.

200 million iterations of this loop execute under 17 seconds:

  400100:   b8 00 c2 eb 0b  mov$0xbebc200,%eax # 1000*1000*1000
  400105:   9c  pushfq
  400106:   5b  pop%rbx
  400107:   90  nop

00400140 :
  400140:   53  push   %rbx
  400141:   9d  popfq
  400142:   53  push   %rbx
  400143:   9d  popfq
  400144:   53  push   %rbx
  400145:   9d  popfq
  400146:   53  push   %rbx
  400147:   9d  popfq
  400148:   53  push   %rbx
  400149:   9d  popfq
  40014a:   53  push   %rbx
  40014b:   9d  popfq
  40014c:   53  push   %rbx
  40014d:   9d  popfq
  40014e:   53  push   %rbx
  40014f:   9d  popfq
  400150:   53  push   %rbx
  400151:   9d  popfq
  400152:   53  push   %rbx
  400153:   9d  popfq
  400154:   53  push   %rbx
  400155:   9d  popfq
  400156:   53  push   %rbx
  400157:   9d  popfq
  400158:   53  push   %rbx
  400159:   9d  popfq
  40015a:   53  push   %rbx
  40015b:   9d  popfq
  40015c:   ff c8   dec%eax
  40015e:   75 e0   jne400140 

The loop is exactly 32 bytes, aligned.
There are 14 POPFs. Other insns are very fast.

No perf, just "time taskset 1 ./test".
My CPU frequency hovers around 3500 MHz when loaded.

17 seconds is 17*3500 million cycles.
17*3500 million cycles / 200*14 million cycles = 21.25

Thus, one POPF in CPL3 is ~20 cycles on Skylake.


Re: RFC: Petition Intel/AMD to add POPF_IF insn

2016-08-18 Thread Denys Vlasenko



On 08/18/2016 07:47 PM, Denys Vlasenko wrote:

On 08/18/2016 07:24 PM, Linus Torvalds wrote:

That said, your numbers really aren't very convincing. If popf really
is just 10 cycles on modern Intel hardware, it's already fast enough
that I really don't think it matters.


It's 20 cycles. I was wrong in my email, I forgot that the insn count
also counts "push %ebx" insns.

Since I already made a mistake, let me double-check.

200 million iterations of this loop execute under 17 seconds:

  400100:b8 00 c2 eb 0b   mov$0xbebc200,%eax # 1000*1000*1000


Grr. It's 200*1000*1000, not one billion. Sorry.


Re: RFC: Petition Intel/AMD to add POPF_IF insn

2016-08-18 Thread Denys Vlasenko

Of course, somebody really should do timings on modern CPU's (in cpl0,
comparing native_fl() that enables interrupts with a popf)


I didn't do CPL0 tests yet. Realized that cli/sti can be tested in userspace
if we set iopl(3) first.

Surprisingly, STI is slower than CLI. A loop with 27 CLI's and one STI
converges to about ~0.5 insn/cycle:

# compile with: gcc -nostartfiles -nostdlib
_start: .globl  _start
mov $172, %eax #iopl
mov $3, %edi
syscall
mov $200*1000*1000, %eax
.balign 64
loop:
cli;cli;cli;cli
cli;cli;cli;cli
cli;cli;cli;cli
cli;cli;cli;cli

cli;cli;cli;cli
cli;cli;cli;cli
cli;cli;cli;sti
dec %eax
jnz loop

mov $231, %eax #exit_group
syscall

perf stat:
 6,015,787,968  instructions  #0.52  insn per cycle
   3.355474199 seconds time elapsed

With all CLIs replaced by STIs, it's ~0.25 insn/cycle:

 6,030,530,328  instructions  #0.27  insn per cycle
   6.547200322 seconds time elapsed


POPF which needs to enable interrupts is not measurably faster than
one which does not change .IF:

Loop with:
  400158:   fa  cli
  400159:   53  push   %rbx  #saved eflags with if=1
  40015a:   9d  popfq
shows:
 8,908,857,324  instructions  #0.11  insn per cycle 
  ( +-  0.00% )

Loop with:
  400140:   fb  sti
  400141:   53  push   %rbx
  400142:   9d  popfq
shows:
 8,920,243,701  instructions  #0.10  insn per cycle 
  ( +-  0.01% )

Even loop with neither CLI nor STI, only with POPF:
  400140:   53  push   %rbx
  400141:   9d  popfq
shows:
 6,079,936,714  instructions  #0.10  insn per cycle 
  ( +-  0.00% )

This is on a Skylake CPU.


The gist of it:
CLI is 2 cycles,
STI is 4 cycles,
POPF is 10 cycles
seemingly regardless of prior value of EFLAGS.IF.
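
As a cross-check, those per-insn costs can be re-derived from the perf numbers
above (sketch; it assumes 30 instructions per loop iteration - 27 CLI + 1 STI
+ DEC + JNZ, or 28 STI + DEC + JNZ - 200 million iterations, and ~2 cycles of
DEC/JNZ loop overhead):

#include <stdio.h>

int main(void)
{
	double iters = 200e6;

	/* mostly-CLI loop: 6,015,787,968 insns at 0.52 insn/cycle */
	double cli_iter = (6015787968.0 / 0.52) / iters;          /* ~57.8 cycles */
	printf("est. CLI: %.1f cycles\n", (cli_iter - 4 - 2) / 27); /* ~1.9 */

	/* all-STI loop: 6,030,530,328 insns at 0.27 insn/cycle */
	double sti_iter = (6030530328.0 / 0.27) / iters;          /* ~111.7 cycles */
	printf("est. STI: %.1f cycles\n", (sti_iter - 2) / 28);     /* ~3.9 */

	return 0;
}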


Re: RFC: Petition Intel/AMD to add POPF_IF insn

2016-08-18 Thread Denys Vlasenko

On 08/18/2016 11:21 AM, Denys Vlasenko wrote:

Of course, if somebody uses native_restore_fl() to actually *disable*
interrupts (when they weren't already disabled), then this untested
patch will just not work. But why would you do something so stupid?
Famous last words...


Looking around, the vsmp code actually uses "native_restore_fl()" to
not just set the interrupt flag, but AC as well.

And the PV spinlock case has that "push;popf" sequence encoded in an alternate.

So that trivial patch may (or may not - still not tested) work for
some quick testing, but needs more effort for any *real* use.


I'm going to test the attached patch.

...


+# CONFIG_UPROBES is not set
+# CONFIG_SCHED_OMIT_FRAME_POINTER is not set
+# CONFIG_HYPERVISOR_GUEST is not set
+# CONFIG_SYS_HYPERVISOR is not set
+# CONFIG_FRAME_POINTER is not set
+# CONFIG_KMEMCHECK is not set
+# CONFIG_DEBUG_LOCK_ALLOC is not set
+# CONFIG_PROVE_LOCKING is not set
+# CONFIG_LOCK_STAT is not set
+# CONFIG_PROVE_RCU is not set
+# CONFIG_LATENCYTOP is not set
+# CONFIG_FTRACE is not set
+# CONFIG_BINARY_PRINTF is not set


Need also !CONFIG_DEBUG_SPINLOCK, then unpatched irqrestore is:

817115a0 <_raw_spin_unlock_irqrestore>:
817115a0:   c6 07 00movb   $0x0,(%rdi)
817115a3:   56  push   %rsi
817115a4:   9d  popfq
817115a5:   65 ff 0d e4 ad 8f 7edecl   %gs:__preempt_count
817115ac:   c3  retq
817115ad:   0f 1f 00nopl   (%rax)

patched one is

81711660 <_raw_spin_unlock_irqrestore>:
81711660:   f7 c6 00 02 00 00   test   $0x200,%esi
81711666:   c6 07 00movb   $0x0,(%rdi)
81711669:   74 01   je 8171166c 
<_raw_spin_unlock_irqrestore+0xc>
8171166b:   fb  sti
8171166c:   65 ff 0d 1d ad 8f 7edecl   %gs:__preempt_count
81711673:   c3  retq
81711674:   66 90   xchg   %ax,%ax
81711676:   66 2e 0f 1f 84 00 00 00 00 00   nopw   
%cs:0x0(%rax,%rax,1)


Ran the following twice on a quiesced machine:

taskset 1 perf stat -r60 perf bench sched messaging
taskset 1 perf stat -r60 perf bench sched pipe

Out of these four runs, both "perf bench sched pipe" runs show improvements:

-   2648.279829  task-clock (msec) #1.000 CPUs utilized 
   ( +-  0.24% )
+   2483.143469  task-clock (msec) #0.998 CPUs utilized 
   ( +-  0.31% )
- 2,000,002  context-switches  #0.755 M/sec 
   ( +-  0.00% )
+ 2,000,013  context-switches  #0.805 M/sec 
   ( +-  0.00% )
-   547  page-faults   #0.206 K/sec 
   ( +-  0.04% )
+   546  page-faults   #0.220 K/sec 
   ( +-  0.04% )
- 8,723,284,926  cycles#3.294 GHz   
   ( +-  0.06% )
+ 8,157,949,449  cycles#3.285 GHz   
   ( +-  0.07% )
-12,286,937,344  instructions  #1.41  insn per cycle
   ( +-  0.03% )
+12,255,616,405  instructions  #1.50  insn per cycle
   ( +-  0.05% )
- 2,588,839,023  branches  #  977.555 M/sec 
   ( +-  0.02% )
+ 2,599,319,615  branches  # 1046.786 M/sec 
   ( +-  0.04% )
- 3,620,273  branch-misses #0.14% of all branches   
   ( +-  0.67% )
+ 3,577,771  branch-misses #0.14% of all branches   
   ( +-  0.69% )
-   2.648799072 seconds time elapsed
  ( +-  0.24% )
+   2.487452268 seconds time elapsed
  ( +-  0.31% )

Good, we run more insns/cycle, as expected. However, a bit more branches.

But of two "perf bench sched messaging" run, one was slower on a patched kernel,
and perf shows why: more branches, and also branch miss percentage is larger:

-690.008697  task-clock (msec) #0.996 CPUs utilized 
   ( +-  0.45% )
+699.526509  task-clock (msec) #0.994 CPUs utilized 
   ( +-  0.28% )
-26,796  context-switches  #0.039 M/sec 
   ( +-  8.31% )
+29,088  context-switches  #0.042 M/sec 
   ( +-  6.62% )
-35,477  page-faults   #0.051 M/sec 
   ( +-  0.11% )
+35,504  page-faults   #0.051 M/sec 
   ( +-  0.14%

Re: RFC: Petition Intel/AMD to add POPF_IF insn

2016-08-18 Thread Denys Vlasenko

On 08/17/2016 11:35 PM, Linus Torvalds wrote:

On Wed, Aug 17, 2016 at 2:26 PM, Linus Torvalds
 wrote:


Of course, if somebody uses native_restore_fl() to actually *disable*
interrupts (when they weren't already disabled), then this untested
patch will just not work. But why would you do something so stupid?
Famous last words...


Looking around, the vsmp code actually uses "native_restore_fl()" to
not just set the interrupt flag, but AC as well.

And the PV spinlock case has that "push;popf" sequence encoded in an alternate.

So that trivial patch may (or may not - still not tested) work for
some quick testing, but needs more effort for any *real* use.


I'm going to test the attached patch.

PV and debug maze is daunting... I discovered that Fedora kernels
compile with the set of .config options which result in
spin_unlock_irqrestore which looks like this:

818d0f40 <_raw_spin_unlock_irqrestore>:
818d0f40:   e8 1b 31 00 00  callq  818d4060 
<__fentry__>818d0f41: R_X86_64_PC32 __fentry__-0x4
818d0f45:   55  push   %rbp
818d0f46:   48 89 e5mov%rsp,%rbp
818d0f49:   41 54   push   %r12
818d0f4b:   53  push   %rbx
818d0f4c:   48 8b 55 08 mov0x8(%rbp),%rdx
818d0f50:   49 89 fcmov%rdi,%r12
818d0f53:   48 89 f3mov%rsi,%rbx
818d0f56:   48 83 c7 18 add$0x18,%rdi
818d0f5a:   be 01 00 00 00  mov$0x1,%esi
818d0f5f:   e8 8c b8 83 ff  callq  8110c7f0 
  818d0f60: R_X86_64_PC32 lock_release-0x4
818d0f64:   4c 89 e7mov%r12,%rdi
818d0f67:   e8 d4 fb 83 ff  callq  81110b40 
818d0f68: R_X86_64_PC32 do_raw_spin_unlock-0x4
818d0f6c:   f6 c7 02test   $0x2,%bh
818d0f6f:   74 1b   je 818d0f8c 
<_raw_spin_unlock_irqrestore+0x4c>
818d0f71:   e8 aa 9d 83 ff  callq  8110ad20 
 818d0f72: R_X86_64_PC32 trace_hardirqs_on-0x4
818d0f76:   48 89 dfmov%rbx,%rdi
818d0f79:   ff 14 25 48 32 e2 81callq  *0x81e23248  
818d0f7c: R_X86_64_32S  pv_irq_ops+0x8
818d0f80:   65 ff 0d c9 c4 73 7edecl   %gs:0x7e73c4c9(%rip)# 
d450 <__preempt_count> 818d0f83: R_X86_64_PC32 __preempt_count-0x4
818d0f87:   5b  pop%rbx
818d0f88:   41 5c   pop%r12
818d0f8a:   5d  pop%rbp
818d0f8b:   c3  retq
818d0f8c:   48 89 dfmov%rbx,%rdi
818d0f8f:   ff 14 25 48 32 e2 81callq  *0x81e23248  
818d0f92: R_X86_64_32S  pv_irq_ops+0x8
818d0f96:   e8 35 5b 83 ff  callq  81106ad0 
818d0f97: R_X86_64_PC32 trace_hardirqs_off-0x4
818d0f9b:   eb e3   jmp818d0f80 
<_raw_spin_unlock_irqrestore+0x40>
818d0f9d:   0f 1f 00nopl   (%rax)

Gawd... really Fedora? All this is needed?

Testing with _this_ is not going to show any differences, I need to weed
all the cruft out and test a fast, non-debug .config.

Changed it like this:

+# CONFIG_UPROBES is not set
+# CONFIG_SCHED_OMIT_FRAME_POINTER is not set
+# CONFIG_HYPERVISOR_GUEST is not set
+# CONFIG_SYS_HYPERVISOR is not set
+# CONFIG_FRAME_POINTER is not set
+# CONFIG_KMEMCHECK is not set
+# CONFIG_DEBUG_LOCK_ALLOC is not set
+# CONFIG_PROVE_LOCKING is not set
+# CONFIG_LOCK_STAT is not set
+# CONFIG_PROVE_RCU is not set
+# CONFIG_LATENCYTOP is not set
+# CONFIG_FTRACE is not set
+# CONFIG_BINARY_PRINTF is not set

Looks better (it's with the patch already):

00f0 <_raw_spin_unlock_irqrestore>:
  f0:   53  push   %rbx
  f1:   48 89 f3mov%rsi,%rbx
  f4:   e8 00 00 00 00  callq  f9 <_raw_spin_unlock_irqrestore+0x9> 
f5: R_X86_64_PC32   do_raw_spin_unlock-0x4
  f9:   80 e7 02and$0x2,%bh
  fc:   74 01   je ff <_raw_spin_unlock_irqrestore+0xf>
  fe:   fb  sti
  ff:   65 ff 0d 00 00 00 00decl   %gs:0x0(%rip)# 106 
<_raw_spin_unlock_irqrestore+0x16>102: R_X86_64_PC32  
__preempt_count-0x4
 106:   5b  pop%rbx
 107:   c3  retq

This also raises questions. Such as "why do_raw_spin_unlock() is not inlined 
here?"

Anyway... on to more kernel builds and testing.
Please take a look at the below patch. If it's obviously buggy, let me know.


diff -urp linux-4.7.1.org/arch/x86/include/asm/irqflags.h 
linux-4.7.1.obj/arch/x86/include/a
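
(The diff itself is cut off in this archive. For orientation only, the
test+STI style of restore being benchmarked corresponds roughly to the sketch
below - it is what produces the "test $0x200 / je / sti" sequence in the
patched disassembly above, and is not the author's actual patch:)

static inline void native_restore_fl(unsigned long flags)
{
	/* restore only EFLAGS.IF; see the caveat above - this cannot be
	 * used to *disable* interrupts */
	if (flags & X86_EFLAGS_IF)
		native_irq_enable();	/* STI */
}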

Re: RFC: Petition Intel/AMD to add POPF_IF insn

2016-08-17 Thread Denys Vlasenko



On 08/17/2016 09:32 PM, Linus Torvalds wrote:

On Wed, Aug 17, 2016 at 12:26 PM, Denys Vlasenko  wrote:


Exactly. And more:


All of which is ENTIRELY IRRELEVANT.

The 2-page pseudo-code is about behavior. It's trivial to
short-circuit. There are obvious optimizations, which involve just
making the rare cases go slow and have a trivial "if those bits didn't
change, don't go through the expense of testing them".

The problem is that IF is almost certainly right now in that rare case
mask, and so popf is stupidly slow for IF.


I ran the test, see the first email in the thread.

Experimentally, POPF is stupidly slow _always_. 6 cycles
even if none of the "scary" flags are changed.
Either:
*  its microcode has no rare case mask
or
* its microcode is so slow that even fast path is slow,
  and slow path is worse


Re: RFC: Petition Intel/AMD to add POPF_IF insn

2016-08-17 Thread Denys Vlasenko

On 08/17/2016 09:13 PM, Andy Lutomirski wrote:

On Wed, Aug 17, 2016 at 12:01 PM, Linus Torvalds
 wrote:

On Aug 17, 2016 11:41 AM, "Denys Vlasenko"  wrote:


OTOH 5 years will inevitably pass.


Yes. But in five years, maybe we'll have a popf that is faster anyway.

I'd actually prefer that in the end. The problem with popf right now seems
to be mainly that it's effectively serializing and does stupid things in
microcode. It doesn't have to be that way. It could actually do much better,
but it hasn't been a high enough priority for Intel.



It wouldn't surprise me if that were easier said than done.  popf
potentially changes AC, and AC affects address translation.  popf also
potentially changes IOPL, and I don't know whether Intel chips track
IOPL in a way that lets them find all the dependent instructions
without serializing.  But maybe their pipeline is fancy enough.


Exactly. And more:

POPF potentially changes TF (and it works even in CPL3).
POPF changes DF - must serialize versus string insns.
POPF changes NT - must serialize versus IRET insns.
POPF changes VIF, from a different bit in a popped value,
and under a rather complex conditions.

Intel's documentation has a pseudo-code for the instructions.
For POPF, that pseudo-code is two pages long...


Re: RFC: Petition Intel/AMD to add POPF_IF insn

2016-08-17 Thread Denys Vlasenko



On 08/17/2016 07:32 PM, Linus Torvalds wrote:

On Wed, Aug 17, 2016 at 10:20 AM, Denys Vlasenko  wrote:

Last year, a proposal was floated to avoid costly POPF.
In particular, each spin_unlock_irqrestore() does it,
a rather common operation.


Quiet frankly, it takes so long for hw features that I don't think
it's worth it for something like this.


True, it will take some ~5 years for new hardware to become
widely used.

OTOH 5 years will inevitably pass. Just like now 32-bit Linux userspace
is commonly compiled to "i686" instruction set, because
inevitably enough time has passed that you can safely assume
most people run at least Pentium Pro-level CPU, with CMOV, CMPXCHG,
etc.

If the new instruction will be implemented with "REPZ POPF" encoding,
no recompilation or alternatives will be needed for the new kernel
to run on an old CPU. On an old CPU, entire EFLAGS will be restored.
On a new one, only EFLAGS.IF.


Re: RFC: Petition Intel/AMD to add POPF_IF insn

2016-08-17 Thread Denys Vlasenko

On 08/17/2016 07:30 PM, Christian König wrote:

But in my measurements POPF is not fast even in the case where restored
flags are not changed at all:

mov $200*1000*1000, %eax
pushf
pop %rbx
.balign 64
loop:   push%rbx
popf
dec %eax
jnz loop

# perf stat -r20 ./popf_1g
 4,929,012,093  cycles#3.412 GHz
  ( +-  0.02% )
   835,721,371  instructions  #0.17  insn per cycle 
  ( +-  0.02% )
   1.446185359 seconds time elapsed 
 ( +-  0.46% )

If I replace POPF with a pop into an unused register, I get this:


You are comparing apples and bananas here.


Yes, I know. Pop into a register here is basically free.
I'd also add a STI and measure how much it takes to
enable interrupts, but unfortunately STI throws a #GP
in CPL 3.



RFC: Petition Intel/AMD to add POPF_IF insn

2016-08-17 Thread Denys Vlasenko

Last year, a proposal was floated to avoid costly POPF.
In particular, each spin_unlock_irqrestore() does it,
a rather common operation.

https://lkml.org/lkml/2015/4/21/290
[RFC PATCH] x86/asm/irq: Don't use POPF but STI

* Andy Lutomirski  wrote:

> Another different approach would be to formally state that
> pv_irq_ops.save_fl() needs to return all the flags, which would
> make local_irq_save() safe to use in this circumstance, but that
> makes a hotpath longer for the sake of a single boot time check.

...which reminds me:

Why does native_restore_fl restore anything other than IF?  A branch
and sti should be considerably faster than popf.



Ingo agreed:

Yes, this has come up in the past, something like the patch below?

Totally untested and not signed off yet: because we'd first have to
make sure (via irq flags debugging) that it's not used in reverse, to
re-disable interrupts:
local_irq_save(flags);
local_irq_enable();
...
local_irq_restore(flags); /* effective local_irq_disable() */
I don't think we have many (any?) such patterns left, but it has to be
checked first. If we have such cases then we'll have to use different
primitives there.


Linus replied:
=
"popf" is fast for the "no changes to IF" case, and is a smaller
instruction anyway.



This basically shot down the proposal.

But in my measurements POPF is not fast even in the case where restored
flags are not changed at all:

mov $200*1000*1000, %eax
pushf
pop %rbx
.balign 64
loop:   push%rbx
popf
dec %eax
jnz loop

# perf stat -r20 ./popf_1g
 4,929,012,093  cycles#3.412 GHz
  ( +-  0.02% )
   835,721,371  instructions  #0.17  insn per cycle 
  ( +-  0.02% )
   1.446185359 seconds time elapsed 
 ( +-  0.46% )

If I replace POPF with a pop into an unused register, I get this:

loop:   push%rbx
pop %rcx
dec %eax
jnz loop

   209,247,645  cycles#3.209 GHz
  ( +-  0.11% )
   801,898,016  instructions  #3.83  insn per cycle 
  ( +-  0.00% )
   0.066210725 seconds time elapsed 
 ( +-  0.59% )


IOW, POPF takes at least 6 cycles.


Linus does have a point that a "test+branch+STI" may end up not a clear win 
because of the branch.

But the need to restore IF flag exists, it is necessary not only for Linux, but 
for any OS
running on x86: they all have some sort of spinlock.

The addition of a POPF instruction variant which looks only at IF bit and 
changes
only that bit in EFLAGS may be a good idea, for all OSes.

I propose that we ask Intel / AMD to do that.

Maybe by the "ignored prefix" trick which was used when LZCNT insn was 
introduced
as REPZ-prefixed BSR?
Currently, REPZ POPF (f3 9d) insn does execute. Redefine this opcode as
POPF_IF. Then the same kernel will work on old and new CPUs.
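
A minimal illustration of how the kernel could emit that encoding (purely
hypothetical - POPF_IF does not exist; on current CPUs the byte pair below
executes as an ordinary POPF with an ignored REPZ prefix, restoring all of
EFLAGS, which is exactly what makes the trick backward compatible):

static inline void restore_if_via_popf_if(unsigned long flags)
{
	asm volatile("push %0\n\t"
		     ".byte 0xf3, 0x9d"		/* repz popf */
		     : : "g" (flags) : "memory", "cc");
}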

CC'ing some @intel and @amd emails...


Re: [PATCH] uprobes/x86: fix rip-relative handling of EVEX-encoded insns.

2016-08-12 Thread Denys Vlasenko

On 08/12/2016 07:39 AM, Srikar Dronamraju wrote:

* Denys Vlasenko  [2016-08-11 17:45:21]:


Since instruction decoder now supports EVEX-encoded insns, two fixes are needed
to correctly handle them in uprobes.


Could you please add the commit that helps support the insns in the
Changelog.


Sorry, I don't understand the above. Can you rephrase it?


[tip:perf/urgent] uprobes/x86: Fix RIP-relative handling of EVEX-encoded instructions

2016-08-12 Thread tip-bot for Denys Vlasenko
Commit-ID:  68187872c76a96ed4db7bfb064272591f02e208b
Gitweb: http://git.kernel.org/tip/68187872c76a96ed4db7bfb064272591f02e208b
Author: Denys Vlasenko 
AuthorDate: Thu, 11 Aug 2016 17:45:21 +0200
Committer:  Ingo Molnar 
CommitDate: Fri, 12 Aug 2016 08:29:24 +0200

uprobes/x86: Fix RIP-relative handling of EVEX-encoded instructions

Since instruction decoder now supports EVEX-encoded instructions, two fixes
are needed to correctly handle them in uprobes.

Extended bits for MODRM.rm field need to be sanitized just like we do it
for VEX3, to avoid encoding wrong register for register-relative access.

EVEX has _two_ extended bits: b and x. Theoretically, EVEX.x should be
ignored by the CPU (since GPRs go only up to 15, not 31), but let's be
paranoid here: proper encoding for register-relative access
should have EVEX.x = 1.

Secondly, we should fetch vex. for EVEX too.
This is now super easy because instruction decoder populates
vex_prefix.bytes[2] for all flavors of (e)vex encodings, even for VEX2.

Signed-off-by: Denys Vlasenko 
Acked-by: Masami Hiramatsu 
Acked-by: Srikar Dronamraju 
Cc: Alexander Shishkin 
Cc: Andy Lutomirski 
Cc: Arnaldo Carvalho de Melo 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: H. Peter Anvin 
Cc: Jim Keniston 
Cc: Jiri Olsa 
Cc: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Masami Hiramatsu 
Cc: Oleg Nesterov 
Cc: Peter Zijlstra 
Cc: Stephane Eranian 
Cc: Thomas Gleixner 
Cc: Vince Weaver 
Cc: linux-kernel@vger.kernel.org
Cc:  # v4.1+
Fixes: 8a764a875fe3 ("x86/asm/decoder: Create artificial 3rd byte for 2-byte 
VEX")
Link: http://lkml.kernel.org/r/20160811154521.20469-1-dvlas...@redhat.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/uprobes.c | 22 +++---
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 6c1ff31..495c776 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -357,20 +357,22 @@ static void riprel_analyze(struct arch_uprobe *auprobe, 
struct insn *insn)
*cursor &= 0xfe;
}
/*
-* Similar treatment for VEX3 prefix.
-* TODO: add XOP/EVEX treatment when insn decoder supports them
+* Similar treatment for VEX3/EVEX prefix.
+* TODO: add XOP treatment when insn decoder supports them
 */
-   if (insn->vex_prefix.nbytes == 3) {
+   if (insn->vex_prefix.nbytes >= 3) {
/*
 * vex2: c5rLpp   (has no b bit)
 * vex3/xop: c4/8f rxbm wLpp
 * evex: 62rxbR00mm w1pp zllBVaaa
-*   (evex will need setting of both b and x since
-*   in non-sib encoding evex.x is 4th bit of MODRM.rm)
-* Setting VEX3.b (setting because it has inverted meaning):
+* Setting VEX3.b (setting because it has inverted meaning).
+* Setting EVEX.x since (in non-SIB encoding) EVEX.x
+* is the 4th bit of MODRM.rm, and needs the same treatment.
+* For VEX3-encoded insns, VEX3.x value has no effect in
+* non-SIB encoding, the change is superfluous but harmless.
 */
cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1;
-   *cursor |= 0x20;
+   *cursor |= 0x60;
}
 
/*
@@ -415,12 +417,10 @@ static void riprel_analyze(struct arch_uprobe *auprobe, 
struct insn *insn)
 
reg = MODRM_REG(insn);  /* Fetch modrm.reg */
reg2 = 0xff;/* Fetch vex. */
-   if (insn->vex_prefix.nbytes == 2)
-   reg2 = insn->vex_prefix.bytes[1];
-   else if (insn->vex_prefix.nbytes == 3)
+   if (insn->vex_prefix.nbytes)
reg2 = insn->vex_prefix.bytes[2];
/*
-* TODO: add XOP, EXEV  reading.
+* TODO: add XOP  reading.
 *
 * vex. field is in bits 6-3, bits are inverted.
 * But in 32-bit mode, high-order bit may be ignored.


[PATCH] uprobes/x86: fix rip-relative handling of EVEX-encoded insns.

2016-08-11 Thread Denys Vlasenko
Since instruction decoder now supports EVEX-encoded insns, two fixes are needed
to correctly handle them in uprobes.

Extended bits for MODRM.rm field need to be sanitized just like we do it
for VEX3, to avoid encoding wrong register for register-relative access.
EVEX has _two_ extended bits: b and x. Theoretically, EVEX.x should be
ignored by CPU (since GPRs go only up to 15, not 31), but let's be paranoid
here: proper encoding for register-relative access should have EVEX.x = 1.

Secondly, we should fetch vex. for EVEX too.
This is now super easy because insn decoder populates vex_prefix.bytes[2]
for all flavors of (e)vex encodings, even for vex2.
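
For readers not fluent in these encodings: the byte touched below is the
second prefix byte - VEX3 has R X B m m m m m there, EVEX has R X B R' 0 0 m m,
with R/X/B stored inverted. ORing in 0x60 forces bits 6 and 5 (X and B) to
"not extended", so the rewritten register-relative access stays on the
intended low-numbered scratch register instead of r8..r15. Restated as a
commented sketch of the two lines from the hunk (not an additional change):

	/* byte 1 of the VEX3/EVEX prefix; X and B are inverted, so
	 * setting them means "no register extension" */
	cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1;
	*cursor |= 0x60;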

Signed-off-by: Denys Vlasenko 
CC: Jim Keniston 
CC: Masami Hiramatsu 
CC: Srikar Dronamraju 
CC: Ingo Molnar 
CC: Oleg Nesterov 
CC: x...@kernel.org
CC: linux-kernel@vger.kernel.org
---
 arch/x86/kernel/uprobes.c | 22 +++---
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 6c1ff31..a6e6096 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -357,20 +357,22 @@ static void riprel_analyze(struct arch_uprobe *auprobe, 
struct insn *insn)
*cursor &= 0xfe;
}
/*
-* Similar treatment for VEX3 prefix.
-* TODO: add XOP/EVEX treatment when insn decoder supports them
+* Similar treatment for VEX3/EVEX prefix.
+* TODO: add XOP treatment when insn decoder supports them
 */
-   if (insn->vex_prefix.nbytes == 3) {
+   if (insn->vex_prefix.nbytes >= 3) {
/*
 * vex2: c5rLpp   (has no b bit)
 * vex3/xop: c4/8f rxbm wLpp
 * evex: 62rxbR00mm w1pp zllBVaaa
-*   (evex will need setting of both b and x since
-*   in non-sib encoding evex.x is 4th bit of MODRM.rm)
-* Setting VEX3.b (setting because it has inverted meaning):
+* Setting VEX3.b (setting because it has inverted meaning).
+* Setting EVEX.x since (in non-SIB encoding) EVEX.x
+* is the 4th bit of MODRM.rm, and needs the same treatment.
+* For VEX3-encoded insns, VEX3.x value has no effect in
+* non-SIB encoding, the change is superfluous but harmless.
 */
cursor = auprobe->insn + insn_offset_vex_prefix(insn) + 1;
-   *cursor |= 0x20;
+   *cursor |= 0x60;
}
 
/*
@@ -415,12 +417,10 @@ static void riprel_analyze(struct arch_uprobe *auprobe, 
struct insn *insn)
 
reg = MODRM_REG(insn);  /* Fetch modrm.reg */
reg2 = 0xff;/* Fetch vex. */
-   if (insn->vex_prefix.nbytes == 2)
-   reg2 = insn->vex_prefix.bytes[1];
-   else if (insn->vex_prefix.nbytes == 3)
+   if (insn->vex_prefix.nbytes)
reg2 = insn->vex_prefix.bytes[2];
/*
-* TODO: add XOP, EXEV  reading.
+* TODO: add XOP  reading.
 *
 * vex. field is in bits 6-3, bits are inverted.
 * But in 32-bit mode, high-order bit may be ignored.
-- 
2.9.2



Re: [PATCH v3] powerpc: Do not make the entire heap executable

2016-08-10 Thread Denys Vlasenko



On 08/10/2016 06:36 AM, Michael Ellerman wrote:

Denys Vlasenko  writes:


On 32-bit powerpc the ELF PLT sections of binaries (built with --bss-plt,
or with a toolchain which defaults to it) look like this:

...


 arch/powerpc/include/asm/page.h| 10 +-
 arch/powerpc/include/asm/page_32.h |  2 --
 arch/powerpc/include/asm/page_64.h |  4 
 fs/binfmt_elf.c| 34 ++
 include/linux/mm.h |  1 +
 mm/mmap.c  | 20 +++-
 6 files changed, 43 insertions(+), 28 deletions(-)


What tree is this against?


Linus' tree from before August 2.
The "mm: refuse wrapped vm_brk requests" commit collided with my changes
I'll send patch v4 rebased to today's linus tree as soon as I finish testing it.


Re: [PATCH v3] powerpc: Do not make the entire heap executable

2016-08-10 Thread Denys Vlasenko

On 08/10/2016 12:43 AM, Kees Cook wrote:

-static int do_brk(unsigned long addr, unsigned long len)
+static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long 
flags)
 {
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
-   unsigned long flags;
struct rb_node **rb_link, *rb_parent;
pgoff_t pgoff = addr >> PAGE_SHIFT;
int error;
@@ -2666,7 +2665,7 @@ static int do_brk(unsigned long addr, unsigned long len)
if (!len)
return 0;

-   flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+   flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;


For sanity's sake, should a mask be applied here? i.e. to be extra
careful about what flags can get passed in?


Maybe... I am leaving it to mm experts.


Otherwise, this looks okay to me:

Reviewed-by: Kees Cook 

-Kees


[PATCH v4] powerpc: Do not make the entire heap executable

2016-08-10 Thread Denys Vlasenko
On 32-bit powerpc the ELF PLT sections of binaries (built with --bss-plt,
or with a toolchain which defaults to it) look like this:

  [17] .sbss NOBITS  0002aff8 01aff8 14 00  WA  0   0  4
  [18] .plt  NOBITS  0002b00c 01aff8 84 00 WAX  0   0  4
  [19] .bss  NOBITS  0002b090 01aff8 a4 00  WA  0   0  4

Which results in an ELF load header:

  Type   Offset   VirtAddr   PhysAddr   FileSiz MemSiz  Flg Align
  LOAD   0x019c70 0x00029c70 0x00029c70 0x01388 0x014c4 RWE 0x1

This is all correct, the load region containing the PLT is marked as
executable. Note that the PLT starts at 0002b00c but the file mapping ends at
0002aff8, so the PLT falls in the 0 fill section described by the load header,
and after a page boundary.

Unfortunately the generic ELF loader ignores the X bit in the load headers
when it creates the 0 filled non-file backed mappings. It assumes all of these
mappings are RW BSS sections, which is not the case for PPC.

gcc/ld has an option (--secure-plt) to not do this, this is said to incur
a small performance penalty.

Currently, to support 32-bit binaries with PLT in BSS kernel maps *entire
brk area* with executable rights for all binaries, even --secure-plt ones.

Stop doing that.

Teach the ELF loader to check the X bit in the relevant load header
and create 0 filled anonymous mappings that are executable
if the load header requests that.

The patch was originally posted in 2012 by Jason Gunthorpe
and apparently ignored:

https://lkml.org/lkml/2012/9/30/138

Lightly run-tested.

Signed-off-by: Jason Gunthorpe 
Signed-off-by: Denys Vlasenko 
Reviewed-by: Kees Cook 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Kees Cook 
CC: Oleg Nesterov 
CC: Michael Ellerman 
CC: Florian Weimer 
CC: linux...@kvack.org
CC: linuxppc-...@lists.ozlabs.org
CC: linux-kernel@vger.kernel.org
---
Changes since v3:
* typo fix in commit message
* rebased to current Linus tree

Changes since v2:
* moved capability to map with VM_EXEC into vm_brk_flags()

Changes since v1:
* wrapped lines to not exceed 79 chars
* improved comment
* expanded CC list

 arch/powerpc/include/asm/page.h| 10 +-
 arch/powerpc/include/asm/page_32.h |  2 --
 arch/powerpc/include/asm/page_64.h |  4 
 fs/binfmt_elf.c| 30 ++
 include/linux/mm.h |  1 +
 mm/mmap.c  | 21 -
 6 files changed, 40 insertions(+), 28 deletions(-)

diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index 56398e7..42d7ea1 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -225,15 +225,7 @@ extern long long virt_phys_offset;
 #endif
 #endif
 
-/*
- * Unfortunately the PLT is in the BSS in the PPC32 ELF ABI,
- * and needs to be executable.  This means the whole heap ends
- * up being executable.
- */
-#define VM_DATA_DEFAULT_FLAGS32(VM_READ | VM_WRITE | VM_EXEC | \
-VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
-
-#define VM_DATA_DEFAULT_FLAGS64(VM_READ | VM_WRITE | \
+#define VM_DATA_DEFAULT_FLAGS  (VM_READ | VM_WRITE | \
 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
 
 #ifdef __powerpc64__
diff --git a/arch/powerpc/include/asm/page_32.h 
b/arch/powerpc/include/asm/page_32.h
index 6a8e179..6113fa8 100644
--- a/arch/powerpc/include/asm/page_32.h
+++ b/arch/powerpc/include/asm/page_32.h
@@ -9,8 +9,6 @@
 #endif
 #endif
 
-#define VM_DATA_DEFAULT_FLAGS  VM_DATA_DEFAULT_FLAGS32
-
 #ifdef CONFIG_NOT_COHERENT_CACHE
 #define ARCH_DMA_MINALIGN  L1_CACHE_BYTES
 #endif
diff --git a/arch/powerpc/include/asm/page_64.h 
b/arch/powerpc/include/asm/page_64.h
index dd5f071..52d8e9c 100644
--- a/arch/powerpc/include/asm/page_64.h
+++ b/arch/powerpc/include/asm/page_64.h
@@ -159,10 +159,6 @@ do {   \
 
 #endif /* !CONFIG_HUGETLB_PAGE */
 
-#define VM_DATA_DEFAULT_FLAGS \
-   (is_32bit_task() ? \
-VM_DATA_DEFAULT_FLAGS32 : VM_DATA_DEFAULT_FLAGS64)
-
 /*
  * This is the default if a program doesn't have a PT_GNU_STACK
  * program header entry. The PPC64 ELF ABI has a non executable stack
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 7f6aff3f..2b57b5a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -91,12 +91,18 @@ static struct linux_binfmt elf_format = {
 
 #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
 
-static int set_brk(unsigned long start, unsigned long end)
+static int set_brk(unsigned long start, unsigned long end, int prot)
 {
start = ELF_PAGEALIGN(start);
end = ELF_PAGEALIGN(end);
if (end > start) {
-   int error = vm_brk(start, end - start);
+   /*
+* Map the last of the bss segment.
+* If the header is requesting these pages to be
+* executable, honour 
