[PATCH v15 07/23] selftests: vm: pkeys: Add helpers for pkey bits

2019-12-17 Thread Sandipan Das
This introduces some functions that help with setting
or fetching bits of a particular pkey. This also adds
an abstraction for getting a pkey's bit position in
the pkey register as this may vary across architectures.
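
For illustration only (not part of this patch), the intended usage of the
new helpers is roughly the following, assuming the pkey_reg_t type and the
__read_pkey_reg()/__write_pkey_reg() accessors introduced earlier in this
series:

	pkey_reg_t reg, rights;

	/*
	 * Deny writes through pkey 2 while preserving the bits of every
	 * other pkey in the register, then read the rights back.
	 */
	reg = __read_pkey_reg();
	reg = set_pkey_bits(reg, 2, PKEY_DISABLE_WRITE);
	__write_pkey_reg(reg);

	/* Expected to be PKEY_DISABLE_WRITE for pkey 2 */
	rights = get_pkey_bits(__read_pkey_reg(), 2);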

cc: Dave Hansen 
cc: Florian Weimer 
cc: Ram Pai 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/pkey-helpers.h| 23 ++
 tools/testing/selftests/vm/pkey-x86.h|  5 +++
 tools/testing/selftests/vm/protection_keys.c | 32 ++--
 3 files changed, 37 insertions(+), 23 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h
index 2a1a0240f684..bd90a49a3229 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -80,6 +80,29 @@ extern void abort_hooks(void);
 #error Architecture not supported
 #endif /* arch */
 
+#define PKEY_MASK  (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)
+
+static inline pkey_reg_t set_pkey_bits(pkey_reg_t reg, int pkey,
+   pkey_reg_t flags)
+{
+   u32 shift = pkey_bit_position(pkey);
+   /* mask out bits from pkey in old value */
+   reg &= ~((pkey_reg_t)PKEY_MASK << shift);
+   /* OR in new bits for pkey */
+   reg |= (flags & PKEY_MASK) << shift;
+   return reg;
+}
+
+static inline pkey_reg_t get_pkey_bits(pkey_reg_t reg, int pkey)
+{
+   u32 shift = pkey_bit_position(pkey);
+   /*
+* shift down the relevant bits to the lowest two, then
+* mask off all the other higher bits
+*/
+   return ((reg >> shift) & PKEY_MASK);
+}
+
 extern pkey_reg_t shadow_pkey_reg;
 
 static inline pkey_reg_t _read_pkey_reg(int line)
diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/vm/pkey-x86.h
index 5f40901219d3..4937f48f77cc 100644
--- a/tools/testing/selftests/vm/pkey-x86.h
+++ b/tools/testing/selftests/vm/pkey-x86.h
@@ -120,6 +120,11 @@ static inline int cpu_has_pku(void)
return 1;
 }
 
+static inline u32 pkey_bit_position(int pkey)
+{
+   return pkey * PKEY_BITS_PER_PKEY;
+}
+
+#define XSTATE_PKEY_BIT	(9)
+#define XSTATE_PKEY	0x200
 
diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c
index 7e2148662fa4..b474d4fbe92b 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -333,25 +333,13 @@ pid_t fork_lazy_child(void)
 
 static u32 hw_pkey_get(int pkey, unsigned long flags)
 {
-   u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
pkey_reg_t pkey_reg = __read_pkey_reg();
-   pkey_reg_t shifted_pkey_reg;
-   u32 masked_pkey_reg;
 
dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n",
__func__, pkey, flags, 0, 0);
dprintf2("%s() raw pkey_reg: "PKEY_REG_FMT"\n", __func__, pkey_reg);
 
-   shifted_pkey_reg = (pkey_reg >> (pkey * PKEY_BITS_PER_PKEY));
-   dprintf2("%s() shifted_pkey_reg: "PKEY_REG_FMT"\n", __func__,
-   shifted_pkey_reg);
-   masked_pkey_reg = shifted_pkey_reg & mask;
-   dprintf2("%s() masked  pkey_reg: %x\n", __func__, masked_pkey_reg);
-   /*
-* shift down the relevant bits to the lowest two, then
-* mask off all the other high bits.
-*/
-   return masked_pkey_reg;
+   return (u32) get_pkey_bits(pkey_reg, pkey);
 }
 
 static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags)
@@ -363,12 +351,8 @@ static int hw_pkey_set(int pkey, unsigned long rights, 
unsigned long flags)
/* make sure that 'rights' only contains the bits we expect: */
assert(!(rights & ~mask));
 
-   /* copy old pkey_reg */
-   new_pkey_reg = old_pkey_reg;
-   /* mask out bits from pkey in old value: */
-   new_pkey_reg &= ~(mask << (pkey * PKEY_BITS_PER_PKEY));
-   /* OR in new bits for pkey: */
-   new_pkey_reg |= (rights << (pkey * PKEY_BITS_PER_PKEY));
+   /* modify bits accordingly in old pkey_reg and assign it */
+   new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights);
 
__write_pkey_reg(new_pkey_reg);
 
@@ -402,7 +386,7 @@ void pkey_disable_set(int pkey, int flags)
ret = hw_pkey_set(pkey, pkey_rights, syscall_flags);
assert(!ret);
/* pkey_reg and flags have the same format */
-   shadow_pkey_reg |= flags << (pkey * 2);
+   shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights);
dprintf1("%s(%d) shadow: 0x"PKEY_REG_FMT"\n",
__func__, pkey, shadow_pkey_reg);
 
@@ -436,7 +420,7 @@ void pkey_disable_clear(int pkey, int flags)
pkey_rights |= flags;
 
ret = hw_pkey_set(pkey, pkey_rights, 0);
-   shadow_pkey_reg &= ~(flags << (pkey * 2));
+   shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights);
pkey_assert(ret >= 0);
 
pkey_rights = hw_pkey_get(pkey, syscall_flags);
@@ -512,7

[PATCH v15 06/23] selftests/vm/pkeys: Typecast the pkey register

2019-12-17 Thread Sandipan Das
From: Ram Pai 

The size of the pkey register can vary across architectures.
Hence, the pkey_reg_t type is introduced for abstraction in
preparation for multi-arch support.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Signed-off-by: Thiago Jung Bauermann 
Acked-by: Dave Hansen 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/pkey-helpers.h| 23 +++---
 tools/testing/selftests/vm/pkey-x86.h| 16 ++--
 tools/testing/selftests/vm/protection_keys.c | 87 
 3 files changed, 73 insertions(+), 53 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h
index 7f18a82e54fc..2a1a0240f684 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -80,13 +80,14 @@ extern void abort_hooks(void);
 #error Architecture not supported
 #endif /* arch */
 
-extern unsigned int shadow_pkey_reg;
+extern pkey_reg_t shadow_pkey_reg;
 
-static inline unsigned int _read_pkey_reg(int line)
+static inline pkey_reg_t _read_pkey_reg(int line)
 {
-   unsigned int pkey_reg = __read_pkey_reg();
+   pkey_reg_t pkey_reg = __read_pkey_reg();
 
-   dprintf4("read_pkey_reg(line=%d) pkey_reg: %x shadow: %x\n",
+   dprintf4("read_pkey_reg(line=%d) pkey_reg: "PKEY_REG_FMT
+   " shadow: "PKEY_REG_FMT"\n",
line, pkey_reg, shadow_pkey_reg);
assert(pkey_reg == shadow_pkey_reg);
 
@@ -95,15 +96,15 @@ static inline unsigned int _read_pkey_reg(int line)
 
 #define read_pkey_reg() _read_pkey_reg(__LINE__)
 
-static inline void write_pkey_reg(unsigned int pkey_reg)
+static inline void write_pkey_reg(pkey_reg_t pkey_reg)
 {
-   dprintf4("%s() changing %08x to %08x\n", __func__,
+   dprintf4("%s() changing "PKEY_REG_FMT" to "PKEY_REG_FMT"\n", __func__,
__read_pkey_reg(), pkey_reg);
/* will do the shadow check for us: */
read_pkey_reg();
__write_pkey_reg(pkey_reg);
shadow_pkey_reg = pkey_reg;
-   dprintf4("%s(%08x) pkey_reg: %08x\n", __func__,
+   dprintf4("%s("PKEY_REG_FMT") pkey_reg: "PKEY_REG_FMT"\n", __func__,
pkey_reg, __read_pkey_reg());
 }
 
@@ -113,7 +114,7 @@ static inline void write_pkey_reg(unsigned int pkey_reg)
  */
 static inline void __pkey_access_allow(int pkey, int do_allow)
 {
-   unsigned int pkey_reg = read_pkey_reg();
+   pkey_reg_t pkey_reg = read_pkey_reg();
int bit = pkey * 2;
 
if (do_allow)
@@ -121,13 +122,13 @@ static inline void __pkey_access_allow(int pkey, int 
do_allow)
else
		pkey_reg |= (1<<bit);
[...]
	dprintf1(">>>>===SIGSEGV\n");
-   dprintf1("%s()::%d, pkey_reg: 0x%x shadow: %x\n", __func__, __LINE__,
+   dprintf1("%s()::%d, pkey_reg: "PKEY_REG_FMT" shadow: "PKEY_REG_FMT"\n",
+   __func__, __LINE__,
__read_pkey_reg(), shadow_pkey_reg);
 
trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO];
@@ -213,8 +214,9 @@ void signal_handler(int signum, siginfo_t *si, void 
*vucontext)
fpregset = uctxt->uc_mcontext.fpregs;
fpregs = (void *)fpregset;
 
-   dprintf2("%s() trapno: %d ip: 0x%lx info->si_code: %s/%d\n", __func__,
-   trapno, ip, si_code_str(si->si_code), si->si_code);
+   dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n",
+   __func__, trapno, ip, si_code_str(si->si_code),
+   si->si_code);
 #ifdef __i386__
/*
 * 32-bit has some extra padding so that userspace can tell whether
@@ -251,12 +253,13 @@ void signal_handler(int signum, siginfo_t *si, void 
*vucontext)
pkey_assert(siginfo_pkey < NR_PKEYS);
last_si_pkey = siginfo_pkey;
 
-   dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr);
+   dprintf1("signal pkey_reg from xsave: "PKEY_REG_FMT"\n", *pkey_reg_ptr);
/*
 * need __read_pkey_reg() version so we do not do shadow_pkey_reg
 * checking
 */
-   dprintf1("signal pkey_reg from  pkey_reg: %08x\n", __read_pkey_reg());
+   dprintf1("signal pkey_reg from  pkey_reg: "PKEY_REG_FMT"\n",
+   __read_pkey_reg());
dprintf1("pkey from siginfo: %jx\n", siginfo_pkey);
*(u64 *)pkey_reg_ptr = 0x;
dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to 
continue\n");
@@ -331,16 +334,17 @@ pid_t fork_lazy_child(void)
 static u32 hw_pkey_get(int pkey, unsigned long flags)
 {
u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
-   u32 pkey_reg = __read_pkey_reg();
-   u32 shifted_pkey_reg;
+   pkey_reg_t pkey_reg = __read_pkey_reg();
+   pkey_reg_t shifted_pkey_reg;
u32 masked_pkey_reg;
 
dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n",
__func__, pkey, flags, 0, 0);
-   dprintf2("%s() raw pkey_r

[PATCH v15 03/23] selftests/vm/pkeys: Move generic definitions to header file

2019-12-17 Thread Sandipan Das
From: Ram Pai 

This moves all the generic definitions and helper functions
to a header file.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Signed-off-by: Thiago Jung Bauermann 
Acked-by: Dave Hansen 
Signed-off-by: Sandipan Das 
---
 tools/testing/selftests/vm/pkey-helpers.h| 35 +---
 tools/testing/selftests/vm/protection_keys.c | 27 ---
 2 files changed, 30 insertions(+), 32 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h
index d5779be4793f..6ad1bd54ef94 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -13,6 +13,14 @@
 #include 
 #include 
 
+/* Define some kernel-like types */
+#define  u8 uint8_t
+#define u16 uint16_t
+#define u32 uint32_t
+#define u64 uint64_t
+
+#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
+
 #define NR_PKEYS 16
 #define PKEY_BITS_PER_PKEY 2
 
@@ -53,6 +61,18 @@ static inline void sigsafe_printf(const char *format, ...)
 #define dprintf3(args...) dprintf_level(3, args)
 #define dprintf4(args...) dprintf_level(4, args)
 
+extern void abort_hooks(void);
+#define pkey_assert(condition) do {\
+   if (!(condition)) { \
+   dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \
+   __FILE__, __LINE__, \
+   test_nr, iteration_nr); \
+   dprintf0("errno at assert: %d", errno); \
+   abort_hooks();  \
+   exit(__LINE__); \
+   }   \
+} while (0)
+
 extern unsigned int shadow_pkey_reg;
 static inline unsigned int __read_pkey_reg(void)
 {
@@ -137,11 +157,6 @@ static inline void __pkey_write_allow(int pkey, int 
do_allow_write)
dprintf4("pkey_reg now: %08x\n", read_pkey_reg());
 }
 
-#define PROT_PKEY0 0x10/* protection key value (bit 0) */
-#define PROT_PKEY1 0x20/* protection key value (bit 1) */
-#define PROT_PKEY2 0x40/* protection key value (bit 2) */
-#define PROT_PKEY3 0x80/* protection key value (bit 3) */
-
 #define PAGE_SIZE 4096
 #define MB (1<<20)
 
@@ -219,4 +234,14 @@ int pkey_reg_xstate_offset(void)
return xstate_offset;
 }
 
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+#define ALIGN_UP(x, align_to)  (((x) + ((align_to)-1)) & ~((align_to)-1))
+#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
+#define ALIGN_PTR_UP(p, ptr_align_to)  \
+   ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
+#define ALIGN_PTR_DOWN(p, ptr_align_to)\
+   ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to))
+#define __stringify_1(x...) #x
+#define __stringify(x...)   __stringify_1(x)
+
 #endif /* _PKEYS_HELPER_H */
diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c
index 2f4ab81c570d..42ffb58810f2 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -51,31 +51,10 @@ int test_nr;
 unsigned int shadow_pkey_reg;
 
 #define HPAGE_SIZE (1UL<<21)
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
-#define ALIGN_UP(x, align_to)  (((x) + ((align_to)-1)) & ~((align_to)-1))
-#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
-#define ALIGN_PTR_UP(p, ptr_align_to)  ((typeof(p))ALIGN_UP((unsigned 
long)(p),ptr_align_to))
-#define ALIGN_PTR_DOWN(p, ptr_align_to)
((typeof(p))ALIGN_DOWN((unsigned long)(p),  ptr_align_to))
-#define __stringify_1(x...) #x
-#define __stringify(x...)   __stringify_1(x)
-
-#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
 
 int dprint_in_signal;
 char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
 
-extern void abort_hooks(void);
-#define pkey_assert(condition) do {\
-   if (!(condition)) { \
-   dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \
-   __FILE__, __LINE__, \
-   test_nr, iteration_nr); \
-   dprintf0("errno at assert: %d", errno); \
-   abort_hooks();  \
-   exit(__LINE__); \
-   }   \
-} while (0)
-
 void cat_into_file(char *str, char *file)
 {
int fd = open(file, O_RDWR);
@@ -186,12 +165,6 @@ void lots_o_noops_around_write(int *write_to_me)
dprintf3("%s() done\n", __func__);
 }
 
-/* Define some kernel-like types */
-#define  u8 uint8_t
-#define u16 uint16_t
-#define u32 uint32_t
-#define u64 uint64_t
-
 #ifdef __i386__
 
 #ifndef SYS_mprotect_key
-- 
2.17.1



Re: [PATCH 04/14] powerpc/vas: Setup IRQ mapping and register port for each window

2019-12-17 Thread Oliver O'Halloran
On Wed, Nov 27, 2019 at 12:07 PM Haren Myneni  wrote:
>
> *snip*
>
> @@ -36,7 +62,18 @@ static int init_vas_instance(struct platform_device *pdev)
> return -ENODEV;
> }
>
> -   if (pdev->num_resources != 4) {
> +   rc = of_property_read_u64(dn, "ibm,vas-port", &port);
> +   if (rc) {
> +   pr_err("No ibm,vas-port property for %s?\n", pdev->name);
> +   /* No interrupts property */
> +   nresources = 4;
> +   }
> +
> +   /*
> +* interrupts property is available with 'ibm,vas-port' property.
> +* 4 Resources and 1 IRQ if interrupts property is available.
> +*/
> +   if (pdev->num_resources != nresources) {
> pr_err("Unexpected DT configuration for [%s, %d]\n",
> pdev->name, vasid);
> return -ENODEV;

Right, so adding the IRQ in firmware will break the VAS driver in
existing kernels since it changes the resource count. This is IMO a
bug in the VAS driver that you should fix, but it does mean we need to
think twice about having firmware assign an interrupt at boot.

I had a closer look at this series and I'm not convinced that any
firmware changes are actually required either. We already have OPAL
calls for allocating an hwirq for the kernel to use and for getting
the IRQ's XIVE trigger port (see pnv_ocxl_alloc_xive_irq() for an
example). Why not use those here too? Doing so would allow us to
assign interrupts to individual windows too which might be useful for
the windows used by the kernel.


Re: [PATCH v3 3/3] powerpc: Book3S 64-bit "heavyweight" KASAN support

2019-12-17 Thread Daniel Axtens


>>[For those not immersed in ppc64, in real mode, the top nibble or 2 bits
>>(depending on radix/hash mmu) of the address is ignored. The linear
>>mapping is placed at 0xc000. This means that a pointer to
>>part of the linear mapping will work both in real mode, where it will be
>>interpreted as a physical address of the form 0x000..., and out of real
>>mode, where it will go via the linear mapping.]
>>
>
> How does hash or radix mmu mode affect how many bits are ignored in real mode?

Bah, you're picking on details that I picked up from random
conversations in the office rather than from reading the spec! :P

The ISA suggests that the real address space is limited to at most 64
bits. ISAv3, Book III s5.7:

| * Host real address space size is 2^m bytes, m <= 60;
|   see Note 1.
| * Guest real address space size is 2^m bytes, m <= 60;
|   see Notes 1 and 2.
...
| Notes:
| 1. The value of m is implementation-dependent (sub-
|ject to the maximum given above). When used to
|address storage or to represent a guest real
|address, the high-order 60-m bits of the “60-bit”
|real address must be zeros.
| 2. The hypervisor may assign a guest real address
|space size for each partition that uses Radix Tree
|translation. Accesses to guest real storage out-
|side this range but still mappable by the second
|level Radix Tree will cause an HISI or HDSI.
|Accesses to storage outside the mappable range
|will have boundedly undefined results.

However, it doesn't follow from that passage that the top 4 bits are
always ignored when translations are off ('real mode'): see for example
the discussion of the HRMOR in s 5.7.3 and s 5.7.3.1. 

I think I got the 'top 2 bits on radix' thing from the discussion of
'quadrants' in arch/powerpc/include/asm/book3s/64/radix.h, which in turn
is discussed in s 5.7.5.1. Table 20 in particular is really helpful for
understanding it. But it's not especially relevant to what I'm actually
doing here.

I think to fully understand all of what's going on I would need to spend
some serious time with the entirety of s5.7, because there are a lot of
quirks about how storage works! But I think for our purposes it suffices
to say:

  The kernel installs a linear mapping at effective address
  c000... onward. This is a one-to-one mapping with physical memory from
  ... onward. Because of how memory accesses work on powerpc 64-bit
  Book3S, a kernel pointer in the linear map accesses the same memory
  both with translations on (accessing as an 'effective address'), and
  with translations off (accessing as a 'real address'). This works in
  both guests and the hypervisor. For more details, see s5.7 of Book III
  of version 3 of the ISA, in particular the Storage Control Overview,
  s5.7.3, and s5.7.5 - noting that this KASAN implementation currently
  only supports Radix.
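
(Purely as an illustration of that property, and not something from the
patch: the sketch below assumes the conventional book3s64 PAGE_OFFSET of
0xc000000000000000.)

	/* Illustrative sketch only */
	void *lin = (void *)(0xc000000000000000UL + 0x4000UL);

	/*
	 * With translations on, 'lin' is an effective address into the
	 * linear map; with translations off, clearing the top nibble gives
	 * the real (physical) address of the same bytes.
	 */
	unsigned long real = (unsigned long)lin & 0x0fffffffffffffffUL;

	/* real == 0x4000, the physical address backing 'lin' */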

Thanks for your attention to detail!

Regards,
Daniel





Re: [PATCH-tty-testing] tty/serial/8250: Add has_sysrq to plat_serial8250_port

2019-12-17 Thread Greg Kroah-Hartman
On Wed, Dec 18, 2019 at 04:01:11AM +, Dmitry Safonov wrote:
> In contrast to 8250/8250_of, legacy_serial on powerpc does fill
> (struct plat_serial8250_port). The reason is likely that it's done on
> device_initcall(), not on probe. So, 8250_core is not yet probed.
> 
> Propagate the value from platform_device on 8250 probe - in the case of the
> powerpc legacy driver it's initialized at initcall time, and in the case of
> 8250_of it will be initialized later in of_platform_serial_setup().
> 
> Fixes: ea2683bf546c ("tty/serial: Migrate 8250_fsl to use has_sysrq").
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: Michael Ellerman 
> Cc: linuxppc-dev@lists.ozlabs.org
> Reported-by: kbuild test robot 
> Signed-off-by: Dmitry Safonov 
> ---
>  It's probably better to squash this into the 8250_fsl patch.
>  I've added Fixes tag in case the branch won't be rebased.
>  Tested powerpc build manually with ppc64 cross-compiler.

I have squashed this into that original 8250_fsl patch now, and rebased
the series.  Let's see what kbuild does...

thanks,

greg k-h


Re: [PATCH V3 2/2] KVM: PPC: Implement H_SVM_INIT_ABORT hcall

2019-12-17 Thread Paul Mackerras
On Sat, Dec 14, 2019 at 06:12:08PM -0800, Sukadev Bhattiprolu wrote:
> 
> Implement the H_SVM_INIT_ABORT hcall which the Ultravisor can use to
> abort an SVM after it has issued the H_SVM_INIT_START and before the
> H_SVM_INIT_DONE hcalls. This hcall could be used when Ultravisor
> encounters security violations or other errors when starting an SVM.
> 
> Note that this hcall is different from UV_SVM_TERMINATE ucall which
> is used by HV to terminate/cleanup a VM that has become secure.
> 
> The H_SVM_INIT_ABORT should basically undo operations that were done
> since the H_SVM_INIT_START hcall - i.e page-out all the VM pages back
> to normal memory, and terminate the SVM.
> 
> (If we do not bring the pages back to normal memory, the text/data
> of the VM would be stuck in secure memory and since the SVM did not
> go secure, its MSR_S bit will be clear and the VM won't be able to
> access its pages even to do a clean exit).
> 
> Based on patches and discussion with Paul Mackerras, Ram Pai and
> Bharata Rao.
> 
> Signed-off-by: Ram Pai 
> Signed-off-by: Sukadev Bhattiprolu 
> Signed-off-by: Bharata B Rao 

Minor comment below, but not a showstopper.  Also, as Bharata noted
you need to hold the srcu lock for reading.

> + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> + struct kvm_memory_slot *memslot;
> + struct kvm_memslots *slots = __kvm_memslots(kvm, i);
> +
> + if (!slots)
> + continue;
> +
> + kvm_for_each_memslot(memslot, slots)
> + kvmppc_uvmem_drop_pages(memslot, kvm, false);
> + }

Since we use the default KVM_ADDRESS_SPACE_NUM, which is 1, this code
isn't wrong but it is more verbose than it needs to be.  It could be

kvm_for_each_memslot(memslot, kvm_memslots(kvm))
kvmppc_uvmem_drop_pages(memslot, kvm, false);

Paul.


Re: [PATCH V3 1/2] KVM: PPC: Add skip_page_out parameter

2019-12-17 Thread Paul Mackerras
On Sat, Dec 14, 2019 at 06:11:04PM -0800, Sukadev Bhattiprolu wrote:
> 
> This patch is based on Bharata's v11 KVM patches for secure guests:
> https://lists.ozlabs.org/pipermail/linuxppc-dev/2019-November/200918.html
> ---
> 
> From: Sukadev Bhattiprolu 
> Date: Fri, 13 Dec 2019 15:06:16 -0600
> Subject: [PATCH V3 1/2] KVM: PPC: Add skip_page_out parameter
> 
> Add 'skip_page_out' parameter to kvmppc_uvmem_drop_pages() which will
> be needed in a follow-on patch that implements H_SVM_INIT_ABORT hcall.
> 
> Signed-off-by: Sukadev Bhattiprolu 

Reviewed-by: Paul Mackerras 


[PATCH v2 3/3] asm-generic/tlb: Avoid potential double flush

2019-12-17 Thread Aneesh Kumar K.V
From: Peter Zijlstra 

Aneesh reported that:

tlb_flush_mmu()
  tlb_flush_mmu_tlbonly()
tlb_flush() <-- #1
  tlb_flush_mmu_free()
tlb_table_flush()
  tlb_table_invalidate()
tlb_flush_mmu_tlbonly()
  tlb_flush()   <-- #2

does two TLBIs when tlb->fullmm, because __tlb_reset_range() will not
clear tlb->end in that case.

Observe that any caller to __tlb_adjust_range() also sets at least one
of the tlb->freed_tables || tlb->cleared_p* bits, and those are
unconditionally cleared by __tlb_reset_range().

Change the condition for actually issuing TLBI to having one of those
bits set, as opposed to having tlb->end != 0.
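
For reference, a simplified sketch (not part of this change) of why the
fullmm case keeps tlb->end set while the freed_tables/cleared_p* bits are
always reset:

	static inline void __tlb_reset_range(struct mmu_gather *tlb)
	{
		if (tlb->fullmm) {
			/* fullmm: keep the "whole address space" range */
			tlb->start = tlb->end = ~0;
		} else {
			tlb->start = TASK_SIZE;
			tlb->end = 0;
		}
		/* Cleared unconditionally, so these can gate the TLBI */
		tlb->freed_tables = 0;
		tlb->cleared_ptes = 0;
		tlb->cleared_pmds = 0;
		tlb->cleared_puds = 0;
		tlb->cleared_p4ds = 0;
	}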

Reported-by: "Aneesh Kumar K.V" 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Aneesh Kumar K.V 
---
 include/asm-generic/tlb.h | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 9e22ac369d1d..b36b3bef5661 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -402,7 +402,12 @@ tlb_update_vma_flags(struct mmu_gather *tlb, struct 
vm_area_struct *vma) { }
 
 static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 {
-   if (!tlb->end)
+   /*
+* Anything calling __tlb_adjust_range() also sets at least one of
+* these bits.
+*/
+   if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds ||
+ tlb->cleared_puds || tlb->cleared_p4ds))
return;
 
tlb_flush(tlb);
-- 
2.23.0



[PATCH v2 2/3] mm/mmu_gather: Invalidate TLB correctly on batch allocation failure and flush

2019-12-17 Thread Aneesh Kumar K.V
From: Peter Zijlstra 

Architectures for which we have hardware walkers of the Linux page table should
flush the TLB on mmu gather batch allocation failures and batch flush. Some
architectures like POWER support multiple translation modes (hash and radix),
and in the case of POWER only the radix translation mode needs the above TLBI.
This is because for hash translation mode the kernel wants to avoid this extra
flush since there are no hardware walkers of the Linux page table. With radix
translation, the hardware also walks the Linux page table and with that, the
kernel needs to make sure to invalidate the page walk cache in the TLB before
page table pages are freed.

More details in
commit: d86564a2f085 ("mm/tlb, x86/mm: Support invalidating TLB caches for 
RCU_TABLE_FREE")

Fixes: a46cc7a90fd8 ("powerpc/mm/radix: Improve TLB/PWC flushes")
Signed-off-by: Peter Zijlstra (Intel) 
---
 arch/Kconfig|  3 ---
 arch/powerpc/Kconfig|  1 -
 arch/powerpc/include/asm/tlb.h  | 11 +++
 arch/sparc/Kconfig  |  1 -
 arch/sparc/include/asm/tlb_64.h |  9 +
 include/asm-generic/tlb.h   | 22 +++---
 mm/mmu_gather.c | 16 
 7 files changed, 43 insertions(+), 20 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 48b5e103bdb0..208aad121630 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -396,9 +396,6 @@ config HAVE_ARCH_JUMP_LABEL_RELATIVE
 config HAVE_RCU_TABLE_FREE
bool
 
-config HAVE_RCU_TABLE_NO_INVALIDATE
-   bool
-
 config HAVE_MMU_GATHER_PAGE_SIZE
bool
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 04240205f38c..f9970f87612e 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -223,7 +223,6 @@ config PPC
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
select HAVE_RCU_TABLE_FREE
-   select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE
select HAVE_MMU_GATHER_PAGE_SIZE
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if PPC_BOOK3S_64 && 
CPU_LITTLE_ENDIAN
diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index b2c0be93929d..7f3a8b902325 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -26,6 +26,17 @@
 
 #define tlb_flush tlb_flush
 extern void tlb_flush(struct mmu_gather *tlb);
+/*
+ * book3s:
+ * Hash does not use the linux page-tables, so we can avoid
+ * the TLB invalidate for page-table freeing, Radix otoh does use the
+ * page-tables and needs the TLBI.
+ *
+ * nohash:
+ * We still do TLB invalidate in the __pte_free_tlb routine before we
+ * add the page table pages to mmu gather table batch.
+ */
+#define tlb_needs_table_invalidate()   radix_enabled()
 
 /* Get the generic bits... */
 #include 
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index eb24cb1afc11..18e9fb6fcf1b 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -65,7 +65,6 @@ config SPARC64
select HAVE_KRETPROBES
select HAVE_KPROBES
select HAVE_RCU_TABLE_FREE if SMP
-   select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_DYNAMIC_FTRACE
diff --git a/arch/sparc/include/asm/tlb_64.h b/arch/sparc/include/asm/tlb_64.h
index a2f3fa61ee36..8cb8f3833239 100644
--- a/arch/sparc/include/asm/tlb_64.h
+++ b/arch/sparc/include/asm/tlb_64.h
@@ -28,6 +28,15 @@ void flush_tlb_pending(void);
 #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
 #define tlb_flush(tlb) flush_tlb_pending()
 
+/*
+ * SPARC64's hardware TLB fill does not use the Linux page-tables
+ * and therefore we don't need a TLBI when freeing page-table pages.
+ */
+
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+#define tlb_needs_table_invalidate()   (false)
+#endif
+
 #include 
 
 #endif /* _SPARC64_TLB_H */
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 2b10036fefd0..9e22ac369d1d 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -137,13 +137,6 @@
  *  When used, an architecture is expected to provide __tlb_remove_table()
  *  which does the actual freeing of these pages.
  *
- *  HAVE_RCU_TABLE_NO_INVALIDATE
- *
- *  This makes HAVE_RCU_TABLE_FREE avoid calling tlb_flush_mmu_tlbonly() before
- *  freeing the page-table pages. This can be avoided if you use
- *  HAVE_RCU_TABLE_FREE and your architecture does _NOT_ use the Linux
- *  page-tables natively.
- *
  *  MMU_GATHER_NO_RANGE
  *
  *  Use this if your architecture lacks an efficient flush_tlb_range().
@@ -189,8 +182,23 @@ struct mmu_table_batch {
 
 extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
 
+/*
+ * This allows an architecture that does not use the linux page-tables for
+ * hardware to skip the TLBI when freeing page tables.
+ */
+#ifndef tlb_needs_table_invalidate
+#define tlb_needs_table_invalidate() (true)
+#endif
+
+#else
+
+#

[PATCH v2 1/3] powerpc/mmu_gather: Enable RCU_TABLE_FREE even for !SMP case

2019-12-17 Thread Aneesh Kumar K.V
A follow-up patch is going to make sure we correctly invalidate the page walk
cache before we free page table pages. In order to keep things simple, enable
RCU_TABLE_FREE even for !SMP so that we don't have to fix up the !SMP case
differently in the follow-up patch.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/Kconfig | 2 +-
 arch/powerpc/include/asm/book3s/32/pgalloc.h | 8 
 arch/powerpc/include/asm/book3s/64/pgalloc.h | 2 --
 arch/powerpc/include/asm/nohash/pgalloc.h| 8 
 arch/powerpc/mm/book3s64/pgtable.c   | 7 ---
 5 files changed, 1 insertion(+), 26 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1ec34e16ed65..04240205f38c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -222,7 +222,7 @@ config PPC
select HAVE_HARDLOCKUP_DETECTOR_PERFif PERF_EVENTS && 
HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
-   select HAVE_RCU_TABLE_FREE  if SMP
+   select HAVE_RCU_TABLE_FREE
select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE
select HAVE_MMU_GATHER_PAGE_SIZE
select HAVE_REGS_AND_STACK_ACCESS_API
diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h
index 998317702630..dc5c039eb28e 100644
--- a/arch/powerpc/include/asm/book3s/32/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h
@@ -49,7 +49,6 @@ static inline void pgtable_free(void *table, unsigned 
index_size)
 
 #define get_hugepd_cache_index(x)  (x)
 
-#ifdef CONFIG_SMP
 static inline void pgtable_free_tlb(struct mmu_gather *tlb,
void *table, int shift)
 {
@@ -66,13 +65,6 @@ static inline void __tlb_remove_table(void *_table)
 
pgtable_free(table, shift);
 }
-#else
-static inline void pgtable_free_tlb(struct mmu_gather *tlb,
-   void *table, int shift)
-{
-   pgtable_free(table, shift);
-}
-#endif
 
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
  unsigned long address)
diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h
index f6968c811026..a41e91bd0580 100644
--- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
@@ -19,9 +19,7 @@ extern struct vmemmap_backing *vmemmap_list;
 extern pmd_t *pmd_fragment_alloc(struct mm_struct *, unsigned long);
 extern void pmd_fragment_free(unsigned long *);
 extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
-#ifdef CONFIG_SMP
 extern void __tlb_remove_table(void *_table);
-#endif
 void pte_frag_destroy(void *pte_frag);
 
 static inline pgd_t *radix__pgd_alloc(struct mm_struct *mm)
diff --git a/arch/powerpc/include/asm/nohash/pgalloc.h b/arch/powerpc/include/asm/nohash/pgalloc.h
index 332b13b4ecdb..29c43665a753 100644
--- a/arch/powerpc/include/asm/nohash/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/pgalloc.h
@@ -46,7 +46,6 @@ static inline void pgtable_free(void *table, int shift)
 
 #define get_hugepd_cache_index(x)  (x)
 
-#ifdef CONFIG_SMP
 static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int 
shift)
 {
unsigned long pgf = (unsigned long)table;
@@ -64,13 +63,6 @@ static inline void __tlb_remove_table(void *_table)
pgtable_free(table, shift);
 }
 
-#else
-static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int 
shift)
-{
-   pgtable_free(table, shift);
-}
-#endif
-
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
  unsigned long address)
 {
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 75483b40fcb1..2bf7e1b4fd82 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -378,7 +378,6 @@ static inline void pgtable_free(void *table, int index)
}
 }
 
-#ifdef CONFIG_SMP
 void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
 {
unsigned long pgf = (unsigned long)table;
@@ -395,12 +394,6 @@ void __tlb_remove_table(void *_table)
 
return pgtable_free(table, index);
 }
-#else
-void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int index)
-{
-   return pgtable_free(table, index);
-}
-#endif
 
 #ifdef CONFIG_PROC_FS
 atomic_long_t direct_pages_count[MMU_PAGE_COUNT];
-- 
2.23.0



Re: [RFC PATCH 1/2] mm/mmu_gather: Invalidate TLB correctly on batch allocation failure and flush

2019-12-17 Thread Aneesh Kumar K.V
Peter Zijlstra  writes:

> On Tue, Dec 17, 2019 at 04:18:40PM +0530, Aneesh Kumar K.V wrote:
>> On 12/17/19 2:39 PM, Peter Zijlstra wrote:
>> > On Tue, Dec 17, 2019 at 12:47:12PM +0530, Aneesh Kumar K.V wrote:
>> > > Architectures for which we have hardware walkers of Linux page table 
>> > > should
>> > > flush TLB on mmu gather batch allocation failures and batch flush. Some
>> > > architectures like POWER supports multiple translation modes (hash and 
>> > > radix)
>> > > and in the case of POWER only radix translation mode needs the above 
>> > > TLBI.
>> > > This is because for hash translation mode kernel wants to avoid this 
>> > > extra
>> > > flush since there are no hardware walkers of linux page table. With radix
>> > > translation, the hardware also walks linux page table and with that, 
>> > > kernel
>> > > needs to make sure to TLB invalidate page walk cache before page table 
>> > > pages are
>> > > freed.
>> > 
>> > > Based on changes from Peter Zijlstra 
>> > 
>> > AFAICT it is all my patch ;-)
>> 
>> Yes. I moved the changes you had to upstream. I can update the From: in the
>> next version if you are ok with that?
>
> Well, since PPC isn't broken per finding the invalidate in
> __p*_free_tlb(), let's do these things on top of the patches I proposed
> here. Also, you might want to run benchmarks to see if the movement of
> that TLBI actually helps (I'm thinking the cost of the PTESYNC might add
> up).

Upstream ppc64 is broken after the commit: a46cc7a90fd8
("powerpc/mm/radix: Improve TLB/PWC flushes").

Also the patches are not adding any extra TLBI on either radix or hash.

Considering we need to backport this to stable and other distributions,
how about we make these the early patches in your series, before the Kconfig
rename? This should enable stable to pick them up with fewer dependencies.

-aneesh


Re: [PATCH 1/1] kvm/book3s_64: Fixes crash caused by not cleaning vhost IOTLB

2019-12-17 Thread Alexey Kardashevskiy



On 18/12/2019 08:06, Leonardo Bras wrote:
> Fixes a bug that happens when a virtual machine is created without DDW,
> with vhost supporting a virtio-net device.
> 
> In this scenario, an IOMMU with a 32-bit DMA window will possibly map
> IOVAs to different memory addresses.
> 
> As the code works today, the H_STUFF_TCE hypercall will be dealt with only
> by kvm code, which does not invalidate the IOTLB entry in vhost, meaning
> that at some point an old entry can cause an access to a previous
> memory address that the IOVA pointed to.
> 
> Example:
> - virtio-net passes IOVA N to vhost, which points to M1
> - vhost tries IOTLB, but miss
> - vhost translates IOVA N and stores result to IOTLB
> - vhost writes to M1
> - (some IOMMU usage)
> - virtio-net passes IOVA N to vhost, which now points to M2
> - vhost tries IOTLB, and translates IOVA N to M1
> - vhost writes to M1 
> 
> The reason why this error was not so evident is probably because the
> IOTLB was small enough to almost always miss at the point an IOVA was
> reused. Raising the IOTLB size to 32k (which is a module parameter that
> defaults to 2k) is enough to reproduce the bug in +90% of the runs.
> It usually takes less than 10 seconds of netperf to cause this bug
> to happen.
> 
> A few minutes after reproducing this bug, the guest usually crashes.
> 
> Fixing this bug involves cleaning an IOVA entry from the IOTLB.
> The guest kernel triggers this by doing an H_STUFF_TCE hypercall with
> tce_value == 0.
> 
> This change fixes this bug by returning H_TOO_HARD on kvmppc_h_stuff_tce
> when tce_value == 0, which causes kvm to let qemu deal with this.
> In this case, qemu does free the vhost IOTLB entry, which fixes the bug.
> 
> Signed-off-by: Leonardo Bras 
> ---
>  arch/powerpc/kvm/book3s_64_vio.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
> index 883a66e76638..841eff3f6392 100644
> --- a/arch/powerpc/kvm/book3s_64_vio.c
> +++ b/arch/powerpc/kvm/book3s_64_vio.c
> @@ -710,6 +710,9 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
>   if (ret != H_SUCCESS)
>   return ret;
>  
> + if (tce_value == 0)


H_STUFF_TCE is always called with 0. Well, maybe some AIX somewhere
calls it with a value other than zero, and I probably saw some other
value somewhere, but in the QEMU/KVM case it is 0, so you effectively
disable in-kernel acceleration of H_STUFF_TCE, which is undesirable.

For now we should disable in-kernel H_STUFF_TCE/... handlers in QEMU
just like we do for VFIO for older host kernels:

https://git.qemu.org/?p=qemu.git;a=blob;f=hw/ppc/spapr_iommu.c;h=3d3bcc86496a5277d62f7855fbb09c013c015f27;hb=HEAD#l208

I am not sure what a proper solution would be; perhaps something like an
eventfd, with KVM's kvmppc_h_stuff_tce() signalling vhost that the latter
needs to invalidate IOTLBs. Or we can just say that we do not allow KVM
acceleration if there is vhost+iommu on the same liobn (== vPHB, pretty
much). Thanks,



> + return H_TOO_HARD;
> +
>   /* Check permission bits only to allow userspace poison TCE for debug */
>   if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ))
>   return H_PARAMETER;
> 

-- 
Alexey


Re: [PATCH v3 3/3] powerpc: Book3S 64-bit "heavyweight" KASAN support

2019-12-17 Thread Daniel Axtens
Daniel Axtens  writes:

> Hi Christophe,
>
> I'm working through your feedback, thank you. Regarding this one:
>
>>> --- a/arch/powerpc/kernel/process.c
>>> +++ b/arch/powerpc/kernel/process.c
>>> @@ -2081,7 +2081,14 @@ void show_stack(struct task_struct *tsk, unsigned 
>>> long *stack)
>>> /*
>>>  * See if this is an exception frame.
>>>  * We look for the "regshere" marker in the current frame.
>>> +*
>>> +* KASAN may complain about this. If it is an exception frame,
>>> +* we won't have unpoisoned the stack in asm when we set the
>>> +* exception marker. If it's not an exception frame, who knows
>>> +* how things are laid out - the shadow could be in any state
>>> +* at all. Just disable KASAN reporting for now.
>>>  */
>>> +   kasan_disable_current();
>>> if (validate_sp(sp, tsk, STACK_INT_FRAME_SIZE)
>>> && stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) {
>>> struct pt_regs *regs = (struct pt_regs *)
>>> @@ -2091,6 +2098,7 @@ void show_stack(struct task_struct *tsk, unsigned 
>>> long *stack)
>>>regs->trap, (void *)regs->nip, (void *)lr);
>>> firstframe = 1;
>>> }
>>> +   kasan_enable_current();
>>
>> If this is really a concern for all targets including PPC32, should it 
>> be a separate patch with a Fixes: tag to be applied back in stable as well ?
>
> I've managed to repro this by commenting out the kasan_disable/enable
> lines, and just booting in qemu without a disk attached:
>
> sudo qemu-system-ppc64 -accel kvm -m 2G -M pseries -cpu power9  -kernel 
> ./vmlinux  -nographic -chardev stdio,id=charserial0,mux=on -device 
> spapr-vty,chardev=charserial0,reg=0x3000  -mon 
> chardev=charserial0,mode=readline -nodefaults -smp 2 
>
> ...
>
> [0.210740] Kernel panic - not syncing: VFS: Unable to mount root fs on 
> unknown-block(0,0)
> [0.210789] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 
> 5.5.0-rc1-next-20191213-16824-g469a24fbdb34 #12
> [0.210844] Call Trace:
> [0.210866] [c0006a4839b0] [c1f74f48] dump_stack+0xfc/0x154 
> (unreliable)
> [0.210915] [c0006a483a00] [c025411c] panic+0x258/0x59c
> [0.210958] [c0006a483aa0] [c24870b0] 
> mount_block_root+0x648/0x7ac
> [0.211005] 
> ==
> [0.211054] BUG: KASAN: stack-out-of-bounds in show_stack+0x438/0x580
> [0.211095] Read of size 8 at addr c0006a483b00 by task swapper/0/1
> [0.211134] 
> [0.211152] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 
> 5.5.0-rc1-next-20191213-16824-g469a24fbdb34 #12
> [0.211207] Call Trace:
> [0.211225] [c0006a483680] [c1f74f48] dump_stack+0xfc/0x154 
> (unreliable)
> [0.211274] [c0006a4836d0] [c08f877c] 
> print_address_description.isra.10+0x7c/0x470
> [0.211330] [c0006a483760] [c08f8e7c] 
> __kasan_report+0x1bc/0x244
> [0.211380] [c0006a483830] [c08f6eb8] kasan_report+0x18/0x30
> [0.211422] [c0006a483850] [c08fa5d4] 
> __asan_report_load8_noabort+0x24/0x40
> [0.211471] [c0006a483870] [c003d448] show_stack+0x438/0x580
> [0.211512] [c0006a4839b0] [c1f74f48] dump_stack+0xfc/0x154
> [0.211553] [c0006a483a00] [c025411c] panic+0x258/0x59c
> [0.211595] [c0006a483aa0] [c24870b0] 
> mount_block_root+0x648/0x7ac
> [0.211644] [c0006a483be0] [c2487784] 
> prepare_namespace+0x1ec/0x240
> [0.211694] [c0006a483c60] [c248669c] 
> kernel_init_freeable+0x7f4/0x870
> [0.211745] [c0006a483da0] [c0011f30] kernel_init+0x3c/0x15c
> [0.211787] [c0006a483e20] [c000bebc] 
> ret_from_kernel_thread+0x5c/0x80
> [0.211834] 
> [0.211851] Allocated by task 0:
> [0.211878]  save_stack+0x2c/0xe0
> [0.211904]  __kasan_kmalloc.isra.16+0x11c/0x150
> [0.211937]  kmem_cache_alloc_node+0x114/0x3b0
> [0.211971]  copy_process+0x5b8/0x6410
> [0.211996]  _do_fork+0x130/0xbf0
> [0.212022]  kernel_thread+0xdc/0x130
> [0.212047]  rest_init+0x44/0x184
> [0.212072]  start_kernel+0x77c/0x7dc
> [0.212098]  start_here_common+0x1c/0x20
> [0.212122] 
> [0.212139] Freed by task 0:
> [0.212163] (stack is not available)
> [0.212187] 
> [0.212205] The buggy address belongs to the object at c0006a48
> [0.212205]  which belongs to the cache thread_stack of size 16384
> [0.212285] The buggy address is located 15104 bytes inside of
> [0.212285]  16384-byte region [c0006a48, c0006a484000)
> [0.212356] The buggy address belongs to the page:
> [0.212391] page:c00c001a9200 refcount:1 mapcount:0 
> mapping:c0006a019e00 index:0x0 compound_mapcount: 0
> [0.212455] raw: 00710

[PATCH 2/2] powerpc/pseries/svm: Disable PMUs in SVMs

2019-12-17 Thread Sukadev Bhattiprolu
For now, disable hardware PMU facilities in secure virtual
machines (SVMs) to prevent any information leak between SVMs
and the (untrusted) HV.

With this, a simple 'myperf' program that uses the perf_event_open()
syscall fails for SVMs (with the corresponding fix to UV). In normal VMs
and on the bare-metal HV, the syscall and performance counters work.
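
The assembly below simply open-codes an MSR[S] check before initialising
the PMU SPRs; as a rough C-level equivalent (illustrative only, not part of
the patch):

	/* Sketch: skip PMU SPR initialisation when running as a secure VM */
	if (mfmsr() & MSR_S)
		return;

	mtspr(SPRN_MMCRA, 0);
	mtspr(SPRN_MMCR0, 0);
	mtspr(SPRN_MMCR1, 0);
	mtspr(SPRN_MMCR2, 0);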

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/kernel/cpu_setup_power.S | 22 ++
 arch/powerpc/perf/core-book3s.c   |  6 ++
 2 files changed, 28 insertions(+)

diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
index a460298c7ddb..d5eb06e20b5a 100644
--- a/arch/powerpc/kernel/cpu_setup_power.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -206,14 +206,36 @@ __init_PMU_HV_ISA207:
blr
 
 __init_PMU:
+#ifdef CONFIG_PPC_SVM
+   /*
+* For now, SVM's are restricted from accessing PMU
+* features, so skip accordingly.
+*/
+   mfmsr   r5
+   rldicl  r5, r5, 64-MSR_S_LG, 62
+   cmpwi   r5,1
+   beq skip1
+#endif
li  r5,0
mtspr   SPRN_MMCRA,r5
mtspr   SPRN_MMCR0,r5
mtspr   SPRN_MMCR1,r5
mtspr   SPRN_MMCR2,r5
+skip1:
blr
 
 __init_PMU_ISA207:
+#ifdef CONFIG_PPC_SVM
+   /*
+* For now, SVM's are restricted from accessing PMU
+* features, so skip accordingly.
+*/
+   mfmsr   r5
+   rldicl  r5, r5, 64-MSR_S_LG, 62
+   cmpwi   r5,1
+   beq skip2
+#endif
li  r5,0
mtspr   SPRN_MMCRS,r5
+skip2:
blr
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 4e76b2251801..9e6a9f1803f6 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2275,6 +2275,12 @@ static int power_pmu_prepare_cpu(unsigned int cpu)
 
 int register_power_pmu(struct power_pmu *pmu)
 {
+   /*
+* PMU events are not currently supported in SVMs
+*/
+   if (is_secure_guest())
+   return -ENOSYS;
+
if (ppmu)
return -EBUSY;  /* something's already registered */
 
-- 
2.17.2



[PATCH 1/2] powerpc/pseries/svm: Don't access some SPRs

2019-12-17 Thread Sukadev Bhattiprolu
Ultravisor disables some CPU features like EBB and BHRB in the HFSCR
for secure virtual machines (SVMs). If the SVMs attempt to access
related registers, they will get a Program Interrupt.

Use macros/wrappers to skip accessing EBB and BHRB registers in secure
VMs.

Signed-off-by: Sukadev Bhattiprolu 
---
---
 arch/powerpc/include/asm/reg.h  | 35 ++
 arch/powerpc/kernel/process.c   | 12 +++
 arch/powerpc/kvm/book3s_hv.c| 24 ++---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 48 ++---
 arch/powerpc/kvm/book3s_hv_tm_builtin.c |  6 ++--
 arch/powerpc/perf/core-book3s.c |  5 +--
 arch/powerpc/xmon/xmon.c|  2 +-
 7 files changed, 96 insertions(+), 36 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index b3cbb1136bce..026eb20f6d13 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -1379,6 +1379,41 @@ static inline void msr_check_and_clear(unsigned long 
bits)
__msr_check_and_clear(bits);
 }
 
+#ifdef CONFIG_PPC_SVM
+/*
+ * Move from some "restricted" sprs.
+ * Secure VMs should not access some registers as the related features
+ * are disabled in the CPU. If an SVM is attempting read from the given
+ * SPR, return 0. Otherwise behave like a normal mfspr.
+ */
+#define mfspr_r(rn)\
+({ \
+   unsigned long rval = 0ULL;  \
+   \
+   if (!(mfmsr() & MSR_S)) \
+   asm volatile("mfspr %0," __stringify(rn)\
+   : "=r" (rval)); \
+   rval;   \
+})
+
+/*
+ * Move to some "restricted" sprs.
+ * Secure VMs should not access some registers as the related features
+ * are disabled in the CPU. If an SVM is attempting write to the given
+ * SPR, ignore the write. Otherwise behave like a normal mtspr.
+ */
+#define mtspr_r(rn, v) \
+({ \
+   if (!(mfmsr() & MSR_S)) \
+   asm volatile("mtspr " __stringify(rn) ",%0" :   \
+: "r" ((unsigned long)(v)) \
+: "memory");   \
+})
+#else
+#define mfspr_r	mfspr
+#define mtspr_r	mtspr
+#endif
+
 #ifdef __powerpc64__
 #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E)
 #define mftb() ({unsigned long rval;   \
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 639ceae7da9d..9a691452ea3b 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1059,9 +1059,9 @@ static inline void save_sprs(struct thread_struct *t)
t->dscr = mfspr(SPRN_DSCR);
 
if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
-   t->bescr = mfspr(SPRN_BESCR);
-   t->ebbhr = mfspr(SPRN_EBBHR);
-   t->ebbrr = mfspr(SPRN_EBBRR);
+   t->bescr = mfspr_r(SPRN_BESCR);
+   t->ebbhr = mfspr_r(SPRN_EBBHR);
+   t->ebbrr = mfspr_r(SPRN_EBBRR);
 
t->fscr = mfspr(SPRN_FSCR);
 
@@ -1098,11 +1098,11 @@ static inline void restore_sprs(struct thread_struct 
*old_thread,
 
if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
if (old_thread->bescr != new_thread->bescr)
-   mtspr(SPRN_BESCR, new_thread->bescr);
+   mtspr_r(SPRN_BESCR, new_thread->bescr);
if (old_thread->ebbhr != new_thread->ebbhr)
-   mtspr(SPRN_EBBHR, new_thread->ebbhr);
+   mtspr_r(SPRN_EBBHR, new_thread->ebbhr);
if (old_thread->ebbrr != new_thread->ebbrr)
-   mtspr(SPRN_EBBRR, new_thread->ebbrr);
+   mtspr_r(SPRN_EBBRR, new_thread->ebbrr);
 
if (old_thread->fscr != new_thread->fscr)
mtspr(SPRN_FSCR, new_thread->fscr);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 709cf1fd4cf4..dba21b0e1d22 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3568,9 +3568,9 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 
time_limit,
mtspr(SPRN_PSPB, vcpu->arch.pspb);
mtspr(SPRN_FSCR, vcpu->arch.fscr);
mtspr(SPRN_TAR, vcpu->arch.tar);
-   mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
-   mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
-   mtspr(SPRN_BESCR, vcpu->arch.bescr);
+   mtspr_r(SPRN_EBBHR, vcpu->arch.ebbhr);
+   mtspr_r(SPRN_EBBRR, vcpu->arch.ebbrr);
+   mtspr_r(SPRN_BESCR, vcpu->arch.bescr);
mtspr(SPRN_WOR

Re: [PATCH v2] powerpc/pseries/cmm: fix managed page counts when migrating pages between zones

2019-12-17 Thread Michael Ellerman
On Mon, 2019-12-16 at 10:30:58 UTC, David Hildenbrand wrote:
> Commit 63341ab03706 (virtio-balloon: fix managed page counts when migrating
> pages between zones) fixed a long existing BUG in the virtio-balloon
> driver when pages would get migrated between zones.  I did not try to
> reproduce on powerpc, but looking at the code, the same should apply to
> powerpc/cmm ever since it started using the balloon compaction
> infrastructure (luckily just recently).
> 
> In case we have to migrate a balloon page to a newpage of another zone, the
> managed page count of both zones is wrong. Paired with memory offlining
> (which will adjust the managed page count), we can trigger kernel crashes
> and all kinds of different symptoms.
> 
> Fix it by properly adjusting the managed page count when migrating if
> the zone changed.
> 
> We'll temporarily modify the totalram page count. If this ever becomes a
> problem, we can fine tune by providing helpers that don't touch
> the totalram pages (e.g., adjust_zone_managed_page_count()).
> 
> Fixes: fe030c9b85e6 ("powerpc/pseries/cmm: Implement balloon compaction")
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: Michael Ellerman 
> Cc: Andrew Morton 
> Cc: Richard Fontana 
> Cc: Greg Kroah-Hartman 
> Cc: Arun KS 
> Cc: Thomas Gleixner 
> Cc: linuxppc-dev@lists.ozlabs.org
> Signed-off-by: David Hildenbrand 

Applied to powerpc fixes, thanks.

https://git.kernel.org/powerpc/c/e352f576d345e5bf1fb62c8559851448a6c1d9cd

cheers


Re: [PATCH v2] powerpc: Fix __clear_user() with KUAP enabled

2019-12-17 Thread Michael Ellerman
On Mon, 2019-12-09 at 13:22:21 UTC, Andrew Donnellan wrote:
> The KUAP implementation adds calls in clear_user() to enable and disable
> access to userspace memory. However, it doesn't add these to
> __clear_user(), which is used in the ptrace regset code.
> 
> As there's only one direct user of __clear_user(), and the time taken to
> set the AMR for KUAP purposes is going to dominate the cost of a quick
> access_ok(), there's not much point having a separate path.
> 
> Rename __clear_user() to clear_user_asm(), and make __clear_user() just
> call clear_user().
> 
> Reported-by: syzbot+f25ecf4b2982d8c7a...@syzkaller-ppc64.appspotmail.com
> Reported-by: Daniel Axtens 
> Suggested-by: Michael Ellerman 
> Cc: Christophe Leroy 
> Cc: Russell Currey 
> Fixes: de78a9c42a79 ("powerpc: Add a framework for Kernel Userspace Access 
> Protection")
> Signed-off-by: Andrew Donnellan 

Applied to powerpc fixes, thanks.

https://git.kernel.org/powerpc/c/61e3acd8c693a14fc69b824cb5b08d02cb90a6e7

cheers


Re: [PATCH] powerpc/8xx: fix bogus __init on mmu_mapin_ram_chunk()

2019-12-17 Thread Michael Ellerman
On Sat, 2019-12-14 at 08:10:29 UTC, Christophe Leroy wrote:
> Remove __init qualifier for mmu_mapin_ram_chunk() as it is called by
> mmu_mark_initmem_nx() and mmu_mark_rodata_ro() which are not __init
> functions.
> 
> At the same time, mark it static as it is only used in this file.
> 
> Reported-by: kbuild test robot 
> Fixes: a2227a277743 ("powerpc/32: Don't populate page tables for block mapped 
> pages except on the 8xx")
> Signed-off-by: Christophe Leroy 

Applied to powerpc fixes, thanks.

https://git.kernel.org/powerpc/c/0601546f23fb70d84b807e73cfe8e789d054c98d

cheers


Re: [PATCH] powerpc/irq: fix stack overflow verification

2019-12-17 Thread Michael Ellerman
On Mon, 2019-12-09 at 06:19:08 UTC, Christophe Leroy wrote:
> Before commit 0366a1c70b89 ("powerpc/irq: Run softirqs off the top of
> the irq stack"), check_stack_overflow() was called by do_IRQ(), before
> switching to the irq stack.
> In that commit, do_IRQ() was renamed __do_irq(), and is now executing
> on the irq stack, so check_stack_overflow() has just become almost
> useless.
> 
> Move check_stack_overflow() call in do_IRQ() to do the check while
> still on the current stack.
> 
> Fixes: 0366a1c70b89 ("powerpc/irq: Run softirqs off the top of the irq stack")
> Cc: sta...@vger.kernel.org
> Signed-off-by: Christophe Leroy 

Applied to powerpc fixes, thanks.

https://git.kernel.org/powerpc/c/099bc4812f09155da77eeb960a983470249c9ce1

cheers


Re: [PATCH v3] ocxl: Fix potential memory leak on context creation

2019-12-17 Thread Michael Ellerman
On Mon, 2019-12-09 at 10:55:13 UTC, Frederic Barrat wrote:
> If we couldn't fully init a context, we were leaking memory.
> 
> Fixes: b9721d275cc2 ("ocxl: Allow external drivers to use OpenCAPI contexts")
> Signed-off-by: Frederic Barrat 

Applied to powerpc fixes, thanks.

https://git.kernel.org/powerpc/c/913e73c77d48aeeb50c16450a653dca9c71ae2e2

cheers


Re: [PATCH] powerpc: ensure that swiotlb buffer is allocated from low memory

2019-12-17 Thread Michael Ellerman
On Wed, 2019-12-04 at 12:35:24 UTC, Mike Rapoport wrote:
> From: Mike Rapoport 
> 
> Some powerpc platforms (e.g. 85xx) limit DMA-able memory way below 4G. If a
> system has more physical memory than this limit, the swiotlb buffer is not
> addressable because it is allocated from memblock using top-down mode.
> 
> Force memblock to bottom-up mode before calling swiotlb_init() to ensure
> that the swiotlb buffer is DMA-able.
> 
> Link: 
> https://lkml.kernel.org/r/f1ebb706-73df-430e-9020-c214ec8ed...@xenosoft.de
> Reported-by: Christian Zigotzky 
> Signed-off-by: Mike Rapoport 

Applied to powerpc fixes, thanks.

https://git.kernel.org/powerpc/c/8fabc623238e68b3ac63c0dd1657bf86c1fa33af

cheers


[PATCH-tty-testing] tty/serial/8250: Add has_sysrq to plat_serial8250_port

2019-12-17 Thread Dmitry Safonov
In contrast to 8250/8250_of, legacy_serial on powerpc does fill
(struct plat_serial8250_port). The reason is likely that it's done on
device_initcall(), not on probe. So, 8250_core is not yet probed.

Propagate the value from platform_device on 8250 probe - in the case of the
powerpc legacy driver it's initialized at initcall time, and in the case of
8250_of it will be initialized later in of_platform_serial_setup().

Fixes: ea2683bf546c ("tty/serial: Migrate 8250_fsl to use has_sysrq").
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
Reported-by: kbuild test robot 
Signed-off-by: Dmitry Safonov 
---
 It's probably better to squash this into the 8250_fsl patch.
 I've added Fixes tag in case the branch won't be rebased.
 Tested powerpc build manually with ppc64 cross-compiler.

 drivers/tty/serial/8250/8250_core.c | 1 +
 include/linux/serial_8250.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c
index e682390ce0de..0894a22fd702 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -816,6 +816,7 @@ static int serial8250_probe(struct platform_device *dev)
uart.port.flags = p->flags;
uart.port.mapbase   = p->mapbase;
uart.port.hub6  = p->hub6;
+   uart.port.has_sysrq = p->has_sysrq;
uart.port.private_data  = p->private_data;
uart.port.type  = p->type;
uart.port.serial_in = p->serial_in;
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index bb2bc99388ca..6a8e8c48c882 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -25,6 +25,7 @@ struct plat_serial8250_port {
unsigned char   regshift;   /* register shift */
unsigned char   iotype; /* UPIO_* */
unsigned char   hub6;
+   unsigned char   has_sysrq;  /* supports magic SysRq */
upf_t   flags;  /* UPF_* flags */
unsigned inttype;   /* If UPF_FIXED_TYPE */
unsigned int(*serial_in)(struct uart_port *, int);
-- 
2.24.1



Re: [PATCH 01/10] soc: sunxi: convert to devm_platform_ioremap_resource

2019-12-17 Thread Chen-Yu Tsai
On Sun, Dec 15, 2019 at 1:54 AM Yangtao Li  wrote:
>
> Use devm_platform_ioremap_resource() to simplify code.
>
> Signed-off-by: Yangtao Li 

Acked-by: Chen-Yu Tsai 

> ---
>  drivers/soc/sunxi/sunxi_sram.c | 4 +---
>  1 file changed, 1 insertion(+), 3 deletions(-)
>
> diff --git a/drivers/soc/sunxi/sunxi_sram.c b/drivers/soc/sunxi/sunxi_sram.c
> index 1b0d50f36349..f73fbcc73f51 100644
> --- a/drivers/soc/sunxi/sunxi_sram.c
> +++ b/drivers/soc/sunxi/sunxi_sram.c
> @@ -320,7 +320,6 @@ static struct regmap_config sunxi_sram_emac_clock_regmap 
> = {
>
>  static int sunxi_sram_probe(struct platform_device *pdev)
>  {
> -   struct resource *res;
> struct dentry *d;
> struct regmap *emac_clock;
> const struct sunxi_sramc_variant *variant;
> @@ -331,8 +330,7 @@ static int sunxi_sram_probe(struct platform_device *pdev)
> if (!variant)
> return -EINVAL;
>
> -   res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
> -   base = devm_ioremap_resource(&pdev->dev, res);
> +   base = devm_platform_ioremap_resource(pdev, 0);
> if (IS_ERR(base))
> return PTR_ERR(base);
>
> --
> 2.17.1
>


[PATCH] powerpc/setup_64: use DEFINE_DEBUGFS_ATTRIBUTE to define fops_rfi_flush

2019-12-17 Thread Chen Zhou
Use DEFINE_DEBUGFS_ATTRIBUTE rather than DEFINE_SIMPLE_ATTRIBUTE for
debugfs files.

Semantic patch information:
Rationale: DEFINE_SIMPLE_ATTRIBUTE + debugfs_create_file()
imposes some significant overhead as compared to
DEFINE_DEBUGFS_ATTRIBUTE + debugfs_create_file_unsafe().

Signed-off-by: Chen Zhou 
---
 arch/powerpc/kernel/setup_64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 6104917..4b9fbb2 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -956,11 +956,11 @@ static int rfi_flush_get(void *data, u64 *val)
return 0;
 }
 
-DEFINE_SIMPLE_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, 
"%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, 
"%llu\n");
 
 static __init int rfi_flush_debugfs_init(void)
 {
-   debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, 
&fops_rfi_flush);
+   debugfs_create_file_unsafe("rfi_flush", 0600, powerpc_debugfs_root, 
NULL, &fops_rfi_flush);
return 0;
 }
 device_initcall(rfi_flush_debugfs_init);
-- 
2.7.4



[PATCH 1/1] kvm/book3s_64: Fixes crash caused by not cleaning vhost IOTLB

2019-12-17 Thread Leonardo Bras
Fixes a bug that happens when a virtual machine is created without DDW,
with vhost supporting a virtio-net device.

In this scenario, an IOMMU with a 32-bit DMA window will possibly map
IOVAs to different memory addresses over time.

As the code works today, the H_STUFF_TCE hypercall is handled only by
kvm code, which does not invalidate the IOTLB entry in vhost, meaning
that at some point an old entry can cause an access to a previous
memory address that the IOVA pointed to.

Example:
- virtio-net passes IOVA N to vhost, which points to M1
- vhost tries the IOTLB, but misses
- vhost translates IOVA N and stores the result in the IOTLB
- vhost writes to M1
- (some IOMMU usage)
- virtio-net passes IOVA N to vhost, which now points to M2
- vhost tries the IOTLB, hits the stale entry, and translates IOVA N to M1
- vhost writes to M1 (wrong: IOVA N now maps to M2)

The reason this error was not so evident is probably that the IOTLB
was small enough to almost always miss by the time an IOVA was reused.
Raising the IOTLB size to 32k (a module parameter that defaults to 2k)
is enough to reproduce the bug in over 90% of the runs. It usually
takes less than 10 seconds of netperf to trigger it.

A few minutes after reproducing this bug, the guest usually crashes.

Fixing this bug involves cleaning an IOVA entry from the IOTLB.
The guest kernel triggers this by doing an H_STUFF_TCE hypercall with
tce_value == 0.

This change fixes the bug by returning H_TOO_HARD from
kvmppc_h_stuff_tce() when tce_value == 0, which causes kvm to let qemu
deal with the hypercall. In that case, qemu does free the vhost IOTLB
entry, which fixes the bug.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/kvm/book3s_64_vio.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 883a66e76638..841eff3f6392 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -710,6 +710,9 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
if (ret != H_SUCCESS)
return ret;
 
+   if (tce_value == 0)
+   return H_TOO_HARD;
+
/* Check permission bits only to allow userspace poison TCE for debug */
if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ))
return H_PARAMETER;
-- 
2.23.0



[PATCH] asm-generic/tlb: Avoid potential double flush

2019-12-17 Thread Peter Zijlstra
On Tue, Dec 17, 2019 at 01:34:16PM +0100, Peter Zijlstra wrote:
> Perhaps if we replace !tlb->end with something like:
> 
>   !tlb->freed_tables && !tlb->cleared_p*
> 
> (which GCC should be able to do with a single load and mask)
> 
> I've not really thought too hard about it yet, I need to run some
> errands, but I'll look at it more closely when I get back.

AFAICT this should work.

---
Subject: asm-generic/tlb: Avoid potential double flush

Aneesh reported that:

tlb_flush_mmu()
  tlb_flush_mmu_tlbonly()
tlb_flush() <-- #1
  tlb_flush_mmu_free()
tlb_table_flush()
  tlb_table_invalidate()
tlb_flush_mmu_tlbonly()
  tlb_flush()   <-- #2

does two TLBIs when tlb->fullmm, because __tlb_reset_range() will not
clear tlb->end in that case.

Observe that any caller to __tlb_adjust_range() also sets at least one
of the tlb->freed_tables || tlb->cleared_p* bits, and those are
unconditionally cleared by __tlb_reset_range().

Change the condition for actually issuing TLBI to having one of those
bits set, as opposed to having tlb->end != 0.

Reported-by: "Aneesh Kumar K.V" 
Signed-off-by: Peter Zijlstra (Intel) 
---
 include/asm-generic/tlb.h | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index fe0ea6ff3636..c9a25c5a83e8 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -402,7 +402,12 @@ tlb_update_vma_flags(struct mmu_gather *tlb, struct 
vm_area_struct *vma) { }
 
 static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 {
-   if (!tlb->end)
+   /*
+* Anything calling __tlb_adjust_range() also sets at least one of
+* these bits.
+*/
+   if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds ||
+ tlb->cleared_puds || tlb->cleared_p4ds))
return;
 
tlb_flush(tlb);


Re: READ_ONCE() + STACKPROTECTOR_STRONG == :/ (was Re: [GIT PULL] Please pull powerpc/linux.git powerpc-5.5-2 tag (topic/kasan-bitops))

2019-12-17 Thread Linus Torvalds
On Tue, Dec 17, 2019 at 10:04 AM Linus Torvalds
 wrote:
>
> Let me think about it.

How about we just get rid of the union entirely, and just use
'unsigned long' or 'unsigned long long' depending on the size.

Something like the attached patch - it still requires that it be an
arithmetic type, but now because of the final cast.

But it might still be a cast to a volatile type, of course. Then the
result will be volatile, but at least now READ_ONCE() won't be taking
the address of a volatile variable on the stack - does that at least
fix some of the horrible code generation? Hmm?

This is untested, because I obviously still have the cases of
structures (page table entries) being accessed once..

  Linus
 include/linux/compiler.h | 33 +
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 5e88e7e33abe..8b4282194f16 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -179,18 +179,18 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 
 #include 
 
-#define __READ_ONCE_SIZE		\
-({	\
-	switch (size) {			\
-	case 1: *(__u8 *)res = *(volatile __u8 *)p; break;		\
-	case 2: *(__u16 *)res = *(volatile __u16 *)p; break;		\
-	case 4: *(__u32 *)res = *(volatile __u32 *)p; break;		\
-	case 8: *(__u64 *)res = *(volatile __u64 *)p; break;		\
-	default:			\
-		barrier();		\
-		__builtin_memcpy((void *)res, (const void *)p, size);	\
-		barrier();		\
-	}\
+/* "unsigned long" or "unsigned long long" - make it fit in a register if possible */
+#define __READ_ONCE_TYPE(size) \
+	__typeof__(__builtin_choose_expr(size > sizeof(0UL), 0ULL, 0UL))
+
+#define __READ_ONCE_SIZE			\
+({		\
+	switch (size) {\
+	case 1: *(unsigned long *)res = *(volatile __u8 *)p; break;		\
+	case 2: *(unsigned long *)res = *(volatile __u16 *)p; break;		\
+	case 4: *(unsigned long *)res = *(volatile __u32 *)p; break;		\
+	case 8: *(unsigned long long *)res = *(volatile __u64 *)p; break;	\
+	}	\
 })
 
 static __always_inline
@@ -258,13 +258,14 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 
 #define __READ_ONCE(x, check)		\
 ({	\
-	union { typeof(x) __val; char __c[1]; } __u;			\
+	__READ_ONCE_TYPE(sizeof(x)) __u;\
+	compiletime_assert(sizeof(x) <= sizeof(__u), "READ_ONCE type");	\
 	if (check)			\
-		__read_once_size(&(x), __u.__c, sizeof(x));		\
+		__read_once_size(&(x), &__u, sizeof(x));		\
 	else\
-		__read_once_size_nocheck(&(x), __u.__c, sizeof(x));	\
+		__read_once_size_nocheck(&(x), &__u, sizeof(x));	\
 	smp_read_barrier_depends(); /* Enforce dependency ordering from x */ \
-	__u.__val;			\
+	(__typeof__(x))__u;		\
 })
 #define READ_ONCE(x) __READ_ONCE(x, 1)
 


Re: READ_ONCE() + STACKPROTECTOR_STRONG == :/ (was Re: [GIT PULL] Please pull powerpc/linux.git powerpc-5.5-2 tag (topic/kasan-bitops))

2019-12-17 Thread Will Deacon
On Tue, Dec 17, 2019 at 10:05:53AM -0800, Linus Torvalds wrote:
> On Tue, Dec 17, 2019 at 10:04 AM Linus Torvalds
>  wrote:
> >
> > Let me think about it.
> 
> .. and in the short term, maybe for code generation, the right thing
> is to just do the cast in the bitops, where we can just cast to
> "unsigned long *" and remove the volatile that way.

Yeah, I think I'll spin that patch series tomorrow anyway, since I don't
think we need to hold it up.

> I'm still hoping there's a trick, but..

Well, there's always Peter's awful hack [1] but it's really gross. FWIW,
I've pushed the handful of patches I have to [2], which drop the GCC 4.8
workaround and introduce a non-atomic version instead of the
'__builtin_memcpy()'.

Will

[1] 
https://lore.kernel.org/lkml/20191213125618.gd2...@hirez.programming.kicks-ass.net
[2] 
https://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git/log/?h=rwonce/cleanup


Re: READ_ONCE() + STACKPROTECTOR_STRONG == :/ (was Re: [GIT PULL] Please pull powerpc/linux.git powerpc-5.5-2 tag (topic/kasan-bitops))

2019-12-17 Thread Linus Torvalds
On Tue, Dec 17, 2019 at 10:04 AM Linus Torvalds
 wrote:
>
> Let me think about it.

.. and in the short term, maybe for code generation, the right thing
is to just do the cast in the bitops, where we can just cast to
"unsigned long *" and remove the volatile that way.

I'm still hoping there's a trick, but..

   Linus
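
A hypothetical sketch of the idea above (not the posted series; the helper
name is made up for illustration): the bitop knows its operand is an
unsigned long, so it can cast away the volatile qualifier before the
READ_ONCE(), keeping any typeof()-based locals non-volatile.

static inline bool test_bit_sketch(long nr, const volatile unsigned long *addr)
{
	/* Cast away volatile; READ_ONCE() still performs the once-only access. */
	const unsigned long *word = (const unsigned long *)&addr[nr / BITS_PER_LONG];

	return 1UL & (READ_ONCE(*word) >> (nr % BITS_PER_LONG));
}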


Re: READ_ONCE() + STACKPROTECTOR_STRONG == :/ (was Re: [GIT PULL] Please pull powerpc/linux.git powerpc-5.5-2 tag (topic/kasan-bitops))

2019-12-17 Thread Linus Torvalds
On Tue, Dec 17, 2019 at 9:07 AM Will Deacon  wrote:
>
> However, I'm really banging my head against the compiler trying to get
> your trick above to work for pointer types when the pointed-to-type is
> not defined.

You are right, of course. The trick works fine with arithmetic types,
but since it does use arithmetic, it requires that pointer types be
not only declared, but defined. The addition wants the size of the
underlying type (even though with an addition of zero it wouldn't be
required - but that's not how C works).

Let me think about it.

 Linus
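
A minimal, hypothetical illustration of the point above (not taken from the
thread): typeof(0 + p) performs pointer arithmetic, which needs the size of
the pointed-to type, so it only compiles when that type is defined.

struct foo;				/* declared but never defined */

static void demo(int *q, struct foo *p)
{
	typeof(0 + q) a = q;		/* fine: int is a complete type */
	/*
	 * typeof(0 + p) b = p;	-- error: invalid use of undefined
	 *			   type 'struct foo'
	 */
	(void)a;
	(void)p;
}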


[Bug 205885] [Bisected] BUG: KASAN: null-ptr-deref in strncpy+0x3c/0x60

2019-12-17 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=205885

Erhard F. (erhar...@mailbox.org) changed:

   What|Removed |Added

 Attachment #286345|0   |1
is obsolete||

--- Comment #6 from Erhard F. (erhar...@mailbox.org) ---
Created attachment 286347
  --> https://bugzilla.kernel.org/attachment.cgi?id=286347&action=edit
bisect.log

# git bisect bad | tee -a ~/bisect01.log 
cccaa5e33525fc07f4a2ce0518e50b9ddf435e47 is the first bad commit
commit cccaa5e33525fc07f4a2ce0518e50b9ddf435e47
Author: Dominik Brodowski 
Date:   Tue Oct 23 22:41:09 2018 +0200

init: use do_mount() instead of ksys_mount()

In prepare_namespace(), do_mount() can be used instead of ksys_mount()
as the first and third argument are const strings in the kernel, the
second and fourth argument are passed through anyway, and the fifth
argument is NULL.

In do_mount_root(), ksys_mount() is called with the first and third
argument being already kernelspace strings, which do not need to be
copied over from userspace to kernelspace (again). The second and
fourth arguments are passed through to do_mount() anyway. The fifth
argument, while already residing in kernelspace, needs to be put into
a page of its own. Then, do_mount() can be used instead of
ksys_mount().

Once this is done, there are no in-kernel users to ksys_mount() left,
which can therefore be removed.

Signed-off-by: Dominik Brodowski 

 fs/namespace.c   | 10 ++
 include/linux/syscalls.h |  2 --
 init/do_mounts.c | 28 ++--
 3 files changed, 24 insertions(+), 16 deletions(-)

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

[Bug 205885] [Bisected] BUG: KASAN: null-ptr-deref in strncpy+0x3c/0x60

2019-12-17 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=205885

--- Comment #5 from Erhard F. (erhar...@mailbox.org) ---
Created attachment 286345
  --> https://bugzilla.kernel.org/attachment.cgi?id=286345&action=edit
bisect.log

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

[Bug 205885] BUG: KASAN: null-ptr-deref in strncpy+0x3c/0x60

2019-12-17 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=205885

Erhard F. (erhar...@mailbox.org) changed:

   What|Removed |Added

 Attachment #286333|0   |1
is obsolete||

--- Comment #4 from Erhard F. (erhar...@mailbox.org) ---
Created attachment 286343
  --> https://bugzilla.kernel.org/attachment.cgi?id=286343&action=edit
kernel .config (5.5-rc2, PowerMac G4 DP)

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

Re: READ_ONCE() + STACKPROTECTOR_STRONG == :/ (was Re: [GIT PULL] Please pull powerpc/linux.git powerpc-5.5-2 tag (topic/kasan-bitops))

2019-12-17 Thread Will Deacon
On Thu, Dec 12, 2019 at 12:49:52PM -0800, Linus Torvalds wrote:
> On Thu, Dec 12, 2019 at 11:34 AM Will Deacon  wrote:
> >
> > The root of my concern in all of this, and what started me looking at it in
> > the first place, is the interaction with 'typeof()'. Inheriting 'volatile'
> > for a pointer means that local variables in macros declared using typeof()
> > suddenly start generating *hideous* code, particularly when pointless stack
> > spills get stackprotector all excited.
> 
> Yeah, removing volatile can be a bit annoying.
> 
> For the particular case of the bitops, though, it's not an issue.
> Since you know the type there, you can just cast it.
> 
> And if we had the rule that READ_ONCE() was an arithmetic type, you could do
> 
> typeof(0+(*p)) __var;
> 
> since you might as well get the integer promotion anyway (on the
> non-volatile result).
> 
> But that doesn't work with structures or unions, of course.
> 
> I'm not entirely sure we have READ_ONCE() with a struct. I do know we
> have it with 64-bit entities on 32-bit machines, but that's ok with
> the "0+" trick.

Other than the two trivial examples Arnd and I spotted, it looks like
we're in for some fun with the page-table types such as pud_t but that
/should/ be fixable with enough effort.

However, I'm really banging my head against the compiler trying to get
your trick above to work for pointer types when the pointed-to-type is
not defined. As a very cut down example (I pulled this back out of the
preprocessor and cleaned it up a bit):


struct dentry {
struct inode *d_inode;
};

static inline struct inode *d_inode_rcu(struct dentry *dentry)
{
return ({
	typeof(0 + dentry->d_inode) __x =
		(*(volatile typeof(dentry->d_inode) *)&(dentry->d_inode));
(typeof(dentry->d_inode))__x;
});
}


Trying to compile this results in:

  | In function 'd_inode_rcu':
  | error: invalid use of undefined type 'struct inode'

whereas it compiles fine if you remove the '0 +' from the first typeof.

What am I missing? Perhaps the compiler wants the size information of
'struct inode' before it will contemplate the arithmetic, but if so then
I don't think we can use this trick after all. Hmm.

Will


[Bug 205889] New: CONFIG_PPC_85xx with CONFIG_CORENET_GENERIC outputs uImage instead of zImage

2019-12-17 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=205889

Bug ID: 205889
   Summary: CONFIG_PPC_85xx with CONFIG_CORENET_GENERIC outputs
uImage instead of zImage
   Product: Platform Specific/Hardware
   Version: 2.5
Kernel Version: 5.5
  Hardware: All
OS: Linux
  Tree: Mainline
Status: NEW
  Severity: normal
  Priority: P1
 Component: PPC-32
  Assignee: platform_ppc...@kernel-bugs.osdl.org
  Reporter: bradley.gam...@ncipher.com
Regression: No

Attempting a PowerPC Linux kernel build with the config options
"CONFIG_PPC_85xx=y" and "CONFIG_CORENET_GENERIC=y" will output a file named
"zImage"; however, this file is actually a "uImage"-formatted file.

This can be replicated with this minimal defconfig:
CONFIG_PPC_85xx=y
CONFIG_CORENET_GENERIC=y

If I perform a build with only one of these options enabled I am given a valid zImage file:
$ file arch/powerpc/boot/zImage
arch/powerpc/boot/zImage: ELF 32-bit MSB executable, PowerPC or cisco 4500,
version 1 (SYSV), statically linked, not stripped

However performing the same build with both config options enabled gives an
incorrectly formatted image:
$ file arch/powerpc/boot/zImage
arch/powerpc/boot/zImage: u-boot legacy uImage, Linux-5.5.0-rc2-gea200dec5,
Linux/PowerPC, OS Kernel Image (gzip), 1366142 bytes, Tue Dec 17 15:30:22 2019,
Load Address: 0x, Entry Point: 0x, Header CRC: 0x99D350A0, Data
CRC: 0xC9090D33

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

[Bug 205885] BUG: KASAN: null-ptr-deref in strncpy+0x3c/0x60

2019-12-17 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=205885

--- Comment #3 from Erhard F. (erhar...@mailbox.org) ---
5.5-rc1 works with identical kernel .config.
And on -rc2 I get that without KASAN as well.

I'll do a bisect and report back.

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

Re: [PATCH v3 1/7] capabilities: introduce CAP_SYS_PERFMON to kernel and user space

2019-12-17 Thread Stephen Smalley

On 12/16/19 2:58 PM, Alexey Budankov wrote:


Introduce a CAP_SYS_PERFMON capability devoted to secure system performance
monitoring and observability, so that CAP_SYS_PERFMON can assist the
CAP_SYS_ADMIN capability in its governing role for perf_events, i915_perf
and other subsystems of the kernel.

CAP_SYS_PERFMON intends to harden system security and integrity during
system performance monitoring and observability by decreasing the attack
surface that is available to CAP_SYS_ADMIN privileged processes.

CAP_SYS_PERFMON intends to take over the CAP_SYS_ADMIN credentials related
to system performance monitoring and observability and to balance the
amount of CAP_SYS_ADMIN credentials in accordance with the recommendations
provided in the man page for CAP_SYS_ADMIN [1]: "Note: this capability is
overloaded; see Notes to kernel developers, below."

[1] http://man7.org/linux/man-pages/man7/capabilities.7.html

Signed-off-by: Alexey Budankov 
---
  include/linux/capability.h  | 1 +
  include/uapi/linux/capability.h | 8 +++-
  security/selinux/include/classmap.h | 4 ++--
  3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index ecce0f43c73a..6342502c4c2a 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -251,6 +251,7 @@ extern bool privileged_wrt_inode_uidgid(struct 
user_namespace *ns, const struct
  extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
  extern bool file_ns_capable(const struct file *file, struct user_namespace 
*ns, int cap);
  extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace 
*ns);
+#define perfmon_capable() (capable(CAP_SYS_PERFMON) || capable(CAP_SYS_ADMIN))


I think making it a static inline bool function instead of a macro would 
be preferred?
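
A minimal sketch of the static inline form suggested here (illustrative
only, not part of the posted patch):

static inline bool perfmon_capable(void)
{
	/* Same check as the macro, just as a proper inline function. */
	return capable(CAP_SYS_PERFMON) || capable(CAP_SYS_ADMIN);
}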


Otherwise,
Acked-by: Stephen Smalley 

  
  /* audit system wants to get cap info from files as well */

  extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 240fdb9a60f6..98e03cc76c7c 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -366,8 +366,14 @@ struct vfs_ns_cap_data {
  
  #define CAP_AUDIT_READ		37
  
+/*

+ * Allow system performance and observability privileged operations
+ * using perf_events, i915_perf and other kernel subsystems
+ */
+
+#define CAP_SYS_PERFMON38
  
-#define CAP_LAST_CAP CAP_AUDIT_READ

+#define CAP_LAST_CAP CAP_SYS_PERFMON
  
  #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
  
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h

index 7db24855e12d..bae602c623b0 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -27,9 +27,9 @@
"audit_control", "setfcap"
  
  #define COMMON_CAP2_PERMS  "mac_override", "mac_admin", "syslog", \

-   "wake_alarm", "block_suspend", "audit_read"
+   "wake_alarm", "block_suspend", "audit_read", "sys_perfmon"
  
-#if CAP_LAST_CAP > CAP_AUDIT_READ

+#if CAP_LAST_CAP > CAP_SYS_PERFMON
  #error New capability defined, please update COMMON_CAP2_PERMS.
  #endif
  





[PATCH v12 23/25] mm/gup: track FOLL_PIN pages

2019-12-17 Thread John Hubbard
Add tracking of pages that were pinned via FOLL_PIN.

As mentioned in the FOLL_PIN documentation, callers who effectively set
FOLL_PIN are required to ultimately free such pages via unpin_user_page().
The effect is similar to FOLL_GET, and may be thought of as "FOLL_GET
for DIO and/or RDMA use".

Pages that have been pinned via FOLL_PIN are identifiable via a
new function call:

   bool page_dma_pinned(struct page *page);

What to do in response to encountering such a page is left to later
patchsets. There is discussion about this in [1], [2], and [3].

This also changes a BUG_ON(), to a WARN_ON(), in follow_page_mask().

[1] Some slow progress on get_user_pages() (Apr 2, 2019):
https://lwn.net/Articles/784574/
[2] DMA and get_user_pages() (LPC: Dec 12, 2018):
https://lwn.net/Articles/774411/
[3] The trouble with get_user_pages() (Apr 30, 2018):
https://lwn.net/Articles/753027/
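
A hedged sketch of the check implied by the description above (the actual
patch may differ in detail): FOLL_PIN raises the page refcount by
GUP_PIN_COUNTING_BIAS per pin, so "is this page DMA-pinned?" reduces to a
refcount comparison on the compound head.

#define GUP_PIN_COUNTING_BIAS	(1U << 10)	/* a power of two, per the scheme described */

static inline bool page_dma_pinned(struct page *page)
{
	return page_ref_count(compound_head(page)) >= GUP_PIN_COUNTING_BIAS;
}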

Reviewed-by: Jan Kara 
Suggested-by: Jan Kara 
Suggested-by: Jérôme Glisse 
Cc: Kirill A. Shutemov 
Signed-off-by: John Hubbard 
---

Hi,

The kbuild test robot noticed that try_pin_compound_head() can be
declared static, in mm/gup.c. This updated patch does that.

thanks,
John Hubbard
NVIDIA

 Documentation/core-api/pin_user_pages.rst |   2 +-
 include/linux/mm.h|  83 -
 include/linux/mmzone.h|   2 +
 include/linux/page_ref.h  |  10 +
 mm/gup.c  | 410 +-
 mm/huge_memory.c  |  29 +-
 mm/hugetlb.c  |  38 +-
 mm/vmstat.c   |   2 +
 8 files changed, 440 insertions(+), 136 deletions(-)

diff --git a/Documentation/core-api/pin_user_pages.rst 
b/Documentation/core-api/pin_user_pages.rst
index 1d490155ecd7..2db14df1f2d7 100644
--- a/Documentation/core-api/pin_user_pages.rst
+++ b/Documentation/core-api/pin_user_pages.rst
@@ -53,7 +53,7 @@ Which flags are set by each wrapper
 For these pin_user_pages*() functions, FOLL_PIN is OR'd in with whatever gup
 flags the caller provides. The caller is required to pass in a non-null struct
 pages* array, and the function then pin pages by incrementing each by a special
-value. For now, that value is +1, just like get_user_pages*().::
+value: GUP_PIN_COUNTING_BIAS.::
 
  Function
  
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6a1a357e7d86..bb44c4d2ada7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1016,6 +1016,8 @@ static inline void get_page(struct page *page)
page_ref_inc(page);
 }
 
+bool __must_check try_grab_page(struct page *page, unsigned int flags);
+
 static inline __must_check bool try_get_page(struct page *page)
 {
page = compound_head(page);
@@ -1044,29 +1046,80 @@ static inline void put_page(struct page *page)
__put_page(page);
 }
 
-/**
- * unpin_user_page() - release a gup-pinned page
- * @page:pointer to page to be released
+/*
+ * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload
+ * the page's refcount so that two separate items are tracked: the original 
page
+ * reference count, and also a new count of how many pin_user_pages() calls 
were
+ * made against the page. ("gup-pinned" is another term for the latter).
+ *
+ * With this scheme, pin_user_pages() becomes special: such pages are marked as
+ * distinct from normal pages. As such, the unpin_user_page() call (and its
+ * variants) must be used in order to release gup-pinned pages.
+ *
+ * Choice of value:
+ *
+ * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference
+ * counts with respect to pin_user_pages() and unpin_user_page() becomes
+ * simpler, due to the fact that adding an even power of two to the page
+ * refcount has the effect of using only the upper N bits, for the code that
+ * counts up using the bias value. This means that the lower bits are left for
+ * the exclusive use of the original code that increments and decrements by one
+ * (or at least, by much smaller values than the bias value).
  *
- * Pages that were pinned via pin_user_pages*() must be released via either
- * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
- * that eventually such pages can be separately tracked and uniquely handled. 
In
- * particular, interactions with RDMA and filesystems need special handling.
+ * Of course, once the lower bits overflow into the upper bits (and this is
+ * OK, because subtraction recovers the original values), then visual 
inspection
+ * no longer suffices to directly view the separate counts. However, for normal
+ * applications that don't have huge page reference counts, this won't be an
+ * issue.
  *
- * unpin_user_page() and put_page() are not interchangeable, despite this early
- * implementation that makes them look the same. unpin_user_page() calls must
- * be perfectly matched up with pin*() calls.
+ * Locking: the lockless algorithm described

Re: [RFC PATCH] mm/gup: try_pin_compound_head() can be static

2019-12-17 Thread John Hubbard

On 12/17/19 12:03 AM, kbuild test robot wrote:


Fixes: 8086d1c61970 ("mm/gup: track FOLL_PIN pages")
Signed-off-by: kbuild test robot 
---
  gup.c |2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/gup.c b/mm/gup.c
index 038b71165a761..849a6f55938e6 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -75,7 +75,7 @@ static inline struct page *try_get_compound_head(struct page 
*page, int refs)
   * @Return:   the compound head page, with ref appropriately incremented,
   * or NULL upon failure.
   */
-__must_check struct page *try_pin_compound_head(struct page *page, int refs)
+static __must_check struct page *try_pin_compound_head(struct page *page, int 
refs)
  {
struct page *head = try_get_compound_head(page,
  GUP_PIN_COUNTING_BIAS * refs);



Yes, it should have been declared static. And this also applies to the latest 
version
(v11). The preferred fix would stay within 80 columns, like this:

diff --git a/mm/gup.c b/mm/gup.c
index c2793a86450e..39b2f683bd2e 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -75,7 +75,8 @@ static inline struct page *try_get_compound_head(struct page 
*page, int refs)
  * @Return:the compound head page, with ref appropriately incremented,
  * or NULL upon failure.
  */
-__must_check struct page *try_pin_compound_head(struct page *page, int refs)
+static __must_check struct page *try_pin_compound_head(struct page *page,
+  int refs)
 {
struct page *head = try_get_compound_head(page,
  GUP_PIN_COUNTING_BIAS * refs);


thanks,
--
John Hubbard
NVIDIA


Re: [PATCH v5 0/5] Append new variables to vmcoreinfo (TCR_EL1.T1SZ for arm64 and MAX_PHYSMEM_BITS for all archs)

2019-12-17 Thread Borislav Petkov
On Mon, Dec 16, 2019 at 12:16:12PM +0530, Bhupesh Sharma wrote:
> I remember there was a suggestion during the review of an earlier
> version to keep them as a separate patch(es) so that the documentation
> text is easier to review,

Documentation text is one sentence, respectively. Not really worth a
separate patch.

> I can merge the documentation patches with the respective patches
> (which export the variables/defines to vmcoreinfo) in v6,

Please do.

Thx.

-- 
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette


Re: [PATCH v3 3/3] powerpc: Book3S 64-bit "heavyweight" KASAN support

2019-12-17 Thread Daniel Axtens
Hi Christophe,

I'm working through your feedback, thank you. Regarding this one:

>> --- a/arch/powerpc/kernel/process.c
>> +++ b/arch/powerpc/kernel/process.c
>> @@ -2081,7 +2081,14 @@ void show_stack(struct task_struct *tsk, unsigned 
>> long *stack)
>>  /*
>>   * See if this is an exception frame.
>>   * We look for the "regshere" marker in the current frame.
>> + *
>> + * KASAN may complain about this. If it is an exception frame,
>> + * we won't have unpoisoned the stack in asm when we set the
>> + * exception marker. If it's not an exception frame, who knows
>> + * how things are laid out - the shadow could be in any state
>> + * at all. Just disable KASAN reporting for now.
>>   */
>> +kasan_disable_current();
>>  if (validate_sp(sp, tsk, STACK_INT_FRAME_SIZE)
>>  && stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) {
>>  struct pt_regs *regs = (struct pt_regs *)
>> @@ -2091,6 +2098,7 @@ void show_stack(struct task_struct *tsk, unsigned long 
>> *stack)
>> regs->trap, (void *)regs->nip, (void *)lr);
>>  firstframe = 1;
>>  }
>> +kasan_enable_current();
>
> If this is really a concern for all targets including PPC32, should it 
> be a separate patch with a Fixes: tag to be applied back in stable as well ?

I've managed to repro this by commenting out the kasan_disable/enable
lines, and just booting in qemu without a disk attached:

sudo qemu-system-ppc64 -accel kvm -m 2G -M pseries -cpu power9  -kernel 
./vmlinux  -nographic -chardev stdio,id=charserial0,mux=on -device 
spapr-vty,chardev=charserial0,reg=0x3000  -mon 
chardev=charserial0,mode=readline -nodefaults -smp 2 

...

[0.210740] Kernel panic - not syncing: VFS: Unable to mount root fs on 
unknown-block(0,0)
[0.210789] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 
5.5.0-rc1-next-20191213-16824-g469a24fbdb34 #12
[0.210844] Call Trace:
[0.210866] [c0006a4839b0] [c1f74f48] dump_stack+0xfc/0x154 
(unreliable)
[0.210915] [c0006a483a00] [c025411c] panic+0x258/0x59c
[0.210958] [c0006a483aa0] [c24870b0] 
mount_block_root+0x648/0x7ac
[0.211005] 
==
[0.211054] BUG: KASAN: stack-out-of-bounds in show_stack+0x438/0x580
[0.211095] Read of size 8 at addr c0006a483b00 by task swapper/0/1
[0.211134] 
[0.211152] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 
5.5.0-rc1-next-20191213-16824-g469a24fbdb34 #12
[0.211207] Call Trace:
[0.211225] [c0006a483680] [c1f74f48] dump_stack+0xfc/0x154 
(unreliable)
[0.211274] [c0006a4836d0] [c08f877c] 
print_address_description.isra.10+0x7c/0x470
[0.211330] [c0006a483760] [c08f8e7c] __kasan_report+0x1bc/0x244
[0.211380] [c0006a483830] [c08f6eb8] kasan_report+0x18/0x30
[0.211422] [c0006a483850] [c08fa5d4] 
__asan_report_load8_noabort+0x24/0x40
[0.211471] [c0006a483870] [c003d448] show_stack+0x438/0x580
[0.211512] [c0006a4839b0] [c1f74f48] dump_stack+0xfc/0x154
[0.211553] [c0006a483a00] [c025411c] panic+0x258/0x59c
[0.211595] [c0006a483aa0] [c24870b0] 
mount_block_root+0x648/0x7ac
[0.211644] [c0006a483be0] [c2487784] 
prepare_namespace+0x1ec/0x240
[0.211694] [c0006a483c60] [c248669c] 
kernel_init_freeable+0x7f4/0x870
[0.211745] [c0006a483da0] [c0011f30] kernel_init+0x3c/0x15c
[0.211787] [c0006a483e20] [c000bebc] 
ret_from_kernel_thread+0x5c/0x80
[0.211834] 
[0.211851] Allocated by task 0:
[0.211878]  save_stack+0x2c/0xe0
[0.211904]  __kasan_kmalloc.isra.16+0x11c/0x150
[0.211937]  kmem_cache_alloc_node+0x114/0x3b0
[0.211971]  copy_process+0x5b8/0x6410
[0.211996]  _do_fork+0x130/0xbf0
[0.212022]  kernel_thread+0xdc/0x130
[0.212047]  rest_init+0x44/0x184
[0.212072]  start_kernel+0x77c/0x7dc
[0.212098]  start_here_common+0x1c/0x20
[0.212122] 
[0.212139] Freed by task 0:
[0.212163] (stack is not available)
[0.212187] 
[0.212205] The buggy address belongs to the object at c0006a48
[0.212205]  which belongs to the cache thread_stack of size 16384
[0.212285] The buggy address is located 15104 bytes inside of
[0.212285]  16384-byte region [c0006a48, c0006a484000)
[0.212356] The buggy address belongs to the page:
[0.212391] page:c00c001a9200 refcount:1 mapcount:0 
mapping:c0006a019e00 index:0x0 compound_mapcount: 0
[0.212455] raw: 00710200 5deadbeef100 5deadbeef122 
c0006a019e00
[0.212504] raw:  00100010 0001 

[0.212551] page dumped because: k

Re: [Intel-gfx] [PATCH v3 4/7] drm/i915/perf: open access for CAP_SYS_PERFMON privileged process

2019-12-17 Thread Lionel Landwerlin

On 16/12/2019 22:03, Alexey Budankov wrote:

Open access to i915_perf monitoring for CAP_SYS_PERFMON privileged processes.
For backward compatibility reasons access to i915_perf subsystem remains open
for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for secure
i915_perf monitoring is discouraged with respect to CAP_SYS_PERFMON capability.

Signed-off-by: Alexey Budankov 



Assuming people are fine with this new cap, I like this idea of a 
lighter privilege for i915-perf.



-Lionel




Re: [PATCH RFC v1 0/3] powerpc/memtrace: Don't offline memory blocks via offline_pages()

2019-12-17 Thread David Hildenbrand
On 17.12.19 13:38, David Hildenbrand wrote:
> This RFC is based on linux-next and
> - 2 patches from "PATCH RFC v4 00/13] virtio-mem: paravirtualized memory"
>  -> "mm: Allow to offline unmovable PageOffline() pages via
>  MEM_GOING_OFFLINE" [1]
>  -> "mm/memory_hotplug: Introduce offline_and_remove_memory()" [2]
> - "mm/memory_hotplug: Don't free usage map when removing a re-added early
>section" [3]
> 
> A branch with all patches (kept updated) is available at:
>   https://github.com/davidhildenbrand/linux.git memtrace
> 
> Stop using offline_pages() to offline memory blocks. Allocate the memory
> blocks using alloc_contig_pages() first and offline+remove the allocated
> memory blocks using a clean MM interface. Offlining of allocated memory is
> made possible by using PageOffline() in combination with a memory notifier
> (similar to virtio-mem).
> 
> Note: In the future, we might want to switch to only removing/readding the
> page tables of the allocated memory (while still marking it PageOffline()).
> However, that might have other implications, and requires work from PPC
> people (IOW, I won't fiddle with that :) ).
> 
> [1] https://lkml.kernel.org/r/20191212171137.13872-8-da...@redhat.com
> [2] https://lkml.kernel.org/r/20191212171137.13872-10-da...@redhat.com
> [3] https://lkml.kernel.org/r/20191217104637.5509-1-da...@redhat.com
> 
> 
> David Hildenbrand (3):
>   powerpc/memtrace: Enforce power of 2 for memory buffer size
>   powerpc/memtrace: Factor out readding memory into memtrace_free_node()
>   powerpc/memtrace: Don't offline memory blocks via offline_pages()
> 
>  arch/powerpc/platforms/powernv/Kconfig|   1 +
>  arch/powerpc/platforms/powernv/memtrace.c | 217 ++
>  2 files changed, 136 insertions(+), 82 deletions(-)
> 

(CC linuxppc-dev on the cover letter, my fancy sendmail cc-cmd.sh script
missed it, sorry)

-- 
Thanks,

David / dhildenb



[PATCH RFC v1 3/3] powerpc/memtrace: Don't offline memory blocks via offline_pages()

2019-12-17 Thread David Hildenbrand
offline_pages() should not be called outside of the MM core. Especially,
having to manually fiddle with the memory block states is a sign that
this is not a good idea. To offline memory block devices cleanly,
device_offline() should be used. This is the only remaining caller of
offline_pages() apart from the official device_offline() path.

E.g., when trying to allocate right now we trigger messages like
[   11.227817] page:c00c00182000 refcount:1 mapcount:0 
mapping: index:0x0
[   11.228056] raw: 0070 c1538860 c1538860 

[   11.228070] raw:  0001 0001 

[   11.228097] page dumped because: unmovable page

and theoretically we might end up looping quite a long time trying to
offline memory, which would have to be canceled by the user manually
(CTRL-C).

Memtrace needs to identify+allocate multiple consecutive memory blocks.
It also has to remove the memory blocks to remove all page tables
(HW requirement).

Let's use alloc_contig_pages() to allocate memory that spans multiple
memory block devices. We can then set all pages PageOffline() to allow
these pages to get isolated. A temporary memory notifier can then make
offlining of these pages succeed by dropping its reference to the pages
on MEM_GOING_OFFLINE events (as documented in include/linux/page-flags.h
for PageOffline() pages). Error handling is a bit tricky.
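
A simplified sketch of that notifier (the patch below adds the real
memtrace_memory_notifier_cb(); this illustration omits the range-ownership
check and error handling): on MEM_GOING_OFFLINE, drop the reference held on
each PageOffline() page in the allocated range so that offlining can succeed.

static int memtrace_going_offline_sketch(struct notifier_block *nb,
					 unsigned long action, void *arg)
{
	struct memory_notify *mhp = arg;
	unsigned long pfn;

	if (action != MEM_GOING_OFFLINE)
		return NOTIFY_OK;

	/* (Check that the range belongs to this allocation - elided here.) */
	for (pfn = mhp->start_pfn; pfn < mhp->start_pfn + mhp->nr_pages; pfn++)
		page_ref_dec(pfn_to_page(pfn));

	return NOTIFY_OK;
}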

Note1: ZONE_MOVABLE memory blocks won't be considered. Not sure if that
was ever really relevant. (unmovable data would end up on these memory
blocks for a tiny little time frame)

Note2: We don't have to care about online_page_callback_t, as we forbid
re-onlining from our memory notifier.

Note3: I was told this feature is never used along with DIMM-based memory
hotunplug - otherwise bad things will happen when a DIMM would try to
remove "alread-removed" memory (that is still in use).

Tested under QEMU with powernv emulation (kernel + initramfs).

$ mount -t debugfs none /sys/kernel/debug/
$ cat /sys/devices/system/memory/block_size_bytes
1000
$ echo 0x2000 > /sys/kernel/debug/powerpc/memtrace/enable
[   19.809790] Offlined Pages 4096
[   19.835842] Offlined Pages 4096
[   19.853136] memtrace: Allocated trace memory on node 0 at 0x4000

Unfortunately, QEMU does not support NUMA for powernv yet, so I cannot
test that.

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: Andrew Morton 
Cc: David Hildenbrand 
Cc: Allison Randal 
Cc: Jens Axboe 
Cc: Anshuman Khandual 
Cc: Thomas Gleixner 
Cc: Michal Hocko 
Cc: Oscar Salvador 
Cc: Balbir Singh 
Cc: Rashmica Gupta 
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: David Hildenbrand 
---
 arch/powerpc/platforms/powernv/Kconfig|   1 +
 arch/powerpc/platforms/powernv/memtrace.c | 175 ++
 2 files changed, 112 insertions(+), 64 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/Kconfig 
b/arch/powerpc/platforms/powernv/Kconfig
index 938803eab0ad..571a0fa9f055 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -29,6 +29,7 @@ config OPAL_PRD
 config PPC_MEMTRACE
bool "Enable removal of RAM from kernel mappings for tracing"
depends on PPC_POWERNV && MEMORY_HOTREMOVE
+   select CONTIG_ALLOC
help
  Enabling this option allows for the removal of memory (RAM)
  from the kernel mappings to be used for hardware tracing.
diff --git a/arch/powerpc/platforms/powernv/memtrace.c 
b/arch/powerpc/platforms/powernv/memtrace.c
index 2d2a0a2acd60..fe1e8f3926a1 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -76,83 +76,130 @@ static int memtrace_free_node(int nid, unsigned long 
start, unsigned long size)
return ret;
 }
 
-static int check_memblock_online(struct memory_block *mem, void *arg)
-{
-   if (mem->state != MEM_ONLINE)
-   return -1;
-
-   return 0;
-}
-
-static int change_memblock_state(struct memory_block *mem, void *arg)
-{
-   unsigned long state = (unsigned long)arg;
-
-   mem->state = state;
-
-   return 0;
-}
+struct memtrace_alloc_info {
+   struct notifier_block memory_notifier;
+   unsigned long base_pfn;
+   unsigned long nr_pages;
+};
 
-/* called with device_hotplug_lock held */
-static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
+static int memtrace_memory_notifier_cb(struct notifier_block *nb,
+  unsigned long action, void *arg)
 {
-   const unsigned long start = PFN_PHYS(start_pfn);
-   const unsigned long size = PFN_PHYS(nr_pages);
-
-   if (walk_memory_blocks(start, size, NULL, check_memblock_online))
-   return false;
-
-   walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE,
-  change_memblock_state);
-
-   if (offline_pages(start_pfn, nr_pages))

[PATCH RFC v1 2/3] powerpc/memtrace: Factor out readding memory into memtrace_free_node()

2019-12-17 Thread David Hildenbrand
While at it, move it, we want to reuse it soon.

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: Andrew Morton 
Cc: David Hildenbrand 
Cc: Allison Randal 
Cc: Jens Axboe 
Cc: Anshuman Khandual 
Cc: Thomas Gleixner 
Cc: Balbir Singh 
Cc: Rashmica Gupta 
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: David Hildenbrand 
---
 arch/powerpc/platforms/powernv/memtrace.c | 44 ++-
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/memtrace.c 
b/arch/powerpc/platforms/powernv/memtrace.c
index 0c4c54d2e3c4..2d2a0a2acd60 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -50,6 +50,32 @@ static const struct file_operations memtrace_fops = {
.open   = simple_open,
 };
 
+static int online_mem_block(struct memory_block *mem, void *arg)
+{
+   return device_online(&mem->dev);
+}
+
+static int memtrace_free_node(int nid, unsigned long start, unsigned long size)
+{
+   int ret;
+
+   ret = add_memory(nid, start, size);
+   if (!ret) {
+   /*
+* If the kernel isn't compiled with the auto online option, we
+* will try to online ourselves. We'll ignore any errors here -
+* user space can try to online itself later (after all, the
+* memory was added successfully).
+*/
+   if (!memhp_auto_online) {
+   lock_device_hotplug();
+   walk_memory_blocks(start, size, NULL, online_mem_block);
+   unlock_device_hotplug();
+   }
+   }
+   return ret;
+}
+
 static int check_memblock_online(struct memory_block *mem, void *arg)
 {
if (mem->state != MEM_ONLINE)
@@ -202,11 +228,6 @@ static int memtrace_init_debugfs(void)
return ret;
 }
 
-static int online_mem_block(struct memory_block *mem, void *arg)
-{
-   return device_online(&mem->dev);
-}
-
 /*
  * Iterate through the chunks of memory we have removed from the kernel
  * and attempt to add them back to the kernel.
@@ -229,24 +250,13 @@ static int memtrace_online(void)
ent->mem = 0;
}
 
-   if (add_memory(ent->nid, ent->start, ent->size)) {
+   if (memtrace_free_node(ent->nid, ent->start, ent->size)) {
pr_err("Failed to add trace memory to node %d\n",
ent->nid);
ret += 1;
continue;
}
 
-   /*
-* If kernel isn't compiled with the auto online option
-* we need to online the memory ourselves.
-*/
-   if (!memhp_auto_online) {
-   lock_device_hotplug();
-   walk_memory_blocks(ent->start, ent->size, NULL,
-  online_mem_block);
-   unlock_device_hotplug();
-   }
-
/*
 * Memory was added successfully so clean up references to it
 * so on reentry we can tell that this chunk was added.
-- 
2.23.0



[PATCH RFC v1 1/3] powerpc/memtrace: Enforce power of 2 for memory buffer size

2019-12-17 Thread David Hildenbrand
The code mentions "Trace memory needs to be aligned to the size", and,
e.g., round_up() is documented to work on powers of 2 only. Also, the
whole search is not optimized, e.g., it is only aligned to the memory
block size while allocating multiple memory blocks.

Let's just limit the size to powers of 2 that are at least the memory
block size - the granularity we are using for alloc/offline/unplug.

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: Andrew Morton 
Cc: David Hildenbrand 
Cc: Allison Randal 
Cc: Anshuman Khandual 
Cc: Balbir Singh 
Cc: Rashmica Gupta 
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: David Hildenbrand 
---
 arch/powerpc/platforms/powernv/memtrace.c | 14 +-
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/memtrace.c 
b/arch/powerpc/platforms/powernv/memtrace.c
index eb2e75dac369..0c4c54d2e3c4 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -268,15 +268,11 @@ static int memtrace_online(void)
 
 static int memtrace_enable_set(void *data, u64 val)
 {
-   u64 bytes;
-
-   /*
-* Don't attempt to do anything if size isn't aligned to a memory
-* block or equal to zero.
-*/
-   bytes = memory_block_size_bytes();
-   if (val & (bytes - 1)) {
-   pr_err("Value must be aligned with 0x%llx\n", bytes);
+   const unsigned long bytes = memory_block_size_bytes();
+
+   if (val && (!is_power_of_2(val) || val < bytes)) {
+   pr_err("Value must be 0 or a power of 2 (at least 0x%lx)\n",
+  bytes);
return -EINVAL;
}
 
-- 
2.23.0



Re: [RFC PATCH 1/2] mm/mmu_gather: Invalidate TLB correctly on batch allocation failure and flush

2019-12-17 Thread Peter Zijlstra
On Tue, Dec 17, 2019 at 04:18:40PM +0530, Aneesh Kumar K.V wrote:
> On 12/17/19 2:39 PM, Peter Zijlstra wrote:
> > On Tue, Dec 17, 2019 at 12:47:12PM +0530, Aneesh Kumar K.V wrote:
> > > Architectures for which we have hardware walkers of Linux page table 
> > > should
> > > flush TLB on mmu gather batch allocation failures and batch flush. Some
> > > architectures like POWER supports multiple translation modes (hash and 
> > > radix)
> > > and in the case of POWER only radix translation mode needs the above TLBI.
> > > This is because for hash translation mode kernel wants to avoid this extra
> > > flush since there are no hardware walkers of linux page table. With radix
> > > translation, the hardware also walks linux page table and with that, 
> > > kernel
> > > needs to make sure to TLB invalidate page walk cache before page table 
> > > pages are
> > > freed.
> > 
> > > Based on changes from Peter Zijlstra 
> > 
> > AFAICT it is all my patch ;-)
> 
> Yes. I moved the changes you had to upstream. I can update the From: in the
> next version if you are ok with that?

Well, since PPC isn't broken per finding the invalidate in
__p*_free_tlb(), let's do these things on top of the patches I proposed
here. Also, you might want to run benchmarks to see if the movement of
that TLBI actually helps (I'm thinking the cost of the PTESYNC might add
up).


Re: [RFC PATCH 2/2] mm/mmu_gather: Avoid multiple page walk cache flush

2019-12-17 Thread Peter Zijlstra
On Tue, Dec 17, 2019 at 03:45:36PM +0530, Aneesh Kumar K.V wrote:
> On 12/17/19 2:28 PM, Peter Zijlstra wrote:
> > On Tue, Dec 17, 2019 at 12:47:13PM +0530, Aneesh Kumar K.V wrote:
> > > On tlb_finish_mmu() kernel does a tlb flush before  mmu gather table 
> > > invalidate.
> > > The mmu gather table invalidate depending on kernel config also does 
> > > another
> > > TLBI. Avoid the later on tlb_finish_mmu().
> > 
> > That is already avoided, if you look at tlb_flush_mmu_tlbonly() it does
> > __tlb_range_reset(), which results in ->end = 0, which then triggers the
> > early exit on the next invocation:
> > 
> > if (!tlb->end)
> > return;
> > 
> 
> Is that true for tlb->fullmm flush?

Hmm, no, but I'm thinking your patch is broken, even for that case. We
must issue the TLBI before call_rcu().

Perhaps if we replace !tlb->end with something like:

  !tlb->freed_tables && !tlb->cleared_p*

(which GCC should be able to do with a single load and mask)

I've not really thought too hard about it yet, I need to run some
errands, but I'll look at it more closely when I get back.


Re: [Intel-gfx] [PATCH v3 4/7] drm/i915/perf: open access for CAP_SYS_PERFMON privileged process

2019-12-17 Thread Alexey Budankov


On 17.12.2019 12:45, Lionel Landwerlin wrote:
> On 16/12/2019 22:03, Alexey Budankov wrote:
>> Open access to i915_perf monitoring for CAP_SYS_PERFMON privileged processes.
>> For backward compatibility reasons access to i915_perf subsystem remains open
>> for CAP_SYS_ADMIN privileged processes but CAP_SYS_ADMIN usage for secure
>> i915_perf monitoring is discouraged with respect to CAP_SYS_PERFMON 
>> capability.
>>
>> Signed-off-by: Alexey Budankov 
> 
> 
> Assuming people are fine with this new cap, I like this idea of a lighter 
> privilege for i915-perf.

Lionel, thanks for your meaningful input!
Appreciate your collaboration.

Regards,
Alexey

> 
> 
> -Lionel
> 
> 
> 


Re: [PATCH] drivers: char: tpm: remove unneeded MODULE_VERSION() usage

2019-12-17 Thread Jarkko Sakkinen
On Tue, 2019-12-17 at 13:16 +0200, Jarkko Sakkinen wrote:
> On Mon, 2019-12-16 at 09:42 +0100, Enrico Weigelt, metux IT consult wrote:
> > Remove MODULE_VERSION(), as it isn't needed at all: the only version
> > making sense is the kernel version.
> 
> Take the following line away:
> 
> > See also: https://lkml.org/lkml/2017/11/22/480
> 
> And just before SOB:
> 
> Link: https://lkml.org/lkml/2017/11/22/480
> > Signed-off-by: Enrico Weigelt, metux IT consult 
> 
> You have some extra cruft there. It should be:
> 
> Signed-off-by: Enrico Weigelt 

Also, the email that you are sending this patch from has an incorrectly
formatted address. Please configure your email client so that the
sender name is just "Firstname Lastname".

/Jarkko



Re: [PATCH] drivers: char: tpm: remove unneeded MODULE_VERSION() usage

2019-12-17 Thread Jarkko Sakkinen
On Mon, 2019-12-16 at 09:42 +0100, Enrico Weigelt, metux IT consult wrote:
> Remove MODULE_VERSION(), as it isn't needed at all: the only version
> making sense is the kernel version.

Take the following line away:

> See also: https://lkml.org/lkml/2017/11/22/480

And just before SOB:

Link: https://lkml.org/lkml/2017/11/22/480
> Signed-off-by: Enrico Weigelt, metux IT consult 

You have some extra cruft there. It should be:

Signed-off-by: Enrico Weigelt 

/Jarkko



Re: [RFC PATCH 1/2] mm/mmu_gather: Invalidate TLB correctly on batch allocation failure and flush

2019-12-17 Thread Aneesh Kumar K.V

On 12/17/19 2:39 PM, Peter Zijlstra wrote:

On Tue, Dec 17, 2019 at 12:47:12PM +0530, Aneesh Kumar K.V wrote:

Architectures for which we have hardware walkers of Linux page table should
flush TLB on mmu gather batch allocation failures and batch flush. Some
architectures like POWER supports multiple translation modes (hash and radix)
and in the case of POWER only radix translation mode needs the above TLBI.
This is because for hash translation mode kernel wants to avoid this extra
flush since there are no hardware walkers of linux page table. With radix
translation, the hardware also walks linux page table and with that, kernel
needs to make sure to TLB invalidate page walk cache before page table pages are
freed.



Based on changes from Peter Zijlstra 


AFAICT it is all my patch ;-)


Yes. I moved the changes you had to upstream. I can update the From: in 
the next version if you are ok with that?




Anyway, this commit:


More details in
commit: d86564a2f085 ("mm/tlb, x86/mm: Support invalidating TLB caches for 
RCU_TABLE_FREE")


states that you do an explicit invalidate in __p*_free_tlb(), which, if
I'm not mistaken is still there:

   arch/powerpc/include/asm/nohash/pgalloc.h:  tlb_flush_pgtable(tlb, 
address);



nohash is not really radix. So we still do the tlb flush from
pte_free_tlb() for nohash, and for PPC-radix we let tlb_table_invalidate
flush that.




Or am I reading this wrong? I'm thinking you can remove that now.


diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index b2c0be93929d..feea1a09bbce 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -27,6 +27,10 @@
  #define tlb_flush tlb_flush
  extern void tlb_flush(struct mmu_gather *tlb);
  
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE

/*
  * PPC-Hash does not use the linux page-tables, so we can avoid
  * the TLBI for page-table freeing, PPC-Radix otoh does use the
  * page-tables and needs the TLBI.
  */

+#define tlb_needs_table_invalidate()   radix_enabled()
+#endif


Also, are you really sure about the !SMP case? Esp. on Radix I'm
thinking that the PWC (page-walk-cache) can give trouble even on UP,
when we get preempted in the middle of mmu_gather. Hmm?



Yes, looking at !SMP I guess we do have an issue there. We do free the
page table pages directly in __p*_free_tlb() with the current code. That
will definitely not work. Are you suggesting we enable
HAVE_RCU_TABLE_FREE even for !SMP?



  /* Get the generic bits... */
  #include 





-aneesh


Re: [RFC PATCH 2/2] mm/mmu_gather: Avoid multiple page walk cache flush

2019-12-17 Thread Aneesh Kumar K.V

On 12/17/19 2:28 PM, Peter Zijlstra wrote:

On Tue, Dec 17, 2019 at 12:47:13PM +0530, Aneesh Kumar K.V wrote:

On tlb_finish_mmu() kernel does a tlb flush before  mmu gather table invalidate.
The mmu gather table invalidate depending on kernel config also does another
TLBI. Avoid the later on tlb_finish_mmu().


That is already avoided, if you look at tlb_flush_mmu_tlbonly() it does
__tlb_range_reset(), which results in ->end = 0, which then triggers the
early exit on the next invocation:

if (!tlb->end)
return;



Is that true for tlb->fulmm flush?

-aneesh


Re: [PATCH 08/10] crypto/NX: Add NX GZIP user space API

2019-12-17 Thread Herbert Xu
On Sun, Dec 15, 2019 at 05:05:19AM -0800, Haren Myneni wrote:
> 
> On power9, userspace can send GZIP compression requests directly to NX
> once kernel establishes NX channel / window. This patch provides GZIP
> engine access to user space via /dev/crypto/nx-gzip device node with
> open, VAS_TX_WIN_OPEN ioctl, mmap and close operations.
> 
> Each window corresponds to a file descriptor, and an application can
> open multiple windows. After the window is opened, the application uses
> the mmap() system call to map the hardware address of the engine's
> request queue into its virtual address space.
> 
> The application can then submit one or more requests to the engine by
> using the copy/paste instructions, pasting the CRBs to the virtual
> address (aka paste_address) returned by mmap().
> 
> Signed-off-by: Sukadev Bhattiprolu 
> Signed-off-by: Haren Myneni 
> ---
>  drivers/crypto/nx/Makefile|   2 +-
>  drivers/crypto/nx/nx-842-powernv.h|   2 +
>  drivers/crypto/nx/nx-commom-powernv.c |  21 ++-
>  drivers/crypto/nx/nx-gzip-powernv.c   | 282 
> ++
>  4 files changed, 304 insertions(+), 3 deletions(-)
>  create mode 100644 drivers/crypto/nx/nx-gzip-powernv.c

We already have a kernel compress API which could be exposed
to user-space through af_alg.  If every driver created its
own user-space API it would be unmanageable.

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [RFC PATCH 1/2] mm/mmu_gather: Invalidate TLB correctly on batch allocation failure and flush

2019-12-17 Thread Peter Zijlstra
On Tue, Dec 17, 2019 at 12:47:12PM +0530, Aneesh Kumar K.V wrote:
> Architectures for which we have hardware walkers of Linux page table should
> flush TLB on mmu gather batch allocation failures and batch flush. Some
> architectures like POWER supports multiple translation modes (hash and radix)
> and in the case of POWER only radix translation mode needs the above TLBI.
> This is because for hash translation mode kernel wants to avoid this extra
> flush since there are no hardware walkers of linux page table. With radix
> translation, the hardware also walks linux page table and with that, kernel
> needs to make sure to TLB invalidate page walk cache before page table pages 
> are
> freed.

> Based on changes from Peter Zijlstra 

AFAICT it is all my patch ;-)

Anyway, this commit:

> More details in
> commit: d86564a2f085 ("mm/tlb, x86/mm: Support invalidating TLB caches for 
> RCU_TABLE_FREE")

states that you do an explicit invalidate in __p*_free_tlb(), which, if
I'm not mistaken is still there:

  arch/powerpc/include/asm/nohash/pgalloc.h:  tlb_flush_pgtable(tlb, 
address);

Or am I reading this wrong? I'm thinking you can remove that now.

> diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
> index b2c0be93929d..feea1a09bbce 100644
> --- a/arch/powerpc/include/asm/tlb.h
> +++ b/arch/powerpc/include/asm/tlb.h
> @@ -27,6 +27,10 @@
>  #define tlb_flush tlb_flush
>  extern void tlb_flush(struct mmu_gather *tlb);
>  
> +#ifdef CONFIG_HAVE_RCU_TABLE_FREE
/*
 * PPC-Hash does not use the linux page-tables, so we can avoid
 * the TLBI for page-table freeing, PPC-Radix otoh does use the
 * page-tables and needs the TLBI.
 */
> +#define tlb_needs_table_invalidate() radix_enabled()
> +#endif

Also, are you really sure about the !SMP case? Esp. on Radix I'm
thinking that the PWC (page-walk-cache) can give trouble even on UP,
when we get preempted in the middle of mmu_gather. Hmm?

>  /* Get the generic bits... */
>  #include 




Re: [RFC PATCH 2/2] mm/mmu_gather: Avoid multiple page walk cache flush

2019-12-17 Thread Peter Zijlstra
On Tue, Dec 17, 2019 at 12:47:13PM +0530, Aneesh Kumar K.V wrote:
> On tlb_finish_mmu() kernel does a tlb flush before  mmu gather table 
> invalidate.
> The mmu gather table invalidate depending on kernel config also does another
> TLBI. Avoid the later on tlb_finish_mmu().

That is already avoided, if you look at tlb_flush_mmu_tlbonly() it does
__tlb_range_reset(), which results in ->end = 0, which then triggers the
early exit on the next invocation:

if (!tlb->end)
return;


Re: [PATCH v9 23/25] mm/gup: track FOLL_PIN pages

2019-12-17 Thread kbuild test robot
Hi John,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on rdma/for-next]
[also build test WARNING on linus/master v5.5-rc2 next-20191216]
[cannot apply to mmotm/master vfio/next]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:
https://github.com/0day-ci/linux/commits/John-Hubbard/mm-gup-track-dma-pinned-pages-FOLL_PIN/20191212-013238
base:   https://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git for-next
reproduce:
# apt-get install sparse
# sparse version: v0.6.1-104-gf934193-dirty
make ARCH=x86_64 allmodconfig
make C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__'

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot 


sparse warnings: (new ones prefixed by >>)

>> mm/gup.c:78:26: sparse: sparse: symbol 'try_pin_compound_head' was not 
>> declared. Should it be static?

Please review and possibly fold the followup patch.

---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org Intel Corporation


[RFC PATCH] mm/gup: try_pin_compound_head() can be static

2019-12-17 Thread kbuild test robot


Fixes: 8086d1c61970 ("mm/gup: track FOLL_PIN pages")
Signed-off-by: kbuild test robot 
---
 gup.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/gup.c b/mm/gup.c
index 038b71165a761..849a6f55938e6 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -75,7 +75,7 @@ static inline struct page *try_get_compound_head(struct page 
*page, int refs)
  * @Return:the compound head page, with ref appropriately incremented,
  * or NULL upon failure.
  */
-__must_check struct page *try_pin_compound_head(struct page *page, int refs)
+static __must_check struct page *try_pin_compound_head(struct page *page, int 
refs)
 {
struct page *head = try_get_compound_head(page,
  GUP_PIN_COUNTING_BIAS * refs);