[PATCH 08/11] perf intel-pt: Allow for a guest kernel address filter

2021-02-18 Thread Adrian Hunter
Handling TIP.PGD for an address filter for a guest kernel is the same as for a
host kernel, but guest user space decoding, and hence guest user space address
filters, are not supported.

Signed-off-by: Adrian Hunter 
---
 tools/perf/util/intel-pt.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
index 29d871718995..546d512b300a 100644
--- a/tools/perf/util/intel-pt.c
+++ b/tools/perf/util/intel-pt.c
@@ -792,8 +792,14 @@ static int __intel_pt_pgd_ip(uint64_t ip, void *data)
u8 cpumode;
u64 offset;
 
-   if (ip >= ptq->pt->kernel_start)
+   if (ptq->state->to_nr) {
+   if (intel_pt_guest_kernel_ip(ip))
+   return intel_pt_match_pgd_ip(ptq->pt, ip, ip, NULL);
+   /* No support for decoding guest user space */
+   return -EINVAL;
+   } else if (ip >= ptq->pt->kernel_start) {
return intel_pt_match_pgd_ip(ptq->pt, ip, ip, NULL);
+   }
 
cpumode = PERF_RECORD_MISC_USER;
 
-- 
2.17.1
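
A compilable, stand-alone sketch of the decision flow the hunk above adds,
with stubbed stand-ins for the decoder state (the helper names and the
guest-kernel address check below are placeholders, not the real
tools/perf/util/intel-pt.c functions):

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

struct decode_state {
	bool to_guest;		/* stands in for ptq->state->to_nr */
	uint64_t kernel_start;	/* stands in for ptq->pt->kernel_start */
};

/* Placeholder checks; the real decoder knows the guest/host kernel layout. */
static bool guest_kernel_ip(uint64_t ip) { return ip >= 0xffff000000000000ULL; }
static int match_pgd_ip(uint64_t ip) { (void)ip; return 0; }

static int pgd_ip(struct decode_state *s, uint64_t ip)
{
	if (s->to_guest) {
		/* A guest kernel IP is matched against the filters like a host one. */
		if (guest_kernel_ip(ip))
			return match_pgd_ip(ip);
		/* No support for decoding guest user space. */
		return -EINVAL;
	}
	if (ip >= s->kernel_start)
		return match_pgd_ip(ip);
	return 0;	/* host user space is handled separately in the real code */
}

int main(void)
{
	struct decode_state s = { .to_guest = true, .kernel_start = 0xffff000000000000ULL };

	/* A guest user-space IP must be rejected. */
	return pgd_ip(&s, 0x00007f0000001000ULL) == -EINVAL ? 0 : 1;
}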



[PATCH 5.4 06/32] arm64: Fix kernel address detection of __is_lm_address()

2021-02-05 Thread Greg Kroah-Hartman
From: Vincenzo Frascino 

commit 519ea6f1c82fcdc9842908155ae379de47818778 upstream.

Currently, the __is_lm_address() check just masks out the top 12 bits
of the address, but if they are 0, it still yields a true result.
This has as a side effect that virt_addr_valid() returns true even for
invalid virtual addresses (e.g. 0x0).

Fix the detection checking that it's actually a kernel address starting
at PAGE_OFFSET.

Fixes: 68dd8ef32162 ("arm64: memory: Fix virt_addr_valid() using __is_lm_address()")
Cc:  # 5.4.x
Cc: Will Deacon 
Suggested-by: Catalin Marinas 
Reviewed-by: Catalin Marinas 
Acked-by: Mark Rutland 
Signed-off-by: Vincenzo Frascino 
Link: https://lore.kernel.org/r/20210126134056.45747-1-vincenzo.frasc...@arm.com
Signed-off-by: Catalin Marinas 
Signed-off-by: Greg Kroah-Hartman 
---
 arch/arm64/include/asm/memory.h |8 
 1 file changed, 4 insertions(+), 4 deletions(-)

--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -247,11 +247,11 @@ static inline const void *__tag_set(cons
 
 
 /*
- * The linear kernel range starts at the bottom of the virtual address
- * space. Testing the top bit for the start of the region is a
- * sufficient check and avoids having to worry about the tag.
+ * Check whether an arbitrary address is within the linear map, which
+ * lives in the [PAGE_OFFSET, PAGE_END) interval at the bottom of the
+ * kernel's TTBR1 address range.
  */
-#define __is_lm_address(addr)  (!(((u64)addr) & BIT(vabits_actual - 1)))
+#define __is_lm_address(addr)  (((u64)(addr) ^ PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))
 
 #define __lm_to_phys(addr) (((addr) & ~PAGE_OFFSET) + PHYS_OFFSET)
 #define __kimg_to_phys(addr)   ((addr) - kimage_voffset)




[PATCH 5.10 13/57] arm64: Fix kernel address detection of __is_lm_address()

2021-02-05 Thread Greg Kroah-Hartman
From: Vincenzo Frascino 

commit 519ea6f1c82fcdc9842908155ae379de47818778 upstream.

Currently, the __is_lm_address() check just masks out the top 12 bits
of the address, but if they are 0, it still yields a true result.
This has as a side effect that virt_addr_valid() returns true even for
invalid virtual addresses (e.g. 0x0).

Fix the detection checking that it's actually a kernel address starting
at PAGE_OFFSET.

Fixes: 68dd8ef32162 ("arm64: memory: Fix virt_addr_valid() using __is_lm_address()")
Cc:  # 5.4.x
Cc: Will Deacon 
Suggested-by: Catalin Marinas 
Reviewed-by: Catalin Marinas 
Acked-by: Mark Rutland 
Signed-off-by: Vincenzo Frascino 
Link: https://lore.kernel.org/r/20210126134056.45747-1-vincenzo.frasc...@arm.com
Signed-off-by: Catalin Marinas 
Signed-off-by: Greg Kroah-Hartman 
---
 arch/arm64/include/asm/memory.h |8 
 1 file changed, 4 insertions(+), 4 deletions(-)

--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -238,11 +238,11 @@ static inline const void *__tag_set(cons
 
 
 /*
- * The linear kernel range starts at the bottom of the virtual address
- * space. Testing the top bit for the start of the region is a
- * sufficient check and avoids having to worry about the tag.
+ * Check whether an arbitrary address is within the linear map, which
+ * lives in the [PAGE_OFFSET, PAGE_END) interval at the bottom of the
+ * kernel's TTBR1 address range.
  */
-#define __is_lm_address(addr)  (!(((u64)addr) & BIT(vabits_actual - 1)))
+#define __is_lm_address(addr)  (((u64)(addr) ^ PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))
 
 #define __lm_to_phys(addr) (((addr) & ~PAGE_OFFSET) + PHYS_OFFSET)
 #define __kimg_to_phys(addr)   ((addr) - kimage_voffset)




Re: [PATCH] arm64: Fix kernel address detection of __is_lm_address()

2021-01-27 Thread Catalin Marinas
On Tue, 26 Jan 2021 13:40:56 +, Vincenzo Frascino wrote:
> Currently, the __is_lm_address() check just masks out the top 12 bits
> of the address, but if they are 0, it still yields a true result.
> This has as a side effect that virt_addr_valid() returns true even for
> invalid virtual addresses (e.g. 0x0).
> 
> Fix the detection checking that it's actually a kernel address starting
> at PAGE_OFFSET.

Applied to arm64 (for-next/fixes), thanks!

[1/1] arm64: Fix kernel address detection of __is_lm_address()
  https://git.kernel.org/arm64/c/519ea6f1c82f

-- 
Catalin



Re: [PATCH] arm64: Fix kernel address detection of __is_lm_address()

2021-01-26 Thread Vincenzo Frascino



On 1/26/21 4:36 PM, Catalin Marinas wrote:
> On Tue, Jan 26, 2021 at 01:40:56PM +, Vincenzo Frascino wrote:
>> Currently, the __is_lm_address() check just masks out the top 12 bits
>> of the address, but if they are 0, it still yields a true result.
>> This has as a side effect that virt_addr_valid() returns true even for
>> invalid virtual addresses (e.g. 0x0).
>>
>> Fix the detection checking that it's actually a kernel address starting
>> at PAGE_OFFSET.
>>
>> Fixes: f4693c2716b35 ("arm64: mm: extend linear region for 52-bit VA configurations")
>> Cc:  # 5.4.x
> 
> Not sure what happened with the Fixes tag but that's definitely not what
> it fixes. The above is a 5.11 commit that preserves the semantics of an
> older commit. So it should be:
> 
> Fixes: 68dd8ef32162 ("arm64: memory: Fix virt_addr_valid() using __is_lm_address()")
> 

Yes, that is correct. I moved the release to which it applies backward but
forgot to update the Fixes tag, I suppose.

...

> 
> Anyway, no need to repost, I can update the fixes tag myself.
>

Thank you for this.

> In terms of stable backports, it may be cleaner to backport 7bc1a0f9e176
> ("arm64: mm: use single quantity to represent the PA to VA translation")
> which has a Fixes tag already but never made it to -stable. On top of
> this, we can backport Ard's latest f4693c2716b35 ("arm64: mm: extend
> linear region for 52-bit VA configurations"). I just tried these locally
> and the conflicts were fairly trivial.
> 

Ok, thank you for digging it. I will give it a try tomorrow.

-- 
Regards,
Vincenzo


Re: [PATCH] arm64: Fix kernel address detection of __is_lm_address()

2021-01-26 Thread Catalin Marinas
On Tue, Jan 26, 2021 at 01:40:56PM +, Vincenzo Frascino wrote:
> Currently, the __is_lm_address() check just masks out the top 12 bits
> of the address, but if they are 0, it still yields a true result.
> This has as a side effect that virt_addr_valid() returns true even for
> invalid virtual addresses (e.g. 0x0).
> 
> Fix the detection checking that it's actually a kernel address starting
> at PAGE_OFFSET.
> 
> Fixes: f4693c2716b35 ("arm64: mm: extend linear region for 52-bit VA configurations")
> Cc:  # 5.4.x

Not sure what happened with the Fixes tag but that's definitely not what
it fixes. The above is a 5.11 commit that preserves the semantics of an
older commit. So it should be:

Fixes: 68dd8ef32162 ("arm64: memory: Fix virt_addr_valid() using __is_lm_address()")

The above also had a fix for another commit but no need to add two
entries, we just fix the original fix: 14c127c957c1 ("arm64: mm: Flip
kernel VA space").

Anyway, no need to repost, I can update the fixes tag myself.

In terms of stable backports, it may be cleaner to backport 7bc1a0f9e176
("arm64: mm: use single quantity to represent the PA to VA translation")
which has a Fixes tag already but never made it to -stable. On top of
this, we can backport Ard's latest f4693c2716b35 ("arm64: mm: extend
linear region for 52-bit VA configurations"). I just tried these locally
and the conflicts were fairly trivial.

-- 
Catalin


[PATCH] arm64: Fix kernel address detection of __is_lm_address()

2021-01-26 Thread Vincenzo Frascino
Currently, the __is_lm_address() check just masks out the top 12 bits
of the address, but if they are 0, it still yields a true result.
This has as a side effect that virt_addr_valid() returns true even for
invalid virtual addresses (e.g. 0x0).

Fix the detection checking that it's actually a kernel address starting
at PAGE_OFFSET.

Fixes: f4693c2716b35 ("arm64: mm: extend linear region for 52-bit VA configurations")
Cc:  # 5.4.x
Cc: Catalin Marinas 
Cc: Will Deacon 
Suggested-by: Catalin Marinas 
Reviewed-by: Catalin Marinas 
Acked-by: Mark Rutland 
Signed-off-by: Vincenzo Frascino 
---
 arch/arm64/include/asm/memory.h | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 18fce223b67b..99d7e1494aaa 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -247,9 +247,11 @@ static inline const void *__tag_set(const void *addr, u8 tag)
 
 
 /*
- * The linear kernel range starts at the bottom of the virtual address space.
+ * Check whether an arbitrary address is within the linear map, which
+ * lives in the [PAGE_OFFSET, PAGE_END) interval at the bottom of the
+ * kernel's TTBR1 address range.
  */
-#define __is_lm_address(addr)  (((u64)(addr) & ~PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))
+#define __is_lm_address(addr)  (((u64)(addr) ^ PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))
 
 #define __lm_to_phys(addr) (((addr) & ~PAGE_OFFSET) + PHYS_OFFSET)
 #define __kimg_to_phys(addr)   ((addr) - kimage_voffset)
-- 
2.30.0
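
To make the behavioural difference concrete, here is a small user-space
sketch comparing the three checks seen in this thread: the old bit test
removed by the 5.4/5.10 backports above, the "& ~PAGE_OFFSET" form this
patch replaces, and the new XOR form. The PAGE_OFFSET/PAGE_END/vabits
values are illustrative for a 48-bit VA configuration, not taken from the
patches:

#include <stdint.h>
#include <stdio.h>

#define VABITS_ACTUAL	48
#define PAGE_OFFSET	0xffff000000000000ULL	/* start of the linear map (assumed) */
#define PAGE_END	0xffff800000000000ULL	/* end of the linear map (assumed)   */

/* 5.4/5.10 form: only tests the top implemented VA bit, so NULL passes. */
#define is_lm_bit(addr)	(!(((uint64_t)(addr)) & (1ULL << (VABITS_ACTUAL - 1))))
/* Pre-fix mainline form: masking the PAGE_OFFSET bits away also lets NULL pass. */
#define is_lm_and(addr)	((((uint64_t)(addr)) & ~PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))
/* Fixed form: XOR only cancels the top bits when they are actually set,
 * so anything below PAGE_OFFSET, including NULL, is rejected. */
#define is_lm_xor(addr)	((((uint64_t)(addr)) ^ PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))

int main(void)
{
	uint64_t addrs[] = { 0x0, PAGE_OFFSET + 0x1000, PAGE_END + 0x1000 };

	for (unsigned int i = 0; i < sizeof(addrs) / sizeof(addrs[0]); i++)
		printf("%#018llx  bit=%d  and=%d  xor=%d\n",
		       (unsigned long long)addrs[i], (int)is_lm_bit(addrs[i]),
		       (int)is_lm_and(addrs[i]), (int)is_lm_xor(addrs[i]));
	return 0;
}

With these values, NULL passes both old forms but is rejected by the XOR
form, while a genuine linear-map address passes all three and an address at
or above PAGE_END is rejected by all three.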



Re: [PATCH v4 1/3] arm64: Improve kernel address detection of __is_lm_address()

2021-01-26 Thread Vincenzo Frascino



On 1/26/21 12:07 PM, Catalin Marinas wrote:
> On Tue, Jan 26, 2021 at 11:58:13AM +, Vincenzo Frascino wrote:
>> On 1/25/21 5:56 PM, Catalin Marinas wrote:
>>> On Mon, Jan 25, 2021 at 04:09:57PM +, Vincenzo Frascino wrote:
>>>> On 1/25/21 2:59 PM, Catalin Marinas wrote:
>>>>> On Mon, Jan 25, 2021 at 02:36:34PM +, Vincenzo Frascino wrote:
>>>>>> On 1/25/21 1:02 PM, Mark Rutland wrote:
>>>>>>> On Fri, Jan 22, 2021 at 03:56:40PM +, Vincenzo Frascino wrote:
>>>>>>>> Currently, the __is_lm_address() check just masks out the top 12 bits
>>>>>>>> of the address, but if they are 0, it still yields a true result.
>>>>>>>> This has as a side effect that virt_addr_valid() returns true even for
>>>>>>>> invalid virtual addresses (e.g. 0x0).
>>>>>>>>
>>>>>>>> Improve the detection checking that it's actually a kernel address
>>>>>>>> starting at PAGE_OFFSET.
>>>>>>>>
>>>>>>>> Cc: Catalin Marinas 
>>>>>>>> Cc: Will Deacon 
>>>>>>>> Suggested-by: Catalin Marinas 
>>>>>>>> Reviewed-by: Catalin Marinas 
>>>>>>>> Signed-off-by: Vincenzo Frascino 
>>>>>>>
>>>>>>> Looking around, it seems that there are some existing uses of
>>>>>>> virt_addr_valid() that expect it to reject addresses outside of the
>>>>>>> TTBR1 range. For example, check_mem_type() in drivers/tee/optee/call.c.
>>>>>>>
>>>>>>> Given that, I think we need something that's easy to backport to stable.
>>>>>>>
>>>>>>
>>>>>> I agree, I started looking at it this morning and I found cases even in
>>>>>> the main allocators (slub and page_alloc) other than the one you mentioned.
>>>>>>
>>>>>>> This patch itself looks fine, but it's not going to backport very far,
>>>>>>> so I suspect we might need to write a preparatory patch that adds an
>>>>>>> explicit range check to virt_addr_valid() which can be trivially
>>>>>>> backported.
>>>>>>>
>>>>>>
>>>>>> I checked the old releases and I agree this is not back-portable as it 
>>>>>> stands.
>>>>>> I propose therefore to add a preparatory patch with the check below:
>>>>>>
>>>>>> #define __is_ttrb1_address(addr) ((u64)(addr) >= PAGE_OFFSET && \
>>>>>>  (u64)(addr) < PAGE_END)
>>>>>>
>>>>>> If it works for you I am happy to take care of it and post a new version 
>>>>>> of my
>>>>>> patches.
>>>>>
>>>>> I'm not entirely sure we need a preparatory patch. IIUC (it needs
>>>>> checking), virt_addr_valid() was fine until 5.4, broken by commit
>>>>> 14c127c957c1 ("arm64: mm: Flip kernel VA space"). Will addressed the
>>>>> flip case in 68dd8ef32162 ("arm64: memory: Fix virt_addr_valid() using
>>>>> __is_lm_address()") but this broke the >>>> NULL address is considered valid.
>>>>>
>>>>> Ard's commit f4693c2716b3 ("arm64: mm: extend linear region for 52-bit
>>>>> VA configurations") changed the test to no longer rely on va_bits but
>>>>> did not change the broken semantics.
>>>>>
>>>>> If Ard's change plus the fix proposed in this test works on 5.4, I'd say
>>>>> we just merge this patch with the corresponding Cc stable and Fixes tags
>>>>> and tweak it slightly when doing the backports as it wouldn't apply
>>>>> cleanly. IOW, I wouldn't add another check to virt_addr_valid() as we
>>>>> did not need one prior to 5.4.
>>>>
>>>> Thank you for the detailed analysis. I checked on 5.4 and it seems that Ard
>>>> patch (not a clean backport) plus my proposed fix works correctly and 
>>>> solves the
>>>> issue.
>>>
>>> I didn't mean the backport of the whole commit f4693c2716b3 as it
>>> probably has other dependencies, just the __is_lm_address() change in
>>> that patch.
>>
>> Then call it preparatory patch ;)
> 
> It's preparatory only for the stable backports, not for current
> mainline. But I'd rather change the upstream patch when backporting to
> apply cleanly, no need for a preparatory stable patch.
> 

Thanks for the clarification.

-- 
Regards,
Vincenzo


Re: [PATCH v4 1/3] arm64: Improve kernel address detection of __is_lm_address()

2021-01-26 Thread Catalin Marinas
On Tue, Jan 26, 2021 at 11:58:13AM +, Vincenzo Frascino wrote:
> On 1/25/21 5:56 PM, Catalin Marinas wrote:
> > On Mon, Jan 25, 2021 at 04:09:57PM +, Vincenzo Frascino wrote:
> >> On 1/25/21 2:59 PM, Catalin Marinas wrote:
> >>> On Mon, Jan 25, 2021 at 02:36:34PM +, Vincenzo Frascino wrote:
> >>>> On 1/25/21 1:02 PM, Mark Rutland wrote:
> >>>>> On Fri, Jan 22, 2021 at 03:56:40PM +, Vincenzo Frascino wrote:
> >>>>>> Currently, the __is_lm_address() check just masks out the top 12 bits
> >>>>>> of the address, but if they are 0, it still yields a true result.
> >>>>>> This has as a side effect that virt_addr_valid() returns true even for
> >>>>>> invalid virtual addresses (e.g. 0x0).
> >>>>>>
> >>>>>> Improve the detection checking that it's actually a kernel address
> >>>>>> starting at PAGE_OFFSET.
> >>>>>>
> >>>>>> Cc: Catalin Marinas 
> >>>>>> Cc: Will Deacon 
> >>>>>> Suggested-by: Catalin Marinas 
> >>>>>> Reviewed-by: Catalin Marinas 
> >>>>>> Signed-off-by: Vincenzo Frascino 
> >>>>>
> >>>>> Looking around, it seems that there are some existing uses of
> >>>>> virt_addr_valid() that expect it to reject addresses outside of the
> >>>>> TTBR1 range. For example, check_mem_type() in drivers/tee/optee/call.c.
> >>>>>
> >>>>> Given that, I think we need something that's easy to backport to stable.
> >>>>>
> >>>>
> >>>> I agree, I started looking at it this morning and I found cases even in
> >>>> the main allocators (slub and page_alloc) other than the one you mentioned.
> >>>>
> >>>>> This patch itself looks fine, but it's not going to backport very far,
> >>>>> so I suspect we might need to write a preparatory patch that adds an
> >>>>> explicit range check to virt_addr_valid() which can be trivially
> >>>>> backported.
> >>>>>
> >>>>
> >>>> I checked the old releases and I agree this is not back-portable as it 
> >>>> stands.
> >>>> I propose therefore to add a preparatory patch with the check below:
> >>>>
> >>>> #define __is_ttrb1_address(addr) ((u64)(addr) >= PAGE_OFFSET && \
> >>>>  (u64)(addr) < PAGE_END)
> >>>>
> >>>> If it works for you I am happy to take care of it and post a new version 
> >>>> of my
> >>>> patches.
> >>>
> >>> I'm not entirely sure we need a preparatory patch. IIUC (it needs
> >>> checking), virt_addr_valid() was fine until 5.4, broken by commit
> >>> 14c127c957c1 ("arm64: mm: Flip kernel VA space"). Will addressed the
> >>> flip case in 68dd8ef32162 ("arm64: memory: Fix virt_addr_valid() using
> >>> __is_lm_address()") but this broke the  >>> NULL address is considered valid.
> >>>
> >>> Ard's commit f4693c2716b3 ("arm64: mm: extend linear region for 52-bit
> >>> VA configurations") changed the test to no longer rely on va_bits but
> >>> did not change the broken semantics.
> >>>
> >>> If Ard's change plus the fix proposed in this test works on 5.4, I'd say
> >>> we just merge this patch with the corresponding Cc stable and Fixes tags
> >>> and tweak it slightly when doing the backports as it wouldn't apply
> >>> cleanly. IOW, I wouldn't add another check to virt_addr_valid() as we
> >>> did not need one prior to 5.4.
> >>
> >> Thank you for the detailed analysis. I checked on 5.4 and it seems that Ard
> >> patch (not a clean backport) plus my proposed fix works correctly and 
> >> solves the
> >> issue.
> > 
> > I didn't mean the backport of the whole commit f4693c2716b3 as it
> > probably has other dependencies, just the __is_lm_address() change in
> > that patch.
> 
> Then call it preparatory patch ;)

It's preparatory only for the stable backports, not for current
mainline. But I'd rather change the upstream patch when backporting to
apply cleanly, no need for a preparatory stable patch.

-- 
Catalin


Re: [PATCH v4 1/3] arm64: Improve kernel address detection of __is_lm_address()

2021-01-26 Thread Vincenzo Frascino



On 1/25/21 5:56 PM, Catalin Marinas wrote:
> On Mon, Jan 25, 2021 at 04:09:57PM +, Vincenzo Frascino wrote:
>> On 1/25/21 2:59 PM, Catalin Marinas wrote:
>>> On Mon, Jan 25, 2021 at 02:36:34PM +, Vincenzo Frascino wrote:
>>>> On 1/25/21 1:02 PM, Mark Rutland wrote:
>>>>> On Fri, Jan 22, 2021 at 03:56:40PM +, Vincenzo Frascino wrote:
>>>>>> Currently, the __is_lm_address() check just masks out the top 12 bits
>>>>>> of the address, but if they are 0, it still yields a true result.
>>>>>> This has as a side effect that virt_addr_valid() returns true even for
>>>>>> invalid virtual addresses (e.g. 0x0).
>>>>>>
>>>>>> Improve the detection checking that it's actually a kernel address
>>>>>> starting at PAGE_OFFSET.
>>>>>>
>>>>>> Cc: Catalin Marinas 
>>>>>> Cc: Will Deacon 
>>>>>> Suggested-by: Catalin Marinas 
>>>>>> Reviewed-by: Catalin Marinas 
>>>>>> Signed-off-by: Vincenzo Frascino 
>>>>>
>>>>> Looking around, it seems that there are some existing uses of
>>>>> virt_addr_valid() that expect it to reject addresses outside of the
>>>>> TTBR1 range. For example, check_mem_type() in drivers/tee/optee/call.c.
>>>>>
>>>>> Given that, I think we need something that's easy to backport to stable.
>>>>>
>>>>
>>>> I agree, I started looking at it this morning and I found cases even in
>>>> the main allocators (slub and page_alloc) other than the one you mentioned.
>>>>
>>>>> This patch itself looks fine, but it's not going to backport very far,
>>>>> so I suspect we might need to write a preparatory patch that adds an
>>>>> explicit range check to virt_addr_valid() which can be trivially
>>>>> backported.
>>>>>
>>>>
>>>> I checked the old releases and I agree this is not back-portable as it 
>>>> stands.
>>>> I propose therefore to add a preparatory patch with the check below:
>>>>
>>>> #define __is_ttrb1_address(addr)   ((u64)(addr) >= PAGE_OFFSET && \
>>>>(u64)(addr) < PAGE_END)
>>>>
>>>> If it works for you I am happy to take care of it and post a new version 
>>>> of my
>>>> patches.
>>>
>>> I'm not entirely sure we need a preparatory patch. IIUC (it needs
>>> checking), virt_addr_valid() was fine until 5.4, broken by commit
>>> 14c127c957c1 ("arm64: mm: Flip kernel VA space"). Will addressed the
>>> flip case in 68dd8ef32162 ("arm64: memory: Fix virt_addr_valid() using
>>> __is_lm_address()") but this broke the >> NULL address is considered valid.
>>>
>>> Ard's commit f4693c2716b3 ("arm64: mm: extend linear region for 52-bit
>>> VA configurations") changed the test to no longer rely on va_bits but
>>> did not change the broken semantics.
>>>
>>> If Ard's change plus the fix proposed in this test works on 5.4, I'd say
>>> we just merge this patch with the corresponding Cc stable and Fixes tags
>>> and tweak it slightly when doing the backports as it wouldn't apply
>>> cleanly. IOW, I wouldn't add another check to virt_addr_valid() as we
>>> did not need one prior to 5.4.
>>
>> Thank you for the detailed analysis. I checked on 5.4 and it seems that Ard
>> patch (not a clean backport) plus my proposed fix works correctly and solves 
>> the
>> issue.
> 
> I didn't mean the backport of the whole commit f4693c2716b3 as it
> probably has other dependencies, just the __is_lm_address() change in
> that patch.
> 

Then call it preparatory patch ;)

>> Tomorrow I will post a new version of the series that includes what you are
>> suggesting.
> 
> Please post the __is_lm_address() fix separately from the kasan patches.
> I'll pick it up as a fix via the arm64 tree. The kasan change can go in
> 5.12 since it's not currently broken but I'll leave the decision with
> Andrey.
> 

Ok, will do.

-- 
Regards,
Vincenzo


Re: [PATCH v4 1/3] arm64: Improve kernel address detection of __is_lm_address()

2021-01-25 Thread Mark Rutland
Hi Vincenzo,

On Fri, Jan 22, 2021 at 03:56:40PM +, Vincenzo Frascino wrote:
> Currently, the __is_lm_address() check just masks out the top 12 bits
> of the address, but if they are 0, it still yields a true result.
> This has as a side effect that virt_addr_valid() returns true even for
> invalid virtual addresses (e.g. 0x0).
> 
> Improve the detection checking that it's actually a kernel address
> starting at PAGE_OFFSET.
> 
> Cc: Catalin Marinas 
> Cc: Will Deacon 
> Suggested-by: Catalin Marinas 
> Reviewed-by: Catalin Marinas 
> Signed-off-by: Vincenzo Frascino 

Looking around, it seems that there are some existing uses of
virt_addr_valid() that expect it to reject addresses outside of the
TTBR1 range. For example, check_mem_type() in drivers/tee/optee/call.c.

Given that, I think we need something that's easy to backport to stable.

This patch itself looks fine, but it's not going to backport very far,
so I suspect we might need to write a preparatory patch that adds an
explicit range check to virt_addr_valid() which can be trivially
backported.

For this patch:

Acked-by: Mark Rutland 

Thanks,
Mark.

> ---
>  arch/arm64/include/asm/memory.h | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
> index 18fce223b67b..99d7e1494aaa 100644
> --- a/arch/arm64/include/asm/memory.h
> +++ b/arch/arm64/include/asm/memory.h
> @@ -247,9 +247,11 @@ static inline const void *__tag_set(const void *addr, u8 tag)
>  
>  
>  /*
> - * The linear kernel range starts at the bottom of the virtual address space.
> + * Check whether an arbitrary address is within the linear map, which
> + * lives in the [PAGE_OFFSET, PAGE_END) interval at the bottom of the
> + * kernel's TTBR1 address range.
>   */
> -#define __is_lm_address(addr)	(((u64)(addr) & ~PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))
> +#define __is_lm_address(addr)	(((u64)(addr) ^ PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))
>  
>  #define __lm_to_phys(addr)   (((addr) & ~PAGE_OFFSET) + PHYS_OFFSET)
>  #define __kimg_to_phys(addr) ((addr) - kimage_voffset)
> -- 
> 2.30.0
> 


Re: [PATCH v4 1/3] arm64: Improve kernel address detection of __is_lm_address()

2021-01-25 Thread Catalin Marinas
On Mon, Jan 25, 2021 at 04:09:57PM +, Vincenzo Frascino wrote:
> On 1/25/21 2:59 PM, Catalin Marinas wrote:
> > On Mon, Jan 25, 2021 at 02:36:34PM +, Vincenzo Frascino wrote:
> >> On 1/25/21 1:02 PM, Mark Rutland wrote:
> >>> On Fri, Jan 22, 2021 at 03:56:40PM +, Vincenzo Frascino wrote:
> >>>> Currently, the __is_lm_address() check just masks out the top 12 bits
> >>>> of the address, but if they are 0, it still yields a true result.
> >>>> This has as a side effect that virt_addr_valid() returns true even for
> >>>> invalid virtual addresses (e.g. 0x0).
> >>>>
> >>>> Improve the detection checking that it's actually a kernel address
> >>>> starting at PAGE_OFFSET.
> >>>>
> >>>> Cc: Catalin Marinas 
> >>>> Cc: Will Deacon 
> >>>> Suggested-by: Catalin Marinas 
> >>>> Reviewed-by: Catalin Marinas 
> >>>> Signed-off-by: Vincenzo Frascino 
> >>>
> >>> Looking around, it seems that there are some existing uses of
> >>> virt_addr_valid() that expect it to reject addresses outside of the
> >>> TTBR1 range. For example, check_mem_type() in drivers/tee/optee/call.c.
> >>>
> >>> Given that, I think we need something that's easy to backport to stable.
> >>>
> >>
> >> I agree, I started looking at it this morning and I found cases even in
> >> the main allocators (slub and page_alloc) other than the one you mentioned.
> >>
> >>> This patch itself looks fine, but it's not going to backport very far,
> >>> so I suspect we might need to write a preparatory patch that adds an
> >>> explicit range check to virt_addr_valid() which can be trivially
> >>> backported.
> >>>
> >>
> >> I checked the old releases and I agree this is not back-portable as it 
> >> stands.
> >> I propose therefore to add a preparatory patch with the check below:
> >>
> >> #define __is_ttrb1_address(addr)   ((u64)(addr) >= PAGE_OFFSET && \
> >>(u64)(addr) < PAGE_END)
> >>
> >> If it works for you I am happy to take care of it and post a new version 
> >> of my
> >> patches.
> > 
> > I'm not entirely sure we need a preparatory patch. IIUC (it needs
> > checking), virt_addr_valid() was fine until 5.4, broken by commit
> > 14c127c957c1 ("arm64: mm: Flip kernel VA space"). Will addressed the
> > flip case in 68dd8ef32162 ("arm64: memory: Fix virt_addr_valid() using
> > __is_lm_address()") but this broke the  > NULL address is considered valid.
> > 
> > Ard's commit f4693c2716b3 ("arm64: mm: extend linear region for 52-bit
> > VA configurations") changed the test to no longer rely on va_bits but
> > did not change the broken semantics.
> > 
> > If Ard's change plus the fix proposed in this test works on 5.4, I'd say
> > we just merge this patch with the corresponding Cc stable and Fixes tags
> > and tweak it slightly when doing the backports as it wouldn't apply
> > cleanly. IOW, I wouldn't add another check to virt_addr_valid() as we
> > did not need one prior to 5.4.
> 
> Thank you for the detailed analysis. I checked on 5.4 and it seems that Ard
> patch (not a clean backport) plus my proposed fix works correctly and solves 
> the
> issue.

I didn't mean the backport of the whole commit f4693c2716b3 as it
probably has other dependencies, just the __is_lm_address() change in
that patch.

> Tomorrow I will post a new version of the series that includes what you are
> suggesting.

Please post the __is_lm_address() fix separately from the kasan patches.
I'll pick it up as a fix via the arm64 tree. The kasan change can go in
5.12 since it's not currently broken but I'll leave the decision with
Andrey.

-- 
Catalin


Re: [PATCH v4 1/3] arm64: Improve kernel address detection of __is_lm_address()

2021-01-25 Thread Mark Rutland
On Mon, Jan 25, 2021 at 02:59:12PM +, Catalin Marinas wrote:
> On Mon, Jan 25, 2021 at 02:36:34PM +, Vincenzo Frascino wrote:
> > On 1/25/21 1:02 PM, Mark Rutland wrote:
> > > On Fri, Jan 22, 2021 at 03:56:40PM +, Vincenzo Frascino wrote:
> > > This patch itself looks fine, but it's not going to backport very far,
> > > so I suspect we might need to write a preparatory patch that adds an
> > > explicit range check to virt_addr_valid() which can be trivially
> > > backported.
> > 
> > I checked the old releases and I agree this is not back-portable as it 
> > stands.
> > I propose therefore to add a preparatory patch with the check below:
> > 
> > #define __is_ttrb1_address(addr)	((u64)(addr) >= PAGE_OFFSET && \
> > 					 (u64)(addr) < PAGE_END)
> > If it works for you I am happy to take care of it and post a new version of 
> > my
> > patches.
> 
> I'm not entirely sure we need a preparatory patch. IIUC (it needs
> checking), virt_addr_valid() was fine until 5.4, broken by commit
> 14c127c957c1 ("arm64: mm: Flip kernel VA space").

Ah, so it was; thanks for digging into the history!

> Will addressed the
> flip case in 68dd8ef32162 ("arm64: memory: Fix virt_addr_valid() using
> __is_lm_address()") but this broke the  NULL address is considered valid.
> 
> Ard's commit f4693c2716b3 ("arm64: mm: extend linear region for 52-bit
> VA configurations") changed the test to no longer rely on va_bits but
> did not change the broken semantics.
> 
> If Ard's change plus the fix proposed in this test works on 5.4, I'd say
> we just merge this patch with the corresponding Cc stable and Fixes tags
> and tweak it slightly when doing the backports as it wouldn't apply
> cleanly. IOW, I wouldn't add another check to virt_addr_valid() as we
> did not need one prior to 5.4.

That makes sense to me; sorry for the noise!

Thanks,
Mark.


Re: [PATCH v4 1/3] arm64: Improve kernel address detection of __is_lm_address()

2021-01-25 Thread Vincenzo Frascino



On 1/25/21 2:59 PM, Catalin Marinas wrote:
> On Mon, Jan 25, 2021 at 02:36:34PM +, Vincenzo Frascino wrote:
>> On 1/25/21 1:02 PM, Mark Rutland wrote:
>>> On Fri, Jan 22, 2021 at 03:56:40PM +, Vincenzo Frascino wrote:
>>>> Currently, the __is_lm_address() check just masks out the top 12 bits
>>>> of the address, but if they are 0, it still yields a true result.
>>>> This has as a side effect that virt_addr_valid() returns true even for
>>>> invalid virtual addresses (e.g. 0x0).
>>>>
>>>> Improve the detection checking that it's actually a kernel address
>>>> starting at PAGE_OFFSET.
>>>>
>>>> Cc: Catalin Marinas 
>>>> Cc: Will Deacon 
>>>> Suggested-by: Catalin Marinas 
>>>> Reviewed-by: Catalin Marinas 
>>>> Signed-off-by: Vincenzo Frascino 
>>>
>>> Looking around, it seems that there are some existing uses of
>>> virt_addr_valid() that expect it to reject addresses outside of the
>>> TTBR1 range. For example, check_mem_type() in drivers/tee/optee/call.c.
>>>
>>> Given that, I think we need something that's easy to backport to stable.
>>>
>>
>> I agree, I started looking at it this morning and I found cases even in the
>> main allocators (slub and page_alloc) other than the one you mentioned.
>>
>>> This patch itself looks fine, but it's not going to backport very far,
>>> so I suspect we might need to write a preparatory patch that adds an
>>> explicit range check to virt_addr_valid() which can be trivially
>>> backported.
>>>
>>
>> I checked the old releases and I agree this is not back-portable as it 
>> stands.
>> I propose therefore to add a preparatory patch with the check below:
>>
>> #define __is_ttrb1_address(addr) ((u64)(addr) >= PAGE_OFFSET && \
>>  (u64)(addr) < PAGE_END)
>>
>> If it works for you I am happy to take care of it and post a new version of 
>> my
>> patches.
> 
> I'm not entirely sure we need a preparatory patch. IIUC (it needs
> checking), virt_addr_valid() was fine until 5.4, broken by commit
> 14c127c957c1 ("arm64: mm: Flip kernel VA space"). Will addressed the
> flip case in 68dd8ef32162 ("arm64: memory: Fix virt_addr_valid() using
> __is_lm_address()") but this broke the  NULL address is considered valid.
> 
> Ard's commit f4693c2716b3 ("arm64: mm: extend linear region for 52-bit
> VA configurations") changed the test to no longer rely on va_bits but
> did not change the broken semantics.
> 
> If Ard's change plus the fix proposed in this test works on 5.4, I'd say
> we just merge this patch with the corresponding Cc stable and Fixes tags
> and tweak it slightly when doing the backports as it wouldn't apply
> cleanly. IOW, I wouldn't add another check to virt_addr_valid() as we
> did not need one prior to 5.4.
> 

Thank you for the detailed analysis. I checked on 5.4 and it seems that Ard
patch (not a clean backport) plus my proposed fix works correctly and solves the
issue.

Tomorrow I will post a new version of the series that includes what you are
suggesting.

-- 
Regards,
Vincenzo


Re: [PATCH v4 1/3] arm64: Improve kernel address detection of __is_lm_address()

2021-01-25 Thread Catalin Marinas
On Mon, Jan 25, 2021 at 02:36:34PM +, Vincenzo Frascino wrote:
> On 1/25/21 1:02 PM, Mark Rutland wrote:
> > On Fri, Jan 22, 2021 at 03:56:40PM +, Vincenzo Frascino wrote:
> >> Currently, the __is_lm_address() check just masks out the top 12 bits
> >> of the address, but if they are 0, it still yields a true result.
> >> This has as a side effect that virt_addr_valid() returns true even for
> >> invalid virtual addresses (e.g. 0x0).
> >>
> >> Improve the detection checking that it's actually a kernel address
> >> starting at PAGE_OFFSET.
> >>
> >> Cc: Catalin Marinas 
> >> Cc: Will Deacon 
> >> Suggested-by: Catalin Marinas 
> >> Reviewed-by: Catalin Marinas 
> >> Signed-off-by: Vincenzo Frascino 
> > 
> > Looking around, it seems that there are some existing uses of
> > virt_addr_valid() that expect it to reject addresses outside of the
> > TTBR1 range. For example, check_mem_type() in drivers/tee/optee/call.c.
> > 
> > Given that, I think we need something that's easy to backport to stable.
> > 
> 
> I agree, I started looking at it this morning and I found cases even in the
> main allocators (slub and page_alloc) other than the one you mentioned.
> 
> > This patch itself looks fine, but it's not going to backport very far,
> > so I suspect we might need to write a preparatory patch that adds an
> > explicit range check to virt_addr_valid() which can be trivially
> > backported.
> > 
> 
> I checked the old releases and I agree this is not back-portable as it stands.
> I propose therefore to add a preparatory patch with the check below:
> 
> #define __is_ttrb1_address(addr)  ((u64)(addr) >= PAGE_OFFSET && \
>   (u64)(addr) < PAGE_END)
> 
> If it works for you I am happy to take care of it and post a new version of my
> patches.

I'm not entirely sure we need a preparatory patch. IIUC (it needs
checking), virt_addr_valid() was fine until 5.4, broken by commit
14c127c957c1 ("arm64: mm: Flip kernel VA space"). Will addressed the
flip case in 68dd8ef32162 ("arm64: memory: Fix virt_addr_valid() using
__is_lm_address()") but this broke the 

Re: [PATCH v4 1/3] arm64: Improve kernel address detection of __is_lm_address()

2021-01-25 Thread Vincenzo Frascino
Hi Mark,

On 1/25/21 1:02 PM, Mark Rutland wrote:
> Hi Vincenzo,
> 
> On Fri, Jan 22, 2021 at 03:56:40PM +, Vincenzo Frascino wrote:
>> Currently, the __is_lm_address() check just masks out the top 12 bits
>> of the address, but if they are 0, it still yields a true result.
>> This has as a side effect that virt_addr_valid() returns true even for
>> invalid virtual addresses (e.g. 0x0).
>>
>> Improve the detection checking that it's actually a kernel address
>> starting at PAGE_OFFSET.
>>
>> Cc: Catalin Marinas 
>> Cc: Will Deacon 
>> Suggested-by: Catalin Marinas 
>> Reviewed-by: Catalin Marinas 
>> Signed-off-by: Vincenzo Frascino 
> 
> Looking around, it seems that there are some existing uses of
> virt_addr_valid() that expect it to reject addresses outside of the
> TTBR1 range. For example, check_mem_type() in drivers/tee/optee/call.c.
> 
> Given that, I think we need something that's easy to backport to stable.
> 

I agree, I started looking at it this morning and I found cases even in the main
allocators (slub and page_alloc) other than the one you mentioned.

> This patch itself looks fine, but it's not going to backport very far,
> so I suspect we might need to write a preparatory patch that adds an
> explicit range check to virt_addr_valid() which can be trivially
> backported.
> 

I checked the old releases and I agree this is not back-portable as it stands.
I propose therefore to add a preparatory patch with the check below:

#define __is_ttrb1_address(addr)	((u64)(addr) >= PAGE_OFFSET && \
					 (u64)(addr) < PAGE_END)
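
For illustration, the check could then back virt_addr_valid() roughly as
follows (a sketch only; the virt_addr_valid() body shown is an assumption
about the shape such a preparatory patch could take, not a posted patch):

#define virt_addr_valid(addr)	(__is_ttrb1_address(addr) && \
				 pfn_valid(virt_to_pfn(addr)))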

If it works for you I am happy to take care of it and post a new version of my
patches.

Thanks!

> For this patch:
> 
> Acked-by: Mark Rutland 
> 
> Thanks,
> Mark.
> 
>> ---
>>  arch/arm64/include/asm/memory.h | 6 --
>>  1 file changed, 4 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
>> index 18fce223b67b..99d7e1494aaa 100644
>> --- a/arch/arm64/include/asm/memory.h
>> +++ b/arch/arm64/include/asm/memory.h
>> @@ -247,9 +247,11 @@ static inline const void *__tag_set(const void *addr, u8 tag)
>>  
>>  
>>  /*
>> - * The linear kernel range starts at the bottom of the virtual address space.
>> + * Check whether an arbitrary address is within the linear map, which
>> + * lives in the [PAGE_OFFSET, PAGE_END) interval at the bottom of the
>> + * kernel's TTBR1 address range.
>>   */
>> -#define __is_lm_address(addr)   (((u64)(addr) & ~PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))
>> +#define __is_lm_address(addr)   (((u64)(addr) ^ PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))
>>  
>>  #define __lm_to_phys(addr)  (((addr) & ~PAGE_OFFSET) + PHYS_OFFSET)
>>  #define __kimg_to_phys(addr)((addr) - kimage_voffset)
>> -- 
>> 2.30.0
>>

-- 
Regards,
Vincenzo


[PATCH v4 1/3] arm64: Improve kernel address detection of __is_lm_address()

2021-01-22 Thread Vincenzo Frascino
Currently, the __is_lm_address() check just masks out the top 12 bits
of the address, but if they are 0, it still yields a true result.
This has as a side effect that virt_addr_valid() returns true even for
invalid virtual addresses (e.g. 0x0).

Improve the detection checking that it's actually a kernel address
starting at PAGE_OFFSET.

Cc: Catalin Marinas 
Cc: Will Deacon 
Suggested-by: Catalin Marinas 
Reviewed-by: Catalin Marinas 
Signed-off-by: Vincenzo Frascino 
---
 arch/arm64/include/asm/memory.h | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 18fce223b67b..99d7e1494aaa 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -247,9 +247,11 @@ static inline const void *__tag_set(const void *addr, u8 tag)
 
 
 /*
- * The linear kernel range starts at the bottom of the virtual address space.
+ * Check whether an arbitrary address is within the linear map, which
+ * lives in the [PAGE_OFFSET, PAGE_END) interval at the bottom of the
+ * kernel's TTBR1 address range.
  */
-#define __is_lm_address(addr)  (((u64)(addr) & ~PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))
+#define __is_lm_address(addr)  (((u64)(addr) ^ PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))
 
 #define __lm_to_phys(addr) (((addr) & ~PAGE_OFFSET) + PHYS_OFFSET)
 #define __kimg_to_phys(addr)   ((addr) - kimage_voffset)
-- 
2.30.0



Re: [PATCH v3 1/2] arm64: Improve kernel address detection of __is_lm_address()

2021-01-22 Thread Catalin Marinas
On Fri, Jan 22, 2021 at 02:37:47PM +, Vincenzo Frascino wrote:
> Currently, the __is_lm_address() check just masks out the top 12 bits
> of the address, but if they are 0, it still yields a true result.
> This has as a side effect that virt_addr_valid() returns true even for
> invalid virtual addresses (e.g. 0x0).
> 
> Improve the detection checking that it's actually a kernel address
> starting at PAGE_OFFSET.
> 
> Cc: Catalin Marinas 
> Cc: Will Deacon 
> Suggested-by: Catalin Marinas 
> Signed-off-by: Vincenzo Frascino 

Reviewed-by: Catalin Marinas 

The question is whether we leave this for 5.12 or we merge it earlier.

-- 
Catalin


[PATCH v3 1/2] arm64: Improve kernel address detection of __is_lm_address()

2021-01-22 Thread Vincenzo Frascino
Currently, the __is_lm_address() check just masks out the top 12 bits
of the address, but if they are 0, it still yields a true result.
This has as a side effect that virt_addr_valid() returns true even for
invalid virtual addresses (e.g. 0x0).

Improve the detection checking that it's actually a kernel address
starting at PAGE_OFFSET.

Cc: Catalin Marinas 
Cc: Will Deacon 
Suggested-by: Catalin Marinas 
Signed-off-by: Vincenzo Frascino 
---
 arch/arm64/include/asm/memory.h | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 18fce223b67b..99d7e1494aaa 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -247,9 +247,11 @@ static inline const void *__tag_set(const void *addr, u8 tag)
 
 
 /*
- * The linear kernel range starts at the bottom of the virtual address space.
+ * Check whether an arbitrary address is within the linear map, which
+ * lives in the [PAGE_OFFSET, PAGE_END) interval at the bottom of the
+ * kernel's TTBR1 address range.
  */
-#define __is_lm_address(addr)  (((u64)(addr) & ~PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))
+#define __is_lm_address(addr)  (((u64)(addr) ^ PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))
 
 #define __lm_to_phys(addr) (((addr) & ~PAGE_OFFSET) + PHYS_OFFSET)
 #define __kimg_to_phys(addr)   ((addr) - kimage_voffset)
-- 
2.30.0



Re: [PATCH v2 1/2] arm64: Fix kernel address detection of __is_lm_address()

2021-01-21 Thread Vincenzo Frascino


On 1/21/21 4:02 PM, Vincenzo Frascino wrote:
>> I think it'd be worth checking, if we're going to use this in common
>> code.
>>
> Ok, I will run some tests and let you know.
> 

I checked on x86_64 and ppc64 (they both have a KASAN implementation):

I added the following:

printk("%s: %d\n", __func__, virt_addr_valid(0));

in x86_64: sound/last.c
in ppc64: arch/powerpc/kernel/setup-common.c

and in both cases the output is 0 (false), whereas the same check on arm64
yields 1 (true). Therefore I think we should proceed with the change.
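
For reference, the same check can be exercised without patching sound/last.c
or setup-common.c, e.g. with a throwaway test module along these lines (a
hedged illustration, not something posted in this thread; it assumes
virt_addr_valid()/pfn_valid() are usable from module context on the
architecture under test):

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm.h>

static int __init vav_test_init(void)
{
	/* Expect 0 on x86_64/ppc64; 1 on arm64 before the __is_lm_address() fix. */
	pr_info("virt_addr_valid(0) = %d\n", virt_addr_valid(0));
	return 0;
}
module_init(vav_test_init);

static void __exit vav_test_exit(void)
{
}
module_exit(vav_test_exit);

MODULE_LICENSE("GPL");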

-- 
Regards,
Vincenzo


Re: [PATCH v2 1/2] arm64: Fix kernel address detection of __is_lm_address()

2021-01-21 Thread Mark Rutland
On Thu, Jan 21, 2021 at 03:30:51PM +, Vincenzo Frascino wrote:
> On 1/21/21 3:12 PM, Mark Rutland wrote:
> > On Thu, Jan 21, 2021 at 01:19:55PM +, Vincenzo Frascino wrote:
> >> Currently, the __is_lm_address() check just masks out the top 12 bits
> >> of the address, but if they are 0, it still yields a true result.
> >> This has as a side effect that virt_addr_valid() returns true even for
> >> invalid virtual addresses (e.g. 0x0).
> > 
> > When it was added, __is_lm_address() was intended to distinguish valid
> > kernel virtual addresses (i.e. those in the TTBR1 address range), and
> > wasn't intended to do anything for addresses outside of this range. See
> > commit:
> > 
> >   ec6d06efb0bac6cd ("arm64: Add support for CONFIG_DEBUG_VIRTUAL")
> > 
> > ... where it simply tests a bit.
> > 
> > So I believe that it's working as intended (though this is poorly
> > documented), but I think you're saying that usage isn't aligned with
> > that intent. Given that, I'm not sure the fixes tag is right; I think it
> > has never had the semantic you're after.
> >
> I did not do much thinking on the intended semantics. I based my interpretation
> on what you are saying (the usage is not aligned with the intent). Based on what
> you are saying, I will change the patch description removing the "Fix" term.

Thanks! I assume that also means removing the fixes tag.

> > I had thought the same was true for virt_addr_valid(), and that wasn't
> > expected to be called for VAs outside of the kernel VA range. Is it
> > actually safe to call that with NULL on other architectures?
> 
> I am not sure on this, did not do any testing outside of arm64.

I think it'd be worth checking, if we're going to use this in common
code.

> > I wonder if it's worth virt_addr_valid() having an explicit check for
> > the kernel VA range, instead.
> 
> I have no strong opinion either way even if personally I feel that modifying
> __is_lm_address() is more clear. Feel free to propose something.

Sure; I'm happy for it to live within __is_lm_address() if that's
simpler overall, given it doesn't look like it's making that more
complex or expensive.

> >> Fix the detection checking that it's actually a kernel address starting
> >> at PAGE_OFFSET.
> >>
> >> Fixes: f4693c2716b35 ("arm64: mm: extend linear region for 52-bit VA configurations")
> >> Cc: Catalin Marinas 
> >> Cc: Will Deacon 
> >> Suggested-by: Catalin Marinas 
> >> Signed-off-by: Vincenzo Frascino 
> >> ---
> >>  arch/arm64/include/asm/memory.h | 2 +-
> >>  1 file changed, 1 insertion(+), 1 deletion(-)
> >>
> >> diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
> >> index 18fce223b67b..e04ac898ffe4 100644
> >> --- a/arch/arm64/include/asm/memory.h
> >> +++ b/arch/arm64/include/asm/memory.h
> >> @@ -249,7 +249,7 @@ static inline const void *__tag_set(const void *addr, u8 tag)
> >>  /*
> >>   * The linear kernel range starts at the bottom of the virtual address space.
> >>   */
> >> -#define __is_lm_address(addr) (((u64)(addr) & ~PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))
> >> +#define __is_lm_address(addr) (((u64)(addr) ^ PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))
> > 
> > If we're going to make this stronger, can we please expand the comment
> > with the intended semantic? Otherwise we're liable to break this in
> > future.
> 
> Based on your reply on the above matter, if you agree, I am happy to extend 
> the
> comment.

Works for me; how about:

/*
 * Check whether an arbitrary address is within the linear map, which
 * lives in the [PAGE_OFFSET, PAGE_END) interval at the bottom of the
 * kernel's TTBR1 address range.
 */

... with "arbitrary" being the key word.

Thanks,
Mark.


Re: [PATCH v2 1/2] arm64: Fix kernel address detection of __is_lm_address()

2021-01-21 Thread Vincenzo Frascino



On 1/21/21 3:49 PM, Mark Rutland wrote:
> On Thu, Jan 21, 2021 at 03:30:51PM +, Vincenzo Frascino wrote:
>> On 1/21/21 3:12 PM, Mark Rutland wrote:
>>> On Thu, Jan 21, 2021 at 01:19:55PM +, Vincenzo Frascino wrote:
>>>> Currently, the __is_lm_address() check just masks out the top 12 bits
>>>> of the address, but if they are 0, it still yields a true result.
>>>> This has as a side effect that virt_addr_valid() returns true even for
>>>> invalid virtual addresses (e.g. 0x0).
>>>
>>> When it was added, __is_lm_address() was intended to distinguish valid
>>> kernel virtual addresses (i.e. those in the TTBR1 address range), and
>>> wasn't intended to do anything for addresses outside of this range. See
>>> commit:
>>>
>>>   ec6d06efb0bac6cd ("arm64: Add support for CONFIG_DEBUG_VIRTUAL")
>>>
>>> ... where it simply tests a bit.
>>>
>>> So I believe that it's working as intended (though this is poorly
>>> documented), but I think you're saying that usage isn't aligned with
>>> that intent. Given that, I'm not sure the fixes tag is right; I think it
>>> has never had the semantic you're after.
>>>
>> I did not do much thinking on the intended semantics. I based my interpretation
>> on what you are saying (the usage is not aligned with the intent). Based on what
>> you are saying, I will change the patch description removing the "Fix" term.
> 
> Thanks! I assume that also means removing the fixes tag.
>

Obviously ;)

>>> I had thought the same was true for virt_addr_valid(), and that wasn't
>>> expected to be called for VAs outside of the kernel VA range. Is it
>>> actually safe to call that with NULL on other architectures?
>>
>> I am not sure on this, did not do any testing outside of arm64.
> 
> I think it'd be worth checking, if we're going to use this in common
> code.
> 

Ok, I will run some tests and let you know.

>>> I wonder if it's worth virt_addr_valid() having an explicit check for
>>> the kernel VA range, instead.
>>
>> I have no strong opinion either way even if personally I feel that modifying
>> __is_lm_address() is more clear. Feel free to propose something.
> 
> Sure; I'm happy for it to live within __is_lm_address() if that's
> simpler overall, given it doesn't look like it's making that more
> complex or expensive.
> 
>>>> Fix the detection checking that it's actually a kernel address starting
>>>> at PAGE_OFFSET.
>>>>
>>>> Fixes: f4693c2716b35 ("arm64: mm: extend linear region for 52-bit VA configurations")
>>>> Cc: Catalin Marinas 
>>>> Cc: Will Deacon 
>>>> Suggested-by: Catalin Marinas 
>>>> Signed-off-by: Vincenzo Frascino 
>>>> ---
>>>>  arch/arm64/include/asm/memory.h | 2 +-
>>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>>
>>>> diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
>>>> index 18fce223b67b..e04ac898ffe4 100644
>>>> --- a/arch/arm64/include/asm/memory.h
>>>> +++ b/arch/arm64/include/asm/memory.h
>>>> @@ -249,7 +249,7 @@ static inline const void *__tag_set(const void *addr, u8 tag)
>>>>  /*
>>>>   * The linear kernel range starts at the bottom of the virtual address space.
>>>>   */
>>>> -#define __is_lm_address(addr) (((u64)(addr) & ~PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))
>>>> +#define __is_lm_address(addr) (((u64)(addr) ^ PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))
>>>
>>> If we're going to make this stronger, can we please expand the comment
>>> with the intended semantic? Otherwise we're liable to break this in
>>> future.
>>
>> Based on your reply on the above matter, if you agree, I am happy to extend 
>> the
>> comment.
> 
> Works for me; how about:
> 
> /*
>  * Check whether an arbitrary address is within the linear map, which
>  * lives in the [PAGE_OFFSET, PAGE_END) interval at the bottom of the
>  * kernel's TTBR1 address range.
>  */
> 
> ... with "arbitrary" being the key word.
> 

Sounds good to me! I will post the new version after confirming the behavior of
virt_addr_valid() on the other architectures.

> Thanks,
> Mark.
> 

-- 
Regards,
Vincenzo


Re: [PATCH v2 1/2] arm64: Fix kernel address detection of __is_lm_address()

2021-01-21 Thread Vincenzo Frascino



On 1/21/21 3:12 PM, Mark Rutland wrote:
> [adding Ard]
>

Thanks for this, it is related to his patch and I forgot to Cc: him directly.

> On Thu, Jan 21, 2021 at 01:19:55PM +, Vincenzo Frascino wrote:
>> Currently, the __is_lm_address() check just masks out the top 12 bits
>> of the address, but if they are 0, it still yields a true result.
>> This has as a side effect that virt_addr_valid() returns true even for
>> invalid virtual addresses (e.g. 0x0).
> 
> When it was added, __is_lm_address() was intended to distinguish valid
> kernel virtual addresses (i.e. those in the TTBR1 address range), and
> wasn't intended to do anything for addresses outside of this range. See
> commit:
> 
>   ec6d06efb0bac6cd ("arm64: Add support for CONFIG_DEBUG_VIRTUAL")
> 
> ... where it simply tests a bit.
> 
> So I believe that it's working as intended (though this is poorly
> documented), but I think you're saying that usage isn't aligned with
> that intent. Given that, I'm not sure the fixes tag is right; I think it
> has never had the semantic you're after.
>

I did not do much thinking on the intended semantics. I based my interpretation
on what you are saying (the usage is not aligned with the intent). Based on what
you are saying, I will change the patch description removing the "Fix" term.

> I had thought the same was true for virt_addr_valid(), and that wasn't
> expected to be called for VAs outside of the kernel VA range. Is it
> actually safe to call that with NULL on other architectures?
> 

I am not sure on this, did not do any testing outside of arm64.

> I wonder if it's worth virt_addr_valid() having an explicit check for
> the kernel VA range, instead.
> 

I have no strong opinion either way even if personally I feel that modifying
__is_lm_address() is more clear. Feel free to propose something.

>> Fix the detection checking that it's actually a kernel address starting
>> at PAGE_OFFSET.
>>
>> Fixes: f4693c2716b35 ("arm64: mm: extend linear region for 52-bit VA configurations")
>> Cc: Catalin Marinas 
>> Cc: Will Deacon 
>> Suggested-by: Catalin Marinas 
>> Signed-off-by: Vincenzo Frascino 
>> ---
>>  arch/arm64/include/asm/memory.h | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
>> index 18fce223b67b..e04ac898ffe4 100644
>> --- a/arch/arm64/include/asm/memory.h
>> +++ b/arch/arm64/include/asm/memory.h
>> @@ -249,7 +249,7 @@ static inline const void *__tag_set(const void *addr, u8 tag)
>>  /*
>>   * The linear kernel range starts at the bottom of the virtual address space.
>>   */
>> -#define __is_lm_address(addr)   (((u64)(addr) & ~PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))
>> +#define __is_lm_address(addr)   (((u64)(addr) ^ PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))
> 
> If we're going to make this stronger, can we please expand the comment
> with the intended semantic? Otherwise we're liable to break this in
> future.
> 

Based on your reply on the above matter, if you agree, I am happy to extend the
comment.

> Thanks,
> Mark.
> 

-- 
Regards,
Vincenzo


Re: [PATCH v2 1/2] arm64: Fix kernel address detection of __is_lm_address()

2021-01-21 Thread Mark Rutland
[adding Ard]

On Thu, Jan 21, 2021 at 01:19:55PM +, Vincenzo Frascino wrote:
> Currently, the __is_lm_address() check just masks out the top 12 bits
> of the address, but if they are 0, it still yields a true result.
> This has as a side effect that virt_addr_valid() returns true even for
> invalid virtual addresses (e.g. 0x0).

When it was added, __is_lm_address() was intended to distinguish valid
kernel virtual addresses (i.e. those in the TTBR1 address range), and
wasn't intended to do anything for addresses outside of this range. See
commit:

  ec6d06efb0bac6cd ("arm64: Add support for CONFIG_DEBUG_VIRTUAL")

... where it simply tests a bit.

So I believe that it's working as intended (though this is poorly
documented), but I think you're saying that usage isn't aligned with
that intent. Given that, I'm not sure the fixes tag is right; I think it
has never had the semantic you're after.

I had thought the same was true for virt_addr_valid(), and that wasn't
expected to be called for VAs outside of the kernel VA range. Is it
actually safe to call that with NULL on other architectures?

I wonder if it's worth virt_addr_valid() having an explicit check for
the kernel VA range, instead.

> Fix the detection checking that it's actually a kernel address starting
> at PAGE_OFFSET.
> 
> Fixes: f4693c2716b35 ("arm64: mm: extend linear region for 52-bit VA configurations")
> Cc: Catalin Marinas 
> Cc: Will Deacon 
> Suggested-by: Catalin Marinas 
> Signed-off-by: Vincenzo Frascino 
> ---
>  arch/arm64/include/asm/memory.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
> index 18fce223b67b..e04ac898ffe4 100644
> --- a/arch/arm64/include/asm/memory.h
> +++ b/arch/arm64/include/asm/memory.h
> @@ -249,7 +249,7 @@ static inline const void *__tag_set(const void *addr, u8 tag)
>  /*
>   * The linear kernel range starts at the bottom of the virtual address space.
>   */
> -#define __is_lm_address(addr)	(((u64)(addr) & ~PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))
> +#define __is_lm_address(addr)	(((u64)(addr) ^ PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))

If we're going to make this stronger, can we please expand the comment
with the intended semantic? Otherwise we're liable to break this in
future.

Thanks,
Mark.


[PATCH v2 1/2] arm64: Fix kernel address detection of __is_lm_address()

2021-01-21 Thread Vincenzo Frascino
Currently, the __is_lm_address() check just masks out the top 12 bits
of the address, but if they are 0, it still yields a true result.
This has as a side effect that virt_addr_valid() returns true even for
invalid virtual addresses (e.g. 0x0).

Fix the detection checking that it's actually a kernel address starting
at PAGE_OFFSET.

Fixes: f4693c2716b35 ("arm64: mm: extend linear region for 52-bit VA configurations")
Cc: Catalin Marinas 
Cc: Will Deacon 
Suggested-by: Catalin Marinas 
Signed-off-by: Vincenzo Frascino 
---
 arch/arm64/include/asm/memory.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 18fce223b67b..e04ac898ffe4 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -249,7 +249,7 @@ static inline const void *__tag_set(const void *addr, u8 
tag)
 /*
  * The linear kernel range starts at the bottom of the virtual address space.
  */
-#define __is_lm_address(addr)  (((u64)(addr) & ~PAGE_OFFSET) < (PAGE_END - 
PAGE_OFFSET))
+#define __is_lm_address(addr)  (((u64)(addr) ^ PAGE_OFFSET) < (PAGE_END - 
PAGE_OFFSET))
 
 #define __lm_to_phys(addr) (((addr) & ~PAGE_OFFSET) + PHYS_OFFSET)
 #define __kimg_to_phys(addr)   ((addr) - kimage_voffset)
-- 
2.30.0



[PATCH] perf: Turn kernel address filters into linear address filters

2020-09-09 Thread Alexander Shishkin
One thing that the address filters can't do at the moment is cover
anonymous mappings. Apparently, there is code out there that generates
executable code in anonymous mappings and executes it in place. And at
the moment we only allow file-based address filters for userspace.

The easiest way to do this is to repurpose the kernel filters and allow
those to be used on userspace addresses. The Intel PT driver at the moment
does enforce that non-file-based filters are only used with kernel
addresses, so that needs to be corrected. The Coresight ETM driver doesn't
enforce that, so no changes needed there.

No tooling changes required, either.
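
For illustration, with this change a raw "<start>/<size>" address filter
becomes usable for a user-space region as well, e.g. to trace a 4KiB
anonymous/JIT region with Intel PT (the address, size and workload name
below are made up):

  perf record -e intel_pt//u --filter 'filter 0x7f0000001000/0x1000' -- ./jit-workload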

Signed-off-by: Alexander Shishkin 
---
 arch/x86/events/intel/pt.c | 13 -
 kernel/events/core.c   | 22 ++
 2 files changed, 10 insertions(+), 25 deletions(-)

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index e94af4a54d0d..2d981dbaac1b 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -1347,11 +1347,6 @@ static void pt_addr_filters_fini(struct perf_event 
*event)
event->hw.addr_filters = NULL;
 }
 
-static inline bool valid_kernel_ip(unsigned long ip)
-{
-   return virt_addr_valid(ip) && kernel_ip(ip);
-}
-
 static int pt_event_addr_filters_validate(struct list_head *filters)
 {
struct perf_addr_filter *filter;
@@ -1366,14 +1361,6 @@ static int pt_event_addr_filters_validate(struct 
list_head *filters)
filter->action == PERF_ADDR_FILTER_ACTION_START)
return -EOPNOTSUPP;
 
-   if (!filter->path.dentry) {
-   if (!valid_kernel_ip(filter->offset))
-   return -EINVAL;
-
-   if (!valid_kernel_ip(filter->offset + filter->size))
-   return -EINVAL;
-   }
-
if (++range > 
intel_pt_validate_hw_cap(PT_CAP_num_address_ranges))
return -EOPNOTSUPP;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 57efe3b21e29..8579654d3fc7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9989,9 +9989,9 @@ enum {
IF_ACT_START,
IF_ACT_STOP,
IF_SRC_FILE,
-   IF_SRC_KERNEL,
+   IF_SRC_LINEAR,
IF_SRC_FILEADDR,
-   IF_SRC_KERNELADDR,
+   IF_SRC_LINEARADDR,
 };
 
 enum {
@@ -10005,9 +10005,9 @@ static const match_table_t if_tokens = {
{ IF_ACT_START, "start" },
{ IF_ACT_STOP,  "stop" },
{ IF_SRC_FILE,  "%u/%u@%s" },
-   { IF_SRC_KERNEL,"%u/%u" },
+   { IF_SRC_LINEAR,"%u/%u" },
{ IF_SRC_FILEADDR,  "%u@%s" },
-   { IF_SRC_KERNELADDR,"%u" },
+   { IF_SRC_LINEARADDR,"%u" },
{ IF_ACT_NONE,  NULL },
 };
 
@@ -10022,7 +10022,7 @@ perf_event_parse_addr_filter(struct perf_event *event, 
char *fstr,
char *start, *orig, *filename = NULL;
substring_t args[MAX_OPT_ARGS];
int state = IF_STATE_ACTION, token;
-   unsigned int kernel = 0;
+   unsigned int linear = 0;
int ret = -EINVAL;
 
orig = fstr = kstrdup(fstr, GFP_KERNEL);
@@ -10059,9 +10059,9 @@ perf_event_parse_addr_filter(struct perf_event *event, 
char *fstr,
state = IF_STATE_SOURCE;
break;
 
-   case IF_SRC_KERNELADDR:
-   case IF_SRC_KERNEL:
-   kernel = 1;
+   case IF_SRC_LINEARADDR:
+   case IF_SRC_LINEAR:
+   linear = 1;
/* fall through */
 
case IF_SRC_FILEADDR:
@@ -10074,7 +10074,7 @@ perf_event_parse_addr_filter(struct perf_event *event, 
char *fstr,
if (ret)
goto fail;
 
-   if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
+   if (token == IF_SRC_LINEAR || token == IF_SRC_FILE) {
*args[1].to = 0;
ret = kstrtoul(args[1].from, 0, &filter->size);
if (ret)
@@ -10105,8 +10105,6 @@ perf_event_parse_addr_filter(struct perf_event *event, 
char *fstr,
 */
if (state == IF_STATE_END) {
ret = -EINVAL;
-   if (kernel && event->attr.exclude_kernel)
-   goto fail;
 
/*
 * ACTION "filter" must have a non-zero length region
@@ -10116,7 +10114,7 @@ perf_event_parse_addr_filter(struct perf_event *event, 
char *fstr,
!filter->size)
goto fail;
 
-   if (!kernel) {
+   if (!linear) {
if (!filename)
goto fail;
 
-- 
2.28.0



Re: Kernel panic : Unable to handle kernel paging request at virtual address - dead address between user and kernel address ranges

2020-08-30 Thread Viresh Kumar
On 28-08-20, 14:23, Ulf Hansson wrote:
> Anders, Naresh - thanks for testing and reporting. I am dropping the
> patch from my tree.
> 
> Viresh, I suggest to keep Anders/Naresh in the cc, for the next
> version. Then I can wait for their tested-by tag before I apply again.

Sorry for the trouble, I thought you would wait a bit before
applying the patch to see the test results from Naresh, but you were fast
enough as well :)

-- 
viresh


Re: Kernel panic : Unable to handle kernel paging request at virtual address - dead address between user and kernel address ranges

2020-08-28 Thread Ulf Hansson
On Fri, 28 Aug 2020 at 12:29, Anders Roxell  wrote:
>
> On Fri, 28 Aug 2020 at 11:35, Ulf Hansson  wrote:
> >
> > On Fri, 28 Aug 2020 at 11:22, Naresh Kamboju  
> > wrote:
> > >
> > > On Thu, 27 Aug 2020 at 17:06, Naresh Kamboju  
> > > wrote:
> > > >
> > > > On Thu, 27 Aug 2020 at 15:42, Viresh Kumar  
> > > > wrote:
> > > > >
> > > > > On 27-08-20, 11:48, Arnd Bergmann wrote:
> > > > > > > > [3.680477]  dev_pm_opp_put_clkname+0x30/0x58
> > > > > > > > [3.683431]  sdhci_msm_probe+0x284/0x9a0
> > > > > >
> > > > > > dev_pm_opp_put_clkname() is part of the error handling in the
> > > > > > probe function, so I would deduct there are two problems:
> > > > > >
> > > > > > - something failed during the probe and the driver is trying
> > > > > >   to unwind
> > > > > > - the error handling it self is buggy and tries to undo something
> > > > > >   again that has already been undone.
> > > > >
> > > > > Right.
> > > > >
> > > > > > This points to Viresh's
> > > > > > d05a7238fe1c mmc: sdhci-msm: Unconditionally call 
> > > > > > dev_pm_opp_of_remove_table()
> > > > >
> > > > > I completely forgot that Ulf already pushed this patch and I was
> > > > > wondering on which of the OPP core changes I wrote have done this :(
> > > > >
> > > > > > Most likely this is not the entire problem but it uncovered a 
> > > > > > preexisting
> > > > > > bug.
> > > > >
> > > > > I think this is.
> > > > >
> > > > > Naresh: Can you please test with this diff ?
> > > >
> > > > I have applied your patch and tested but still see the reported problem.
> > >
> > > The git bisect shows that the first bad commit is,
> > > d05a7238fe1c mmc: sdhci-msm: Unconditionally call 
> > > dev_pm_opp_of_remove_table()
> > >
> > > Reported-by: Naresh Kamboju 
> > > Reported-by: Anders Roxell 
> >
> > I am not sure what version of the patch you tested. However, I have
> > dropped Viresh's v1 and replaced it with v2 [1]. It's available for
> > testing at:
> >
> > https://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc.git next
> >
> > Can you please check if it still causes problems, then I will drop it, 
> > again.
>
> I tried to run with a kernel from your tree and I could see the same
> kernel panic on db410c [1].

Anders, Naresh - thanks for testing and reporting. I am dropping the
patch from my tree.

Viresh, I suggest to keep Anders/Naresh in the cc, for the next
version. Then I can wait for their tested-by tag before I apply again.

Kind regards
Uffe


Re: Kernel panic : Unable to handle kernel paging request at virtual address - dead address between user and kernel address ranges

2020-08-28 Thread Anders Roxell
On Fri, 28 Aug 2020 at 11:35, Ulf Hansson  wrote:
>
> On Fri, 28 Aug 2020 at 11:22, Naresh Kamboju  
> wrote:
> >
> > On Thu, 27 Aug 2020 at 17:06, Naresh Kamboju  
> > wrote:
> > >
> > > On Thu, 27 Aug 2020 at 15:42, Viresh Kumar  
> > > wrote:
> > > >
> > > > On 27-08-20, 11:48, Arnd Bergmann wrote:
> > > > > > > [3.680477]  dev_pm_opp_put_clkname+0x30/0x58
> > > > > > > [3.683431]  sdhci_msm_probe+0x284/0x9a0
> > > > >
> > > > > dev_pm_opp_put_clkname() is part of the error handling in the
> > > > > probe function, so I would deduct there are two problems:
> > > > >
> > > > > - something failed during the probe and the driver is trying
> > > > >   to unwind
> > > > > - the error handling it self is buggy and tries to undo something
> > > > >   again that has already been undone.
> > > >
> > > > Right.
> > > >
> > > > > This points to Viresh's
> > > > > d05a7238fe1c mmc: sdhci-msm: Unconditionally call 
> > > > > dev_pm_opp_of_remove_table()
> > > >
> > > > I completely forgot that Ulf already pushed this patch and I was
> > > > wondering on which of the OPP core changes I wrote have done this :(
> > > >
> > > > > Most likely this is not the entire problem but it uncovered a 
> > > > > preexisting
> > > > > bug.
> > > >
> > > > I think this is.
> > > >
> > > > Naresh: Can you please test with this diff ?
> > >
> > > I have applied your patch and tested but still see the reported problem.
> >
> > The git bisect shows that the first bad commit is,
> > d05a7238fe1c mmc: sdhci-msm: Unconditionally call 
> > dev_pm_opp_of_remove_table()
> >
> > Reported-by: Naresh Kamboju 
> > Reported-by: Anders Roxell 
>
> I am not sure what version of the patch you tested. However, I have
> dropped Viresh's v1 and replaced it with v2 [1]. It's available for
> testing at:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc.git next
>
> Can you please check if it still causes problems, then I will drop it, again.

I tried to run with a kernel from your tree and I could see the same
kernel panic on db410c [1].

Cheers,
Anders
[1] https://lkft.validation.linaro.org/scheduler/job/1717770#L1912


Re: Kernel panic : Unable to handle kernel paging request at virtual address - dead address between user and kernel address ranges

2020-08-28 Thread Naresh Kamboju
On Fri, 28 Aug 2020 at 15:05, Ulf Hansson  wrote:
>
> On Fri, 28 Aug 2020 at 11:22, Naresh Kamboju  
> wrote:
> >
> > On Thu, 27 Aug 2020 at 17:06, Naresh Kamboju  
> > wrote:
> > >
> > > On Thu, 27 Aug 2020 at 15:42, Viresh Kumar  
> > > wrote:
> > > >
> > > > On 27-08-20, 11:48, Arnd Bergmann wrote:
> > > > > > > [3.680477]  dev_pm_opp_put_clkname+0x30/0x58
> > > > > > > [3.683431]  sdhci_msm_probe+0x284/0x9a0
> > > > >
> > > > > dev_pm_opp_put_clkname() is part of the error handling in the
> > > > > probe function, so I would deduct there are two problems:
> > > > >
> > > > > - something failed during the probe and the driver is trying
> > > > >   to unwind
> > > > > - the error handling it self is buggy and tries to undo something
> > > > >   again that has already been undone.
> > > >
> > > > Right.
> > > >
> > > > > This points to Viresh's
> > > > > d05a7238fe1c mmc: sdhci-msm: Unconditionally call 
> > > > > dev_pm_opp_of_remove_table()
> > > >
> > > > I completely forgot that Ulf already pushed this patch and I was
> > > > wondering on which of the OPP core changes I wrote have done this :(
> > > >
> > > > > Most likely this is not the entire problem but it uncovered a 
> > > > > preexisting
> > > > > bug.
> > > >
> > > > I think this is.
> > > >
> > > > Naresh: Can you please test with this diff ?
> > >
> > > I have applied your patch and tested but still see the reported problem.
> >
> > The git bisect shows that the first bad commit is,
> > d05a7238fe1c mmc: sdhci-msm: Unconditionally call 
> > dev_pm_opp_of_remove_table()
> >
> > Reported-by: Naresh Kamboju 
> > Reported-by: Anders Roxell 
>
> I am not sure what version of the patch you tested.

I have applied the v2 patch series on top of linux next-20200824
and tested again; the reported kernel panic is still there on db410c [1].

https://lkft.validation.linaro.org/scheduler/job/1717611#L1874

- Naresh


Re: Kernel panic : Unable to handle kernel paging request at virtual address - dead address between user and kernel address ranges

2020-08-28 Thread Ulf Hansson
On Fri, 28 Aug 2020 at 11:22, Naresh Kamboju  wrote:
>
> On Thu, 27 Aug 2020 at 17:06, Naresh Kamboju  
> wrote:
> >
> > On Thu, 27 Aug 2020 at 15:42, Viresh Kumar  wrote:
> > >
> > > On 27-08-20, 11:48, Arnd Bergmann wrote:
> > > > > > [3.680477]  dev_pm_opp_put_clkname+0x30/0x58
> > > > > > [3.683431]  sdhci_msm_probe+0x284/0x9a0
> > > >
> > > > dev_pm_opp_put_clkname() is part of the error handling in the
> > > > probe function, so I would deduct there are two problems:
> > > >
> > > > - something failed during the probe and the driver is trying
> > > >   to unwind
> > > > - the error handling it self is buggy and tries to undo something
> > > >   again that has already been undone.
> > >
> > > Right.
> > >
> > > > This points to Viresh's
> > > > d05a7238fe1c mmc: sdhci-msm: Unconditionally call 
> > > > dev_pm_opp_of_remove_table()
> > >
> > > I completely forgot that Ulf already pushed this patch and I was
> > > wondering on which of the OPP core changes I wrote have done this :(
> > >
> > > > Most likely this is not the entire problem but it uncovered a 
> > > > preexisting
> > > > bug.
> > >
> > > I think this is.
> > >
> > > Naresh: Can you please test with this diff ?
> >
> > I have applied your patch and tested but still see the reported problem.
>
> The git bisect shows that the first bad commit is,
> d05a7238fe1c mmc: sdhci-msm: Unconditionally call dev_pm_opp_of_remove_table()
>
> Reported-by: Naresh Kamboju 
> Reported-by: Anders Roxell 

I am not sure what version of the patch you tested. However, I have
dropped Viresh's v1 and replaced it with v2 [1]. It's available for
testing at:

https://git.kernel.org/pub/scm/linux/kernel/git/ulfh/mmc.git next

Can you please check if it still causes problems, then I will drop it, again.

Kind regards
Uffe

[1] https://lkml.org/lkml/2020/8/28/43


Re: Kernel panic : Unable to handle kernel paging request at virtual address - dead address between user and kernel address ranges

2020-08-28 Thread Naresh Kamboju
On Thu, 27 Aug 2020 at 17:06, Naresh Kamboju  wrote:
>
> On Thu, 27 Aug 2020 at 15:42, Viresh Kumar  wrote:
> >
> > On 27-08-20, 11:48, Arnd Bergmann wrote:
> > > > > [3.680477]  dev_pm_opp_put_clkname+0x30/0x58
> > > > > [3.683431]  sdhci_msm_probe+0x284/0x9a0
> > >
> > > dev_pm_opp_put_clkname() is part of the error handling in the
> > > probe function, so I would deduct there are two problems:
> > >
> > > - something failed during the probe and the driver is trying
> > >   to unwind
> > > - the error handling it self is buggy and tries to undo something
> > >   again that has already been undone.
> >
> > Right.
> >
> > > This points to Viresh's
> > > d05a7238fe1c mmc: sdhci-msm: Unconditionally call 
> > > dev_pm_opp_of_remove_table()
> >
> > I completely forgot that Ulf already pushed this patch and I was
> > wondering on which of the OPP core changes I wrote have done this :(
> >
> > > Most likely this is not the entire problem but it uncovered a preexisting
> > > bug.
> >
> > I think this is.
> >
> > Naresh: Can you please test with this diff ?
>
> I have applied your patch and tested but still see the reported problem.

The git bisect shows that the first bad commit is,
d05a7238fe1c mmc: sdhci-msm: Unconditionally call dev_pm_opp_of_remove_table()

Reported-by: Naresh Kamboju 
Reported-by: Anders Roxell 

>
> - Naresh


Re: Kernel panic : Unable to handle kernel paging request at virtual address - dead address between user and kernel address ranges

2020-08-27 Thread Naresh Kamboju
On Thu, 27 Aug 2020 at 15:42, Viresh Kumar  wrote:
>
> On 27-08-20, 11:48, Arnd Bergmann wrote:
> > > > [3.680477]  dev_pm_opp_put_clkname+0x30/0x58
> > > > [3.683431]  sdhci_msm_probe+0x284/0x9a0
> >
> > dev_pm_opp_put_clkname() is part of the error handling in the
> > probe function, so I would deduct there are two problems:
> >
> > - something failed during the probe and the driver is trying
> >   to unwind
> > - the error handling it self is buggy and tries to undo something
> >   again that has already been undone.
>
> Right.
>
> > This points to Viresh's
> > d05a7238fe1c mmc: sdhci-msm: Unconditionally call 
> > dev_pm_opp_of_remove_table()
>
> I completely forgot that Ulf already pushed this patch and I was
> wondering on which of the OPP core changes I wrote have done this :(
>
> > Most likely this is not the entire problem but it uncovered a preexisting
> > bug.
>
> I think this is.
>
> Naresh: Can you please test with this diff ?

I have applied your patch and tested but still see the reported problem.
Link to test job,
https://lkft.validation.linaro.org/scheduler/job/1715677#L1886

- Naresh


Re: Kernel panic : Unable to handle kernel paging request at virtual address - dead address between user and kernel address ranges

2020-08-27 Thread Viresh Kumar
On 27-08-20, 11:48, Arnd Bergmann wrote:
> > > [3.680477]  dev_pm_opp_put_clkname+0x30/0x58
> > > [3.683431]  sdhci_msm_probe+0x284/0x9a0
> 
> dev_pm_opp_put_clkname() is part of the error handling in the
> probe function, so I would deduct there are two problems:
> 
> - something failed during the probe and the driver is trying
>   to unwind
> - the error handling it self is buggy and tries to undo something
>   again that has already been undone.

Right.

> This points to Viresh's
> d05a7238fe1c mmc: sdhci-msm: Unconditionally call dev_pm_opp_of_remove_table()

I completely forgot that Ulf had already pushed this patch, and I was
wondering which of the OPP core changes I wrote could have done this :(

> Most likely this is not the entire problem but it uncovered a preexisting
> bug.

I think this is.

Naresh: Can you please test with this diff ?

diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c
index b7e47107a31a..401839a97b57 100644
--- a/drivers/mmc/host/sdhci-msm.c
+++ b/drivers/mmc/host/sdhci-msm.c
@@ -2286,7 +2286,7 @@ static int sdhci_msm_probe(struct platform_device *pdev)
ret = dev_pm_opp_of_add_table(&pdev->dev);
if (ret != -ENODEV) {
dev_err(&pdev->dev, "Invalid OPP table in Device tree\n");
-   goto opp_cleanup;
+   goto opp_put_clkname;
}
 
/* Vote for maximum clock rate for maximum performance */
@@ -2451,6 +2451,7 @@ static int sdhci_msm_probe(struct platform_device *pdev)
   msm_host->bulk_clks);
 opp_cleanup:
dev_pm_opp_of_remove_table(&pdev->dev);
+opp_put_clkname:
dev_pm_opp_put_clkname(msm_host->opp_table);
 bus_clk_disable:
if (!IS_ERR(msm_host->bus_clk))

-- 
viresh


Re: Kernel panic : Unable to handle kernel paging request at virtual address - dead address between user and kernel address ranges

2020-08-27 Thread Arnd Bergmann
On Thu, Aug 27, 2020 at 11:08 AM Viresh Kumar  wrote:
>
> +Rajendra
>
> On 27-08-20, 14:02, Naresh Kamboju wrote:
> > arm64 dragonboard db410c boot failed while running linux next 20200827 
> > kernel.
> >
> > metadata:
> >   git branch: master
> >   git repo: 
> > https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
> >   git commit: 88abac0b753dfdd85362a26d2da8277cb1e0842b
> >   git describe: next-20200827
> >   make_kernelversion: 5.9.0-rc2
> >   kernel-config:
> > https://builds.tuxbuild.com/vThV35pOF_GMlWdiTs3Bdw/kernel.config
> >
> > Boot log,
> >
> > [0.00] Booting Linux on physical CPU 0x00 [0x410fd030]
> > [0.00] Linux version 5.9.0-rc2-next-20200827
> > (TuxBuild@12963d21faa5) (aarch64-linux-gnu-gcc (Debian 9.3.0-8) 9.3.0,
> > GNU ld (GNU Binutils for Debian) 2.34) #1 SMP PREEMPT Thu Aug 27
> > 05:19:00 UTC 2020
> > [0.00] Machine model: Qualcomm Technologies, Inc. APQ 8016 SBC
> > [0.00] efi: UEFI not found.
> > [0.00] [Firmware Bug]: Kernel image misaligned at boot, please
> > fix your bootloader!
> > 
> > [3.451425] i2c_qup 78ba000.i2c: using default clock-frequency 10
> > [3.451491] i2c_qup 78ba000.i2c:
> > [3.451491]  tx channel not available
> > [3.493455] sdhci: Secure Digital Host Controller Interface driver
> > [3.493508] sdhci: Copyright(c) Pierre Ossman
> > [3.500902] Synopsys Designware Multimedia Card Interface Driver
> > [3.507441] sdhci-pltfm: SDHCI platform and OF driver helper
> > [3.514308] Unable to handle kernel paging request at virtual
> > address dead0108

This is where the address comes from:

#define POISON_POINTER_DELTA _AC(CONFIG_ILLEGAL_POINTER_VALUE, UL)
#define LIST_POISON1  ((void *) 0x100 + POISON_POINTER_DELTA)

static inline void hlist_del(struct hlist_node *n)
{
__hlist_del(n);
n->next = LIST_POISON1;
n->pprev = LIST_POISON2;
}
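
__hlist_del() itself (reproduced here from memory, so check
include/linux/list.h) dereferences the node's pointers:

static inline void __hlist_del(struct hlist_node *n)
{
	struct hlist_node *next = n->next;	/* LIST_POISON1 if already deleted */
	struct hlist_node **pprev = n->pprev;	/* LIST_POISON2 if already deleted */

	WRITE_ONCE(*pprev, next);
	if (next)
		WRITE_ONCE(next->pprev, pprev);
}

With arm64's default CONFIG_ILLEGAL_POINTER_VALUE of 0xdead000000000000,
LIST_POISON1 is 0xdead000000000100, so dereferencing a field through an
already-poisoned pointer faults a few bytes above that - consistent with
the dead... faulting address quoted above and with x0 in the register dump
quoted further below.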

> > [3.514695] Mem abort info:
> > [3.522421]   ESR = 0x9644
> > [3.525096]   EC = 0x25: DABT (current EL), IL = 32 bits
> > [3.528236]   SET = 0, FnV = 0
> > [3.533703]   EA = 0, S1PTW = 0
> > [3.536561] Data abort info:
> > [3.539601]   ISV = 0, ISS = 0x0044
> > [3.542727]   CM = 0, WnR = 1
> > [3.546287] [dead0108] address between user and kernel address 
> > ranges
> > [3.549414] Internal error: Oops: 9644 [#1] PREEMPT SMP
> > [3.556520] Modules linked in:
> > [3.561901] CPU: 0 PID: 1 Comm: swapper/0 Not tainted
> > 5.9.0-rc2-next-20200827 #1
> > [3.565034] Hardware name: Qualcomm Technologies, Inc. APQ 8016 SBC (DT)
> > [3.572584] pstate: 6005 (nZCv daif -PAN -UAO BTYPE=--)
> > [3.579271] pc : __clk_put+0x40/0x140
> > [3.584556] lr : __clk_put+0x2c/0x140

Fairly sure this is from the hlist_del(), meaning we try to remove the
same list object a second time, after it was already removed.

> > [3.588373] sp : 80001002bb00
> > [3.592016] x29: 80001002bb00 x28: 002e
> > [3.595320] x27: 09f7ba68 x26: 80001146d878
> > [3.600703] x25: 3fcfd8f8 x24: 3d0bc410
> > [3.605999] x23: 80001146d0e0 x22: 09f7ba40
> > [3.611293] x21: 3d0bc400 x20: 09f7b580
> > [3.616588] x19: 3bccc780 x18: 07824000
> > [3.621883] x17: 09f7ba00 x16: 09f7b5d0
> > [3.627177] x15: 800011966cf8 x14: 
> > [3.632472] x13: 800012917000 x12: 800012917000
> > [3.637769] x11: 0020 x10: 0101010101010101
> > [3.643063] x9 : 8000107a984c x8 : 7f7f7f7f7f7f7f7f
> > [3.648358] x7 : 09fd8000 x6 : 80001237a000
> > [3.653653] x5 :  x4 : 09fd8000
> > [3.658949] x3 : 8000124e6768 x2 : 09fd8000
> > [3.664243] x1 : 3bccca80 x0 : dead0100
> > [3.669539] Call trace:
> > [3.674830]  __clk_put+0x40/0x140
> > [3.677003]  clk_put+0x18/0x28
> > [3.680477]  dev_pm_opp_put_clkname+0x30/0x58
> > [3.683431]  sdhci_msm_probe+0x284/0x9a0

dev_pm_opp_put_clkname() is part of the error handling in the
probe function, so I would deduce there are two problems:

- something failed during the probe and the driver is trying
  to unwind
- the error handling itself is buggy and tries to undo something
  again that has already been undone.

> > [3.687857]  platform_drv_probe+0x5c/0xb0
> > [3.691847]  really_probe+0xf0/0x4d8
> > 

Re: Kernel panic : Unable to handle kernel paging request at virtual address - dead address between user and kernel address ranges

2020-08-27 Thread Naresh Kamboju
On Thu, 27 Aug 2020 at 14:02, Naresh Kamboju  wrote:
>
> arm64 dragonboard db410c boot failed while running linux next 20200827 kernel.
>
> metadata:
>   git branch: master
>   git repo: 
> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
>   git commit: 88abac0b753dfdd85362a26d2da8277cb1e0842b
>   git describe: next-20200827
>   make_kernelversion: 5.9.0-rc2
>   kernel-config:
> https://builds.tuxbuild.com/vThV35pOF_GMlWdiTs3Bdw/kernel.config

The reported issue started with the linux-next tag next-20200825.

BAD:  next-20200825
GOOD:  next-20200824

We are working on a git bisect and boot testing on db410c and will get back to you.

>
> Boot log,
>
> [0.00] Booting Linux on physical CPU 0x00 [0x410fd030]
> [0.00] Linux version 5.9.0-rc2-next-20200827
> (TuxBuild@12963d21faa5) (aarch64-linux-gnu-gcc (Debian 9.3.0-8) 9.3.0,
> GNU ld (GNU Binutils for Debian) 2.34) #1 SMP PREEMPT Thu Aug 27
> 05:19:00 UTC 2020
> [0.00] Machine model: Qualcomm Technologies, Inc. APQ 8016 SBC
> [0.00] efi: UEFI not found.
> [0.00] [Firmware Bug]: Kernel image misaligned at boot, please
> fix your bootloader!
> 
> [3.451425] i2c_qup 78ba000.i2c: using default clock-frequency 10
> [3.451491] i2c_qup 78ba000.i2c:
> [3.451491]  tx channel not available
> [3.493455] sdhci: Secure Digital Host Controller Interface driver
> [3.493508] sdhci: Copyright(c) Pierre Ossman
> [3.500902] Synopsys Designware Multimedia Card Interface Driver
> [3.507441] sdhci-pltfm: SDHCI platform and OF driver helper
> [3.514308] Unable to handle kernel paging request at virtual
> address dead0108
> [3.514695] Mem abort info:
> [3.522421]   ESR = 0x9644
> [3.525096]   EC = 0x25: DABT (current EL), IL = 32 bits
> [3.528236]   SET = 0, FnV = 0
> [3.533703]   EA = 0, S1PTW = 0
> [3.536561] Data abort info:
> [3.539601]   ISV = 0, ISS = 0x0044
> [3.542727]   CM = 0, WnR = 1
> [3.546287] [dead0108] address between user and kernel address 
> ranges
> [3.549414] Internal error: Oops: 9644 [#1] PREEMPT SMP
> [3.556520] Modules linked in:
> [3.561901] CPU: 0 PID: 1 Comm: swapper/0 Not tainted
> 5.9.0-rc2-next-20200827 #1
> [3.565034] Hardware name: Qualcomm Technologies, Inc. APQ 8016 SBC (DT)
> [3.572584] pstate: 6005 (nZCv daif -PAN -UAO BTYPE=--)
> [3.579271] pc : __clk_put+0x40/0x140
> [3.584556] lr : __clk_put+0x2c/0x140
> [3.588373] sp : 80001002bb00
> [3.592016] x29: 80001002bb00 x28: 002e
> [3.595320] x27: 09f7ba68 x26: 80001146d878
> [3.600703] x25: 3fcfd8f8 x24: 3d0bc410
> [3.605999] x23: 80001146d0e0 x22: 09f7ba40
> [3.611293] x21: 3d0bc400 x20: 09f7b580
> [3.616588] x19: 3bccc780 x18: 07824000
> [3.621883] x17: 09f7ba00 x16: 09f7b5d0
> [3.627177] x15: 800011966cf8 x14: 
> [3.632472] x13: 800012917000 x12: 800012917000
> [3.637769] x11: 0020 x10: 0101010101010101
> [3.643063] x9 : 8000107a984c x8 : 7f7f7f7f7f7f7f7f
> [3.648358] x7 : 09fd8000 x6 : 80001237a000
> [3.653653] x5 :  x4 : 09fd8000
> [3.658949] x3 : 8000124e6768 x2 : 09fd8000
> [3.664243] x1 : 3bccca80 x0 : dead0100
> [3.669539] Call trace:
> [3.674830]  __clk_put+0x40/0x140
> [3.677003]  clk_put+0x18/0x28
> [3.680477]  dev_pm_opp_put_clkname+0x30/0x58
> [3.683431]  sdhci_msm_probe+0x284/0x9a0
> [3.687857]  platform_drv_probe+0x5c/0xb0
> [3.691847]  really_probe+0xf0/0x4d8
> [3.695753]  driver_probe_device+0xfc/0x168
> [3.699399]  device_driver_attach+0x7c/0x88
> [3.703306]  __driver_attach+0xac/0x178
> [3.707472]  bus_for_each_dev+0x78/0xc8
> [3.711291]  driver_attach+0x2c/0x38
> [3.715110]  bus_add_driver+0x14c/0x230
> [3.718929]  driver_register+0x6c/0x128
> [3.722489]  __platform_driver_register+0x50/0x60
> [3.726312]  sdhci_msm_driver_init+0x24/0x30
> [3.731173]  do_one_initcall+0x4c/0x2c0
> [3.735511]  kernel_init_freeable+0x21c/0x284
> [3.739072]  kernel_init+0x1c/0x120
> [3.743582]  ret_from_fork+0x10/0x30
> [3.746885] Code: 35000720 a9438660 f920 b440 (f9000401)
> [3.750720] ---[ end trace a8d4100497387a2e ]---
> [3.756736] Kernel panic - not syncing: Attempted to kill init!
> exitcode=0x000b
> [3.761392] SMP: stopping secondary CPUs
> [3.768877] Kernel Offset: 0x8 from 0x80001000
> [3.772924] PHYS_OFFSET: 0x80

Re: Kernel panic : Unable to handle kernel paging request at virtual address - dead address between user and kernel address ranges

2020-08-27 Thread Viresh Kumar
+Rajendra

On 27-08-20, 14:02, Naresh Kamboju wrote:
> arm64 dragonboard db410c boot failed while running linux next 20200827 kernel.
> 
> metadata:
>   git branch: master
>   git repo: 
> https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
>   git commit: 88abac0b753dfdd85362a26d2da8277cb1e0842b
>   git describe: next-20200827
>   make_kernelversion: 5.9.0-rc2
>   kernel-config:
> https://builds.tuxbuild.com/vThV35pOF_GMlWdiTs3Bdw/kernel.config
> 
> Boot log,
> 
> [0.00] Booting Linux on physical CPU 0x00 [0x410fd030]
> [0.00] Linux version 5.9.0-rc2-next-20200827
> (TuxBuild@12963d21faa5) (aarch64-linux-gnu-gcc (Debian 9.3.0-8) 9.3.0,
> GNU ld (GNU Binutils for Debian) 2.34) #1 SMP PREEMPT Thu Aug 27
> 05:19:00 UTC 2020
> [0.00] Machine model: Qualcomm Technologies, Inc. APQ 8016 SBC
> [0.00] efi: UEFI not found.
> [0.00] [Firmware Bug]: Kernel image misaligned at boot, please
> fix your bootloader!
> 
> [3.451425] i2c_qup 78ba000.i2c: using default clock-frequency 10
> [3.451491] i2c_qup 78ba000.i2c:
> [3.451491]  tx channel not available
> [3.493455] sdhci: Secure Digital Host Controller Interface driver
> [3.493508] sdhci: Copyright(c) Pierre Ossman
> [3.500902] Synopsys Designware Multimedia Card Interface Driver
> [3.507441] sdhci-pltfm: SDHCI platform and OF driver helper
> [3.514308] Unable to handle kernel paging request at virtual
> address dead0108
> [3.514695] Mem abort info:
> [3.522421]   ESR = 0x9644
> [3.525096]   EC = 0x25: DABT (current EL), IL = 32 bits
> [3.528236]   SET = 0, FnV = 0
> [3.533703]   EA = 0, S1PTW = 0
> [3.536561] Data abort info:
> [3.539601]   ISV = 0, ISS = 0x0044
> [3.542727]   CM = 0, WnR = 1
> [3.546287] [dead0108] address between user and kernel address 
> ranges
> [3.549414] Internal error: Oops: 9644 [#1] PREEMPT SMP
> [3.556520] Modules linked in:
> [3.561901] CPU: 0 PID: 1 Comm: swapper/0 Not tainted
> 5.9.0-rc2-next-20200827 #1
> [3.565034] Hardware name: Qualcomm Technologies, Inc. APQ 8016 SBC (DT)
> [3.572584] pstate: 6005 (nZCv daif -PAN -UAO BTYPE=--)
> [3.579271] pc : __clk_put+0x40/0x140
> [3.584556] lr : __clk_put+0x2c/0x140
> [3.588373] sp : 80001002bb00
> [3.592016] x29: 80001002bb00 x28: 002e
> [3.595320] x27: 09f7ba68 x26: 80001146d878
> [3.600703] x25: 3fcfd8f8 x24: 3d0bc410
> [3.605999] x23: 80001146d0e0 x22: 09f7ba40
> [3.611293] x21: 3d0bc400 x20: 09f7b580
> [3.616588] x19: 3bccc780 x18: 07824000
> [3.621883] x17: 09f7ba00 x16: 09f7b5d0
> [3.627177] x15: 800011966cf8 x14: 
> [3.632472] x13: 800012917000 x12: 800012917000
> [3.637769] x11: 0020 x10: 0101010101010101
> [3.643063] x9 : 8000107a984c x8 : 7f7f7f7f7f7f7f7f
> [3.648358] x7 : 09fd8000 x6 : 80001237a000
> [3.653653] x5 :  x4 : 09fd8000
> [3.658949] x3 : 8000124e6768 x2 : 09fd8000
> [3.664243] x1 : 3bccca80 x0 : dead0100
> [3.669539] Call trace:
> [3.674830]  __clk_put+0x40/0x140
> [3.677003]  clk_put+0x18/0x28
> [3.680477]  dev_pm_opp_put_clkname+0x30/0x58
> [3.683431]  sdhci_msm_probe+0x284/0x9a0
> [3.687857]  platform_drv_probe+0x5c/0xb0
> [3.691847]  really_probe+0xf0/0x4d8
> [3.695753]  driver_probe_device+0xfc/0x168
> [3.699399]  device_driver_attach+0x7c/0x88
> [3.703306]  __driver_attach+0xac/0x178
> [3.707472]  bus_for_each_dev+0x78/0xc8
> [3.711291]  driver_attach+0x2c/0x38
> [3.715110]  bus_add_driver+0x14c/0x230
> [3.718929]  driver_register+0x6c/0x128
> [3.722489]  __platform_driver_register+0x50/0x60
> [3.726312]  sdhci_msm_driver_init+0x24/0x30
> [3.731173]  do_one_initcall+0x4c/0x2c0
> [3.735511]  kernel_init_freeable+0x21c/0x284
> [3.739072]  kernel_init+0x1c/0x120
> [3.743582]  ret_from_fork+0x10/0x30
> [3.746885] Code: 35000720 a9438660 f920 b440 (f9000401)
> [3.750720] ---[ end trace a8d4100497387a2e ]---
> [3.756736] Kernel panic - not syncing: Attempted to kill init!
> exitcode=0x000b
> [3.761392] SMP: stopping secondary CPUs
> [3.768877] Kernel Offset: 0x8 from 0x80001000
> [3.772924] PHYS_OFFSET: 0x8000
> [3.778216] CPU features: 0x0240002,24802005
> [3.781602] Memory Limit: none
> 
> full test log,
> https://qa-reports.linaro.org/lkft/linux-next-oe/build/next-20200827/testrun/3123101/suite/linux-log-parser/test/check-kernel-oops-1714695/log
> 
> -- 
> Linaro LKFT
> https://lkft.linaro.org

-- 
viresh


Kernel panic : Unable to handle kernel paging request at virtual address - dead address between user and kernel address ranges

2020-08-27 Thread Naresh Kamboju
arm64 dragonboard db410c boot failed while running the linux-next 20200827 kernel.

metadata:
  git branch: master
  git repo: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git
  git commit: 88abac0b753dfdd85362a26d2da8277cb1e0842b
  git describe: next-20200827
  make_kernelversion: 5.9.0-rc2
  kernel-config:
https://builds.tuxbuild.com/vThV35pOF_GMlWdiTs3Bdw/kernel.config

Boot log,

[0.00] Booting Linux on physical CPU 0x00 [0x410fd030]
[0.00] Linux version 5.9.0-rc2-next-20200827
(TuxBuild@12963d21faa5) (aarch64-linux-gnu-gcc (Debian 9.3.0-8) 9.3.0,
GNU ld (GNU Binutils for Debian) 2.34) #1 SMP PREEMPT Thu Aug 27
05:19:00 UTC 2020
[0.00] Machine model: Qualcomm Technologies, Inc. APQ 8016 SBC
[0.00] efi: UEFI not found.
[0.00] [Firmware Bug]: Kernel image misaligned at boot, please
fix your bootloader!

[3.451425] i2c_qup 78ba000.i2c: using default clock-frequency 10
[3.451491] i2c_qup 78ba000.i2c:
[3.451491]  tx channel not available
[3.493455] sdhci: Secure Digital Host Controller Interface driver
[3.493508] sdhci: Copyright(c) Pierre Ossman
[3.500902] Synopsys Designware Multimedia Card Interface Driver
[3.507441] sdhci-pltfm: SDHCI platform and OF driver helper
[3.514308] Unable to handle kernel paging request at virtual
address dead0108
[3.514695] Mem abort info:
[3.522421]   ESR = 0x9644
[3.525096]   EC = 0x25: DABT (current EL), IL = 32 bits
[3.528236]   SET = 0, FnV = 0
[3.533703]   EA = 0, S1PTW = 0
[3.536561] Data abort info:
[3.539601]   ISV = 0, ISS = 0x0044
[3.542727]   CM = 0, WnR = 1
[3.546287] [dead0108] address between user and kernel address ranges
[3.549414] Internal error: Oops: 9644 [#1] PREEMPT SMP
[3.556520] Modules linked in:
[3.561901] CPU: 0 PID: 1 Comm: swapper/0 Not tainted
5.9.0-rc2-next-20200827 #1
[3.565034] Hardware name: Qualcomm Technologies, Inc. APQ 8016 SBC (DT)
[3.572584] pstate: 6005 (nZCv daif -PAN -UAO BTYPE=--)
[3.579271] pc : __clk_put+0x40/0x140
[3.584556] lr : __clk_put+0x2c/0x140
[3.588373] sp : 80001002bb00
[3.592016] x29: 80001002bb00 x28: 002e
[3.595320] x27: 09f7ba68 x26: 80001146d878
[3.600703] x25: 3fcfd8f8 x24: 3d0bc410
[3.605999] x23: 80001146d0e0 x22: 09f7ba40
[3.611293] x21: 3d0bc400 x20: 09f7b580
[3.616588] x19: 3bccc780 x18: 07824000
[3.621883] x17: 09f7ba00 x16: 09f7b5d0
[3.627177] x15: 800011966cf8 x14: 
[3.632472] x13: 800012917000 x12: 800012917000
[3.637769] x11: 0020 x10: 0101010101010101
[3.643063] x9 : 8000107a984c x8 : 7f7f7f7f7f7f7f7f
[3.648358] x7 : 09fd8000 x6 : 80001237a000
[3.653653] x5 :  x4 : 09fd8000
[3.658949] x3 : 8000124e6768 x2 : 09fd8000
[3.664243] x1 : 3bccca80 x0 : dead0100
[3.669539] Call trace:
[3.674830]  __clk_put+0x40/0x140
[3.677003]  clk_put+0x18/0x28
[3.680477]  dev_pm_opp_put_clkname+0x30/0x58
[3.683431]  sdhci_msm_probe+0x284/0x9a0
[3.687857]  platform_drv_probe+0x5c/0xb0
[3.691847]  really_probe+0xf0/0x4d8
[3.695753]  driver_probe_device+0xfc/0x168
[3.699399]  device_driver_attach+0x7c/0x88
[3.703306]  __driver_attach+0xac/0x178
[3.707472]  bus_for_each_dev+0x78/0xc8
[3.711291]  driver_attach+0x2c/0x38
[3.715110]  bus_add_driver+0x14c/0x230
[3.718929]  driver_register+0x6c/0x128
[3.722489]  __platform_driver_register+0x50/0x60
[3.726312]  sdhci_msm_driver_init+0x24/0x30
[3.731173]  do_one_initcall+0x4c/0x2c0
[3.735511]  kernel_init_freeable+0x21c/0x284
[3.739072]  kernel_init+0x1c/0x120
[3.743582]  ret_from_fork+0x10/0x30
[3.746885] Code: 35000720 a9438660 f920 b440 (f9000401)
[3.750720] ---[ end trace a8d4100497387a2e ]---
[3.756736] Kernel panic - not syncing: Attempted to kill init!
exitcode=0x000b
[3.761392] SMP: stopping secondary CPUs
[3.768877] Kernel Offset: 0x8 from 0x80001000
[3.772924] PHYS_OFFSET: 0x8000
[3.778216] CPU features: 0x0240002,24802005
[3.781602] Memory Limit: none

full test log,
https://qa-reports.linaro.org/lkft/linux-next-oe/build/next-20200827/testrun/3123101/suite/linux-log-parser/test/check-kernel-oops-1714695/log

-- 
Linaro LKFT
https://lkft.linaro.org


Re: [RFC v2 00/27] Kernel Address Space Isolation

2020-07-01 Thread 黄金海
How about performance when running with ASI?


Re: [RFC v2 00/27] Kernel Address Space Isolation

2020-07-01 Thread hackapple
How about performance when running the KVM example, or when isolating other kernel data?



[PATCH v4 37/45] powerpc/8xx: Refactor kernel address boundary comparison

2020-05-18 Thread Christophe Leroy
Now that linear and IMMR dedicated TLB handling is gone, kernel
boundary address comparison is similar in ITLB miss handler and
in DTLB miss handler.

Create a macro named compare_to_kernel_boundary.

When TASK_SIZE is strictly below 0x8000 and PAGE_OFFSET is
above 0x8000, it is enough to compare to 0x800, and this
can be done with a single instruction.

Using not. instruction, we get to use 'blt' conditional branch as
when doing a regular comparison:

0x <= addr <= 0x7fff ==>
0x >= NOT(addr) >= 0x8000
The above test corresponds to a 'blt'

Otherwise, do a regular comparison using two instructions.
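
A quick userspace illustration of the equivalence (assuming the boundary
sits at 0x80000000, which is an assumption here rather than something the
patch spells out):

#include <assert.h>
#include <stdint.h>

/* addr < 0x80000000 (user)  <=>  (int32_t)~addr < 0, i.e. the 'blt' is
 * taken, just as it is after the cmpli in the other branch. */
static int is_user_addr(uint32_t addr)
{
	return (int32_t)~addr < 0;
}

int main(void)
{
	assert(is_user_addr(0x00000000));
	assert(is_user_addr(0x7fffffffU));
	assert(!is_user_addr(0x80000000U));
	assert(!is_user_addr(0xc0000000U));	/* typical kernel address */
	return 0;
}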

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/head_8xx.S | 22 --
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 9f3f7f3d03a7..9a117b9f0998 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -32,10 +32,15 @@
 
 #include "head_32.h"
 
+.macro compare_to_kernel_boundary scratch, addr
 #if CONFIG_TASK_SIZE <= 0x8000 && CONFIG_PAGE_OFFSET >= 0x8000
 /* By simply checking Address >= 0x8000, we know if its a kernel address */
-#define SIMPLE_KERNEL_ADDRESS  1
+   not.\scratch, \addr
+#else
+   rlwinm  \scratch, \addr, 16, 0xfff8
+   cmpli   cr0, \scratch, PAGE_OFFSET@h
 #endif
+.endm
 
 /*
  * We need an ITLB miss handler for kernel addresses if:
@@ -209,20 +214,11 @@ InstructionTLBMiss:
mtspr   SPRN_MD_EPN, r10
 #ifdef ITLB_MISS_KERNEL
mfcrr11
-#if defined(SIMPLE_KERNEL_ADDRESS)
-   cmpicr0, r10, 0 /* Address >= 0x8000 */
-#else
-   rlwinm  r10, r10, 16, 0xfff8
-   cmpli   cr0, r10, PAGE_OFFSET@h
-#endif
+   compare_to_kernel_boundary r10, r10
 #endif
mfspr   r10, SPRN_M_TWB /* Get level 1 table */
 #ifdef ITLB_MISS_KERNEL
-#if defined(SIMPLE_KERNEL_ADDRESS)
-   bge+3f
-#else
blt+3f
-#endif
rlwinm  r10, r10, 0, 20, 31
orisr10, r10, (swapper_pg_dir - PAGE_OFFSET)@ha
 3:
@@ -288,9 +284,7 @@ DataStoreTLBMiss:
 * kernel page tables.
 */
mfspr   r10, SPRN_MD_EPN
-   rlwinm  r10, r10, 16, 0xfff8
-   cmpli   cr0, r10, PAGE_OFFSET@h
-
+   compare_to_kernel_boundary r10, r10
mfspr   r10, SPRN_M_TWB /* Get level 1 table */
blt+3f
rlwinm  r10, r10, 0, 20, 31
-- 
2.25.0



[PATCH v3 37/45] powerpc/8xx: Refactor kernel address boundary comparison

2020-05-11 Thread Christophe Leroy
Now that linear and IMMR dedicated TLB handling is gone, kernel
boundary address comparison is similar in ITLB miss handler and
in DTLB miss handler.

Create a macro named compare_to_kernel_boundary.

When TASK_SIZE is strictly below 0x8000 and PAGE_OFFSET is
above 0x8000, it is enough to compare to 0x800, and this
can be done with a single instruction.

Using not. instruction, we get to use 'blt' conditional branch as
when doing a regular comparison:

0x <= addr <= 0x7fff ==>
0x >= NOT(addr) >= 0x8000
The above test corresponds to a 'blt'

Otherwise, do a regular comparison using two instructions.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/head_8xx.S | 22 --
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 9f3f7f3d03a7..9a117b9f0998 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -32,10 +32,15 @@
 
 #include "head_32.h"
 
+.macro compare_to_kernel_boundary scratch, addr
 #if CONFIG_TASK_SIZE <= 0x8000 && CONFIG_PAGE_OFFSET >= 0x8000
 /* By simply checking Address >= 0x8000, we know if its a kernel address */
-#define SIMPLE_KERNEL_ADDRESS  1
+   not.\scratch, \addr
+#else
+   rlwinm  \scratch, \addr, 16, 0xfff8
+   cmpli   cr0, \scratch, PAGE_OFFSET@h
 #endif
+.endm
 
 /*
  * We need an ITLB miss handler for kernel addresses if:
@@ -209,20 +214,11 @@ InstructionTLBMiss:
mtspr   SPRN_MD_EPN, r10
 #ifdef ITLB_MISS_KERNEL
mfcrr11
-#if defined(SIMPLE_KERNEL_ADDRESS)
-   cmpicr0, r10, 0 /* Address >= 0x8000 */
-#else
-   rlwinm  r10, r10, 16, 0xfff8
-   cmpli   cr0, r10, PAGE_OFFSET@h
-#endif
+   compare_to_kernel_boundary r10, r10
 #endif
mfspr   r10, SPRN_M_TWB /* Get level 1 table */
 #ifdef ITLB_MISS_KERNEL
-#if defined(SIMPLE_KERNEL_ADDRESS)
-   bge+3f
-#else
blt+3f
-#endif
rlwinm  r10, r10, 0, 20, 31
orisr10, r10, (swapper_pg_dir - PAGE_OFFSET)@ha
 3:
@@ -288,9 +284,7 @@ DataStoreTLBMiss:
 * kernel page tables.
 */
mfspr   r10, SPRN_MD_EPN
-   rlwinm  r10, r10, 16, 0xfff8
-   cmpli   cr0, r10, PAGE_OFFSET@h
-
+   compare_to_kernel_boundary r10, r10
mfspr   r10, SPRN_M_TWB /* Get level 1 table */
blt+3f
rlwinm  r10, r10, 0, 20, 31
-- 
2.25.0



[PATCH v2 37/45] powerpc/8xx: Refactor kernel address boundary comparison

2020-05-06 Thread Christophe Leroy
Now that linear and IMMR dedicated TLB handling is gone, kernel
boundary address comparison is similar in ITLB miss handler and
in DTLB miss handler.

Create a macro named compare_to_kernel_boundary.

When TASK_SIZE is strictly below 0x8000 and PAGE_OFFSET is
above 0x8000, it is enough to compare to 0x800, and this
can be done with a single instruction.

Using not. instruction, we get to use 'blt' conditional branch as
when doing a regular comparison:

0x <= addr <= 0x7fff ==>
0x >= NOT(addr) >= 0x8000
The above test corresponds to a 'blt'

Otherwise, do a regular comparison using two instructions.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/head_8xx.S | 22 --
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 9f3f7f3d03a7..9a117b9f0998 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -32,10 +32,15 @@
 
 #include "head_32.h"
 
+.macro compare_to_kernel_boundary scratch, addr
 #if CONFIG_TASK_SIZE <= 0x8000 && CONFIG_PAGE_OFFSET >= 0x8000
 /* By simply checking Address >= 0x8000, we know if its a kernel address */
-#define SIMPLE_KERNEL_ADDRESS  1
+   not.\scratch, \addr
+#else
+   rlwinm  \scratch, \addr, 16, 0xfff8
+   cmpli   cr0, \scratch, PAGE_OFFSET@h
 #endif
+.endm
 
 /*
  * We need an ITLB miss handler for kernel addresses if:
@@ -209,20 +214,11 @@ InstructionTLBMiss:
mtspr   SPRN_MD_EPN, r10
 #ifdef ITLB_MISS_KERNEL
mfcrr11
-#if defined(SIMPLE_KERNEL_ADDRESS)
-   cmpicr0, r10, 0 /* Address >= 0x8000 */
-#else
-   rlwinm  r10, r10, 16, 0xfff8
-   cmpli   cr0, r10, PAGE_OFFSET@h
-#endif
+   compare_to_kernel_boundary r10, r10
 #endif
mfspr   r10, SPRN_M_TWB /* Get level 1 table */
 #ifdef ITLB_MISS_KERNEL
-#if defined(SIMPLE_KERNEL_ADDRESS)
-   bge+3f
-#else
blt+3f
-#endif
rlwinm  r10, r10, 0, 20, 31
orisr10, r10, (swapper_pg_dir - PAGE_OFFSET)@ha
 3:
@@ -288,9 +284,7 @@ DataStoreTLBMiss:
 * kernel page tables.
 */
mfspr   r10, SPRN_MD_EPN
-   rlwinm  r10, r10, 16, 0xfff8
-   cmpli   cr0, r10, PAGE_OFFSET@h
-
+   compare_to_kernel_boundary r10, r10
mfspr   r10, SPRN_M_TWB /* Get level 1 table */
blt+3f
rlwinm  r10, r10, 0, 20, 31
-- 
2.25.0



[RFC v4][PATCH part-1 1/7] mm/x86: Introduce kernel Address Space Isolation (ASI)

2020-05-04 Thread Alexandre Chartre
Introduce core functions and structures for implementing Address Space
Isolation (ASI). Kernel address space isolation provides the ability to
run some kernel code with a reduced kernel address space.

An address space isolation is defined with a struct asi structure and
associated with an ASI type and a pagetable.
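
A rough usage sketch based on the interfaces declared by this patch (the
"foo" name, the PCID prefix value and the NULL-on-failure assumption are
made up for illustration):

DEFINE_ASI_TYPE(foo, 1);	/* in one .c file: asi_type_foo, PCID prefix 1 */
DECLARE_ASI_TYPE(foo);		/* in a header: also provides asi_create_foo() */

static struct asi *foo_asi;

static int foo_asi_init(pgd_t *foo_pagetable)
{
	foo_asi = asi_create_foo();	/* wraps asi_create(&asi_type_foo) */
	if (!foo_asi)
		return -ENOMEM;
	asi_set_pagetable(foo_asi, foo_pagetable);
	return 0;
}

static void foo_asi_exit(void)
{
	asi_destroy(foo_asi);
}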

Signed-off-by: Alexandre Chartre 
---
 arch/x86/include/asm/asi.h | 88 ++
 arch/x86/mm/Makefile   |  1 +
 arch/x86/mm/asi.c  | 60 ++
 security/Kconfig   | 10 +
 4 files changed, 159 insertions(+)
 create mode 100644 arch/x86/include/asm/asi.h
 create mode 100644 arch/x86/mm/asi.c

diff --git a/arch/x86/include/asm/asi.h b/arch/x86/include/asm/asi.h
new file mode 100644
index ..844a81fb84d2
--- /dev/null
+++ b/arch/x86/include/asm/asi.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_MM_ASI_H
+#define ARCH_X86_MM_ASI_H
+
+#ifdef CONFIG_ADDRESS_SPACE_ISOLATION
+
+/*
+ * An Address Space Isolation (ASI) is defined with a struct asi and
+ * associated with an ASI type (struct asi_type). All ASIs of the same
+ * type reference the same ASI type.
+ *
+ * An ASI type has a unique PCID prefix (a value in the range [1, 255])
+ * which is used to define the PCID used for the ASI CR3 value. The
+ * first four bits of the ASI PCID come from the kernel PCID (a value
+ * between 1 and 6, see TLB_NR_DYN_ASIDS). The remaining 8 bits are
+ * filled with the ASI PCID prefix.
+ *
+ *   ASI PCID = (ASI Type PCID Prefix << 4) | Kernel PCID
+ *
+ * The ASI PCID is used to optimize TLB flushing when switching between
+ * the kernel and ASI pagetables. The optimization is valid only when
+ * a task switches between ASI of different types. If a task switches
+ * between different ASIs with the same type then the ASI TLB the task
+ * is switching to will always be flushed.
+ */
+
+#define ASI_PCID_PREFIX_SHIFT  4
+#define ASI_PCID_PREFIX_MASK   0xff0
+#define ASI_KERNEL_PCID_MASK   0x00f
+
+/*
+ * We use bit 12 of a pagetable pointer (and so of the CR3 value) as
+ * a way to know if a pointer/CR3 is referencing a full kernel page
+ * table or an ASI page table.
+ *
+ * A full kernel pagetable is always located on the first half of an
+ * 8K buffer, while an ASI pagetable is always located on the second
+ * half of an 8K buffer.
+ */
+#define ASI_PGTABLE_BITPAGE_SHIFT
+#define ASI_PGTABLE_MASK   (1 << ASI_PGTABLE_BIT)
+
+#ifndef __ASSEMBLY__
+
+#include 
+
+struct asi_type {
+   int pcid_prefix;/* PCID prefix */
+};
+
+/*
+ * Macro to define and declare an ASI type.
+ *
+ * Declaring an ASI type will also define an inline function
+ * (asi_create_()) to easily create an ASI of the
+ * specified type.
+ */
+#define DEFINE_ASI_TYPE(name, pcid_prefix) \
+   struct asi_type asi_type_ ## name = {   \
+   pcid_prefix,\
+   };  \
+   EXPORT_SYMBOL(asi_type_ ## name)
+
+#define DECLARE_ASI_TYPE(name) \
+   extern struct asi_type asi_type_ ## name;   \
+   DECLARE_ASI_CREATE(name)
+
+#define DECLARE_ASI_CREATE(name)   \
+static inline struct asi *asi_create_ ## name(void)\
+{  \
+   return asi_create(&asi_type_ ## name);  \
+}
+
+struct asi {
+   struct asi_type *type;  /* ASI type */
+   pgd_t   *pagetable; /* ASI pagetable */
+   unsigned long   base_cr3;   /* base ASI CR3 */
+};
+
+extern struct asi *asi_create(struct asi_type *type);
+extern void asi_destroy(struct asi *asi);
+extern void asi_set_pagetable(struct asi *asi, pgd_t *pagetable);
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* CONFIG_ADDRESS_SPACE_ISOLATION */
+
+#endif
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 98f7c6fa2eaa..e57af263e870 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_NUMA_EMU)+= numa_emulation.o
 obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
 obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
 obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
+obj-$(CONFIG_ADDRESS_SPACE_ISOLATION)  += asi.o
 
 obj-$(CONFIG_AMD_MEM_ENCRYPT)  += mem_encrypt.o
 obj-$(CONFIG_AMD_MEM_ENCRYPT)  += mem_encrypt_identity.o
diff --git a/arch/x86/mm/asi.c b/arch/x86/mm/asi.c
new file mode 100644
index ..0a0ac9d6d078
--- /dev/null
+++ b/arch/x86/mm/asi.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019, 2020, Oracle and/or its affiliates.
+ *
+ * Kernel Address Space Isolation (ASI)
+ */
+
+#include 
+#include 
+
+#include 
+#include 
+
+struct asi *asi_create(struct asi_type *type)
+{
+   struct asi *asi;
+
+

Re: [PATCH] staging: vchiq: don't leak kernel address

2019-10-08 Thread Greg Kroah-Hartman
On Tue, Oct 08, 2019 at 05:25:17PM +0300, Dan Carpenter wrote:
> On Tue, Oct 08, 2019 at 04:21:54PM +0200, Matteo Croce wrote:
> > On Tue, Oct 8, 2019 at 3:16 PM Dan Carpenter  
> > wrote:
> > >
> > > The subject doesn't match the patch.  It should just be "remove useless
> > > printk".
> > >
> > > regards,
> > > dan carpenter
> > >
> > 
> > Well, it avoids leaking an address by removing an useless printk.
> > It seems that GKH already picked the patch in his staging tree, but
> > I'm fine with both subjects, really,
> 
> The address wasn't leaked because it was already %pK.  The subject
> says there is an info leak security problem, when the opposite is true.

I've edited the subject line now.

thanks,

greg k-h


Re: [PATCH] staging: vchiq: don't leak kernel address

2019-10-08 Thread Dan Carpenter
On Tue, Oct 08, 2019 at 04:21:54PM +0200, Matteo Croce wrote:
> On Tue, Oct 8, 2019 at 3:16 PM Dan Carpenter  wrote:
> >
> > The subject doesn't match the patch.  It should just be "remove useless
> > printk".
> >
> > regards,
> > dan carpenter
> >
> 
> Well, it avoids leaking an address by removing an useless printk.
> It seems that GKH already picked the patch in his staging tree, but
> I'm fine with both subjects, really,

The address wasn't leaked because it was already %pK.  The subject
says there is an info leak security problem, when the opposite is true.

regards,
dan carpenter



Re: [PATCH] staging: vchiq: don't leak kernel address

2019-10-08 Thread Matteo Croce
On Tue, Oct 8, 2019 at 3:16 PM Dan Carpenter  wrote:
>
> The subject doesn't match the patch.  It should just be "remove useless
> printk".
>
> regards,
> dan carpenter
>

Well, it avoids leaking an address by removing a useless printk.
It seems that GKH has already picked the patch up in his staging tree, but
I'm fine with both subjects, really.

Greg?

-- 
Matteo Croce
per aspera ad upstream



Re: [PATCH] staging: vchiq: don't leak kernel address

2019-10-08 Thread Dan Carpenter
The subject doesn't match the patch.  It should just be "remove useless
printk".

regards,
dan carpenter



[PATCH] staging: vchiq: don't leak kernel address

2019-10-08 Thread Matteo Croce
Since commit ad67b74d2469d9b8 ("printk: hash addresses printed with %p"),
an obfuscated kernel pointer is printed at boot:

vchiq: vchiq_init_state: slot_zero = (ptrval)

Remove the print completely, as it's useless without the address.

Signed-off-by: Matteo Croce 
---
 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c 
b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c
index 56a23a297fa4..084cd4b0f07c 100644
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c
+++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_core.c
@@ -2132,9 +2132,6 @@ vchiq_init_state(struct vchiq_state *state, struct 
vchiq_slot_zero *slot_zero)
char threadname[16];
int i;
 
-   vchiq_log_warning(vchiq_core_log_level,
-   "%s: slot_zero = %pK", __func__, slot_zero);
-
if (vchiq_states[0]) {
pr_err("%s: VCHIQ state already initialized\n", __func__);
return VCHIQ_ERROR;
-- 
2.21.0



Re: [PATCH v5] perf machine: arm/arm64: Improve completeness for kernel address space

2019-09-04 Thread Leo Yan
Hi Adrian,

On Wed, Sep 04, 2019 at 10:26:13AM +0300, Adrian Hunter wrote:

[...]

> > Could you take chance to review my below replying?  I'd like to get
> > your confirmation before I send out offical patch.
> 
> It is not necessary to do kallsyms__parse for x86_64, so it would be better
> to check the arch before calling that.

Thanks for the suggestion; I will do it in the formal patch.
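
For example, a minimal way to do that (an untested sketch on top of the
diff quoted below, not the final patch) would be to guard the parse the
same way the existing kernel_start fixup is guarded:

	if (!machine__is(machine, "x86_64") &&
	    kallsyms__parse(filename, machine, machine__fixup_kernel_start))
		pr_warning("Fail to fixup kernel start address. skipping...\n");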

> However in general, having to copy and use kallsyms with perf.data if on a
> different arch does not seem very user friendly.

Agreed.  It seems more reasonable to save the related info in
perf.data; TBH, I have no idea how to do that.

> But really that is up to Arnaldo.

@Arnaldo, if possible could you take a look at the change below?

If you don't think the code below is the right thing and this isn't a
common issue, then maybe it's more feasible to resolve it in an Arm
CoreSight specific way.

Please let me know what you think about this.

Thanks,
Leo Yan

> >> For your question for taking a perf.data file to a machine with a
> >> different architecture, we can firstly use command 'perf buildid-list'
> >> to print out the buildid for kallsyms, based on the dumped buildid we
> >> can find out the location for the saved kallsyms file; then we can use
> >> option '--kallsyms' to specify the offline kallsyms file and use the
> >> offline kallsyms to fixup kernel start address.  The detailed commands
> >> are listed as below:
> >>
> >> root@debian:~# perf buildid-list
> >> 7b36dfca8317ef74974ebd7ee5ec0a8b35c97640 [kernel.kallsyms]
> >> 56b84aa88a1bcfe222a97a53698b92723a3977ca /usr/lib/systemd/systemd
> >> 0956b952e9cd673d48ff2cfeb1a9dbd0c853e686 
> >> /usr/lib/aarch64-linux-gnu/libm-2.28.so
> >> [...]
> >>
> >> root@debian:~# perf script --kallsyms 
> >> ~/.debug/\[kernel.kallsyms\]/7b36dfca8317ef74974ebd7ee5ec0a8b35c97640/kallsyms
> >>
> >> The amended patch is as below, please review and always welcome
> >> any suggestions or comments!
> >>
> >> diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
> >> index 5734460fc89e..593f05cc453f 100644
> >> --- a/tools/perf/util/machine.c
> >> +++ b/tools/perf/util/machine.c
> >> @@ -2672,9 +2672,26 @@ int machine__nr_cpus_avail(struct machine *machine)
> >>return machine ? perf_env__nr_cpus_avail(machine->env) : 0;
> >>  }
> >>  
> >> +static int machine__fixup_kernel_start(void *arg,
> >> + const char *name __maybe_unused,
> >> + char type,
> >> + u64 start)
> >> +{
> >> +  struct machine *machine = arg;
> >> +
> >> +  type = toupper(type);
> >> +
> >> +  /* Fixup for text, weak, data and bss sections. */
> >> +  if (type == 'T' || type == 'W' || type == 'D' || type == 'B')
> >> +  machine->kernel_start = min(machine->kernel_start, start);
> >> +
> >> +  return 0;
> >> +}
> >> +
> >>  int machine__get_kernel_start(struct machine *machine)
> >>  {
> >>struct map *map = machine__kernel_map(machine);
> >> +  char filename[PATH_MAX];
> >>int err = 0;
> >>  
> >>/*
> >> @@ -2696,6 +2713,22 @@ int machine__get_kernel_start(struct machine 
> >> *machine)
> >>if (!err && !machine__is(machine, "x86_64"))
> >>machine->kernel_start = map->start;
> >>}
> >> +
> >> +  if (symbol_conf.kallsyms_name != NULL) {
> >> +  strncpy(filename, symbol_conf.kallsyms_name, PATH_MAX);
> >> +  } else {
> >> +  machine__get_kallsyms_filename(machine, filename, PATH_MAX);
> >> +
> >> +  if (symbol__restricted_filename(filename, "/proc/kallsyms"))
> >> +  goto out;
> >> +  }
> >> +
> >> +  if (kallsyms__parse(filename, machine, machine__fixup_kernel_start))
> >> +  pr_warning("Fail to fixup kernel start address. skipping...\n");
> >> +
> >> +out:
> >>return err;
> >>  }
> >>  
> >>
> >> Thanks,
> >> Leo Yan
> > 
> 


Re: [PATCH v5] perf machine: arm/arm64: Improve completeness for kernel address space

2019-09-04 Thread Adrian Hunter
On 2/09/19 5:15 PM, Leo Yan wrote:
> Hi Adrian,
> 
> On Mon, Aug 26, 2019 at 08:51:05PM +0800, Leo Yan wrote:
>> Hi Adrian,
>>
>> On Fri, Aug 16, 2019 at 04:00:02PM +0300, Adrian Hunter wrote:
>>> On 16/08/19 4:45 AM, Leo Yan wrote:
 Hi Adrian,

 On Thu, Aug 15, 2019 at 02:45:57PM +0300, Adrian Hunter wrote:

 [...]

>>> How come you cannot use kallsyms to get the information?
>>
>> Thanks for pointing out this.  Sorry I skipped your comment "I don't
>> know how you intend to calculate ARM_PRE_START_SIZE" when you reviewed
>> the patch v3, I should use that chance to elaborate the detailed idea
>> and so can get more feedback/guidance before procceed.
>>
>> Actually, I have considered to use kallsyms when worked on the previous
>> patch set.
>>
>> As mentioned in patch set v4's cover letter, I tried to implement
>> machine__create_extra_kernel_maps() for arm/arm64, the purpose is to
>> parse kallsyms so can find more kernel maps and thus also can fixup
>> the kernel start address.  But I found the 'perf script' tool directly
>> calls machine__get_kernel_start() instead of running into the flow for
>> machine__create_extra_kernel_maps();
>
> Doesn't it just need to loop through each kernel map to find the lowest
> start address?

 Based on your suggestion, I worked out below change and verified it
 can work well on arm64 for fixing up start address; please let me know
 if the change works for you?
>>>
>>> How does that work if take a perf.data file to a machine with a different
>>> architecture?
>>
>> Sorry I delayed so long to respond to your question; I didn't have
>> confidence to give out very reasonale answer and this is the main reason
>> for delaying.
> 
> Could you take chance to review my below replying?  I'd like to get
> your confirmation before I send out offical patch.

It is not necessary to do kallsyms__parse for x86_64, so it would be better
to check the arch before calling that.

However in general, having to copy and use kallsyms with perf.data if on a
different arch does not seem very user friendly.  But really that is up to
Arnaldo.
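
For what it is worth, a minimal sketch of that arch check, reusing
machine__fixup_kernel_start() and the filename buffer from the patch
quoted below (illustrative only, not a tested change), could be:

	if (!machine__is(machine, "x86_64")) {
		bool restricted = false;

		if (symbol_conf.kallsyms_name != NULL) {
			strncpy(filename, symbol_conf.kallsyms_name, PATH_MAX);
		} else {
			machine__get_kallsyms_filename(machine, filename, PATH_MAX);
			restricted = symbol__restricted_filename(filename,
								 "/proc/kallsyms");
		}

		if (!restricted &&
		    kallsyms__parse(filename, machine, machine__fixup_kernel_start))
			pr_warning("Failed to fix up kernel start address, skipping...\n");
	}
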

> 
> Thanks,
> Leo Yan
> 
>>
>> For your question for taking a perf.data file to a machine with a
>> different architecture, we can firstly use command 'perf buildid-list'
>> to print out the buildid for kallsyms, based on the dumped buildid we
>> can find out the location for the saved kallsyms file; then we can use
>> option '--kallsyms' to specify the offline kallsyms file and use the
>> offline kallsyms to fixup kernel start address.  The detailed commands
>> are listed as below:
>>
>> root@debian:~# perf buildid-list
>> 7b36dfca8317ef74974ebd7ee5ec0a8b35c97640 [kernel.kallsyms]
>> 56b84aa88a1bcfe222a97a53698b92723a3977ca /usr/lib/systemd/systemd
>> 0956b952e9cd673d48ff2cfeb1a9dbd0c853e686 
>> /usr/lib/aarch64-linux-gnu/libm-2.28.so
>> [...]
>>
>> root@debian:~# perf script --kallsyms 
>> ~/.debug/\[kernel.kallsyms\]/7b36dfca8317ef74974ebd7ee5ec0a8b35c97640/kallsyms
>>
>> The amended patch is as below, please review and always welcome
>> any suggestions or comments!
>>
>> diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
>> index 5734460fc89e..593f05cc453f 100644
>> --- a/tools/perf/util/machine.c
>> +++ b/tools/perf/util/machine.c
>> @@ -2672,9 +2672,26 @@ int machine__nr_cpus_avail(struct machine *machine)
>>  return machine ? perf_env__nr_cpus_avail(machine->env) : 0;
>>  }
>>  
>> +static int machine__fixup_kernel_start(void *arg,
>> +   const char *name __maybe_unused,
>> +   char type,
>> +   u64 start)
>> +{
>> +struct machine *machine = arg;
>> +
>> +type = toupper(type);
>> +
>> +/* Fixup for text, weak, data and bss sections. */
>> +if (type == 'T' || type == 'W' || type == 'D' || type == 'B')
>> +machine->kernel_start = min(machine->kernel_start, start);
>> +
>> +return 0;
>> +}
>> +
>>  int machine__get_kernel_start(struct machine *machine)
>>  {
>>  struct map *map = machine__kernel_map(machine);
>> +char filename[PATH_MAX];
>>  int err = 0;
>>  
>>  /*
>> @@ -2696,6 +2713,22 @@ int machine__get_kernel_start(struct machine *machine)
>>  if (!err && !machine__is(machine, "x86_64"))
>>  machine->kernel_start = map->start;
>>  }
>> +
>> +if (symbol_conf.kallsyms_name != NULL) {
>> +strncpy(filename, symbol_conf.kallsyms_name, PATH_MAX);
>> +} else {
>> +machine__get_kallsyms_filename(machine, filename, PATH_MAX);
>> +
>> +if (symbol__restricted_filename(filename, "/proc/kallsyms"))
>> +goto out;
>> +}
>> +
>> +if (kallsyms__parse(filename, machine, machine__fixup_kernel_start))
>> +pr_warning("Fail to fixup kernel start address. skipping...\n");
>> +

Re: [PATCH v5] perf machine: arm/arm64: Improve completeness for kernel address space

2019-09-02 Thread Leo Yan
Hi Adrian,

On Mon, Aug 26, 2019 at 08:51:05PM +0800, Leo Yan wrote:
> Hi Adrian,
> 
> On Fri, Aug 16, 2019 at 04:00:02PM +0300, Adrian Hunter wrote:
> > On 16/08/19 4:45 AM, Leo Yan wrote:
> > > Hi Adrian,
> > > 
> > > On Thu, Aug 15, 2019 at 02:45:57PM +0300, Adrian Hunter wrote:
> > > 
> > > [...]
> > > 
> >  How come you cannot use kallsyms to get the information?
> > >>>
> > >>> Thanks for pointing out this.  Sorry I skipped your comment "I don't
> > >>> know how you intend to calculate ARM_PRE_START_SIZE" when you reviewed
> > >>> the patch v3, I should use that chance to elaborate the detailed idea
> > >>> and so can get more feedback/guidance before procceed.
> > >>>
> > >>> Actually, I have considered to use kallsyms when worked on the previous
> > >>> patch set.
> > >>>
> > >>> As mentioned in patch set v4's cover letter, I tried to implement
> > >>> machine__create_extra_kernel_maps() for arm/arm64, the purpose is to
> > >>> parse kallsyms so can find more kernel maps and thus also can fixup
> > >>> the kernel start address.  But I found the 'perf script' tool directly
> > >>> calls machine__get_kernel_start() instead of running into the flow for
> > >>> machine__create_extra_kernel_maps();
> > >>
> > >> Doesn't it just need to loop through each kernel map to find the lowest
> > >> start address?
> > > 
> > > Based on your suggestion, I worked out below change and verified it
> > > can work well on arm64 for fixing up start address; please let me know
> > > if the change works for you?
> > 
> > How does that work if take a perf.data file to a machine with a different
> > architecture?
> 
> Sorry I delayed so long to respond to your question; I didn't have
> confidence to give out very reasonale answer and this is the main reason
> for delaying.

Could you find a chance to review my reply below?  I'd like to get
your confirmation before I send out an official patch.

Thanks,
Leo Yan

> 
> For your question for taking a perf.data file to a machine with a
> different architecture, we can firstly use command 'perf buildid-list'
> to print out the buildid for kallsyms, based on the dumped buildid we
> can find out the location for the saved kallsyms file; then we can use
> option '--kallsyms' to specify the offline kallsyms file and use the
> offline kallsyms to fixup kernel start address.  The detailed commands
> are listed as below:
> 
> root@debian:~# perf buildid-list
> 7b36dfca8317ef74974ebd7ee5ec0a8b35c97640 [kernel.kallsyms]
> 56b84aa88a1bcfe222a97a53698b92723a3977ca /usr/lib/systemd/systemd
> 0956b952e9cd673d48ff2cfeb1a9dbd0c853e686 
> /usr/lib/aarch64-linux-gnu/libm-2.28.so
> [...]
> 
> root@debian:~# perf script --kallsyms 
> ~/.debug/\[kernel.kallsyms\]/7b36dfca8317ef74974ebd7ee5ec0a8b35c97640/kallsyms
> 
> The amended patch is as below, please review and always welcome
> any suggestions or comments!
> 
> diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
> index 5734460fc89e..593f05cc453f 100644
> --- a/tools/perf/util/machine.c
> +++ b/tools/perf/util/machine.c
> @@ -2672,9 +2672,26 @@ int machine__nr_cpus_avail(struct machine *machine)
>   return machine ? perf_env__nr_cpus_avail(machine->env) : 0;
>  }
>  
> +static int machine__fixup_kernel_start(void *arg,
> +const char *name __maybe_unused,
> +char type,
> +u64 start)
> +{
> + struct machine *machine = arg;
> +
> + type = toupper(type);
> +
> + /* Fixup for text, weak, data and bss sections. */
> + if (type == 'T' || type == 'W' || type == 'D' || type == 'B')
> + machine->kernel_start = min(machine->kernel_start, start);
> +
> + return 0;
> +}
> +
>  int machine__get_kernel_start(struct machine *machine)
>  {
>   struct map *map = machine__kernel_map(machine);
> + char filename[PATH_MAX];
>   int err = 0;
>  
>   /*
> @@ -2696,6 +2713,22 @@ int machine__get_kernel_start(struct machine *machine)
>   if (!err && !machine__is(machine, "x86_64"))
>   machine->kernel_start = map->start;
>   }
> +
> + if (symbol_conf.kallsyms_name != NULL) {
> + strncpy(filename, symbol_conf.kallsyms_name, PATH_MAX);
> + } else {
> + machine__get_kallsyms_filename(machine, filename, PATH_MAX);
> +
> + if (symbol__restricted_filename(filename, "/proc/kallsyms"))
> + goto out;
> + }
> +
> + if (kallsyms__parse(filename, machine, machine__fixup_kernel_start))
> + pr_warning("Fail to fixup kernel start address. skipping...\n");
> +
> +out:
>   return err;
>  }
>  
> 
> Thanks,
> Leo Yan


Re: [PATCH v5] perf machine: arm/arm64: Improve completeness for kernel address space

2019-08-26 Thread Leo Yan
Hi Adrian,

On Fri, Aug 16, 2019 at 04:00:02PM +0300, Adrian Hunter wrote:
> On 16/08/19 4:45 AM, Leo Yan wrote:
> > Hi Adrian,
> > 
> > On Thu, Aug 15, 2019 at 02:45:57PM +0300, Adrian Hunter wrote:
> > 
> > [...]
> > 
>  How come you cannot use kallsyms to get the information?
> >>>
> >>> Thanks for pointing out this.  Sorry I skipped your comment "I don't
> >>> know how you intend to calculate ARM_PRE_START_SIZE" when you reviewed
> >>> the patch v3, I should use that chance to elaborate the detailed idea
> >>> and so can get more feedback/guidance before procceed.
> >>>
> >>> Actually, I have considered to use kallsyms when worked on the previous
> >>> patch set.
> >>>
> >>> As mentioned in patch set v4's cover letter, I tried to implement
> >>> machine__create_extra_kernel_maps() for arm/arm64, the purpose is to
> >>> parse kallsyms so can find more kernel maps and thus also can fixup
> >>> the kernel start address.  But I found the 'perf script' tool directly
> >>> calls machine__get_kernel_start() instead of running into the flow for
> >>> machine__create_extra_kernel_maps();
> >>
> >> Doesn't it just need to loop through each kernel map to find the lowest
> >> start address?
> > 
> > Based on your suggestion, I worked out below change and verified it
> > can work well on arm64 for fixing up start address; please let me know
> > if the change works for you?
> 
> How does that work if take a perf.data file to a machine with a different
> architecture?

Sorry I delayed so long in responding to your question; I didn't have
the confidence to give a very reasonable answer, and that is the main
reason for the delay.

Regarding your question about taking a perf.data file to a machine with a
different architecture: we can first use the command 'perf buildid-list'
to print the build id for kallsyms; based on the dumped build id we can
find the location of the saved kallsyms file, and then use the
'--kallsyms' option to specify that offline kallsyms file and use it to
fix up the kernel start address.  The detailed commands are listed below:

root@debian:~# perf buildid-list
7b36dfca8317ef74974ebd7ee5ec0a8b35c97640 [kernel.kallsyms]
56b84aa88a1bcfe222a97a53698b92723a3977ca /usr/lib/systemd/systemd
0956b952e9cd673d48ff2cfeb1a9dbd0c853e686 /usr/lib/aarch64-linux-gnu/libm-2.28.so
[...]

root@debian:~# perf script --kallsyms 
~/.debug/\[kernel.kallsyms\]/7b36dfca8317ef74974ebd7ee5ec0a8b35c97640/kallsyms

The amended patch is below; please review, and any suggestions or
comments are always welcome!

diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index 5734460fc89e..593f05cc453f 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -2672,9 +2672,26 @@ int machine__nr_cpus_avail(struct machine *machine)
return machine ? perf_env__nr_cpus_avail(machine->env) : 0;
 }
 
+static int machine__fixup_kernel_start(void *arg,
+  const char *name __maybe_unused,
+  char type,
+  u64 start)
+{
+   struct machine *machine = arg;
+
+   type = toupper(type);
+
+   /* Fixup for text, weak, data and bss sections. */
+   if (type == 'T' || type == 'W' || type == 'D' || type == 'B')
+   machine->kernel_start = min(machine->kernel_start, start);
+
+   return 0;
+}
+
 int machine__get_kernel_start(struct machine *machine)
 {
struct map *map = machine__kernel_map(machine);
+   char filename[PATH_MAX];
int err = 0;
 
/*
@@ -2696,6 +2713,22 @@ int machine__get_kernel_start(struct machine *machine)
if (!err && !machine__is(machine, "x86_64"))
machine->kernel_start = map->start;
}
+
+   if (symbol_conf.kallsyms_name != NULL) {
+   strncpy(filename, symbol_conf.kallsyms_name, PATH_MAX);
+   } else {
+   machine__get_kallsyms_filename(machine, filename, PATH_MAX);
+
+   if (symbol__restricted_filename(filename, "/proc/kallsyms"))
+   goto out;
+   }
+
+   if (kallsyms__parse(filename, machine, machine__fixup_kernel_start))
+   pr_warning("Fail to fixup kernel start address. skipping...\n");
+
+out:
return err;
 }
 

Thanks,
Leo Yan


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-08-22 Thread Alexandre Chartre



On 7/31/19 6:31 PM, Dario Faggioli wrote:

Hello all,

I know this is a bit of an old thread, so apologies for being late to
the party. :-)


And sorry for the late reply, I was away for a while.


I would have a question about this:


On 7/12/19 2:36 PM, Peter Zijlstra wrote:

On Fri, Jul 12, 2019 at 02:17:20PM +0200, Alexandre Chartre
wrote:

On 7/12/19 1:44 PM, Peter Zijlstra wrote:

AFAIK3 this wants/needs to be combined with core-scheduling
to be
useful, but not a single mention of that is anywhere.


No. This is actually an alternative to core-scheduling.
Eventually, ASI
will kick all sibling hyperthreads when exiting isolation and
it needs to
run with the full kernel page-table (note that's currently
not in these
patches).



I.e., about the fact that ASI is presented as an alternative to
core-scheduling or, at least, as it will only need to integrate a small
subset of the logic (and of the code) from core-scheduling, as said
here:


I haven't looked at details about what has been done so far.
Hopefully, we
can do something not too complex, or reuse a (small) part of co-
scheduling.


Now, sticking to virtualization examples, if you don't have core-
scheduling, it means that you can have two vcpus, one from VM A and the
other from VM B, running on the same core, one on thread 0 and the
other one on thread 1, at the same time.

And if VM A's vcpu, running on thread 0, exits, then VM B's vcpu
running in guest mode on thread 1 can read host memory, as it is
speculatively accessed (either "normally" or because of cache load
gadgets) and brought in L1D cache by thread 0. And indeed I do see how
ASI protects us from this attack scenario.


However, when the two VMs' vcpus are both running in guest mode, each
one on a thread of the same core, VM B's vcpu running on thread 1 can
exploit L1TF to peek at and steal secrets that VM A's vcpu, running on
thread 0, is accessing, as they're brought into L1D cache... can't it?

How can, ASI *without* core-scheduling, prevent this other attack
scenario?

Because I may very well be missing something, but it looks to me that
it can't. In which case, I'm not sure we can call it "alternative" to
core-scheduling. Or is the second attack scenario that I tried to
describe above, not considered interesting?



Correct, ASI doesn't prevent this attack scenario. However, this case can
be prevented by pinning each VM to different CPU cores (for example, using
cgroups) so that you never have two different VMs running with CPU threads
from the same CPU core. Of course, this limits the number of VMs you can
run to the number of CPU cores on the system, but we assume this is a
reasonable configuration when you want high-performing VMs.
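
For illustration only (this is not part of the ASI patches): pinning a
VM process, e.g. a QEMU instance, to both SMT siblings of one physical
core can be done with the scheduler affinity API.  The sibling CPU
numbers are assumptions here; the real pairs come from
/sys/devices/system/cpu/cpuN/topology/thread_siblings_list.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>

/* Restrict the VM's main process to the two hyperthreads of one core. */
static int pin_vm_to_core(pid_t vm_pid, int sibling_a, int sibling_b)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(sibling_a, &set);
	CPU_SET(sibling_b, &set);

	if (sched_setaffinity(vm_pid, sizeof(set), &set)) {
		perror("sched_setaffinity");
		return -1;
	}
	return 0;
}
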

Rgds,

alex.


Re: [PATCH v5] perf machine: arm/arm64: Improve completeness for kernel address space

2019-08-16 Thread Adrian Hunter
On 16/08/19 4:45 AM, Leo Yan wrote:
> Hi Adrian,
> 
> On Thu, Aug 15, 2019 at 02:45:57PM +0300, Adrian Hunter wrote:
> 
> [...]
> 
 How come you cannot use kallsyms to get the information?
>>>
>>> Thanks for pointing out this.  Sorry I skipped your comment "I don't
>>> know how you intend to calculate ARM_PRE_START_SIZE" when you reviewed
>>> the patch v3, I should use that chance to elaborate the detailed idea
>>> and so can get more feedback/guidance before procceed.
>>>
>>> Actually, I have considered to use kallsyms when worked on the previous
>>> patch set.
>>>
>>> As mentioned in patch set v4's cover letter, I tried to implement
>>> machine__create_extra_kernel_maps() for arm/arm64, the purpose is to
>>> parse kallsyms so can find more kernel maps and thus also can fixup
>>> the kernel start address.  But I found the 'perf script' tool directly
>>> calls machine__get_kernel_start() instead of running into the flow for
>>> machine__create_extra_kernel_maps();
>>
>> Doesn't it just need to loop through each kernel map to find the lowest
>> start address?
> 
> Based on your suggestion, I worked out below change and verified it
> can work well on arm64 for fixing up start address; please let me know
> if the change works for you?

How does that work if you take a perf.data file to a machine with a different
architecture?

> 
> diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
> index f6ee7fbad3e4..51d78313dca1 100644
> --- a/tools/perf/util/machine.c
> +++ b/tools/perf/util/machine.c
> @@ -2671,9 +2671,26 @@ int machine__nr_cpus_avail(struct machine *machine)
>   return machine ? perf_env__nr_cpus_avail(machine->env) : 0;
>  }
>  
> +static int machine__fixup_kernel_start(void *arg,
> +const char *name __maybe_unused,
> +char type,
> +u64 start)
> +{
> + struct machine *machine = arg;
> +
> + type = toupper(type);
> +
> + /* Fixup for text, weak, data and bss sections. */
> + if (type == 'T' || type == 'W' || type == 'D' || type == 'B')
> + machine->kernel_start = min(machine->kernel_start, start);
> +
> + return 0;
> +}
> +
>  int machine__get_kernel_start(struct machine *machine)
>  {
>   struct map *map = machine__kernel_map(machine);
> + char filename[PATH_MAX];
>   int err = 0;
>  
>   /*
> @@ -2687,6 +2704,7 @@ int machine__get_kernel_start(struct machine *machine)
>   machine->kernel_start = 1ULL << 63;
>   if (map) {
>   err = map__load(map);
>   /*
>* On x86_64, PTI entry trampolines are less than the
>* start of kernel text, but still above 2^63. So leave
> @@ -2695,6 +2713,16 @@ int machine__get_kernel_start(struct machine *machine)
>   if (!err && !machine__is(machine, "x86_64"))
>   machine->kernel_start = map->start;
>   }
> +
> + machine__get_kallsyms_filename(machine, filename, PATH_MAX);
> +
> + if (symbol__restricted_filename(filename, "/proc/kallsyms"))
> + goto out;
> +
> + if (kallsyms__parse(filename, machine, machine__fixup_kernel_start))
> + pr_warning("Fail to fixup kernel start address. skipping...\n");
> +
> +out:
>   return err;
>  }
> 
> Thanks,
> Leo Yan
> 



Re: [PATCH v5] perf machine: arm/arm64: Improve completeness for kernel address space

2019-08-15 Thread Leo Yan
Hi Adrian,

On Thu, Aug 15, 2019 at 02:45:57PM +0300, Adrian Hunter wrote:

[...]

> >> How come you cannot use kallsyms to get the information?
> > 
> > Thanks for pointing out this.  Sorry I skipped your comment "I don't
> > know how you intend to calculate ARM_PRE_START_SIZE" when you reviewed
> > the patch v3, I should use that chance to elaborate the detailed idea
> > and so can get more feedback/guidance before procceed.
> > 
> > Actually, I have considered to use kallsyms when worked on the previous
> > patch set.
> > 
> > As mentioned in patch set v4's cover letter, I tried to implement
> > machine__create_extra_kernel_maps() for arm/arm64, the purpose is to
> > parse kallsyms so can find more kernel maps and thus also can fixup
> > the kernel start address.  But I found the 'perf script' tool directly
> > calls machine__get_kernel_start() instead of running into the flow for
> > machine__create_extra_kernel_maps();
> 
> Doesn't it just need to loop through each kernel map to find the lowest
> start address?

Based on your suggestion, I worked out the change below and verified that
it works well on arm64 for fixing up the start address; please let me know
if the change works for you.

diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index f6ee7fbad3e4..51d78313dca1 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -2671,9 +2671,26 @@ int machine__nr_cpus_avail(struct machine *machine)
return machine ? perf_env__nr_cpus_avail(machine->env) : 0;
 }
 
+static int machine__fixup_kernel_start(void *arg,
+  const char *name __maybe_unused,
+  char type,
+  u64 start)
+{
+   struct machine *machine = arg;
+
+   type = toupper(type);
+
+   /* Fixup for text, weak, data and bss sections. */
+   if (type == 'T' || type == 'W' || type == 'D' || type == 'B')
+   machine->kernel_start = min(machine->kernel_start, start);
+
+   return 0;
+}
+
 int machine__get_kernel_start(struct machine *machine)
 {
struct map *map = machine__kernel_map(machine);
+   char filename[PATH_MAX];
int err = 0;
 
/*
@@ -2687,6 +2704,7 @@ int machine__get_kernel_start(struct machine *machine)
machine->kernel_start = 1ULL << 63;
if (map) {
err = map__load(map);
/*
 * On x86_64, PTI entry trampolines are less than the
 * start of kernel text, but still above 2^63. So leave
@@ -2695,6 +2713,16 @@ int machine__get_kernel_start(struct machine *machine)
if (!err && !machine__is(machine, "x86_64"))
machine->kernel_start = map->start;
}
+
+   machine__get_kallsyms_filename(machine, filename, PATH_MAX);
+
+   if (symbol__restricted_filename(filename, "/proc/kallsyms"))
+   goto out;
+
+   if (kallsyms__parse(filename, machine, machine__fixup_kernel_start))
+   pr_warning("Fail to fixup kernel start address. skipping...\n");
+
+out:
return err;
 }

Thanks,
Leo Yan


Re: [PATCH v5] perf machine: arm/arm64: Improve completeness for kernel address space

2019-08-15 Thread Adrian Hunter
>>> diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
>>> index f6ee7fbad3e4..e993f891bb82 100644
>>> --- a/tools/perf/util/machine.c
>>> +++ b/tools/perf/util/machine.c
>>> @@ -2687,13 +2687,26 @@ int machine__get_kernel_start(struct machine 
>>> *machine)
>>> machine->kernel_start = 1ULL << 63;
>>> if (map) {
>>> err = map__load(map);
>>> +   if (err)
>>> +   return err;
>>> +
>>> /*
>>>  * On x86_64, PTI entry trampolines are less than the
>>>  * start of kernel text, but still above 2^63. So leave
>>>  * kernel_start = 1ULL << 63 for x86_64.
>>>  */
>>> -   if (!err && !machine__is(machine, "x86_64"))
>>> +   if (!machine__is(machine, "x86_64"))
>>> machine->kernel_start = map->start;
>>> +
>>> +   /*
>>> +* On arm/arm64, the kernel uses some memory regions which are
>>> +* prior to '_stext' symbol; to reflect the complete kernel
>>> +* address space, compensate these pre-defined regions for
>>> +* kernel start address.
>>> +*/
>>> +   if (!strcmp(perf_env__arch(machine->env), "arm") ||
>>> +   !strcmp(perf_env__arch(machine->env), "arm64"))
>>> +   machine->kernel_start -= ARM_PRE_START_SIZE;
>>> }
>>> return err;
>>>  }
>>>
>>
> 



Re: [PATCH v5] perf machine: arm/arm64: Improve completeness for kernel address space

2019-08-15 Thread Leo Yan
Hi Adrian,

On Thu, Aug 15, 2019 at 11:54:54AM +0300, Adrian Hunter wrote:

[...]

> > diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
> > index e4988f49ea79..d7ff839d8b20 100644
> > --- a/tools/perf/Makefile.config
> > +++ b/tools/perf/Makefile.config
> > @@ -48,9 +48,20 @@ ifeq ($(SRCARCH),x86)
> >NO_PERF_REGS := 0
> >  endif
> >  
> > +ARM_PRE_START_SIZE := 0
> > +
> >  ifeq ($(SRCARCH),arm)
> >NO_PERF_REGS := 0
> >LIBUNWIND_LIBS = -lunwind -lunwind-arm
> > +  ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/kernel/vmlinux.lds),)
> > +# Extract info from lds:
> > +#   . = ((0xC000)) + 0x00208000;
> > +# ARM_PRE_START_SIZE := 0x00208000
> > +ARM_PRE_START_SIZE := $(shell egrep ' \. \= \({2}0x[0-9a-fA-F]+\){2}' \
> > +  $(srctree)/arch/$(SRCARCH)/kernel/vmlinux.lds | \
> > +  sed -e 's/[(|)|.|=|+|<|;|-]//g' -e 's/ \+/ /g' -e 's/^[ \t]*//' | \
> > +  awk -F' ' '{printf "0x%x", $$2}' 2>/dev/null)
> > +  endif
> >  endif
> >  
> >  ifeq ($(SRCARCH),arm64)
> > @@ -58,8 +69,19 @@ ifeq ($(SRCARCH),arm64)
> >NO_SYSCALL_TABLE := 0
> >CFLAGS += -I$(OUTPUT)arch/arm64/include/generated
> >LIBUNWIND_LIBS = -lunwind -lunwind-aarch64
> > +  ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/kernel/vmlinux.lds),)
> > +# Extract info from lds:
> > +#  . = 0x)) - (((1)) << (48)) + 1) + (0)) + 
> > (0x0800))) + (0x0800))) + 0x0008;
> > +# ARM_PRE_START_SIZE := (0x0800 + 0x0800 + 0x0008) = 
> > 0x1008
> > +ARM_PRE_START_SIZE := $(shell egrep ' \. \= \({8}0x[0-9a-fA-F]+\){2}' \
> > +  $(srctree)/arch/$(SRCARCH)/kernel/vmlinux.lds | \
> > +  sed -e 's/[(|)|.|=|+|<|;|-]//g' -e 's/ \+/ /g' -e 's/^[ \t]*//' | \
> > +  awk -F' ' '{printf "0x%x", $$6+$$7+$$8}' 2>/dev/null)
> > +  endif
> 
> So, that is not going to work if you take a perf.data file to a non-arm 
> machine?

Yeah, this patch only allows perf to work correctly when perf runs
natively on arm/arm64, so it resolves only part of the issue.

> How come you cannot use kallsyms to get the information?

Thanks for pointing this out.  Sorry I skipped your comment "I don't
know how you intend to calculate ARM_PRE_START_SIZE" when you reviewed
the patch v3; I should have used that chance to elaborate the detailed
idea and get more feedback/guidance before proceeding.

Actually, I had considered using kallsyms when working on the previous
patch set.

As mentioned in patch set v4's cover letter, I tried to implement
machine__create_extra_kernel_maps() for arm/arm64; the purpose was to
parse kallsyms so it could find more kernel maps and thus also fix up
the kernel start address.  But I found that the 'perf script' tool
directly calls machine__get_kernel_start() instead of going through
machine__create_extra_kernel_maps(); so I finally gave up on using
machine__create_extra_kernel_maps() for tweaking the kernel start
address and went back to this patch's approach of parsing the lds files.

So for the next step, I would like to get some guidance:

- One method is to add a new weak function, e.g.
  arch__fix_kernel_text_start(), so that every arch can implement its own
  function to fix up the kernel start address;

  For arm/arm64, it can use kallsyms to find the symbol with the lowest
  address and fix up the kernel start address accordingly.

- Another method is to directly parse kallsyms in the function
  machine__get_kernel_start(), so that the change can be used for all
  archs;

It seems to me the second method addresses this as a common issue across
all archs.  But I am not sure whether this is a requirement for all archs
or only for arm/arm64.  Please let me know your preference or any other
thoughts.  Thanks a lot!

Leo.

> >  endif
> >  
> > +CFLAGS += -DARM_PRE_START_SIZE=$(ARM_PRE_START_SIZE)
> > +
> >  ifeq ($(SRCARCH),csky)
> >NO_PERF_REGS := 0
> >  endif
> > diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
> > index f6ee7fbad3e4..e993f891bb82 100644
> > --- a/tools/perf/util/machine.c
> > +++ b/tools/perf/util/machine.c
> > @@ -2687,13 +2687,26 @@ int machine__get_kernel_start(struct machine 
> > *machine)
> > machine->kernel_start = 1ULL << 63;
> > if (map) {
> > err = map__load(map);
> > +   if (err)
> > +   return err;
> > +
> >     /*
> >      * On x86_64, PTI entry tr

Re: [PATCH v5] perf machine: arm/arm64: Improve completeness for kernel address space

2019-08-15 Thread Adrian Hunter
> diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config
> index e4988f49ea79..d7ff839d8b20 100644
> --- a/tools/perf/Makefile.config
> +++ b/tools/perf/Makefile.config
> @@ -48,9 +48,20 @@ ifeq ($(SRCARCH),x86)
>NO_PERF_REGS := 0
>  endif
>  
> +ARM_PRE_START_SIZE := 0
> +
>  ifeq ($(SRCARCH),arm)
>NO_PERF_REGS := 0
>LIBUNWIND_LIBS = -lunwind -lunwind-arm
> +  ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/kernel/vmlinux.lds),)
> +# Extract info from lds:
> +#   . = ((0xC000)) + 0x00208000;
> +# ARM_PRE_START_SIZE := 0x00208000
> +ARM_PRE_START_SIZE := $(shell egrep ' \. \= \({2}0x[0-9a-fA-F]+\){2}' \
> +  $(srctree)/arch/$(SRCARCH)/kernel/vmlinux.lds | \
> +  sed -e 's/[(|)|.|=|+|<|;|-]//g' -e 's/ \+/ /g' -e 's/^[ \t]*//' | \
> +  awk -F' ' '{printf "0x%x", $$2}' 2>/dev/null)
> +  endif
>  endif
>  
>  ifeq ($(SRCARCH),arm64)
> @@ -58,8 +69,19 @@ ifeq ($(SRCARCH),arm64)
>NO_SYSCALL_TABLE := 0
>CFLAGS += -I$(OUTPUT)arch/arm64/include/generated
>LIBUNWIND_LIBS = -lunwind -lunwind-aarch64
> +  ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/kernel/vmlinux.lds),)
> +# Extract info from lds:
> +#  . = 0x)) - (((1)) << (48)) + 1) + (0)) + 
> (0x0800))) + (0x0800))) + 0x0008;
> +# ARM_PRE_START_SIZE := (0x0800 + 0x0800 + 0x0008) = 
> 0x1008
> +ARM_PRE_START_SIZE := $(shell egrep ' \. \= \({8}0x[0-9a-fA-F]+\){2}' \
> +  $(srctree)/arch/$(SRCARCH)/kernel/vmlinux.lds | \
> +  sed -e 's/[(|)|.|=|+|<|;|-]//g' -e 's/ \+/ /g' -e 's/^[ \t]*//' | \
> +  awk -F' ' '{printf "0x%x", $$6+$$7+$$8}' 2>/dev/null)
> +  endif

So, that is not going to work if you take a perf.data file to a non-arm machine?

How come you cannot use kallsyms to get the information?

>  endif
>  
> +CFLAGS += -DARM_PRE_START_SIZE=$(ARM_PRE_START_SIZE)
> +
>  ifeq ($(SRCARCH),csky)
>NO_PERF_REGS := 0
>  endif
> diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
> index f6ee7fbad3e4..e993f891bb82 100644
> --- a/tools/perf/util/machine.c
> +++ b/tools/perf/util/machine.c
> @@ -2687,13 +2687,26 @@ int machine__get_kernel_start(struct machine *machine)
>   machine->kernel_start = 1ULL << 63;
>   if (map) {
>   err = map__load(map);
> + if (err)
> + return err;
> +
>   /*
>* On x86_64, PTI entry trampolines are less than the
>* start of kernel text, but still above 2^63. So leave
>* kernel_start = 1ULL << 63 for x86_64.
>*/
> - if (!err && !machine__is(machine, "x86_64"))
> + if (!machine__is(machine, "x86_64"))
>   machine->kernel_start = map->start;
> +
> + /*
> +  * On arm/arm64, the kernel uses some memory regions which are
> +  * prior to '_stext' symbol; to reflect the complete kernel
> +  * address space, compensate these pre-defined regions for
> +  * kernel start address.
> +  */
> + if (!strcmp(perf_env__arch(machine->env), "arm") ||
> + !strcmp(perf_env__arch(machine->env), "arm64"))
> + machine->kernel_start -= ARM_PRE_START_SIZE;
>   }
>   return err;
>  }
> 



[PATCH v5] perf machine: arm/arm64: Improve completeness for kernel address space

2019-08-15 Thread Leo Yan
+  $(srctree)/arch/$(SRCARCH)/kernel/vmlinux.lds | \
+  sed -e 's/[(|)|.|=|+|<|;|-]//g' -e 's/ \+/ /g' -e 's/^[ \t]*//' | \
+  awk -F' ' '{printf "0x%x", $$2}' 2>/dev/null)
+  endif
 endif
 
 ifeq ($(SRCARCH),arm64)
@@ -58,8 +69,19 @@ ifeq ($(SRCARCH),arm64)
   NO_SYSCALL_TABLE := 0
   CFLAGS += -I$(OUTPUT)arch/arm64/include/generated
   LIBUNWIND_LIBS = -lunwind -lunwind-aarch64
+  ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/kernel/vmlinux.lds),)
+# Extract info from lds:
+#  . = 0x)) - (((1)) << (48)) + 1) + (0)) + 
(0x0800))) + (0x0800))) + 0x0008;
+# ARM_PRE_START_SIZE := (0x0800 + 0x0800 + 0x0008) = 0x1008
+ARM_PRE_START_SIZE := $(shell egrep ' \. \= \({8}0x[0-9a-fA-F]+\){2}' \
+  $(srctree)/arch/$(SRCARCH)/kernel/vmlinux.lds | \
+  sed -e 's/[(|)|.|=|+|<|;|-]//g' -e 's/ \+/ /g' -e 's/^[ \t]*//' | \
+  awk -F' ' '{printf "0x%x", $$6+$$7+$$8}' 2>/dev/null)
+  endif
 endif
 
+CFLAGS += -DARM_PRE_START_SIZE=$(ARM_PRE_START_SIZE)
+
 ifeq ($(SRCARCH),csky)
   NO_PERF_REGS := 0
 endif
diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c
index f6ee7fbad3e4..e993f891bb82 100644
--- a/tools/perf/util/machine.c
+++ b/tools/perf/util/machine.c
@@ -2687,13 +2687,26 @@ int machine__get_kernel_start(struct machine *machine)
machine->kernel_start = 1ULL << 63;
if (map) {
err = map__load(map);
+   if (err)
+   return err;
+
/*
 * On x86_64, PTI entry trampolines are less than the
 * start of kernel text, but still above 2^63. So leave
 * kernel_start = 1ULL << 63 for x86_64.
 */
-   if (!err && !machine__is(machine, "x86_64"))
+   if (!machine__is(machine, "x86_64"))
machine->kernel_start = map->start;
+
+   /*
+* On arm/arm64, the kernel uses some memory regions which are
+* prior to '_stext' symbol; to reflect the complete kernel
+* address space, compensate these pre-defined regions for
+* kernel start address.
+*/
+   if (!strcmp(perf_env__arch(machine->env), "arm") ||
+   !strcmp(perf_env__arch(machine->env), "arm64"))
+   machine->kernel_start -= ARM_PRE_START_SIZE;
}
return err;
 }
-- 
2.17.1



[PATCH v4 2/2] perf machine: arm/arm64: Improve completeness for kernel address space

2019-08-10 Thread Leo Yan
+#   . = ((0xC000)) + 0x00208000;
+# PRE_START_SIZE := 0x00208000
+PRE_START_SIZE := $(shell egrep ' \. \= \({2}0x[0-9a-fA-F]+\){2}' \
+  $(srctree)/arch/$(SRCARCH)/kernel/vmlinux.lds | \
+  sed -e 's/[(|)|.|=|+|<|;|-]//g' -e 's/ \+/ /g' -e 's/^[ \t]*//' | \
+  awk -F' ' '{printf "0x%x", $$2}' 2>/dev/null)
+  endif
+  CFLAGS += -DARM_PRE_START_SIZE=$(PRE_START_SIZE)
 endif
 
 ifeq ($(SRCARCH),arm64)
@@ -58,6 +69,17 @@ ifeq ($(SRCARCH),arm64)
   NO_SYSCALL_TABLE := 0
   CFLAGS += -I$(OUTPUT)arch/arm64/include/generated
   LIBUNWIND_LIBS = -lunwind -lunwind-aarch64
+  PRE_START_SIZE := 0
+  ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/kernel/vmlinux.lds),)
+# Extract info from lds:
+#  . = 0x)) - (((1)) << (48)) + 1) + (0)) + 
(0x0800))) + (0x0800))) + 0x0008;
+# PRE_START_SIZE := (0x0800 + 0x0800 + 0x0008) = 0x1008
+PRE_START_SIZE := $(shell egrep ' \. \= \({8}0x[0-9a-fA-F]+\){2}' \
+  $(srctree)/arch/$(SRCARCH)/kernel/vmlinux.lds | \
+  sed -e 's/[(|)|.|=|+|<|;|-]//g' -e 's/ \+/ /g' -e 's/^[ \t]*//' | \
+  awk -F' ' '{printf "0x%x", $$6+$$7+$$8}' 2>/dev/null)
+  endif
+  CFLAGS += -DARM_PRE_START_SIZE=$(PRE_START_SIZE)
 endif
 
 ifeq ($(SRCARCH),csky)
diff --git a/tools/perf/arch/arm/util/Build b/tools/perf/arch/arm/util/Build
index 296f0eac5e18..efa6b768218a 100644
--- a/tools/perf/arch/arm/util/Build
+++ b/tools/perf/arch/arm/util/Build
@@ -1,3 +1,5 @@
+perf-y += machine.o
+
 perf-$(CONFIG_DWARF) += dwarf-regs.o
 
 perf-$(CONFIG_LOCAL_LIBUNWIND)+= unwind-libunwind.o
diff --git a/tools/perf/arch/arm/util/machine.c 
b/tools/perf/arch/arm/util/machine.c
new file mode 100644
index ..db172894e4ea
--- /dev/null
+++ b/tools/perf/arch/arm/util/machine.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+
+#include "../../util/machine.h"
+
+void arch__fix_kernel_text_start(u64 *start)
+{
+   /*
+* On arm, the 16MB virtual memory space prior to 'kernel_start' is
+* allocated to device modules, a PMD table if CONFIG_HIGHMEM is
+* enabled and a PGD table.  To reflect the complete kernel address
+* space, compensate the pre-defined regions for kernel start address.
+*/
+   *start = *start - ARM_PRE_START_SIZE;
+}
diff --git a/tools/perf/arch/arm64/util/Build b/tools/perf/arch/arm64/util/Build
index 3cde540d2fcf..8081fb8a7b3d 100644
--- a/tools/perf/arch/arm64/util/Build
+++ b/tools/perf/arch/arm64/util/Build
@@ -1,4 +1,5 @@
 perf-y += header.o
+perf-y += machine.o
 perf-y += sym-handling.o
 perf-$(CONFIG_DWARF) += dwarf-regs.o
 perf-$(CONFIG_LOCAL_LIBUNWIND) += unwind-libunwind.o
diff --git a/tools/perf/arch/arm64/util/machine.c 
b/tools/perf/arch/arm64/util/machine.c
new file mode 100644
index ..61058dca8c5a
--- /dev/null
+++ b/tools/perf/arch/arm64/util/machine.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+
+#include "../../util/machine.h"
+
+void arch__fix_kernel_text_start(u64 *start)
+{
+   /*
+* On arm64, the root PGD table, device module memory region and
+* BPF jit region are prior to 'kernel_start'.  To reflect the
+* complete kernel address space, compensate these pre-defined
+* regions for kernel start address.
+*/
+   *start = *start - ARM_PRE_START_SIZE;
+}
-- 
2.17.1



[PATCH v4 0/2] perf: arm/arm64: Improve completeness for kernel address space

2019-08-10 Thread Leo Yan
This patch set improves the completeness of the kernel address space for
arm/arm64; it adds architecture-specific tweaking of the kernel start
address so that the memory regions prior to the '_stext' symbol are
included.  With this change, eBPF programs can be parsed properly on
arm64.

This patch set is a follow-up to the old patch "perf cs-etm: Improve
completeness for kernel address space" [1]; the old patch only fixed the
issue for the CoreSight ETM event, but the kernel address space issue is
not limited to CoreSight: it is a common issue for other events as well
(e.g. PMU events, clock events) when profiling eBPF programs.  So this
patch set tries to resolve it as a common issue for the arm/arm64 archs.

When implementing the related code, I tried to use the API
machine__create_extra_kernel_maps(); but I found that the 'perf script'
tool directly calls machine__get_kernel_start() instead of going through
machine__create_extra_kernel_maps(); this is the reason I do not use
machine__create_extra_kernel_maps() for tweaking the kernel start address
and instead refactor machine__get_kernel_start().

If there is a better method to resolve this issue, any suggestions and
comments are very welcome!

[1] https://lkml.org/lkml/2019/6/19/1057


Leo Yan (2):
  perf machine: Support arch's specific kernel start address
  perf machine: arm/arm64: Improve completeness for kernel address space

 tools/perf/Makefile.config   | 22 ++
 tools/perf/arch/arm/util/Build   |  2 ++
 tools/perf/arch/arm/util/machine.c   | 17 +
 tools/perf/arch/arm64/util/Build |  1 +
 tools/perf/arch/arm64/util/machine.c | 17 +
 tools/perf/arch/x86/util/machine.c   | 10 ++
 tools/perf/util/machine.c| 13 +++--
 tools/perf/util/machine.h|  2 ++
 8 files changed, 78 insertions(+), 6 deletions(-)
 create mode 100644 tools/perf/arch/arm/util/machine.c
 create mode 100644 tools/perf/arch/arm64/util/machine.c

-- 
2.17.1



Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-31 Thread Dario Faggioli
Hello all,

I know this is a bit of an old thread, so apologies for being late to
the party. :-)

I would have a question about this:

> > > On 7/12/19 2:36 PM, Peter Zijlstra wrote:
> > > > On Fri, Jul 12, 2019 at 02:17:20PM +0200, Alexandre Chartre
> > > > wrote:
> > > > > On 7/12/19 1:44 PM, Peter Zijlstra wrote:
> > > > > > AFAIK3 this wants/needs to be combined with core-scheduling 
> > > > > > to be
> > > > > > useful, but not a single mention of that is anywhere.
> > > > > 
> > > > > No. This is actually an alternative to core-scheduling.
> > > > > Eventually, ASI
> > > > > will kick all sibling hyperthreads when exiting isolation and
> > > > > it needs to
> > > > > run with the full kernel page-table (note that's currently
> > > > > not in these
> > > > > patches).
> 
I.e., about the fact that ASI is presented as an alternative to
core-scheduling or, at least, as it will only need to integrate a small
subset of the logic (and of the code) from core-scheduling, as said
here:

> I haven't looked at details about what has been done so far.
> Hopefully, we
> can do something not too complex, or reuse a (small) part of co-
> scheduling.
> 
Now, sticking to virtualization examples, if you don't have core-
scheduling, it means that you can have two vcpus, one from VM A and the
other from VM B, running on the same core, one on thread 0 and the
other one on thread 1, at the same time.

And if VM A's vcpu, running on thread 0, exits, then VM B's vcpu
running in guest mode on thread 1 can read host memory, as it is
speculatively accessed (either "normally" or because of cache load
gadgets) and brought in L1D cache by thread 0. And indeed I do see how
ASI protects us from this attack scenario.

However, when the two VMs' vcpus are both running in guest mode, each
one on a thread of the same core, VM B's vcpu running on thread 1 can
exploit L1TF to peek at and steal secrets that VM A's vcpu, running on
thread 0, is accessing, as they're brought into L1D cache... can't it? 

How can, ASI *without* core-scheduling, prevent this other attack
scenario?

Because I may very well be missing something, but it looks to me that
it can't. In which case, I'm not sure we can call it "alternative" to
core-scheduling. Or is the second attack scenario that I tried to
describe above, not considered interesting?

Thanks and Regards
-- 
Dario Faggioli, Ph.D
http://about.me/dario.faggioli
Virtualization Software Engineer
SUSE Labs, SUSE https://www.suse.com/





Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-15 Thread Peter Zijlstra
On Sun, Jul 14, 2019 at 08:06:12AM -0700, Andy Lutomirski wrote:
> On Fri, Jul 12, 2019 at 12:06 PM Peter Zijlstra  wrote:
> >
> > On Fri, Jul 12, 2019 at 06:37:47PM +0200, Alexandre Chartre wrote:
> > > On 7/12/19 5:16 PM, Thomas Gleixner wrote:
> >
> > > > Right. If we decide to expose more parts of the kernel mappings then 
> > > > that's
> > > > just adding more stuff to the existing user (PTI) map mechanics.
> > >
> > > If we expose more parts of the kernel mapping by adding them to the 
> > > existing
> > > user (PTI) map, then we only control the mapping of kernel sensitive data 
> > > but
> > > we don't control user mapping (with ASI, we exclude all user mappings).
> > >
> > > How would you control the mapping of userland sensitive data and exclude 
> > > them
> > > from the user map? Would you have the application explicitly identify 
> > > sensitive
> > > data (like Andy suggested with a /dev/xpfo device)?
> >
> > To what purpose do you want to exclude userspace from the kernel
> > mapping; that is, what are you mitigating against with that?
> 
> Mutually distrusting user/guest tenants.  Imagine an attack against a
> VM hosting provider (GCE, for example).  If the overall system is
> well-designed, the host kernel won't possess secrets that are
> important to the overall hosting network.  The interesting secrets are
> in the memory of other tenants running under the same host.  So, if we
> can mostly or completely avoid mapping one tenant's memory in the
> host, we reduce the amount of valuable information that could leak via
> a speculation (or wild read) attack to another tenant.
> 
> The practicality of such a scheme is obviously an open question.

Ah, ok. So it's some virt specific nonsense. I'll go on ignoring it then
;-)


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-15 Thread Thomas Gleixner
Alexandre,

On Mon, 15 Jul 2019, Alexandre Chartre wrote:
> On 7/12/19 9:48 PM, Thomas Gleixner wrote:
> > As I said before, come up with a list of possible usage scenarios and
> > protection scopes first and please take all the ideas other people have
> > with this into account. This includes PTI of course.
> > 
> > Once we have that we need to figure out whether these things can actually
> > coexist and do not contradict each other at the semantical level and
> > whether the outcome justifies the resulting complexity.
> > 
> > After that we can talk about implementation details.
> 
> Right, that makes perfect sense. I think so far we have the following
> scenarios:
> 
>  - PTI
>  - KVM (i.e. VMExit handler isolation)
>  - maybe some syscall isolation?

Vs. the latter you want to talk to Paul Turner. He had some ideas there.

> I will look at them in more details, in particular what particular
> mappings they need and when they need to switch mappings.
> 
> And thanks for putting me back on the right track.

That's what maintainers are for :)

Thanks,

tglx


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-15 Thread Alexandre Chartre




On 7/12/19 9:48 PM, Thomas Gleixner wrote:

On Fri, 12 Jul 2019, Alexandre Chartre wrote:

On 7/12/19 5:16 PM, Thomas Gleixner wrote:

On Fri, 12 Jul 2019, Peter Zijlstra wrote:

On Fri, Jul 12, 2019 at 01:56:44PM +0200, Alexandre Chartre wrote:
And then we've fully replaced PTI.

So no, they're not orthogonal.


Right. If we decide to expose more parts of the kernel mappings then that's
just adding more stuff to the existing user (PTI) map mechanics.
  
If we expose more parts of the kernel mapping by adding them to the existing

user (PTI) map, then we only control the mapping of kernel sensitive data but
we don't control user mapping (with ASI, we exclude all user mappings).


What prevents you from adding functionality to do so to the PTI
implementation? Nothing.

Again, the underlying concept is exactly the same:

   1) Create a restricted mapping from an existing mapping

   2) Switch to the restricted mapping when entering a particular execution
  context

   3) Switch to the unrestricted mapping when leaving that execution context

   4) Keep track of the state

The restriction scope is different, but that's conceptually completely
irrelevant. It's a detail which needs to be handled at the implementation
level.

What matters here is the concept and because the concept is the same, this
needs to share the infrastructure for #1 - #4.



You are totally right, that's the same concept (page-table creation and
switching), it is just used in different contexts. Sorry it took me that
long to realize it, I was too focused on the use case.



It's obvious that this requires changes to the way PTI works today, but
anything which creates a parallel implementation of any part of the above
#1 - #4 is not going anywhere.

This stuff is way too sensitive and has pretty well understood limitations
and corner cases. So it needs to be designed from the ground up to handle these
properly. Which also means that the possible use cases are going to be
limited.

As I said before, come up with a list of possible usage scenarios and
protection scopes first and please take all the ideas other people have
with this into account. This includes PTI of course.

Once we have that we need to figure out whether these things can actually
coexist and do not contradict each other at the semantical level and
whether the outcome justifies the resulting complexity.

After that we can talk about implementation details.


Right, that makes perfect sense. I think so far we have the following scenarios:

 - PTI
 - KVM (i.e. VMExit handler isolation)
 - maybe some syscall isolation?

I will look at them in more details, in particular what particular mappings they
need and when they need to switch mappings.


And thanks for putting me back on the right track.


alex.


This problem is not going to be solved with handwaving and an ad hoc
implementation which creates more problems than it solves.

Thanks,

tglx



Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-14 Thread Alexander Graf




On 12.07.19 16:36, Andy Lutomirski wrote:

On Fri, Jul 12, 2019 at 6:45 AM Alexandre Chartre
 wrote:



On 7/12/19 2:50 PM, Peter Zijlstra wrote:

On Fri, Jul 12, 2019 at 01:56:44PM +0200, Alexandre Chartre wrote:


I think that's precisely what makes ASI and PTI different and independent.
PTI is just about switching between userland and kernel page-tables, while
ASI is about switching page-table inside the kernel. You can have ASI without
having PTI. You can also use ASI for kernel threads so for code that won't
be triggered from userland and so which won't involve PTI.


PTI is not mapping kernel space to avoid speculation crap 
(meltdown).
ASI is not mapping part of kernel space to avoid (different) speculation crap 
(MDS).

See how very similar they are?


Furthermore, to recover SMT for userspace (under MDS) we not only need
core-scheduling but core-scheduling per address space. And ASI was
specifically designed to help mitigate the trainwreck just described.

By explicitly exposing (hopefully harmless) part of the kernel to MDS,
we reduce the part that needs core-scheduling and thus reduce the rate
the SMT siblings need to sync up/schedule.

But looking at it that way, it makes no sense to retain 3 address
spaces, namely:

user / kernel exposed / kernel private.

Specifically, it makes no sense to expose part of the kernel through MDS
but not through Meltdown. Therefore we can merge the user and kernel
exposed address spaces.


The goal of ASI is to provide a reduced address space which exclude sensitive
data. A user process (for example a database daemon, a web server, or a vmm
like qemu) will likely have sensitive data mapped in its user address space.
Such data shouldn't be mapped with ASI because it can potentially leak to the
sibling hyperthread. For example, if an hyperthread is running a VM then the
VM could potentially access user sensitive data if they are mapped on the
sibling hyperthread with ASI.


So I've proposed the following slightly hackish thing:

Add a mechanism (call it /dev/xpfo).  When you open /dev/xpfo and
fallocate it to some size, you allocate that amount of memory and kick
it out of the kernel direct map.  (And pay the IPI cost unless there
were already cached non-direct-mapped pages ready.)  Then you map
*that* into your VMs.  Now, for a dedicated VM host, you map *all* the
VM private memory from /dev/xpfo.  Pretend it's SEV if you want to
determine which pages can be set up like this.

Does this get enough of the benefit at a negligible fraction of the
code complexity cost?  (This plus core scheduling, anyway.)
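
(For reference, a purely hypothetical sketch of that proposed userspace
flow; neither /dev/xpfo nor these semantics exist in the kernel, the
code below only illustrates the idea of allocating guest memory that is
removed from the kernel direct map.)

#define _GNU_SOURCE
#include <fcntl.h>
#include <stddef.h>
#include <sys/mman.h>
#include <unistd.h>

static void *alloc_guest_private(size_t size)
{
	int fd = open("/dev/xpfo", O_RDWR);	/* hypothetical device */
	void *mem;

	if (fd < 0)
		return NULL;

	/* Proposed semantics: backing pages leave the kernel direct map. */
	if (fallocate(fd, 0, 0, size)) {
		close(fd);
		return NULL;
	}

	mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	close(fd);

	/* On success, this memory would be handed to the VM as guest RAM. */
	return mem == MAP_FAILED ? NULL : mem;
}
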


The problem with that approach is that you lose the ability to run 
legacy workloads that do not support an SEV-like model of "guest owned"
and "host visible" pages, but instead assume you can DMA anywhere.


Without that, your host will have visibility into guest pages via user 
space (QEMU) pages, which again are mapped in the kernel direct map and so
can be exposed via a Spectre gadget to a malicious guest.


Also, please keep in mind that even register state of other VMs may be a 
secret that we do not want to leak into other guests.



Alex


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-14 Thread Mike Rapoport
On Fri, Jul 12, 2019 at 10:45:06AM -0600, Andy Lutomirski wrote:
> 
> 
> > On Jul 12, 2019, at 10:37 AM, Alexandre Chartre 
> >  wrote:
> > 
> > 
> > 
> >> On 7/12/19 5:16 PM, Thomas Gleixner wrote:
> >>> On Fri, 12 Jul 2019, Peter Zijlstra wrote:
>  On Fri, Jul 12, 2019 at 01:56:44PM +0200, Alexandre Chartre wrote:
>  
>  I think that's precisely what makes ASI and PTI different and 
>  independent.
>  PTI is just about switching between userland and kernel page-tables, 
>  while
>  ASI is about switching page-table inside the kernel. You can have ASI 
>  without
>  having PTI. You can also use ASI for kernel threads so for code that 
>  won't
>  be triggered from userland and so which won't involve PTI.
> >>> 
> >>> PTI is not mapping kernel space to avoid speculation 
> >>> crap (meltdown).
> >>> ASI is not mapping part of kernel space to avoid (different) speculation 
> >>> crap (MDS).
> >>> 
> >>> See how very similar they are?
> >>> 
> >>> Furthermore, to recover SMT for userspace (under MDS) we not only need
> >>> core-scheduling but core-scheduling per address space. And ASI was
> >>> specifically designed to help mitigate the trainwreck just described.
> >>> 
> >>> By explicitly exposing (hopefully harmless) part of the kernel to MDS,
> >>> we reduce the part that needs core-scheduling and thus reduce the rate
> >>> the SMT siblngs need to sync up/schedule.
> >>> 
> >>> But looking at it that way, it makes no sense to retain 3 address
> >>> spaces, namely:
> >>> 
> >>>   user / kernel exposed / kernel private.
> >>> 
> >>> Specifically, it makes no sense to expose part of the kernel through MDS
> >>> but not through Meltdow. Therefore we can merge the user and kernel
> >>> exposed address spaces.
> >>> 
> >>> And then we've fully replaced PTI.
> >>> 
> >>> So no, they're not orthogonal.
> >> Right. If we decide to expose more parts of the kernel mappings then that's
> >> just adding more stuff to the existing user (PTI) map mechanics.
> > 
> > If we expose more parts of the kernel mapping by adding them to the existing
> > user (PTI) map, then we only control the mapping of kernel sensitive data 
> > but
> > we don't control user mapping (with ASI, we exclude all user mappings).
> > 
> > How would you control the mapping of userland sensitive data and exclude 
> > them
> > from the user map?
> 
> As I see it, if we think part of the kernel is okay to leak to VM guests,
> then it should think it’s okay to leak to userspace and versa. At the end
> of the day, this may just have to come down to an administrator’s choice
> of how careful the mitigations need to be.
> 
> > Would you have the application explicitly identify sensitive
> > data (like Andy suggested with a /dev/xpfo device)?
> 
> That’s not really the intent of my suggestion. I was suggesting that
> maybe we don’t need ASI at all if we allow VMs to exclude their memory
> from the kernel mapping entirely.  Heck, in a setup like this, we can
> maybe even get away with turning PTI off under very, very controlled
> circumstances.  I’m not quite sure what to do about the kernel random
> pools, though.

I think KVM already allows excluding VMs memory from the kernel mapping
with the "new guest mapping interface" [1]. The memory managed by the host
can be restricted with "mem=" and KVM maps/unmaps the guest memory pages
only when needed.

It would be interesting to see if /dev/xpfo or even
madvise(MAKE_MY_MEMORY_PRIVATE) can be made useful for multi-tenant
container hosts.

[1] 
https://lore.kernel.org/lkml/1548966284-28642-1-git-send-email-karah...@amazon.de/

-- 
Sincerely yours,
Mike.



Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-14 Thread Andy Lutomirski
On Fri, Jul 12, 2019 at 12:06 PM Peter Zijlstra  wrote:
>
> On Fri, Jul 12, 2019 at 06:37:47PM +0200, Alexandre Chartre wrote:
> > On 7/12/19 5:16 PM, Thomas Gleixner wrote:
>
> > > Right. If we decide to expose more parts of the kernel mappings then 
> > > that's
> > > just adding more stuff to the existing user (PTI) map mechanics.
> >
> > If we expose more parts of the kernel mapping by adding them to the existing
> > user (PTI) map, then we only control the mapping of kernel sensitive data 
> > but
> > we don't control user mapping (with ASI, we exclude all user mappings).
> >
> > How would you control the mapping of userland sensitive data and exclude 
> > them
> > from the user map? Would you have the application explicitly identify 
> > sensitive
> > data (like Andy suggested with a /dev/xpfo device)?
>
> To what purpose do you want to exclude userspace from the kernel
> mapping; that is, what are you mitigating against with that?

Mutually distrusting user/guest tenants.  Imagine an attack against a
VM hosting provider (GCE, for example).  If the overall system is
well-designed, the host kernel won't possess secrets that are
important to the overall hosting network.  The interesting secrets are
in the memory of other tenants running under the same host.  So, if we
can mostly or completely avoid mapping one tenant's memory in the
host, we reduce the amount of valuable information that could leak via
a speculation (or wild read) attack to another tenant.

The practicality of such a scheme is obviously an open question.


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Thomas Gleixner
On Fri, 12 Jul 2019, Alexandre Chartre wrote:
> On 7/12/19 5:16 PM, Thomas Gleixner wrote:
> > On Fri, 12 Jul 2019, Peter Zijlstra wrote:
> > > On Fri, Jul 12, 2019 at 01:56:44PM +0200, Alexandre Chartre wrote:
> > > And then we've fully replaced PTI.
> > > 
> > > So no, they're not orthogonal.
> > 
> > Right. If we decide to expose more parts of the kernel mappings then that's
> > just adding more stuff to the existing user (PTI) map mechanics.
>  
> If we expose more parts of the kernel mapping by adding them to the existing
> user (PTI) map, then we only control the mapping of kernel sensitive data but
> we don't control user mapping (with ASI, we exclude all user mappings).

What prevents you from adding functionality to do so to the PTI
implementation? Nothing.

Again, the underlying concept is exactly the same:

  1) Create a restricted mapping from an existing mapping

  2) Switch to the restricted mapping when entering a particular execution
 context

  3) Switch to the unrestricted mapping when leaving that execution context

  4) Keep track of the state

The restriction scope is different, but that's conceptually completely
irrelevant. It's a detail which needs to be handled at the implementation
level.

What matters here is the concept and because the concept is the same, this
needs to share the infrastructure for #1 - #4.

It's obvious that this requires changes to the way PTI works today, but
anything which creates a parallel implementation of any part of the above
#1 - #4 is not going anywhere.

This stuff is way too sensitive and has pretty well understood limitations
and corner cases. So it needs to be designed from the ground up to handle these
properly. Which also means that the possible use cases are going to be
limited.

As I said before, come up with a list of possible usage scenarios and
protection scopes first and please take all the ideas other people have
with this into account. This includes PTI of course.

Once we have that we need to figure out whether these things can actually
coexist and do not contradict each other at the semantical level and
whether the outcome justifies the resulting complexity.

After that we can talk about implementation details.

This problem is not going to be solved with handwaving and an ad hoc
implementation which creates more problems than it solves.

Thanks,

tglx


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Peter Zijlstra
On Fri, Jul 12, 2019 at 06:37:47PM +0200, Alexandre Chartre wrote:
> On 7/12/19 5:16 PM, Thomas Gleixner wrote:

> > Right. If we decide to expose more parts of the kernel mappings then that's
> > just adding more stuff to the existing user (PTI) map mechanics.
> 
> If we expose more parts of the kernel mapping by adding them to the existing
> user (PTI) map, then we only control the mapping of kernel sensitive data but
> we don't control user mapping (with ASI, we exclude all user mappings).
> 
> How would you control the mapping of userland sensitive data and exclude them
> from the user map? Would you have the application explicitly identify 
> sensitive
> data (like Andy suggested with a /dev/xpfo device)?

To what purpose do you want to exclude userspace from the kernel
mapping; that is, what are you mitigating against with that?


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Andy Lutomirski



> On Jul 12, 2019, at 10:37 AM, Alexandre Chartre 
>  wrote:
> 
> 
> 
>> On 7/12/19 5:16 PM, Thomas Gleixner wrote:
>>> On Fri, 12 Jul 2019, Peter Zijlstra wrote:
 On Fri, Jul 12, 2019 at 01:56:44PM +0200, Alexandre Chartre wrote:
 
 I think that's precisely what makes ASI and PTI different and independent.
 PTI is just about switching between userland and kernel page-tables, while
 ASI is about switching page-table inside the kernel. You can have ASI 
 without
 having PTI. You can also use ASI for kernel threads so for code that won't
 be triggered from userland and so which won't involve PTI.
>>> 
>>> PTI is not mapping kernel space to avoid speculation 
>>> crap (meltdown).
>>> ASI is not mapping part of kernel space to avoid (different) speculation 
>>> crap (MDS).
>>> 
>>> See how very similar they are?
>>> 
>>> Furthermore, to recover SMT for userspace (under MDS) we not only need
>>> core-scheduling but core-scheduling per address space. And ASI was
>>> specifically designed to help mitigate the trainwreck just described.
>>> 
>>> By explicitly exposing (hopefully harmless) part of the kernel to MDS,
>>> we reduce the part that needs core-scheduling and thus reduce the rate
>>> the SMT siblings need to sync up/schedule.
>>> 
>>> But looking at it that way, it makes no sense to retain 3 address
>>> spaces, namely:
>>> 
>>>   user / kernel exposed / kernel private.
>>> 
>>> Specifically, it makes no sense to expose part of the kernel through MDS
>>> but not through Meltdown. Therefore we can merge the user and kernel
>>> exposed address spaces.
>>> 
>>> And then we've fully replaced PTI.
>>> 
>>> So no, they're not orthogonal.
>> Right. If we decide to expose more parts of the kernel mappings then that's
>> just adding more stuff to the existing user (PTI) map mechanics.
> 
> If we expose more parts of the kernel mapping by adding them to the existing
> user (PTI) map, then we only control the mapping of kernel sensitive data but
> we don't control user mapping (with ASI, we exclude all user mappings).
> 
> How would you control the mapping of userland sensitive data and exclude them
> from the user map?

As I see it, if we think part of the kernel is okay to leak to VM guests, then
we should think it’s okay to leak to userspace and vice versa. At the end of the
day, this may just have to come down to an administrator’s choice of how
careful the mitigations need to be.

> Would you have the application explicitly identify sensitive
> data (like Andy suggested with a /dev/xpfo device)?

That’s not really the intent of my suggestion. I was suggesting that maybe we 
don’t need ASI at all if we allow VMs to exclude their memory from the kernel 
mapping entirely.  Heck, in a setup like this, we can maybe even get away with 
turning PTI off under very, very controlled circumstances.  I’m not quite sure 
what to do about the kernel random pools, though.

Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Alexandre Chartre




On 7/12/19 5:16 PM, Thomas Gleixner wrote:

On Fri, 12 Jul 2019, Peter Zijlstra wrote:

On Fri, Jul 12, 2019 at 01:56:44PM +0200, Alexandre Chartre wrote:


I think that's precisely what makes ASI and PTI different and independent.
PTI is just about switching between userland and kernel page-tables, while
ASI is about switching page-table inside the kernel. You can have ASI without
having PTI. You can also use ASI for kernel threads so for code that won't
be triggered from userland and so which won't involve PTI.


PTI is not mapping kernel space to avoid speculation crap 
(meltdown).
ASI is not mapping part of kernel space to avoid (different) speculation crap 
(MDS).

See how very similar they are?

Furthermore, to recover SMT for userspace (under MDS) we not only need
core-scheduling but core-scheduling per address space. And ASI was
specifically designed to help mitigate the trainwreck just described.

By explicitly exposing (hopefully harmless) part of the kernel to MDS,
we reduce the part that needs core-scheduling and thus reduce the rate
the SMT siblings need to sync up/schedule.

But looking at it that way, it makes no sense to retain 3 address
spaces, namely:

   user / kernel exposed / kernel private.

Specifically, it makes no sense to expose part of the kernel through MDS
but not through Meltdown. Therefore we can merge the user and kernel
exposed address spaces.

And then we've fully replaced PTI.

So no, they're not orthogonal.


Right. If we decide to expose more parts of the kernel mappings then that's
just adding more stuff to the existing user (PTI) map mechanics.
 


If we expose more parts of the kernel mapping by adding them to the existing
user (PTI) map, then we only control the mapping of kernel sensitive data but
we don't control user mapping (with ASI, we exclude all user mappings).

How would you control the mapping of userland sensitive data and exclude them
from the user map? Would you have the application explicitly identify sensitive
data (like Andy suggested with a /dev/xpfo device)?

Thanks,

alex.



As a consequence the CR3 switching points become different or can be
consolidated and that can be handled right at those switching points
depending on static keys or alternatives as we do today with PTI and other
mitigations.

All of that can do without that obscure "state machine" which is solely
there to duct-tape the complete lack of design. The same applies to that
mapping thing. Just mapping randomly selected parts by sticking them into
an array is a non-maintainable approach. This needs proper separation of
text and data sections, so violations of the mapping constraints can be
statically analyzed. Depending solely on the page fault at run time for
analysis is just bound to lead to hard to diagnose failures in the field.

TBH we all know already that this can be done and that this will solve some
of the issues caused by the speculation mess, so just writing some hastily
cobbled together POC code which explodes just by looking at it, does not
lead to anything else than time waste on all ends.

This first needs a clear definition of protection scope. That scope clearly
defines the required mappings and consequently the transition requirements
which provide the necessary transition points for flipping CR3.

If we have agreed on that, then we can think about the implementation
details.

Thanks,

tglx



Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Thomas Gleixner
On Fri, 12 Jul 2019, Alexandre Chartre wrote:
> On 7/12/19 12:44 PM, Thomas Gleixner wrote:
> > That ASI thing is just PTI on steroids.
> > 
> > So why do we need two versions of the same thing? That's absolutely bonkers
> > and will just introduce subtle bugs and conflicting decisions all over the
> > place.
> > 
> > The need for ASI is very tightly coupled to the need for PTI and there is
> > absolutely no point in keeping them separate.
> > 
> > The only difference vs. interrupts and exceptions is that the PTI logic
> > cares whether they enter from user or from kernel space while ASI only
> > cares about the kernel entry.
> 
> I think that's precisely what makes ASI and PTI different and independent.
> PTI is just about switching between userland and kernel page-tables, while
> ASI is about switching page-table inside the kernel. You can have ASI without
> having PTI. You can also use ASI for kernel threads so for code that won't
> be triggered from userland and so which won't involve PTI.

It's still the same concept. And you can argue in circles it does not
justify yet another mapping setup which is a different copy of some other
mapping setup. Whether PTI is replaced by ASI or PTI is extended to handle
ASI does not matter at all. Having two similar concepts side by side is a
guarantee for disaster.

> > So why do you want to treat that differently? There is absolutely zero
> > reason to do so. And there is no reason to create a pointlessly different
> > version of PTI which introduces yet another variant of a restricted page
> > table instead of just reusing and extending what's there already.
> > 
> 
> As I've tried to explain, to me PTI and ASI are different and independent.
> PTI manages switching between userland and kernel page-table, and ASI manages
> switching between kernel and a reduced-kernel page-table.

Again. It's the same concept and it does not matter what form of reduced
page tables you use. You always need transition points and in order to make
the transition points work you need reliably mapped bits and pieces.

Also Paul wants to use the same concept for user space so trivial system
calls can do w/o PTI. In some other thread you said yourself that this
could be extended to cover the kvm ioctl, which is clearly a return to user
space.

Are we then going to add another set of randomly sprinkled transition
points and yet another 'state machine' to duct-tape the fallout?

Definitely not going to happen.

Thanks,

tglx



Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Thomas Gleixner
On Fri, 12 Jul 2019, Alexandre Chartre wrote:
> On 7/12/19 3:51 PM, Dave Hansen wrote:
> > BTW, the PTI CR3 writes are not *strictly* about the interrupt coming
> > from user vs. kernel.  It's tricky because there's a window both in the
> > entry and exit code where you are in the kernel but have a userspace CR3
> > value.  You end up needing a CR3 write when you have a userspace CR3
> > value when the interrupt occurred, not only when you interrupt userspace
> > itself.
> > 
> 
> Right. ASI is simpler because it comes from the kernel and return to the
> kernel. There's just a small window (on entry) where we have the ASI CR3
> but we quickly switch to the full kernel CR3.

That's wrong in several aspects.

   1) You are looking at it purely from the VMM perspective, which is bogus
      because, as you already said, this can/should be extended to other
      scenarios (including kvm ioctl or such).

  So no, it's not just coming from kernel space and returning to it.

  If that'd be true then the entry code could just stay as is because
  you can handle _ALL_ of that very trivially in the atomic VMM
  enter/exit code.

   2) It does not matter how small that window is. If there is a window
  then this needs to be covered, no matter what.

Thanks,

tglx


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Peter Zijlstra
On Fri, Jul 12, 2019 at 06:54:22AM -0700, Dave Hansen wrote:
> On 7/12/19 5:50 AM, Peter Zijlstra wrote:
> > PTI is not mapping kernel space to avoid speculation 
> > crap (meltdown).
> > ASI is not mapping part of kernel space to avoid (different) speculation 
> > crap (MDS).
> > 
> > See how very similar they are?
> 
> That's an interesting point.
> 
> I'd add that PTI maps a part of kernel space that partially overlaps
> with what ASI wants.

Right, wherever we put the boundary, we need whatever is required to
cross it.

> > But looking at it that way, it makes no sense to retain 3 address
> > spaces, namely:
> > 
> >   user / kernel exposed / kernel private.
> > 
> > Specifically, it makes no sense to expose part of the kernel through MDS
> > but not through Meltdown. Therefore we can merge the user and kernel
> > exposed address spaces.
> > 
> > And then we've fully replaced PTI.
> 
> So, in one address space (PTI/user or ASI), we say, "screw it" and all
> the data mapped is exposed to speculation attacks.  We have to be very
> careful about what we map and expose here.

Yes, which is why, in an earlier email, I've asked for a clear
definition of 'sensitive' :-)

> So, maybe we're not replacing PTI as much as we're growing PTI so that
> we can run more kernel code with the (now inappropriately named) user
> page tables.

Right.


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Thomas Gleixner
On Fri, 12 Jul 2019, Peter Zijlstra wrote:
> On Fri, Jul 12, 2019 at 01:56:44PM +0200, Alexandre Chartre wrote:
> 
> > I think that's precisely what makes ASI and PTI different and independent.
> > PTI is just about switching between userland and kernel page-tables, while
> > ASI is about switching page-table inside the kernel. You can have ASI 
> > without
> > having PTI. You can also use ASI for kernel threads so for code that won't
> > be triggered from userland and so which won't involve PTI.
> 
> PTI is not mapping kernel space to avoid speculation crap 
> (meltdown).
> ASI is not mapping part of kernel space to avoid (different) speculation crap 
> (MDS).
> 
> See how very similar they are?
> 
> Furthermore, to recover SMT for userspace (under MDS) we not only need
> core-scheduling but core-scheduling per address space. And ASI was
> specifically designed to help mitigate the trainwreck just described.
> 
> By explicitly exposing (hopefully harmless) part of the kernel to MDS,
> we reduce the part that needs core-scheduling and thus reduce the rate
> the SMT siblings need to sync up/schedule.
> 
> But looking at it that way, it makes no sense to retain 3 address
> spaces, namely:
> 
>   user / kernel exposed / kernel private.
> 
> Specifically, it makes no sense to expose part of the kernel through MDS
> but not through Meltdown. Therefore we can merge the user and kernel
> exposed address spaces.
> 
> And then we've fully replaced PTI.
> 
> So no, they're not orthogonal.

Right. If we decide to expose more parts of the kernel mappings then that's
just adding more stuff to the existing user (PTI) map mechanics.

As a consequence the CR3 switching points become different or can be
consolidated and that can be handled right at those switching points
depending on static keys or alternatives as we do today with PTI and other
mitigations.

All of that can do without that obscure "state machine" which is solely
there to duct-tape the complete lack of design. The same applies to that
mapping thing. Just mapping randomly selected parts by sticking them into
an array is a non-maintainable approach. This needs proper separation of
text and data sections, so violations of the mapping constraints can be
statically analyzed. Depending solely on the page fault at run time for
analysis is just bound to lead to hard to diagnose failures in the field.
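
To make the "statically analyzable" point concrete (hypothetical names, not
from the series), objects allowed in the restricted mapping could be tagged
into one dedicated section at build time:

    /* Hypothetical sketch: tag data that may appear in the restricted mapping
     * into a dedicated section, so the exposed set is fixed and auditable at
     * build time instead of being discovered via page faults at run time. */
    #define __asi_exposed __attribute__((__section__(".data..asi_exposed")))

    static unsigned long example_exposed_counter __asi_exposed;

    /* Section boundaries would come from a linker script addition; the whole
     * [__start .. __end) range is then mapped into the restricted page-table. */
    extern char __start_asi_exposed[], __end_asi_exposed[];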

TBH we all know already that this can be done and that this will solve some
of the issues caused by the speculation mess, so just writing some hastily
cobbled together POC code which explodes just by looking at it, does not
lead to anything else than time waste on all ends.

This first needs a clear definition of protection scope. That scope clearly
defines the required mappings and consequently the transition requirements
which provide the necessary transition points for flipping CR3.

If we have agreed on that, then we can think about the implementation
details.

Thanks,

tglx


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Andy Lutomirski
On Fri, Jul 12, 2019 at 6:45 AM Alexandre Chartre
 wrote:
>
>
> On 7/12/19 2:50 PM, Peter Zijlstra wrote:
> > On Fri, Jul 12, 2019 at 01:56:44PM +0200, Alexandre Chartre wrote:
> >
> >> I think that's precisely what makes ASI and PTI different and independent.
> >> PTI is just about switching between userland and kernel page-tables, while
> >> ASI is about switching page-table inside the kernel. You can have ASI 
> >> without
> >> having PTI. You can also use ASI for kernel threads so for code that won't
> >> be triggered from userland and so which won't involve PTI.
> >
> > PTI is not mapping kernel space to avoid speculation 
> > crap (meltdown).
> > ASI is not mapping part of kernel space to avoid (different) speculation 
> > crap (MDS).
> >
> > See how very similar they are?
> >
> >
> > Furthermore, to recover SMT for userspace (under MDS) we not only need
> > core-scheduling but core-scheduling per address space. And ASI was
> > specifically designed to help mitigate the trainwreck just described.
> >
> > By explicitly exposing (hopefully harmless) part of the kernel to MDS,
> > we reduce the part that needs core-scheduling and thus reduce the rate
> > the SMT siblings need to sync up/schedule.
> >
> > But looking at it that way, it makes no sense to retain 3 address
> > spaces, namely:
> >
> >user / kernel exposed / kernel private.
> >
> > Specifically, it makes no sense to expose part of the kernel through MDS
> > but not through Meltdown. Therefore we can merge the user and kernel
> > exposed address spaces.
>
> The goal of ASI is to provide a reduced address space which excludes sensitive
> data. A user process (for example a database daemon, a web server, or a vmm
> like qemu) will likely have sensitive data mapped in its user address space.
> Such data shouldn't be mapped with ASI because it can potentially leak to the
sibling hyperthread. For example, if a hyperthread is running a VM then the
> VM could potentially access user sensitive data if they are mapped on the
> sibling hyperthread with ASI.

So I've proposed the following slightly hackish thing:

Add a mechanism (call it /dev/xpfo).  When you open /dev/xpfo and
fallocate it to some size, you allocate that amount of memory and kick
it out of the kernel direct map.  (And pay the IPI cost unless there
were already cached non-direct-mapped pages ready.)  Then you map
*that* into your VMs.  Now, for a dedicated VM host, you map *all* the
VM private memory from /dev/xpfo.  Pretend it's SEV if you want to
determine which pages can be set up like this.

Does this get enough of the benefit at a negligible fraction of the
code complexity cost?  (This plus core scheduling, anyway.)
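
For concreteness, from the VMM side the flow would be roughly this (the device
and its semantics are entirely hypothetical, this only illustrates the idea):

    /* Hypothetical userspace sketch of the /dev/xpfo idea; the device does
     * not exist, this only illustrates the proposed usage. */
    #include <fcntl.h>
    #include <stddef.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static void *alloc_unmapped_guest_ram(size_t size)
    {
            int fd = open("/dev/xpfo", O_RDWR);     /* hypothetical device */
            void *p;

            if (fd < 0)
                    return NULL;
            /* Backing the fd pulls the pages out of the kernel direct map
             * (paying the IPI/TLB cost up front). */
            if (posix_fallocate(fd, 0, size)) {
                    close(fd);
                    return NULL;
            }
            /* Map it into the VMM and hand it to the guest as private RAM. */
            p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            close(fd);
            return p == MAP_FAILED ? NULL : p;
    }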

--Andy


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Alexandre Chartre



On 7/12/19 3:51 PM, Dave Hansen wrote:

On 7/12/19 1:09 AM, Alexandre Chartre wrote:

On 7/12/19 12:38 AM, Dave Hansen wrote:

I don't see the per-cpu areas in here.  But, the ASI macros in
entry_64.S (and asi_start_abort()) use per-cpu data.


We don't map all per-cpu areas, but only the per-cpu variables we need. ASI
code uses the per-cpu cpu_asi_session variable which is mapped when an ASI
is created (see patch 15/26):


No fair!  I had per-cpu variables just for PTI at some point and had to
give them up! ;)


+    /*
+ * Map the percpu ASI sessions. This is used by interrupt handlers
+ * to figure out if we have entered isolation and switch back to
+     * the kernel address space.
+ */
+    err = ASI_MAP_CPUVAR(asi, cpu_asi_session);
+    if (err)
+    return err;



Also, this stuff seems to do naughty stuff (calling C code, touching
per-cpu data) before the PTI CR3 writes have been done.  But, I don't
see anything excluding PTI and this code from coexisting.


My understanding is that PTI CR3 writes only happen when switching to/from
userland. While ASI enter/exit/abort happens while we are already in the
kernel,
so asi_start_abort() is not called when coming from userland and so not
interacting with PTI.


OK, that makes sense.  You only need to call C code when interrupted
from something in the kernel (deeper than the entry code), and those
were already running kernel C code anyway.



Exactly.


If this continues to live in the entry code, I think you have a good
clue where to start commenting.


Yeah, lot of writing to do... :-)
 

BTW, the PTI CR3 writes are not *strictly* about the interrupt coming
from user vs. kernel.  It's tricky because there's a window both in the
entry and exit code where you are in the kernel but have a userspace CR3
value.  You end up needing a CR3 write when you have a userspace CR3
value when the interrupt occurred, not only when you interrupt userspace
itself.



Right. ASI is simpler because it comes from the kernel and return to the
kernel. There's just a small window (on entry) where we have the ASI CR3
but we quickly switch to the full kernel CR3.

alex.


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Dave Hansen
On 7/12/19 6:43 AM, Alexandre Chartre wrote:
> The current approach is assuming that anything in the user address space
> can be sensitive, and so the user address space shouldn't be mapped in ASI.

Is this universally true?

There's certainly *some* mitigation provided by SMAP that would allow
userspace to remain mapped and still protected.


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Dave Hansen
On 7/12/19 5:50 AM, Peter Zijlstra wrote:
> PTI is not mapping kernel space to avoid speculation crap 
> (meltdown).
> ASI is not mapping part of kernel space to avoid (different) speculation crap 
> (MDS).
> 
> See how very similar they are?

That's an interesting point.

I'd add that PTI maps a part of kernel space that partially overlaps
with what ASI wants.

> But looking at it that way, it makes no sense to retain 3 address
> spaces, namely:
> 
>   user / kernel exposed / kernel private.
> 
> Specifically, it makes no sense to expose part of the kernel through MDS
> but not through Meltdown. Therefore we can merge the user and kernel
> exposed address spaces.
> 
> And then we've fully replaced PTI.

So, in one address space (PTI/user or ASI), we say, "screw it" and all
the data mapped is exposed to speculation attacks.  We have to be very
careful about what we map and expose here.

The other (full kernel) address space we are more careful about what we
*do* instead of what we map.  We map everything but have to add
mitigations to ensure that we don't leak anything back to the exposed
address space.

So, maybe we're not replacing PTI as much as we're growing PTI so that
we can run more kernel code with the (now inappropriately named) user
page tables.


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Alexandre Chartre



On 7/12/19 3:07 PM, Peter Zijlstra wrote:

On Fri, Jul 12, 2019 at 02:47:23PM +0200, Alexandre Chartre wrote:

On 7/12/19 2:36 PM, Peter Zijlstra wrote:

On Fri, Jul 12, 2019 at 02:17:20PM +0200, Alexandre Chartre wrote:

On 7/12/19 1:44 PM, Peter Zijlstra wrote:



AFAIK3 this wants/needs to be combined with core-scheduling to be
useful, but not a single mention of that is anywhere.


No. This is actually an alternative to core-scheduling. Eventually, ASI
will kick all sibling hyperthreads when exiting isolation and it needs to
run with the full kernel page-table (note that's currently not in these
patches).

So ASI can be seen as an optimization to disabling hyperthreading: instead
of just disabling hyperthreading you run with ASI, and when ASI can't preserve
isolation you will basically run with a single thread.


You can't do that without much of the scheduler changes present in the
core-scheduling patches.



We hope we can do that without the whole core-scheduling mechanism. The idea
is to send an IPI to all sibling hyperthreads. This IPI will interrupt these
sibling hyperthreads and have them wait for a condition that will allow them
to resume execution (for example when re-entering isolation). We are
investigating this in parallel to ASI.


You cannot wait from IPI context, so you have to go somewhere else to
wait.

Also, consider what happens when the task that entered isolation decides
to schedule out / gets migrated.

I think you'll quickly find yourself back at core-scheduling.



I haven't looked at details about what has been done so far. Hopefully, we
can do something not too complex, or reuse a (small) part of co-scheduling.

Thanks for pointing this out.

alex.


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Dave Hansen
On 7/12/19 1:09 AM, Alexandre Chartre wrote:
> On 7/12/19 12:38 AM, Dave Hansen wrote:
>> I don't see the per-cpu areas in here.  But, the ASI macros in
>> entry_64.S (and asi_start_abort()) use per-cpu data.
> 
> We don't map all per-cpu areas, but only the per-cpu variables we need. ASI
> code uses the per-cpu cpu_asi_session variable which is mapped when an ASI
> is created (see patch 15/26):

No fair!  I had per-cpu variables just for PTI at some point and had to
give them up! ;)

> +    /*
> + * Map the percpu ASI sessions. This is used by interrupt handlers
> + * to figure out if we have entered isolation and switch back to
> + * the kernel address space.
> + */
> +    err = ASI_MAP_CPUVAR(asi, cpu_asi_session);
> +    if (err)
> +    return err;
> 
> 
>> Also, this stuff seems to do naughty stuff (calling C code, touching
>> per-cpu data) before the PTI CR3 writes have been done.  But, I don't
>> see anything excluding PTI and this code from coexisting.
> 
> My understanding is that PTI CR3 writes only happen when switching to/from
> userland. While ASI enter/exit/abort happens while we are already in the
> kernel,
> so asi_start_abort() is not called when coming from userland and so not
> interacting with PTI.

OK, that makes sense.  You only need to call C code when interrupted
from something in the kernel (deeper than the entry code), and those
were already running kernel C code anyway.

If this continues to live in the entry code, I think you have a good
clue where to start commenting.

BTW, the PTI CR3 writes are not *strictly* about the interrupt coming
from user vs. kernel.  It's tricky because there's a window both in the
entry and exit code where you are in the kernel but have a userspace CR3
value.  You end up needing a CR3 write when you have a userspace CR3
value when the interrupt occurred, not only when you interrupt userspace
itself.


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Alexandre Chartre



On 7/12/19 2:50 PM, Peter Zijlstra wrote:

On Fri, Jul 12, 2019 at 01:56:44PM +0200, Alexandre Chartre wrote:


I think that's precisely what makes ASI and PTI different and independent.
PTI is just about switching between userland and kernel page-tables, while
ASI is about switching page-table inside the kernel. You can have ASI without
having PTI. You can also use ASI for kernel threads so for code that won't
be triggered from userland and so which won't involve PTI.


PTI is not mapping kernel space to avoid speculation crap 
(meltdown).
ASI is not mapping part of kernel space to avoid (different) speculation crap 
(MDS).

See how very similar they are?


Furthermore, to recover SMT for userspace (under MDS) we not only need
core-scheduling but core-scheduling per address space. And ASI was
specifically designed to help mitigate the trainwreck just described.

By explicitly exposing (hopefully harmless) part of the kernel to MDS,
we reduce the part that needs core-scheduling and thus reduce the rate
the SMT siblings need to sync up/schedule.

But looking at it that way, it makes no sense to retain 3 address
spaces, namely:

   user / kernel exposed / kernel private.

Specifically, it makes no sense to expose part of the kernel through MDS
but not through Meltdown. Therefore we can merge the user and kernel
exposed address spaces.


The goal of ASI is to provide a reduced address space which excludes sensitive
data. A user process (for example a database daemon, a web server, or a vmm
like qemu) will likely have sensitive data mapped in its user address space.
Such data shouldn't be mapped with ASI because it can potentially leak to the
sibling hyperthread. For example, if a hyperthread is running a VM then the
VM could potentially access user sensitive data if they are mapped on the
sibling hyperthread with ASI.

The current approach is assuming that anything in the user address space
can be sensitive, and so the user address space shouldn't be mapped in ASI.

It looks like what you are suggesting could be an optimization when creating
an ASI for a process which has no sensitive data (this could be an option to
specify when creating an ASI, for example).

alex.



And then we've fully replaced PTI.

So no, they're not orthogonal.



Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Peter Zijlstra
On Fri, Jul 12, 2019 at 02:47:23PM +0200, Alexandre Chartre wrote:
> On 7/12/19 2:36 PM, Peter Zijlstra wrote:
> > On Fri, Jul 12, 2019 at 02:17:20PM +0200, Alexandre Chartre wrote:
> > > On 7/12/19 1:44 PM, Peter Zijlstra wrote:
> > 
> > > > AFAIK3 this wants/needs to be combined with core-scheduling to be
> > > > useful, but not a single mention of that is anywhere.
> > > 
> > > No. This is actually an alternative to core-scheduling. Eventually, ASI
> > > will kick all sibling hyperthreads when exiting isolation and it needs to
> > > run with the full kernel page-table (note that's currently not in these
> > > patches).
> > > 
> > > So ASI can be seen as an optimization to disabling hyperthreading: instead
> > > of just disabling hyperthreading you run with ASI, and when ASI can't 
> > > preserve
> > > isolation you will basically run with a single thread.
> > 
> > You can't do that without much of the scheduler changes present in the
> > core-scheduling patches.
> > 
> 
> We hope we can do that without the whole core-scheduling mechanism. The idea
> is to send an IPI to all sibling hyperthreads. This IPI will interrupt these
> sibling hyperthreads and have them wait for a condition that will allow them
> to resume execution (for example when re-entering isolation). We are
> investigating this in parallel to ASI.

You cannot wait from IPI context, so you have to go somewhere else to
wait.

Also, consider what happens when the task that entered isolation decides
to schedule out / gets migrated.

I think you'll quickly find yourself back at core-scheduling.


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Alexandre Chartre



On 7/12/19 2:36 PM, Peter Zijlstra wrote:

On Fri, Jul 12, 2019 at 02:17:20PM +0200, Alexandre Chartre wrote:

On 7/12/19 1:44 PM, Peter Zijlstra wrote:



AFAIK3 this wants/needs to be combined with core-scheduling to be
useful, but not a single mention of that is anywhere.


No. This is actually an alternative to core-scheduling. Eventually, ASI
will kick all sibling hyperthreads when exiting isolation and it needs to
run with the full kernel page-table (note that's currently not in these
patches).

So ASI can be seen as an optimization to disabling hyperthreading: instead
of just disabling hyperthreading you run with ASI, and when ASI can't preserve
isolation you will basically run with a single thread.


You can't do that without much of the scheduler changes present in the
core-scheduling patches.



We hope we can do that without the whole core-scheduling mechanism. The idea
is to send an IPI to all sibling hyperthreads. This IPI will interrupt these
sibling hyperthreads and have them wait for a condition that will allow them
to resume execution (for example when re-entering isolation). We are
investigating this in parallel to ASI.
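
Just to sketch the "kick" half of that idea (names made up; the actual waiting
cannot be done from IPI context and would have to happen elsewhere):

    /* Hypothetical sketch of the kick only; names are illustrative. */
    #include <linux/smp.h>
    #include <linux/topology.h>

    static void asi_sibling_kick(void *info)
    {
            /* The interrupt itself is the kick: it pulls the sibling out of
             * whatever isolated/guest code it was running. */
    }

    static void asi_kick_siblings(void)
    {
            int cpu = get_cpu();

            /* smp_call_function_many() skips the calling CPU itself. */
            smp_call_function_many(topology_sibling_cpumask(cpu),
                                   asi_sibling_kick, NULL, false);
            put_cpu();
    }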

alex.


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Peter Zijlstra
On Fri, Jul 12, 2019 at 01:56:44PM +0200, Alexandre Chartre wrote:

> I think that's precisely what makes ASI and PTI different and independent.
> PTI is just about switching between userland and kernel page-tables, while
> ASI is about switching page-table inside the kernel. You can have ASI without
> having PTI. You can also use ASI for kernel threads so for code that won't
> be triggered from userland and so which won't involve PTI.

PTI is not mapping kernel space to avoid speculation crap 
(meltdown).
ASI is not mapping part of kernel space to avoid (different) speculation crap 
(MDS).

See how very similar they are?

Furthermore, to recover SMT for userspace (under MDS) we not only need
core-scheduling but core-scheduling per address space. And ASI was
specifically designed to help mitigate the trainwreck just described.

By explicitly exposing (hopefully harmless) part of the kernel to MDS,
we reduce the part that needs core-scheduling and thus reduce the rate
the SMT siblings need to sync up/schedule.

But looking at it that way, it makes no sense to retain 3 address
spaces, namely:

  user / kernel exposed / kernel private.

Specifically, it makes no sense to expose part of the kernel through MDS
but not through Meltdown. Therefore we can merge the user and kernel
exposed address spaces.

And then we've fully replaced PTI.

So no, they're not orthogonal.


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Peter Zijlstra
On Fri, Jul 12, 2019 at 02:17:20PM +0200, Alexandre Chartre wrote:
> On 7/12/19 1:44 PM, Peter Zijlstra wrote:

> > AFAIK3 this wants/needs to be combined with core-scheduling to be
> > useful, but not a single mention of that is anywhere.
> 
> No. This is actually an alternative to core-scheduling. Eventually, ASI
> will kick all sibling hyperthreads when exiting isolation and it needs to
> run with the full kernel page-table (note that's currently not in these
> patches).
> 
> So ASI can be seen as an optimization to disabling hyperthreading: instead
> of just disabling hyperthreading you run with ASI, and when ASI can't preserve
> isolation you will basically run with a single thread.

You can't do that without much of the scheduler changes present in the
core-scheduling patches.


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Alexandre Chartre



On 7/12/19 1:44 PM, Peter Zijlstra wrote:

On Thu, Jul 11, 2019 at 04:25:12PM +0200, Alexandre Chartre wrote:

Kernel Address Space Isolation aims to use address spaces to isolate some
parts of the kernel (for example KVM) to prevent leaking sensitive data
between hyper-threads under speculative execution attacks. You can refer
to the first version of this RFC for more context:

https://lkml.org/lkml/2019/5/13/515


No, no, no!

That is the crux of this entire series; you're not punting on explaining
exactly why we want to go dig through 26 patches of gunk.

You get to exactly explain what (your definition of) sensitive data is,
and which speculative scenarios and how this approach mitigates them.

And included in that is a high level overview of the whole thing.



Ok, I will rework the explanation. Sorry about that.


On the one hand you've made this implementation for KVM, while on the
other hand you're saying it is generic but then fail to describe any
!KVM user.

AFAIK all speculative fails this is relevant to are now public, so
excruciating horrible details are fine and required.


Ok.


AFAIK2 this is all because of MDS but it also helps with v1.


Yes, mostly MDS and also L1TF.


AFAIK3 this wants/needs to be combined with core-scheduling to be
useful, but not a single mention of that is anywhere.


No. This is actually an alternative to core-scheduling. Eventually, ASI
will kick all sibling hyperthreads when exiting isolation and it needs to
run with the full kernel page-table (note that's currently not in these
patches).

So ASI can be seen as an optimization to disabling hyperthreading: instead
of just disabling hyperthreading you run with ASI, and when ASI can't preserve
isolation you will basically run with a single thread.

I will add all that to the explanation.

Thanks,

alex.


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Alexandre Chartre



On 7/12/19 12:44 PM, Thomas Gleixner wrote:

On Thu, 11 Jul 2019, Dave Hansen wrote:


On 7/11/19 7:25 AM, Alexandre Chartre wrote:

- Kernel code mapped to the ASI page-table has been reduced to:
   . the entire kernel (I still need to test with only the kernel text)
   . the cpu entry area (because we need the GDT to be mapped)
   . the cpu ASI session (for managing ASI)
   . the current stack

- Optionally, an ASI can request the following kernel mapping to be added:
   . the stack canary
   . the cpu offsets (this_cpu_off)
   . the current task
   . RCU data (rcu_data)
   . CPU HW events (cpu_hw_events).


I don't see the per-cpu areas in here.  But, the ASI macros in
entry_64.S (and asi_start_abort()) use per-cpu data.

Also, this stuff seems to do naughty stuff (calling C code, touching
per-cpu data) before the PTI CR3 writes have been done.  But, I don't
see anything excluding PTI and this code from coexisting.


That ASI thing is just PTI on steroids.

So why do we need two versions of the same thing? That's absolutely bonkers
and will just introduce subtle bugs and conflicting decisions all over the
place.

The need for ASI is very tightly coupled to the need for PTI and there is
absolutely no point in keeping them separate.

The only difference vs. interrupts and exceptions is that the PTI logic
cares whether they enter from user or from kernel space while ASI only
cares about the kernel entry.


I think that's precisely what makes ASI and PTI different and independent.
PTI is just about switching between userland and kernel page-tables, while
ASI is about switching page-table inside the kernel. You can have ASI without
having PTI. You can also use ASI for kernel threads so for code that won't
be triggered from userland and so which won't involve PTI.


But most exception/interrupt transitions do not need to be handled at
the entry code level because on VMEXIT the exit reason clearly tells
whether a switch to the kernel CR3 is necessary or not. So this has to be
handled at the VMM level already in a very clean and simple way.

I'm not a virt wizard, but according to code inspection and instrumentation
even the NMI on the host is actually reinjected manually into the host via
'int $2' after the VMEXIT and for MCE it looks like manual handling as
well. So why do we need to sprinkle that muck all over the entry code?

 From a semantic perspective VMENTER/VMEXIT are very similar to the return
to user / enter to user mechanics. Just that the transition happens in the
VMM code and not at the regular user/kernel transition points.


VMExit returns to the kernel, and ASI is used to run the VMExit handler with
a limited kernel address space instead of using the full kernel address space.
Change in entry code is required to handle any interrupt/exception which
can happen while running code with ASI (like KVM VMExit handler).

Note that KVM is an example of an ASI consumer, but ASI is generic and can be
used to run (mostly) any kernel code if you want to run code with a reduced
kernel address space.


So why do you want to treat that differently? There is absolutely zero
reason to do so. And there is no reason to create a pointlessly different
version of PTI which introduces yet another variant of a restricted page
table instead of just reusing and extending what's there already.



As I've tried to explain, to me PTI and ASI are different and independent.
PTI manages switching between userland and kernel page-table, and ASI manages
switching between kernel and a reduced-kernel page-table.


Thanks,

alex.


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Peter Zijlstra
On Thu, Jul 11, 2019 at 04:25:12PM +0200, Alexandre Chartre wrote:
> Kernel Address Space Isolation aims to use address spaces to isolate some
> parts of the kernel (for example KVM) to prevent leaking sensitive data
> between hyper-threads under speculative execution attacks. You can refer
> to the first version of this RFC for more context:
> 
>https://lkml.org/lkml/2019/5/13/515

No, no, no!

That is the crux of this entire series; you're not punting on explaining
exactly why we want to go dig through 26 patches of gunk.

You get to exactly explain what (your definition of) sensitive data is,
and which speculative scenarios and how this approach mitigates them.

And included in that is a high level overview of the whole thing.

On the one hand you've made this implementation for KVM, while on the
other hand you're saying it is generic but then fail to describe any
!KVM user.

AFAIK all speculative fails this is relevant to are now public, so
excruciating horrible details are fine and required.

AFAIK2 this is all because of MDS but it also helps with v1.

AFAIK3 this wants/needs to be combined with core-scheduling to be
useful, but not a single mention of that is anywhere.


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Thomas Gleixner
On Thu, 11 Jul 2019, Dave Hansen wrote:

> On 7/11/19 7:25 AM, Alexandre Chartre wrote:
> > - Kernel code mapped to the ASI page-table has been reduced to:
> >   . the entire kernel (I still need to test with only the kernel text)
> >   . the cpu entry area (because we need the GDT to be mapped)
> >   . the cpu ASI session (for managing ASI)
> >   . the current stack
> > 
> > - Optionally, an ASI can request the following kernel mapping to be added:
> >   . the stack canary
> >   . the cpu offsets (this_cpu_off)
> >   . the current task
> >   . RCU data (rcu_data)
> >   . CPU HW events (cpu_hw_events).
> 
> I don't see the per-cpu areas in here.  But, the ASI macros in
> entry_64.S (and asi_start_abort()) use per-cpu data.
> 
> Also, this stuff seems to do naughty stuff (calling C code, touching
> per-cpu data) before the PTI CR3 writes have been done.  But, I don't
> see anything excluding PTI and this code from coexisting.

That ASI thing is just PTI on steroids.

So why do we need two versions of the same thing? That's absolutely bonkers
and will just introduce subtle bugs and conflicting decisions all over the
place.

The need for ASI is very tightly coupled to the need for PTI and there is
absolutely no point in keeping them separate.

The only difference vs. interrupts and exceptions is that the PTI logic
cares whether they enter from user or from kernel space while ASI only
cares about the kernel entry.

But most exception/interrupt transitions do not need to be handled at
the entry code level because on VMEXIT the exit reason clearly tells
whether a switch to the kernel CR3 is necessary or not. So this has to be
handled at the VMM level already in a very clean and simple way.

I'm not a virt wizard, but according to code inspection and instrumentation
even the NMI on the host is actually reinjected manually into the host via
'int $2' after the VMEXIT and for MCE it looks like manual handling as
well. So why do we need to sprinkle that muck all over the entry code?

From a semantic perspective VMENTER/VMEXIT are very similar to the return
to user / enter to user mechanics. Just that the transition happens in the
VMM code and not at the regular user/kernel transition points.

So why do you want to treat that differently? There is absolutely zero
reason to do so. And there is no reason to create a pointlessly different
version of PTI which introduces yet another variant of a restricted page
table instead of just reusing and extending what's there already.

Thanks,

tglx


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-12 Thread Alexandre Chartre



On 7/12/19 12:38 AM, Dave Hansen wrote:

On 7/11/19 7:25 AM, Alexandre Chartre wrote:

- Kernel code mapped to the ASI page-table has been reduced to:
   . the entire kernel (I still need to test with only the kernel text)
   . the cpu entry area (because we need the GDT to be mapped)
   . the cpu ASI session (for managing ASI)
   . the current stack

- Optionally, an ASI can request the following kernel mapping to be added:
   . the stack canary
   . the cpu offsets (this_cpu_off)
   . the current task
   . RCU data (rcu_data)
   . CPU HW events (cpu_hw_events).


I don't see the per-cpu areas in here.  But, the ASI macros in
entry_64.S (and asi_start_abort()) use per-cpu data.


We don't map all per-cpu areas, but only the per-cpu variables we need. ASI
code uses the per-cpu cpu_asi_session variable which is mapped when an ASI
is created (see patch 15/26):

+   /*
+* Map the percpu ASI sessions. This is used by interrupt handlers
+* to figure out if we have entered isolation and switch back to
+* the kernel address space.
+*/
+   err = ASI_MAP_CPUVAR(asi, cpu_asi_session);
+   if (err)
+   return err;



Also, this stuff seems to do naughty stuff (calling C code, touching
per-cpu data) before the PTI CR3 writes have been done.  But, I don't
see anything excluding PTI and this code from coexisting.


My understanding is that PTI CR3 writes only happen when switching to/from
userland. While ASI enter/exit/abort happens while we are already in the kernel,
so asi_start_abort() is not called when coming from userland and so not
interacting with PTI.

For example, if ASI is used during a syscall (e.g. with KVM), we have:

 -> syscall
- PTI CR3 write (kernel CR3)
- syscall handler:
  ...
  asi_enter()-> write ASI CR3
  .. code run with ASI ..
  asi_exit() or asi abort -> restore original CR3
  ...
- PTI CR3 write (userland CR3)
 <- syscall
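
In code, a consumer following that flow would look roughly like this
(illustrative only, do_sensitive_work() is a placeholder):

    /* Illustrative only; 'asi' is assumed to have been set up earlier with
     * asi_create(), and do_sensitive_work() is a placeholder. */
    static long my_isolated_op(struct asi *asi)
    {
            long ret;

            ret = asi_enter(asi);           /* write ASI CR3 */
            if (ret)
                    return ret;

            ret = do_sensitive_work();      /* runs with the reduced address space;
                                             * any interrupt/exception aborts ASI */

            asi_exit(asi);                  /* restore the full kernel CR3 */
            return ret;
    }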


Thanks,

alex.


Re: [RFC v2 01/26] mm/x86: Introduce kernel address space isolation

2019-07-12 Thread Alexandre Chartre



On 7/11/19 11:33 PM, Thomas Gleixner wrote:

On Thu, 11 Jul 2019, Alexandre Chartre wrote:

+/*
+ * When isolation is active, the address space doesn't necessarily map
+ * the percpu offset value (this_cpu_off) which is used to get pointers
+ * to percpu variables. So functions which can be invoked while isolation
+ * is active shouldn't be getting pointers to percpu variables (i.e. with
+ * get_cpu_var() or this_cpu_ptr()). Instead percpu variable should be
+ * directly read or written to (i.e. with this_cpu_read() or
+ * this_cpu_write()).
+ */
+
+int asi_enter(struct asi *asi)
+{
+   enum asi_session_state state;
+   struct asi *current_asi;
+   struct asi_session *asi_session;
+
+   state = this_cpu_read(cpu_asi_session.state);
+   /*
+* We can re-enter isolation, but only with the same ASI (we don't
+* support nesting isolation). Also, if isolation is still active,
+* then we should be re-entering with the same task.
+*/
+   if (state == ASI_SESSION_STATE_ACTIVE) {
+   current_asi = this_cpu_read(cpu_asi_session.asi);
+   if (current_asi != asi) {
+   WARN_ON(1);
+   return -EBUSY;
+   }
+   WARN_ON(this_cpu_read(cpu_asi_session.task) != current);
+   return 0;
+   }
+
+   /* isolation is not active so we can safely access the percpu pointer */
+   asi_session = &get_cpu_var(cpu_asi_session);


get_cpu_var()?? Where is the matching put_cpu_var() ? get_cpu_var()
contains a preempt_disable ...

What's wrong with a simple this_cpu_ptr() here?



Oops, my mistake, I should be using this_cpu_ptr(). I will replace all 
get_cpu_var()
with this_cpu_ptr().
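
i.e. the line above would become something like:

    /* Sketch of the corrected access only: this_cpu_ptr() instead of
     * get_cpu_var(), so there is no unbalanced preempt_disable(). */
    asi_session = this_cpu_ptr(&cpu_asi_session);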



+void asi_exit(struct asi *asi)
+{
+   struct asi_session *asi_session;
+   enum asi_session_state asi_state;
+   unsigned long original_cr3;
+
+   asi_state = this_cpu_read(cpu_asi_session.state);
+   if (asi_state == ASI_SESSION_STATE_INACTIVE)
+   return;
+
+   /* TODO: Kick sibling hyperthread before switching to kernel cr3 */
+   original_cr3 = this_cpu_read(cpu_asi_session.original_cr3);
+   if (original_cr3)


Why would this be 0 if the session is active?



Correct, original_cr3 won't be 0. I think this is a remnant from a previous
version where original_cr3 was handled differently.



+   write_cr3(original_cr3);
+
+   /* page-table was switched, we can now access the percpu pointer */
+   asi_session = &get_cpu_var(cpu_asi_session);


See above.



Will fix that.


Thanks,

alex.


+   WARN_ON(asi_session->task != current);
+   asi_session->state = ASI_SESSION_STATE_INACTIVE;
+   asi_session->asi = NULL;
+   asi_session->task = NULL;
+   asi_session->original_cr3 = 0;
+}


Thanks,

tglx



Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-11 Thread Dave Hansen
On 7/11/19 7:25 AM, Alexandre Chartre wrote:
> - Kernel code mapped to the ASI page-table has been reduced to:
>   . the entire kernel (I still need to test with only the kernel text)
>   . the cpu entry area (because we need the GDT to be mapped)
>   . the cpu ASI session (for managing ASI)
>   . the current stack
> 
> - Optionally, an ASI can request the following kernel mapping to be added:
>   . the stack canary
>   . the cpu offsets (this_cpu_off)
>   . the current task
>   . RCU data (rcu_data)
>   . CPU HW events (cpu_hw_events).

I don't see the per-cpu areas in here.  But, the ASI macros in
entry_64.S (and asi_start_abort()) use per-cpu data.

Also, this stuff seems to do naughty stuff (calling C code, touching
per-cpu data) before the PTI CR3 writes have been done.  But, I don't
see anything excluding PTI and this code from coexisting.


Re: [RFC v2 01/26] mm/x86: Introduce kernel address space isolation

2019-07-11 Thread Thomas Gleixner
On Thu, 11 Jul 2019, Alexandre Chartre wrote:
> +/*
> + * When isolation is active, the address space doesn't necessarily map
> + * the percpu offset value (this_cpu_off) which is used to get pointers
> + * to percpu variables. So functions which can be invoked while isolation
> + * is active shouldn't be getting pointers to percpu variables (i.e. with
> + * get_cpu_var() or this_cpu_ptr()). Instead percpu variable should be
> + * directly read or written to (i.e. with this_cpu_read() or
> + * this_cpu_write()).
> + */
> +
> +int asi_enter(struct asi *asi)
> +{
> + enum asi_session_state state;
> + struct asi *current_asi;
> + struct asi_session *asi_session;
> +
> + state = this_cpu_read(cpu_asi_session.state);
> + /*
> +  * We can re-enter isolation, but only with the same ASI (we don't
> +  * support nesting isolation). Also, if isolation is still active,
> +  * then we should be re-entering with the same task.
> +  */
> + if (state == ASI_SESSION_STATE_ACTIVE) {
> + current_asi = this_cpu_read(cpu_asi_session.asi);
> + if (current_asi != asi) {
> + WARN_ON(1);
> + return -EBUSY;
> + }
> + WARN_ON(this_cpu_read(cpu_asi_session.task) != current);
> + return 0;
> + }
> +
> + /* isolation is not active so we can safely access the percpu pointer */
> + asi_session = &get_cpu_var(cpu_asi_session);

get_cpu_var()?? Where is the matching put_cpu_var() ? get_cpu_var()
contains a preempt_disable ...

What's wrong with a simple this_cpu_ptr() here?

> +void asi_exit(struct asi *asi)
> +{
> + struct asi_session *asi_session;
> + enum asi_session_state asi_state;
> + unsigned long original_cr3;
> +
> + asi_state = this_cpu_read(cpu_asi_session.state);
> + if (asi_state == ASI_SESSION_STATE_INACTIVE)
> + return;
> +
> + /* TODO: Kick sibling hyperthread before switching to kernel cr3 */
> + original_cr3 = this_cpu_read(cpu_asi_session.original_cr3);
> + if (original_cr3)

Why would this be 0 if the session is active?

> + write_cr3(original_cr3);
> +
> + /* page-table was switched, we can now access the percpu pointer */
> + asi_session = &get_cpu_var(cpu_asi_session);

See above.

> + WARN_ON(asi_session->task != current);
> + asi_session->state = ASI_SESSION_STATE_INACTIVE;
> + asi_session->asi = NULL;
> + asi_session->task = NULL;
> + asi_session->original_cr3 = 0;
> +}

Thanks,

tglx


Re: [RFC v2 00/27] Kernel Address Space Isolation

2019-07-11 Thread Alexandre Chartre



And I've just noticed that I've messed up the subject of the cover letter.
There are 26 patches, not 27. So it should have been 00/26 not 00/27.

Sorry about that.

alex.

On 7/11/19 4:25 PM, Alexandre Chartre wrote:

Hi,

This is version 2 of the "KVM Address Space Isolation" RFC. The code
has been completely changed compared to v1 and it now provides a generic
kernel framework which provides Address Space Isolation; and KVM is now
a simple consumer of that framework. That's why the RFC title has been
changed from "KVM Address Space Isolation" to "Kernel Address Space
Isolation".

Kernel Address Space Isolation aims to use address spaces to isolate some
parts of the kernel (for example KVM) to prevent leaking sensitive data
between hyper-threads under speculative execution attacks. You can refer
to the first version of this RFC for more context:

https://lkml.org/lkml/2019/5/13/515

The new code is still a proof of concept. It is much more stable than v1:
I am able to run a VM with a full OS (and also a nested VM) with multiple
vcpus. But it looks like there are still some corner cases which cause the
system to crash/hang.

I am looking for feedback about this new approach where address space
isolation is provided by the kernel, and KVM is a just a consumer of this
new framework.


Changes
===

- Address Space Isolation (ASI) is now provided as a kernel framework:
   interfaces for creating and managing an ASI are provided by the kernel,
   they are not implemented in KVM.

- An ASI is associated with a page-table, we don't use mm anymore. Entering
   isolation is done by just updating CR3 to use the ASI page-table. Exiting
   isolation restores CR3 with the CR3 value present before entering isolation.

- Isolation is exited at the beginning of any interrupt/exception handler,
   and on context switch.

- Isolation doesn't disable interrupt, but if an interrupt occurs the
   interrupt handler will exit isolation.

- The current stack is mapped when entering isolation and unmapped when
   exiting isolation.

- The current task is not mapped by default, but there's an option to map it.
   In such a case, the current task is mapped when entering isolation and
   unmapped when exiting isolation.

- Kernel code mapped to the ASI page-table has been reduced to:
   . the entire kernel (I still need to test with only the kernel text)
   . the cpu entry area (because we need the GDT to be mapped)
   . the cpu ASI session (for managing ASI)
   . the current stack

- Optionally, an ASI can request the following kernel mapping to be added:
   . the stack canary
   . the cpu offsets (this_cpu_off)
   . the current task
   . RCU data (rcu_data)
   . CPU HW events (cpu_hw_events).

   All these optional mappings are used for KVM isolation.
   


Patches:


The proposed patches provide a framework for creating an Address Space
Isolation (ASI) (represented by a struct asi). The ASI has a page-table which
can be populated by copying mappings from the kernel page-table. The ASI can
then be entered/exited by switching between the kernel page-table and the
ASI page-table. In addition, any interrupt, exception or context switch
will automatically abort and exit the isolation. Finally patches use the
ASI framework to implement KVM isolation.

- 01-03: Core of the ASI framework: create/destroy ASI, enter/exit/abort
   isolation, ASI page-fault handler.

- 04-14: Functions to manage, populate and clear an ASI page-table.

- 15-20: ASI core mappings and optional mappings.

- 21: Make functions to read cr3/cr4 ASI aware

- 22-26: Use ASI in KVM to provide isolation for VMExit handlers.


API Overview:
=
Here is a short description of the main ASI functions provided by the framework.

struct asi *asi_create(int map_flags)

   Create an Address Space Isolation (ASI). map_flags can be used to specify
   optional kernel mapping to be added to the ASI page-table (for example,
   ASI_MAP_STACK_CANARY to map the stack canary).


void asi_destroy(struct asi *asi)

   Destroy an ASI.


int asi_enter(struct asi *asi)

   Enter isolation for the specified ASI. This switches from the kernel 
page-table
   to the page-table associated with the ASI.


void asi_exit(struct asi *asi)

   Exit isolation for the specified ASI. This switches back to the kernel
   page-table.


int asi_map(struct asi *asi, void *ptr, unsigned long size);

   Copy kernel mapping to the specified ASI page-table.


void asi_unmap(struct asi *asi, void *ptr);

   Clear kernel mapping from the specified ASI page-table.



Alexandre Chartre (23):
   mm/x86: Introduce kernel address space isolation
   mm/asi: Abort isolation on interrupt, exception and context switch
   mm/asi: Handle page fault due to address space isolation
   mm/asi: Functions to track buffers allocated for an ASI page-table
   mm/asi: Add ASI page-table entry offset functions
   mm/asi: Add ASI page-table entry 

[RFC v2 00/27] Kernel Address Space Isolation

2019-07-11 Thread Alexandre Chartre
Hi,

This is version 2 of the "KVM Address Space Isolation" RFC. The code
has been completely changed compared to v1 and it now provides a generic
kernel framework which provides Address Space Isolation; and KVM is now
a simple consumer of that framework. That's why the RFC title has been
changed from "KVM Address Space Isolation" to "Kernel Address Space
Isolation".

Kernel Address Space Isolation aims to use address spaces to isolate some
parts of the kernel (for example KVM) to prevent leaking sensitive data
between hyper-threads under speculative execution attacks. You can refer
to the first version of this RFC for more context:

   https://lkml.org/lkml/2019/5/13/515

The new code is still a proof of concept. It is much more stable than v1:
I am able to run a VM with a full OS (and also a nested VM) with multiple
vcpus. But it looks like there are still some corner cases which cause the
system to crash/hang.

I am looking for feedback about this new approach where address space
isolation is provided by the kernel, and KVM is just a consumer of this
new framework.


Changes
===

- Address Space Isolation (ASI) is now provided as a kernel framework:
  interfaces for creating and managing an ASI are provided by the kernel,
  they are not implemented in KVM.

- An ASI is associated with a page-table, we don't use mm anymore. Entering
  isolation is done by just updating CR3 to use the ASI page-table. Exiting
  isolation restores CR3 with the CR3 value present before entering isolation.

- Isolation is exited at the beginning of any interrupt/exception handler,
  and on context switch.

- Isolation doesn't disable interrupts, but if an interrupt occurs the
  interrupt handler will exit isolation.

- The current stack is mapped when entering isolation and unmapped when
  exiting isolation.

- The current task is not mapped by default, but there's an option to map it.
  In such a case, the current task is mapped when entering isolation and
  unmapped when exiting isolation.

- Kernel code mapped to the ASI page-table has been reduced to:
  . the entire kernel (I still need to test with only the kernel text)
  . the cpu entry area (because we need the GDT to be mapped)
  . the cpu ASI session (for managing ASI)
  . the current stack

- Optionally, an ASI can request the following kernel mappings to be added:
  . the stack canary
  . the cpu offsets (this_cpu_off)
  . the current task
  . RCU data (rcu_data)
  . CPU HW events (cpu_hw_events).

  All these optional mappings are used for KVM isolation.
  

Patches:


The proposed patches provide a framework for creating an Address Space
Isolation (ASI) (represented by a struct asi). The ASI has a page-table which
can be populated by copying mappings from the kernel page-table. The ASI can
then be entered/exited by switching between the kernel page-table and the
ASI page-table. In addition, any interrupt, exception or context switch
will automatically abort and exit the isolation (a minimal sketch of this
abort path follows the patch breakdown below). Finally, the patches use the
ASI framework to implement KVM isolation.

- 01-03: Core of the ASI framework: create/destroy ASI, enter/exit/abort
  isolation, ASI page-fault handler.

- 04-14: Functions to manage, populate and clear an ASI page-table.

- 15-20: ASI core mappings and optional mappings.

- 21: Make functions to read cr3/cr4 ASI aware

- 22-26: Use ASI in KVM to provide isolation for VMExit handlers.
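
As noted above, any interrupt, exception or context switch aborts isolation.
The following is only a sketch of that abort path, built from the structures
introduced in patch 01 (shown further below); the function name
asi_check_abort() is hypothetical, and the real entry-code hooks added by
patch 02 also have to reach the per-cpu session without relying on mappings
that may be absent from the ASI page-table:

/*
 * Sketch only: abort isolation before running interrupt/exception code
 * that may touch data not mapped in the ASI page-table.
 */
static inline void asi_check_abort(void)
{
        struct asi_session *session = this_cpu_ptr(&cpu_asi_session);

        if (session->state != ASI_SESSION_STATE_ACTIVE)
                return;

        /* Go back to the full kernel page-table saved when entering isolation. */
        write_cr3(session->original_cr3);
        session->state = ASI_SESSION_STATE_INACTIVE;
}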


API Overview:
=============
Here is a short description of the main ASI functions provided by the framework.

struct asi *asi_create(int map_flags)

  Create an Address Space Isolation (ASI). map_flags can be used to specify
  optional kernel mappings to be added to the ASI page-table (for example,
  ASI_MAP_STACK_CANARY to map the stack canary).


void asi_destroy(struct asi *asi)

  Destroy an ASI.


int asi_enter(struct asi *asi)

  Enter isolation for the specified ASI. This switches from the kernel
  page-table to the page-table associated with the ASI.


void asi_exit(struct asi *asi)

  Exit isolation for the specified ASI. This switches back to the kernel
  page-table.


int asi_map(struct asi *asi, void *ptr, unsigned long size);

  Copy the kernel mapping of the [ptr, ptr + size) range to the specified
  ASI page-table.


void asi_unmap(struct asi *asi, void *ptr);

  Clear the kernel mapping of ptr from the specified ASI page-table.
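
To make the intended call sequence concrete, here is a minimal, hypothetical
consumer of this API. ASI_MAP_STACK_CANARY is the flag named in the
asi_create() description above (note that patch 01 below still declares
asi_create(void); the map_flags argument only appears later in the series);
the buffer and function names are illustrative only:

#include <linux/errno.h>
#include <asm/asi.h>

static unsigned long my_private_data[64];       /* data needed while isolated */
static struct asi *my_asi;

static int my_consumer_init(void)
{
        int err;

        my_asi = asi_create(ASI_MAP_STACK_CANARY);
        if (!my_asi)
                return -ENOMEM;

        /* Copy the kernel mapping of our private data into the ASI page-table. */
        err = asi_map(my_asi, my_private_data, sizeof(my_private_data));
        if (err) {
                asi_destroy(my_asi);
                return err;
        }
        return 0;
}

static void my_consumer_run(void)
{
        if (asi_enter(my_asi))
                return;         /* could not switch to the ASI page-table */

        /* ... code running with the reduced address space ... */

        asi_exit(my_asi);       /* back to the full kernel page-table */
}

static void my_consumer_cleanup(void)
{
        asi_unmap(my_asi, my_private_data);
        asi_destroy(my_asi);
}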



Alexandre Chartre (23):
  mm/x86: Introduce kernel address space isolation
  mm/asi: Abort isolation on interrupt, exception and context switch
  mm/asi: Handle page fault due to address space isolation
  mm/asi: Functions to track buffers allocated for an ASI page-table
  mm/asi: Add ASI page-table entry offset functions
  mm/asi: Add ASI page-table entry allocation functions
  mm/asi: Add ASI page-table entry set functions
  mm/asi: Functions to populate an ASI page-table from a VA range
  mm/asi: Helper functions to map module into ASI
  mm/asi: Keep track of VA ranges mapped in ASI page-table
  mm/asi: Functions to cl

[RFC v2 01/26] mm/x86: Introduce kernel address space isolation

2019-07-11 Thread Alexandre Chartre
Introduce core functions and structures for implementing Address Space
Isolation (ASI). Kernel address space isolation provides the ability to
run some kernel code with a reduced kernel address space.

An address space isolation is defined with a struct asi structure which
has its own page-table. While, for now, this page-table is empty, it
will eventually be possible to populate it so that it is much smaller
than the full kernel page-table.

Isolation is entered by calling asi_enter() which switches the kernel
page-table to the address space isolation page-table. Isolation is then
exited by calling asi_exit() which switches the page-table back to the
kernel page-table.
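
The asi.c hunk below is truncated before asi_enter() and asi_exit() appear,
so here is a rough sketch of the CR3 switch they are described as performing,
using only the fields visible in struct asi and struct asi_session. Locking,
preemption and the way the per-cpu session is reached are simplified
assumptions, not the actual implementation:

/* Sketch only: enter isolation by loading the ASI page-table into CR3. */
int asi_enter(struct asi *asi)
{
        struct asi_session *session = this_cpu_ptr(&cpu_asi_session);

        session->asi = asi;
        session->task = current;
        session->original_cr3 = __read_cr3();
        session->state = ASI_SESSION_STATE_ACTIVE;
        write_cr3(__pa(asi->pgd));
        return 0;
}

/* Sketch only: exit isolation by restoring the CR3 value saved at entry. */
void asi_exit(struct asi *asi)
{
        struct asi_session *session = this_cpu_ptr(&cpu_asi_session);

        if (session->state != ASI_SESSION_STATE_ACTIVE)
                return;

        write_cr3(session->original_cr3);
        session->state = ASI_SESSION_STATE_INACTIVE;
        session->asi = NULL;
}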

Signed-off-by: Alexandre Chartre 
---
 arch/x86/include/asm/asi.h |   41 
 arch/x86/mm/Makefile   |2 +
 arch/x86/mm/asi.c  |  152 
 security/Kconfig   |   10 +++
 4 files changed, 205 insertions(+), 0 deletions(-)
 create mode 100644 arch/x86/include/asm/asi.h
 create mode 100644 arch/x86/mm/asi.c

diff --git a/arch/x86/include/asm/asi.h b/arch/x86/include/asm/asi.h
new file mode 100644
index 000..8a13f73
--- /dev/null
+++ b/arch/x86/include/asm/asi.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ARCH_X86_MM_ASI_H
+#define ARCH_X86_MM_ASI_H
+
+#ifdef CONFIG_ADDRESS_SPACE_ISOLATION
+
+#include 
+#include 
+
+struct asi {
+   spinlock_t  lock;   /* protect all attributes */
+   pgd_t   *pgd;   /* ASI page-table */
+};
+
+/*
+ * An ASI session maintains the state of address space isolation on a
+ * cpu. There is one ASI session per cpu. There is no lock to protect
+ * members of the asi_session structure as each cpu is managing its
+ * own ASI session.
+ */
+
+enum asi_session_state {
+   ASI_SESSION_STATE_INACTIVE, /* no address space isolation */
+   ASI_SESSION_STATE_ACTIVE,   /* address space isolation is active */
+};
+
+struct asi_session {
+   struct asi  *asi;   /* ASI for this session */
+   enum asi_session_state  state;  /* state of ASI session */
+   unsigned long   original_cr3;   /* cr3 before entering ASI */
+   struct task_struct  *task;  /* task during isolation */
+} __aligned(PAGE_SIZE);
+
+extern struct asi *asi_create(void);
+extern void asi_destroy(struct asi *asi);
+extern int asi_enter(struct asi *asi);
+extern void asi_exit(struct asi *asi);
+
+#endif /* CONFIG_ADDRESS_SPACE_ISOLATION */
+
+#endif
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 84373dc..dae5c8a 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -49,7 +49,9 @@ obj-$(CONFIG_X86_INTEL_MPX)   += mpx.o
 obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
 obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
 obj-$(CONFIG_PAGE_TABLE_ISOLATION) += pti.o
+obj-$(CONFIG_ADDRESS_SPACE_ISOLATION)  += asi.o
 
 obj-$(CONFIG_AMD_MEM_ENCRYPT)  += mem_encrypt.o
 obj-$(CONFIG_AMD_MEM_ENCRYPT)  += mem_encrypt_identity.o
 obj-$(CONFIG_AMD_MEM_ENCRYPT)  += mem_encrypt_boot.o
+
diff --git a/arch/x86/mm/asi.c b/arch/x86/mm/asi.c
new file mode 100644
index 000..c3993b7
--- /dev/null
+++ b/arch/x86/mm/asi.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Kernel Address Space Isolation (ASI)
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+/* ASI sessions, one per cpu */
+DEFINE_PER_CPU_PAGE_ALIGNED(struct asi_session, cpu_asi_session);
+
+static int asi_init_mapping(struct asi *asi)
+{
+   /*
+* TODO: Populate the ASI page-table with minimal mappings so
+* that we can at least enter isolation and abort.
+*/
+   return 0;
+}
+
+struct asi *asi_create(void)
+{
+   struct page *page;
+   struct asi *asi;
+   int err;
+
+   asi = kzalloc(sizeof(*asi), GFP_KERNEL);
+   if (!asi)
+   return NULL;
+
+   page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+   if (!page)
+   goto error;
+
+   asi->pgd = page_address(page);
+   spin_lock_init(&asi->lock);
+
+   err = asi_init_mapping(asi);
+   if (err)
+   goto error;
+
+   return asi;
+
+error:
+   asi_destroy(asi);
+   return NULL;
+}
+EXPORT_SYMBOL(asi_create);
+
+void asi_destroy(struct asi *asi)
+{
+   if (!asi)
+   return;
+
+   if (asi->pgd)
+   free_page((unsigned long)asi->pgd);
+
+   kfree(asi);
+}
+EXPORT_SYMBOL(asi_destroy);
+
+
+/*
+ * When isolation is active, the address space doesn't necessarily map
+ * the percpu offset value (this_cpu_off) which is used to get pointers
+ * to percpu variables. So functions which can be invoked while isolation
+ * is active shouldn't be getting po
