[PATCH 07/26] x86/kexec: support p4d_t

2017-03-12 Thread Kirill A. Shutemov
Handle additional page table level in kexec code.
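
For orientation, the transition page tables now chain through all five
levels. A minimal sketch of the lookup side using the standard helpers
(not part of the patch; the helper name is made up, and with 4-level
paging p4d_offset() simply hands the pgd entry back):

	static pte_t *transition_pte(pgd_t *pgd, unsigned long vaddr)
	{
		/* one table per level: pgd -> p4d -> pud -> pmd -> pte */
		p4d_t *p4d = p4d_offset(pgd + pgd_index(vaddr), vaddr);
		pud_t *pud = pud_offset(p4d, vaddr);
		pmd_t *pmd = pmd_offset(pud, vaddr);

		return pte_offset_kernel(pmd, vaddr);
	}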

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/include/asm/kexec.h   |  1 +
 arch/x86/kernel/machine_kexec_32.c |  4 +++-
 arch/x86/kernel/machine_kexec_64.c | 14 --
 3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 282630e4c6ea..70ef205489f0 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -164,6 +164,7 @@ struct kimage_arch {
 };
 #else
 struct kimage_arch {
+   p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
diff --git a/arch/x86/kernel/machine_kexec_32.c 
b/arch/x86/kernel/machine_kexec_32.c
index 469b23d6acc2..5f43cec296c5 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -103,6 +103,7 @@ static void machine_kexec_page_table_set_one(
pgd_t *pgd, pmd_t *pmd, pte_t *pte,
unsigned long vaddr, unsigned long paddr)
 {
+   p4d_t *p4d;
pud_t *pud;
 
pgd += pgd_index(vaddr);
@@ -110,7 +111,8 @@ static void machine_kexec_page_table_set_one(
if (!(pgd_val(*pgd) & _PAGE_PRESENT))
set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT));
 #endif
-   pud = pud_offset(pgd, vaddr);
+   p4d = p4d_offset(pgd, vaddr);
+   pud = pud_offset(p4d, vaddr);
pmd = pmd_offset(pud, vaddr);
if (!(pmd_val(*pmd) & _PAGE_PRESENT))
set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
diff --git a/arch/x86/kernel/machine_kexec_64.c 
b/arch/x86/kernel/machine_kexec_64.c
index 857cdbd02867..085c3b300d32 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -36,6 +36,7 @@ static struct kexec_file_ops *kexec_file_loaders[] = {
 
 static void free_transition_pgtable(struct kimage *image)
 {
+   free_page((unsigned long)image->arch.p4d);
free_page((unsigned long)image->arch.pud);
free_page((unsigned long)image->arch.pmd);
free_page((unsigned long)image->arch.pte);
@@ -43,6 +44,7 @@ static void free_transition_pgtable(struct kimage *image)
 
 static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
 {
+   p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -53,13 +55,21 @@ static int init_transition_pgtable(struct kimage *image, 
pgd_t *pgd)
paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
pgd += pgd_index(vaddr);
if (!pgd_present(*pgd)) {
+   p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
+   if (!p4d)
+   goto err;
+   image->arch.p4d = p4d;
+   set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+   }
+   p4d = p4d_offset(pgd, vaddr);
+   if (!p4d_present(*p4d)) {
pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
if (!pud)
goto err;
image->arch.pud = pud;
-   set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+   set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
}
-   pud = pud_offset(pgd, vaddr);
+   pud = pud_offset(p4d, vaddr);
if (!pud_present(*pud)) {
pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
if (!pmd)
-- 
2.11.0



[PATCH 21/26] x86/mm: add support of additional page table level during early boot

2017-03-12 Thread Kirill A. Shutemov
This patch adds support for 5-level paging during early boot.
It generalizes boot for 4- and 5-level paging on 64-bit systems with a
compile-time switch between them.
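
For reference, the scratch area filled by startup_32 below is a run of
consecutive 4K tables, with %edx holding the byte offset of the table
currently being built (a sketch of the resulting layout; an entry of
"next table + 0x007" means PRESENT|RW|USER, and 0x0183 is a 2M
PRESENT|RW|PS|GLOBAL entry):

	0x0000        top level: entry 0 points at the table at 0x1000
	0x1000        level 4 (only with CONFIG_X86_5LEVEL): entry 0 -> next table
	next page     level 3: 4 entries pointing at the level-2 tables
	next 4 pages  level 2: 2048 entries mapping the first 4G with 2M pages

Without CONFIG_X86_5LEVEL the level-4 step is skipped and the layout
collapses back to the old fixed pgtable + 0x1000 / + 0x2000 offsets.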

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/boot/compressed/head_64.S  | 23 +--
 arch/x86/include/asm/pgtable.h  |  2 +-
 arch/x86/include/asm/pgtable_64.h   |  6 ++-
 arch/x86/include/uapi/asm/processor-flags.h |  2 +
 arch/x86/kernel/espfix_64.c |  2 +-
 arch/x86/kernel/head64.c| 40 +-
 arch/x86/kernel/head_64.S   | 63 +
 arch/x86/kernel/machine_kexec_64.c  |  2 +-
 arch/x86/mm/dump_pagetables.c   |  2 +-
 arch/x86/mm/kasan_init_64.c | 12 +++---
 arch/x86/realmode/init.c|  2 +-
 arch/x86/xen/mmu.c  | 38 ++---
 arch/x86/xen/xen-pvh.S  |  2 +-
 13 files changed, 136 insertions(+), 60 deletions(-)

diff --git a/arch/x86/boot/compressed/head_64.S 
b/arch/x86/boot/compressed/head_64.S
index d2ae1f821e0c..3ed26769810b 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -122,9 +122,12 @@ ENTRY(startup_32)
addl    %ebp, gdt+2(%ebp)
lgdt    gdt(%ebp)
 
-   /* Enable PAE mode */
+   /* Enable PAE and LA57 mode */
movl    %cr4, %eax
orl     $X86_CR4_PAE, %eax
+#ifdef CONFIG_X86_5LEVEL
+   orl     $X86_CR4_LA57, %eax
+#endif
movl    %eax, %cr4
 
 /*
@@ -136,13 +139,24 @@ ENTRY(startup_32)
movl    $(BOOT_INIT_PGT_SIZE/4), %ecx
rep     stosl
 
+   xorl    %edx, %edx
+
+   /* Build Top Level */
+   leal    pgtable(%ebx,%edx,1), %edi
+   leal    0x1007 (%edi), %eax
+   movl    %eax, 0(%edi)
+
+#ifdef CONFIG_X86_5LEVEL
 /* Build Level 4 */
-   leal    pgtable + 0(%ebx), %edi
+   addl    $0x1000, %edx
+   leal    pgtable(%ebx,%edx), %edi
 leal    0x1007 (%edi), %eax
 movl    %eax, 0(%edi)
+#endif
 
 /* Build Level 3 */
-   leal    pgtable + 0x1000(%ebx), %edi
+   addl    $0x1000, %edx
+   leal    pgtable(%ebx,%edx), %edi
 leal    0x1007(%edi), %eax
 movl    $4, %ecx
 1: movl    %eax, 0x00(%edi)
@@ -152,7 +166,8 @@ ENTRY(startup_32)
 jnz     1b
 
 /* Build Level 2 */
-   leal    pgtable + 0x2000(%ebx), %edi
+   addl    $0x1000, %edx
+   leal    pgtable(%ebx,%edx), %edi
 movl    $0x0183, %eax
 movl    $2048, %ecx
1:  movl    %eax, 0(%edi)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 90f32116acd8..6cefd861ac65 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -917,7 +917,7 @@ extern pgd_t trampoline_pgd_entry;
 static inline void __meminit init_trampoline_default(void)
 {
/* Default trampoline pgd value */
-   trampoline_pgd_entry = init_level4_pgt[pgd_index(__PAGE_OFFSET)];
+   trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
 }
 # ifdef CONFIG_RANDOMIZE_MEMORY
 void __meminit init_trampoline(void);
diff --git a/arch/x86/include/asm/pgtable_64.h 
b/arch/x86/include/asm/pgtable_64.h
index 9991224f6238..c9e41f1599dd 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -14,15 +14,17 @@
 #include 
 #include 
 
+extern p4d_t level4_kernel_pgt[512];
+extern p4d_t level4_ident_pgt[512];
 extern pud_t level3_kernel_pgt[512];
 extern pud_t level3_ident_pgt[512];
 extern pmd_t level2_kernel_pgt[512];
 extern pmd_t level2_fixmap_pgt[512];
 extern pmd_t level2_ident_pgt[512];
 extern pte_t level1_fixmap_pgt[512];
-extern pgd_t init_level4_pgt[];
+extern pgd_t init_top_pgt[];
 
-#define swapper_pg_dir init_level4_pgt
+#define swapper_pg_dir init_top_pgt
 
 extern void paging_init(void);
 
diff --git a/arch/x86/include/uapi/asm/processor-flags.h 
b/arch/x86/include/uapi/asm/processor-flags.h
index 567de50a4c2a..185f3d10c194 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -104,6 +104,8 @@
 #define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT)
 #define X86_CR4_OSXMMEXCPT_BIT 10 /* enable unmasked SSE exceptions */
 #define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT)
+#define X86_CR4_LA57_BIT   12 /* enable 5-level page tables */
+#define X86_CR4_LA57   _BITUL(X86_CR4_LA57_BIT)
 #define X86_CR4_VMXE_BIT   13 /* enable VMX virtualization */
 #define X86_CR4_VMXE   _BITUL(X86_CR4_VMXE_BIT)
 #define X86_CR4_SMXE_BIT   14 /* enable safer mode (TXT) */
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 8e598a1ad986..6b91e2eb8d3f 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -125,7 +125,7 @@ void __init init_espfix_bsp(void)
p4d_t *p4d;
 
/* Install the espfix pud into the kernel page 


[PATCH 20/26] x86/espfix: support 5-level paging

2017-03-12 Thread Kirill A. Shutemov
We don't need extra virtual address space for ESPFIX, so it stays within
one PUD page table for both 4- and 5-level paging.
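
As a quick sanity check of the new limit (assuming the unchanged
ESPFIX_STACK_SIZE of 64 bytes and 4K pages): ESPFIX_PAGE_SPACE becomes
1 << (P4D_SHIFT - PAGE_SHIFT - 16) = 1 << (39 - 12 - 16) = 2048 pages,
and with ESPFIX_STACKS_PER_PAGE = 4096 / 64 = 64 that still covers
64 * 2048 = 131072 CPUs -- the same headroom as before, since P4D_SHIFT
is the same value the 4-level PGDIR_SHIFT used to have.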

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/kernel/espfix_64.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 04f89caef9c4..8e598a1ad986 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -50,11 +50,11 @@
 #define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE)
 
 /* There is address space for how many espfix pages? */
-#define ESPFIX_PAGE_SPACE  (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16))
+#define ESPFIX_PAGE_SPACE  (1UL << (P4D_SHIFT-PAGE_SHIFT-16))
 
 #define ESPFIX_MAX_CPUS    (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE)
 #if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS
-# error "Need more than one PGD for the ESPFIX hack"
+# error "Need more virtual address space for the ESPFIX hack"
 #endif
 
 #define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
@@ -121,11 +121,13 @@ static void init_espfix_random(void)
 
 void __init init_espfix_bsp(void)
 {
-   pgd_t *pgd_p;
+   pgd_t *pgd;
+   p4d_t *p4d;
 
/* Install the espfix pud into the kernel page directory */
-   pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
-   pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+   pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
+   p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR);
+   p4d_populate(&init_mm, p4d, espfix_pud_page);
 
/* Randomize the locations */
init_espfix_random();
-- 
2.11.0



Re: [PATCH] [media] atmel-isc: fix off-by-one comparison and out of bounds read issue

2017-03-12 Thread Wu, Songjun



On 3/9/2017 18:57, Hans Verkuil wrote:

Hi Songjun,

On 08/03/17 03:25, Wu, Songjun wrote:

Hi Colin,

Thank you for your comment.
It is a bug; it will be fixed in the next patch.


Do you mean that you will provide a new patch for this? Is there anything
wrong with this patch? It seems reasonable to me.


Hi Hans,

I see this patch is merged in git://linuxtv.org/media_tree.git.
So I do not need to submit the isc-pipeline-v3 patch, just submit the patches
based on the current master branch?



Regards,

Hans



On 3/7/2017 22:30, Colin King wrote:

From: Colin Ian King 

There are only HIST_ENTRIES worth of entries in hist_entry, however the
for-loop is iterating one too many times, leading to a read access off
the end of the array ctrls->hist_entry.  Fix this by iterating the
correct number of times.
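
(The general shape of the bug, for illustration only: with an N-element
array, a "<=" bound walks one index too far.)

	u32 buf[N];
	for (i = 0; i <= N; i++)	/* N+1 iterations: buf[N] is past the end */
		sum += buf[i];
	for (i = 0; i < N; i++)		/* N iterations: valid indices 0 .. N-1 */
		sum += buf[i];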

Detected by CoverityScan, CID#1415279 ("Out-of-bounds read")

Signed-off-by: Colin Ian King 
---
 drivers/media/platform/atmel/atmel-isc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/media/platform/atmel/atmel-isc.c 
b/drivers/media/platform/atmel/atmel-isc.c
index b380a7d..7dacf8c 100644
--- a/drivers/media/platform/atmel/atmel-isc.c
+++ b/drivers/media/platform/atmel/atmel-isc.c
@@ -1298,7 +1298,7 @@ static void isc_hist_count(struct isc_device *isc)
 regmap_bulk_read(regmap, ISC_HIS_ENTRY, hist_entry, HIST_ENTRIES);

 *hist_count = 0;
-for (i = 0; i <= HIST_ENTRIES; i++)
+for (i = 0; i < HIST_ENTRIES; i++)
 *hist_count += i * (*hist_entry++);
 }






Re: [PATCH v3 1/2] net: sched: make default fifo qdiscs appear in the dump

2017-03-12 Thread David Miller
From: Jiri Kosina 
Date: Wed, 8 Mar 2017 16:03:32 +0100 (CET)

> From: Jiri Kosina 
> 
> The original reason [1] for having hidden qdiscs (potential scalability
> issues in qdisc_match_from_root() with single linked list in case of large
> amount of qdiscs) has been invalidated by 59cc1f61f0 ("net: sched: convert
> qdisc linked list to hashtable").
> 
> This allows us to bring more clarity and determinism into the dump by
> making default pfifo qdiscs visible.
> 
> We're not turning this on by default though, as it was deemed [2] too
> intrusive / unnecessary a change of default behavior towards userspace.
> Instead, TCA_DUMP_INVISIBLE netlink attribute is introduced, which allows
> applications to request complete qdisc hierarchy dump, including the
> ones that have always been implicit/invisible.
> 
> Singleton noop_qdisc stays invisible, as teaching the whole infrastructure
> about singletons would require quite some surgery with very little gain
> (seeing no qdisc or seeing noop qdisc in the dump is probably setting
> the same user expectation).
> 
> [1] 
> http://lkml.kernel.org/r/1460732328.10638.74.ca...@edumazet-glaptop3.roam.corp.google.com
> [2] 
> http://lkml.kernel.org/r/20161021.105935.1907696543877061916.da...@davemloft.net
> 
> Signed-off-by: Jiri Kosina 

Applied, thanks Jiri.


[PATCH 06/26] x86/power: support p4d_t in hibernate code

2017-03-12 Thread Kirill A. Shutemov
set_up_temporary_text_mapping() and relocate_restore_code() require
trivial adjustments to handle the additional page table level.
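
Both functions use IS_ENABLED(CONFIG_X86_5LEVEL) rather than #ifdef, so
the p4d branch is compiled and type-checked in every configuration and
only the taken branch survives dead-code elimination. A minimal
illustration of the pattern (not code from this patch; nr_levels is a
made-up variable):

	if (IS_ENABLED(CONFIG_X86_5LEVEL))
		nr_levels = 5;	/* p4d is a real page table level */
	else
		nr_levels = 4;	/* p4d folds back into the pgd */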

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/power/hibernate_64.c | 49 ++-
 1 file changed, 35 insertions(+), 14 deletions(-)

diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index ded2e8272382..9ec941638932 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -49,6 +49,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
 {
pmd_t *pmd;
pud_t *pud;
+   p4d_t *p4d;
 
/*
 * The new mapping only has to cover the page containing the image
@@ -63,6 +64,13 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
 * the virtual address space after switching over to the original page
 * tables used by the image kernel.
 */
+
+   if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+   p4d = (p4d_t *)get_safe_page(GFP_ATOMIC);
+   if (!p4d)
+   return -ENOMEM;
+   }
+
pud = (pud_t *)get_safe_page(GFP_ATOMIC);
if (!pud)
return -ENOMEM;
@@ -75,8 +83,15 @@ static int set_up_temporary_text_mapping(pgd_t *pgd)
__pmd((jump_address_phys & PMD_MASK) | 
__PAGE_KERNEL_LARGE_EXEC));
set_pud(pud + pud_index(restore_jump_address),
__pud(__pa(pmd) | _KERNPG_TABLE));
-   set_pgd(pgd + pgd_index(restore_jump_address),
-   __pgd(__pa(pud) | _KERNPG_TABLE));
+   if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+   set_p4d(p4d + p4d_index(restore_jump_address),
+   __p4d(__pa(pud) | _KERNPG_TABLE));
+   set_pgd(pgd + pgd_index(restore_jump_address),
+   __pgd(__pa(p4d) | _KERNPG_TABLE));
+   } else {
+   set_pgd(pgd + pgd_index(restore_jump_address),
+   __pgd(__pa(pud) | _KERNPG_TABLE));
+   }
 
return 0;
 }
@@ -124,7 +139,10 @@ static int set_up_temporary_mappings(void)
 static int relocate_restore_code(void)
 {
pgd_t *pgd;
+   p4d_t *p4d;
pud_t *pud;
+   pmd_t *pmd;
+   pte_t *pte;
 
relocated_restore_code = get_safe_page(GFP_ATOMIC);
if (!relocated_restore_code)
@@ -134,22 +152,25 @@ static int relocate_restore_code(void)
 
/* Make the page containing the relocated code executable */
pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code);
-   pud = pud_offset(pgd, relocated_restore_code);
+   p4d = p4d_offset(pgd, relocated_restore_code);
+   if (p4d_large(*p4d)) {
+   set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX));
+   goto out;
+   }
+   pud = pud_offset(p4d, relocated_restore_code);
if (pud_large(*pud)) {
set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX));
-   } else {
-   pmd_t *pmd = pmd_offset(pud, relocated_restore_code);
-
-   if (pmd_large(*pmd)) {
-   set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX));
-   } else {
-   pte_t *pte = pte_offset_kernel(pmd, 
relocated_restore_code);
-
-   set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX));
-   }
+   goto out;
+   }
+   pmd = pmd_offset(pud, relocated_restore_code);
+   if (pmd_large(*pmd)) {
+   set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX));
+   goto out;
}
+   pte = pte_offset_kernel(pmd, relocated_restore_code);
+   set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX));
+out:
__flush_tlb_all();
-
return 0;
 }
 
-- 
2.11.0



[PATCH 24/26] x86/mm: add support for 5-level paging for KASLR

2017-03-12 Thread Kirill A. Shutemov
With 5-level paging, randomization happens at the P4D level instead of PUD.

The maximum amount of physical memory is also bumped to 52 bits for
5-level paging.
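
Concretely (a sketch of the arithmetic, using the constants visible in
the hunks below): with CONFIG_X86_5LEVEL the per-region entropy is masked
with P4D_MASK and regions are aligned to P4D_SIZE = 1UL << 39 = 512 GB
instead of PUD_SIZE = 1 GB, and the physical mapping region is sized for
1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) = 1 << (52 - 40) = 4096 TB
rather than the old hard-coded 64 TB.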

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/mm/kaslr.c | 82 -
 1 file changed, 63 insertions(+), 19 deletions(-)

diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 887e57182716..662e5c4b21c8 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -6,12 +6,12 @@
  *
  * Entropy is generated using the KASLR early boot functions now shared in
  * the lib directory (originally written by Kees Cook). Randomization is
- * done on PGD & PUD page table levels to increase possible addresses. The
- * physical memory mapping code was adapted to support PUD level virtual
- * addresses. This implementation on the best configuration provides 30,000
- * possible virtual addresses in average for each memory region. An additional
- * low memory page is used to ensure each CPU can start with a PGD aligned
- * virtual address (for realmode).
+ * done on PGD & P4D/PUD page table levels to increase possible addresses.
+ * The physical memory mapping code was adapted to support P4D/PUD level
+ * virtual addresses. This implementation on the best configuration provides
+ * 30,000 possible virtual addresses in average for each memory region.
+ * An additional low memory page is used to ensure each CPU can start with
+ * a PGD aligned virtual address (for realmode).
  *
  * The order of each memory region is not changed. The feature looks at
  * the available space for the regions based on different configuration
@@ -70,7 +70,8 @@ static __initdata struct kaslr_memory_region {
unsigned long *base;
unsigned long size_tb;
 } kaslr_regions[] = {
-   { &page_offset_base, 64 /* Maximum */ },
+   { &page_offset_base,
+   1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT) /* Maximum */ },
{ &vmalloc_base, VMALLOC_SIZE_TB },
{ &vmemmap_base, 1 },
 };
@@ -142,7 +143,10 @@ void __init kernel_randomize_memory(void)
 */
entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i);
prandom_bytes_state(&rand_state, &rand, sizeof(rand));
-   entropy = (rand % (entropy + 1)) & PUD_MASK;
+   if (IS_ENABLED(CONFIG_X86_5LEVEL))
+   entropy = (rand % (entropy + 1)) & P4D_MASK;
+   else
+   entropy = (rand % (entropy + 1)) & PUD_MASK;
vaddr += entropy;
*kaslr_regions[i].base = vaddr;
 
@@ -151,27 +155,21 @@ void __init kernel_randomize_memory(void)
 * randomization alignment.
 */
vaddr += get_padding(&kaslr_regions[i]);
-   vaddr = round_up(vaddr + 1, PUD_SIZE);
+   if (IS_ENABLED(CONFIG_X86_5LEVEL))
+   vaddr = round_up(vaddr + 1, P4D_SIZE);
+   else
+   vaddr = round_up(vaddr + 1, PUD_SIZE);
remain_entropy -= entropy;
}
 }
 
-/*
- * Create PGD aligned trampoline table to allow real mode initialization
- * of additional CPUs. Consume only 1 low memory page.
- */
-void __meminit init_trampoline(void)
+static void __meminit init_trampoline_pud(void)
 {
unsigned long paddr, paddr_next;
pgd_t *pgd;
pud_t *pud_page, *pud_page_tramp;
int i;
 
-   if (!kaslr_memory_enabled()) {
-   init_trampoline_default();
-   return;
-   }
-
pud_page_tramp = alloc_low_page();
 
paddr = 0;
@@ -192,3 +190,49 @@ void __meminit init_trampoline(void)
set_pgd(&trampoline_pgd_entry,
__pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
 }
+
+static void __meminit init_trampoline_p4d(void)
+{
+   unsigned long paddr, paddr_next;
+   pgd_t *pgd;
+   p4d_t *p4d_page, *p4d_page_tramp;
+   int i;
+
+   p4d_page_tramp = alloc_low_page();
+
+   paddr = 0;
+   pgd = pgd_offset_k((unsigned long)__va(paddr));
+   p4d_page = (p4d_t *) pgd_page_vaddr(*pgd);
+
+   for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) {
+   p4d_t *p4d, *p4d_tramp;
+   unsigned long vaddr = (unsigned long)__va(paddr);
+
+   p4d_tramp = p4d_page_tramp + p4d_index(paddr);
+   p4d = p4d_page + p4d_index(vaddr);
+   paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
+
+   *p4d_tramp = *p4d;
+   }
+
+   set_pgd(&trampoline_pgd_entry,
+   __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp)));
+}
+
+/*
+ * Create PGD aligned trampoline table to allow real mode initialization
+ * of additional CPUs. Consume only 1 low memory page.
+ */
+void __meminit init_trampoline(void)
+{
+
+   if (!kaslr_memory_enabled()) {
+   init_trampoline_default();
+   return;
+   }
+
+   if (IS_ENABLED(CONFIG_X86_5LEVEL))
+   init_trampoline_p4d();
+   else
+   init_trampoline_pud();
+}


[PATCH 19/26] x86/kasan: extend to support 5-level paging

2017-03-12 Thread Kirill A. Shutemov
This patch brings support for a non-folded additional page table level.

Signed-off-by: Kirill A. Shutemov 
Cc: Dmitry Vyukov 
[...]
+   for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
+   kasan_zero_p4d[i] = __p4d(p4d_val);
+
kasan_map_early_shadow(early_level4_pgt);
kasan_map_early_shadow(init_level4_pgt);
 }
-- 
2.11.0



[PATCH 12/26] x86: convert the rest of the code to support p4d_t

2017-03-12 Thread Kirill A. Shutemov
This patch converts x86 to use proper folding of the new page table level
with <asm-generic/pgtable-nop4d.h>.

That's a bit of kitchen sink, but I don't see how to split it further.
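
From the generic MM side the conversion simply adds one step to every
table walk; a sketch of allocating down to a PTE after this patch
(standard helpers, error handling omitted, not code from the patch):

	p4d = p4d_alloc(mm, pgd, addr);	/* new level; a thin wrapper when folded */
	pud = pud_alloc(mm, p4d, addr);	/* used to take the pgd directly */
	pmd = pmd_alloc(mm, pud, addr);
	pte = pte_alloc_map(mm, pmd, addr);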

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/include/asm/paravirt.h   |  33 +-
 arch/x86/include/asm/paravirt_types.h |  12 ++-
 arch/x86/include/asm/pgalloc.h|  35 ++-
 arch/x86/include/asm/pgtable.h|  59 ++-
 arch/x86/include/asm/pgtable_64.h |  12 ++-
 arch/x86/include/asm/pgtable_types.h  |  10 +-
 arch/x86/include/asm/xen/page.h   |   8 +-
 arch/x86/kernel/paravirt.c|  10 +-
 arch/x86/mm/init_64.c | 183 +++---
 arch/x86/xen/mmu.c| 152 
 include/trace/events/xen.h|  28 +++---
 11 files changed, 401 insertions(+), 141 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 0489884fdc44..158d877ce9e9 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -536,7 +536,7 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
val);
 }
-#if CONFIG_PGTABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS >= 4
 static inline pud_t __pud(pudval_t val)
 {
pudval_t ret;
@@ -565,6 +565,32 @@ static inline pudval_t pud_val(pud_t pud)
return ret;
 }
 
+static inline void pud_clear(pud_t *pudp)
+{
+   set_pud(pudp, __pud(0));
+}
+
+static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
+{
+   p4dval_t val = native_p4d_val(p4d);
+
+   if (sizeof(p4dval_t) > sizeof(long))
+   PVOP_VCALL3(pv_mmu_ops.set_p4d, p4dp,
+   val, (u64)val >> 32);
+   else
+   PVOP_VCALL2(pv_mmu_ops.set_p4d, p4dp,
+   val);
+}
+
+static inline void p4d_clear(p4d_t *p4dp)
+{
+   set_p4d(p4dp, __p4d(0));
+}
+
+#if CONFIG_PGTABLE_LEVELS >= 5
+
+#error FIXME
+
 static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
pgdval_t val = native_pgd_val(pgd);
@@ -582,10 +608,7 @@ static inline void pgd_clear(pgd_t *pgdp)
set_pgd(pgdp, __pgd(0));
 }
 
-static inline void pud_clear(pud_t *pudp)
-{
-   set_pud(pudp, __pud(0));
-}
+#endif  /* CONFIG_PGTABLE_LEVELS == 5 */
 
 #endif /* CONFIG_PGTABLE_LEVELS == 4 */
 
diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index b060f962d581..93c49cf09b63 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -279,12 +279,18 @@ struct pv_mmu_ops {
struct paravirt_callee_save pmd_val;
struct paravirt_callee_save make_pmd;
 
-#if CONFIG_PGTABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS >= 4
struct paravirt_callee_save pud_val;
struct paravirt_callee_save make_pud;
 
-   void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
-#endif /* CONFIG_PGTABLE_LEVELS == 4 */
+   void (*set_p4d)(p4d_t *p4dp, p4d_t p4dval);
+
+#if CONFIG_PGTABLE_LEVELS >= 5
+#error FIXME
+#endif /* CONFIG_PGTABLE_LEVELS >= 5 */
+
+#endif /* CONFIG_PGTABLE_LEVELS >= 4 */
+
 #endif /* CONFIG_PGTABLE_LEVELS >= 3 */
 
struct pv_lazy_ops lazy_mode;
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index b6d425999f99..2f585054c63c 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -121,10 +121,10 @@ static inline void pud_populate(struct mm_struct *mm, 
pud_t *pud, pmd_t *pmd)
 #endif /* CONFIG_X86_PAE */
 
 #if CONFIG_PGTABLE_LEVELS > 3
-static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
 {
paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
-   set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
+   set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud)));
 }
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
@@ -150,6 +150,37 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, 
pud_t *pud,
___pud_free_tlb(tlb, pud);
 }
 
+#if CONFIG_PGTABLE_LEVELS > 4
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d)
+{
+   paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT);
+   set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d)));
+}
+
+static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+   gfp_t gfp = GFP_KERNEL_ACCOUNT;
+
+   if (mm == _mm)
+   gfp &= ~__GFP_ACCOUNT;
+   return (p4d_t *)get_zeroed_page(gfp);
+}
+
+static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
+{
+   BUG_ON((unsigned long)p4d & (PAGE_SIZE-1));
+   free_page((unsigned long)p4d);
+}
+
+extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d);
+
+static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d,
+ unsigned long address)
+{


[PATCH 17/26] x86/mm: basic defines/helpers for CONFIG_X86_5LEVEL

2017-03-12 Thread Kirill A. Shutemov
Extends the page table headers to support the new paging mode.
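
With these constants a 57-bit virtual address decodes into 9 index bits
per level; a small sketch of the split (the address is made up for
illustration):

	unsigned long va = 0xff8899aabbccd000UL;

	pgd_index = (va >> 48) & 511;	/* PGDIR_SHIFT = 48, bits 56..48 */
	p4d_index = (va >> 39) & 511;	/* P4D_SHIFT   = 39, bits 47..39 */
	pud_index = (va >> 30) & 511;	/* bits 38..30 */
	pmd_index = (va >> 21) & 511;	/* bits 29..21 */
	pte_index = (va >> 12) & 511;	/* bits 20..12 */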

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/include/asm/pgtable_64.h   | 11 +++
 arch/x86/include/asm/pgtable_64_types.h | 20 +++
 arch/x86/include/asm/pgtable_types.h| 10 +-
 arch/x86/mm/pgtable.c   | 34 -
 4 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_64.h 
b/arch/x86/include/asm/pgtable_64.h
index 79396bfdc791..9991224f6238 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -35,6 +35,13 @@ extern void paging_init(void);
 #define pud_ERROR(e)   \
pr_err("%s:%d: bad pud %p(%016lx)\n",   \
   __FILE__, __LINE__, &(e), pud_val(e))
+
+#if CONFIG_PGTABLE_LEVELS >= 5
+#define p4d_ERROR(e)   \
+   pr_err("%s:%d: bad p4d %p(%016lx)\n",   \
+  __FILE__, __LINE__, &(e), p4d_val(e))
+#endif
+
 #define pgd_ERROR(e)   \
pr_err("%s:%d: bad pgd %p(%016lx)\n",   \
   __FILE__, __LINE__, &(e), pgd_val(e))
@@ -128,7 +135,11 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
 
 static inline void native_p4d_clear(p4d_t *p4d)
 {
+#ifdef CONFIG_X86_5LEVEL
+   native_set_p4d(p4d, native_make_p4d(0));
+#else
native_set_p4d(p4d, (p4d_t) { .pgd = native_make_pgd(0)});
+#endif
 }
 
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
diff --git a/arch/x86/include/asm/pgtable_64_types.h 
b/arch/x86/include/asm/pgtable_64_types.h
index 00dc0c2b456e..7ae641fdbd07 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -23,12 +23,32 @@ typedef struct { pteval_t pte; } pte_t;
 
 #define SHARED_KERNEL_PMD  0
 
+#ifdef CONFIG_X86_5LEVEL
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT    48
+#define PTRS_PER_PGD   512
+
+/*
+ * 4th level page in 5-level paging case
+ */
+#define P4D_SHIFT  39
+#define PTRS_PER_P4D   512
+#define P4D_SIZE   (_AC(1, UL) << P4D_SHIFT)
+#define P4D_MASK   (~(P4D_SIZE - 1))
+
+#else  /* CONFIG_X86_5LEVEL */
+
 /*
  * PGDIR_SHIFT determines what a top-level page table entry can map
  */
 #define PGDIR_SHIFT    39
 #define PTRS_PER_PGD   512
 
+#endif  /* CONFIG_X86_5LEVEL */
+
 /*
  * 3rd level page
  */
diff --git a/arch/x86/include/asm/pgtable_types.h 
b/arch/x86/include/asm/pgtable_types.h
index 4930afe9df0a..bf9638e1ee42 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -273,9 +273,17 @@ static inline pgdval_t pgd_flags(pgd_t pgd)
 }
 
 #if CONFIG_PGTABLE_LEVELS > 4
+typedef struct { p4dval_t p4d; } p4d_t;
 
-#error FIXME
+static inline p4d_t native_make_p4d(pudval_t val)
+{
+   return (p4d_t) { val };
+}
 
+static inline p4dval_t native_p4d_val(p4d_t p4d)
+{
+   return p4d.p4d;
+}
 #else
 #include 
 
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 38b6daf72deb..d26b066944a5 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -81,6 +81,14 @@ void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pud));
 }
+
+#if CONFIG_PGTABLE_LEVELS > 4
+void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
+{
+   paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
+   tlb_remove_page(tlb, virt_to_page(p4d));
+}
+#endif /* CONFIG_PGTABLE_LEVELS > 4 */
 #endif /* CONFIG_PGTABLE_LEVELS > 3 */
 #endif /* CONFIG_PGTABLE_LEVELS > 2 */
 
@@ -120,7 +128,7 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
   references from swapper_pg_dir. */
if (CONFIG_PGTABLE_LEVELS == 2 ||
(CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
-   CONFIG_PGTABLE_LEVELS == 4) {
+   CONFIG_PGTABLE_LEVELS >= 4) {
clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
@@ -582,6 +590,30 @@ void native_set_fixmap(enum fixed_addresses idx, 
phys_addr_t phys,
 }
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+#ifdef CONFIG_X86_5LEVEL
+/**
+ * p4d_set_huge - setup kernel P4D mapping
+ *
+ * No 512GB pages yet -- always return 0
+ *
+ * Returns 1 on success and 0 on failure.
+ */
+int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
+{
+   return 0;
+}
+
+/**
+ * p4d_clear_huge - clear kernel P4D mapping when it is set
+ *
+ * No 512GB pages yet -- always return 0
+ */
+int p4d_clear_huge(p4d_t *p4d)
+{
+   return 0;
+}
+#endif
+
 /**
  * pud_set_huge - setup kernel PUD mapping
  *
-- 
2.11.0



[PATCH 26/26] x86/mm: allow to have userspace mappings above 47-bits

2017-03-12 Thread Kirill A. Shutemov
On x86, 5-level paging enables a 56-bit userspace virtual address space.
Not all user space is ready to handle wide addresses. It's known that
at least some JIT compilers use higher bits in pointers to encode their
own information. This collides with valid pointers under 5-level paging
and leads to crashes.

To mitigate this, we are not going to allocate virtual address space
above 47 bits by default.

But userspace can ask for allocations from the full address space by
specifying a hint address above 47 bits (with or without MAP_FIXED).

If the hint address is above 47 bits but MAP_FIXED is not specified, we
first look for an unmapped area at the specified address. If it is
already occupied, we look for an unmapped area in the *full* address
space, rather than in the 47-bit window.
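
For example (a userspace sketch; any hint above the default 47-bit
window opts the mapping in to the wider address space):

	#include <sys/mman.h>

	void *hint = (void *)(1UL << 47);
	void *p = mmap(hint, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

With a NULL hint (or any hint below the boundary) the mapping stays
below the 47-bit DEFAULT_MAP_WINDOW, so existing programs see the same
addresses they always did.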

This approach makes it easy for an application's memory allocator to
become aware of the large address space without manually tracking the
allocated virtual address space.

One important case we need to handle here is interaction with MPX.
MPX (without the MAWA extension) cannot handle addresses above 47 bits,
so we need to make sure that MPX cannot be enabled if we already have a
VMA above the boundary, and forbid creating such VMAs once MPX is
enabled.

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/include/asm/elf.h   |  2 +-
 arch/x86/include/asm/mpx.h   |  9 +
 arch/x86/include/asm/processor.h |  9 ++---
 arch/x86/kernel/sys_x86_64.c | 28 +++-
 arch/x86/mm/hugetlbpage.c| 31 +++
 arch/x86/mm/mmap.c   |  4 ++--
 arch/x86/mm/mpx.c| 33 -
 7 files changed, 104 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 9d49c18b5ea9..265625b0d6cb 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -250,7 +250,7 @@ extern int force_personality32;
the loader.  We need to make sure that it is out of the way of the program
that it will "exec", and that there is sufficient room for the brk.  */
 
-#define ELF_ET_DYN_BASE    (TASK_SIZE / 3 * 2)
+#define ELF_ET_DYN_BASE    (DEFAULT_MAP_WINDOW / 3 * 2)
 
 /* This yields a mask that user programs can use to figure out what
instruction set this CPU supports.  This could be done in user space,
diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h
index a0d662be4c5b..7d7404756bb4 100644
--- a/arch/x86/include/asm/mpx.h
+++ b/arch/x86/include/asm/mpx.h
@@ -73,6 +73,9 @@ static inline void mpx_mm_init(struct mm_struct *mm)
 }
 void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma,
  unsigned long start, unsigned long end);
+
+unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len,
+   unsigned long flags);
 #else
 static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
 {
@@ -94,6 +97,12 @@ static inline void mpx_notify_unmap(struct mm_struct *mm,
unsigned long start, unsigned long end)
 {
 }
+
+static inline unsigned long mpx_unmapped_area_check(unsigned long addr,
+   unsigned long len, unsigned long flags)
+{
+   return addr;
+}
 #endif /* CONFIG_X86_INTEL_MPX */
 
 #endif /* _ASM_X86_MPX_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f385eca5407a..da8ab4f2d0c7 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -799,6 +799,7 @@ static inline void spin_lock_prefetch(const void *x)
  */
 #define TASK_SIZE  PAGE_OFFSET
 #define TASK_SIZE_MAX  TASK_SIZE
+#define DEFAULT_MAP_WINDOW TASK_SIZE
 #define STACK_TOP  TASK_SIZE
 #define STACK_TOP_MAX  STACK_TOP
 
@@ -838,7 +839,9 @@ static inline void spin_lock_prefetch(const void *x)
  * particular problem by preventing anything from being mapped
  * at the maximum canonical address.
  */
-#define TASK_SIZE_MAX  ((1UL << 47) - PAGE_SIZE)
+#define TASK_SIZE_MAX  ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE)
+
+#define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE)
 
 /* This decides where the kernel will search for a free chunk of vm
  * space during mmap's.
@@ -851,7 +854,7 @@ static inline void spin_lock_prefetch(const void *x)
 #define TASK_SIZE_OF(child)((test_tsk_thread_flag(child, TIF_ADDR32)) ? \
IA32_PAGE_OFFSET : TASK_SIZE_MAX)
 
-#define STACK_TOP  TASK_SIZE
+#define STACK_TOP  DEFAULT_MAP_WINDOW
 #define STACK_TOP_MAX  TASK_SIZE_MAX
 
 #define INIT_THREAD  { \
@@ -873,7 +876,7 @@ extern void start_thread(struct pt_regs *regs, unsigned 
long new_ip,
  * This decides where the kernel will search for a free chunk of vm
  * space during mmap's.
  */
-#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
+#define TASK_UNMAPPED_BASE 

[PATCH 17/26] x86/mm: basic defines/helpers for CONFIG_X86_5LEVEL

2017-03-12 Thread Kirill A. Shutemov
Extend the page table headers to support the new paging mode.
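
For orientation, with the shifts introduced below (PGDIR_SHIFT 48,
P4D_SHIFT 39, plus the existing PUD/PMD/PAGE shifts of 30, 21 and 12) a
56-bit user address decomposes into five 9-bit table indexes plus the
page offset. A small user-space sketch (the example address is
arbitrary):

#include <stdio.h>

int main(void)
{
        unsigned long va = 0x00ff123456789abcUL;  /* arbitrary 56-bit address */

        printf("pgd index: %lu\n", (va >> 48) & 0x1ff);
        printf("p4d index: %lu\n", (va >> 39) & 0x1ff);
        printf("pud index: %lu\n", (va >> 30) & 0x1ff);
        printf("pmd index: %lu\n", (va >> 21) & 0x1ff);
        printf("pte index: %lu\n", (va >> 12) & 0x1ff);
        printf("offset   : %lu\n", va & 0xfff);
        return 0;
}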

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/include/asm/pgtable_64.h   | 11 +++
 arch/x86/include/asm/pgtable_64_types.h | 20 +++
 arch/x86/include/asm/pgtable_types.h| 10 +-
 arch/x86/mm/pgtable.c   | 34 -
 4 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_64.h 
b/arch/x86/include/asm/pgtable_64.h
index 79396bfdc791..9991224f6238 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -35,6 +35,13 @@ extern void paging_init(void);
 #define pud_ERROR(e)   \
pr_err("%s:%d: bad pud %p(%016lx)\n",   \
   __FILE__, __LINE__, &(e), pud_val(e))
+
+#if CONFIG_PGTABLE_LEVELS >= 5
+#define p4d_ERROR(e)   \
+   pr_err("%s:%d: bad p4d %p(%016lx)\n",   \
+  __FILE__, __LINE__, &(e), p4d_val(e))
+#endif
+
 #define pgd_ERROR(e)   \
pr_err("%s:%d: bad pgd %p(%016lx)\n",   \
   __FILE__, __LINE__, &(e), pgd_val(e))
@@ -128,7 +135,11 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
 
 static inline void native_p4d_clear(p4d_t *p4d)
 {
+#ifdef CONFIG_X86_5LEVEL
+   native_set_p4d(p4d, native_make_p4d(0));
+#else
native_set_p4d(p4d, (p4d_t) { .pgd = native_make_pgd(0)});
+#endif
 }
 
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
diff --git a/arch/x86/include/asm/pgtable_64_types.h 
b/arch/x86/include/asm/pgtable_64_types.h
index 00dc0c2b456e..7ae641fdbd07 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -23,12 +23,32 @@ typedef struct { pteval_t pte; } pte_t;
 
 #define SHARED_KERNEL_PMD  0
 
+#ifdef CONFIG_X86_5LEVEL
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT48
+#define PTRS_PER_PGD   512
+
+/*
+ * 4th level page in 5-level paging case
+ */
+#define P4D_SHIFT  39
+#define PTRS_PER_P4D   512
+#define P4D_SIZE   (_AC(1, UL) << P4D_SHIFT)
+#define P4D_MASK   (~(P4D_SIZE - 1))
+
+#else  /* CONFIG_X86_5LEVEL */
+
 /*
  * PGDIR_SHIFT determines what a top-level page table entry can map
  */
 #define PGDIR_SHIFT39
 #define PTRS_PER_PGD   512
 
+#endif  /* CONFIG_X86_5LEVEL */
+
 /*
  * 3rd level page
  */
diff --git a/arch/x86/include/asm/pgtable_types.h 
b/arch/x86/include/asm/pgtable_types.h
index 4930afe9df0a..bf9638e1ee42 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -273,9 +273,17 @@ static inline pgdval_t pgd_flags(pgd_t pgd)
 }
 
 #if CONFIG_PGTABLE_LEVELS > 4
+typedef struct { p4dval_t p4d; } p4d_t;
 
-#error FIXME
+static inline p4d_t native_make_p4d(pudval_t val)
+{
+   return (p4d_t) { val };
+}
 
+static inline p4dval_t native_p4d_val(p4d_t p4d)
+{
+   return p4d.p4d;
+}
 #else
 #include 
 
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 38b6daf72deb..d26b066944a5 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -81,6 +81,14 @@ void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pud));
 }
+
+#if CONFIG_PGTABLE_LEVELS > 4
+void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
+{
+   paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
+   tlb_remove_page(tlb, virt_to_page(p4d));
+}
+#endif /* CONFIG_PGTABLE_LEVELS > 4 */
 #endif /* CONFIG_PGTABLE_LEVELS > 3 */
 #endif /* CONFIG_PGTABLE_LEVELS > 2 */
 
@@ -120,7 +128,7 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
   references from swapper_pg_dir. */
if (CONFIG_PGTABLE_LEVELS == 2 ||
(CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
-   CONFIG_PGTABLE_LEVELS == 4) {
+   CONFIG_PGTABLE_LEVELS >= 4) {
clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
@@ -582,6 +590,30 @@ void native_set_fixmap(enum fixed_addresses idx, 
phys_addr_t phys,
 }
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+#ifdef CONFIG_X86_5LEVEL
+/**
+ * p4d_set_huge - setup kernel P4D mapping
+ *
+ * No 512GB pages yet -- always return 0
+ *
+ * Returns 1 on success and 0 on failure.
+ */
+int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
+{
+   return 0;
+}
+
+/**
+ * p4d_clear_huge - clear kernel P4D mapping when it is set
+ *
+ * No 512GB pages yet -- always return 0
+ */
+int p4d_clear_huge(p4d_t *p4d)
+{
+   return 0;
+}
+#endif
+
 /**
  * pud_set_huge - setup kernel PUD mapping
  *
-- 
2.11.0



[PATCH 23/26] x86/mm: make kernel_physical_mapping_init() support 5-level paging

2017-03-12 Thread Kirill A. Shutemov
Properly populate the additional page table level if CONFIG_X86_5LEVEL
is enabled.
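
The new phys_p4d_init() below leans on IS_ENABLED() so that the 4-level
configuration compiles down to a direct call into the pud walker. A
stand-alone illustration of that compile-time dispatch pattern (the
MY_5LEVEL macro is a made-up stand-in for IS_ENABLED(CONFIG_X86_5LEVEL)):

#include <stdio.h>

#define MY_5LEVEL 0     /* stand-in for IS_ENABLED(CONFIG_X86_5LEVEL) */

static void populate_pud_level(void)
{
        printf("populating pud level\n");
}

static void populate_p4d_level(void)
{
        if (!MY_5LEVEL) {
                /* p4d folded into pgd: go straight to the pud walker */
                populate_pud_level();
                return;
        }
        printf("populating p4d level\n");
        populate_pud_level();
}

int main(void)
{
        populate_p4d_level();
        return 0;
}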

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/mm/init_64.c | 71 ---
 1 file changed, 62 insertions(+), 9 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5ba99090dc3c..ef117a69f74e 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -622,6 +622,58 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, 
unsigned long paddr_end,
return paddr_last;
 }
 
+static unsigned long __meminit
+phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
+ unsigned long page_size_mask)
+{
+   unsigned long paddr_next, paddr_last = paddr_end;
+   unsigned long vaddr = (unsigned long)__va(paddr);
+   int i = p4d_index(vaddr);
+
+   if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+   return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end, 
page_size_mask);
+
+   for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
+   p4d_t *p4d;
+   pud_t *pud;
+
+   vaddr = (unsigned long)__va(paddr);
+   p4d = p4d_page + p4d_index(vaddr);
+   paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
+
+   if (paddr >= paddr_end) {
+   if (!after_bootmem &&
+   !e820_any_mapped(paddr & P4D_MASK, paddr_next,
+E820_RAM) &&
+   !e820_any_mapped(paddr & P4D_MASK, paddr_next,
+E820_RESERVED_KERN)) {
+   set_p4d(p4d, __p4d(0));
+   }
+   continue;
+   }
+
+   if (!p4d_none(*p4d)) {
+   pud = pud_offset(p4d, 0);
+   paddr_last = phys_pud_init(pud, paddr,
+   paddr_end,
+   page_size_mask);
+   __flush_tlb_all();
+   continue;
+   }
+
+   pud = alloc_low_page();
+   paddr_last = phys_pud_init(pud, paddr, paddr_end,
+  page_size_mask);
+
+   spin_lock(&init_mm.page_table_lock);
+   p4d_populate(&init_mm, p4d, pud);
+   spin_unlock(&init_mm.page_table_lock);
+   }
+   __flush_tlb_all();
+
+   return paddr_last;
+}
+
 /*
  * Create page table mapping for the physical memory for specific physical
  * addresses. The virtual and physical addresses have to be aligned on PMD 
level
@@ -643,26 +695,27 @@ kernel_physical_mapping_init(unsigned long paddr_start,
for (; vaddr < vaddr_end; vaddr = vaddr_next) {
pgd_t *pgd = pgd_offset_k(vaddr);
p4d_t *p4d;
-   pud_t *pud;
 
vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE;
 
-   BUILD_BUG_ON(pgd_none(*pgd));
-   p4d = p4d_offset(pgd, vaddr);
-   if (p4d_val(*p4d)) {
-   pud = (pud_t *)p4d_page_vaddr(*p4d);
-   paddr_last = phys_pud_init(pud, __pa(vaddr),
+   if (pgd_val(*pgd)) {
+   p4d = (p4d_t *)pgd_page_vaddr(*pgd);
+   paddr_last = phys_p4d_init(p4d, __pa(vaddr),
   __pa(vaddr_end),
   page_size_mask);
continue;
}
 
-   pud = alloc_low_page();
-   paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end),
+   p4d = alloc_low_page();
+   paddr_last = phys_p4d_init(p4d, __pa(vaddr), __pa(vaddr_end),
   page_size_mask);
 
	spin_lock(&init_mm.page_table_lock);
-   p4d_populate(&init_mm, p4d, pud);
+   if (IS_ENABLED(CONFIG_X86_5LEVEL))
+           pgd_populate(&init_mm, pgd, p4d);
+   else
+           p4d_populate(&init_mm, p4d_offset(pgd, vaddr),
+                   (pud_t *) p4d);
	spin_unlock(&init_mm.page_table_lock);
pgd_changed = true;
}
-- 
2.11.0



[PATCH 05/26] x86/mm: add support of p4d_t in vmalloc_fault()

2017-03-12 Thread Kirill A. Shutemov
With 4-level paging, copying happens at the p4d level, as pgd_none() is
always false when p4d_t is folded.
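
For context, a simplified model of what "folded" means here (this only
mirrors the generic no-p4d helpers in spirit, it is not the kernel's
exact code): the p4d level is just a view of the pgd entry, so
pgd_none() can never report an empty entry and the real emptiness check
has to happen at the p4d level.

#include <stdio.h>

/* Simplified sketch of folded-p4d semantics (not the kernel's code). */
typedef struct { unsigned long pgd; } pgd_t;
typedef struct { pgd_t pgd; } p4d_t;

static int pgd_none(pgd_t pgd)       { (void)pgd; return 0; }  /* never "none" */
static p4d_t *p4d_offset(pgd_t *pgd) { return (p4d_t *)pgd; }  /* same entry   */
static int p4d_none(p4d_t p4d)       { return p4d.pgd.pgd == 0; }

int main(void)
{
        pgd_t pgd = { 0 };      /* genuinely empty entry */

        printf("pgd_none: %d\n", pgd_none(pgd));                   /* 0 */
        printf("p4d_none: %d\n", p4d_none(*p4d_offset(&pgd)));     /* 1 */
        return 0;
}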

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/mm/fault.c | 27 ---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 605fd5e8e048..88040bb2b78a 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -435,6 +435,7 @@ void vmalloc_sync_all(void)
 static noinline int vmalloc_fault(unsigned long address)
 {
pgd_t *pgd, *pgd_ref;
+   p4d_t *p4d, *p4d_ref;
pud_t *pud, *pud_ref;
pmd_t *pmd, *pmd_ref;
pte_t *pte, *pte_ref;
@@ -458,17 +459,37 @@ static noinline int vmalloc_fault(unsigned long address)
if (pgd_none(*pgd)) {
set_pgd(pgd, *pgd_ref);
arch_flush_lazy_mmu_mode();
-   } else {
+   } else if (CONFIG_PGTABLE_LEVELS > 4) {
+   /*
+* With folded p4d, pgd_none() is always false. So pgd may
+* point to empty page table entry and pgd_page_vaddr()
+* will return garbage.
+*
+* We will do the correct sanity check on p4d level.
+*/
BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
}
 
+   /* With 4-level paging copying happens on p4d level. */
+   p4d = p4d_offset(pgd, address);
+   p4d_ref = p4d_offset(pgd_ref, address);
+   if (p4d_none(*p4d_ref))
+   return -1;
+
+   if (p4d_none(*p4d)) {
+   set_p4d(p4d, *p4d_ref);
+   arch_flush_lazy_mmu_mode();
+   } else {
+   BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_ref));
+   }
+
/*
 * Below here mismatches are bugs because these lower tables
 * are shared:
 */
 
-   pud = pud_offset(pgd, address);
-   pud_ref = pud_offset(pgd_ref, address);
+   pud = pud_offset(p4d, address);
+   pud_ref = pud_offset(p4d_ref, address);
if (pud_none(*pud_ref))
return -1;
 
-- 
2.11.0



[PATCH 15/26] x86/mm: define virtual memory map for 5-level paging

2017-03-12 Thread Kirill A. Shutemov
The first part of the memory map (up to the %esp fixup stacks) simply
scales the existing map for 4-level paging by 9 bits -- the number of
address bits handled by the additional page table level.

The rest of the map is unchanged.
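
The [56:63] sign-extension rule described in the updated documentation
can be checked with a few lines of user-space C (a sketch; 48 and 57 are
the implemented virtual-address widths for 4- and 5-level paging):

#include <stdio.h>
#include <stdbool.h>

/* Canonical if bits [63:va_bits-1] all equal bit (va_bits-1). */
static bool is_canonical(unsigned long addr, int va_bits)
{
        long sext = (long)(addr << (64 - va_bits)) >> (64 - va_bits);

        return (unsigned long)sext == addr;
}

int main(void)
{
        unsigned long addr = 0x00ff000000000000UL;      /* needs 57-bit VA */

        printf("canonical with 48-bit VA: %d\n", is_canonical(addr, 48));
        printf("canonical with 57-bit VA: %d\n", is_canonical(addr, 57));
        return 0;
}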

Signed-off-by: Kirill A. Shutemov 
---
 Documentation/x86/x86_64/mm.txt | 33 ++---
 arch/x86/Kconfig|  1 +
 arch/x86/include/asm/kasan.h|  9 ++---
 arch/x86/include/asm/page_64_types.h| 10 ++
 arch/x86/include/asm/pgtable_64_types.h |  6 ++
 arch/x86/include/asm/sparsemem.h|  9 +++--
 6 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 5724092db811..0303a47b82f8 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -4,7 +4,7 @@
 Virtual memory map with 4 level page tables:
 
  - 7fff (=47 bits) user space, different per mm
-hole caused by [48:63] sign extension
+hole caused by [47:63] sign extension
 8000 - 87ff (=43 bits) guard hole, reserved for 
hypervisor
 8800 - c7ff (=64 TB) direct mapping of all phys. memory
 c800 - c8ff (=40 bits) hole
@@ -23,12 +23,39 @@ a000 - ff5f (=1526 MB) module 
mapping space
 ff60 - ffdf (=8 MB) vsyscalls
 ffe0 -  (=2 MB) unused hole
 
+Virtual memory map with 5 level page tables:
+
+ - 00ff (=56 bits) user space, different per mm
+hole caused by [56:63] sign extension
+ff00 - ff0f (=52 bits) guard hole, reserved for 
hypervisor
+ff10 - ff8f (=55 bits) direct mapping of all phys. 
memory
+ff90 - ff91 (=49 bits) hole
+ff92 - ffd1 (=54 bits) vmalloc/ioremap space
+ffd2 - ffd3 (=49 bits) hole
+ffd4 - ffd5 (=49 bits) virtual memory map (512TB)
+... unused hole ...
+ffd8 - fff7 (=53 bits) kasan shadow memory (8PB)
+... unused hole ...
+fffe - fffe007f (=39 bits) %esp fixup stacks
+... unused hole ...
+ffef - fffe (=64 GB) EFI region mapping space
+... unused hole ...
+8000 - 9fff (=512 MB)  kernel text mapping, from phys 0
+a000 - ff5f (=1526 MB) module mapping space
+ff60 - ffdf (=8 MB) vsyscalls
+ffe0 -  (=2 MB) unused hole
+
+Architecture defines a 64-bit virtual address. Implementations can support
+less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
+through to the most-significant implemented bit are set to either all ones
+or all zero. This causes hole between user space and kernel addresses.
+
 The direct mapping covers all memory in the system up to the highest
 memory address (this means in some cases it can also include PCI memory
 holes).
 
-vmalloc space is lazily synchronized into the different PML4 pages of
-the processes using the page fault handler, with init_level4_pgt as
+vmalloc space is lazily synchronized into the different PML4/PML5 pages of
+the processes using the page fault handler, with init_top_pgt as
 reference.
 
 Current X86-64 implementations support up to 46 bits of address space (64 TB),
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc98d5a294ee..747f06f00a22 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -290,6 +290,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
 config KASAN_SHADOW_OFFSET
hex
depends on KASAN
+   default 0xdff8 if X86_5LEVEL
default 0xdc00
 
 config HAVE_INTEL_TXT
diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
index 1410b567ecde..f527b02a0ee3 100644
--- a/arch/x86/include/asm/kasan.h
+++ b/arch/x86/include/asm/kasan.h
@@ -11,9 +11,12 @@
  * 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT
  */
 #define KASAN_SHADOW_START  (KASAN_SHADOW_OFFSET + \
-   (0x8000ULL >> 3))
-/* 47 bits for kernel address -> (47 - 3) bits for shadow */
-#define KASAN_SHADOW_END(KASAN_SHADOW_START + (1ULL << (47 - 3)))
+   ((-1UL << __VIRTUAL_MASK_SHIFT) >> 3))
+/*
+ * 47 bits for kernel address -> (47 - 3) bits for shadow
+ * 56 bits for kernel address -> (56 - 3) bits for shadow
+ */
+#define KASAN_SHADOW_END(KASAN_SHADOW_START + (1ULL << 
(__VIRTUAL_MASK_SHIFT - 3)))
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/x86/include/asm/page_64_types.h 
b/arch/x86/include/asm/page_64_types.h
index 9215e0527647..3f5f08b010d0 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -36,7 +36,12 @@
  * hypervisor to fit.  Choosing 16 slots here is 

[PATCH 11/26] x86/xen: convert __xen_pgd_walk() and xen_cleanmfnmap() to support p4d

2017-03-12 Thread Kirill A. Shutemov
Split these helpers into a few per-level functions and add p4d support.

Signed-off-by: Xiong Zhang 
[kirill.shute...@linux.intel.com: split off into separate patch]
Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/xen/mmu.c | 245 -
 arch/x86/xen/mmu.h |   1 +
 2 files changed, 150 insertions(+), 96 deletions(-)

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 37cb5aad71de..c49e165fde60 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -593,6 +593,64 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val)
 }
 #endif /* CONFIG_PGTABLE_LEVELS == 4 */
 
+static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
+   int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
+   bool last, unsigned long limit)
+{
+   int i, nr, flush = 0;
+
+   nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD;
+   for (i = 0; i < nr; i++) {
+   if (!pmd_none(pmd[i]))
+   flush |= (*func)(mm, pmd_page(pmd[i]), PT_PTE);
+   }
+   return flush;
+}
+
+static int xen_pud_walk(struct mm_struct *mm, pud_t *pud,
+   int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
+   bool last, unsigned long limit)
+{
+   int i, nr, flush = 0;
+
+   nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD;
+   for (i = 0; i < nr; i++) {
+   pmd_t *pmd;
+
+   if (pud_none(pud[i]))
+   continue;
+
+   pmd = pmd_offset(&pud[i], 0);
+   if (PTRS_PER_PMD > 1)
+   flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
+   flush |= xen_pmd_walk(mm, pmd, func,
+   last && i == nr - 1, limit);
+   }
+   return flush;
+}
+
+static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
+   int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
+   bool last, unsigned long limit)
+{
+   int i, nr, flush = 0;
+
+   nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D;
+   for (i = 0; i < nr; i++) {
+   pud_t *pud;
+
+   if (p4d_none(p4d[i]))
+   continue;
+
+   pud = pud_offset(&p4d[i], 0);
+   if (PTRS_PER_PUD > 1)
+   flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
+   flush |= xen_pud_walk(mm, pud, func,
+   last && i == nr - 1, limit);
+   }
+   return flush;
+}
+
 /*
  * (Yet another) pagetable walker.  This one is intended for pinning a
  * pagetable.  This means that it walks a pagetable and calls the
@@ -613,10 +671,8 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
  enum pt_level),
  unsigned long limit)
 {
-   int flush = 0;
+   int i, nr, flush = 0;
unsigned hole_low, hole_high;
-   unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
-   unsigned pgdidx, pudidx, pmdidx;
 
/* The limit is the last byte to be touched */
limit--;
@@ -633,65 +689,22 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t 
*pgd,
hole_low = pgd_index(USER_LIMIT);
hole_high = pgd_index(PAGE_OFFSET);
 
-   pgdidx_limit = pgd_index(limit);
-#if PTRS_PER_PUD > 1
-   pudidx_limit = pud_index(limit);
-#else
-   pudidx_limit = 0;
-#endif
-#if PTRS_PER_PMD > 1
-   pmdidx_limit = pmd_index(limit);
-#else
-   pmdidx_limit = 0;
-#endif
-
-   for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
-   pud_t *pud;
+   nr = pgd_index(limit) + 1;
+   for (i = 0; i < nr; i++) {
+   p4d_t *p4d;
 
-   if (pgdidx >= hole_low && pgdidx < hole_high)
+   if (i >= hole_low && i < hole_high)
continue;
 
-   if (!pgd_val(pgd[pgdidx]))
+   if (pgd_none(pgd[i]))
continue;
 
-   pud = pud_offset(&pgd[pgdidx], 0);
-
-   if (PTRS_PER_PUD > 1) /* not folded */
-   flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
-
-   for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
-   pmd_t *pmd;
-
-   if (pgdidx == pgdidx_limit &&
-   pudidx > pudidx_limit)
-   goto out;
-
-   if (pud_none(pud[pudidx]))
-   continue;
-
-   pmd = pmd_offset(&pud[pudidx], 0);
-
-   if (PTRS_PER_PMD > 1) /* not folded */
-   flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
-
-   for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
-   struct page *pte;
-
-   if (pgdidx == pgdidx_limit &&
- 

[PATCH 18/26] x86/dump_pagetables: support 5-level paging

2017-03-12 Thread Kirill A. Shutemov
Simple extension to support one more page table level.

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/mm/dump_pagetables.c | 49 ---
 1 file changed, 42 insertions(+), 7 deletions(-)

diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 58b5bee7ea27..0effac6989cd 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -110,7 +110,8 @@ static struct addr_marker address_markers[] = {
 #define PTE_LEVEL_MULT (PAGE_SIZE)
 #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
 #define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
-#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+#define PGD_LEVEL_MULT (PTRS_PER_PUD * P4D_LEVEL_MULT)
 
 #define pt_dump_seq_printf(m, to_dmesg, fmt, args...)  \
 ({ \
@@ -347,7 +348,7 @@ static bool pud_already_checked(pud_t *prev_pud, pud_t 
*pud, bool checkwx)
return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud));
 }
 
-static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
+static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
unsigned long P)
 {
int i;
@@ -355,7 +356,7 @@ static void walk_pud_level(struct seq_file *m, struct 
pg_state *st, pgd_t addr,
pgprotval_t prot;
pud_t *prev_pud = NULL;
 
-   start = (pud_t *) pgd_page_vaddr(addr);
+   start = (pud_t *) p4d_page_vaddr(addr);
 
for (i = 0; i < PTRS_PER_PUD; i++) {
st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
@@ -377,9 +378,43 @@ static void walk_pud_level(struct seq_file *m, struct 
pg_state *st, pgd_t addr,
 }
 
 #else
-#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p)
-#define pgd_large(a) pud_large(__pud(pgd_val(a)))
-#define pgd_none(a)  pud_none(__pud(pgd_val(a)))
+#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p)
+#define p4d_large(a) pud_large(__pud(p4d_val(a)))
+#define p4d_none(a)  pud_none(__pud(p4d_val(a)))
+#endif
+
+#if PTRS_PER_P4D > 1
+
+static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
+   unsigned long P)
+{
+   int i;
+   p4d_t *start;
+   pgprotval_t prot;
+
+   start = (p4d_t *) pgd_page_vaddr(addr);
+
+   for (i = 0; i < PTRS_PER_P4D; i++) {
+   st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
+   if (!p4d_none(*start)) {
+   if (p4d_large(*start) || !p4d_present(*start)) {
+   prot = p4d_flags(*start);
+   note_page(m, st, __pgprot(prot), 2);
+   } else {
+   walk_pud_level(m, st, *start,
+  P + i * P4D_LEVEL_MULT);
+   }
+   } else
+   note_page(m, st, __pgprot(0), 2);
+
+   start++;
+   }
+}
+
+#else
+#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p)
+#define pgd_large(a) p4d_large(__p4d(pgd_val(a)))
+#define pgd_none(a)  p4d_none(__p4d(pgd_val(a)))
 #endif
 
 static inline bool is_hypervisor_range(int idx)
@@ -424,7 +459,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, 
pgd_t *pgd,
prot = pgd_flags(*start);
note_page(m, &st, __pgprot(prot), 1);
} else {
-   walk_pud_level(m, &st, *start,
+   walk_p4d_level(m, &st, *start,
   i * PGD_LEVEL_MULT);
}
} else
-- 
2.11.0



[PATCH 00/26] x86: 5-level paging enabling for v4.12

2017-03-12 Thread Kirill A. Shutemov
Here is v5 of the 5-level paging patchset. Please review and consider applying.

== Overview ==

x86-64 is currently limited to 256 TiB of virtual address space and 64 TiB
of physical address space. We are already bumping into this limit: some
vendors offer servers with 64 TiB of memory today.

To overcome the limitation, upcoming hardware will introduce support for
5-level paging[1]. It is a straightforward extension of the current page
table structure, adding one more layer of translation.

It bumps the limits to 128 PiB of virtual address space and 4 PiB of
physical address space. This "ought to be enough for anybody" ©.
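
The quoted limits follow directly from the address widths: 4-level
paging gives 48 virtual and 46 physical address bits, 5-level paging
gives 57 and 52. A quick arithmetic check:

#include <stdio.h>

int main(void)
{
        printf("2^48 bytes = %llu TiB\n", (1ULL << 48) >> 40);  /* 256 */
        printf("2^46 bytes = %llu TiB\n", (1ULL << 46) >> 40);  /*  64 */
        printf("2^57 bytes = %llu PiB\n", (1ULL << 57) >> 50);  /* 128 */
        printf("2^52 bytes = %llu PiB\n", (1ULL << 52) >> 50);  /*   4 */
        return 0;
}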

==  Patches ==

The patchset is built on top of v4.11-rc2.

Current QEMU upstream git supports 5-level paging. Use "-cpu qemu64,+la57"
to enable it.

Patches 1-12:
Convert x86 to properly folded p4d layer using
.

Patches 13-25:
Enabling of real 5-level paging.

CONFIG_X86_5LEVEL=y will enable new paging mode.

Patch 26:
Introduce new prctl(2) handles -- PR_SET_MAX_VADDR and PR_GET_MAX_VADDR.

This aims to address the compatibility issue. It only supports x86 for
now, but should be useful for other architectures.

Git:
git://git.kernel.org/pub/scm/linux/kernel/git/kas/linux.git la57/v5

== TODO ==

There is still work to do:

  - CONFIG_XEN is broken for 5-level paging.

Xen for 5-level paging requires more work to get functional.

Xen on 4-level paging works, so it's not a regression.

  - Boot-time switch between 4- and 5-level paging.

We assume that distributions will be keen to avoid returning to the
i386 days where we shipped one kernel binary for each page table
layout.

As the page table format is the same for 4- and 5-level paging, it
should be possible to have a single kernel binary and switch between
them at boot time without too much hassle.

For now I have only implemented the compile-time switch.

This will be implemented in a separate patchset.

== Changelog ==

  v5:
- Rebased to v4.11-rc2;
- Fix false-positive BUG_ON() in vmalloc_fault() with 4-level paging
  enabled;
- __xen_pgd_walk(): do not miss required flushes;
- Fix build with CONFIG_XEN_PVH=y;
  v4:
- Rebased to v4.11-rc1;
- Use mmap() hint address to allocate virtual address space above
  47-bits instead of prctl() handles.
  v3:
- Rebased to v4.10-rc5;
- prctl() handles for large address space opt-in;
- Xen works for 4-level paging;
- EFI boot fixed for both 4- and 5-level paging;
- Hibernation fixed for 4-level paging;
- kexec() fixed;
- Couple of build fixes;
  v2:
- Rebased to v4.10-rc1;
- RLIMIT_VADDR proposal;
- Fix virtual map and update documentation;
- Fix few build errors;
- Rework cpuid helpers in boot code;
- Fix espfix code to work with 5-level pages;

[1] 
https://software.intel.com/sites/default/files/managed/2b/80/5-level_paging_white_paper.pdf

Kirill A. Shutemov (26):
  x86: basic changes into headers for 5-level paging
  x86: trivial portion of 5-level paging conversion
  x86/gup: add 5-level paging support
  x86/ident_map: add 5-level paging support
  x86/mm: add support of p4d_t in vmalloc_fault()
  x86/power: support p4d_t in hibernate code
  x86/kexec: support p4d_t
  x86/efi: handle p4d in EFI pagetables
  x86/mm/pat: handle additional page table
  x86/kasan: prepare clear_pgds() to switch to

  x86/xen: convert __xen_pgd_walk() and xen_cleanmfnmap() to support p4d
  x86: convert the rest of the code to support p4d_t
  x86: detect 5-level paging support
  x86/asm: remove __VIRTUAL_MASK_SHIFT==47 assert
  x86/mm: define virtual memory map for 5-level paging
  x86/paravirt: make paravirt code support 5-level paging
  x86/mm: basic defines/helpers for CONFIG_X86_5LEVEL
  x86/dump_pagetables: support 5-level paging
  x86/kasan: extend to support 5-level paging
  x86/espfix: support 5-level paging
  x86/mm: add support of additional page table level during early boot
  x86/mm: add sync_global_pgds() for configuration with 5-level paging
  x86/mm: make kernel_physical_mapping_init() support 5-level paging
  x86/mm: add support for 5-level paging for KASLR
  x86: enable 5-level paging support
  x86/mm: allow to have userspace mappings above 47-bits

 Documentation/x86/x86_64/mm.txt |  33 ++-
 arch/x86/Kconfig|   6 +
 arch/x86/boot/compressed/head_64.S  |  23 +-
 arch/x86/boot/cpucheck.c|   9 +
 arch/x86/boot/cpuflags.c|  12 +-
 arch/x86/entry/entry_64.S   |   7 +-
 arch/x86/include/asm/disabled-features.h|   8 +-
 arch/x86/include/asm/elf.h  |   2 +-
 arch/x86/include/asm/kasan.h|   9 +-
 arch/x86/include/asm/kexec.h|   1 +
 arch/x86/include/asm/mpx.h  |   9 +
 arch/x86/include/asm/page_64_types.h|  10 +
 arch/x86/include/asm/paravirt.h

[PATCH 16/26] x86/paravirt: make paravirt code support 5-level paging

2017-03-12 Thread Kirill A. Shutemov
Add operations to allocate/release p4ds. Xen requires more work.
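
The __p4d()/p4d_val() wrappers added below follow the usual paravirt
pattern of splitting a 64-bit value into two register-sized arguments
when sizeof(long) is 4. In plain C the round trip looks like this (a
sketch of the calling convention only; callee() is hypothetical):

#include <stdio.h>
#include <stdint.h>

/* Callee receives the value as (low, high) 32-bit halves. */
static uint64_t callee(uint32_t lo, uint32_t hi)
{
        return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
        uint64_t val = 0x123456789abcdef0ULL;
        uint64_t ret = callee((uint32_t)val, (uint32_t)(val >> 32));

        printf("round trip ok: %d\n", ret == val);
        return 0;
}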

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/include/asm/paravirt.h   | 44 +++
 arch/x86/include/asm/paravirt_types.h |  7 +-
 arch/x86/include/asm/pgalloc.h|  2 ++
 arch/x86/kernel/paravirt.c|  9 +--
 4 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 158d877ce9e9..677edf3b6421 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -357,6 +357,16 @@ static inline void paravirt_release_pud(unsigned long pfn)
PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
 }
 
+static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn)
+{
+   PVOP_VCALL2(pv_mmu_ops.alloc_p4d, mm, pfn);
+}
+
+static inline void paravirt_release_p4d(unsigned long pfn)
+{
+   PVOP_VCALL1(pv_mmu_ops.release_p4d, pfn);
+}
+
 static inline void pte_update(struct mm_struct *mm, unsigned long addr,
  pte_t *ptep)
 {
@@ -582,14 +592,35 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
val);
 }
 
-static inline void p4d_clear(p4d_t *p4dp)
+#if CONFIG_PGTABLE_LEVELS >= 5
+
+static inline p4d_t __p4d(p4dval_t val)
 {
-   set_p4d(p4dp, __p4d(0));
+   p4dval_t ret;
+
+   if (sizeof(p4dval_t) > sizeof(long))
+   ret = PVOP_CALLEE2(p4dval_t, pv_mmu_ops.make_p4d,
+  val, (u64)val >> 32);
+   else
+   ret = PVOP_CALLEE1(p4dval_t, pv_mmu_ops.make_p4d,
+  val);
+
+   return (p4d_t) { ret };
 }
 
-#if CONFIG_PGTABLE_LEVELS >= 5
+static inline p4dval_t p4d_val(p4d_t p4d)
+{
+   p4dval_t ret;
+
+   if (sizeof(p4dval_t) > sizeof(long))
+   ret =  PVOP_CALLEE2(p4dval_t, pv_mmu_ops.p4d_val,
+   p4d.p4d, (u64)p4d.p4d >> 32);
+   else
+   ret =  PVOP_CALLEE1(p4dval_t, pv_mmu_ops.p4d_val,
+   p4d.p4d);
 
-#error FIXME
+   return ret;
+}
 
 static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
@@ -610,6 +641,11 @@ static inline void pgd_clear(pgd_t *pgdp)
 
 #endif  /* CONFIG_PGTABLE_LEVELS == 5 */
 
+static inline void p4d_clear(p4d_t *p4dp)
+{
+   set_p4d(p4dp, __p4d(0));
+}
+
 #endif /* CONFIG_PGTABLE_LEVELS == 4 */
 
 #endif /* CONFIG_PGTABLE_LEVELS >= 3 */
diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index 93c49cf09b63..7465d6fe336f 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -238,9 +238,11 @@ struct pv_mmu_ops {
void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
+   void (*alloc_p4d)(struct mm_struct *mm, unsigned long pfn);
void (*release_pte)(unsigned long pfn);
void (*release_pmd)(unsigned long pfn);
void (*release_pud)(unsigned long pfn);
+   void (*release_p4d)(unsigned long pfn);
 
/* Pagetable manipulation functions */
void (*set_pte)(pte_t *ptep, pte_t pteval);
@@ -286,7 +288,10 @@ struct pv_mmu_ops {
void (*set_p4d)(p4d_t *p4dp, p4d_t p4dval);
 
 #if CONFIG_PGTABLE_LEVELS >= 5
-#error FIXME
+   struct paravirt_callee_save p4d_val;
+   struct paravirt_callee_save make_p4d;
+
+   void (*set_pgd)(pgd_t *pgdp, pgd_t pgdval);
 #endif /* CONFIG_PGTABLE_LEVELS >= 5 */
 
 #endif /* CONFIG_PGTABLE_LEVELS >= 4 */
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 2f585054c63c..b2d0cd8288aa 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -17,9 +17,11 @@ static inline void paravirt_alloc_pmd(struct mm_struct *mm, 
unsigned long pfn)   {
 static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long 
clonepfn,
unsigned long start, unsigned long 
count) {}
 static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) 
{}
+static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) 
{}
 static inline void paravirt_release_pte(unsigned long pfn) {}
 static inline void paravirt_release_pmd(unsigned long pfn) {}
 static inline void paravirt_release_pud(unsigned long pfn) {}
+static inline void paravirt_release_p4d(unsigned long pfn) {}
 #endif
 
 /*
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 110daf22f5c7..3586996fc50d 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -405,9 +405,11 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.alloc_pte = paravirt_nop,
.alloc_pmd = paravirt_nop,
.alloc_pud = paravirt_nop,
+   .alloc_p4d = 

[PATCH 25/26] x86: enable 5-level paging support

2017-03-12 Thread Kirill A. Shutemov
Most things are in place and we can enable support for 5-level paging.

Enabling XEN with 5-level paging requires more work. The patch makes XEN
dependent on !X86_5LEVEL.

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/Kconfig | 5 +
 arch/x86/xen/Kconfig | 1 +
 2 files changed, 6 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 747f06f00a22..43b3343402f5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -317,6 +317,7 @@ config FIX_EARLYCON_MEM
 
 config PGTABLE_LEVELS
int
+   default 5 if X86_5LEVEL
default 4 if X86_64
default 3 if X86_PAE
default 2
@@ -1381,6 +1382,10 @@ config X86_PAE
  has the cost of more pagetable lookup overhead, and also
  consumes more pagetable space per process.
 
+config X86_5LEVEL
+   bool "Enable 5-level page tables support"
+   depends on X86_64
+
 config ARCH_PHYS_ADDR_T_64BIT
def_bool y
depends on X86_64 || X86_PAE
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 76b6dbd627df..b90d481ce5a1 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -5,6 +5,7 @@
 config XEN
bool "Xen guest support"
depends on PARAVIRT
+   depends on !X86_5LEVEL
select PARAVIRT_CLOCK
select XEN_HAVE_PVMMU
select XEN_HAVE_VPMU
-- 
2.11.0



[PATCH 16/26] x86/paravirt: make paravirt code support 5-level paging

2017-03-12 Thread Kirill A. Shutemov
Add operations to allocate/release p4ds. Xen requires more work.

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/include/asm/paravirt.h   | 44 +++
 arch/x86/include/asm/paravirt_types.h |  7 +-
 arch/x86/include/asm/pgalloc.h|  2 ++
 arch/x86/kernel/paravirt.c|  9 +--
 4 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 158d877ce9e9..677edf3b6421 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -357,6 +357,16 @@ static inline void paravirt_release_pud(unsigned long pfn)
PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
 }
 
+static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn)
+{
+   PVOP_VCALL2(pv_mmu_ops.alloc_p4d, mm, pfn);
+}
+
+static inline void paravirt_release_p4d(unsigned long pfn)
+{
+   PVOP_VCALL1(pv_mmu_ops.release_p4d, pfn);
+}
+
 static inline void pte_update(struct mm_struct *mm, unsigned long addr,
  pte_t *ptep)
 {
@@ -582,14 +592,35 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
val);
 }
 
-static inline void p4d_clear(p4d_t *p4dp)
+#if CONFIG_PGTABLE_LEVELS >= 5
+
+static inline p4d_t __p4d(p4dval_t val)
 {
-   set_p4d(p4dp, __p4d(0));
+   p4dval_t ret;
+
+   if (sizeof(p4dval_t) > sizeof(long))
+   ret = PVOP_CALLEE2(p4dval_t, pv_mmu_ops.make_p4d,
+  val, (u64)val >> 32);
+   else
+   ret = PVOP_CALLEE1(p4dval_t, pv_mmu_ops.make_p4d,
+  val);
+
+   return (p4d_t) { ret };
 }
 
-#if CONFIG_PGTABLE_LEVELS >= 5
+static inline p4dval_t p4d_val(p4d_t p4d)
+{
+   p4dval_t ret;
+
+   if (sizeof(p4dval_t) > sizeof(long))
+   ret =  PVOP_CALLEE2(p4dval_t, pv_mmu_ops.p4d_val,
+   p4d.p4d, (u64)p4d.p4d >> 32);
+   else
+   ret =  PVOP_CALLEE1(p4dval_t, pv_mmu_ops.p4d_val,
+   p4d.p4d);
 
-#error FIXME
+   return ret;
+}
 
 static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
@@ -610,6 +641,11 @@ static inline void pgd_clear(pgd_t *pgdp)
 
 #endif  /* CONFIG_PGTABLE_LEVELS == 5 */
 
+static inline void p4d_clear(p4d_t *p4dp)
+{
+   set_p4d(p4dp, __p4d(0));
+}
+
 #endif /* CONFIG_PGTABLE_LEVELS == 4 */
 
 #endif /* CONFIG_PGTABLE_LEVELS >= 3 */
diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index 93c49cf09b63..7465d6fe336f 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -238,9 +238,11 @@ struct pv_mmu_ops {
void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
+   void (*alloc_p4d)(struct mm_struct *mm, unsigned long pfn);
void (*release_pte)(unsigned long pfn);
void (*release_pmd)(unsigned long pfn);
void (*release_pud)(unsigned long pfn);
+   void (*release_p4d)(unsigned long pfn);
 
/* Pagetable manipulation functions */
void (*set_pte)(pte_t *ptep, pte_t pteval);
@@ -286,7 +288,10 @@ struct pv_mmu_ops {
void (*set_p4d)(p4d_t *p4dp, p4d_t p4dval);
 
 #if CONFIG_PGTABLE_LEVELS >= 5
-#error FIXME
+   struct paravirt_callee_save p4d_val;
+   struct paravirt_callee_save make_p4d;
+
+   void (*set_pgd)(pgd_t *pgdp, pgd_t pgdval);
 #endif /* CONFIG_PGTABLE_LEVELS >= 5 */
 
 #endif /* CONFIG_PGTABLE_LEVELS >= 4 */
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 2f585054c63c..b2d0cd8288aa 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -17,9 +17,11 @@ static inline void paravirt_alloc_pmd(struct mm_struct *mm, 
unsigned long pfn)   {
 static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long 
clonepfn,
unsigned long start, unsigned long 
count) {}
 static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) 
{}
+static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) 
{}
 static inline void paravirt_release_pte(unsigned long pfn) {}
 static inline void paravirt_release_pmd(unsigned long pfn) {}
 static inline void paravirt_release_pud(unsigned long pfn) {}
+static inline void paravirt_release_p4d(unsigned long pfn) {}
 #endif
 
 /*
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 110daf22f5c7..3586996fc50d 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -405,9 +405,11 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.alloc_pte = paravirt_nop,
.alloc_pmd = paravirt_nop,
.alloc_pud = paravirt_nop,
+   .alloc_p4d = paravirt_nop,

[PATCH 25/26] x86: enable 5-level paging support

2017-03-12 Thread Kirill A. Shutemov
Most things are in place and we can enable support for 5-level paging.

Enabling XEN with 5-level paging requires more work. The patch makes XEN
dependent on !X86_5LEVEL.

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/Kconfig | 5 +
 arch/x86/xen/Kconfig | 1 +
 2 files changed, 6 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 747f06f00a22..43b3343402f5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -317,6 +317,7 @@ config FIX_EARLYCON_MEM
 
 config PGTABLE_LEVELS
int
+   default 5 if X86_5LEVEL
default 4 if X86_64
default 3 if X86_PAE
default 2
@@ -1381,6 +1382,10 @@ config X86_PAE
  has the cost of more pagetable lookup overhead, and also
  consumes more pagetable space per process.
 
+config X86_5LEVEL
+   bool "Enable 5-level page tables support"
+   depends on X86_64
+
 config ARCH_PHYS_ADDR_T_64BIT
def_bool y
depends on X86_64 || X86_PAE
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 76b6dbd627df..b90d481ce5a1 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -5,6 +5,7 @@
 config XEN
bool "Xen guest support"
depends on PARAVIRT
+   depends on !X86_5LEVEL
select PARAVIRT_CLOCK
select XEN_HAVE_PVMMU
select XEN_HAVE_VPMU
-- 
2.11.0



[PATCH 00/26] x86: 5-level paging enabling for v4.12

2017-03-12 Thread Kirill A. Shutemov
Here is v5 of the 5-level paging patchset. Please review and consider applying.

== Overview ==

x86-64 is currently limited to 256 TiB of virtual address space and 64 TiB
of physical address space. We are already bumping into this limit: some
vendors offer servers with 64 TiB of memory today.

To overcome this limitation, upcoming hardware will introduce support for
5-level paging[1]. It is a straightforward extension of the current page
table structure, adding one more layer of translation.

It bumps the limits to 128 PiB of virtual address space and 4 PiB of
physical address space. This "ought to be enough for anybody" ©.
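
With 5-level paging each table level (pgd, p4d, pud, pmd, pte) indexes
9 bits of the virtual address on top of the 12-bit page offset, so the
numbers above work out as follows (assuming 48-bit vs. 57-bit virtual
and 46-bit vs. 52-bit physical addressing):

	4-level: 4*9 + 12 = 48 bits -> 2^48 = 256 TiB virtual, 2^46 = 64 TiB physical
	5-level: 5*9 + 12 = 57 bits -> 2^57 = 128 PiB virtual, 2^52 =  4 PiB physical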

==  Patches ==

The patchset is built on top of v4.11-rc2.

Current QEMU upstream git supports 5-level paging. Use "-cpu qemu64,+la57"
to enable it.
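
For example, a minimal invocation to boot a test kernel under TCG could
look like this (the image path and kernel command line are placeholders):

	qemu-system-x86_64 -cpu qemu64,+la57 -m 2G -nographic \
		-kernel arch/x86/boot/bzImage -append "console=ttyS0"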

Patches 1-12:
Convert x86 to a properly folded p4d layer using
.

Patches 13-25:
Enabling of real 5-level paging.

CONFIG_X86_5LEVEL=y will enable new paging mode.

Patch 26:
Introduce new prctl(2) handles -- PR_SET_MAX_VADDR and PR_GET_MAX_VADDR.

This aims to address the compatibility issue. Only x86 is supported for
now, but it should be useful for other architectures.

Git:
git://git.kernel.org/pub/scm/linux/kernel/git/kas/linux.git la57/v5

== TODO ==

There is still work to do:

  - CONFIG_XEN is broken for 5-level paging.

Xen for 5-level paging requires more work to get functional.

Xen on 4-level paging works, so it's not a regression.

  - Boot-time switch between 4- and 5-level paging.

We assume that distributions will be keen to avoid returning to the
i386 days where we shipped one kernel binary for each page table
layout.

As the page table format is the same for 4- and 5-level paging, it should
be possible to have a single kernel binary and switch between them at
boot time without too much hassle.

For now I have only implemented a compile-time switch.

This will be implemented in a separate patchset.

== Changelog ==

  v5:
- Rebased to v4.11-rc2;
- Fix false-positive BUG_ON() in vmalloc_fault() with 4-level paging
  enabled;
- __xen_pgd_walk(): do not miss required flushes;
- Fix build with CONFIG_XEN_PVH=y;
  v4:
- Rebased to v4.11-rc1;
- Use mmap() hint address to allocate virtual address space above
  47 bits instead of prctl() handles.
  v3:
- Rebased to v4.10-rc5;
- prctl() handles for large address space opt-in;
- Xen works for 4-level paging;
- EFI boot fixed for both 4- and 5-level paging;
- Hibernation fixed for 4-level paging;
- kexec() fixed;
- Couple of build fixes;
  v2:
- Rebased to v4.10-rc1;
- RLIMIT_VADDR proposal;
- Fix virtual map and update documentation;
- Fix few build errors;
- Rework cpuid helpers in boot code;
- Fix espfix code to work with 5-level pages;

[1] 
https://software.intel.com/sites/default/files/managed/2b/80/5-level_paging_white_paper.pdf

Kirill A. Shutemov (26):
  x86: basic changes into headers for 5-level paging
  x86: trivial portion of 5-level paging conversion
  x86/gup: add 5-level paging support
  x86/ident_map: add 5-level paging support
  x86/mm: add support of p4d_t in vmalloc_fault()
  x86/power: support p4d_t in hibernate code
  x86/kexec: support p4d_t
  x86/efi: handle p4d in EFI pagetables
  x86/mm/pat: handle additional page table
  x86/kasan: prepare clear_pgds() to switch to

  x86/xen: convert __xen_pgd_walk() and xen_cleanmfnmap() to support p4d
  x86: convert the rest of the code to support p4d_t
  x86: detect 5-level paging support
  x86/asm: remove __VIRTUAL_MASK_SHIFT==47 assert
  x86/mm: define virtual memory map for 5-level paging
  x86/paravirt: make paravirt code support 5-level paging
  x86/mm: basic defines/helpers for CONFIG_X86_5LEVEL
  x86/dump_pagetables: support 5-level paging
  x86/kasan: extend to support 5-level paging
  x86/espfix: support 5-level paging
  x86/mm: add support of additional page table level during early boot
  x86/mm: add sync_global_pgds() for configuration with 5-level paging
  x86/mm: make kernel_physical_mapping_init() support 5-level paging
  x86/mm: add support for 5-level paging for KASLR
  x86: enable 5-level paging support
  x86/mm: allow to have userspace mappings above 47-bits

 Documentation/x86/x86_64/mm.txt |  33 ++-
 arch/x86/Kconfig|   6 +
 arch/x86/boot/compressed/head_64.S  |  23 +-
 arch/x86/boot/cpucheck.c|   9 +
 arch/x86/boot/cpuflags.c|  12 +-
 arch/x86/entry/entry_64.S   |   7 +-
 arch/x86/include/asm/disabled-features.h|   8 +-
 arch/x86/include/asm/elf.h  |   2 +-
 arch/x86/include/asm/kasan.h|   9 +-
 arch/x86/include/asm/kexec.h|   1 +
 arch/x86/include/asm/mpx.h  |   9 +
 arch/x86/include/asm/page_64_types.h|  10 +
 arch/x86/include/asm/paravirt.h

[PATCH 10/26] x86/kasan: prepare clear_pgds() to switch to

2017-03-12 Thread Kirill A. Shutemov
With a folded p4d, pgd_clear() is a nop. Change clear_pgds() to use
p4d_clear() instead.
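
To see why, recall that with a folded p4d the pgd level is a pass-through:
the generic folding header makes pgd_clear() an empty operation and lets
the p4d entry alias the pgd entry, roughly along these lines (a simplified
sketch of the folding semantics, not the verbatim header contents):

	/* folded p4d: nothing to clear at the pgd level */
	static inline void pgd_clear(pgd_t *pgd) { }

	/* the single p4d entry aliases the pgd entry itself */
	static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
	{
		return (p4d_t *)pgd;
	}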

Signed-off-by: Kirill A. Shutemov 
Cc: Dmitry Vyukov 
---
 arch/x86/mm/kasan_init_64.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 8d63d7a104c3..733f8ba6a01f 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -32,8 +32,15 @@ static int __init map_range(struct range *range)
 static void __init clear_pgds(unsigned long start,
unsigned long end)
 {
-   for (; start < end; start += PGDIR_SIZE)
-   pgd_clear(pgd_offset_k(start));
+   pgd_t *pgd;
+
+   for (; start < end; start += PGDIR_SIZE) {
+   pgd = pgd_offset_k(start);
+   if (CONFIG_PGTABLE_LEVELS < 5)
+   p4d_clear(p4d_offset(pgd, start));
+   else
+   pgd_clear(pgd);
+   }
 }
 
 static void __init kasan_map_early_shadow(pgd_t *pgd)
-- 
2.11.0



[PATCH 22/26] x86/mm: add sync_global_pgds() for configuration with 5-level paging

2017-03-12 Thread Kirill A. Shutemov
This basically restores a slightly modified version of the original
sync_global_pgds() which we had before the folded p4d level was introduced.

The only modification is protection against 'address' overflow: the loop
stops once 'address' wraps around past the end of the address space.

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/mm/init_64.c | 37 +
 1 file changed, 37 insertions(+)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 7bdda6f1d135..5ba99090dc3c 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -92,6 +92,42 @@ __setup("noexec32=", nonx32_setup);
  * When memory was added make sure all the processes MM have
  * suitable PGD entries in the local PGD level page.
  */
+#ifdef CONFIG_X86_5LEVEL
+void sync_global_pgds(unsigned long start, unsigned long end)
+{
+   unsigned long address;
+
+   for (address = start; address <= end && address >= start;
+   address += PGDIR_SIZE) {
+   const pgd_t *pgd_ref = pgd_offset_k(address);
+   struct page *page;
+
+   if (pgd_none(*pgd_ref))
+   continue;
+
+   spin_lock(&pgd_lock);
+   list_for_each_entry(page, &pgd_list, lru) {
+   pgd_t *pgd;
+   spinlock_t *pgt_lock;
+
+   pgd = (pgd_t *)page_address(page) + pgd_index(address);
+   /* the pgt_lock only for Xen */
+   pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
+   spin_lock(pgt_lock);
+
+   if (!pgd_none(*pgd_ref) && !pgd_none(*pgd))
+   BUG_ON(pgd_page_vaddr(*pgd)
+   != pgd_page_vaddr(*pgd_ref));
+
+   if (pgd_none(*pgd))
+   set_pgd(pgd, *pgd_ref);
+
+   spin_unlock(pgt_lock);
+   }
+   spin_unlock(&pgd_lock);
+   }
+}
+#else
 void sync_global_pgds(unsigned long start, unsigned long end)
 {
unsigned long address;
@@ -135,6 +171,7 @@ void sync_global_pgds(unsigned long start, unsigned long 
end)
spin_unlock(&pgd_lock);
}
 }
+#endif
 
 /*
  * NOTE: This function is marked __ref because it calls __init function
-- 
2.11.0



[PATCH 14/26] x86/asm: remove __VIRTUAL_MASK_SHIFT==47 assert

2017-03-12 Thread Kirill A. Shutemov
We don't need it anymore. Commit 17be0aec74fb ("x86/asm/entry/64: Implement
better check for canonical addresses") made the canonical address check
generic with respect to the address width.
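
As a quick sanity check on the generic form: with __VIRTUAL_MASK_SHIFT == 47
the shl/sar pair below shifts by 64 - (47 + 1) = 16 bits, reproducing the
old "top 16 bits" behaviour, and with __VIRTUAL_MASK_SHIFT == 56 (5-level
paging) it shifts by 64 - (56 + 1) = 7 bits, so the same two instructions
stay correct for either address width and the assert is no longer needed.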

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/entry/entry_64.S | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 044d18ebc43c..f07b4efb34d5 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -265,12 +265,9 @@ return_from_SYSCALL_64:
 *
 * If width of "canonical tail" ever becomes variable, this will need
 * to be updated to remain correct on both old and new CPUs.
+*
+* Change top 16 bits to be the sign-extension of 47th bit
 */
-   .ifne __VIRTUAL_MASK_SHIFT - 47
-   .error "virtual address width changed -- SYSRET checks need update"
-   .endif
-
-   /* Change top 16 bits to be the sign-extension of 47th bit */
shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
 
-- 
2.11.0



[PATCH 09/26] x86/mm/pat: handle additional page table

2017-03-12 Thread Kirill A. Shutemov
Straightforward extension of the existing code to support an additional
page table level.

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/mm/pageattr.c | 56 --
 1 file changed, 41 insertions(+), 15 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 28d42130243c..eb0ad12cdfde 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -346,6 +346,7 @@ static inline pgprot_t static_protections(pgprot_t prot, 
unsigned long address,
 pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
 unsigned int *level)
 {
+   p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
 
@@ -354,7 +355,15 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long 
address,
if (pgd_none(*pgd))
return NULL;
 
-   pud = pud_offset(pgd, address);
+   p4d = p4d_offset(pgd, address);
+   if (p4d_none(*p4d))
+   return NULL;
+
+   *level = PG_LEVEL_512G;
+   if (p4d_large(*p4d) || !p4d_present(*p4d))
+   return (pte_t *)p4d;
+
+   pud = pud_offset(p4d, address);
if (pud_none(*pud))
return NULL;
 
@@ -406,13 +415,18 @@ static pte_t *_lookup_address_cpa(struct cpa_data *cpa, 
unsigned long address,
 pmd_t *lookup_pmd_address(unsigned long address)
 {
pgd_t *pgd;
+   p4d_t *p4d;
pud_t *pud;
 
pgd = pgd_offset_k(address);
if (pgd_none(*pgd))
return NULL;
 
-   pud = pud_offset(pgd, address);
+   p4d = p4d_offset(pgd, address);
+   if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d))
+   return NULL;
+
+   pud = pud_offset(p4d, address);
if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
return NULL;
 
@@ -477,11 +491,13 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long 
address, pte_t pte)
 
list_for_each_entry(page, &pgd_list, lru) {
pgd_t *pgd;
+   p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
 
pgd = (pgd_t *)page_address(page) + pgd_index(address);
-   pud = pud_offset(pgd, address);
+   p4d = p4d_offset(pgd, address);
+   pud = pud_offset(p4d, address);
pmd = pmd_offset(pud, address);
set_pte_atomic((pte_t *)pmd, pte);
}
@@ -836,9 +852,9 @@ static void unmap_pmd_range(pud_t *pud, unsigned long 
start, unsigned long end)
pud_clear(pud);
 }
 
-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
 {
-   pud_t *pud = pud_offset(pgd, start);
+   pud_t *pud = pud_offset(p4d, start);
 
/*
 * Not on a GB page boundary?
@@ -1004,8 +1020,8 @@ static long populate_pmd(struct cpa_data *cpa,
return num_pages;
 }
 
-static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
-pgprot_t pgprot)
+static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d,
+   pgprot_t pgprot)
 {
pud_t *pud;
unsigned long end;
@@ -1026,7 +1042,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned 
long start, pgd_t *pgd,
cur_pages = (pre_end - start) >> PAGE_SHIFT;
cur_pages = min_t(int, (int)cpa->numpages, cur_pages);
 
-   pud = pud_offset(pgd, start);
+   pud = pud_offset(p4d, start);
 
/*
 * Need a PMD page?
@@ -1047,7 +1063,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned 
long start, pgd_t *pgd,
if (cpa->numpages == cur_pages)
return cur_pages;
 
-   pud = pud_offset(pgd, start);
+   pud = pud_offset(p4d, start);
pud_pgprot = pgprot_4k_2_large(pgprot);
 
/*
@@ -1067,7 +1083,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned 
long start, pgd_t *pgd,
if (start < end) {
long tmp;
 
-   pud = pud_offset(pgd, start);
+   pud = pud_offset(p4d, start);
if (pud_none(*pud))
if (alloc_pmd_page(pud))
return -1;
@@ -1090,33 +1106,43 @@ static int populate_pgd(struct cpa_data *cpa, unsigned 
long addr)
 {
pgprot_t pgprot = __pgprot(_KERNPG_TABLE);
pud_t *pud = NULL;  /* shut up gcc */
+   p4d_t *p4d;
pgd_t *pgd_entry;
long ret;
 
pgd_entry = cpa->pgd + pgd_index(addr);
 
+   if (pgd_none(*pgd_entry)) {
+   p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+   if (!p4d)
+   return -1;
+
+   set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE));
+ 

[PATCH 04/26] x86/ident_map: add 5-level paging support

2017-03-12 Thread Kirill A. Shutemov
Nothing special: just handle one more level.

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/mm/ident_map.c | 47 ---
 1 file changed, 40 insertions(+), 7 deletions(-)

diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
index 4473cb4f8b90..2c9a62282fb1 100644
--- a/arch/x86/mm/ident_map.c
+++ b/arch/x86/mm/ident_map.c
@@ -45,6 +45,34 @@ static int ident_pud_init(struct x86_mapping_info *info, 
pud_t *pud_page,
return 0;
 }
 
+static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page,
+ unsigned long addr, unsigned long end)
+{
+   unsigned long next;
+
+   for (; addr < end; addr = next) {
+   p4d_t *p4d = p4d_page + p4d_index(addr);
+   pud_t *pud;
+
+   next = (addr & P4D_MASK) + P4D_SIZE;
+   if (next > end)
+   next = end;
+
+   if (p4d_present(*p4d)) {
+   pud = pud_offset(p4d, 0);
+   ident_pud_init(info, pud, addr, next);
+   continue;
+   }
+   pud = (pud_t *)info->alloc_pgt_page(info->context);
+   if (!pud)
+   return -ENOMEM;
+   ident_pud_init(info, pud, addr, next);
+   set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
+   }
+
+   return 0;
+}
+
 int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
  unsigned long pstart, unsigned long pend)
 {
@@ -55,27 +83,32 @@ int kernel_ident_mapping_init(struct x86_mapping_info 
*info, pgd_t *pgd_page,
 
for (; addr < end; addr = next) {
pgd_t *pgd = pgd_page + pgd_index(addr);
-   pud_t *pud;
+   p4d_t *p4d;
 
next = (addr & PGDIR_MASK) + PGDIR_SIZE;
if (next > end)
next = end;
 
if (pgd_present(*pgd)) {
-   pud = pud_offset(pgd, 0);
-   result = ident_pud_init(info, pud, addr, next);
+   p4d = p4d_offset(pgd, 0);
+   result = ident_p4d_init(info, p4d, addr, next);
if (result)
return result;
continue;
}
 
-   pud = (pud_t *)info->alloc_pgt_page(info->context);
-   if (!pud)
+   p4d = (p4d_t *)info->alloc_pgt_page(info->context);
+   if (!p4d)
return -ENOMEM;
-   result = ident_pud_init(info, pud, addr, next);
+   result = ident_p4d_init(info, p4d, addr, next);
if (result)
return result;
-   set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+   if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+   set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+   } else {
+   pud_t *pud = pud_offset(p4d, 0);
+   set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
+   }
}
 
return 0;
-- 
2.11.0



[PATCH 08/26] x86/efi: handle p4d in EFI pagetables

2017-03-12 Thread Kirill A. Shutemov
Allocate an additional page table level and change efi_sync_low_kernel_mappings()
to make the syncing logic work with the additional page table level.

Signed-off-by: Kirill A. Shutemov 
Reviewed-by: Matt Fleming 
---
 arch/x86/platform/efi/efi_64.c | 33 +++--
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 8544dae3d1b4..34d019f75239 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -135,6 +135,7 @@ static pgd_t *efi_pgd;
 int __init efi_alloc_page_tables(void)
 {
pgd_t *pgd;
+   p4d_t *p4d;
pud_t *pud;
gfp_t gfp_mask;
 
@@ -147,15 +148,20 @@ int __init efi_alloc_page_tables(void)
return -ENOMEM;
 
pgd = efi_pgd + pgd_index(EFI_VA_END);
+   p4d = p4d_alloc(&init_mm, pgd, EFI_VA_END);
+   if (!p4d) {
+   free_page((unsigned long)efi_pgd);
+   return -ENOMEM;
+   }
 
-   pud = pud_alloc_one(NULL, 0);
+   pud = pud_alloc(&init_mm, p4d, EFI_VA_END);
if (!pud) {
+   if (CONFIG_PGTABLE_LEVELS > 4)
+   free_page((unsigned long) pgd_page_vaddr(*pgd));
free_page((unsigned long)efi_pgd);
return -ENOMEM;
}
 
-   pgd_populate(NULL, pgd, pud);
-
return 0;
 }
 
@@ -190,6 +196,18 @@ void efi_sync_low_kernel_mappings(void)
num_entries = pgd_index(EFI_VA_END) - pgd_index(PAGE_OFFSET);
memcpy(pgd_efi, pgd_k, sizeof(pgd_t) * num_entries);
 
+   /* The same story as with PGD entries */
+   BUILD_BUG_ON(p4d_index(EFI_VA_END) != p4d_index(MODULES_END));
+   BUILD_BUG_ON((EFI_VA_START & P4D_MASK) != (EFI_VA_END & P4D_MASK));
+
+   pgd_efi = efi_pgd + pgd_index(EFI_VA_END);
+   pgd_k = pgd_offset_k(EFI_VA_END);
+   p4d_efi = p4d_offset(pgd_efi, 0);
+   p4d_k = p4d_offset(pgd_k, 0);
+
+   num_entries = p4d_index(EFI_VA_END);
+   memcpy(p4d_efi, p4d_k, sizeof(p4d_t) * num_entries);
+
/*
 * We share all the PUD entries apart from those that map the
 * EFI regions. Copy around them.
@@ -197,20 +215,15 @@ void efi_sync_low_kernel_mappings(void)
BUILD_BUG_ON((EFI_VA_START & ~PUD_MASK) != 0);
BUILD_BUG_ON((EFI_VA_END & ~PUD_MASK) != 0);
 
-   pgd_efi = efi_pgd + pgd_index(EFI_VA_END);
-   p4d_efi = p4d_offset(pgd_efi, 0);
+   p4d_efi = p4d_offset(pgd_efi, EFI_VA_END);
+   p4d_k = p4d_offset(pgd_k, EFI_VA_END);
pud_efi = pud_offset(p4d_efi, 0);
-
-   pgd_k = pgd_offset_k(EFI_VA_END);
-   p4d_k = p4d_offset(pgd_k, 0);
pud_k = pud_offset(p4d_k, 0);
 
num_entries = pud_index(EFI_VA_END);
memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries);
 
-   p4d_efi = p4d_offset(pgd_efi, EFI_VA_START);
pud_efi = pud_offset(p4d_efi, EFI_VA_START);
-   p4d_k = p4d_offset(pgd_k, EFI_VA_START);
pud_k = pud_offset(p4d_k, EFI_VA_START);
 
num_entries = PTRS_PER_PUD - pud_index(EFI_VA_START);
-- 
2.11.0



[PATCH 02/26] x86: trivial portion of 5-level paging conversion

2017-03-12 Thread Kirill A. Shutemov
This patch covers simple cases only.

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/kernel/tboot.c|  6 +-
 arch/x86/kernel/vm86_32.c  |  6 +-
 arch/x86/mm/fault.c| 39 +--
 arch/x86/mm/init_32.c  | 22 --
 arch/x86/mm/ioremap.c  |  3 ++-
 arch/x86/mm/pgtable.c  |  4 +++-
 arch/x86/mm/pgtable_32.c   |  8 +++-
 arch/x86/platform/efi/efi_64.c | 13 +
 arch/x86/power/hibernate_32.c  |  7 +--
 9 files changed, 85 insertions(+), 23 deletions(-)

diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index b868fa1b812b..5db0f33cbf2c 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -118,12 +118,16 @@ static int map_tboot_page(unsigned long vaddr, unsigned 
long pfn,
  pgprot_t prot)
 {
pgd_t *pgd;
+   p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
 
pgd = pgd_offset(&init_mm, vaddr);
-   pud = pud_alloc(&init_mm, pgd, vaddr);
+   p4d = p4d_alloc(&init_mm, pgd, vaddr);
+   if (!p4d)
+   return -1;
+   pud = pud_alloc(&init_mm, p4d, vaddr);
if (!pud)
return -1;
pmd = pmd_alloc(&init_mm, pud, vaddr);
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 23ee89ce59a9..62597c300d94 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -164,6 +164,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
struct vm_area_struct *vma;
spinlock_t *ptl;
pgd_t *pgd;
+   p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -173,7 +174,10 @@ static void mark_screen_rdonly(struct mm_struct *mm)
pgd = pgd_offset(mm, 0xA);
if (pgd_none_or_clear_bad(pgd))
goto out;
-   pud = pud_offset(pgd, 0xA);
+   p4d = p4d_offset(pgd, 0xA);
+   if (p4d_none_or_clear_bad(p4d))
+   goto out;
+   pud = pud_offset(p4d, 0xA);
if (pud_none_or_clear_bad(pud))
goto out;
pmd = pmd_offset(pud, 0xA);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 428e31763cb9..605fd5e8e048 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -253,6 +253,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned 
long address)
 {
unsigned index = pgd_index(address);
pgd_t *pgd_k;
+   p4d_t *p4d, *p4d_k;
pud_t *pud, *pud_k;
pmd_t *pmd, *pmd_k;
 
@@ -265,10 +266,15 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, 
unsigned long address)
/*
 * set_pgd(pgd, *pgd_k); here would be useless on PAE
 * and redundant with the set_pmd() on non-PAE. As would
-* set_pud.
+* set_p4d/set_pud.
 */
-   pud = pud_offset(pgd, address);
-   pud_k = pud_offset(pgd_k, address);
+   p4d = p4d_offset(pgd, address);
+   p4d_k = p4d_offset(pgd_k, address);
+   if (!p4d_present(*p4d_k))
+   return NULL;
+
+   pud = pud_offset(p4d, address);
+   pud_k = pud_offset(p4d_k, address);
if (!pud_present(*pud_k))
return NULL;
 
@@ -384,6 +390,8 @@ static void dump_pagetable(unsigned long address)
 {
pgd_t *base = __va(read_cr3());
pgd_t *pgd = &base[pgd_index(address)];
+   p4d_t *p4d;
+   pud_t *pud;
pmd_t *pmd;
pte_t *pte;
 
@@ -392,7 +400,9 @@ static void dump_pagetable(unsigned long address)
if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
goto out;
 #endif
-   pmd = pmd_offset(pud_offset(pgd, address), address);
+   p4d = p4d_offset(pgd, address);
+   pud = pud_offset(p4d, address);
+   pmd = pmd_offset(pud, address);
printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
 
/*
@@ -526,6 +536,7 @@ static void dump_pagetable(unsigned long address)
 {
pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
pgd_t *pgd = base + pgd_index(address);
+   p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -538,7 +549,15 @@ static void dump_pagetable(unsigned long address)
if (!pgd_present(*pgd))
goto out;
 
-   pud = pud_offset(pgd, address);
+   p4d = p4d_offset(pgd, address);
+   if (bad_address(p4d))
+   goto bad;
+
+   printk("P4D %lx ", p4d_val(*p4d));
+   if (!p4d_present(*p4d) || p4d_large(*p4d))
+   goto out;
+
+   pud = pud_offset(p4d, address);
if (bad_address(pud))
goto bad;
 
@@ -1082,6 +1101,7 @@ static noinline int
 spurious_fault(unsigned long error_code, unsigned long address)
 {
pgd_t *pgd;
+   p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
@@ -1104,7 +1124,14 @@ spurious_fault(unsigned long error_code, unsigned long 
address)
if (!pgd_present(*pgd))
 

[PATCH 01/26] x86: basic changes into headers for 5-level paging

2017-03-12 Thread Kirill A. Shutemov
This patch extends x86 headers to enable 5-level paging support.

It's still based on . We will get to the
point where we can have  later.

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/include/asm/pgtable-2level_types.h |  1 +
 arch/x86/include/asm/pgtable-3level_types.h |  1 +
 arch/x86/include/asm/pgtable.h  | 26 -
 arch/x86/include/asm/pgtable_64_types.h |  1 +
 arch/x86/include/asm/pgtable_types.h| 30 -
 5 files changed, 53 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/pgtable-2level_types.h 
b/arch/x86/include/asm/pgtable-2level_types.h
index 392576433e77..373ab1de909f 100644
--- a/arch/x86/include/asm/pgtable-2level_types.h
+++ b/arch/x86/include/asm/pgtable-2level_types.h
@@ -7,6 +7,7 @@
 typedef unsigned long  pteval_t;
 typedef unsigned long  pmdval_t;
 typedef unsigned long  pudval_t;
+typedef unsigned long  p4dval_t;
 typedef unsigned long  pgdval_t;
 typedef unsigned long  pgprotval_t;
 
diff --git a/arch/x86/include/asm/pgtable-3level_types.h 
b/arch/x86/include/asm/pgtable-3level_types.h
index bcc89625ebe5..b8a4341faafa 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -7,6 +7,7 @@
 typedef u64pteval_t;
 typedef u64pmdval_t;
 typedef u64pudval_t;
+typedef u64p4dval_t;
 typedef u64pgdval_t;
 typedef u64pgprotval_t;
 
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 1cfb36b8c024..6f6f351e0a81 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -179,6 +179,17 @@ static inline unsigned long pud_pfn(pud_t pud)
return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT;
 }
 
+static inline unsigned long p4d_pfn(p4d_t p4d)
+{
+   return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT;
+}
+
+static inline int p4d_large(p4d_t p4d)
+{
+   /* No 512 GiB pages yet */
+   return 0;
+}
+
 #define pte_page(pte)  pfn_to_page(pte_pfn(pte))
 
 static inline int pmd_large(pmd_t pte)
@@ -770,6 +781,16 @@ static inline int pud_large(pud_t pud)
 }
 #endif /* CONFIG_PGTABLE_LEVELS > 2 */
 
+static inline unsigned long pud_index(unsigned long address)
+{
+   return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
+}
+
+static inline unsigned long p4d_index(unsigned long address)
+{
+   return (address >> P4D_SHIFT) & (PTRS_PER_P4D - 1);
+}
+
 #if CONFIG_PGTABLE_LEVELS > 3
 static inline int pgd_present(pgd_t pgd)
 {
@@ -788,11 +809,6 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd)
 #define pgd_page(pgd)  pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT)
 
 /* to find an entry in a page-table-directory. */
-static inline unsigned long pud_index(unsigned long address)
-{
-   return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
-}
-
 static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
 {
return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address);
diff --git a/arch/x86/include/asm/pgtable_64_types.h 
b/arch/x86/include/asm/pgtable_64_types.h
index 3a264200c62f..0b2797e5083c 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -13,6 +13,7 @@
 typedef unsigned long  pteval_t;
 typedef unsigned long  pmdval_t;
 typedef unsigned long  pudval_t;
+typedef unsigned long  p4dval_t;
 typedef unsigned long  pgdval_t;
 typedef unsigned long  pgprotval_t;
 
diff --git a/arch/x86/include/asm/pgtable_types.h 
b/arch/x86/include/asm/pgtable_types.h
index 62484333673d..df08535f774a 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -272,9 +272,20 @@ static inline pgdval_t pgd_flags(pgd_t pgd)
return native_pgd_val(pgd) & PTE_FLAGS_MASK;
 }
 
-#if CONFIG_PGTABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 4
+
+#error FIXME
+
+#else
 #include 
 
+static inline p4dval_t native_p4d_val(p4d_t p4d)
+{
+   return native_pgd_val(p4d);
+}
+#endif
+
+#if CONFIG_PGTABLE_LEVELS > 3
 typedef struct { pudval_t pud; } pud_t;
 
 static inline pud_t native_make_pud(pmdval_t val)
@@ -318,6 +329,22 @@ static inline pmdval_t native_pmd_val(pmd_t pmd)
 }
 #endif
 
+static inline p4dval_t p4d_pfn_mask(p4d_t p4d)
+{
+   /* No 512 GiB huge pages yet */
+   return PTE_PFN_MASK;
+}
+
+static inline p4dval_t p4d_flags_mask(p4d_t p4d)
+{
+   return ~p4d_pfn_mask(p4d);
+}
+
+static inline p4dval_t p4d_flags(p4d_t p4d)
+{
+   return native_p4d_val(p4d) & p4d_flags_mask(p4d);
+}
+
 static inline pudval_t pud_pfn_mask(pud_t pud)
 {
if (native_pud_val(pud) & _PAGE_PSE)
@@ -461,6 +488,7 @@ enum pg_level {
PG_LEVEL_4K,
PG_LEVEL_2M,
PG_LEVEL_1G,
+   PG_LEVEL_512G,
PG_LEVEL_NUM
 };
 
-- 
2.11.0



[PATCH 03/26] x86/gup: add 5-level paging support

2017-03-12 Thread Kirill A. Shutemov
It's simply an extension for one more page table level.

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/mm/gup.c | 33 +++--
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 1f3b6ef105cd..456dfdfd2249 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -76,9 +76,9 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct 
page **pages)
 }
 
 /*
- * 'pteval' can come from a pte, pmd or pud.  We only check
+ * 'pteval' can come from a pte, pmd, pud or p4d.  We only check
  * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
- * same value on all 3 types.
+ * same value on all 4 types.
  */
 static inline int pte_allows_gup(unsigned long pteval, int write)
 {
@@ -295,13 +295,13 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long 
addr,
return 1;
 }
 
-static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
+static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
 {
unsigned long next;
pud_t *pudp;
 
-   pudp = pud_offset(&pgd, addr);
+   pudp = pud_offset(&p4d, addr);
do {
pud_t pud = *pudp;
 
@@ -320,6 +320,27 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, 
unsigned long end,
return 1;
 }
 
+static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
+   int write, struct page **pages, int *nr)
+{
+   unsigned long next;
+   p4d_t *p4dp;
+
+   p4dp = p4d_offset(&pgd, addr);
+   do {
+   p4d_t p4d = *p4dp;
+
+   next = p4d_addr_end(addr, end);
+   if (p4d_none(p4d))
+   return 0;
+   BUILD_BUG_ON(p4d_large(p4d));
+   if (!gup_pud_range(p4d, addr, next, write, pages, nr))
+   return 0;
+   } while (p4dp++, addr = next, addr != end);
+
+   return 1;
+}
+
 /*
  * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
  * back to the regular GUP.
@@ -368,7 +389,7 @@ int __get_user_pages_fast(unsigned long start, int 
nr_pages, int write,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
break;
-   if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+   if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
break;
} while (pgdp++, addr = next, addr != end);
local_irq_restore(flags);
@@ -440,7 +461,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, 
int write,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
goto slow;
-   if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+   if (!gup_p4d_range(pgd, addr, next, write, pages, &nr))
goto slow;
} while (pgdp++, addr = next, addr != end);
local_irq_enable();
-- 
2.11.0



[PATCH 13/26] x86: detect 5-level paging support

2017-03-12 Thread Kirill A. Shutemov
5-level paging support is required from the hardware when the kernel is
compiled with CONFIG_X86_5LEVEL=y. We may implement runtime switching later.
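
For reference, a minimal user-space sketch for checking the bit this patch
relies on (CPUID leaf 7, subleaf 0, ECX bit 16, LA57) might look like this;
it uses the compiler-provided <cpuid.h> helper and is only illustrative:

	#include <stdio.h>
	#include <cpuid.h>

	int main(void)
	{
		unsigned int eax, ebx, ecx, edx;

		/* CPUID.(EAX=07H, ECX=0):ECX[16] is the LA57 feature bit */
		if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) &&
		    (ecx & (1u << 16)))
			printf("la57: supported\n");
		else
			printf("la57: not supported\n");

		return 0;
	}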

Signed-off-by: Kirill A. Shutemov 
---
 arch/x86/boot/cpucheck.c |  9 +
 arch/x86/boot/cpuflags.c | 12 ++--
 arch/x86/include/asm/disabled-features.h |  8 +++-
 arch/x86/include/asm/required-features.h |  8 +++-
 4 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index 4ad7d70e8739..8f0c4c9fc904 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -44,6 +44,15 @@ static const u32 req_flags[NCAPINTS] =
0, /* REQUIRED_MASK5 not implemented in this file */
REQUIRED_MASK6,
0, /* REQUIRED_MASK7 not implemented in this file */
+   0, /* REQUIRED_MASK8 not implemented in this file */
+   0, /* REQUIRED_MASK9 not implemented in this file */
+   0, /* REQUIRED_MASK10 not implemented in this file */
+   0, /* REQUIRED_MASK11 not implemented in this file */
+   0, /* REQUIRED_MASK12 not implemented in this file */
+   0, /* REQUIRED_MASK13 not implemented in this file */
+   0, /* REQUIRED_MASK14 not implemented in this file */
+   0, /* REQUIRED_MASK15 not implemented in this file */
+   REQUIRED_MASK16,
 };
 
 #define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a))
diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c
index 6687ab953257..9e77c23c2422 100644
--- a/arch/x86/boot/cpuflags.c
+++ b/arch/x86/boot/cpuflags.c
@@ -70,16 +70,19 @@ int has_eflag(unsigned long mask)
 # define EBX_REG "=b"
 #endif
 
-static inline void cpuid(u32 id, u32 *a, u32 *b, u32 *c, u32 *d)
+static inline void cpuid_count(u32 id, u32 count,
+   u32 *a, u32 *b, u32 *c, u32 *d)
 {
asm volatile(".ifnc %%ebx,%3 ; movl  %%ebx,%3 ; .endif  \n\t"
 "cpuid \n\t"
 ".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif  \n\t"
: "=a" (*a), "=c" (*c), "=d" (*d), EBX_REG (*b)
-   : "a" (id)
+   : "a" (id), "c" (count)
);
 }
 
+#define cpuid(id, a, b, c, d) cpuid_count(id, 0, a, b, c, d)
+
 void get_cpuflags(void)
 {
u32 max_intel_level, max_amd_level;
@@ -108,6 +111,11 @@ void get_cpuflags(void)
cpu.model += ((tfms >> 16) & 0xf) << 4;
}
 
+   if (max_intel_level >= 0x00000007) {
+   cpuid_count(0x00000007, 0, &ignored, &ignored,
+   &cpu.flags[16], &ignored);
+   }
 
    cpuid(0x80000000, &max_amd_level, &ignored, &ignored,
      &ignored);
 
diff --git a/arch/x86/include/asm/disabled-features.h 
b/arch/x86/include/asm/disabled-features.h
index 85599ad4d024..fc0960236fc3 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -36,6 +36,12 @@
 # define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31))
 #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
 
+#ifdef CONFIG_X86_5LEVEL
+#define DISABLE_LA57   0
+#else
+#define DISABLE_LA57   (1<<(X86_FEATURE_LA57 & 31))
+#endif
+
 /*
  * Make sure to add features to the correct mask
  */
@@ -55,7 +61,7 @@
 #define DISABLED_MASK13 0
 #define DISABLED_MASK14 0
 #define DISABLED_MASK15 0
-#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE)
+#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57)
 #define DISABLED_MASK17 0
 #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
 
diff --git a/arch/x86/include/asm/required-features.h 
b/arch/x86/include/asm/required-features.h
index fac9a5c0abe9..d91ba04dd007 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -53,6 +53,12 @@
 # define NEED_MOVBE0
 #endif
 
+#ifdef CONFIG_X86_5LEVEL
+# define NEED_LA57 (1<<(X86_FEATURE_LA57 & 31))
+#else
+# define NEED_LA57 0
+#endif
+
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_PARAVIRT
 /* Paravirtualized systems may not have PSE or PGE available */
@@ -98,7 +104,7 @@
 #define REQUIRED_MASK13 0
 #define REQUIRED_MASK14 0
 #define REQUIRED_MASK15 0
-#define REQUIRED_MASK16 0
+#define REQUIRED_MASK16 (NEED_LA57)
 #define REQUIRED_MASK17 0
 #define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
 
-- 
2.11.0
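
For reference, the LA57 bit that REQUIRED_MASK16/DISABLED_MASK16 gate on above
is reported in CPUID leaf 7 (subleaf 0), ECX bit 16 -- the same word the boot
code stores into cpu.flags[16]. A minimal user-space sketch of that check
(hypothetical, not part of the patch; assumes a toolchain that provides
__get_cpuid_count() in <cpuid.h>):

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* CPUID.(EAX=7, ECX=0):ECX bit 16 is LA57 (5-level paging). */
	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
		return 1;

	printf("LA57 (5-level paging): %s\n",
	       (ecx & (1u << 16)) ? "supported" : "not supported");
	return 0;
}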



Re: [RESEND PATCH v3 4/7] PCI: dwc: all: Modify dbi accessors to take dbi_base as argument

2017-03-12 Thread Kishon Vijay Abraham I
Hi Niklas,

On Friday 10 March 2017 09:17 PM, Niklas Cassel wrote:
> 
> 
> On 03/10/2017 01:56 PM, Kishon Vijay Abraham I wrote:
>> Hi Niklas,
>>
>> On Friday 10 March 2017 06:01 PM, Niklas Cassel wrote:
>>> On 03/10/2017 12:36 PM, Kishon Vijay Abraham I wrote:
 Hi,

 On Thursday 09 March 2017 08:35 PM, Niklas Cassel wrote:
> On 03/09/2017 03:48 PM, Niklas Cassel wrote:
>> On 03/09/2017 07:39 AM, Kishon Vijay Abraham I wrote:
>>> dwc has 2 dbi address space labeled dbics and dbics2. The existing
>>> helper to access dbi address space can access only dbics. However
>>> dbics2 has to be accessed for programming the BAR registers in the
>>> case of EP mode. This is in preparation for adding EP mode support
>>> to dwc driver.
>> Hello Kishon
>>
>> I don't really like the idea of adding an extra argument to every 
>> existing read/write.
>> Will not a read/write using dbi2 be quite uncommon compared to a 
>> read/write
>> using dbi?
>>
>> How about something like this:
>>
>> void __dw_pcie_writel(struct dw_pcie *pci, void __iomem *base, u32 reg, 
>> u32 val)
>> {
>> if (pci->ops->writel_dbi)
>> pci->ops->writel_dbi(pci, base, reg, val);
>> else
>> writel(val, base + reg);
>> }
>>
>> #define dw_pcie_writel_dbi(pci, reg, val) __dw_pcie_writel(pci, 
>> pci->dbi_base, reg, val)
>> #define dw_pcie_writel_dbi2(pci, reg, val) __dw_pcie_writel(pci, 
>> pci->dbi_base2, reg, val)
> Perhaps make dw_pcie_writel_dbi2 a function rather than a define,
> so we can return an error if pci->dbi_base2 == NULL.
 Should we return an error? We don't return error for dbi_base either. I 
 think
 it should be sufficient to return errors while populating dbi_base or
 dbi_base2. Otherwise it's a bug and should result in abort. Joao?
>>> Sorry for previous empty email.
>>>
>>>
>>> What I meant to write:
>>>
>>> Right now we do error checking for dbi_base in platform specific code
>>> and in pcie-designware-host.c:dw_pcie_host_init.
>> it's been done in dw_pcie_host_init not as an error checking but since it's
>> *optional* for certain platforms to populate dbi_base (i.e where dbi_base is
>> mapped to configuration space), host_init takes care of assigning dbi_base to
>> configuration space address.
> 
> What I'm afraid of is that we might get a NULL ptr dereference
> when using dw_pcie_writel_dbi2, if platform specific code has
> not populated dbi_base2.
> 
> Having a NULL check in generic code is just a fail safe if some
> platform specific code failed to NULL check.
> 
> The code in dw_pcie_host_init might have been written just
> to populate dbi_base when dbi is mapped to config space,
> but the end result is that if platform specific code did not
> populate dbi_base (and did not populate pp->cfg),
> we will return -ENOMEM.
> Which means that we can never get a NULL ptr dereference
> when using dw_pcie_writel_dbi.
> 
> It might be a good idea to have a NULL check in generic code,
> just as a fail safe, also for dw_pcie_ep_init.
> That way we know that we will not get a NULL ptr dereference
> when using dw_pcie_writel_dbi2.

All right, will add it then.

Thanks
Kishon
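
For the record, a minimal sketch of the fail-safe accessor agreed on above
(hypothetical shape, reusing the __dw_pcie_writel() helper and the dbi_base2
field from the snippet quoted earlier in the thread):

static void dw_pcie_writel_dbi2(struct dw_pcie *pci, u32 reg, u32 val)
{
	/* dbi_base2 is optional; platform code may not have populated it. */
	if (WARN_ON(!pci->dbi_base2))
		return;

	__dw_pcie_writel(pci, pci->dbi_base2, reg, val);
}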


Re: [Outreachy kernel] [PATCH] staging: iio: ade7753: replace mlock with driver private lock

2017-03-12 Thread Alison Schofield
On Mon, Mar 13, 2017 at 09:28:34AM +0530, SIMRAN SINGHAL wrote:
> On Mon, Mar 13, 2017 at 12:03 AM, Alison Schofield  
> wrote:
> > On Sun, Mar 12, 2017 at 07:02:50PM +0530, simran singhal wrote:
> >> The IIO subsystem is redefining iio_dev->mlock to be used by
> >> the IIO core only for protecting device operating mode changes.
> >> ie. Changes between INDIO_DIRECT_MODE, INDIO_BUFFER_* modes.
> >>
> >> In this driver, mlock was being used to protect hardware state
> >> changes.  Replace it with a lock in the devices global data.
> >>
> >> Fix some coding style issues related to white space also.
> >>
> >> Signed-off-by: simran singhal 
> >
> > Hi Simran, This looks good to me.  Let's see what the
> > reviewers say.  I think the white space stuff is ok,
> > since it was right where you were editing.
> > alisons
> >
> Alison, so sending this patch here on outreachy mailing list is fine.
> Still confuse :P

You are OK.  You sent it to everyone suggested in the Task Description.

This patch was sent before I posted the Task Description.  I'm assuming
that since then you've found the posted Task:
https://kernelnewbies.org/IIO_tasks 
Find Coding Task 2 --> "PATCHES need to be sent to all of:"

> 
> >> ---
> >>  drivers/staging/iio/meter/ade7753.c | 14 --
> >>  1 file changed, 8 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/drivers/staging/iio/meter/ade7753.c 
> >> b/drivers/staging/iio/meter/ade7753.c
> >> index dfd8b71..ca99d82 100644
> >> --- a/drivers/staging/iio/meter/ade7753.c
> >> +++ b/drivers/staging/iio/meter/ade7753.c
> >> @@ -81,12 +81,14 @@
> >>   * @tx: transmit buffer
> >>   * @rx: receive buffer
> >>   * @buf_lock:   mutex to protect tx and rx
> >> + * @lock:protect sensor state
> >>   **/
> >>  struct ade7753_state {
> >> - struct spi_device   *us;
> >> - struct mutexbuf_lock;
> >> - u8  tx[ADE7753_MAX_TX] 
> >> cacheline_aligned;
> >> - u8  rx[ADE7753_MAX_RX];
> >> + struct spi_device   *us;
> >> + struct mutexbuf_lock;
> >> + struct mutexlock;   /* protect sensor state */
> >> + u8  tx[ADE7753_MAX_TX] cacheline_aligned;
> >> + u8  rx[ADE7753_MAX_RX];
> >>  };
> >>
> >>  static int ade7753_spi_write_reg_8(struct device *dev,
> >> @@ -484,7 +486,7 @@ static ssize_t ade7753_write_frequency(struct device 
> >> *dev,
> >>   if (!val)
> >>   return -EINVAL;
> >>
> >> - mutex_lock(&indio_dev->mlock);
> >> + mutex_lock(&st->lock);
> >>
> >>   t = 27900 / val;
> >>   if (t > 0)
> >> @@ -505,7 +507,7 @@ static ssize_t ade7753_write_frequency(struct device 
> >> *dev,
> >>   ret = ade7753_spi_write_reg_16(dev, ADE7753_MODE, reg);
> >>
> >>  out:
> >> - mutex_unlock(&indio_dev->mlock);
> >> + mutex_unlock(&st->lock);
> >>
> >>   return ret ? ret : len;
> >>  }
> >> --
> >> 2.7.4
> >>
> >> --
> >> You received this message because you are subscribed to the Google Groups 
> >> "outreachy-kernel" group.
> >> To unsubscribe from this group and stop receiving emails from it, send an 
> >> email to outreachy-kernel+unsubscr...@googlegroups.com.
> >> To post to this group, send email to outreachy-ker...@googlegroups.com.
> >> To view this discussion on the web visit 
> >> https://groups.google.com/d/msgid/outreachy-kernel/20170312133250.GA7772%40singhal-Inspiron-5558.
> >> For more options, visit https://groups.google.com/d/optout.
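
One general note on conversions like this (not visible in the quoted diff):
the new driver-private mutex must be initialised before first use, normally
in probe(). A hedged sketch of what that would look like here, assuming the
usual iio_priv() layout of this driver:

	indio_dev = devm_iio_device_alloc(&spi->dev, sizeof(*st));
	if (!indio_dev)
		return -ENOMEM;

	st = iio_priv(indio_dev);
	mutex_init(&st->buf_lock);
	mutex_init(&st->lock);	/* the new lock protecting sensor state */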


[PATCH] tpm: Add sysfs interface to show TPM hardware version

2017-03-12 Thread Meng.Li
From: Limeng 

So far, there is no sysfs interface for user space code to
check the TPM hardware version (TPM 1.x or TPM 2.0). So, add a
file named "description" in /sys/class/tpm/tpmX/ to show it.

Signed-off-by: Meng Li 
---
 drivers/char/tpm/tpm-chip.c |   85 +++
 1 file changed, 85 insertions(+)

diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c
index c406343..da2cd69 100644
--- a/drivers/char/tpm/tpm-chip.c
+++ b/drivers/char/tpm/tpm-chip.c
@@ -36,6 +36,83 @@
 dev_t tpm_devt;
 
 /**
+ * show_description - sysfs interface for checking current TPM hardware 
version.
+ * @dev:   pointer to tpm chip device
+ * @attr:  unused
+ * @buf:   char buffer to be filled with TPM hardware version info
+ *
+ * Provides sysfs interface for showing current TPM hardware version.
+ */
+static ssize_t show_description(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   struct tpm_chip *chip = (struct tpm_chip *)container_of(dev,struct 
tpm_chip,dev);
+   int ret;
+
+   if (chip->flags & TPM_CHIP_FLAG_TPM2)
+   ret = sprintf(buf, "TPM 2.0");
+   else
+   ret = sprintf(buf, "TPM 1.x");
+
+   return ret;
+}
+
+/**
+ * store_description - interface for manually setting data.
+ * @dev:   unused
+ * @attr:  unused
+ * @buf:   unused
+ * @count: unused
+ *
+ * There is no processing in this function yet; reserved for future use.
+ */
+static ssize_t store_description(struct device *dev, struct device_attribute 
*attr,
+   const char *buf, size_t count)
+{
+   return count;
+}
+
+static struct device_attribute tpm_attrs[] = {
+   __ATTR(description, S_IRUGO | S_IWUSR, show_description, 
store_description),
+};
+
+/**
+ * tpm_create_sysfs - Create tpm sysfs interface.
+ * @dev:   pointer to tpm chip device
+ *
+ * Create sysfs interface for checking current TPM hardware version.
+ */
+static int tpm_create_sysfs(struct device *dev)
+{
+   int r, t;
+
+   for (t = 0; t < ARRAY_SIZE(tpm_attrs); t++) {
+   r = device_create_file(dev, &tpm_attrs[t]);
+   if (r) {
+   dev_err(dev, "failed to create sysfs file\n");
+   return r;
+   }
+   }
+
+   return 0;
+}
+
+/**
+ * tpm_remove_sysfs - Remove tpm sysfs interface.
+ * @dev:   pointer to tpm chip device
+ *
+ * Remove sysfs interface for checking current TPM hardware version.
+ */
+static void tpm_remove_sysfs(struct device *dev)
+{
+   int  t;
+
+   for (t = 0; t < ARRAY_SIZE(tpm_attrs); t++) {
+   device_remove_file(dev, &tpm_attrs[t]);
+   }
+}
+
+/**
  * tpm_try_get_ops() - Get a ref to the tpm_chip
  * @chip: Chip to ref
  *
@@ -363,6 +440,13 @@ int tpm_chip_register(struct tpm_chip *chip)
return rc;
}
 
+   rc = tpm_create_sysfs(&chip->dev);
+   if (rc) {
+   tpm_del_legacy_sysfs(chip);
+   tpm_chip_unregister(chip);
+   return rc;
+   }
+
return 0;
 }
 EXPORT_SYMBOL_GPL(tpm_chip_register);
@@ -382,6 +466,7 @@ int tpm_chip_register(struct tpm_chip *chip)
  */
 void tpm_chip_unregister(struct tpm_chip *chip)
 {
+   tpm_remove_sysfs(&chip->dev);
tpm_del_legacy_sysfs(chip);
tpm_bios_log_teardown(chip);
tpm_del_char_device(chip);
-- 
1.7.9.5
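
As an aside, the open-coded device_create_file() loop above re-implements
what the driver core's attribute-group machinery already provides. For
comparison only (this is not what the patch does), the conventional sysfs
idiom would look roughly like:

static ssize_t description_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct tpm_chip *chip = container_of(dev, struct tpm_chip, dev);

	return sprintf(buf, "%s\n",
		       (chip->flags & TPM_CHIP_FLAG_TPM2) ? "TPM 2.0" : "TPM 1.x");
}
static DEVICE_ATTR_RO(description);

static struct attribute *tpm_dev_attrs[] = {
	&dev_attr_description.attr,
	NULL,
};
ATTRIBUTE_GROUPS(tpm_dev);

/* and then point chip->dev.groups at tpm_dev_groups before device_add() */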



[PATCH] mm, gup: fix typo in gup_p4d_range()

2017-03-12 Thread Kirill A. Shutemov
gup_p4d_range() should call gup_pud_range(), not itself.

Signed-off-by: Kirill A. Shutemov 
Reported-by: Chris Packham 
Fixes: c2febafc6773 ("mm: convert generic code to 5-level paging")
---
 mm/gup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/gup.c b/mm/gup.c
index c74bad1bf6e8..04aa405350dc 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1455,7 +1455,7 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, 
unsigned long end,
if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
 P4D_SHIFT, next, write, pages, nr))
return 0;
-   } else if (!gup_p4d_range(p4d, addr, next, write, pages, nr))
+   } else if (!gup_pud_range(p4d, addr, next, write, pages, nr))
return 0;
} while (p4dp++, addr = next, addr != end);
 
-- 
2.11.0



Re: [RFC PATCH] phy: samsung: move the Samsung specific phy files to "samsung" directory

2017-03-12 Thread Kishon Vijay Abraham I
Hi,

On Sunday 12 March 2017 02:48 PM, Vivek Gautam wrote:
> Hi Kishon,
> 
> 
> On Thu, Mar 9, 2017 at 5:26 PM, Kishon Vijay Abraham I  wrote:
>> Hi,
>>
>> On Thursday 09 March 2017 05:03 PM, Jaehoon Chung wrote:
>>> Make the "samsung" directory and move the Samsung specific files to
>>> there for maintaining the files relevant to Samsung.
>>
>> The number of phy drivers in drivers/phy is getting unmanageable. I think 
>> this
>> is a good step to make it a little better. Can you also add a MAINTAINER for
>> drivers/phy/samsung?
> 
> I remember making a similar attempt in past [1], but that time we couldn't
> reach an agreement as to whether group the phy drivers based on
> vendors or based on the type of phy.
> 
> If you are fine with grouping the drivers for each vendor, I hope you can
> consider picking that patch (I can respin the patch based on linux-phy/next).
> Other driver maintainers were also cool with that older patch.

Sure, you can re-spin the patch.

At that point of time I didn't think grouping phy drivers for each vendor is
required. But especially after [1] where I failed to notice an existing phy
driver can be reused and later has to be reverted. This could have been easily
identified by MAINTAINERS of that particular platform. That's why now I feel
grouping phy drivers and having a MAINTAINER for every vendor directory will
help to identify such issues.

Thanks
Kishon

[1]
https://git.kernel.org/pub/scm/linux/kernel/git/kishon/linux-phy.git/commit/?h=fixes&id=9200c6f177638909dbbaded8aeeeccbd48744400
> 
> Let me know your comments.
> 
> [1] https://patchwork.kernel.org/patch/8762561/
> 
> Regards
> Vivek
> 
>>
>> Thanks
>> Kishon
>>>
>>> Signed-off-by: Jaehoon Chung 
>>> ---
>>>  drivers/phy/Kconfig   | 96 
>>> +--
>>>  drivers/phy/Makefile  | 14 +---
>>>  drivers/phy/samsung/Kconfig   | 92 
>>> ++
>>>  drivers/phy/samsung/Makefile  | 11 +++
>>>  drivers/phy/{ => samsung}/phy-exynos-dp-video.c   |  0
>>>  drivers/phy/{ => samsung}/phy-exynos-mipi-video.c |  0
>>>  drivers/phy/{ => samsung}/phy-exynos-pcie.c   |  0
>>>  drivers/phy/{ => samsung}/phy-exynos4210-usb2.c   |  0
>>>  drivers/phy/{ => samsung}/phy-exynos4x12-usb2.c   |  0
>>>  drivers/phy/{ => samsung}/phy-exynos5-usbdrd.c|  0
>>>  drivers/phy/{ => samsung}/phy-exynos5250-sata.c   |  0
>>>  drivers/phy/{ => samsung}/phy-exynos5250-usb2.c   |  0
>>>  drivers/phy/{ => samsung}/phy-s5pv210-usb2.c  |  0
>>>  drivers/phy/{ => samsung}/phy-samsung-usb2.c  |  0
>>>  drivers/phy/{ => samsung}/phy-samsung-usb2.h  |  0
>>>  15 files changed, 108 insertions(+), 105 deletions(-)
>>>  create mode 100644 drivers/phy/samsung/Kconfig
>>>  create mode 100644 drivers/phy/samsung/Makefile
>>>  rename drivers/phy/{ => samsung}/phy-exynos-dp-video.c (100%)
>>>  rename drivers/phy/{ => samsung}/phy-exynos-mipi-video.c (100%)
>>>  rename drivers/phy/{ => samsung}/phy-exynos-pcie.c (100%)
>>>  rename drivers/phy/{ => samsung}/phy-exynos4210-usb2.c (100%)
>>>  rename drivers/phy/{ => samsung}/phy-exynos4x12-usb2.c (100%)
>>>  rename drivers/phy/{ => samsung}/phy-exynos5-usbdrd.c (100%)
>>>  rename drivers/phy/{ => samsung}/phy-exynos5250-sata.c (100%)
>>>  rename drivers/phy/{ => samsung}/phy-exynos5250-usb2.c (100%)
>>>  rename drivers/phy/{ => samsung}/phy-s5pv210-usb2.c (100%)
>>>  rename drivers/phy/{ => samsung}/phy-samsung-usb2.c (100%)
>>>  rename drivers/phy/{ => samsung}/phy-samsung-usb2.h (100%)
> 
> [snip]
> 
> 
> 


Re: [PATCH] mm: mark gup_pud_range as unused

2017-03-12 Thread Kirill A. Shutemov
On Mon, Mar 13, 2017 at 04:58:37PM +1300, Chris Packham wrote:
> The last caller to gup_pud_range was removed in commit c2febafc6773
> ("mm: convert generic code to 5-level paging"). Mark it as unused to
> silence a warning from gcc.
> 
> Signed-off-by: Chris Packham 
> ---
> I saw this warning when compiling 4.11-rc2 with -Werror. An equally valid fix
> would be to remove the function entirely but I went for the less invasive
> approach.

Thanks for report. But real fix is to call gup_pud_range() from
gup_p4d_range(), not itself.

I'll post a fix.
Reported-by: Chris Packham 

-- 
 Kirill A. Shutemov


Re: memfill v2 now with ARM and x86 implementations

2017-03-12 Thread Minchan Kim
Hi Matthew,

On Sat, Mar 11, 2017 at 06:56:40AM -0800, Matthew Wilcox wrote:
> On Mon, Feb 06, 2017 at 12:16:44AM +0900, Minchan Kim wrote:
> > +static inline void zram_fill_page(char *ptr, unsigned long len,
> > +   unsigned long value)
> > +{
> > +   int i;
> > +   unsigned long *page = (unsigned long *)ptr;
> > +
> > +   WARN_ON_ONCE(!IS_ALIGNED(len, sizeof(unsigned long)));
> > +
> > +   if (likely(value == 0)) {
> > +   memset(ptr, 0, len);
> > +   } else {
> > +   for (i = 0; i < len / sizeof(*page); i++)
> > +   page[i] = value;
> > +   }
> > +}
> 
> I've hacked up memset32/memset64 for both ARM and x86 here:
> 
> http://git.infradead.org/users/willy/linux-dax.git/shortlog/refs/heads/memfill

Thanks for the patch.

> 
> Can you do some performance testing and see if it makes a difference?

I tested with zram *full* of 100M of non-zero dedupable data (i.e.,
an ideal case) on x86. With this, I see a 7% enhancement.

perf stat -r 10 dd if=/dev/zram0 of=/dev/null

vanilla:0.232050465 seconds time elapsed ( +-  0.51% )
memset_l:   0.217219387 seconds time elapsed ( +-  0.07% )

I doubt it brings that much benefit in a read workload with only a small
percentage of non-zero dedup data (e.g., under 3%), but it keeps the code
simple and is a performance win.

Thanks.

> 
> At this point, I'd probably ask for the first 5 patches in that git
> branch to be included, and leave out memfill and the shoddy testsuite.
> 
> I haven't actually tested either asm implementation ... only the
> C fallback.
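
For context, the zram helper quoted at the top of the thread collapses nicely
once memset_l() is available. A sketch, assuming the series' interface of
memset_l(dst, value, count) where count is in longs:

static inline void zram_fill_page(void *ptr, unsigned long len,
				  unsigned long value)
{
	WARN_ON_ONCE(!IS_ALIGNED(len, sizeof(unsigned long)));
	memset_l(ptr, value, len / sizeof(unsigned long));
}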


Re: [PATCH 1/7] regulator: max1586: Constify regulator_ops

2017-03-12 Thread Chanwoo Choi
Dear all,

All patches in this series looks good to me.
Reviewed-by: Chanwoo Choi 

Best Regards,
Chanwoo Choi

On 2017년 03월 12일 04:01, Krzysztof Kozlowski wrote:
> Static struct regulator_ops is not modified so can be made const for
> code safeness.
> 
> Signed-off-by: Krzysztof Kozlowski 
> ---
>  drivers/regulator/max1586.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/regulator/max1586.c b/drivers/regulator/max1586.c
> index 2c1228d5796a..6779c2b53674 100644
> --- a/drivers/regulator/max1586.c
> +++ b/drivers/regulator/max1586.c
> @@ -126,14 +126,14 @@ static int max1586_v6_set_voltage_sel(struct 
> regulator_dev *rdev,
>   * The Maxim 1586 controls V3 and V6 voltages, but offers no way of reading 
> back
>   * the set up value.
>   */
> -static struct regulator_ops max1586_v3_ops = {
> +static const struct regulator_ops max1586_v3_ops = {
>   .get_voltage_sel = max1586_v3_get_voltage_sel,
>   .set_voltage_sel = max1586_v3_set_voltage_sel,
>   .list_voltage = regulator_list_voltage_linear,
>   .map_voltage = regulator_map_voltage_linear,
>  };
>  
> -static struct regulator_ops max1586_v6_ops = {
> +static const struct regulator_ops max1586_v6_ops = {
>   .get_voltage_sel = max1586_v6_get_voltage_sel,
>   .set_voltage_sel = max1586_v6_set_voltage_sel,
>   .list_voltage = regulator_list_voltage_table,
> 



Re: [PATCH v2] mtd: Fix mtdblock for >4GB MTD devices

2017-03-12 Thread Marek Vasut
On 03/01/2017 05:14 AM, lepton wrote:
> If checking some calling side,  the len is from cache_size of struct
> mtdblk_dev, it's defined as unsigned int now. So it's not 64bit yet.

Ummm ... since you're top-posting, I have no clue which part do you
refer to , sorry.

> BTW, seems it's just block size (512) at some other calling side.
> 
> (Sorry for previous same content email, just found out it's html
> format and rejected by mail list)
> 
> On Mon, Feb 27, 2017 at 1:31 AM, Marek Vasut  wrote:
>> On 02/22/2017 03:15 AM, Lepton Wu wrote:
>>> Change to use loff_t instead of unsigned long in some functions
>>> to make sure mtdblock can handle offset bigger than 4G in 32 bits mode.
>>>
>>> Signed-off-by: Lepton Wu 
>>> ---
>>>  Changes in v2:
>>>   - Make the commit message more clearer and fix some format issues.
>>>
>>>  drivers/mtd/mtdblock.c| 35 ++-
>>>  drivers/mtd/mtdblock_ro.c |  4 ++--
>>>  2 files changed, 20 insertions(+), 19 deletions(-)
>>>
>>> diff --git a/drivers/mtd/mtdblock.c b/drivers/mtd/mtdblock.c
>>> index bb4c14f83c75..373c0edca803 100644
>>> --- a/drivers/mtd/mtdblock.c
>>> +++ b/drivers/mtd/mtdblock.c
>>> @@ -61,8 +61,8 @@ static void erase_callback(struct erase_info *done)
>>>   wake_up(wait_q);
>>>  }
>>>
>>> -static int erase_write (struct mtd_info *mtd, unsigned long pos,
>>> - int len, const char *buf)
>>> +static int erase_write(struct mtd_info *mtd, loff_t pos, int len,
>>> +const char *buf)
>>
>> Can the length be 64bit too now ?
>>
>> [...]
>>
>> --
>> Best regards,
>> Marek Vasut


-- 
Best regards,
Marek Vasut


Re: [PATCH 1/2] extcon: int3496: Use gpiod_get instead of gpiod_get_index

2017-03-12 Thread Chanwoo Choi
Hi,

On 2017년 03월 11일 05:52, Hans de Goede wrote:
> Now that we've an acpi mapping table we should be using gpiod_get
> instead of gpiod_get_index.
> 
> Cc: Andy Shevchenko 
> Signed-off-by: Hans de Goede 
> ---
>  drivers/extcon/extcon-intel-int3496.c | 12 +++-
>  1 file changed, 3 insertions(+), 9 deletions(-)

Applied it.

-- 
Best Regards,
Chanwoo Choi
Samsung Electronics


Re: [PATCH 2/7] regulator: max77693: Constify regulator_ops

2017-03-12 Thread Chanwoo Choi
Hi,

On 2017년 03월 12일 04:01, Krzysztof Kozlowski wrote:
> Static struct regulator_ops is not modified so can be made const for
> code safeness.
> 
> Signed-off-by: Krzysztof Kozlowski 
> ---
>  drivers/regulator/max77693-regulator.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/regulator/max77693-regulator.c 
> b/drivers/regulator/max77693-regulator.c
> index 3fce67982682..e7000e777292 100644
> --- a/drivers/regulator/max77693-regulator.c
> +++ b/drivers/regulator/max77693-regulator.c
> @@ -150,7 +150,7 @@ static const struct regulator_ops max77693_safeout_ops = {
>   .set_voltage_sel= regulator_set_voltage_sel_regmap,
>  };
>  
> -static struct regulator_ops max77693_charger_ops = {
> +static const struct regulator_ops max77693_charger_ops = {
>   .is_enabled = regulator_is_enabled_regmap,
>   .enable = regulator_enable_regmap,
>   .disable= regulator_disable_regmap,
> 

Reviewed-by: Chanwoo Choi 

-- 
Best Regards,
Chanwoo Choi
Samsung Electronics


[PATCH v2-kernel 4.1] irqdomain: handle the per-CPU irq trigger type settings

2017-03-12 Thread Dongjiu Geng
When a per-CPU device sets its IRQ trigger type through the
irq_of_parse_and_map() API, the call fails, because irq_set_irq_type()
only handles 1-N mode interrupt sources, not per-CPU interrupt sources.
Handle per-CPU IRQs for this failure.

Problem: a per-CPU device calls irq_of_parse_and_map() to set its timer
trigger type to IRQ_TYPE_EDGE_RISING. irq_of_parse_and_map() calls
irq_create_of_mapping(), which sets the trigger type through
irq_set_irq_type(). But irq_set_irq_type() uses
IRQ_GET_DESC_CHECK_GLOBAL and not IRQ_GET_DESC_CHECK_PERCPU, and a
per-CPU IRQ is per-CPU, not global.

Solution: store the type in the irq_data, so it can later be read back
via irq_get_trigger_type() and applied by enable_percpu_irq().

Signed-off-by: Dongjiu Geng 
Signed-off-by: Haibin Zhang 
---
 kernel/irq/irqdomain.c |   12 ++--
 1 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 9fd618d..8116cf2 100755
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -542,8 +542,16 @@ unsigned int irq_create_of_mapping(struct of_phandle_args 
*irq_data)
 
/* Set type if specified and different than the current one */
if (type != IRQ_TYPE_NONE &&
-   type != irq_get_trigger_type(virq))
-   irq_set_irq_type(virq, type);
+   type != irq_get_trigger_type(virq)) {
+   int ret = 0;
+   struct irq_data *irq_data = irq_get_irq_data(virq);
+
+   ret = irq_set_irq_type(virq, type);
+
+/* Handle per-cpu IRQ: just save type in irq_data */
+   if (-EINVAL == ret && irq_data)
+   irqd_set_trigger_type(irq_data, type);
+   }
return virq;
 }
 EXPORT_SYMBOL_GPL(irq_create_of_mapping);
-- 
1.7.7
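
To illustrate the consumer side this change is aimed at (a sketch only,
assuming a per-CPU timer driver and a device-tree supplied trigger type):
with the type cached in the irq_data by the hunk above, the driver can read
it back and apply it when enabling the interrupt on each CPU:

	/* during probe */
	virq = irq_of_parse_and_map(np, 0);	/* DT specifies e.g. IRQ_TYPE_EDGE_RISING */
	err = request_percpu_irq(virq, timer_handler, "percpu-timer", percpu_dev_id);

	/* on each CPU being brought online */
	enable_percpu_irq(virq, irq_get_trigger_type(virq));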



Re: [PATCH v2] statx: optimize copy of struct statx to userspace

2017-03-12 Thread Andreas Dilger
On Mar 11, 2017, at 11:01 PM, Eric Biggers  wrote:
> 
> On Sat, Mar 11, 2017 at 08:02:06PM -0800, Eric Biggers wrote:
>> On Sun, Mar 12, 2017 at 02:29:27AM +, Al Viro wrote:
>>> 
>>> Oh, I agree that multiple __put_user() are wrong; I also agree that bulk
>>> copy is the right approach (when we get the unsafe stuff right, we can
>>> revisit that, but I suspect that on quite a few architectures a bulk copy
>>> will still give better time, no matter what).
>>> 
 If padding is a concern at all (AFAICS it's not actually an issue now
 with struct statx, but people tend to have different opinions on how
 careful they want to be with padding), then I think we'll just have to
 start by memsetting the whole struct to 0.
>>> 
>>> My point is simply that it's worth a comment in that code.
>> 
>> Okay, thanks.  I'll add a comment about the padding assumption, and I think
>> I'll take the suggestion to use a designated initializer.  Then at least
>> all *fields* get initialized by default.  And if in the future someone
>> wants to conditionally initialize fields, then they can use ?: or they can
>> do it after the initializer.  Either way, at least they won't be able to
>> forget to zero some field.
> 
> Okay, well, I may have changed my mind again...  I tried the designated
> initializer on x86_64 with gcc 4.8 and 6.3, and also on arm64 with gcc 4.8.
> In each case, it was compiled into first zeroing all 256 bytes of the struct,
> just like memset(, 0, sizeof(tmp)).  Yes, this was with
> CC_OPTIMIZE_FOR_PERFORMANCE=y.  So I think we might as well just write the
> full memset(), making it completely clear that everything is initialized.
> (This is especially useful for people who are auditing code paths like this
> for information leaks.)  Also, a smart compiler could potentially optimize
> away parts of the memset() anyway...

Not that it is a huge deal either way, but I'd think it is harder for the
compiler to optimize across a function call boundary like memset() vs. a
struct initialization in the same function where it can see that all but
a few of the fields are being overwritten immediately before they are used.

I don't think the designated initializer is any less clear to the reader
that the struct is zeroed out compared to using memset().  Possibly the
best compromise is to use a designated initializer that specifies all of
the known fields, and leaves it to the compiler to initialize unset fields
or padding.  That avoids double zeroing without any risk of exposing unset
fields to userspace:

static int cp_statx(const struct kstat *stat, struct statx __user *buffer)
{
	struct statx tmp = {
		.stx_mask = stat->result_mask,
		.stx_blksize = stat->blksize,
		.stx_attributes = stat->attributes,
		.stx_nlink = stat->nlink,
		.stx_uid = from_kuid_munged(current_user_ns(), stat->uid),
		.stx_gid = from_kgid_munged(current_user_ns(), stat->gid),
		.stx_mode = stat->mode,
		.stx_ino = stat->ino,
		.stx_size = stat->size,
		.stx_blocks = stat->blocks,
		.stx_atime.tv_sec = stat->atime.tv_sec,
		.stx_atime.tv_nsec = stat->atime.tv_nsec,
		.stx_btime.tv_sec = stat->btime.tv_sec,
		.stx_btime.tv_nsec = stat->btime.tv_nsec,
		.stx_ctime.tv_sec = stat->ctime.tv_sec,
		.stx_ctime.tv_nsec = stat->ctime.tv_nsec,
		.stx_mtime.tv_sec = stat->mtime.tv_sec,
		.stx_mtime.tv_nsec = stat->mtime.tv_nsec,
		.stx_rdev_major = MAJOR(stat->rdev),
		.stx_rdev_minor = MINOR(stat->rdev),
		.stx_dev_major = MAJOR(stat->dev),
		.stx_dev_minor = MINOR(stat->dev),
	};

	return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
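
For contrast, here is a minimal sketch of the memset()-based variant Eric
describes above (this is not code from the thread; the name cp_statx_memset
is made up and most of the field copies are elided):

static int cp_statx_memset(const struct kstat *stat, struct statx __user *buffer)
{
	struct statx tmp;

	/* Zero every field and all padding explicitly before filling it in. */
	memset(&tmp, 0, sizeof(tmp));

	tmp.stx_mask = stat->result_mask;
	tmp.stx_blksize = stat->blksize;
	/* ... remaining fields copied exactly as in the initializer above ... */
	tmp.stx_dev_major = MAJOR(stat->dev);
	tmp.stx_dev_minor = MINOR(stat->dev);

	return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}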

Cheers, Andreas


Re: [PATCH v5 00/39] i.MX Media Driver

2017-03-12 Thread Steve Longerbeam



On 03/12/2017 01:22 PM, Russell King - ARM Linux wrote:

On Sun, Mar 12, 2017 at 01:05:06PM -0700, Steve Longerbeam wrote:



On 03/12/2017 12:57 PM, Russell King - ARM Linux wrote:

On Sat, Mar 11, 2017 at 04:30:53PM -0800, Steve Longerbeam wrote:

If it's too difficult to get the imx219 csi-2 transmitter into the
LP-11 state on power on, perhaps the csi-2 receiver can be a little
more lenient on the transmitter and make the LP-11 timeout a warning
instead of error-out.

Can you try the attached change on top of the version 5 patchset?

If that doesn't work then you're just going to have to fix the bug
in imx219.


That patch gets me past that hurdle, only to reveal that there's another
issue:


Yeah, ipu_cpmem_set_image() failed because it doesn't recognize the
bayer formats. Wait, didn't we fix this already? I've lost track.
Ah, right, we were going to move this support into the IPUv3 driver,
but in the meantime I think you had some patches to get around this.


What I had was this patch for your v3.  I never got to testing your
v4 because of the LP-11 problem.

In v5, you've changed to propagate the ipu_cpmem_set_image() error
code to avoid the resulting corruption, but that leaves the other bits
of this patch unaddressed, along with my "media: imx: smfc: add support
for bayer formats" patch.

Your driver basically has no support for bayer formats.


You added the patches to this driver that add the bayer support.
I don't think there is anything more required of the driver at this
point to support bayer; the remaining work needs to happen in the IPUv3
driver.

I'll see if I have time to write that patch to IPUv3, but it's simple;
in fact what you wrote below can be translated directly into
ipu_cpmem_set_image(). There are a few other places bayer needs to be
treated in IPUv3, but they should be obvious by grepping for the
references to pixel formats.
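
A rough sketch of that direction, assuming only the existing
ipu_cpmem_set_resolution()/_stride()/_buffer() helpers (this is not a patch
from the thread, and ipu_cpmem_set_image_bayer() is a made-up name):

/*
 * Rough sketch: fold the bayer/passthrough programming into the IPUv3
 * side, using the same cpmem helpers the workaround below calls directly.
 */
static int ipu_cpmem_set_image_bayer(struct ipuv3_channel *ch,
				     struct ipu_image *image)
{
	switch (image->pix.pixelformat) {
	case V4L2_PIX_FMT_SBGGR8:
	case V4L2_PIX_FMT_SGBRG8:
	/* ... the other 8-bit and 16-bit bayer variants handled the same way ... */
	case V4L2_PIX_FMT_SBGGR16:
		/* raw passthrough: program resolution, stride and buffers directly */
		ipu_cpmem_set_resolution(ch, image->rect.width,
					 image->rect.height);
		ipu_cpmem_set_stride(ch, image->pix.bytesperline);
		ipu_cpmem_set_buffer(ch, 0, image->phys0);
		ipu_cpmem_set_buffer(ch, 1, image->phys1);
		return 0;
	default:
		return -EINVAL;	/* let the existing packed/planar path handle it */
	}
}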

Steve




diff --git a/drivers/staging/media/imx/imx-smfc.c b/drivers/staging/media/imx/imx-smfc.c
index 313732201a52..4351c0365cf4 100644
--- a/drivers/staging/media/imx/imx-smfc.c
+++ b/drivers/staging/media/imx/imx-smfc.c
@@ -234,11 +234,6 @@ static void imx_smfc_setup_channel(struct imx_smfc_priv *priv)
buf1 = imx_media_dma_buf_get_next_queued(priv->out_ring);
priv->next = buf1;

-   image.phys0 = buf0->phys;
-   image.phys1 = buf1->phys;
-   ipu_cpmem_set_image(priv->smfc_ch, &image);
-
-
switch (image.pix.pixelformat) {
case V4L2_PIX_FMT_SBGGR8:
case V4L2_PIX_FMT_SGBRG8:
@@ -247,6 +242,10 @@ static void imx_smfc_setup_channel(struct imx_smfc_priv *priv)
burst_size = 8;
passthrough = true;
passthrough_bits = 8;
+   ipu_cpmem_set_resolution(priv->smfc_ch, image.rect.width, image.rect.height);
+   ipu_cpmem_set_stride(priv->smfc_ch, image.pix.bytesperline);
+   ipu_cpmem_set_buffer(priv->smfc_ch, 0, buf0->phys);
+   ipu_cpmem_set_buffer(priv->smfc_ch, 1, buf1->phys);
break;

case V4L2_PIX_FMT_SBGGR16:
@@ -256,9 +255,17 @@ static void imx_smfc_setup_channel(struct imx_smfc_priv *priv)
burst_size = 4;
passthrough = true;
passthrough_bits = 16;
+   ipu_cpmem_set_resolution(priv->smfc_ch, image.rect.width, image.rect.height);
+   ipu_cpmem_set_stride(priv->smfc_ch, image.pix.bytesperline);
+   ipu_cpmem_set_buffer(priv->smfc_ch, 0, buf0->phys);
+   ipu_cpmem_set_buffer(priv->smfc_ch, 1, buf1->phys);
break;

default:
+   image.phys0 = buf0->phys;
+   image.phys1 = buf1->phys;
+   ipu_cpmem_set_image(priv->smfc_ch, &image);
+
burst_size = (outfmt->width & 0xf) ? 8 : 16;

/*



Re: [PATCH 01/14] cpufreq: intel_pstate: Update pid_params.sample_rate_ns in pid_param_set()

2017-03-12 Thread Viresh Kumar
On Sun, Mar 12, 2017 at 10:42 PM, Rafael J. Wysocki  wrote:
> From: Rafael J. Wysocki 
>
> Fix the debugfs interface for PID tuning to actually update
> pid_params.sample_rate_ns on PID parameters updates, as changing
> pid_params.sample_rate_ms via debugfs has no effect now.
>
> Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization 
> update callbacks)
> Signed-off-by: Rafael J. Wysocki 
> ---
>  drivers/cpufreq/intel_pstate.c |1 +
>  1 file changed, 1 insertion(+)
>
> Index: linux-pm/drivers/cpufreq/intel_pstate.c
> ===
> --- linux-pm.orig/drivers/cpufreq/intel_pstate.c
> +++ linux-pm/drivers/cpufreq/intel_pstate.c
> @@ -983,6 +983,7 @@ static void intel_pstate_update_policies
>  static int pid_param_set(void *data, u64 val)
>  {
> *(u32 *)data = val;
> +   pid_params.sample_rate_ns = pid_params.sample_rate_ms * NSEC_PER_MSEC;
> intel_pstate_reset_all_pid();
> return 0;
>  }
>

Acked-by: Viresh Kumar 


Re: [Outreachy kernel] [PATCH] staging: iio: ade7753: replace mlock with driver private lock

2017-03-12 Thread SIMRAN SINGHAL
On Mon, Mar 13, 2017 at 12:03 AM, Alison Schofield  wrote:
> On Sun, Mar 12, 2017 at 07:02:50PM +0530, simran singhal wrote:
>> The IIO subsystem is redefining iio_dev->mlock to be used by
>> the IIO core only for protecting device operating mode changes.
>> ie. Changes between INDIO_DIRECT_MODE, INDIO_BUFFER_* modes.
>>
>> In this driver, mlock was being used to protect hardware state
>> changes.  Replace it with a lock in the devices global data.
>>
>> Fix some coding style issues related to white space also.
>>
>> Signed-off-by: simran singhal 
>
> Hi Simran, This looks good to me.  Let's see what the
> reviewers say.  I think the white space stuff is ok,
> since it was right where you were editing.
> alisons
>
Alison, so sending this patch here on the outreachy mailing list is fine.
Still confused :P

>> ---
>>  drivers/staging/iio/meter/ade7753.c | 14 --
>>  1 file changed, 8 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/staging/iio/meter/ade7753.c 
>> b/drivers/staging/iio/meter/ade7753.c
>> index dfd8b71..ca99d82 100644
>> --- a/drivers/staging/iio/meter/ade7753.c
>> +++ b/drivers/staging/iio/meter/ade7753.c
>> @@ -81,12 +81,14 @@
>>   * @tx: transmit buffer
>>   * @rx: receive buffer
>>   * @buf_lock:   mutex to protect tx and rx
>> + * @lock: protect sensor state
>>   **/
>>  struct ade7753_state {
>> - struct spi_device   *us;
>> - struct mutex buf_lock;
>> - u8  tx[ADE7753_MAX_TX] cacheline_aligned;
>> - u8  rx[ADE7753_MAX_RX];
>> + struct spi_device   *us;
>> + struct mutex buf_lock;
>> + struct mutex lock;   /* protect sensor state */
>> + u8  tx[ADE7753_MAX_TX] cacheline_aligned;
>> + u8  rx[ADE7753_MAX_RX];
>>  };
>>
>>  static int ade7753_spi_write_reg_8(struct device *dev,
>> @@ -484,7 +486,7 @@ static ssize_t ade7753_write_frequency(struct device 
>> *dev,
>>   if (!val)
>>   return -EINVAL;
>>
>> - mutex_lock(&indio_dev->mlock);
>> + mutex_lock(&st->lock);
>>
>>   t = 27900 / val;
>>   if (t > 0)
>> @@ -505,7 +507,7 @@ static ssize_t ade7753_write_frequency(struct device 
>> *dev,
>>   ret = ade7753_spi_write_reg_16(dev, ADE7753_MODE, reg);
>>
>>  out:
>> - mutex_unlock(&indio_dev->mlock);
>> + mutex_unlock(&st->lock);
>>
>>   return ret ? ret : len;
>>  }
>> --
>> 2.7.4
>>
>> --
>> You received this message because you are subscribed to the Google Groups 
>> "outreachy-kernel" group.
>> To unsubscribe from this group and stop receiving emails from it, send an 
>> email to outreachy-kernel+unsubscr...@googlegroups.com.
>> To post to this group, send email to outreachy-ker...@googlegroups.com.
>> To view this discussion on the web visit 
>> https://groups.google.com/d/msgid/outreachy-kernel/20170312133250.GA7772%40singhal-Inspiron-5558.
>> For more options, visit https://groups.google.com/d/optout.


[PATCH] mm: mark gup_pud_range as unused

2017-03-12 Thread Chris Packham
The last caller to gup_pud_range was removed in commit c2febafc6773
("mm: convert generic code to 5-level paging"). Mark it as unused to
silence a warning from gcc.

Signed-off-by: Chris Packham 
---
I saw this warning when compiling 4.11-rc2 with -Werror. An equally valid fix
would be to remove the function entirely but I went for the less invasive
approach.

 mm/gup.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index c74bad1bf6e8..10f5c582273c 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1409,8 +1409,9 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
return 1;
 }
 
-static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
-int write, struct page **pages, int *nr)
+static int __maybe_unused gup_pud_range(p4d_t p4d, unsigned long addr,
+   unsigned long end, int write,
+   struct page **pages, int *nr)
 {
unsigned long next;
pud_t *pudp;
-- 
2.11.0.24.ge6920cf



Re: [PATCH for-4.11] ASoC: don't dereference NULL pcm_{new,free}

2017-03-12 Thread Kuninori Morimoto

Hi Brian

Thank you for your feedback

> There are 4 drivers calling that:
> 
>   snd_soc_dummy_probe
>   rt5514_spi_probe
>   2 instances of snd_dmaengine_pcm_register, via rockchip_i2s_probe
> 
> Only the latter two seem to run the assignment here:
> 
>   if (platform_drv->pcm_new)
>   platform->component.pcm_new = snd_soc_platform_drv_pcm_new;
> 
> Both snd_soc_dummy_probe and rt5514_spi_probe find ->pcm_new NULL here.

Hmm...

The crasher was snd_dmaengine_pcm_register's platform?
This means that, in your current kernel, the dmaengine platform somehow
doesn't call its .pcm_new (= dmaengine_pcm_new)?

I'm wondering why ->pcm_new, which existed at probe time, became NULL?
Can you check the component and driver with this patch?
It is very rough, but enough for debugging.

-
diff --git a/sound/soc/soc-core.c b/sound/soc/soc-core.c
index 5933851..43da1ec 100644
--- a/sound/soc/soc-core.c
+++ b/sound/soc/soc-core.c
@@ -3322,6 +3322,10 @@ static int snd_soc_platform_drv_pcm_new(struct snd_soc_pcm_runtime *rtd)
 {
struct snd_soc_platform *platform = rtd->platform;
 
+   printk("---use name: %s, %p\n",
+  platform->component.name,
+  platform->driver);
+
return platform->driver->pcm_new(rtd);
 }
 
@@ -3356,8 +3360,12 @@ int snd_soc_add_platform(struct device *dev, struct snd_soc_platform *platform,
platform->component.probe = snd_soc_platform_drv_probe;
if (platform_drv->remove)
platform->component.remove = snd_soc_platform_drv_remove;
-   if (platform_drv->pcm_new)
+   if (platform_drv->pcm_new) {
+   printk("---add name: %s, %p\n",
+  platform->component.name,
+  platform->driver);
platform->component.pcm_new = snd_soc_platform_drv_pcm_new;
+   }
if (platform_drv->pcm_free)
platform->component.pcm_free = snd_soc_platform_drv_pcm_free;
 
-


Re: [PATCH 3.16 302/370] drm/radeon: Use mode h/vdisplay fields to hide out of bounds HW cursor

2017-03-12 Thread Michel Dänzer
On 10/03/17 08:46 PM, Ben Hutchings wrote:
> 3.16.42-rc1 review patch.  If anyone has any objections, please let me know.
> 
> --
> 
> From: Michel Dänzer 
> 
> commit d74c67dd7800fc7aae381f272875c337f268806c upstream.
> 
> The crtc_h/vdisplay fields may not match the CRTC viewport dimensions
> with special modes such as interlaced ones.
> 
> Fixes the HW cursor disappearing in the bottom half of the screen with
> interlaced modes.
> 
> Fixes: 6b16cf7785a4 ("drm/radeon: Hide the HW cursor while it's out of 
> bounds")

It might make sense to squash together the backports of this commit and
6b16cf7785a4, or at least move them closer together in the series, to
prevent people from hitting the regressed state.


-- 
Earthling Michel Dänzer   |   http://www.amd.com
Libre software enthusiast | Mesa and X developer


Re: [RFC] Add option to mount only a pids subset

2017-03-12 Thread Andy Lutomirski
On Sat, Mar 11, 2017 at 6:13 PM, Al Viro  wrote:
> PS: AFAICS, simple mount --bind of your pid-only mount will suddenly
> expose the full thing.  And as for the lifetimes making no sense...
> note that you are simply not freeing these structures of yours.
> Try to handle that and you'll get a serious PITA all over the
> place.
>
> What are you trying to achieve, anyway?  Why not add a second vfsmount
> pointer per pid_namespace and make it initialized on demand, at the
> first attempt of no-pid mount?  Just have a separate no-pid instance
> created for those namespaces where it had been asked for, with
> separate superblock and dentry tree not containing anything other
> that pid-only parts + self + thread-self...

Can't we just make procfs work like most other filesystems and have
each mount have its own superblock?  If we need to do something funky
to stat() output to keep existing userspace working, I think that's
okay.

As far as I can tell, proc_mnt is very nearly useless -- it seems to
be used for proc_flush_task (which claims to be purely an optimization
and could be preserved in the common case where there's only one
relevant mount) and for sysctl_binary.  For the latter, we could
create proc_mnt but make actual user-initiated mounts be new
superblocks anyway.


[PATCH 1/1] Drivers: hv: vmbus: Don't leak memory when a channel is rescinded

2017-03-12 Thread kys
From: K. Y. Srinivasan 

When we close a channel that has been rescinded, we will leak memory since
vmbus_teardown_gpadl() returns an error. Fix this so that we can properly
cleanup the memory allocated to the ring buffers.

Fixes: ccb61f8a99e6 ("Drivers: hv: vmbus: Fix a rescind handling bug")

Signed-off-by: K. Y. Srinivasan 
---
 drivers/hv/channel.c |   13 -
 1 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 54075ac..e9b0fab 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -502,12 +502,15 @@ int vmbus_teardown_gpadl(struct vmbus_channel *channel, u32 gpadl_handle)
 
	wait_for_completion(&info->waitevent);
 
-   if (channel->rescind) {
-   ret = -ENODEV;
-   goto post_msg_err;
-   }
-
 post_msg_err:
+   /*
+* If the channel has been rescinded;
+* we will be awakened by the rescind
+* handler; set the error code to zero so we don't leak memory.
+*/
+   if (channel->rescind)
+   ret = 0;
+
	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
	list_del(&info->msglistentry);
	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
-- 
1.7.1


