[PATCH v6 08/38] arc: Implement the new page table range API

2023-08-02 Thread Matthew Wilcox (Oracle)
Add PFN_PTE_SHIFT, update_mmu_cache_range(), flush_dcache_folio()
and flush_icache_pages().

Change the PG_dc_clean flag from being per-page to per-folio (which
means it cannot always be set as we don't know that all pages in this
folio were cleaned).  Enhance the internal flush routines to take the
number of pages to flush.

Signed-off-by: Matthew Wilcox (Oracle) 
Acked-by: Mike Rapoport (IBM) 
Cc: Vineet Gupta 
Cc: linux-snps-arc@lists.infradead.org
---
 arch/arc/include/asm/cacheflush.h |  7 ++-
 arch/arc/include/asm/pgtable-bits-arcv2.h | 12 ++---
 arch/arc/include/asm/pgtable-levels.h |  1 +
 arch/arc/mm/cache.c   | 61 ++-
 arch/arc/mm/tlb.c | 18 ---
 5 files changed, 59 insertions(+), 40 deletions(-)

diff --git a/arch/arc/include/asm/cacheflush.h b/arch/arc/include/asm/cacheflush.h
index e201b4b1655a..04f65f588510 100644
--- a/arch/arc/include/asm/cacheflush.h
+++ b/arch/arc/include/asm/cacheflush.h
@@ -25,17 +25,20 @@
  * in update_mmu_cache()
  */
 #define flush_icache_page(vma, page)
+#define flush_icache_pages(vma, page, nr)
 
 void flush_cache_all(void);
 
 void flush_icache_range(unsigned long kstart, unsigned long kend);
 void __sync_icache_dcache(phys_addr_t paddr, unsigned long vaddr, int len);
-void __inv_icache_page(phys_addr_t paddr, unsigned long vaddr);
-void __flush_dcache_page(phys_addr_t paddr, unsigned long vaddr);
+void __inv_icache_pages(phys_addr_t paddr, unsigned long vaddr, unsigned nr);
+void __flush_dcache_pages(phys_addr_t paddr, unsigned long vaddr, unsigned nr);
 
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 
 void flush_dcache_page(struct page *page);
+void flush_dcache_folio(struct folio *folio);
+#define flush_dcache_folio flush_dcache_folio
 
 void dma_cache_wback_inv(phys_addr_t start, unsigned long sz);
 void dma_cache_inv(phys_addr_t start, unsigned long sz);
diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h
index 6e9f8ca6d6a1..ee78ab30958d 100644
--- a/arch/arc/include/asm/pgtable-bits-arcv2.h
+++ b/arch/arc/include/asm/pgtable-bits-arcv2.h
@@ -100,14 +100,12 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
 }
 
-static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval)
-{
-   set_pte(ptep, pteval);
-}
+struct vm_fault;
+void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma,
+   unsigned long address, pte_t *ptep, unsigned int nr);
 
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
- pte_t *ptep);
+#define update_mmu_cache(vma, addr, ptep) \
+   update_mmu_cache_range(NULL, vma, addr, ptep, 1)
 
 /*
  * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that
diff --git a/arch/arc/include/asm/pgtable-levels.h b/arch/arc/include/asm/pgtable-levels.h
index ef68758b69f7..fc417c75c24d 100644
--- a/arch/arc/include/asm/pgtable-levels.h
+++ b/arch/arc/include/asm/pgtable-levels.h
@@ -169,6 +169,7 @@
 #define pte_ERROR(e) \
pr_crit("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
 
+#define PFN_PTE_SHIFT	PAGE_SHIFT
 #define pte_none(x)	(!pte_val(x))
 #define pte_present(x)	(pte_val(x) & _PAGE_PRESENT)
 #define pte_clear(mm,addr,ptep)	set_pte_at(mm, addr, ptep, __pte(0))
diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index 55c6de138eae..3c16ee942a5c 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -752,17 +752,17 @@ static inline void arc_slc_enable(void)
  * There's a corollary case, where kernel READs from a userspace mapped page.
  * If the U-mapping is not congruent to K-mapping, former needs flushing.
  */
-void flush_dcache_page(struct page *page)
+void flush_dcache_folio(struct folio *folio)
 {
struct address_space *mapping;
 
if (!cache_is_vipt_aliasing()) {
-   clear_bit(PG_dc_clean, &page->flags);
+   clear_bit(PG_dc_clean, &folio->flags);
return;
}
 
/* don't handle anon pages here */
-   mapping = page_mapping_file(page);
+   mapping = folio_flush_mapping(folio);
if (!mapping)
return;
 
@@ -771,17 +771,27 @@ void flush_dcache_page(struct page *page)
 * Make a note that K-mapping is dirty
 */
if (!mapping_mapped(mapping)) {
-   clear_bit(PG_dc_clean, &page->flags);
-   } else if (page_mapcount(page)) {
-
+   clear_bit(PG_dc_clean, &folio->flags);
+   } else if (folio_mapped(folio)) {
/* kernel reading from page with U-mapping */
-   phys_addr_t paddr = (unsigned long)page_address(page);
-   unsigned long vaddr = page->index << PAGE_SHIFT;
+   phys_addr_t paddr = (unsigned long)folio_address(folio);
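
For context on how PFN_PTE_SHIFT is consumed: defining it lets the generic
code step a PTE across the pages of a folio without a per-arch set_ptes().
A minimal sketch of that generic loop (simplified; the mainline helper also
wraps the loop in lazy-MMU enter/leave):

	static inline void set_ptes_sketch(struct mm_struct *mm,
			unsigned long addr, pte_t *ptep, pte_t pte,
			unsigned int nr)
	{
		for (;;) {
			set_pte(ptep, pte);
			if (--nr == 0)
				break;
			ptep++;
			/* advance the PFN encoded in the PTE by one page */
			pte = __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT));
		}
	}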


[PATCH v4 07/36] arc: Implement the new page table range API

2023-03-14 Thread Matthew Wilcox (Oracle)
Add PFN_PTE_SHIFT, update_mmu_cache_range(), flush_dcache_folio()
and flush_icache_pages().

Change the PG_dc_clean flag from being per-page to per-folio (which
means it cannot always be set as we don't know that all pages in this
folio were cleaned).  Enhance the internal flush routines to take the
number of pages to flush.

Signed-off-by: Matthew Wilcox (Oracle) 
Cc: Vineet Gupta 
Cc: linux-snps-arc@lists.infradead.org
---
 arch/arc/include/asm/cacheflush.h |  7 ++-
 arch/arc/include/asm/pgtable-bits-arcv2.h | 11 ++--
 arch/arc/include/asm/pgtable-levels.h |  1 +
 arch/arc/mm/cache.c   | 61 ++-
 arch/arc/mm/tlb.c | 18 ---
 5 files changed, 58 insertions(+), 40 deletions(-)

diff --git a/arch/arc/include/asm/cacheflush.h b/arch/arc/include/asm/cacheflush.h
index e201b4b1655a..04f65f588510 100644
--- a/arch/arc/include/asm/cacheflush.h
+++ b/arch/arc/include/asm/cacheflush.h
@@ -25,17 +25,20 @@
  * in update_mmu_cache()
  */
 #define flush_icache_page(vma, page)
+#define flush_icache_pages(vma, page, nr)
 
 void flush_cache_all(void);
 
 void flush_icache_range(unsigned long kstart, unsigned long kend);
 void __sync_icache_dcache(phys_addr_t paddr, unsigned long vaddr, int len);
-void __inv_icache_page(phys_addr_t paddr, unsigned long vaddr);
-void __flush_dcache_page(phys_addr_t paddr, unsigned long vaddr);
+void __inv_icache_pages(phys_addr_t paddr, unsigned long vaddr, unsigned nr);
+void __flush_dcache_pages(phys_addr_t paddr, unsigned long vaddr, unsigned nr);
 
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 
 void flush_dcache_page(struct page *page);
+void flush_dcache_folio(struct folio *folio);
+#define flush_dcache_folio flush_dcache_folio
 
 void dma_cache_wback_inv(phys_addr_t start, unsigned long sz);
 void dma_cache_inv(phys_addr_t start, unsigned long sz);
diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h
index 6e9f8ca6d6a1..06d8039180c0 100644
--- a/arch/arc/include/asm/pgtable-bits-arcv2.h
+++ b/arch/arc/include/asm/pgtable-bits-arcv2.h
@@ -100,14 +100,11 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
 }
 
-static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval)
-{
-   set_pte(ptep, pteval);
-}
+void update_mmu_cache_range(struct vm_area_struct *vma, unsigned long address,
+ pte_t *ptep, unsigned int nr);
 
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
- pte_t *ptep);
+#define update_mmu_cache(vma, addr, ptep) \
+   update_mmu_cache_range(vma, addr, ptep, 1)
 
 /*
  * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that
diff --git a/arch/arc/include/asm/pgtable-levels.h b/arch/arc/include/asm/pgtable-levels.h
index ef68758b69f7..fc417c75c24d 100644
--- a/arch/arc/include/asm/pgtable-levels.h
+++ b/arch/arc/include/asm/pgtable-levels.h
@@ -169,6 +169,7 @@
 #define pte_ERROR(e) \
pr_crit("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
 
+#define PFN_PTE_SHIFT	PAGE_SHIFT
 #define pte_none(x)	(!pte_val(x))
 #define pte_present(x)	(pte_val(x) & _PAGE_PRESENT)
 #define pte_clear(mm,addr,ptep)	set_pte_at(mm, addr, ptep, __pte(0))
diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index 55c6de138eae..3c16ee942a5c 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -752,17 +752,17 @@ static inline void arc_slc_enable(void)
  * There's a corollary case, where kernel READs from a userspace mapped page.
  * If the U-mapping is not congruent to K-mapping, former needs flushing.
  */
-void flush_dcache_page(struct page *page)
+void flush_dcache_folio(struct folio *folio)
 {
struct address_space *mapping;
 
if (!cache_is_vipt_aliasing()) {
-   clear_bit(PG_dc_clean, &page->flags);
+   clear_bit(PG_dc_clean, &folio->flags);
return;
}
 
/* don't handle anon pages here */
-   mapping = page_mapping_file(page);
+   mapping = folio_flush_mapping(folio);
if (!mapping)
return;
 
@@ -771,17 +771,27 @@ void flush_dcache_page(struct page *page)
 * Make a note that K-mapping is dirty
 */
if (!mapping_mapped(mapping)) {
-   clear_bit(PG_dc_clean, &page->flags);
-   } else if (page_mapcount(page)) {
-
+   clear_bit(PG_dc_clean, &folio->flags);
+   } else if (folio_mapped(folio)) {
/* kernel reading from page with U-mapping */
-   phys_addr_t paddr = (unsigned long)page_address(page);
-   unsigned long vaddr = page->index << PAGE_SHIFT;
+   phys_addr_t paddr = (unsigned long)folio_address(folio);
+   unsigned long vaddr = folio_pos(folio);

[PATCH v3 06/34] arc: Implement the new page table range API

2023-02-28 Thread Matthew Wilcox (Oracle)
Add set_ptes(), update_mmu_cache_range(), flush_dcache_folio()
and flush_icache_pages().

Change the PG_dc_clean flag from being per-page to per-folio (which
means it cannot always be set as we don't know that all pages in this
folio were cleaned).  Enhance the internal flush routines to take the
number of pages to flush.

Signed-off-by: Matthew Wilcox (Oracle) 
Cc: Vineet Gupta 
Cc: linux-snps-arc@lists.infradead.org
---
 arch/arc/include/asm/cacheflush.h |  7 ++-
 arch/arc/include/asm/pgtable-bits-arcv2.h | 20 ++--
 arch/arc/mm/cache.c   | 61 ++-
 arch/arc/mm/tlb.c | 18 ---
 4 files changed, 68 insertions(+), 38 deletions(-)

diff --git a/arch/arc/include/asm/cacheflush.h b/arch/arc/include/asm/cacheflush.h
index e201b4b1655a..04f65f588510 100644
--- a/arch/arc/include/asm/cacheflush.h
+++ b/arch/arc/include/asm/cacheflush.h
@@ -25,17 +25,20 @@
  * in update_mmu_cache()
  */
 #define flush_icache_page(vma, page)
+#define flush_icache_pages(vma, page, nr)
 
 void flush_cache_all(void);
 
 void flush_icache_range(unsigned long kstart, unsigned long kend);
 void __sync_icache_dcache(phys_addr_t paddr, unsigned long vaddr, int len);
-void __inv_icache_page(phys_addr_t paddr, unsigned long vaddr);
-void __flush_dcache_page(phys_addr_t paddr, unsigned long vaddr);
+void __inv_icache_pages(phys_addr_t paddr, unsigned long vaddr, unsigned nr);
+void __flush_dcache_pages(phys_addr_t paddr, unsigned long vaddr, unsigned nr);
 
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 
 void flush_dcache_page(struct page *page);
+void flush_dcache_folio(struct folio *folio);
+#define flush_dcache_folio flush_dcache_folio
 
 void dma_cache_wback_inv(phys_addr_t start, unsigned long sz);
 void dma_cache_inv(phys_addr_t start, unsigned long sz);
diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h
index 6e9f8ca6d6a1..4a1b2ce204c6 100644
--- a/arch/arc/include/asm/pgtable-bits-arcv2.h
+++ b/arch/arc/include/asm/pgtable-bits-arcv2.h
@@ -100,14 +100,24 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
 }
 
-static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval)
+static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
+   pte_t *ptep, pte_t pte, unsigned int nr)
 {
-   set_pte(ptep, pteval);
+   for (;;) {
+   set_pte(ptep, pte);
+   if (--nr == 0)
+   break;
+   ptep++;
+   pte_val(pte) += PAGE_SIZE;
+   }
 }
+#define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1)
 
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
- pte_t *ptep);
+void update_mmu_cache_range(struct vm_area_struct *vma, unsigned long address,
+ pte_t *ptep, unsigned int nr);
+
+#define update_mmu_cache(vma, addr, ptep) \
+   update_mmu_cache_range(vma, addr, ptep, 1)
 
 /*
  * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that
diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index 55c6de138eae..3c16ee942a5c 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -752,17 +752,17 @@ static inline void arc_slc_enable(void)
  * There's a corollary case, where kernel READs from a userspace mapped page.
  * If the U-mapping is not congruent to K-mapping, former needs flushing.
  */
-void flush_dcache_page(struct page *page)
+void flush_dcache_folio(struct folio *folio)
 {
struct address_space *mapping;
 
if (!cache_is_vipt_aliasing()) {
-   clear_bit(PG_dc_clean, &page->flags);
+   clear_bit(PG_dc_clean, &folio->flags);
return;
}
 
/* don't handle anon pages here */
-   mapping = page_mapping_file(page);
+   mapping = folio_flush_mapping(folio);
if (!mapping)
return;
 
@@ -771,17 +771,27 @@ void flush_dcache_page(struct page *page)
 * Make a note that K-mapping is dirty
 */
if (!mapping_mapped(mapping)) {
-   clear_bit(PG_dc_clean, &page->flags);
-   } else if (page_mapcount(page)) {
-
+   clear_bit(PG_dc_clean, &folio->flags);
+   } else if (folio_mapped(folio)) {
/* kernel reading from page with U-mapping */
-   phys_addr_t paddr = (unsigned long)page_address(page);
-   unsigned long vaddr = page->index << PAGE_SHIFT;
+   phys_addr_t paddr = (unsigned long)folio_address(folio);
+   unsigned long vaddr = folio_pos(folio);
 
+   /*
+* vaddr is not actually the virtual address, but is
+* congruent to every user mapping.
+*/
if (addr_not_cache_congruent(paddr, vaddr))
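
For readers unfamiliar with the congruency test above: in a VIPT aliasing
cache, a kernel access at paddr and a user mapping at vaddr only share
cache lines when the two addresses have the same "colour" (the index bits
just above PAGE_SHIFT).  A hedged sketch using the CACHE_COLORS_NUM value
quoted elsewhere in this archive; the names here are illustrative, not the
arc implementation:

	#define SKETCH_COLORS_NUM	4
	#define SKETCH_COLOR_MASK	((SKETCH_COLORS_NUM - 1) << PAGE_SHIFT)

	static inline bool sketch_addr_not_congruent(unsigned long paddr,
						     unsigned long vaddr)
	{
		return (paddr & SKETCH_COLOR_MASK) !=
		       (vaddr & SKETCH_COLOR_MASK);
	}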

Re: [PATCH v2 06/30] arc: Implement the new page table range API

2023-02-28 Thread Matthew Wilcox
On Mon, Feb 27, 2023 at 10:34:05PM -0800, Vineet Gupta wrote:
> You need to split ARC and ARM into separate patches.

Ugh.  Looks like I inadvertently squashed them together during a rebase.

c228f5b4e007 HEAD@{121}: rebase (reword): arm64: Implement the new page table range API
22744c8ae873 HEAD@{122}: rebase (fixup): arc: Implement the new page table range API
11da1e48 HEAD@{123}: rebase (fixup): # This is a combination of 2 commits.
d68d7ab9b184 HEAD@{124}: rebase (start): checkout next-20230225

I was trying to fix up an arm commit and it looks like I squashed the arm
commit with the arc commit instead.  Will fix and resend.

> Also it'd be best to drop all the VIPT aliasing bits for ARC, they are a
> needless maintenance burden.
> I can send a patch which you could carry in your tree for easier logistics.

Works for me!  I don't mind if you want to drop the VIPT bits before
or after my changes; I can adapt to either.  Thanks



[PATCH v2 06/30] arc: Implement the new page table range API

2023-02-27 Thread Matthew Wilcox (Oracle)
Add set_ptes(), update_mmu_cache_range(), flush_dcache_folio()
and flush_icache_pages().

Change the PG_dc_clean flag from being per-page to per-folio (which
means it cannot always be set as we don't know that all pages in this
folio were cleaned).  Enhance the internal flush routines to take the
number of pages to flush.

Signed-off-by: Matthew Wilcox (Oracle) 
Cc: Vineet Gupta 
Cc: linux-snps-arc@lists.infradead.org
---
 arch/arc/include/asm/cacheflush.h |  7 +-
 arch/arc/include/asm/pgtable-bits-arcv2.h | 20 +++--
 arch/arc/mm/cache.c   | 61 --
 arch/arc/mm/tlb.c | 18 +++--
 arch/arm/include/asm/cacheflush.h | 24 +++---
 arch/arm/include/asm/pgtable.h|  5 +-
 arch/arm/include/asm/tlbflush.h   | 13 +--
 arch/arm/mm/copypage-v4mc.c   |  5 +-
 arch/arm/mm/copypage-v6.c |  5 +-
 arch/arm/mm/copypage-xscale.c |  5 +-
 arch/arm/mm/dma-mapping.c | 24 +++---
 arch/arm/mm/fault-armv.c  | 14 ++--
 arch/arm/mm/flush.c   | 99 ++-
 arch/arm/mm/mm.h  |  2 +-
 arch/arm/mm/mmu.c | 14 +++-
 15 files changed, 193 insertions(+), 123 deletions(-)

diff --git a/arch/arc/include/asm/cacheflush.h b/arch/arc/include/asm/cacheflush.h
index e201b4b1655a..04f65f588510 100644
--- a/arch/arc/include/asm/cacheflush.h
+++ b/arch/arc/include/asm/cacheflush.h
@@ -25,17 +25,20 @@
  * in update_mmu_cache()
  */
 #define flush_icache_page(vma, page)
+#define flush_icache_pages(vma, page, nr)
 
 void flush_cache_all(void);
 
 void flush_icache_range(unsigned long kstart, unsigned long kend);
 void __sync_icache_dcache(phys_addr_t paddr, unsigned long vaddr, int len);
-void __inv_icache_page(phys_addr_t paddr, unsigned long vaddr);
-void __flush_dcache_page(phys_addr_t paddr, unsigned long vaddr);
+void __inv_icache_pages(phys_addr_t paddr, unsigned long vaddr, unsigned nr);
+void __flush_dcache_pages(phys_addr_t paddr, unsigned long vaddr, unsigned nr);
 
 #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
 
 void flush_dcache_page(struct page *page);
+void flush_dcache_folio(struct folio *folio);
+#define flush_dcache_folio flush_dcache_folio
 
 void dma_cache_wback_inv(phys_addr_t start, unsigned long sz);
 void dma_cache_inv(phys_addr_t start, unsigned long sz);
diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h
index 6e9f8ca6d6a1..4a1b2ce204c6 100644
--- a/arch/arc/include/asm/pgtable-bits-arcv2.h
+++ b/arch/arc/include/asm/pgtable-bits-arcv2.h
@@ -100,14 +100,24 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot));
 }
 
-static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep, pte_t pteval)
+static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
+   pte_t *ptep, pte_t pte, unsigned int nr)
 {
-   set_pte(ptep, pteval);
+   for (;;) {
+   set_pte(ptep, pte);
+   if (--nr == 0)
+   break;
+   ptep++;
+   pte_val(pte) += PAGE_SIZE;
+   }
 }
+#define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1)
 
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
- pte_t *ptep);
+void update_mmu_cache_range(struct vm_area_struct *vma, unsigned long address,
+ pte_t *ptep, unsigned int nr);
+
+#define update_mmu_cache(vma, addr, ptep) \
+   update_mmu_cache_range(vma, addr, ptep, 1)
 
 /*
  * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that
diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index 55c6de138eae..3c16ee942a5c 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -752,17 +752,17 @@ static inline void arc_slc_enable(void)
  * There's a corollary case, where kernel READs from a userspace mapped page.
  * If the U-mapping is not congruent to K-mapping, former needs flushing.
  */
-void flush_dcache_page(struct page *page)
+void flush_dcache_folio(struct folio *folio)
 {
struct address_space *mapping;
 
if (!cache_is_vipt_aliasing()) {
-   clear_bit(PG_dc_clean, &page->flags);
+   clear_bit(PG_dc_clean, &folio->flags);
return;
}
 
/* don't handle anon pages here */
-   mapping = page_mapping_file(page);
+   mapping = folio_flush_mapping(folio);
if (!mapping)
return;
 
@@ -771,17 +771,27 @@ void flush_dcache_page(struct page *page)
 * Make a note that K-mapping is dirty
 */
if (!mapping_mapped(mapping)) {
-   clear_bit(PG_dc_clean, &page->flags);
-   } else if (page_mapcount(page)) {
-
+   clear_bit(PG_dc_clean, &folio->flags);
+   } else if (folio_mapped(folio)) {

Re: How many colours does the ARC cache have?

2023-02-18 Thread Matthew Wilcox
On Wed, Feb 15, 2023 at 08:59:18PM -0800, Vineet Gupta wrote:
> On 2/10/23 09:06, Matthew Wilcox wrote:
> > I see a discrepancy here ...
> > 
> > arch/arc/include/asm/shmparam.h:
> > /* Handle upto 2 cache bins */
> > #define SHMLBA  (2 * PAGE_SIZE)
> > 
> > arch/arc/include/asm/cacheflush.h:
> > #define CACHE_COLORS_NUM	4
> 
> The initial aliasing dcache support assumed 2 colors but was later bumped to
> 4, w/o making the adjustment in shmparam.h

OK.  Will you fix it yourself, or do you want me to send a patch?

> > (there are some other problems with the arc cache flushing code;
> 
> The VIPT aliasing config (which is pretty much dead and unused) or regular
> parts ?

I'm not sure.  This is incorrect in flush_dcache_page():

} else if (page_mapcount(page)) {
[...]
unsigned long vaddr = page->index << PAGE_SHIFT;

If the page being flushed is a tail page, then page->index is not
valid, so you're essentially flushing a random address.  I have a
fix for it that I sent out earlier this week.
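
To make the failure concrete: only a head page's ->index is meaningful, so
deriving the flush address from the folio fixes it.  A hedged sketch using
the mainline folio helpers (the function name is invented here):

	static unsigned long sketch_flush_vaddr(struct page *page)
	{
		struct folio *folio = page_folio(page);

		/* page->index << PAGE_SHIFT is garbage on a tail page */
		return folio_pos(folio);	/* file offset of the folio */
	}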

> > I'm working on patches to address them, but those are things I understand a
> > little better.  I know nothing about the ARC architecture itself)
> 
> Legacy ARC700 cpus had VIPT D$. The cache size was configurable by Soc
> builder and the specific geometry could yield an aliasing configuration
> (e.g. standard page size 8K, 4 way set associative D$: so D$ > 32K were
> aliasing and needed CONFIG_ARC_CACHE_VIPT_ALIASING). Although there was
> only ever one customer who taped out an aliasing cache config.
> 
> The newer ARC HS cores have PIPT D$ and thus don't need the aliasing
> support.
> 
> FWIW we could rip out all the VIPT aliasing code as I don't think it is
> needed anymore. @Alexey can you confirm ?
> 
> -Vineet



How many colours does the ARC cache have?

2023-02-10 Thread Matthew Wilcox
I see a discrepancy here ...

arch/arc/include/asm/shmparam.h:
/* Handle upto 2 cache bins */
#define SHMLBA  (2 * PAGE_SIZE)

arch/arc/include/asm/cacheflush.h:
#define CACHE_COLORS_NUM	4


(there are some other problems with the arc cache flushing code; I'm
working on patches to address them, but those are things I understand a
little better.  I know nothing about the ARC architecture itself)



Re: [PATCH 1/2] mm: Fix struct page layout on 32-bit systems

2021-04-20 Thread Matthew Wilcox
On Tue, Apr 20, 2021 at 09:39:54AM +0200, Geert Uytterhoeven wrote:
> > +++ b/include/linux/mm_types.h
> > @@ -97,10 +97,10 @@ struct page {
> > };
> > struct {/* page_pool used by netstack */
> > /**
> > -* @dma_addr: might require a 64-bit value even on
> > +* @dma_addr: might require a 64-bit value on
> >  * 32-bit architectures.
> >  */
> > -   dma_addr_t dma_addr;
> > +   unsigned long dma_addr[2];
> 
> So we get two 64-bit words on 64-bit platforms, while only one is
> needed?

Not really.  This is part of the 5-word union in struct page, so the space
ends up being reserved anyway, even if it's not "assigned" to dma_addr.

> > +   dma_addr_t ret = page->dma_addr[0];
> > +   if (sizeof(dma_addr_t) > sizeof(unsigned long))
> > +   ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16;
> 
> We don't seem to have a handy macro for a 32-bit left shift yet...
> 
> But you can also avoid the warning using
> 
> ret |= (u64)page->dma_addr[1] << 32;

Sure.  It doesn't really matter which way we eliminate the warning;
the code is unreachable.

> > +{
> > +   page->dma_addr[0] = addr;
> > +   if (sizeof(dma_addr_t) > sizeof(unsigned long))
> > +   page->dma_addr[1] = addr >> 16 >> 16;
> 
> ... but we do have upper_32_bits() for a 32-bit right shift.

Yep, that's what my current tree looks like.
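
To spell out the warning being dodged: when dma_addr_t is only 32 bits, the
64-bit branch is dead code but still compiled, and a ">> 32" on a 32-bit
type draws a shift-count-overflow warning (and would be undefined behaviour
if ever executed).  A hedged userspace sketch, assuming a 32-bit dma_addr_t
build:

	#include <stdint.h>

	typedef uint32_t dma_addr_t;	/* assumption: 32-bit DMA build */

	uint32_t hi_word(dma_addr_t addr)
	{
		/* each step shifts by < 32: no warning, no UB */
		return (uint32_t)(addr >> 16 >> 16);	/* always 0 here */
	}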



Re: [PATCH 1/2] mm: Fix struct page layout on 32-bit systems

2021-04-19 Thread Matthew Wilcox
On Tue, Apr 20, 2021 at 02:48:17AM +, Vineet Gupta wrote:
> > 32-bit architectures which expect 8-byte alignment for 8-byte integers
> > and need 64-bit DMA addresses (arc, arm, mips, ppc) had their struct
> > page inadvertently expanded in 2019.
> 
> FWIW, ARC doesn't require 8 byte alignment for 8 byte integers. This is 
> only needed for 8-byte atomics due to the requirements of LLOCKD/SCOND 
> instructions.

Ah, like x86?  OK, great, I'll drop your arch from the list of
affected.  Thanks!



Re: [PATCH 1/2] mm: Fix struct page layout on 32-bit systems

2021-04-17 Thread Matthew Wilcox
On Sat, Apr 17, 2021 at 09:18:57PM +, David Laight wrote:
> Ugly as well.

Thank you for expressing your opinion.  Again.



Re: [PATCH 2/2] mm: Indicate pfmemalloc pages in compound_head

2021-04-17 Thread Matthew Wilcox
On Sat, Apr 17, 2021 at 09:13:45PM +, David Laight wrote:
> > struct {/* page_pool used by netstack */
> > -   /**
> > -* @dma_addr: might require a 64-bit value on
> > -* 32-bit architectures.
> > -*/
> > +   unsigned long pp_magic;
> > +   unsigned long xmi;
> > +   unsigned long _pp_mapping_pad;
> > unsigned long dma_addr[2];
> > };
> 
> You've deleted the comment.

Yes.  It no longer added any value.  You can see dma_addr now occupies
two words.

> I also think there should be a comment that dma_addr[0]
> must be aliased to ->index.

That's not a requirement.  Moving the pfmemalloc indicator is a
requirement so that we _can_ use index, but there's no requirement about
how index is used.



Re: [PATCH 1/2] mm: Fix struct page layout on 32-bit systems

2021-04-16 Thread Matthew Wilcox


Replacement patch to fix compiler warning.

From: "Matthew Wilcox (Oracle)" 
Date: Fri, 16 Apr 2021 16:34:55 -0400
Subject: [PATCH 1/2] mm: Fix struct page layout on 32-bit systems
To: bro...@redhat.com
Cc: linux-ker...@vger.kernel.org,
linux...@kvack.org,
net...@vger.kernel.org,
linuxppc-...@lists.ozlabs.org,
linux-arm-ker...@lists.infradead.org,
linux-m...@vger.kernel.org,
ilias.apalodi...@linaro.org,
mcr...@linux.microsoft.com,
grygorii.stras...@ti.com,
a...@kernel.org,
h...@lst.de,
linux-snps-arc@lists.infradead.org,
mho...@kernel.org,
mgor...@suse.de

32-bit architectures which expect 8-byte alignment for 8-byte integers
and need 64-bit DMA addresses (arc, arm, mips, ppc) had their struct
page inadvertently expanded in 2019.  When the dma_addr_t was added,
it forced the alignment of the union to 8 bytes, which inserted a 4 byte
gap between 'flags' and the union.

Fix this by storing the dma_addr_t in one or two adjacent unsigned longs.
This restores the alignment to that of an unsigned long, and also fixes a
potential problem where (on a big endian platform), the bit used to denote
PageTail could inadvertently get set, and a racing get_user_pages_fast()
could dereference a bogus compound_head().

Fixes: c25fff7171be ("mm: add dma_addr_t to struct page")
Signed-off-by: Matthew Wilcox (Oracle) 
---
 include/linux/mm_types.h |  4 ++--
 include/net/page_pool.h  | 12 +++-
 net/core/page_pool.c | 12 +++-
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6613b26a8894..5aacc1c10a45 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -97,10 +97,10 @@ struct page {
};
struct {/* page_pool used by netstack */
/**
-* @dma_addr: might require a 64-bit value even on
+* @dma_addr: might require a 64-bit value on
 * 32-bit architectures.
 */
-   dma_addr_t dma_addr;
+   unsigned long dma_addr[2];
};
struct {/* slab, slob and slub */
union {
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index b5b195305346..ad6154dc206c 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -198,7 +198,17 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 
 static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
 {
-   return page->dma_addr;
+   dma_addr_t ret = page->dma_addr[0];
+   if (sizeof(dma_addr_t) > sizeof(unsigned long))
+   ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16;
+   return ret;
+}
+
+static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
+{
+   page->dma_addr[0] = addr;
+   if (sizeof(dma_addr_t) > sizeof(unsigned long))
+   page->dma_addr[1] = addr >> 16 >> 16;
 }
 
 static inline bool is_page_pool_compiled_in(void)
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index ad8b0707af04..f014fd8c19a6 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -174,8 +174,10 @@ static void page_pool_dma_sync_for_device(struct page_pool *pool,
  struct page *page,
  unsigned int dma_sync_size)
 {
+   dma_addr_t dma_addr = page_pool_get_dma_addr(page);
+
dma_sync_size = min(dma_sync_size, pool->p.max_len);
-   dma_sync_single_range_for_device(pool->p.dev, page->dma_addr,
+   dma_sync_single_range_for_device(pool->p.dev, dma_addr,
 pool->p.offset, dma_sync_size,
 pool->p.dma_dir);
 }
@@ -226,7 +228,7 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
put_page(page);
return NULL;
}
-   page->dma_addr = dma;
+   page_pool_set_dma_addr(page, dma);
 
if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
@@ -294,13 +296,13 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
 */
goto skip_dma_unmap;
 
-   dma = page->dma_addr;
+   dma = page_pool_get_dma_addr(page);
 
-   /* When page is unmapped, it cannot be returned our pool */
+   /* When page is unmapped, it cannot be returned to our pool */
dma_unmap_page_attrs(pool->p.dev, dma,
 PAGE_SIZE << pool->p.order, pool->p.dma_dir,
 DMA_ATTR_SKIP_CPU_SYNC);
-   page->dma_addr = 0;
+   page_pool_set_dma_addr(page, 0);
skip_dma_unmap:
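
The alignment bug the patch fixes is easy to reproduce in userspace.  A
hedged, standalone sketch (struct names invented here) showing the 4-byte
hole on a 32-bit ABI that aligns 64-bit integers to 8 bytes:

	#include <stdint.h>
	#include <stdio.h>

	struct page_before {			/* pre-fix layout */
		unsigned long flags;		/* 4 bytes on 32-bit */
		union {
			uint64_t dma_addr;	/* drags union to 8-byte align */
		};
	};

	struct page_after {			/* post-fix layout */
		unsigned long flags;
		union {
			unsigned long dma_addr[2];	/* stays long-aligned */
		};
	};

	int main(void)
	{
		/* on the affected 32-bit ABIs this prints "16 12" */
		printf("%zu %zu\n", sizeof(struct page_before),
		       sizeof(struct page_after));
		return 0;
	}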

[PATCH 2/2] mm: Indicate pfmemalloc pages in compound_head

2021-04-16 Thread Matthew Wilcox (Oracle)
The net page_pool wants to use a magic value to identify page pool pages.
The best place to put it is in the first word where it can be clearly a
non-pointer value.  That means shifting dma_addr up to alias with ->index,
which means we need to find another way to indicate page_is_pfmemalloc().
Since page_pool doesn't want to set its magic value on pages which are
pfmemalloc, we can use bit 1 of compound_head to indicate that the page
came from the memory reserves.

Signed-off-by: Matthew Wilcox (Oracle) 
---
 include/linux/mm.h   | 12 +++-
 include/linux/mm_types.h |  7 +++
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8ba434287387..44eab3f6d5ae 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1629,10 +1629,12 @@ struct address_space *page_mapping_file(struct page *page);
 static inline bool page_is_pfmemalloc(const struct page *page)
 {
/*
-* Page index cannot be this large so this must be
-* a pfmemalloc page.
+* This is not a tail page; compound_head of a head page is unused
+* at return from the page allocator, and will be overwritten
+* by callers who do not care whether the page came from the
+* reserves.
 */
-   return page->index == -1UL;
+   return page->compound_head & 2;
 }
 
 /*
@@ -1641,12 +1643,12 @@ static inline bool page_is_pfmemalloc(const struct page *page)
  */
 static inline void set_page_pfmemalloc(struct page *page)
 {
-   page->index = -1UL;
+   page->compound_head = 2;
 }
 
 static inline void clear_page_pfmemalloc(struct page *page)
 {
-   page->index = 0;
+   page->compound_head = 0;
 }
 
 /*
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5aacc1c10a45..39f7163dcace 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -96,10 +96,9 @@ struct page {
unsigned long private;
};
struct {/* page_pool used by netstack */
-   /**
-* @dma_addr: might require a 64-bit value on
-* 32-bit architectures.
-*/
+   unsigned long pp_magic;
+   unsigned long xmi;
+   unsigned long _pp_mapping_pad;
unsigned long dma_addr[2];
};
struct {/* slab, slob and slub */
-- 
2.30.2
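
Why bit 1 is free to carry this: bit 0 of compound_head marks a tail page,
and on a tail page the rest of the word is a pointer to the head page with
at least 4-byte alignment, so bit 1 never carries pointer information.  A
hedged sketch of the resulting encoding (names invented here):

	static inline bool sketch_page_is_tail(unsigned long compound_head)
	{
		return compound_head & 1;	/* low bit = tail marker */
	}

	static inline bool sketch_page_is_pfmemalloc(unsigned long compound_head)
	{
		return compound_head & 2;	/* bit 1 = pfmemalloc, head pages only */
	}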




[PATCH 1/2] mm: Fix struct page layout on 32-bit systems

2021-04-16 Thread Matthew Wilcox (Oracle)
32-bit architectures which expect 8-byte alignment for 8-byte integers
and need 64-bit DMA addresses (arc, arm, mips, ppc) had their struct
page inadvertently expanded in 2019.  When the dma_addr_t was added,
it forced the alignment of the union to 8 bytes, which inserted a 4 byte
gap between 'flags' and the union.

Fix this by storing the dma_addr_t in one or two adjacent unsigned longs.
This restores the alignment to that of an unsigned long, and also fixes a
potential problem where (on a big endian platform), the bit used to denote
PageTail could inadvertently get set, and a racing get_user_pages_fast()
could dereference a bogus compound_head().

Fixes: c25fff7171be ("mm: add dma_addr_t to struct page")
Signed-off-by: Matthew Wilcox (Oracle) 
---
 include/linux/mm_types.h |  4 ++--
 include/net/page_pool.h  | 12 +++-
 net/core/page_pool.c | 12 +++-
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6613b26a8894..5aacc1c10a45 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -97,10 +97,10 @@ struct page {
};
struct {/* page_pool used by netstack */
/**
-* @dma_addr: might require a 64-bit value even on
+* @dma_addr: might require a 64-bit value on
 * 32-bit architectures.
 */
-   dma_addr_t dma_addr;
+   unsigned long dma_addr[2];
};
struct {/* slab, slob and slub */
union {
diff --git a/include/net/page_pool.h b/include/net/page_pool.h
index b5b195305346..db7c7020746a 100644
--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -198,7 +198,17 @@ static inline void page_pool_recycle_direct(struct page_pool *pool,
 
 static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
 {
-   return page->dma_addr;
+   dma_addr_t ret = page->dma_addr[0];
+   if (sizeof(dma_addr_t) > sizeof(unsigned long))
+   ret |= (dma_addr_t)page->dma_addr[1] << 32;
+   return ret;
+}
+
+static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
+{
+   page->dma_addr[0] = addr;
+   if (sizeof(dma_addr_t) > sizeof(unsigned long))
+   page->dma_addr[1] = addr >> 32;
 }
 
 static inline bool is_page_pool_compiled_in(void)
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index ad8b0707af04..f014fd8c19a6 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -174,8 +174,10 @@ static void page_pool_dma_sync_for_device(struct page_pool *pool,
  struct page *page,
  unsigned int dma_sync_size)
 {
+   dma_addr_t dma_addr = page_pool_get_dma_addr(page);
+
dma_sync_size = min(dma_sync_size, pool->p.max_len);
-   dma_sync_single_range_for_device(pool->p.dev, page->dma_addr,
+   dma_sync_single_range_for_device(pool->p.dev, dma_addr,
 pool->p.offset, dma_sync_size,
 pool->p.dma_dir);
 }
@@ -226,7 +228,7 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
put_page(page);
return NULL;
}
-   page->dma_addr = dma;
+   page_pool_set_dma_addr(page, dma);
 
if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
@@ -294,13 +296,13 @@ void page_pool_release_page(struct page_pool *pool, struct page *page)
 */
goto skip_dma_unmap;
 
-   dma = page->dma_addr;
+   dma = page_pool_get_dma_addr(page);
 
-   /* When page is unmapped, it cannot be returned our pool */
+   /* When page is unmapped, it cannot be returned to our pool */
dma_unmap_page_attrs(pool->p.dev, dma,
 PAGE_SIZE << pool->p.order, pool->p.dma_dir,
 DMA_ATTR_SKIP_CPU_SYNC);
-   page->dma_addr = 0;
+   page_pool_set_dma_addr(page, 0);
 skip_dma_unmap:
/* This may be the last page returned, releasing the pool, so
 * it is not safe to reference pool afterwards.
-- 
2.30.2




[PATCH 0/2] Change struct page layout for page_pool

2021-04-16 Thread Matthew Wilcox (Oracle)
The first patch here fixes two bugs on ppc32 and mips32.  It fixes one
bug on arc and arm32 (in certain configurations).  It probably makes
sense to get it in ASAP through the networking tree.  I'd like to see
testing on those four architectures if possible?

The second patch enables new functionality.  It is much less urgent.
I'd really like to see Mel & Michal's thoughts on it.

I have only compile-tested these patches.

Matthew Wilcox (Oracle) (2):
  mm: Fix struct page layout on 32-bit systems
  mm: Indicate pfmemalloc pages in compound_head

 include/linux/mm.h   | 12 +++-
 include/linux/mm_types.h |  9 -
 include/net/page_pool.h  | 12 +++-
 net/core/page_pool.c | 12 +++-
 4 files changed, 29 insertions(+), 16 deletions(-)

-- 
2.30.2




Re: [PATCH v2] fs/dax: include to fix build error on ARC

2021-01-04 Thread Matthew Wilcox
On Mon, Jan 04, 2021 at 12:13:02PM -0800, Dan Williams wrote:
> On Thu, Dec 31, 2020 at 8:29 PM Randy Dunlap  wrote:
> > +++ lnx-511-rc1/fs/dax.c
> > @@ -25,6 +25,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> 
> I would expect this to come from one of the linux/ includes like
> linux/mm.h. asm/ headers are implementation linux/ headers are api.

It does indeed come from linux/mm.h already.  And a number of
other random places, including linux/serial.h.  Our headers are a mess,
but they shouldn't be made worse by adding _this_ include.  So I
second Dan's objection here.

> Once you drop that then the subject of this patch can just be "arc:
> add a copy_user_page() implementation", and handled by the arc
> maintainer (or I can take it with Vineet's ack).
> 
> >  #include 
> 
> Yes, this one should have a linux/ api header to front it, but that's
> a cleanup for another day.

Definitely more involved.




Re: [patch V2 00/18] mm/highmem: Preemptible variant of kmap_atomic & friends

2020-10-30 Thread Matthew Wilcox
On Thu, Oct 29, 2020 at 11:18:06PM +0100, Thomas Gleixner wrote:
> This series provides kmap_local.* iomap_local variants which only disable
> migration to keep the virtual mapping address stable accross preemption,
> but do neither disable pagefaults nor preemption. The new functions can be
> used in any context, but if used in atomic context the caller has to take
> care of eventually disabling pagefaults.

Could I ask for a CONFIG_KMAP_DEBUG which aliases all the kmap variants
to vmap()?  I think we currently have a problem in iov_iter on HIGHMEM
configs:

copy_page_to_iter() calls page_copy_sane() which checks:

head = compound_head(page);
if (likely(n <= v && v <= page_size(head)))
return true;

but then:

void *kaddr = kmap_atomic(page);
size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
kunmap_atomic(kaddr);

so if offset to offset+bytes is larger than PAGE_SIZE, this is going to
work for lowmem pages and fail miserably for highmem pages.  I suggest
vmap() because vmap has a PAGE_SIZE gap between each allocation.

Alternatively if we could have a kmap_atomic_compound(), that would
be awesome, but probably not realistic to implement.  I've more
or less resigned myself to having to map things one page at a time.
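
The "one page at a time" approach would look roughly like this -- a hedged
sketch (the helper name is invented; kmap_atomic()/copy_to_iter() are the
real interfaces) that never maps more than one subpage, so it stays correct
on HIGHMEM:

	static size_t sketch_copy_pages_to_iter(struct page *page, size_t offset,
						size_t bytes, struct iov_iter *i)
	{
		size_t copied = 0;

		page += offset / PAGE_SIZE;
		offset %= PAGE_SIZE;
		while (bytes) {
			size_t n = min_t(size_t, bytes, PAGE_SIZE - offset);
			void *kaddr = kmap_atomic(page);
			size_t done = copy_to_iter(kaddr + offset, n, i);

			kunmap_atomic(kaddr);
			copied += done;
			if (done < n)		/* iterator ran dry */
				break;
			bytes -= n;
			offset = 0;
			page++;
		}
		return copied;
	}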



Re: [patch RFC 00/15] mm/highmem: Provide a preemptible variant of kmap_atomic & friends

2020-09-19 Thread Matthew Wilcox
On Sat, Sep 19, 2020 at 10:18:54AM -0700, Linus Torvalds wrote:
> On Sat, Sep 19, 2020 at 2:50 AM Thomas Gleixner  wrote:
> >
> > this provides a preemptible variant of kmap_atomic & related
> > interfaces. This is achieved by:
> 
> Ack. This looks really nice, even apart from the new capability.
> 
> The only thing I really reacted to is that the name doesn't make sense
> to me: "kmap_temporary()" seems a bit odd.
> 
> Particularly for an interface that really is basically meant as a
> better replacement of "kmap_atomic()" (but is perhaps also a better
> replacement for "kmap()").
> 
> I think I understand how the name came about: I think the "temporary"
> is there as a distinction from the "longterm" regular kmap(). So I
> think it makes some sense from an internal implementation angle, but I
> don't think it makes a lot of sense from an interface name.
> 
> I don't know what might be a better name, but if we want to emphasize
> that it's thread-private and a one-off, maybe "local" would be a
> better naming, and make it distinct from the "global" nature of the
> old kmap() interface?
> 
> However, another solution might be to just use this new preemptible
> "local" kmap(), and remove the old global one entirely. Yes, the old
> global one caches the page table mapping and that sounds really
> efficient and nice. But it's actually horribly horribly bad, because
> it means that we need to use locking for them. Your new "temporary"
> implementation seems to be fundamentally better locking-wise, and only
> need preemption disabling as locking (and is equally fast for the
> non-highmem case).
> 
> So I wonder if the single-page TLB flush isn't a better model, and
> whether it wouldn't be a lot simpler to just get rid of the old
> complex kmap() entirely, and replace it with this?
> 
> I agree we can't replace the kmap_atomic() version, because maybe
> people depend on the preemption disabling it also implied. But what
> about replacing the non-atomic kmap()?

My concern with that is people might use kmap() and then pass the address
to a different task.  So we need to audit the current users of kmap()
and convert any that do that into using vmap() instead.

I like kmap_local().  Or kmap_thread().
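
The audit criterion above can be illustrated with the usage pattern the new
interface permits -- a hedged sketch using the kmap_local_page() name that
was eventually merged:

	static void sum_page_bytes(struct page *page, u64 *sum)
	{
		unsigned char *p = kmap_local_page(page);	/* thread-local */
		size_t i;

		for (i = 0; i < PAGE_SIZE; i++)
			*sum += p[i];
		kunmap_local(p);	/* never hand 'p' to another task */
	}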



Flushing transparent hugepages

2020-08-18 Thread Matthew Wilcox
If your arch does not support HAVE_ARCH_TRANSPARENT_HUGEPAGE, you can
stop reading now.  Although maybe you're curious about adding support.

$ git grep -w HAVE_ARCH_TRANSPARENT_HUGEPAGE arch
arch/Kconfig:config HAVE_ARCH_TRANSPARENT_HUGEPAGE
arch/arc/Kconfig:config HAVE_ARCH_TRANSPARENT_HUGEPAGE
arch/arm/Kconfig:config HAVE_ARCH_TRANSPARENT_HUGEPAGE
arch/arm64/Kconfig: select HAVE_ARCH_TRANSPARENT_HUGEPAGE
arch/mips/Kconfig:  select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES
arch/powerpc/platforms/Kconfig.cputype: select HAVE_ARCH_TRANSPARENT_HUGEPAGE
arch/s390/Kconfig:  select HAVE_ARCH_TRANSPARENT_HUGEPAGE
arch/sparc/Kconfig: select HAVE_ARCH_TRANSPARENT_HUGEPAGE
arch/x86/Kconfig:   select HAVE_ARCH_TRANSPARENT_HUGEPAGE

If your arch does not implement flush_dcache_page(), you can also
stop reading.

$ for i in arc arm arm64 mips powerpc s390 sparc x86; do git grep -l flush_dcache_page arch/$i/include; done
arch/arc/include/asm/cacheflush.h
arch/arm/include/asm/cacheflush.h
arch/arm64/include/asm/cacheflush.h
arch/mips/include/asm/cacheflush.h
arch/powerpc/include/asm/cacheflush.h
arch/sparc/include/asm/cacheflush_32.h
arch/sparc/include/asm/cacheflush_64.h
arch/sparc/include/asm/pgtable_64.h

OK, so we're down to arc, arm, arm64, mips, powerpc & sparc.  Hi!  ;-)

I'm working on adding THP support for filesystems with storage backing
and part of that is expanding the definition of THP to be any order
(ie any power of two of PAGE_SIZE).  Now, shmem already has some calls
to flush_dcache_page() for THPs, for example:

if (sgp != SGP_WRITE && !PageUptodate(page)) {
struct page *head = compound_head(page);
int i;

for (i = 0; i < compound_nr(head); i++) {
clear_highpage(head + i);
flush_dcache_page(head + i);
}
SetPageUptodate(head);
}

where you'll be called once for each subpage.  But ... these are error
paths, and I'm sure you all diligently test cache coherency scenarios
of error paths in shmem ... right?

For example, arm64 seems confused in this scenario:

void flush_dcache_page(struct page *page)
{
if (test_bit(PG_dcache_clean, &page->flags))
clear_bit(PG_dcache_clean, &page->flags);
}

...

void __sync_icache_dcache(pte_t pte)
{
struct page *page = pte_page(pte);

if (!test_and_set_bit(PG_dcache_clean, &page->flags))
sync_icache_aliases(page_address(page), page_size(page));
}

So arm64 keeps track on a per-page basis which ones have been flushed.
page_size() will return PAGE_SIZE if called on a tail page or regular
page, but will return PAGE_SIZE << compound_order if called on a head
page.  So this will either over-flush, or it's missing the opportunity
to clear the bits on all the subpages which have now been flushed.
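
One way to keep the per-page dirty tracking honest is to flush compound
pages a subpage at a time.  A hedged sketch (not arm64's actual code):

	static void sketch_flush_dcache_compound(struct page *head)
	{
		unsigned int i;

		/* clears/honours PG_dcache_clean on each subpage individually */
		for (i = 0; i < compound_nr(head); i++)
			flush_dcache_page(head + i);
	}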

PowerPC has special handling of hugetlbfs pages.  Well, that's what
the config option says, but actually it handles THP as well.  If
the config option is enabled.

#ifdef CONFIG_HUGETLB_PAGE
if (PageCompound(page)) {
flush_dcache_icache_hugepage(page);
return;
}
#endif

By the way, THPs can be mapped askew -- that is, at an offset which
means you can't use a PMD to map a PMD sized page.

Anyway, we don't really have consensus between the various architectures
on how to handle either THPs or hugetlb pages.  It's not contemplated
in Documentation/core-api/cachetlb.rst so there's no real surprise
we've diverged.

What would you _like_ to see?  Would you rather flush_dcache_page()
were called once for each subpage, or would you rather maintain
the page-needs-flushing state once per compound page?  We could also
introduce flush_dcache_thp() if some architectures would prefer it one
way and one the other, although that brings into question what to do
for hugetlbfs pages.

It might not be a bad idea to centralise the handling of all this stuff
somewhere.  Sounds like the kind of thing Arnd would like to do ;-) I'll
settle for getting enough clear feedback about what the various arch
maintainers want that I can write a documentation update for cachetlb.rst.



Re: [PATCH 4/8] asm-generic: pgalloc: provide generic pmd_alloc_one() and pmd_free_one()

2020-06-27 Thread Matthew Wilcox
On Sat, Jun 27, 2020 at 05:34:49PM +0300, Mike Rapoport wrote:
> More elaborate versions on arm64 and x86 account memory for the user page
> tables and call to pgtable_pmd_page_ctor() as the part of PMD page
> initialization.
> 
> Move the arm64 version to include/asm-generic/pgalloc.h and use the generic
> version on several architectures.
> 
> The pgtable_pmd_page_ctor() is a NOP when ARCH_ENABLE_SPLIT_PMD_PTLOCK is
> not enabled, so there is no functional change for most architectures except
> of the addition of __GFP_ACCOUNT for allocation of user page tables.

Thanks for including this line; it reminded me that we're not setting
the PageTable flag on the page, nor accounting it to the zone page stats.
Hope you don't mind me tagging a patch to do that on as 9/8.

We could also do with a pud_page_[cd]tor and maybe even p4d/pgd versions.
But that brings me to the next question -- could/should some of this
be moved over to asm-generic/pgalloc.h?  The ctor/dtor aren't called
from anywhere else, and there's value to reducing the total amount of
code in mm.h, but then there's also value to keeping all the ifdef
ARCH_ENABLE_SPLIT_PMD_PTLOCK code together too.  So I'm a bit torn.
What do you think?
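
For the pud ctor/dtor idea floated above, a purely hypothetical sketch
mirroring the PMD accounting from the 9/8 patch below -- these helpers do
not exist in mainline; names and placement are invented here:

	static inline bool pgtable_pud_page_ctor(struct page *page)
	{
		__SetPageTable(page);
		inc_zone_page_state(page, NR_PAGETABLE);
		return true;	/* no split ptlock at PUD level yet */
	}

	static inline void pgtable_pud_page_dtor(struct page *page)
	{
		__ClearPageTable(page);
		dec_zone_page_state(page, NR_PAGETABLE);
	}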



[PATCH 9/8] mm: Account PMD tables like PTE tables

2020-06-27 Thread Matthew Wilcox
We account the PTE level of the page tables to the process in order to
make smarter OOM decisions and help diagnose why memory is fragmented.
For these same reasons, we should account pages allocated for PMDs.
With larger process address spaces and ASLR, the number of PMDs in use
is higher than it used to be so the inaccuracy is starting to matter.

Signed-off-by: Matthew Wilcox (Oracle) 
---
 include/linux/mm.h | 24 
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index dc7b87310c10..b283e25fcffa 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2271,7 +2271,7 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
return ptlock_ptr(pmd_to_page(pmd));
 }
 
-static inline bool pgtable_pmd_page_ctor(struct page *page)
+static inline bool pmd_ptlock_init(struct page *page)
 {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
page->pmd_huge_pte = NULL;
@@ -2279,7 +2279,7 @@ static inline bool pgtable_pmd_page_ctor(struct page *page)
return ptlock_init(page);
 }
 
-static inline void pgtable_pmd_page_dtor(struct page *page)
+static inline void pmd_ptlock_free(struct page *page)
 {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
VM_BUG_ON_PAGE(page->pmd_huge_pte, page);
@@ -2296,8 +2296,8 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
return &mm->page_table_lock;
 }
 
-static inline bool pgtable_pmd_page_ctor(struct page *page) { return true; }
-static inline void pgtable_pmd_page_dtor(struct page *page) {}
+static inline bool pmd_ptlock_init(struct page *page) { return true; }
+static inline void pmd_ptlock_free(struct page *page) {}
 
 #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)
 
@@ -2310,6 +2310,22 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
return ptl;
 }
 
+static inline bool pgtable_pmd_page_ctor(struct page *page)
+{
+   if (!pmd_ptlock_init(page))
+   return false;
+   __SetPageTable(page);
+   inc_zone_page_state(page, NR_PAGETABLE);
+   return true;
+}
+
+static inline void pgtable_pmd_page_dtor(struct page *page)
+{
+   pmd_ptlock_free(page);
+   __ClearPageTable(page);
+   dec_zone_page_state(page, NR_PAGETABLE);
+}
+
 /*
  * No scalability reason to split PUD locks yet, but follow the same pattern
  * as the PMD locks to make it easier if we decide to.  The VM should not be
-- 
2.27.0




Re: [PATCH 0/8] mm: cleanup usage of <asm/pgalloc.h>

2020-06-27 Thread Matthew Wilcox
On Sat, Jun 27, 2020 at 05:34:45PM +0300, Mike Rapoport wrote:
> Most architectures have very similar versions of pXd_alloc_one() and
> pXd_free_one() for intermediate levels of page table. 
> These patches add generic versions of these functions in
> <asm-generic/pgalloc.h> and enable use of the generic functions where
> appropriate.

For the series:

Reviewed-by: Matthew Wilcox (Oracle) 



Re: [PATCH 08/12] mm: pgtable: add shortcuts for accessing kernel PMD and PTE

2020-05-12 Thread Matthew Wilcox
On Tue, May 12, 2020 at 09:44:18PM +0300, Mike Rapoport wrote:
> +++ b/include/linux/pgtable.h
> @@ -28,6 +28,24 @@
>  #define USER_PGTABLES_CEILING	0UL
>  #endif
>  
> +/* FIXME: */

Fix you what?  Add documentation?

> +static inline pmd_t *pmd_off(struct mm_struct *mm, unsigned long va)
> +{
> + return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), 
> va);
> +}
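
A hedged usage sketch for the helper under review (assuming init_mm is the
right mm for kernel virtual addresses; the wrapper name is invented here):

	/* resolve the PMD entry covering a kernel virtual address */
	static pmd_t *sketch_kernel_pmd(unsigned long vaddr)
	{
		return pmd_off(&init_mm, vaddr);
	}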



Re: [PATCH 03/12] mm: reorder includes after introduction of linux/pgtable.h

2020-05-12 Thread Matthew Wilcox
On Tue, May 12, 2020 at 09:44:13PM +0300, Mike Rapoport wrote:
> diff --git a/arch/alpha/kernel/proto.h b/arch/alpha/kernel/proto.h
> index a093cd45ec79..701a05090141 100644
> --- a/arch/alpha/kernel/proto.h
> +++ b/arch/alpha/kernel/proto.h
> @@ -2,8 +2,6 @@
>  #include 
>  #include 
>  
> -#include 
> -
>  /* Prototypes of functions used across modules here in this directory.  */
>  
>  #define vucp volatile unsigned char  *

Looks like your script has a bug if linux/pgtable.h is the last include
in the file?




Re: [RFC] asm-generic/tlb: stub out pmd_free_tlb() if __PAGETABLE_PMD_FOLDED

2019-10-14 Thread Matthew Wilcox
On Mon, Oct 14, 2019 at 01:38:34PM -0700, Linus Torvalds wrote:
> And now I've said pgd/pud/p4d/pmd so many times that I've confused
> myself and think I'm wrong again, and I think that historically -
> originally - we always had a pgd, and then the pmd didn't exist
> because it was folded into it. That makes sense from a x86 naming
> standpoint. Then x86 _did_ get a pmd, and then we added more levels in
> between, and other architectures did things differently.

Oh my goodness.  Thank you for writing all this out and finally getting
to this point.  I was reading the whole thing thinking "This is different
from what I remember" and then you got here.  This explains so much about
how our MM does/doesn't work, and it's not just me that's confused ;-)



Re: [RFC V2 0/1] mm/debug: Add tests for architecture exported page table helpers

2019-08-26 Thread Matthew Wilcox
On Mon, Aug 26, 2019 at 08:07:13AM +0530, Anshuman Khandual wrote:
> On 08/09/2019 07:22 PM, Matthew Wilcox wrote:
> > On Fri, Aug 09, 2019 at 04:05:07PM +0530, Anshuman Khandual wrote:
> >> On 08/09/2019 03:46 PM, Matthew Wilcox wrote:
> >>> On Fri, Aug 09, 2019 at 01:03:17PM +0530, Anshuman Khandual wrote:
> >>>> Should alloc_gigantic_page() be made available as an interface for
> >>>> general use in the kernel? The test module here uses a very similar
> >>>> implementation from HugeTLB to allocate a PUD-aligned memory block.
> >>>> The same goes for mm_alloc(), which needs to be exported through a
> >>>> header.
> >>>
> >>> Why are you allocating memory at all instead of just using some
> >>> known-to-exist PFNs like I suggested?
> >>
> >> We needed the PFN to be PUD-aligned for pfn_pud() and PMD-aligned for mk_pmd().
> >> Now, walking the kernel page table for a known symbol like kernel_init()
> > 
> > I didn't say to walk the kernel page table.  I said to call virt_to_pfn()
> > for a known symbol like kernel_init().
> > 
> >> as you had suggested earlier, we might encounter page table entries at the
> >> PMD and PUD levels which are not PMD- or PUD-aligned respectively. It
> >> seemed to me that the alignment requirement applies only to mk_pmd() and
> >> pfn_pud(), which create large mappings at those levels, but not to page
> >> table pages pointing to the next level. Is that not correct? Or am I
> >> missing something here?
> > 
> > Just clear the bottom bits off the PFN until you get a PMD or PUD aligned
> > PFN.  It's really not hard.
> 
> As Mark pointed out earlier, that might end up being just a synthetic PFN
> which might not even exist on a given system.

And why would that matter?


Re: [RFC V2 0/1] mm/debug: Add tests for architecture exported page table helpers

2019-08-09 Thread Matthew Wilcox
On Fri, Aug 09, 2019 at 04:05:07PM +0530, Anshuman Khandual wrote:
> On 08/09/2019 03:46 PM, Matthew Wilcox wrote:
> > On Fri, Aug 09, 2019 at 01:03:17PM +0530, Anshuman Khandual wrote:
> >> Should alloc_gigantic_page() be made available as an interface for general
> >> use in the kernel? The test module here uses a very similar implementation
> >> from HugeTLB to allocate a PUD-aligned memory block. The same goes for
> >> mm_alloc(), which needs to be exported through a header.
> > 
> > Why are you allocating memory at all instead of just using some
> > known-to-exist PFNs like I suggested?
> 
> We needed the PFN to be PUD-aligned for pfn_pud() and PMD-aligned for mk_pmd().
> Now, walking the kernel page table for a known symbol like kernel_init()

I didn't say to walk the kernel page table.  I said to call virt_to_pfn()
for a known symbol like kernel_init().

> as you had suggested earlier, we might encounter page table entries at the PMD
> and PUD levels which are not PMD- or PUD-aligned respectively. It seemed to me
> that the alignment requirement applies only to mk_pmd() and pfn_pud(), which
> create large mappings at those levels, but not to page table pages pointing to
> the next level. Is that not correct? Or am I missing something here?

Just clear the bottom bits off the PFN until you get a PMD or PUD aligned
PFN.  It's really not hard.
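
Spelled out, the masking is one line per level (a sketch; kernel_init
is just the known-to-exist symbol mentioned above, and the shifts are
the usual arch macros):

	unsigned long pfn = virt_to_pfn(kernel_init);
	unsigned long pmd_aligned_pfn = pfn & ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
	unsigned long pud_aligned_pfn = pfn & ~((1UL << (PUD_SHIFT - PAGE_SHIFT)) - 1);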




Re: [RFC V2 0/1] mm/debug: Add tests for architecture exported page table helpers

2019-08-09 Thread Matthew Wilcox
On Fri, Aug 09, 2019 at 01:03:17PM +0530, Anshuman Khandual wrote:
> Should alloc_gigantic_page() be made available as an interface for general
> use in the kernel? The test module here uses a very similar implementation
> from HugeTLB to allocate a PUD-aligned memory block. The same goes for
> mm_alloc(), which needs to be exported through a header.

Why are you allocating memory at all instead of just using some
known-to-exist PFNs like I suggested?



Re: [PATCH v2 1/2] mm: introduce ARCH_HAS_PTE_SPECIAL

2018-04-10 Thread Matthew Wilcox
On Tue, Apr 10, 2018 at 05:25:50PM +0200, Laurent Dufour wrote:
>  arch/powerpc/include/asm/pte-common.h  | 3 ---
>  arch/riscv/Kconfig | 1 +
>  arch/s390/Kconfig  | 1 +

You forgot to delete __HAVE_ARCH_PTE_SPECIAL from
arch/riscv/include/asm/pgtable-bits.h
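
That is, the leftover to delete there is presumably just the line:

-#define __HAVE_ARCH_PTE_SPECIAL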



Re: [RFC PATCH v2 0/2] Randomization of address chosen by mmap.

2018-03-23 Thread Matthew Wilcox
On Fri, Mar 23, 2018 at 02:00:24PM -0400, Rich Felker wrote:
> On Fri, Mar 23, 2018 at 05:48:06AM -0700, Matthew Wilcox wrote:
> > On Thu, Mar 22, 2018 at 07:36:36PM +0300, Ilya Smith wrote:
> > > The current implementation doesn't randomize the address returned by mmap.
> > > All the entropy ends with choosing mmap_base_addr at process
> > > creation. After that, mmap builds a very predictable layout of the address
> > > space, which allows ASLR to be bypassed in many cases. This patch makes
> > > the returned address random on every mmap call.
> > 
> > Why should this be done in the kernel rather than libc?  libc is perfectly
> > capable of specifying random numbers in the first argument of mmap.
> 
> Generally libc does not have a view of the current vm maps, and thus
> in passing "random numbers", they would have to be uniform across the
> whole vm space and thus non-uniform once the kernel rounds up to avoid
> existing mappings.

I'm aware that you're the musl author, but glibc somehow manages to
provide etext, edata and end, demonstrating that it does know where at
least some of the memory map lies.  Virtually everything after that is
brought into the address space via mmap, which at least glibc intercepts,
so it's entirely possible for a security-conscious libc to know where
other things are in the memory map.  Not to mention that what we're
primarily talking about here are libraries which are dynamically linked
and are loaded by ld.so before calling main(); not dlopen() or even
regular user mmaps.

> Also this would impose requirements that libc be
> aware of the kernel's use of the virtual address space and what's
> available to userspace -- for example, on 32-bit archs whether 2GB,
> 3GB, or full 4GB (for 32-bit-user-on-64-bit-kernel) is available, and
> on 64-bit archs where fewer than the full 64 bits are actually valid
> in addresses, what the actual usable pointer size is. There is
> currently no clean way of conveying this information to userspace.

Huh, I thought libc was aware of this.  Also, I'd expect a libc-based
implementation to restrict itself to, eg, only loading libraries in
the bottom 1GB to avoid applications who want to map huge things from
running out of unfragmented address space.
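
As a toy illustration of that libc-side approach (illustrative only:
mmap_random() is a made-up helper, and a real implementation would use
a proper entropy source and the arch's actual usable VA width):

#include <stdint.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/types.h>

static void *mmap_random(size_t len, int prot, int flags, int fd, off_t off)
{
	/* assume a 47-bit user VA space; the mask keeps the hint page-aligned */
	uintptr_t hint = ((uintptr_t)random() << 16) & 0x7ffffffff000ULL;

	/* without MAP_FIXED the kernel treats the address as a hint and
	 * picks a free region nearby if it collides with a mapping */
	return mmap((void *)hint, len, prot, flags, fd, off);
}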



Re: [RFC PATCH v2 0/2] Randomization of address chosen by mmap.

2018-03-23 Thread Matthew Wilcox
On Thu, Mar 22, 2018 at 07:36:36PM +0300, Ilya Smith wrote:
> The current implementation doesn't randomize the address returned by mmap.
> All the entropy ends with choosing mmap_base_addr at process
> creation. After that, mmap builds a very predictable layout of the address
> space, which allows ASLR to be bypassed in many cases. This patch makes
> the returned address random on every mmap call.

Why should this be done in the kernel rather than libc?  libc is perfectly
capable of specifying random numbers in the first argument of mmap.



Re: [RFC PATCH 11/13] mm/vas: Introduce VAS segments - shareable address space regions

2017-03-13 Thread Matthew Wilcox
On Mon, Mar 13, 2017 at 03:14:13PM -0700, Till Smejkal wrote:
> +/**
> + * Create a new VAS segment.
> + *
> + * @param[in] name:  The name of the new VAS segment.
> + * @param[in] start: The address where the VAS segment begins.
> + * @param[in] end:   The address where the VAS segment ends.
> + * @param[in] mode:  The access rights for the VAS segment.
> + *
> + * @returns: The VAS segment ID on success, -ERRNO otherwise.
> + **/

Please follow the kernel-doc conventions, as described in
Documentation/doc-guide/kernel-doc.rst.  Also, function documentation
goes with the implementation, not the declaration.
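
For illustration, the same comment in kernel-doc form would look
something like this (vas_seg_create() is a guess at the function name,
which isn't visible in the quoted hunk):

/**
 * vas_seg_create() - Create a new VAS segment.
 * @name: The name of the new VAS segment.
 * @start: The address where the VAS segment begins.
 * @end: The address where the VAS segment ends.
 * @mode: The access rights for the VAS segment.
 *
 * Return: The VAS segment ID on success, negative errno otherwise.
 */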

> +/**
> + * Get ID of the VAS segment belonging to a given name.
> + *
> + * @param[in] name:  The name of the VAS segment for which the ID
> + *   should be returned.
> + *
> + * @returns: The VAS segment ID on success, -ERRNO
> + *   otherwise.
> + **/
> +extern int vas_seg_find(const char *name);

So ... segments have names, and IDs ... and access permissions ...
Why isn't this a special purpose filesystem?

