[PATCH v5 27/27] mm/doc: Build kerneldoc for various mm files
These files weren't included in the html docs.

Signed-off-by: Matthew Wilcox (Oracle)
---
 Documentation/core-api/mm-api.rst | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst
index 874ae1250258..3af5875a1d9e 100644
--- a/Documentation/core-api/mm-api.rst
+++ b/Documentation/core-api/mm-api.rst
@@ -93,3 +93,10 @@ More Memory Management Functions
 
 .. kernel-doc:: mm/page_alloc.c
 .. kernel-doc:: mm/mempolicy.c
+
+.. kernel-doc:: include/linux/mm_types.h
+   :internal:
+.. kernel-doc:: include/linux/mm.h
+   :internal:
+.. kernel-doc:: mm/util.c
+   :functions: folio_mapping
-- 
2.30.2
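For reference, `:internal:` renders kernel-doc comments for symbols that are
not exported, while `:functions:` restricts rendering to the named symbols.
A rough sketch of the comment shape the last directive picks up (assuming
the usual kernel-doc layout; this is not the literal mm/util.c text):

        /**
         * folio_mapping - Find the address_space a folio belongs to.
         * @folio: The folio.
         *
         * Return: The address_space, or NULL if the folio has none.
         */
        struct address_space *folio_mapping(struct folio *folio);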
[PATCH v5 23/27] mm/writeback: Add wait_for_stable_folio
Move wait_for_stable_page() into the folio compatibility file.
wait_for_stable_folio() avoids a call to compound_head() and is 14 bytes
smaller than wait_for_stable_page() was.  The net text size grows by 24
bytes as a result of this patch.

Signed-off-by: Matthew Wilcox (Oracle)
---
 include/linux/pagemap.h |  1 +
 mm/folio-compat.c       |  6 ++++++
 mm/page-writeback.c     | 17 ++++++++---------
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index a6adf69ea5c5..c92782b77d98 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -813,6 +813,7 @@ int wait_on_folio_writeback_killable(struct folio *folio);
 void end_page_writeback(struct page *page);
 void end_folio_writeback(struct folio *folio);
 void wait_for_stable_page(struct page *page);
+void wait_for_stable_folio(struct folio *folio);
 
 void page_endio(struct page *page, bool is_write, int err);
 
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index 6aadecc39fba..335594fe414e 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -29,3 +29,9 @@ void wait_on_page_writeback(struct page *page)
 	return wait_on_folio_writeback(page_folio(page));
 }
 EXPORT_SYMBOL_GPL(wait_on_page_writeback);
+
+void wait_for_stable_page(struct page *page)
+{
+	return wait_for_stable_folio(page_folio(page));
+}
+EXPORT_SYMBOL_GPL(wait_for_stable_page);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a08e77abcf12..c222f88cf06b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2862,17 +2862,16 @@ int wait_on_folio_writeback_killable(struct folio *folio)
 EXPORT_SYMBOL_GPL(wait_on_folio_writeback_killable);
 
 /**
- * wait_for_stable_page() - wait for writeback to finish, if necessary.
- * @page: The page to wait on.
+ * wait_for_stable_folio() - wait for writeback to finish, if necessary.
+ * @folio: The folio to wait on.
  *
- * This function determines if the given page is related to a backing device
- * that requires page contents to be held stable during writeback.  If so, then
+ * This function determines if the given folio is related to a backing device
+ * that requires folio contents to be held stable during writeback.  If so, then
  * it will wait for any pending writeback to complete.
  */
-void wait_for_stable_page(struct page *page)
+void wait_for_stable_folio(struct folio *folio)
 {
-	page = thp_head(page);
-	if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES)
-		wait_on_page_writeback(page);
+	if (folio->page.mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES)
+		wait_on_folio_writeback(folio);
 }
-EXPORT_SYMBOL_GPL(wait_for_stable_page);
+EXPORT_SYMBOL_GPL(wait_for_stable_folio);
-- 
2.30.2
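For callers the conversion is mechanical; a minimal sketch (the function
and its context are illustrative, not taken from the series) of a write
path using the new entry point:

        static void example_modify_folio(struct folio *folio)
        {
                /*
                 * If the backing device requires stable pages, sleep until
                 * any writeback in flight completes before the folio's
                 * contents are modified.
                 */
                wait_for_stable_folio(folio);
        }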
[PATCH v5 25/27] mm/filemap: Convert wake_up_page_bit to wake_up_folio_bit
All callers have a folio, so use it directly.

Signed-off-by: Matthew Wilcox (Oracle)
---
 mm/filemap.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index f8746c149562..f5bacbe702ff 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1121,14 +1121,14 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
 	return (flags & WQ_FLAG_EXCLUSIVE) != 0;
 }
 
-static void wake_up_page_bit(struct page *page, int bit_nr)
+static void wake_up_folio_bit(struct folio *folio, int bit_nr)
 {
-	wait_queue_head_t *q = page_waitqueue(page);
+	wait_queue_head_t *q = page_waitqueue(&folio->page);
 	struct wait_page_key key;
 	unsigned long flags;
 	wait_queue_entry_t bookmark;
 
-	key.page = page;
+	key.page = &folio->page;
 	key.bit_nr = bit_nr;
 	key.page_match = 0;
 
@@ -1163,7 +1163,7 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
 	 * page waiters.
 	 */
 	if (!waitqueue_active(q) || !key.page_match) {
-		ClearPageWaiters(page);
+		ClearFolioWaiters(folio);
 		/*
 		 * It's possible to miss clearing Waiters here, when we woke
 		 * our page waiters, but the hashed waitqueue has waiters for
@@ -1179,7 +1179,7 @@ static void wake_up_folio(struct folio *folio, int bit)
 {
 	if (!FolioWaiters(folio))
 		return;
-	wake_up_page_bit(&folio->page, bit);
+	wake_up_folio_bit(folio, bit);
 }
 
 /*
@@ -1444,7 +1444,7 @@ void unlock_folio(struct folio *folio)
 	BUILD_BUG_ON(PG_waiters != 7);
 	VM_BUG_ON_FOLIO(!FolioLocked(folio), folio);
 	if (clear_bit_unlock_is_negative_byte(PG_locked, folio_flags(folio, 0)))
-		wake_up_page_bit(&folio->page, PG_locked);
+		wake_up_folio_bit(folio, PG_locked);
 }
 EXPORT_SYMBOL(unlock_folio);
 
@@ -1461,10 +1461,10 @@ EXPORT_SYMBOL(unlock_folio);
  */
 void unlock_page_private_2(struct page *page)
 {
-	page = compound_head(page);
-	VM_BUG_ON_PAGE(!PagePrivate2(page), page);
-	clear_bit_unlock(PG_private_2, &page->flags);
-	wake_up_page_bit(page, PG_private_2);
+	struct folio *folio = page_folio(page);
+	VM_BUG_ON_FOLIO(!FolioPrivate2(folio), folio);
+	clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
+	wake_up_folio_bit(folio, PG_private_2);
 }
 EXPORT_SYMBOL(unlock_page_private_2);
-- 
2.30.2
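For illustration only (unlock_page_private_2() in the diff is the real
example), a sketch of the clear-then-wake pattern the callers follow:

        static void example_clear_and_wake(struct folio *folio)
        {
                /* Clear the bit with release semantics, then wake waiters. */
                clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
                wake_up_folio_bit(folio, PG_private_2);
        }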
[PATCH v5 26/27] mm/filemap: Convert page wait queues to be folios
Reinforce that if we're waiting for a bit in a struct page, that's
actually in the head page by changing the type from page to folio.
Increases the size of cachefiles by two bytes, but the kernel core is
unchanged in size.

Signed-off-by: Matthew Wilcox (Oracle)
---
 fs/cachefiles/rdwr.c    | 16 ++++++++--------
 include/linux/pagemap.h |  8 ++++----
 mm/filemap.c            | 33 +++++++++++++++++----------------
 3 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 8ffc40e84a59..ef50bd80ae74 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -25,20 +25,20 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
 	struct cachefiles_object *object;
 	struct fscache_retrieval *op = monitor->op;
 	struct wait_page_key *key = _key;
-	struct page *page = wait->private;
+	struct folio *folio = wait->private;
 
 	ASSERT(key);
 
 	_enter("{%lu},%u,%d,{%p,%u}",
 	       monitor->netfs_page->index, mode, sync,
-	       key->page, key->bit_nr);
+	       key->folio, key->bit_nr);
 
-	if (key->page != page || key->bit_nr != PG_locked)
+	if (key->folio != folio || key->bit_nr != PG_locked)
 		return 0;
 
-	_debug("--- monitor %p %lx ---", page, page->flags);
+	_debug("--- monitor %p %lx ---", folio, folio->page.flags);
 
-	if (!PageUptodate(page) && !PageError(page)) {
+	if (!FolioUptodate(folio) && !FolioError(folio)) {
 		/* unlocked, not uptodate and not erronous? */
 		_debug("page probably truncated");
 	}
@@ -107,7 +107,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object,
 	put_page(backpage2);
 
 	INIT_LIST_HEAD(&monitor->op_link);
-	add_page_wait_queue(backpage, &monitor->monitor);
+	add_folio_wait_queue(page_folio(backpage), &monitor->monitor);
 
 	if (trylock_page(backpage)) {
 		ret = -EIO;
@@ -294,7 +294,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object,
 	get_page(backpage);
 	monitor->back_page = backpage;
 	monitor->monitor.private = backpage;
-	add_page_wait_queue(backpage, &monitor->monitor);
+	add_folio_wait_queue(page_folio(backpage), &monitor->monitor);
 	monitor = NULL;
 
 	/* but the page may have been read before the monitor was installed, so
@@ -548,7 +548,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object,
 	get_page(backpage);
 	monitor->back_page = backpage;
 	monitor->monitor.private = backpage;
-	add_page_wait_queue(backpage, &monitor->monitor);
+	add_folio_wait_queue(page_folio(backpage), &monitor->monitor);
 	monitor = NULL;
 
 	/* but the page may have been read before the monitor was installed, so
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 7ddaabbd1ddb..78d865c2f2da 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -669,13 +669,13 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
 }
 
 struct wait_page_key {
-	struct page *page;
+	struct folio *folio;
 	int bit_nr;
 	int page_match;
 };
 
 struct wait_page_queue {
-	struct page *page;
+	struct folio *folio;
 	int bit_nr;
 	wait_queue_entry_t wait;
 };
@@ -683,7 +683,7 @@ struct wait_page_queue {
 static inline bool wake_page_match(struct wait_page_queue *wait_page,
 				  struct wait_page_key *key)
 {
-	if (wait_page->page != key->page)
+	if (wait_page->folio != key->folio)
 		return false;
 	key->page_match = 1;
 
@@ -820,7 +820,7 @@ void page_endio(struct page *page, bool is_write, int err);
 /*
  * Add an arbitrary waiter to a page's wait queue
  */
-extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter);
+void add_folio_wait_queue(struct folio *folio, wait_queue_entry_t *waiter);
 
 /*
  * Fault everything in given userspace address range in.
diff --git a/mm/filemap.c b/mm/filemap.c
index f5bacbe702ff..d9238d921009 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1019,11 +1019,11 @@ EXPORT_SYMBOL(__page_cache_alloc);
  */
 #define PAGE_WAIT_TABLE_BITS 8
 #define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
-static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
+static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
 
-static wait_queue_head_t *page_waitqueue(struct page *page)
+static wait_queue_head_t *folio_waitqueue(struct folio *folio)
 {
-	return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
+	return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
 }
 
 void __init pagecache_init(void)
@@ -1031,7 +1031,7 @@ void __init pagecache_init(void)
 	int i;
[PATCH v5 24/27] mm/filemap: Convert wait_on_page_bit to wait_on_folio_bit
We must always wait on the folio, otherwise we won't be woken up.

This commit shrinks the kernel by 691 bytes, mostly due to moving the
page waitqueue lookup into wait_on_folio_bit_common().

Signed-off-by: Matthew Wilcox (Oracle)
---
 include/linux/netfs.h   |  2 +-
 include/linux/pagemap.h | 10 +++++-----
 mm/filemap.c            | 56 +++++++++++++++++++++---------------------------
 mm/page-writeback.c     |  4 ++--
 4 files changed, 33 insertions(+), 39 deletions(-)

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 9d3fbed4e30a..f44142dca767 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -54,7 +54,7 @@ static inline void unlock_page_fscache(struct page *page)
 static inline void wait_on_page_fscache(struct page *page)
 {
 	if (PageFsCache(page))
-		wait_on_page_bit(compound_head(page), PG_fscache);
+		wait_on_folio_bit(page_folio(page), PG_fscache);
 }
 
 enum netfs_read_source {
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index c92782b77d98..7ddaabbd1ddb 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -770,11 +770,11 @@ static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm,
 }
 
 /*
- * This is exported only for wait_on_page_locked/wait_on_page_writeback, etc.,
+ * This is exported only for wait_on_folio_locked/wait_on_folio_writeback, etc.,
  * and should not be used directly.
  */
-extern void wait_on_page_bit(struct page *page, int bit_nr);
-extern int wait_on_page_bit_killable(struct page *page, int bit_nr);
+extern void wait_on_folio_bit(struct folio *folio, int bit_nr);
+extern int wait_on_folio_bit_killable(struct folio *folio, int bit_nr);
 
 /*
  * Wait for a folio to be unlocked.
@@ -786,14 +786,14 @@ extern int wait_on_page_bit_killable(struct page *page, int bit_nr);
 static inline void wait_on_folio_locked(struct folio *folio)
 {
 	if (FolioLocked(folio))
-		wait_on_page_bit(&folio->page, PG_locked);
+		wait_on_folio_bit(folio, PG_locked);
 }
 
 static inline int wait_on_folio_locked_killable(struct folio *folio)
 {
 	if (!FolioLocked(folio))
 		return 0;
-	return wait_on_page_bit_killable(&folio->page, PG_locked);
+	return wait_on_folio_bit_killable(folio, PG_locked);
 }
 
 static inline void wait_on_page_locked(struct page *page)
diff --git a/mm/filemap.c b/mm/filemap.c
index dc7deb8c36ee..f8746c149562 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1102,7 +1102,7 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
 	 *
 	 * So update the flags atomically, and wake up the waiter
 	 * afterwards to avoid any races. This store-release pairs
-	 * with the load-acquire in wait_on_page_bit_common().
+	 * with the load-acquire in wait_on_folio_bit_common().
 	 */
 	smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
 	wake_up_state(wait->private, mode);
@@ -1183,7 +1183,7 @@ static void wake_up_folio(struct folio *folio, int bit)
 }
 
 /*
- * A choice of three behaviors for wait_on_page_bit_common():
+ * A choice of three behaviors for wait_on_folio_bit_common():
 */
 enum behavior {
 	EXCLUSIVE,	/* Hold ref to page and take the bit when woken, like
@@ -1217,9 +1217,10 @@ static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
 
 /* How many times do we accept lock stealing from under a waiter? */
 int sysctl_page_lock_unfairness = 5;
 
-static inline int wait_on_page_bit_common(wait_queue_head_t *q,
-	struct page *page, int bit_nr, int state, enum behavior behavior)
+static inline int wait_on_folio_bit_common(struct folio *folio, int bit_nr,
+		int state, enum behavior behavior)
 {
+	wait_queue_head_t *q = page_waitqueue(&folio->page);
 	int unfairness = sysctl_page_lock_unfairness;
 	struct wait_page_queue wait_page;
 	wait_queue_entry_t *wait = &wait_page.wait;
@@ -1228,8 +1229,8 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 	unsigned long pflags;
 
 	if (bit_nr == PG_locked &&
-	    !PageUptodate(page) && PageWorkingset(page)) {
-		if (!PageSwapBacked(page)) {
+	    !FolioUptodate(folio) && FolioWorkingset(folio)) {
+		if (!FolioSwapBacked(folio)) {
 			delayacct_thrashing_start();
 			delayacct = true;
 		}
@@ -1239,7 +1240,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 
 	init_wait(wait);
 	wait->func = wake_page_function;
-	wait_page.page = page;
+	wait_page.page = &folio->page;
 	wait_page.bit_nr = bit_nr;
 
 repeat:
@@ -1254,7 +1255,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
 		 * Do one last check whether we can get the
 		 * page bit synchronously.
 		 *
-		 * Do the SetPageWaiters() marking before that
+		 * Do
[PATCH v5 22/27] mm/writeback: Add wait_on_folio_writeback
wait_on_page_writeback_killable() only has one caller, so convert it to
call wait_on_folio_writeback_killable().  For the wait_on_page_writeback()
callers, add a compatibility wrapper around wait_on_folio_writeback().

Turning PageWriteback() into FolioWriteback() eliminates a call to
compound_head() which saves 8 bytes and 15 bytes in the two functions.
That is more than offset by adding the wait_on_page_writeback
compatibility wrapper for a net increase in text of 15 bytes.

Signed-off-by: Matthew Wilcox (Oracle)
---
 fs/afs/write.c          |  2 +-
 include/linux/pagemap.h |  3 ++-
 mm/folio-compat.c       |  6 ++++++
 mm/page-writeback.c     | 43 +++++++++++++++++++++++++++++--------------
 4 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/fs/afs/write.c b/fs/afs/write.c
index 106a864b6a93..4b70b0e7fcfa 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -850,7 +850,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
 		return VM_FAULT_RETRY;
 #endif
 
-	if (wait_on_page_writeback_killable(page))
+	if (wait_on_folio_writeback_killable(page_folio(page)))
 		return VM_FAULT_RETRY;
 
 	if (lock_page_killable(page) < 0)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 2ee6b1b9561c..a6adf69ea5c5 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -808,7 +808,8 @@ static inline int wait_on_page_locked_killable(struct page *page)
 int put_and_wait_on_page_locked(struct page *page, int state);
 void wait_on_page_writeback(struct page *page);
-int wait_on_page_writeback_killable(struct page *page);
+void wait_on_folio_writeback(struct folio *folio);
+int wait_on_folio_writeback_killable(struct folio *folio);
 void end_page_writeback(struct page *page);
 void end_folio_writeback(struct folio *folio);
 void wait_for_stable_page(struct page *page);
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index d1a1dfe52589..6aadecc39fba 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -23,3 +23,9 @@ void end_page_writeback(struct page *page)
 	return end_folio_writeback(page_folio(page));
 }
 EXPORT_SYMBOL(end_page_writeback);
+
+void wait_on_page_writeback(struct page *page)
+{
+	return wait_on_folio_writeback(page_folio(page));
+}
+EXPORT_SYMBOL_GPL(wait_on_page_writeback);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5e761fb62800..a08e77abcf12 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2818,33 +2818,48 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
 }
 EXPORT_SYMBOL(__test_set_page_writeback);
 
-/*
- * Wait for a page to complete writeback
+/**
+ * wait_on_folio_writeback - Wait for a folio to complete writeback.
+ * @folio: The folio to wait for.
+ *
+ * If the folio is currently being written back to storage, wait for the
+ * I/O to complete.
+ *
+ * Context: Sleeps; must be called in process context and with no spinlocks
+ * held.
 */
-void wait_on_page_writeback(struct page *page)
+void wait_on_folio_writeback(struct folio *folio)
 {
-	while (PageWriteback(page)) {
-		trace_wait_on_page_writeback(page, page_mapping(page));
-		wait_on_page_bit(page, PG_writeback);
+	while (FolioWriteback(folio)) {
+		trace_wait_on_page_writeback(&folio->page, folio_mapping(folio));
+		wait_on_page_bit(&folio->page, PG_writeback);
 	}
 }
-EXPORT_SYMBOL_GPL(wait_on_page_writeback);
+EXPORT_SYMBOL_GPL(wait_on_folio_writeback);
 
-/*
- * Wait for a page to complete writeback.  Returns -EINTR if we get a
+/**
+ * wait_on_folio_writeback_killable - Wait for a folio to complete writeback.
+ * @folio: The folio to wait for.
+ *
+ * If the folio is currently being written back to storage, wait for the
+ * I/O to complete or a fatal signal to arrive.
+ *
+ * Context: Sleeps; must be called in process context and with no spinlocks
+ * held.
+ * Return: 0 if the folio has completed writeback.  -EINTR if we get a
 * fatal signal while waiting.
 */
-int wait_on_page_writeback_killable(struct page *page)
+int wait_on_folio_writeback_killable(struct folio *folio)
 {
-	while (PageWriteback(page)) {
-		trace_wait_on_page_writeback(page, page_mapping(page));
-		if (wait_on_page_bit_killable(page, PG_writeback))
+	while (FolioWriteback(folio)) {
+		trace_wait_on_page_writeback(&folio->page, folio_mapping(folio));
+		if (wait_on_page_bit_killable(&folio->page, PG_writeback))
 			return -EINTR;
 	}
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(wait_on_page_writeback_killable);
+EXPORT_SYMBOL_GPL(wait_on_folio_writeback_killable);
 
 /**
  * wait_for_stable_page() - wait for writeback to finish, if necessary.
-- 
2.30.2
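A sketch of how a ->page_mkwrite handler uses the killable variant,
modelled on the afs conversion above (example_page_mkwrite is a made-up
name, not part of the series):

        static vm_fault_t example_page_mkwrite(struct vm_fault *vmf)
        {
                struct folio *folio = page_folio(vmf->page);

                /* Back out and let the fault be retried on a fatal signal. */
                if (wait_on_folio_writeback_killable(folio))
                        return VM_FAULT_RETRY;

                if (lock_page_killable(&folio->page) < 0)
                        return VM_FAULT_RETRY;
                return VM_FAULT_LOCKED;
        }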
[PATCH v5 20/27] mm/filemap: Add wait_on_folio_locked
Also add wait_on_folio_locked_killable().  Turn wait_on_page_locked()
and wait_on_page_locked_killable() into wrappers.  This eliminates a
call to compound_head() from each call-site, reducing text size by 200
bytes for me.

Signed-off-by: Matthew Wilcox (Oracle)
---
 include/linux/pagemap.h | 26 ++++++++++++++++++--------
 mm/filemap.c            |  4 ++--
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 38f4ee28a3a5..a8e19e4e0b09 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -777,23 +777,33 @@ extern void wait_on_page_bit(struct page *page, int bit_nr);
 extern int wait_on_page_bit_killable(struct page *page, int bit_nr);
 
 /*
- * Wait for a page to be unlocked.
+ * Wait for a folio to be unlocked.
  *
- * This must be called with the caller "holding" the page,
- * ie with increased "page->count" so that the page won't
+ * This must be called with the caller "holding" the folio,
+ * ie with increased "page->count" so that the folio won't
 * go away during the wait..
 */
+static inline void wait_on_folio_locked(struct folio *folio)
+{
+	if (FolioLocked(folio))
+		wait_on_page_bit(&folio->page, PG_locked);
+}
+
+static inline int wait_on_folio_locked_killable(struct folio *folio)
+{
+	if (!FolioLocked(folio))
+		return 0;
+	return wait_on_page_bit_killable(&folio->page, PG_locked);
+}
+
 static inline void wait_on_page_locked(struct page *page)
 {
-	if (PageLocked(page))
-		wait_on_page_bit(compound_head(page), PG_locked);
+	wait_on_folio_locked(page_folio(page));
 }
 
 static inline int wait_on_page_locked_killable(struct page *page)
 {
-	if (!PageLocked(page))
-		return 0;
-	return wait_on_page_bit_killable(compound_head(page), PG_locked);
+	return wait_on_folio_locked_killable(page_folio(page));
 }
 
 int put_and_wait_on_page_locked(struct page *page, int state);
diff --git a/mm/filemap.c b/mm/filemap.c
index 35e16db2e2be..99758045ec2d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1604,9 +1604,9 @@ int __lock_folio_or_retry(struct folio *folio, struct mm_struct *mm,
 			mmap_read_unlock(mm);
 			if (flags & FAULT_FLAG_KILLABLE)
-				wait_on_page_locked_killable(page);
+				wait_on_folio_locked_killable(folio);
 			else
-				wait_on_page_locked(page);
+				wait_on_folio_locked(folio);
 			return 0;
 		}
 		if (flags & FAULT_FLAG_KILLABLE) {
-- 
2.30.2
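Callers which already have the folio can now wait without the
compound_head() round-trip; a minimal sketch (the function name is
illustrative):

        static int example_wait_unlocked(struct folio *folio)
        {
                /*
                 * The caller must hold a reference on the folio so it
                 * cannot be freed while we sleep.  Returns 0 once the
                 * folio is unlocked, or -EINTR on a fatal signal.
                 */
                return wait_on_folio_locked_killable(folio);
        }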
[PATCH v5 21/27] mm/filemap: Add end_folio_writeback
Add an end_page_writeback() wrapper function for users that are not yet
converted to folios.

end_folio_writeback() is less than half the size of end_page_writeback()
at just 105 bytes compared to 213 bytes, due to removing all the
compound_head() calls.  The 30 byte wrapper function makes this a net
saving of 70 bytes.

Signed-off-by: Matthew Wilcox (Oracle)
---
 include/linux/pagemap.h |  3 ++-
 mm/filemap.c            | 38 +++++++++++++++++++-------------------
 mm/folio-compat.c       |  6 ++++++
 3 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index a8e19e4e0b09..2ee6b1b9561c 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -809,7 +809,8 @@ static inline int wait_on_page_locked_killable(struct page *page)
 int put_and_wait_on_page_locked(struct page *page, int state);
 void wait_on_page_writeback(struct page *page);
 int wait_on_page_writeback_killable(struct page *page);
-extern void end_page_writeback(struct page *page);
+void end_page_writeback(struct page *page);
+void end_folio_writeback(struct folio *folio);
 void wait_for_stable_page(struct page *page);
 
 void page_endio(struct page *page, bool is_write, int err);
diff --git a/mm/filemap.c b/mm/filemap.c
index 99758045ec2d..dc7deb8c36ee 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1175,11 +1175,11 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 
-static void wake_up_page(struct page *page, int bit)
+static void wake_up_folio(struct folio *folio, int bit)
 {
-	if (!PageWaiters(page))
+	if (!FolioWaiters(folio))
 		return;
-	wake_up_page_bit(page, bit);
+	wake_up_page_bit(&folio->page, bit);
 }
 
 /*
@@ -1473,38 +1473,38 @@ void unlock_page_private_2(struct page *page)
 EXPORT_SYMBOL(unlock_page_private_2);
 
 /**
- * end_page_writeback - end writeback against a page
- * @page: the page
+ * end_folio_writeback - End writeback against a folio.
+ * @folio: The folio.
 */
-void end_page_writeback(struct page *page)
+void end_folio_writeback(struct folio *folio)
 {
 	/*
 	 * TestClearPageReclaim could be used here but it is an atomic
 	 * operation and overkill in this particular case. Failing to
-	 * shuffle a page marked for immediate reclaim is too mild to
+	 * shuffle a folio marked for immediate reclaim is too mild to
 	 * justify taking an atomic operation penalty at the end of
-	 * ever page writeback.
+	 * every folio writeback.
 	 */
-	if (PageReclaim(page)) {
-		ClearPageReclaim(page);
-		rotate_reclaimable_page(page);
+	if (FolioReclaim(folio)) {
+		ClearFolioReclaim(folio);
+		rotate_reclaimable_page(&folio->page);
 	}
 
 	/*
-	 * Writeback does not hold a page reference of its own, relying
+	 * Writeback does not hold a folio reference of its own, relying
 	 * on truncation to wait for the clearing of PG_writeback.
-	 * But here we must make sure that the page is not freed and
-	 * reused before the wake_up_page().
+	 * But here we must make sure that the folio is not freed and
+	 * reused before the wake_up_folio().
 	 */
-	get_page(page);
-	if (!test_clear_page_writeback(page))
+	get_folio(folio);
+	if (!test_clear_page_writeback(&folio->page))
 		BUG();
 
 	smp_mb__after_atomic();
-	wake_up_page(page, PG_writeback);
-	put_page(page);
+	wake_up_folio(folio, PG_writeback);
+	put_folio(folio);
 }
-EXPORT_SYMBOL(end_page_writeback);
+EXPORT_SYMBOL(end_folio_writeback);
 
 /*
 * After completing I/O on a page, call this routine to update the page
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index 02798abf19a1..d1a1dfe52589 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -17,3 +17,9 @@ void unlock_page(struct page *page)
 	return unlock_folio(page_folio(page));
 }
 EXPORT_SYMBOL(unlock_page);
+
+void end_page_writeback(struct page *page)
+{
+	return end_folio_writeback(page_folio(page));
+}
+EXPORT_SYMBOL(end_page_writeback);
-- 
2.30.2
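Completion paths that already operate on folios can call the new function
directly; a sketch of a write-I/O completion under those assumptions
(example name only, and assuming the FolioFlags accessors from patch 10
of this series):

        static void example_write_done(struct folio *folio, int err)
        {
                if (err)
                        SetFolioError(folio);
                end_folio_writeback(folio);
        }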
[PATCH v5 18/27] mm/filemap: Add __lock_folio_async
There aren't any actual callers of lock_page_async(), so remove it.
Convert filemap_update_page() to call __lock_folio_async().

__lock_folio_async() is 21 bytes smaller than __lock_page_async(),
but the real savings come from using a folio in filemap_update_page(),
shrinking it from 514 bytes to 403 bytes, saving 111 bytes.  The text
shrinks by 132 bytes in total.

Signed-off-by: Matthew Wilcox (Oracle)
---
 fs/io_uring.c           |  2 +-
 include/linux/pagemap.h | 17 -----------------
 mm/filemap.c            | 31 ++++++++++++++++---------------
 3 files changed, 17 insertions(+), 33 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b882bc4c5af7..ad0dc9afd194 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3201,7 +3201,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 }
 
 /*
- * This is our waitqueue callback handler, registered through lock_page_async()
+ * This is our waitqueue callback handler, registered through lock_folio_async()
 * when we initially tried to do the IO with the iocb armed our waitqueue.
 * This gets called when the page is unlocked, and we generally expect that to
 * happen when the page IO is completed and the page is now uptodate. This will
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index aa7f564e5ecf..3cd1b5e28593 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -695,7 +695,6 @@ static inline bool wake_page_match(struct wait_page_queue *wait_page,
 
 void __lock_folio(struct folio *folio);
 int __lock_folio_killable(struct folio *folio);
-extern int __lock_page_async(struct page *page, struct wait_page_queue *wait);
 extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 				unsigned int flags);
 void unlock_page(struct page *page);
@@ -753,22 +752,6 @@ static inline int lock_page_killable(struct page *page)
 	return lock_folio_killable(page_folio(page));
 }
 
-/*
- * lock_page_async - Lock the page, unless this would block. If the page
- * is already locked, then queue a callback when the page becomes unlocked.
- * This callback can then retry the operation.
- *
- * Returns 0 if the page is locked successfully, or -EIOCBQUEUED if the page
- * was already locked and the callback defined in 'wait' was queued.
- */
-static inline int lock_page_async(struct page *page,
-				  struct wait_page_queue *wait)
-{
-	if (!trylock_page(page))
-		return __lock_page_async(page, wait);
-	return 0;
-}
-
 /*
 * lock_page_or_retry - Lock the page, unless this would block and the
 * caller indicated that it can handle a retry.
diff --git a/mm/filemap.c b/mm/filemap.c
index 7cac47db78a5..12dc672adc2e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1554,18 +1554,18 @@ int __lock_folio_killable(struct folio *folio)
 }
 EXPORT_SYMBOL_GPL(__lock_folio_killable);
 
-int __lock_page_async(struct page *page, struct wait_page_queue *wait)
+static int __lock_folio_async(struct folio *folio, struct wait_page_queue *wait)
 {
-	struct wait_queue_head *q = page_waitqueue(page);
+	struct wait_queue_head *q = page_waitqueue(&folio->page);
 	int ret = 0;
 
-	wait->page = page;
+	wait->page = &folio->page;
 	wait->bit_nr = PG_locked;
 
 	spin_lock_irq(&q->lock);
 	__add_wait_queue_entry_tail(q, &wait->wait);
-	SetPageWaiters(page);
-	ret = !trylock_page(page);
+	SetFolioWaiters(folio);
+	ret = !trylock_folio(folio);
 	/*
 	 * If we were successful now, we know we're still on the
 	 * waitqueue as we're still under the lock.  This means it's
@@ -2312,41 +2312,42 @@ static int filemap_update_page(struct kiocb *iocb,
 		struct address_space *mapping, struct iov_iter *iter,
 		struct page *page)
 {
+	struct folio *folio = page_folio(page);
 	int error;
 
-	if (!trylock_page(page)) {
+	if (!trylock_folio(folio)) {
 		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
 			return -EAGAIN;
 		if (!(iocb->ki_flags & IOCB_WAITQ)) {
-			put_and_wait_on_page_locked(page, TASK_KILLABLE);
+			put_and_wait_on_page_locked(&folio->page, TASK_KILLABLE);
 			return AOP_TRUNCATED_PAGE;
 		}
-		error = __lock_page_async(page, iocb->ki_waitq);
+		error = __lock_folio_async(folio, iocb->ki_waitq);
 		if (error)
 			return error;
 	}
 
-	if (!page->mapping)
+	if (!folio->page.mapping)
 		goto truncated;
 
 	error = 0;
-	if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, page))
+	if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, &folio->page))
 		goto unlock;
 
 	error = -EAGAIN;
 	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
[PATCH v5 19/27] mm/filemap: Add __lock_folio_or_retry
Convert __lock_page_or_retry() to __lock_folio_or_retry().  This actually
saves 4 bytes in the only caller of lock_page_or_retry() (due to better
register allocation) and saves the 20 byte cost of calling page_folio()
in __lock_folio_or_retry() for a total saving of 24 bytes.

Signed-off-by: Matthew Wilcox (Oracle)
---
 include/linux/pagemap.h |  9 ++++++---
 mm/filemap.c            | 10 ++++------
 mm/memory.c             |  8 ++++----
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 3cd1b5e28593..38f4ee28a3a5 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -695,7 +695,7 @@ static inline bool wake_page_match(struct wait_page_queue *wait_page,
 
 void __lock_folio(struct folio *folio);
 int __lock_folio_killable(struct folio *folio);
-extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+int __lock_folio_or_retry(struct folio *folio, struct mm_struct *mm,
 				unsigned int flags);
 void unlock_page(struct page *page);
 void unlock_folio(struct folio *folio);
@@ -757,13 +757,16 @@ static inline int lock_page_killable(struct page *page)
 * caller indicated that it can handle a retry.
 *
 * Return value and mmap_lock implications depend on flags; see
- * __lock_page_or_retry().
+ * __lock_folio_or_retry().
 */
 static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm,
 				     unsigned int flags)
 {
+	struct folio *folio;
 	might_sleep();
-	return trylock_page(page) || __lock_page_or_retry(page, mm, flags);
+
+	folio = page_folio(page);
+	return trylock_folio(folio) || __lock_folio_or_retry(folio, mm, flags);
 }
 
 /*
diff --git a/mm/filemap.c b/mm/filemap.c
index 12dc672adc2e..35e16db2e2be 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1582,20 +1582,18 @@ static int __lock_folio_async(struct folio *folio, struct wait_page_queue *wait)
 
 /*
 * Return values:
- * 1 - page is locked; mmap_lock is still held.
- * 0 - page is not locked.
+ * 1 - folio is locked; mmap_lock is still held.
+ * 0 - folio is not locked.
 *     mmap_lock has been released (mmap_read_unlock(), unless flags had both
 *     FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
 *     which case mmap_lock is still held.
 *
 * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
- * with the page locked and the mmap_lock unperturbed.
+ * with the folio locked and the mmap_lock unperturbed.
 */
-int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+int __lock_folio_or_retry(struct folio *folio, struct mm_struct *mm,
 			 unsigned int flags)
 {
-	struct folio *folio = page_folio(page);
-
 	if (fault_flag_allow_retry_first(flags)) {
 		/*
 		 * CAUTION! In this case, mmap_lock is not released
diff --git a/mm/memory.c b/mm/memory.c
index d3273bd69dbb..9c3554972e2d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4056,7 +4056,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults).
 * The mmap_lock may have been released depending on flags and our
- * return value.  See filemap_fault() and __lock_page_or_retry().
+ * return value.  See filemap_fault() and __lock_folio_or_retry().
 * If mmap_lock is released, vma may become invalid (for example
 * by other thread calling munmap()).
 */
@@ -4288,7 +4288,7 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
 * concurrent faults).
 *
 * The mmap_lock may have been released depending on flags and our return value.
- * See filemap_fault() and __lock_page_or_retry().
+ * See filemap_fault() and __lock_folio_or_retry().
 */
 static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 {
@@ -4392,7 +4392,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_lock may have been released depending on flags and our
- * return value.  See filemap_fault() and __lock_page_or_retry().
+ * return value.  See filemap_fault() and __lock_folio_or_retry().
 */
 static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
 		unsigned long address, unsigned int flags)
@@ -4548,7 +4548,7 @@ static inline void mm_account_fault(struct pt_regs *regs,
 * By the time we get here, we already hold the mm semaphore
 *
 * The mmap_lock may have been released depending on flags and our
- * return value.  See filemap_fault() and __lock_page_or_retry().
+ * return value.  See filemap_fault() and __lock_folio_or_retry().
 */
 vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 			   unsigned int flags, struct pt_regs *regs)
-- 
2.30.2
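A sketch of the contract as seen from a fault handler (the caller below
is hypothetical; the real caller lives in mm/memory.c):

        static vm_fault_t example_fault_path(struct page *page,
                        struct mm_struct *mm, unsigned int flags)
        {
                if (!lock_page_or_retry(page, mm, flags)) {
                        /*
                         * mmap_lock has been dropped unless both
                         * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT
                         * were set; ask the caller to retry the fault.
                         */
                        return VM_FAULT_RETRY;
                }
                /* Page is locked and mmap_lock is still held. */
                return 0;
        }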
[PATCH v5 17/27] mm/filemap: Add lock_folio_killable
This is like lock_page_killable() but for use by callers who know they
have a folio.  Convert __lock_page_killable() to be
__lock_folio_killable().  This saves one call to compound_head() per
contended call to lock_page_killable().

__lock_folio_killable() is 20 bytes smaller than __lock_page_killable()
was.  lock_page_maybe_drop_mmap() shrinks by 68 bytes and
__lock_page_or_retry() shrinks by 66 bytes.  That's a total of 154 bytes
of text saved.

Signed-off-by: Matthew Wilcox (Oracle)
---
 include/linux/pagemap.h | 15 ++++++++++-----
 mm/filemap.c            | 17 +++++++++--------
 2 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index c96ba0dfe111..aa7f564e5ecf 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -694,7 +694,7 @@ static inline bool wake_page_match(struct wait_page_queue *wait_page,
 }
 
 void __lock_folio(struct folio *folio);
-extern int __lock_page_killable(struct page *page);
+int __lock_folio_killable(struct folio *folio);
 extern int __lock_page_async(struct page *page, struct wait_page_queue *wait);
 extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 				unsigned int flags);
@@ -735,6 +735,14 @@ static inline void lock_page(struct page *page)
 		__lock_folio(folio);
 }
 
+static inline int lock_folio_killable(struct folio *folio)
+{
+	might_sleep();
+	if (!trylock_folio(folio))
+		return __lock_folio_killable(folio);
+	return 0;
+}
+
 /*
 * lock_page_killable is like lock_page but can be interrupted by fatal
 * signals.  It returns 0 if it locked the page and -EINTR if it was
@@ -742,10 +750,7 @@ static inline void lock_page(struct page *page)
 */
 static inline int lock_page_killable(struct page *page)
 {
-	might_sleep();
-	if (!trylock_page(page))
-		return __lock_page_killable(page);
-	return 0;
+	return lock_folio_killable(page_folio(page));
 }
 
 /*
diff --git a/mm/filemap.c b/mm/filemap.c
index 99c05e2c0eea..7cac47db78a5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1546,14 +1546,13 @@ void __lock_folio(struct folio *folio)
 }
 EXPORT_SYMBOL(__lock_folio);
 
-int __lock_page_killable(struct page *__page)
+int __lock_folio_killable(struct folio *folio)
 {
-	struct page *page = compound_head(__page);
-	wait_queue_head_t *q = page_waitqueue(page);
-	return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE,
+	wait_queue_head_t *q = page_waitqueue(&folio->page);
+	return wait_on_page_bit_common(q, &folio->page, PG_locked, TASK_KILLABLE,
 					EXCLUSIVE);
 }
-EXPORT_SYMBOL_GPL(__lock_page_killable);
+EXPORT_SYMBOL_GPL(__lock_folio_killable);
 
 int __lock_page_async(struct page *page, struct wait_page_queue *wait)
 {
@@ -1595,6 +1594,8 @@ int __lock_page_async(struct page *page, struct wait_page_queue *wait)
 int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 			 unsigned int flags)
 {
+	struct folio *folio = page_folio(page);
+
 	if (fault_flag_allow_retry_first(flags)) {
 		/*
 		 * CAUTION! In this case, mmap_lock is not released
@@ -1613,13 +1614,13 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 	if (flags & FAULT_FLAG_KILLABLE) {
 		int ret;
 
-		ret = __lock_page_killable(page);
+		ret = __lock_folio_killable(folio);
 		if (ret) {
 			mmap_read_unlock(mm);
 			return 0;
 		}
 	} else {
-		__lock_folio(page_folio(page));
+		__lock_folio(folio);
 	}
 
 	return 1;
@@ -2781,7 +2782,7 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
 	*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
 	if (vmf->flags & FAULT_FLAG_KILLABLE) {
-		if (__lock_page_killable(&folio->page)) {
+		if (__lock_folio_killable(folio)) {
 			/*
 			 * We didn't have the right flags to drop the mmap_lock,
 			 * but all fault_handlers only check for fatal signals
-- 
2.30.2
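A sketch of direct folio locking with signal handling (the caller is
illustrative; unlock_folio() comes from patch 15 of this series):

        static int example_lock(struct folio *folio)
        {
                int err = lock_folio_killable(folio);

                if (err)
                        return err;     /* -EINTR: fatal signal arrived */
                /* ... operate on the locked folio ... */
                unlock_folio(folio);
                return 0;
        }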
[PATCH v5 15/27] mm/filemap: Add unlock_folio
Convert unlock_page() to call unlock_folio().  By using a folio we avoid
a call to compound_head().  This shortens the function from 39 bytes to
25 and removes 4 instructions on x86-64.  Because we still have
unlock_page(), it's a net increase of 24 bytes of text for the kernel as
a whole, but any path that uses unlock_folio() will execute 4 fewer
instructions.

Signed-off-by: Matthew Wilcox (Oracle)
---
 include/linux/pagemap.h |  3 ++-
 mm/filemap.c            | 27 ++++++++++++---------------
 mm/folio-compat.c       |  6 ++++++
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 90e970f48039..c211868086e0 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -698,7 +698,8 @@ extern int __lock_page_killable(struct page *page);
 extern int __lock_page_async(struct page *page, struct wait_page_queue *wait);
 extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 				unsigned int flags);
-extern void unlock_page(struct page *page);
+void unlock_page(struct page *page);
+void unlock_folio(struct folio *folio);
 void unlock_page_private_2(struct page *page);
 
 /*
diff --git a/mm/filemap.c b/mm/filemap.c
index eeeb8e2cc36a..47ac8126a12e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1435,29 +1435,22 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem
 #endif
 
 /**
- * unlock_page - unlock a locked page
- * @page: the page
+ * unlock_folio - Unlock a locked folio.
+ * @folio: The folio.
 *
- * Unlocks the page and wakes up sleepers in wait_on_page_locked().
- * Also wakes sleepers in wait_on_page_writeback() because the wakeup
- * mechanism between PageLocked pages and PageWriteback pages is shared.
- * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
+ * Unlocks the folio and wakes up any thread sleeping on the page lock.
 *
- * Note that this depends on PG_waiters being the sign bit in the byte
- * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
- * clear the PG_locked bit and test PG_waiters at the same time fairly
- * portably (architectures that do LL/SC can test any bit, while x86 can
- * test the sign bit).
+ * Context: May be called from interrupt or process context.  May not be
+ * called from NMI context.
 */
-void unlock_page(struct page *page)
+void unlock_folio(struct folio *folio)
 {
 	BUILD_BUG_ON(PG_waiters != 7);
-	page = compound_head(page);
-	VM_BUG_ON_PAGE(!PageLocked(page), page);
-	if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
-		wake_up_page_bit(page, PG_locked);
+	VM_BUG_ON_FOLIO(!FolioLocked(folio), folio);
+	if (clear_bit_unlock_is_negative_byte(PG_locked, folio_flags(folio, 0)))
+		wake_up_page_bit(&folio->page, PG_locked);
 }
-EXPORT_SYMBOL(unlock_page);
+EXPORT_SYMBOL(unlock_folio);
 
 /**
 * unlock_page_private_2 - Unlock a page that's locked with PG_private_2
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index 5e107aa30a62..02798abf19a1 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -11,3 +11,9 @@ struct address_space *page_mapping(struct page *page)
 	return folio_mapping(page_folio(page));
 }
 EXPORT_SYMBOL(page_mapping);
+
+void unlock_page(struct page *page)
+{
+	return unlock_folio(page_folio(page));
+}
+EXPORT_SYMBOL(unlock_page);
-- 
2.30.2
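Together with lock_folio() (added by patch 16 of this series), the folio
lock can be taken and released without ever touching compound_head(); an
illustrative pairing:

        static void example_with_folio_locked(struct folio *folio)
        {
                lock_folio(folio);
                /* ... the whole compound page is locked here ... */
                unlock_folio(folio);
        }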
[PATCH v5 13/27] mm/util: Add folio_mapping and folio_file_mapping
These are the folio equivalents of page_mapping() and
page_file_mapping().  Add an out-of-line page_mapping() wrapper around
folio_mapping() in order to prevent the page_folio() call from bloating
every caller of page_mapping().  Adjust page_file_mapping() and
page_mapping_file() to use folios internally.  Rename
__page_file_mapping() to swapcache_mapping() and change it to take a
folio.

This ends up saving 186 bytes of text overall.  folio_mapping() is
45 bytes shorter than page_mapping() was, but the new page_mapping()
wrapper is 30 bytes.  The major reduction is a few bytes less in dozens
of nfs functions (which call page_file_mapping()).  Most of these appear
to be a slight change in gcc's register allocation decisions, which
allow:

	48 8b 56 08	mov    0x8(%rsi),%rdx
	48 8d 42 ff	lea    -0x1(%rdx),%rax
	83 e2 01	and    $0x1,%edx
	48 0f 44 c6	cmove  %rsi,%rax

to become:

	48 8b 46 08	mov    0x8(%rsi),%rax
	48 8d 78 ff	lea    -0x1(%rax),%rdi
	a8 01		test   $0x1,%al
	48 0f 44 fe	cmove  %rsi,%rdi

for a reduction of a single byte.  Once the NFS client is converted to
use folios, this entire sequence will disappear.

Signed-off-by: Matthew Wilcox (Oracle)
---
 include/linux/mm.h      | 14 --------------
 include/linux/pagemap.h | 35 +++++++++++++++++++++++++++++++++--
 include/linux/swap.h    |  6 ++++++
 mm/Makefile             |  2 +-
 mm/folio-compat.c       | 13 +++++++++++++
 mm/swapfile.c           |  8 ++++----
 mm/util.c               | 30 ++++++++++++++++++++++++------
 7 files changed, 75 insertions(+), 33 deletions(-)
 create mode 100644 mm/folio-compat.c

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8fc7b04a1438..bc626c19f9f8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1732,19 +1732,6 @@ void page_address_init(void);
 
 extern void *page_rmapping(struct page *page);
 extern struct anon_vma *page_anon_vma(struct page *page);
-extern struct address_space *page_mapping(struct page *page);
-
-extern struct address_space *__page_file_mapping(struct page *);
-
-static inline
-struct address_space *page_file_mapping(struct page *page)
-{
-	if (unlikely(PageSwapCache(page)))
-		return __page_file_mapping(page);
-
-	return page->mapping;
-}
-
 extern pgoff_t __page_file_index(struct page *page);
 
 /*
@@ -1759,7 +1746,6 @@ static inline pgoff_t page_index(struct page *page)
 }
 
 bool page_mapped(struct page *page);
-struct address_space *page_mapping(struct page *page);
 
 /*
 * Return true only if the page has been allocated with
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index f29c96ed3721..90e970f48039 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -162,14 +162,45 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping)
 
 void release_pages(struct page **pages, int nr);
 
+struct address_space *page_mapping(struct page *);
+struct address_space *folio_mapping(struct folio *);
+struct address_space *swapcache_mapping(struct folio *);
+
+/**
+ * folio_file_mapping - Find the mapping this folio belongs to.
+ * @folio: The folio.
+ *
+ * For folios which are in the page cache, return the mapping that this
+ * page belongs to.  Folios in the swap cache return the mapping of the
+ * swap file or swap device where the data is stored.  This is different
+ * from the mapping returned by folio_mapping().  The only reason to
+ * use it is if, like NFS, you return 0 from ->activate_swapfile.
+ *
+ * Do not call this for folios which aren't in the page cache or swap cache.
+ */
+static inline struct address_space *folio_file_mapping(struct folio *folio)
+{
+	if (unlikely(FolioSwapCache(folio)))
+		return swapcache_mapping(folio);
+
+	return folio->page.mapping;
+}
+
+static inline struct address_space *page_file_mapping(struct page *page)
+{
+	return folio_file_mapping(page_folio(page));
+}
+
 /*
 * For file cache pages, return the address_space, otherwise return NULL
 */
 static inline struct address_space *page_mapping_file(struct page *page)
 {
-	if (unlikely(PageSwapCache(page)))
+	struct folio *folio = page_folio(page);
+
+	if (unlikely(FolioSwapCache(folio)))
 		return NULL;
-	return page_mapping(page);
+	return folio_mapping(folio);
 }
 
 /*
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4c3a844ac9b4..09316a5c33e9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -314,6 +314,12 @@ struct vma_swap_readahead {
 #endif
 };
 
+static inline swp_entry_t folio_swap_entry(struct folio *folio)
+{
+	swp_entry_t entry = { .val = page_private(&folio->page) };
+	return entry;
+}
+
 /* linux/mm/workingset.c */
 void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
 void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg);
diff --git a/mm/Makefile b/mm/M
[PATCH v5 14/27] mm/memcg: Add folio wrappers for various functions
Add new wrapper functions folio_memcg(), lock_folio_memcg(),
unlock_folio_memcg() and mem_cgroup_folio_lruvec().

Signed-off-by: Matthew Wilcox (Oracle)
---
 include/linux/memcontrol.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 4064c9dda534..493136f495b6 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -397,6 +397,11 @@ static inline struct mem_cgroup *page_memcg(struct page *page)
 	return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
 }
 
+static inline struct mem_cgroup *folio_memcg(struct folio *folio)
+{
+	return page_memcg(&folio->page);
+}
+
 /*
 * page_memcg_rcu - locklessly get the memory cgroup associated with a page
 * @page: a pointer to the page struct
@@ -1400,6 +1405,22 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
 }
 #endif /* CONFIG_MEMCG */
 
+static inline void lock_folio_memcg(struct folio *folio)
+{
+	lock_page_memcg(&folio->page);
+}
+
+static inline void unlock_folio_memcg(struct folio *folio)
+{
+	unlock_page_memcg(&folio->page);
+}
+
+static inline struct lruvec *mem_cgroup_folio_lruvec(struct folio *folio,
+						struct pglist_data *pgdat)
+{
+	return mem_cgroup_page_lruvec(&folio->page, pgdat);
+}
+
 static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
 {
 	__mod_lruvec_kmem_state(p, idx, 1);
-- 
2.30.2
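An illustrative use of the wrappers (the function is hypothetical, and
the memcg lock is only needed when the folio's memcg binding must be
kept stable across the lookup):

        static void example_update_stats(struct folio *folio,
                        struct pglist_data *pgdat)
        {
                struct lruvec *lruvec;

                lock_folio_memcg(folio);
                lruvec = mem_cgroup_folio_lruvec(folio, pgdat);
                /* ... update per-lruvec state for the whole folio ... */
                unlock_folio_memcg(folio);
        }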
[PATCH v5 12/27] mm: Add folio_index, folio_file_page and folio_contains
folio_index() is the equivalent of page_index() for folios.
folio_file_page() is the equivalent of find_subpage().
folio_contains() is the equivalent of thp_contains().

Signed-off-by: Matthew Wilcox (Oracle)
---
 include/linux/pagemap.h | 53 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 6676210addf6..f29c96ed3721 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -462,6 +462,59 @@ static inline bool thp_contains(struct page *head, pgoff_t index)
 	return page_index(head) == (index & ~(thp_nr_pages(head) - 1UL));
 }
 
+#define swapcache_index(folio)	__page_file_index(&(folio)->page)
+
+/**
+ * folio_index - File index of a folio.
+ * @folio: The folio.
+ *
+ * For a folio which is either in the page cache or the swap cache,
+ * return its index within the address_space it belongs to.  If you know
+ * the page is definitely in the page cache, you can look at the folio's
+ * index directly.
+ *
+ * Return: The index (offset in units of pages) of a folio in its file.
+ */
+static inline pgoff_t folio_index(struct folio *folio)
+{
+	if (unlikely(FolioSwapCache(folio)))
+		return swapcache_index(folio);
+	return folio->page.index;
+}
+
+/**
+ * folio_file_page - The page for a particular index.
+ * @folio: The folio which contains this index.
+ * @index: The index we want to look up.
+ *
+ * Sometimes after looking up a folio in the page cache, we need to
+ * obtain the specific page for an index (eg a page fault).
+ *
+ * Return: The page containing the file data for this index.
+ */
+static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
+{
+	return &folio->page + (index & (folio_nr_pages(folio) - 1));
+}
+
+/**
+ * folio_contains - Does this folio contain this index?
+ * @folio: The folio.
+ * @index: The page index within the file.
+ *
+ * Context: The caller should have the page locked in order to prevent
+ * (eg) shmem from moving the page between the page cache and swap cache
+ * and changing its index in the middle of the operation.
+ * Return: true or false.
+ */
+static inline bool folio_contains(struct folio *folio, pgoff_t index)
+{
+	/* HugeTLBfs indexes the page cache in units of hpage_size */
+	if (PageHuge(&folio->page))
+		return folio->page.index == index;
+	return index - folio_index(folio) < folio_nr_pages(folio);
+}
+
 /*
 * Given the page we found in the page cache, return the page corresponding
 * to this index in the file
-- 
2.30.2
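A sketch of how the helpers compose in a lookup path (the function is
hypothetical):

        static struct page *example_find_subpage(struct folio *folio,
                        pgoff_t index)
        {
                /* Caller holds the folio lock; see folio_contains() above. */
                if (!folio_contains(folio, index))
                        return NULL;
                return folio_file_page(folio, index);
        }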
[PATCH v5 16/27] mm/filemap: Add lock_folio
This is like lock_page() but for use by callers who know they have a folio.
Convert __lock_page() to be __lock_folio().  This saves one call to
compound_head() per contended call to lock_page().

Saves 362 bytes of text; mostly from improved register allocation and
inlining decisions.  __lock_folio is 59 bytes while __lock_page was 79.

Signed-off-by: Matthew Wilcox (Oracle)
---
 include/linux/pagemap.h | 24 +++++++++++++++++-------
 mm/filemap.c            | 29 +++++++++++++++--------------
 2 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index c211868086e0..c96ba0dfe111 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -693,7 +693,7 @@ static inline bool wake_page_match(struct wait_page_queue *wait_page,
 	return true;
 }
 
-extern void __lock_page(struct page *page);
+void __lock_folio(struct folio *folio);
 extern int __lock_page_killable(struct page *page);
 extern int __lock_page_async(struct page *page, struct wait_page_queue *wait);
 extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
@@ -702,13 +702,24 @@ void unlock_page(struct page *page);
 void unlock_folio(struct folio *folio);
 void unlock_page_private_2(struct page *page);
 
+static inline bool trylock_folio(struct folio *folio)
+{
+	return likely(!test_and_set_bit_lock(PG_locked, folio_flags(folio, 0)));
+}
+
 /*
 * Return true if the page was successfully locked
 */
 static inline int trylock_page(struct page *page)
 {
-	page = compound_head(page);
-	return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
+	return trylock_folio(page_folio(page));
+}
+
+static inline void lock_folio(struct folio *folio)
+{
+	might_sleep();
+	if (!trylock_folio(folio))
+		__lock_folio(folio);
 }
 
 /*
@@ -716,9 +727,12 @@ static inline int trylock_page(struct page *page)
 */
 static inline void lock_page(struct page *page)
 {
+	struct folio *folio;
 	might_sleep();
-	if (!trylock_page(page))
-		__lock_page(page);
+
+	folio = page_folio(page);
+	if (!trylock_folio(folio))
+		__lock_folio(folio);
 }
 
 /*
diff --git a/mm/filemap.c b/mm/filemap.c
index 47ac8126a12e..99c05e2c0eea 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1187,7 +1187,7 @@ static void wake_up_page(struct page *page, int bit)
 */
 enum behavior {
 	EXCLUSIVE,	/* Hold ref to page and take the bit when woken, like
-			 * __lock_page() waiting on then setting PG_locked.
+			 * __lock_folio() waiting on then setting PG_locked.
 			 */
 	SHARED,		/* Hold ref to page and check the bit when woken, like
 			 * wait_on_page_writeback() waiting on PG_writeback.
@@ -1535,17 +1535,16 @@ void page_endio(struct page *page, bool is_write, int err)
 EXPORT_SYMBOL_GPL(page_endio);
 
 /**
- * __lock_page - get a lock on the page, assuming we need to sleep to get it
- * @__page: the page to lock
+ * __lock_folio - Get a lock on the folio, assuming we need to sleep to get it.
+ * @folio: The folio to lock
 */
-void __lock_page(struct page *__page)
+void __lock_folio(struct folio *folio)
 {
-	struct page *page = compound_head(__page);
-	wait_queue_head_t *q = page_waitqueue(page);
-	wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE,
+	wait_queue_head_t *q = page_waitqueue(&folio->page);
+	wait_on_page_bit_common(q, &folio->page, PG_locked, TASK_UNINTERRUPTIBLE,
 				EXCLUSIVE);
 }
-EXPORT_SYMBOL(__lock_page);
+EXPORT_SYMBOL(__lock_folio);
 
 int __lock_page_killable(struct page *__page)
 {
@@ -1620,10 +1619,10 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 			return 0;
 		}
 	} else {
-		__lock_page(page);
+		__lock_folio(page_folio(page));
 	}
 
-	return 1;
+	return 1;
 }
 
 /**
@@ -2767,7 +2766,9 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
 static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
 				     struct file **fpin)
 {
-	if (trylock_page(page))
+	struct folio *folio = page_folio(page);
+
+	if (trylock_folio(folio))
 		return 1;
 
 	/*
@@ -2780,7 +2781,7 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
 	*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
 	if (vmf->flags & FAULT_FLAG_KILLABLE) {
-		if (__lock_page_killable(page)) {
+		if (__lock_page_killable(&folio->page)) {
 			/*
 			 * We didn't have the right flags to drop the mmap_lock,
 			 * but all fault_handlers only check for fatal signals
@@ -2792,11 +2793,11 @@ static int lock_page_m
[PATCH v5 11/27] mm: Handle per-folio private data
Add folio_private() and set_folio_private() which mirror page_private() and set_page_private() -- ie folio private data is the same as page private data. The only difference is that these return a void * instead of an unsigned long, which matches the majority of users. Turn attach_page_private() into attach_folio_private() and reimplement attach_page_private() as a wrapper. No filesystem which uses page private data currently supports compound pages, so we're free to define the rules. attach_page_private() may only be called on a head page; if you want to add private data to a tail page, you can call set_page_private() directly (and shouldn't increment the page refcount! That should be done when adding private data to the head page / folio). This saves 597 bytes of text with the distro-derived config that I'm testing due to removing the calls to compound_head() in get_page() & put_page(). Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/mm_types.h | 16 ++ include/linux/pagemap.h | 48 2 files changed, 45 insertions(+), 19 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4fc0b230d3ea..90086f93e9de 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -278,6 +278,12 @@ static inline atomic_t *compound_pincount_ptr(struct page *page) #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) +/* + * page_private can be used on tail pages. However, PagePrivate is only + * checked by the VM on the head page. So page_private on the tail pages + * should be used for data that's ancillary to the head page (eg attaching + * buffer heads to tail pages after attaching buffer heads to the head page) + */ #define page_private(page) ((page)->private) static inline void set_page_private(struct page *page, unsigned long private) @@ -285,6 +291,16 @@ static inline void set_page_private(struct page *page, unsigned long private) page->private = private; } +static inline void *folio_private(struct folio *folio) +{ + return (void *)folio->page.private; +} + +static inline void set_folio_private(struct folio *folio, void *v) +{ + folio->page.private = (unsigned long)v; +} + struct page_frag_cache { void * va; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 8c844ba67785..6676210addf6 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -260,42 +260,52 @@ static inline int page_cache_add_speculative(struct page *page, int count) } /** - * attach_page_private - Attach private data to a page. - * @page: Page to attach data to. - * @data: Data to attach to page. + * attach_folio_private - Attach private data to a folio. + * @folio: Folio to attach data to. + * @data: Data to attach to folio. * - * Attaching private data to a page increments the page's reference count. - * The data must be detached before the page will be freed. + * Attaching private data to a folio increments the page's reference count. + * The data must be detached before the folio will be freed. */ -static inline void attach_page_private(struct page *page, void *data) +static inline void attach_folio_private(struct folio *folio, void *data) { - get_page(page); - set_page_private(page, (unsigned long)data); - SetPagePrivate(page); + get_folio(folio); + set_folio_private(folio, data); + SetFolioPrivate(folio); } /** - * detach_page_private - Detach private data from a page. - * @page: Page to detach data from. 
+ * detach_folio_private - Detach private data from a folio. + * @folio: Folio to detach data from. * - * Removes the data that was previously attached to the page and decrements + * Removes the data that was previously attached to the folio and decrements * the refcount on the page. * - * Return: Data that was attached to the page. + * Return: Data that was attached to the folio. */ -static inline void *detach_page_private(struct page *page) +static inline void *detach_folio_private(struct folio *folio) { - void *data = (void *)page_private(page); + void *data = folio_private(folio); - if (!PagePrivate(page)) + if (!FolioPrivate(folio)) return NULL; - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); + ClearFolioPrivate(folio); + set_folio_private(folio, NULL); + put_folio(folio); return data; } +static inline void attach_page_private(struct page *page, void *data) +{ + attach_folio_private(page_folio(page), data); +} + +static inline void *detach_page_private(struct page *page) +{ + return detach_folio_private(page_folio(page)); +} + #ifdef CONFIG_NUMA extern struct page *__page_cache_alloc(gfp_t gfp); #else -- 2.30.2
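As a usage illustration, here is a minimal sketch of the calling convention the new helpers are designed for; struct my_fs_private and both functions are invented for this example and are not part of the patch:

struct my_fs_private {
	int nr_buffers;		/* hypothetical per-folio bookkeeping */
};

static void my_fs_attach(struct folio *folio, struct my_fs_private *p)
{
	/* Takes a reference on the folio; balanced by the detach below. */
	attach_folio_private(folio, p);
}

static void my_fs_release(struct folio *folio)
{
	/* Returns NULL if nothing was attached; drops the folio reference. */
	struct my_fs_private *p = detach_folio_private(folio);

	kfree(p);
}

Tail pages of the folio remain free to carry their own page_private() data under the rule described in the mm_types.h comment above.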
[PATCH v5 10/27] mm: Create FolioFlags
These new functions are the folio analogues of the PageFlags functions. If CONFIG_DEBUG_VM_PGFLAGS is enabled, we check the folio is not a tail page at every invocation. Note that this will also catch the PagePoisoned case as a poisoned page has every bit set, which would include PageTail. This saves 1727 bytes of text with the distro-derived config that I'm testing due to removing a double call to compound_head() in PageSwapCache(). Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/page-flags.h | 120 ++--- 1 file changed, 100 insertions(+), 20 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 04a34c08e0a6..ec0e3eb6b85a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -212,6 +212,15 @@ static inline void page_init_poison(struct page *page, size_t size) } #endif +static unsigned long *folio_flags(struct folio *folio, unsigned n) +{ + struct page *page = &folio->page; + + VM_BUG_ON_PGFLAGS(PageTail(page), page); + VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page); + return &page[n].flags; +} + /* * Page flags policies wrt compound pages * @@ -256,34 +265,56 @@ static inline void page_init_poison(struct page *page, size_t size) VM_BUG_ON_PGFLAGS(!PageHead(page), page); \ PF_POISONED_CHECK(&page[1]); }) +/* Which page is the flag stored in */ +#define FOLIO_PF_ANY 0 +#define FOLIO_PF_HEAD 0 +#define FOLIO_PF_ONLY_HEAD 0 +#define FOLIO_PF_NO_TAIL 0 +#define FOLIO_PF_NO_COMPOUND 0 +#define FOLIO_PF_SECOND1 + /* * Macros to create function definitions for page flags */ #define TESTPAGEFLAG(uname, lname, policy) \ +static __always_inline int Folio##uname(struct folio *folio) \ + { return test_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline int Page##uname(struct page *page) \ { return test_bit(PG_##lname, &policy(page, 0)->flags); } #define SETPAGEFLAG(uname, lname, policy) \ +static __always_inline void SetFolio##uname(struct folio *folio) \ + { set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); }\ static __always_inline void SetPage##uname(struct page *page) \ { set_bit(PG_##lname, &policy(page, 1)->flags); } #define CLEARPAGEFLAG(uname, lname, policy)\ +static __always_inline void ClearFolio##uname(struct folio *folio) \ + { clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline void ClearPage##uname(struct page *page) \ { clear_bit(PG_##lname, &policy(page, 1)->flags); } #define __SETPAGEFLAG(uname, lname, policy)\ +static __always_inline void __SetFolio##uname(struct folio *folio) \ + { __set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline void __SetPage##uname(struct page *page) \ { __set_bit(PG_##lname, &policy(page, 1)->flags); } #define __CLEARPAGEFLAG(uname, lname, policy) \ +static __always_inline void __ClearFolio##uname(struct folio *folio) \ + { __clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline void __ClearPage##uname(struct page *page) \ { __clear_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTSETFLAG(uname, lname, policy) \ +static __always_inline int TestSetFolio##uname(struct folio *folio)\ + { return test_and_set_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline int TestSetPage##uname(struct page *page) \ { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTCLEARFLAG(uname, lname, policy)\ +static __always_inline int TestClearFolio##uname(struct folio *folio) \ + { return 
test_and_clear_bit(PG_##lname, folio_flags(folio, FOLIO_##policy)); } \ static __always_inline int TestClearPage##uname(struct page *page) \ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } @@ -302,21 +333,27 @@ static __always_inline int TestClearPage##uname(struct page *page)\ TESTCLEARFLAG(uname, lname, policy) #define TESTPAGEFLAG_FALSE(uname) \ +static inline int Folio##uname(const struct folio *folio) { return 0; } \ static inline int Page##uname(const struct page *page) { return 0; } #define SETPAGEFLAG_NOOP(uname) \ +static inline void SetFolio##uname(struct folio *folio) { }\ static inline void SetPage##uname(struct page *page) { } #define CLEARPAGEFLAG_NOOP(uname)
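To make the expansion concrete: an existing declaration such as PAGEFLAG(Dirty, dirty, PF_HEAD) now emits both families, roughly like this (sketch; __always_inline elided):

static int FolioDirty(struct folio *folio)
{ return test_bit(PG_dirty, folio_flags(folio, FOLIO_PF_HEAD)); }

static int PageDirty(struct page *page)
{ return test_bit(PG_dirty, &PF_HEAD(page, 0)->flags); }

The folio variant indexes the flag word directly through folio_flags(), so there is no hidden compound_head() call on a pointer we already know is not a tail page.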
[PATCH v5 08/27] mm: Add put_folio
If we know we have a folio, we can call put_folio() instead of put_page() and save the overhead of calling compound_head(). Also skips the devmap checks. This commit looks like it should be a no-op, but actually saves 1714 bytes of text with the distro-derived config that I'm testing. Some functions grow a little while others shrink. I presume the compiler is making different inlining decisions. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan --- include/linux/mm.h | 28 +++- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index e176e9c9990f..5052479febc7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1226,9 +1226,28 @@ static inline __must_check bool try_get_page(struct page *page) return true; } +/** + * put_folio - Decrement the reference count on a folio. + * @folio: The folio. + * + * If the folio's reference count reaches zero, the memory will be + * released back to the page allocator and may be used by another + * allocation immediately. Do not access the memory or the struct folio + * after calling put_folio() unless you can be sure that it wasn't the + * last reference. + * + * Context: May be called in process or interrupt context, but not in NMI + * context. May be called while holding a spinlock. + */ +static inline void put_folio(struct folio *folio) +{ + if (put_page_testzero(&folio->page)) + __put_page(&folio->page); +} + static inline void put_page(struct page *page) { - page = compound_head(page); + struct folio *folio = page_folio(page); /* * For devmap managed pages we need to catch refcount transition from @@ -1236,13 +1255,12 @@ static inline void put_page(struct page *page) * need to inform the device driver through callback. See * include/linux/memremap.h and HMM for details. */ - if (page_is_devmap_managed(page)) { - put_devmap_managed_page(page); + if (page_is_devmap_managed(&folio->page)) { + put_devmap_managed_page(&folio->page); return; } - if (put_page_testzero(page)) - __put_page(page); + put_folio(folio); } /* -- 2.30.2
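A hedged sketch of a caller (the function is hypothetical):

static void my_put_folios(struct folio **folios, int nr)
{
	int i;

	/*
	 * These folios are known not to be devmap-managed, so the
	 * lighter put_folio() is safe; it skips both the devmap check
	 * and the head-page lookup that put_page() performs.
	 */
	for (i = 0; i < nr; i++)
		put_folio(folios[i]);
}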
[PATCH v5 09/27] mm: Add get_folio
If we know we have a folio, we can call get_folio() instead of get_page() and save the overhead of calling compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan --- include/linux/mm.h | 26 +- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5052479febc7..8fc7b04a1438 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1198,18 +1198,26 @@ static inline bool is_pci_p2pdma_page(const struct page *page) } /* 127: arbitrary random number, small enough to assemble well */ -#define page_ref_zero_or_close_to_overflow(page) \ - ((unsigned int) page_ref_count(page) + 127u <= 127u) +#define folio_ref_zero_or_close_to_overflow(folio) \ + ((unsigned int) page_ref_count(&folio->page) + 127u <= 127u) + +/** + * get_folio - Increment the reference count on a folio. + * @folio: The folio. + * + * Context: May be called in any context, as long as you know that + * you have a refcount on the folio. If you do not already have one, + * try_grab_page() may be the right interface for you to use. + */ +static inline void get_folio(struct folio *folio) +{ + VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio); + page_ref_inc(&folio->page); +} static inline void get_page(struct page *page) { - page = compound_head(page); - /* -* Getting a normal page or the head of a compound page -* requires to already have an elevated page->_refcount. -*/ - VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page); - page_ref_inc(page); + get_folio(page_folio(page)); } bool __must_check try_grab_page(struct page *page, unsigned int flags); -- 2.30.2
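A hedged example of the pairing with put_folio() from the previous patch (the helper names are invented):

static void my_start_async(struct folio *folio)
{
	/*
	 * The caller must already hold a reference, per the VM_BUG_ON
	 * in get_folio(); we take another for the completion path.
	 */
	get_folio(folio);
	submit_my_async_work(folio);	/* hypothetical; calls put_folio() when done */
}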
[PATCH v5 07/27] mm/debug: Add VM_BUG_ON_FOLIO and VM_WARN_ON_ONCE_FOLIO
These are the folio equivalents of VM_BUG_ON_PAGE and VM_WARN_ON_ONCE_PAGE. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan --- include/linux/mmdebug.h | 20 1 file changed, 20 insertions(+) diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 5d0767cb424a..77d24e1dcaec 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -23,6 +23,13 @@ void dump_mm(const struct mm_struct *mm); BUG(); \ } \ } while (0) +#define VM_BUG_ON_FOLIO(cond, folio) \ + do {\ + if (unlikely(cond)) { \ + dump_page(&folio->page, "VM_BUG_ON_FOLIO(" __stringify(cond)")");\ + BUG(); \ + } \ + } while (0) #define VM_BUG_ON_VMA(cond, vma) \ do {\ if (unlikely(cond)) { \ @@ -48,6 +55,17 @@ void dump_mm(const struct mm_struct *mm); } \ unlikely(__ret_warn_once); \ }) +#define VM_WARN_ON_ONCE_FOLIO(cond, folio) ({ \ + static bool __section(".data.once") __warned; \ + int __ret_warn_once = !!(cond); \ + \ + if (unlikely(__ret_warn_once && !__warned)) { \ + dump_page(&folio->page, "VM_WARN_ON_ONCE_FOLIO(" __stringify(cond)")");\ + __warned = true;\ + WARN_ON(1); \ + } \ + unlikely(__ret_warn_once); \ +}) #define VM_WARN_ON(cond) (void)WARN_ON(cond) #define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) @@ -56,11 +74,13 @@ void dump_mm(const struct mm_struct *mm); #else #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) +#define VM_BUG_ON_FOLIO(cond, folio) VM_BUG_ON(cond) #define VM_BUG_ON_VMA(cond, vma) VM_BUG_ON(cond) #define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond) #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ON_ONCE_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond) #endif -- 2.30.2
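A minimal usage sketch (the condition is invented for illustration):

	VM_BUG_ON_FOLIO(folio_nr_pages(folio) == 0, folio);

On failure this dumps the folio's head page and BUGs; with CONFIG_DEBUG_VM disabled both macros reduce to BUILD_BUG_ON_INVALID(), so the check costs nothing in production builds.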
[PATCH v5 04/27] mm: Introduce struct folio
A struct folio is a new abstraction for a head-or-single page. A function which takes a struct folio argument declares that it will operate on the entire (possibly compound) page, not just PAGE_SIZE bytes. In return, the caller guarantees that the pointer it is passing does not point to a tail page. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/mm.h | 78 include/linux/mm_types.h | 36 +++ 2 files changed, 114 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index cb1e191da319..9b7e3fa12fd3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -934,6 +934,20 @@ static inline unsigned int compound_order(struct page *page) return page[1].compound_order; } +/** + * folio_order - The allocation order of a folio. + * @folio: The folio. + * + * A folio is composed of 2^order pages. See get_order() for the definition + * of order. + * + * Return: The order of the folio. + */ +static inline unsigned int folio_order(struct folio *folio) +{ + return compound_order(&folio->page); +} + static inline bool hpage_pincount_available(struct page *page) { /* @@ -1579,6 +1593,69 @@ static inline void set_page_links(struct page *page, enum zone_type zone, #endif } +/** + * folio_nr_pages - The number of pages in the folio. + * @folio: The folio. + * + * Return: A number which is a power of two. + */ +static inline unsigned long folio_nr_pages(struct folio *folio) +{ + return compound_nr(&folio->page); +} + +/** + * folio_next - Move to the next physical folio. + * @folio: The folio we're currently operating on. + * + * If you have physically contiguous memory which may span more than + * one folio (eg a &struct bio_vec), use this function to move from one + * folio to the next. Do not use it if the memory is only virtually + * contiguous as the folios are almost certainly not adjacent to each + * other. This is the folio equivalent to writing ``page++``. + * + * Context: We assume that the folios are refcounted and/or locked at a + * higher level and do not adjust the reference counts. + * Return: The next struct folio. + */ +static inline struct folio *folio_next(struct folio *folio) +{ +#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) + return (struct folio *)nth_page(&folio->page, folio_nr_pages(folio)); +#else + return folio + folio_nr_pages(folio); +#endif +} + +/** + * folio_shift - The number of bits covered by this folio. + * @folio: The folio. + * + * A folio contains a number of bytes which is a power-of-two in size. + * This function tells you which power-of-two the folio is. + * + * Context: The caller should have a reference on the folio to prevent + * it from being split. It is not necessary for the folio to be locked. + * Return: The base-2 logarithm of the size of this folio. + */ +static inline unsigned int folio_shift(struct folio *folio) +{ + return PAGE_SHIFT + folio_order(folio); +} + +/** + * folio_size - The number of bytes in a folio. + * @folio: The folio. + * + * Context: The caller should have a reference on the folio to prevent + * it from being split. It is not necessary for the folio to be locked. + * Return: The number of bytes in this folio. 
+ */ +static inline size_t folio_size(struct folio *folio) +{ + return PAGE_SIZE << folio_order(folio); +} + /* * Some inline functions in vmstat.h depend on page_zone() */ @@ -1683,6 +1760,7 @@ extern void pagefault_out_of_memory(void); #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) #define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1)) +#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1)) /* * Flags passed to show_mem() and show_free_areas() to suppress output in diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6613b26a8894..4fc0b230d3ea 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -224,6 +224,42 @@ struct page { #endif } _struct_page_alignment; +/** + * struct folio - Represents a contiguous set of bytes. + * @page: Either a base (order-0) page or the head page of a compound page. + * + * A folio is a physically, virtually and logically contiguous set + * of bytes. It is a power-of-two in size, and it is aligned to that + * same power-of-two. If it is found in the page cache, it is at a file + * offset which is a multiple of that power-of-two. It is at least as + * large as PAGE_SIZE. + */ +struct folio { + struct page page; +}; + +/** + * page_folio - Converts from page to folio. + * @page: The page. + * + * Every page is part of a folio. This function cannot be called on a + * NULL pointer. + * + * Context: No reference, nor lock is required on @page. If the caller + * does not hold a reference, this call may race with a folio split, so + * it should re-check the folio still contains this page after gaining + * a reference
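As a worked sketch of the iteration pattern folio_next() exists for (the helper is hypothetical and assumes physically contiguous, already-referenced folios):

static unsigned long my_count_pages(struct folio *folio, int nr_folios)
{
	unsigned long pages = 0;

	while (nr_folios--) {
		pages += folio_nr_pages(folio);
		folio = folio_next(folio);
	}
	return pages;
}

By construction, folio_size(folio) == folio_nr_pages(folio) * PAGE_SIZE and folio_shift(folio) == PAGE_SHIFT + folio_order(folio), so callers can use whichever form reads best.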
[PATCH v5 05/27] mm: Add folio_pgdat and folio_zone
These are just convenience wrappers for callers with folios; pgdat and zone can be reached from tail pages as well as head pages. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan --- include/linux/mm.h | 10 ++ 1 file changed, 10 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9b7e3fa12fd3..e176e9c9990f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1544,6 +1544,16 @@ static inline pg_data_t *page_pgdat(const struct page *page) return NODE_DATA(page_to_nid(page)); } +static inline struct zone *folio_zone(const struct folio *folio) +{ + return page_zone(&folio->page); +} + +static inline pg_data_t *folio_pgdat(const struct folio *folio) +{ + return page_pgdat(&folio->page); +} + #ifdef SECTION_IN_PAGE_FLAGS static inline void set_page_section(struct page *page, unsigned long section) { -- 2.30.2
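A one-line sketch of the kind of caller these enable (the counter is illustrative):

	/* Account all of a folio's pages to its node in one call. */
	__mod_node_page_state(folio_pgdat(folio), NR_FILE_PAGES,
			      folio_nr_pages(folio));

This is exactly the shape the vmstat wrappers in patch 06 package up.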
[PATCH v5 02/27] mm/writeback: Add wait_on_page_writeback_killable
This is the killable version of wait_on_page_writeback. Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/pagemap.h | 1 + mm/page-writeback.c | 16 2 files changed, 17 insertions(+) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 139678f382ff..8c844ba67785 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -698,6 +698,7 @@ static inline int wait_on_page_locked_killable(struct page *page) int put_and_wait_on_page_locked(struct page *page, int state); void wait_on_page_writeback(struct page *page); +int wait_on_page_writeback_killable(struct page *page); extern void end_page_writeback(struct page *page); void wait_for_stable_page(struct page *page); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index f6c2c3165d4d..5e761fb62800 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2830,6 +2830,22 @@ void wait_on_page_writeback(struct page *page) } EXPORT_SYMBOL_GPL(wait_on_page_writeback); +/* + * Wait for a page to complete writeback. Returns -EINTR if we get a + * fatal signal while waiting. + */ +int wait_on_page_writeback_killable(struct page *page) +{ + while (PageWriteback(page)) { + trace_wait_on_page_writeback(page, page_mapping(page)); + if (wait_on_page_bit_killable(page, PG_writeback)) + return -EINTR; + } + + return 0; +} +EXPORT_SYMBOL_GPL(wait_on_page_writeback_killable); + /** * wait_for_stable_page() - wait for writeback to finish, if necessary. * @page: The page to wait on. -- 2.30.2
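The expected calling pattern, mirrored by the afs conversion in patch 03 (sketch):

	if (wait_on_page_writeback_killable(page) < 0)
		return VM_FAULT_RETRY;	/* fatal signal while waiting */

The return value is 0 once writeback has completed and -EINTR if a fatal signal arrives first, so fault handlers can bail out cleanly.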
[PATCH v5 00/27] Memory Folios
Managing memory in 4KiB pages is a serious overhead. Many benchmarks exist which show the benefits of a larger "page size". As an example, an earlier iteration of this idea which used compound pages got a 7% performance boost when compiling the kernel using kernbench without any particular tuning. Using compound pages or THPs exposes a serious weakness in our type system. Functions are often unprepared for compound pages to be passed to them, and may only act on PAGE_SIZE chunks. Even functions which are aware of compound pages may expect a head page, and do the wrong thing if passed a tail page. There have been efforts to label function parameters as 'head' instead of 'page' to indicate that the function expects a head page, but this leaves us with runtime assertions instead of using the compiler to prove that nobody has mistakenly passed a tail page. Calling a struct page 'head' is also inaccurate as they will work perfectly well on base pages. The term 'nottail' has not proven popular. We also waste a lot of instructions ensuring that we're not looking at a tail page. Almost every call to PageFoo() contains one or more hidden calls to compound_head(). This also happens for get_page(), put_page() and many more functions. There does not appear to be a way to tell gcc that it can cache the result of compound_head(), nor is there a way to tell it that compound_head() is idempotent. This series introduces the 'struct folio' as a replacement for head-or-base pages. This initial set reduces the kernel size by approximately 6kB, although its real purpose is adding infrastructure to enable further use of the folio. The intent is to convert all filesystems and some device drivers to work in terms of folios. This series contains a lot of explicit conversions, but it's important to realise it's removing a lot of implicit conversions in some relatively hot paths. There will be very few conversions from folios when this work is completed; filesystems, the page cache, the LRU and so on will generally only deal with folios. I analysed the text size reduction using a config based on Oracle UEK with all modules changed to built-in. That's obviously not a kernel which makes sense to run, but it serves to compare the effects on (many common) filesystems & drivers, not just the core. add/remove: 33645/33632 grow/shrink: 1850/1924 up/down: 894474/-899674 (-5200) Current tree at: https://git.infradead.org/users/willy/pagecache.git/shortlog/refs/heads/folio (contains another ~100 patches on top of this batch, not all of which are in good shape for submission) v5: - Rebase on next-20210319 - Pull out three bug-fix patches to the front of the series, allowing them to be applied earlier. 
- Fix folio_page() against pages being moved between swap & page cache - Fix FolioDoubleMap to use the right page flags - Rename next_folio() to folio_next() (akpm) - Renamed folio stat functions (akpm) - Add 'mod' versions of the folio stats for users that already have 'nr' - Renamed folio_page to folio_file_page() (akpm) - Added kernel-doc for struct folio, folio_next(), folio_index(), folio_file_page(), folio_contains(), folio_order(), folio_nr_pages(), folio_shift(), folio_size(), page_folio(), get_folio(), put_folio() - Make folio_private() work in terms of void * instead of unsigned long - Used page_folio() in attach/detach page_private() (hch) - Drop afs_page_mkwrite folio conversion from this series - Add wait_on_folio_writeback_killable() - Convert add_page_wait_queue() to add_folio_wait_queue() - Add folio_swap_entry() helper - Drop the additions of *FolioFsCache - Simplify the addition of lock_folio_memcg() et al - Drop test_clear_page_writeback() conversion from this series - Add FolioTransHuge() definition - Rename __folio_file_mapping() to swapcache_mapping() - Added swapcache_index() helper - Removed lock_folio_async() - Made __lock_folio_async() static to filemap.c - Converted unlock_page_private_2() to use a folio internally v4: - Rebase on current Linus tree (including swap fix) - Analyse each patch in terms of its effects on kernel text size. A few were modified to improve their effect. In particular, where pushing calls to page_folio() into the callers resulted in unacceptable size increases, the wrapper was placed in mm/folio-compat.c. This lets us see all the places which are good targets for conversion to folios. - Some of the patches were reordered, split or merged in order to make more logical sense. - Use nth_page() for folio_next() if we're using SPARSEMEM and not VMEMMAP (Zi Yan) - Increment and decrement page stats in units of pages instead of units of folios (Zi Yan) v3: - Rebase on next-20210127. Two major sources of conflict, the generic_file_buffered_read refactoring (in akpm tree) and the fscache work (in dh
[PATCH v5 06/27] mm/vmstat: Add functions to account folio statistics
Allow page counters to be more readily modified by callers which have a folio. Name these wrappers with 'stat' instead of 'state' as requested by Linus here: https://lore.kernel.org/linux-mm/CAHk-=wj847sudr-kt+46ft3+xffgiwpgthvm7djwgdi4cvr...@mail.gmail.com/ Signed-off-by: Matthew Wilcox (Oracle) --- include/linux/vmstat.h | 107 + 1 file changed, 107 insertions(+) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 3299cd69e4ca..d287d7c31b8f 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -402,6 +402,78 @@ static inline void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) { } #endif /* CONFIG_SMP */ +static inline void __zone_stat_mod_folio(struct folio *folio, + enum zone_stat_item item, long nr) +{ + __mod_zone_page_state(folio_zone(folio), item, nr); +} + +static inline void __zone_stat_add_folio(struct folio *folio, + enum zone_stat_item item) +{ + __mod_zone_page_state(folio_zone(folio), item, folio_nr_pages(folio)); +} + +static inline void __zone_stat_sub_folio(struct folio *folio, + enum zone_stat_item item) +{ + __mod_zone_page_state(folio_zone(folio), item, -folio_nr_pages(folio)); +} + +static inline void zone_stat_mod_folio(struct folio *folio, + enum zone_stat_item item, long nr) +{ + mod_zone_page_state(folio_zone(folio), item, nr); +} + +static inline void zone_stat_add_folio(struct folio *folio, + enum zone_stat_item item) +{ + mod_zone_page_state(folio_zone(folio), item, folio_nr_pages(folio)); +} + +static inline void zone_stat_sub_folio(struct folio *folio, + enum zone_stat_item item) +{ + mod_zone_page_state(folio_zone(folio), item, -folio_nr_pages(folio)); +} + +static inline void __node_stat_mod_folio(struct folio *folio, + enum node_stat_item item, long nr) +{ + __mod_node_page_state(folio_pgdat(folio), item, nr); +} + +static inline void __node_stat_add_folio(struct folio *folio, + enum node_stat_item item) +{ + __mod_node_page_state(folio_pgdat(folio), item, folio_nr_pages(folio)); +} + +static inline void __node_stat_sub_folio(struct folio *folio, + enum node_stat_item item) +{ + __mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); +} + +static inline void node_stat_mod_folio(struct folio *folio, + enum node_stat_item item, long nr) +{ + mod_node_page_state(folio_pgdat(folio), item, nr); +} + +static inline void node_stat_add_folio(struct folio *folio, + enum node_stat_item item) +{ + mod_node_page_state(folio_pgdat(folio), item, folio_nr_pages(folio)); +} + +static inline void node_stat_sub_folio(struct folio *folio, + enum node_stat_item item) +{ + mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio)); +} + static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, int migratetype) { @@ -530,6 +602,24 @@ static inline void __dec_lruvec_page_state(struct page *page, __mod_lruvec_page_state(page, idx, -1); } +static inline void __lruvec_stat_mod_folio(struct folio *folio, + enum node_stat_item idx, int val) +{ + __mod_lruvec_page_state(&folio->page, idx, val); +} + +static inline void __lruvec_stat_add_folio(struct folio *folio, + enum node_stat_item idx) +{ + __lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio)); +} + +static inline void __lruvec_stat_sub_folio(struct folio *folio, + enum node_stat_item idx) +{ + __lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); +} + static inline void inc_lruvec_page_state(struct page *page, enum node_stat_item idx) { @@ -542,4 +632,21 @@ static inline void dec_lruvec_page_state(struct page *page, 
mod_lruvec_page_state(page, idx, -1); } +static inline void lruvec_stat_mod_folio(struct folio *folio, +enum node_stat_item idx, int val) +{ + mod_lruvec_page_state(&folio->page, idx, val); +} + +static inline void lruvec_stat_add_folio(struct folio *folio, +enum node_stat_item idx) +{ + lruvec_stat_mod_folio(folio, idx, folio_nr_pages(folio)); +} + +static inline void lruvec_stat_sub_folio(struct folio *folio, +enum node_stat_item idx) +{ + lruvec_stat_mod_folio(folio, idx, -folio_nr_pages(folio)); +} #endif /* _LINUX_VMSTAT_H */ -- 2.30.2
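A hedged example of the intended usage (NR_FILE_DIRTY is chosen only for illustration):

	/* Account an entire folio's worth of newly dirtied pages: */
	node_stat_add_folio(folio, NR_FILE_DIRTY);
	/* Equivalent, with the delta spelled out: */
	node_stat_mod_folio(folio, NR_FILE_DIRTY, folio_nr_pages(folio));

The lruvec variants have the same shape but additionally feed the memcg accounting through mod_lruvec_page_state().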
[PATCH v5 01/27] fs/cachefiles: Remove wait_bit_key layout dependency
Cachefiles was relying on wait_page_key and wait_bit_key being the same layout, which is fragile. Now that wait_page_key is exposed in the pagemap.h header, we can remove that fragility Signed-off-by: Matthew Wilcox (Oracle) --- fs/cachefiles/rdwr.c| 7 +++ include/linux/pagemap.h | 1 - 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index e027c718ca01..8ffc40e84a59 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -24,17 +24,16 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode, container_of(wait, struct cachefiles_one_read, monitor); struct cachefiles_object *object; struct fscache_retrieval *op = monitor->op; - struct wait_bit_key *key = _key; + struct wait_page_key *key = _key; struct page *page = wait->private; ASSERT(key); _enter("{%lu},%u,%d,{%p,%u}", monitor->netfs_page->index, mode, sync, - key->flags, key->bit_nr); + key->page, key->bit_nr); - if (key->flags != &page->flags || - key->bit_nr != PG_locked) + if (key->page != page || key->bit_nr != PG_locked) return 0; _debug("--- monitor %p %lx ---", page, page->flags); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index f68fe61c1dec..139678f382ff 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -574,7 +574,6 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma, return pgoff; } -/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */ struct wait_page_key { struct page *page; int bit_nr; -- 2.30.2
[PATCH v5 03/27] afs: Use wait_on_page_writeback_killable
Open-coding this function meant it missed out on the recent bugfix for waiters being woken by a delayed wake event from a previous instantiation of the page. Signed-off-by: Matthew Wilcox (Oracle) --- fs/afs/write.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/afs/write.c b/fs/afs/write.c index b2e03de09c24..106a864b6a93 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -850,8 +850,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) return VM_FAULT_RETRY; #endif - if (PageWriteback(page) && - wait_on_page_bit_killable(page, PG_writeback) < 0) + if (wait_on_page_writeback_killable(page)) return VM_FAULT_RETRY; if (lock_page_killable(page) < 0) -- 2.30.2
[GIT PULL] RISC-V Fixes for 5.12-rc4
The following changes since commit a38fd8748464831584a19438cbb3082b5a2dab15:

  Linux 5.12-rc2 (2021-03-05 17:33:41 -0800)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git tags/riscv-for-linus-5.12-rc4

for you to fetch changes up to a5406a7ff56e63376c210b06072aa0ef23473366:

  riscv: Correct SPARSEMEM configuration (2021-03-16 22:15:21 -0700)

RISC-V Fixes for 5.12-rc4

I have a handful of fixes for 5.12:

* A fix to the SBI remote fence numbers for hypervisor fences, which had
  been transcribed in the wrong order in Linux. These fences are only
  used with the KVM patches applied.
* A whole host of build warnings have been fixed; these should have no
  functional change.
* A fix to init_resources() that prevents an off-by-one error from
  causing an out-of-bounds array reference. This is manifesting during
  boot on vexriscv.
* A fix to ensure the KASAN mappings are visible before proceeding to
  use them.

Alexandre Ghiti (1):
      riscv: Ensure page table writes are flushed when initializing KASAN vmalloc

Colin Ian King (1):
      ftrace: Fix spelling mistake "disabed" -> "disabled"

Damien Le Moal (1):
      riscv: Fix compilation error with Canaan SoC

Geert Uytterhoeven (1):
      RISC-V: Fix out-of-bounds accesses in init_resources()

Heinrich Schuchardt (1):
      RISC-V: correct enum sbi_ext_rfence_fid

Kefeng Wang (1):
      riscv: Correct SPARSEMEM configuration

Nanyong Sun (9):
      riscv: traps: Fix no prototype warnings
      riscv: irq: Fix no prototype warning
      riscv: sbi: Fix comment of __sbi_set_timer_v01
      riscv: ptrace: Fix no prototype warnings
      riscv: time: Fix no prototype for time_init
      riscv: syscall_table: Reduce W=1 compilation warnings noise
      riscv: process: Fix no prototype for show_regs
      riscv: ftrace: Use ftrace_get_regs helper
      riscv: process: Fix no prototype for arch_dup_task_struct

Palmer Dabbelt (1):
      RISC-V: kasan: Declare kasan_shallow_populate() static

kernel test robot (1):
      riscv: fix bugon.cocci warnings

 arch/csky/kernel/probes/ftrace.c        | 2 +-
 arch/riscv/Kconfig                      | 4 ++--
 arch/riscv/Kconfig.socs                 | 2 ++
 arch/riscv/include/asm/asm-prototypes.h | 16
 arch/riscv/include/asm/irq.h            | 2 ++
 arch/riscv/include/asm/processor.h      | 1 +
 arch/riscv/include/asm/ptrace.h         | 5 +
 arch/riscv/include/asm/sbi.h            | 4 ++--
 arch/riscv/include/asm/timex.h          | 2 ++
 arch/riscv/kernel/Makefile              | 1 +
 arch/riscv/kernel/probes/ftrace.c       | 18 ++
 arch/riscv/kernel/probes/kprobes.c      | 3 +--
 arch/riscv/kernel/process.c             | 1 +
 arch/riscv/kernel/sbi.c                 | 2 +-
 arch/riscv/kernel/setup.c               | 3 ++-
 arch/riscv/kernel/time.c                | 1 +
 arch/riscv/kernel/traps.c               | 1 +
 arch/riscv/mm/kasan_init.c              | 4 +++-
 arch/x86/kernel/kprobes/ftrace.c        | 2 +-
 19 files changed, 55 insertions(+), 19 deletions(-)
Re: [PATCH 2/2] usb: dwc3: gadget: Ignore EP queue requests during bus reset
Hi Thinh, On 3/19/2021 7:01 PM, Thinh Nguyen wrote: > Wesley Cheng wrote: >> >> >> On 3/19/2021 5:40 PM, Thinh Nguyen wrote: >>> Hi, >>> >>> Wesley Cheng wrote: The current dwc3_gadget_reset_interrupt() will stop any active transfers, but only addresses blocking of EP queuing for while we are coming from a disconnected scenario, i.e. after receiving the disconnect event. If the host decides to issue a bus reset on the device, the connected parameter will still be set to true, allowing for EP queuing to continue while we are disabling the functions. To avoid this, set the connected flag to false until the stop active transfers is complete. Signed-off-by: Wesley Cheng --- drivers/usb/dwc3/gadget.c | 9 + 1 file changed, 9 insertions(+) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 6e14fdc..d5ed0f69 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -3327,6 +3327,15 @@ static void dwc3_gadget_reset_interrupt(struct dwc3 *dwc) u32 reg; /* + * Ideally, dwc3_reset_gadget() would trigger the function + * drivers to stop any active transfers through ep disable. + * However, for functions which defer ep disable, such as mass + * storage, we will need to rely on the call to stop active + * transfers here, and avoid allowing of request queuing. + */ + dwc->connected = false; + + /* * WORKAROUND: DWC3 revisions <1.88a have an issue which * would cause a missing Disconnect Event if there's a * pending Setup Packet in the FIFO. >>> >>> This doesn't look right. Did you have rebase issue with your local >>> change again? >>> >>> BR, >>> Thinh >>> >> Hi Thinh, >> >> This was rebased on Greg's usb-linus branch, which has commit >> f09ddcfcb8c5 ("usb: dwc3: gadget: Prevent EP queuing while stopping >> transfers") merged. > > Ah I see. > >> >> commit f09ddcfcb8c5 moved the dwc->connected = true to after we have >> finished stop active transfers. However, this change will also ensure >> that the connected flag is set to false to ensure that when we call stop >> active transfers, nothing can prepare TRBs. (previous commit only >> addresses the case where we get the reset interrupt when coming from a >> disconnected state) >> > > That still doesn't address this issue. > > Because: > 1) We're still protected by the spin_lock_irq*(), so this change doesn't > make any difference while handling an event. Thank you for the feedback. So it is true that we lock dwc->lock while handling EP/device events, but what these changes are trying to address is that during dwc3_stop_active_transfers() we will eventually call dwc3_gadget_giveback() to call the complete() functions registered by the function driver. Before we call the complete() callbacks, we unlock dwc->lock, so we are no longer protected, and if there was a pending ep queue from a function driver, that would allow it to acquire the lock and continue preparing the TRBs. > 2) We don't enable the interrupt for END_TRANSFER command completion > when doing dwc3_stop_active_transfers(), the > DWC3_EP_END_TRANSFER_PENDING flag will not be set to prevent preparing > new requests. > Agreed. That is the reason for adding the check to dwc->connected in __dwc3_gadget_ep_queue() if (!dep->endpoint.desc || !dwc->pullups_connected || !dwc->connected) { dev_err(dwc->dev, "%s: can't queue to disabled endpoint\n", dep->name); return -ESHUTDOWN; > We should do dwc->connected = true when we handle connection_done > interrupt instead. The END_TRANSFER command should complete before this. 
>

So how this change will address the issue is:

1. IRQ handler will acquire dwc->lock
2. dwc3_gadget_reset_handler() sets dwc->connected = false
3. Call to dwc3_stop_active_transfers() ---> dwc3_gadget_giveback() releases dwc->lock
4. If there was a pending ep queue (waiting for dwc->lock) it can continue here
5. __dwc3_gadget_ep_queue() exits early due to dwc->connected = false
6. dwc3_gadget_giveback() re-acquires dwc->lock and continues

Thanks
Wesley Cheng

--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project
Re: linux-next: Tree for Mar 19
Hi all,

On Fri, 19 Mar 2021 15:30:31 +0100 Heiko Carstens wrote:
>
> This breaks now on s390 with commit 8ef6f74a3571 ("Rust support").
> make modules_install / depmod now fails with:
>
> depmod: WARNING:
> /.../lib/modules/5.12.0-rc3-1-g8ef6f74a3571/kernel/drivers/s390/scsi/zfcp.ko
> needs unknown symbol
>
> for every module (yes, the line is complete).

Daniel Axtens reported the same breakage on powerpc. I bisected it to the same commit.

More experimentation shows that if you reverse just the change to include/linux/moduleparam.h the above warnings go away. So

-#define MAX_PARAM_PREFIX_LEN (256 - sizeof(unsigned long))
+#define MAX_PARAM_PREFIX_LEN (64 - sizeof(unsigned long))

fixes it up. Not sure what that does to the rust support ...

--
Cheers,
Stephen Rothwell
[tip:auto-latest] BUILD SUCCESS 68644c505bc74972676d4557b060546d4c6e9326
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git auto-latest
branch HEAD: 68644c505bc74972676d4557b060546d4c6e9326  Merge branch 'locking/urgent'

elapsed time: 970m

configs tested: 128
configs skipped: 2

The following configs have been built successfully. More configs may be tested in the coming days.

gcc tested configs:
arm      defconfig
arm64    allyesconfig
arm64    defconfig
arm      allyesconfig
arm      allmodconfig
x86_64   allyesconfig
riscv    allmodconfig
i386     allyesconfig
riscv    allyesconfig
arm      qcom_defconfig
m68k     m5249evb_defconfig
powerpc  pseries_defconfig
powerpc  ppc64e_defconfig
arm      badge4_defconfig
arm      sunxi_defconfig
xtensa   generic_kc705_defconfig
arm      exynos_defconfig
arm      lpd270_defconfig
m68k     allyesconfig
m68k     q40_defconfig
arm      milbeaut_m10v_defconfig
arm      multi_v7_defconfig
sh       edosk7705_defconfig
powerpc  ge_imp3a_defconfig
powerpc  tqm8548_defconfig
arm      at91_dt_defconfig
sh       r7780mp_defconfig
powerpc  ppc6xx_defconfig
mips     rb532_defconfig
openrisc simple_smp_defconfig
riscv    rv32_defconfig
s390     zfcpdump_defconfig
powerpc  eiger_defconfig
sh       se7750_defconfig
mips     decstation_defconfig
mips     qi_lb60_defconfig
mips     pistachio_defconfig
powerpc  mpc837x_rdb_defconfig
sh       landisk_defconfig
parisc   alldefconfig
arc      axs103_smp_defconfig
powerpc  pmac32_defconfig
sh       rts7751r2dplus_defconfig
arm      vt8500_v6_v7_defconfig
m68k     sun3_defconfig
ia64     zx1_defconfig
powerpc  skiroot_defconfig
powerpc  ksi8560_defconfig
sh       magicpanelr2_defconfig
mips     workpad_defconfig
arm      axm55xx_defconfig
arm64    alldefconfig
arm      neponset_defconfig
mips     maltaaprp_defconfig
arm      clps711x_defconfig
powerpc  mpc834x_itxgp_defconfig
sh       allmodconfig
parisc   generic-64bit_defconfig
mips     maltaup_defconfig
ia64     allmodconfig
ia64     defconfig
ia64     allyesconfig
m68k     allmodconfig
m68k     defconfig
nios2    defconfig
arc      allyesconfig
nds32    allnoconfig
nds32    defconfig
nios2    allyesconfig
csky     defconfig
alpha    defconfig
alpha    allyesconfig
xtensa   allyesconfig
h8300    allyesconfig
arc      defconfig
parisc   defconfig
s390     allyesconfig
s390     allmodconfig
parisc   allyesconfig
s390     defconfig
sparc    allyesconfig
sparc    defconfig
i386     tinyconfig
i386     defconfig
mips     allyesconfig
mips     allmodconfig
powerpc  allyesconfig
powerpc  allmodconfig
powerpc  allnoconfig
i386     randconfig-a001-20210318
i386     randconfig-a005-20210318
i386     randconfig-a003-20210318
i386     randconfig-a002-20210318
i386     randconfig-a006-20210318
i386     randconfig-a004-20210318
i386     randconfig-a001-20210319
i386     randconfig-a005-20210319
i386     randconfig-a003-20210319
i386     randconfig-a002-20210319
i386     randconfig-a006-20210319
i386     randc
Re: [PATCH v1 1/2] s390/kvm: split kvm_s390_real_to_abs
On 19/03/2021 20.33, Claudio Imbrenda wrote: A new function _kvm_s390_real_to_abs will apply prefixing to a real address with a given prefix value. The old kvm_s390_real_to_abs becomes now a wrapper around the new function. This is needed to avoid code duplication in vSIE. Cc: sta...@vger.kernel.org Signed-off-by: Claudio Imbrenda --- arch/s390/kvm/gaccess.h | 23 +-- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h index daba10f76936..7c72a5e3449f 100644 --- a/arch/s390/kvm/gaccess.h +++ b/arch/s390/kvm/gaccess.h @@ -18,17 +18,14 @@ /** * kvm_s390_real_to_abs - convert guest real address to guest absolute address - * @vcpu - guest virtual cpu + * @prefix - guest prefix * @gra - guest real address * * Returns the guest absolute address that corresponds to the passed guest real - * address @gra of a virtual guest cpu by applying its prefix. + * address @gra of by applying the given prefix. */ -static inline unsigned long kvm_s390_real_to_abs(struct kvm_vcpu *vcpu, -unsigned long gra) +static inline unsigned long _kvm_s390_real_to_abs(u32 prefix, unsigned long gra) Just a matter of taste, but maybe this could be named differently? kvm_s390_real2abs_prefix() ? kvm_s390_prefix_real_to_abs()? Anyway: Reviewed-by: Thomas Huth
[PATCH] mmc: core: Mark mmc_host device with pm_runtime_no_callbacks
The rpm_resume() will call the parent's resume callback recursively. Since mmc_host has no pm_runtime callbacks of its own, the mmc devices may sometimes fail to resume (-ENOSYS in rpm_callback). Marking the mmc_host device with pm_runtime_no_callbacks fixes the issue. Signed-off-by: kehuanlin --- drivers/mmc/core/host.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index 9b89a91b6b47..177bebd9a6c4 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -15,6 +15,7 @@ #include #include #include +#include <linux/pm_runtime.h> #include #include #include @@ -480,6 +481,7 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev) host->class_dev.class = &mmc_host_class; device_initialize(&host->class_dev); device_enable_async_suspend(&host->class_dev); + pm_runtime_no_callbacks(&host->class_dev); if (mmc_gpio_alloc(host)) { put_device(&host->class_dev); -- 2.30.0
Re: [PATCH] clang-format: Update ColumnLimit
On Fri, 2021-03-19 at 19:48 +0100, Miguel Ojeda wrote:
> On Fri, Mar 19, 2021 at 7:45 PM Ansuel Smith wrote:
> >
> > Sorry, didn't notice that. Considering that checkpatch complains and
> > some reviewers actually state that 100 is the new limit, I think it's
> > time to update the file.
>
> IIUC, 80 is still the soft limit, but 100 is now the hard limit.

80 columns is still the strongly preferred limit.

From coding-style.rst:
---
The preferred limit on the length of a single line is 80 columns.
Statements longer than 80 columns should be broken into sensible chunks,
unless exceeding 80 columns significantly increases readability and does
not hide information.
---

IMO: clang-format is mechanical and, like checkpatch, doesn't have much 'taste'. Ideally, 100 columns would only be used when long identifiers exist, together with some mechanism that determines statement complexity.

Today it's fairly easy to go beyond 80 columns even when a statement is as simple as a = b + c; if the identifier lengths are relatively long. There are many existing identifiers of 25 or more characters, so the trivial statement above, written with identifiers of that length, exceeds 80 columns. So for some things, clang-format (and checkpatch) should allow > 80 column lines for trivial statements like the above. It's not a trivial implementation problem though.
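To make the arithmetic concrete, here is an illustrative statement of the a = b + c shape built from three invented 25-character identifiers; at 25 + 3 + 25 + 3 + 25 + 1 = 82 columns it already exceeds the limit before any indentation:

	file_system_journal_count = file_system_barrier_count + file_system_overrun_count;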
Re: [PATCH 8/9] vfio/pci: export nvlink2 support into vendor vfio_pci drivers
On Fri, 19 Mar 2021 19:59:43 -0300 Jason Gunthorpe wrote: > On Fri, Mar 19, 2021 at 03:08:09PM -0600, Alex Williamson wrote: > > On Fri, 19 Mar 2021 17:07:49 -0300 > > Jason Gunthorpe wrote: > > > > > On Fri, Mar 19, 2021 at 11:36:42AM -0600, Alex Williamson wrote: > > > > On Fri, 19 Mar 2021 17:34:49 +0100 > > > > Christoph Hellwig wrote: > > > > > > > > > On Fri, Mar 19, 2021 at 01:28:48PM -0300, Jason Gunthorpe wrote: > > > > > > The wrinkle I don't yet have an easy answer to is how to load > > > > > > vfio_pci > > > > > > as a universal "default" within the driver core lazy bind scheme and > > > > > > still have working module autoloading... I'm hoping to get some > > > > > > research into this.. > > > > > > > > What about using MODULE_SOFTDEP("pre: ...") in the vfio-pci base > > > > driver, which would load all the known variants in order to influence > > > > the match, and therefore probe ordering? > > > > > > The way the driver core works is to first match against the already > > > loaded driver list, then trigger an event for module loading and when > > > new drivers are registered they bind to unbound devices. > > > > The former is based on id_tables, the latter on MODULE_DEVICE_TABLE, we > > don't have either of those. > > Well, today we don't, but Max here adds id_table's to the special > devices and a MODULE_DEVICE_TABLE would come too if we do the flavours > thing below. I think the id_tables are the wrong approach for IGD and NVLink variants. > My starting thinking is that everything should have these tables and > they should work properly.. id_tables require ongoing maintenance whereas the existing variants require only vendor + device class and some platform feature, like a firmware or fdt table. They're meant to only add extra regions to vfio-pci base support, not extensively modify the device interface. > > As noted to Christoph, the cases where we want a vfio driver to > > bind to anything automatically is the exception. > > I agree vfio should not automatically claim devices, but once vfio is > told to claim a device everything from there after should be > automatic. > > > > One answer is to have userspace udev have the "hook" here and when a > > > vfio flavour mod alias is requested on a PCI device it swaps in > > > vfio_pci if it can't find an alternative. > > > > > > The dream would be a system with no vfio modules loaded could do some > > > > > > echo "vfio" > /sys/bus/pci/xxx/driver_flavour > > > > > > And a module would be loaded and a struct vfio_device is created for > > > that device. Very easy for the user. > > > > This is like switching a device to a parallel universe where we do > > want vfio drivers to bind automatically to devices. > > Yes. > > If we do this I'd probably suggest that driver_override be bumped down > to some user compat and 'vfio > driver_override' would just set the > flavour. > > As-is driver_override seems dangerous as overriding the matching table > could surely allow root userspace to crash the machine. In situations > with trusted boot/signed modules this shouldn't be. When we're dealing with meta-drivers that can bind to anything, we shouldn't rely on the match, but should instead verify the driver is appropriate in the probe callback. Even without driver_override, there's the new_id mechanism. Either method allows the root user to break driver binding. Greg has previously stated something to the effect that users get to keep all the pieces when they break something by manipulating driver binding. 
> > > > If we coupled that with wildcard support in driver_override, ex. > > > > "vfio_pci*", and used consistent module naming, I think we'd only need > > > > to teach userspace about this wildcard and binding to a specific module > > > > would come for free. > > > > > > What would the wildcard do? > > > > It allows a driver_override to match more than one driver, not too > > dissimilar to your driver_flavor above. In this case it would match > > all driver names starting with "vfio_pci". For example if we had: > > > > softdep vfio-pci pre: vfio-pci-foo vfio-pci-bar > > > > Then we'd pre-seed the condition that drivers foo and bar precede the > > base vfio-pci driver, each will match the device to the driver and have > > an opportunity in their probe function to either claim or skip the > > device. Userspace could also set and exact driver_override, for > > example if they want to force using the base vfio-pci driver or go > > directly to a specific variant. > > Okay, I see. The problem is that this makes 'vfio-pci' monolithic, in > normal situations it will load *everything*. > > While that might not seem too bad with these simple drivers, at least > the mlx5 migration driver will have a large dependency tree and pull > in lots of other modules. Even Max's sample from v1 pulls in mlx5_core.ko > and a bunch of other stuff in its orbit. Luckily the mlx5 driver doesn't need to
Re: [PATCH 2/2] mtd: spi-nor: add initial sysfs support
On 2021/3/18 17:24, Michael Walle wrote: > Add support to show the name and JEDEC identifier as well as to dump the > SFDP table. Not all flashes list their SFDP table contents in their > datasheet. So having that is useful. It might also be helpful in bug > reports from users. > > The idea behind the sysfs module is also to have raw access to the SPI > NOR flash device registers, which can also be useful for debugging. Hi Michael, I like the idea of dumping the SFDP data, it will make debugging easier. Should it go in debugfs? We already have debugfs files for partname and partid of the flash. > > Signed-off-by: Michael Walle > --- > drivers/mtd/spi-nor/Makefile | 2 +- > drivers/mtd/spi-nor/core.c | 5 +++ > drivers/mtd/spi-nor/core.h | 3 ++ > drivers/mtd/spi-nor/sysfs.c | 86 > 4 files changed, 95 insertions(+), 1 deletion(-) > create mode 100644 drivers/mtd/spi-nor/sysfs.c > > diff --git a/drivers/mtd/spi-nor/Makefile b/drivers/mtd/spi-nor/Makefile > index 653923896205..aff308f75987 100644 > --- a/drivers/mtd/spi-nor/Makefile > +++ b/drivers/mtd/spi-nor/Makefile > @@ -1,6 +1,6 @@ > # SPDX-License-Identifier: GPL-2.0 > > -spi-nor-objs := core.o sfdp.o > +spi-nor-objs := core.o sfdp.o sysfs.o > spi-nor-objs += atmel.o > spi-nor-objs += catalyst.o > spi-nor-objs += eon.o > diff --git a/drivers/mtd/spi-nor/core.c b/drivers/mtd/spi-nor/core.c > index 4a315cb1c4db..2eaf4ba8c0f3 100644 > --- a/drivers/mtd/spi-nor/core.c > +++ b/drivers/mtd/spi-nor/core.c > @@ -3707,6 +3707,10 @@ static int spi_nor_probe(struct spi_mem *spimem) > if (ret) > return ret; > > + ret = spi_nor_sysfs_create(nor); > + if (ret) > + return ret; > + > return mtd_device_register(&nor->mtd, data ? data->parts : NULL, > data ? data->nr_parts : 0); > } > @@ -3716,6 +3720,7 @@ static int spi_nor_remove(struct spi_mem *spimem) > struct spi_nor *nor = spi_mem_get_drvdata(spimem); > > spi_nor_restore(nor); > + spi_nor_sysfs_remove(nor); > > /* Clean up MTD stuff. */ > return mtd_device_unregister(&nor->mtd); > diff --git a/drivers/mtd/spi-nor/core.h b/drivers/mtd/spi-nor/core.h > index 668f22011b1d..dd592f7b62d1 100644 > --- a/drivers/mtd/spi-nor/core.h > +++ b/drivers/mtd/spi-nor/core.h > @@ -488,4 +488,7 @@ static struct spi_nor __maybe_unused > *mtd_to_spi_nor(struct mtd_info *mtd) > return mtd->priv; > } > > +int spi_nor_sysfs_create(struct spi_nor *nor); > +void spi_nor_sysfs_remove(struct spi_nor *nor); > + > #endif /* __LINUX_MTD_SPI_NOR_INTERNAL_H */ > diff --git a/drivers/mtd/spi-nor/sysfs.c b/drivers/mtd/spi-nor/sysfs.c > new file mode 100644 > index ..0de031e246c5 > --- /dev/null > +++ b/drivers/mtd/spi-nor/sysfs.c > @@ -0,0 +1,86 @@ > +// SPDX-License-Identifier: GPL-2.0 > + > +#include > +#include > +#include > +#include > + > +#include "core.h" > + > +static ssize_t name_show(struct device *dev, > + struct device_attribute *attr, char *buf) > +{ > + struct spi_device *spi = to_spi_device(dev); > + struct spi_mem *spimem = spi_get_drvdata(spi); > + struct spi_nor *nor = spi_mem_get_drvdata(spimem); > + > + return sprintf(buf, "%s\n", nor->info->name); Perhaps sysfs_emit() instead, if we go the sysfs route? As suggested by [1].
[1] Documentation/filesystems/sysfs.rst:line 246 Thanks, Yicong > +} > +static DEVICE_ATTR_RO(name); > + > +static ssize_t jedec_id_show(struct device *dev, > + struct device_attribute *attr, char *buf) > +{ > + struct spi_device *spi = to_spi_device(dev); > + struct spi_mem *spimem = spi_get_drvdata(spi); > + struct spi_nor *nor = spi_mem_get_drvdata(spimem); > + > + return sprintf(buf, "%*phN\n", nor->info->id_len, nor->info->id); > +} > +static DEVICE_ATTR_RO(jedec_id); > + > +static struct attribute *spi_nor_sysfs_entries[] = { > + &dev_attr_name.attr, > + &dev_attr_jedec_id.attr, > + NULL > +}; > + > +static ssize_t sfdp_read(struct file *filp, struct kobject *kobj, > + struct bin_attribute *bin_attr, char *buf, > + loff_t off, size_t count) > +{ > + struct spi_device *spi = to_spi_device(kobj_to_dev(kobj)); > + struct spi_mem *spimem = spi_get_drvdata(spi); > + struct spi_nor *nor = spi_mem_get_drvdata(spimem); > + struct sfdp *sfdp = nor->sfdp; > + size_t sfdp_size = sfdp->num_dwords * sizeof(*sfdp->dwords); > + > + return memory_read_from_buffer(buf, count, &off, nor->sfdp->dwords, > +sfdp_size); > +} > +static BIN_ATTR_RO(sfdp, PAGE_SIZE); > + > +static struct bin_attribute *spi_nor_sysfs_bin_entries[] = { > + &bin_attr_sfdp, > + NULL > +}; > + > +static umode_t spi_nor_sysfs_is_bin_visible(struct kobject *kobj, > +
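For reference, a sketch of the sysfs_emit() conversion suggested above, assuming the attributes stay in sysfs rather than moving to debugfs:

static ssize_t name_show(struct device *dev,
			 struct device_attribute *attr, char *buf)
{
	struct spi_device *spi = to_spi_device(dev);
	struct spi_mem *spimem = spi_get_drvdata(spi);
	struct spi_nor *nor = spi_mem_get_drvdata(spimem);

	/* sysfs_emit() enforces the PAGE_SIZE bound that sprintf() does not. */
	return sysfs_emit(buf, "%s\n", nor->info->name);
}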
[for-stable-4.19 PATCH v2 2/2] lkdtm: don't move ctors to .rodata
From: Mark Rutland commit 3f618ab3323407ee4c6a6734a37eb6e9663ebfb9 upstream. When building with KASAN and LKDTM, clang may implictly generate an asan.module_ctor function in the LKDTM rodata object. The Makefile moves the lkdtm_rodata_do_nothing() function into .rodata by renaming the file's .text section to .rodata, and consequently also moves the ctor function into .rodata, leading to a boot time crash (splat below) when the ctor is invoked by do_ctors(). Let's prevent this by marking the function as noinstr rather than notrace, and renaming the file's .noinstr.text to .rodata. Marking the function as noinstr will prevent tracing and kprobes, and will inhibit any undesireable compiler instrumentation. The ctor function (if any) will be placed in .text and will work correctly. Example splat before this patch is applied: [0.916359] Unable to handle kernel execute from non-executable memory at virtual address a0006b60f5ac [0.922088] Mem abort info: [0.922828] ESR = 0x860e [0.923635] EC = 0x21: IABT (current EL), IL = 32 bits [0.925036] SET = 0, FnV = 0 [0.925838] EA = 0, S1PTW = 0 [0.926714] swapper pgtable: 4k pages, 48-bit VAs, pgdp=427b3000 [0.928489] [a0006b60f5ac] pgd=00023003, p4d=00023003, pud=00023fffe003, pmd=006842000f01 [0.931330] Internal error: Oops: 860e [#1] PREEMPT SMP [0.932806] Modules linked in: [0.933617] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.10.0-rc7 #2 [0.935620] Hardware name: linux,dummy-virt (DT) [0.936924] pstate: 4045 (nZcv daif +PAN -UAO -TCO BTYPE=--) [0.938609] pc : asan.module_ctor+0x0/0x14 [0.939759] lr : do_basic_setup+0x4c/0x70 [0.940889] sp : 27b600177e30 [0.941815] x29: 27b600177e30 x28: [0.943306] x27: x26: [0.944803] x25: x24: [0.946289] x23: 0001 x22: [0.94] x21: a0006bf4a890 x20: a0006befb6c0 [0.949271] x19: a0006bef9358 x18: 0068 [0.950756] x17: fff8 x16: [0.952246] x15: x14: [0.953734] x13: 838a16d5 x12: 0001 [0.955223] x11: 94000da74041 x10: dfffa000 [0.956715] x9 : x8 : a0006b60f5ac [0.958199] x7 : f9f9f9f9f9f9f9f9 x6 : 003f [0.959683] x5 : 0040 x4 : [0.961178] x3 : a0006bdc15a0 x2 : 0005 [0.962662] x1 : 00f9 x0 : a0006bef9350 [0.964155] Call trace: [0.964844] asan.module_ctor+0x0/0x14 [0.965895] kernel_init_freeable+0x158/0x198 [0.967115] kernel_init+0x14/0x19c [0.968104] ret_from_fork+0x10/0x30 [0.969110] Code: 0003 () [0.970815] ---[ end trace b5339784e20d015c ]--- Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Kees Cook Acked-by: Kees Cook Signed-off-by: Mark Rutland Link: https://lore.kernel.org/r/20201207170533.10738-1-mark.rutl...@arm.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Nicolas Boichat --- (no changes since v1) drivers/misc/lkdtm/Makefile | 2 +- drivers/misc/lkdtm/rodata.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/misc/lkdtm/Makefile b/drivers/misc/lkdtm/Makefile index cce47a15a79f..aeb960cb096d 100644 --- a/drivers/misc/lkdtm/Makefile +++ b/drivers/misc/lkdtm/Makefile @@ -13,7 +13,7 @@ KCOV_INSTRUMENT_rodata.o := n OBJCOPYFLAGS := OBJCOPYFLAGS_rodata_objcopy.o := \ - --rename-section .text=.rodata,alloc,readonly,load + --rename-section .noinstr.text=.rodata,alloc,readonly,load targets += rodata.o rodata_objcopy.o $(obj)/rodata_objcopy.o: $(obj)/rodata.o FORCE $(call if_changed,objcopy) diff --git a/drivers/misc/lkdtm/rodata.c b/drivers/misc/lkdtm/rodata.c index 58d180af72cf..baacb876d1d9 100644 --- a/drivers/misc/lkdtm/rodata.c +++ b/drivers/misc/lkdtm/rodata.c @@ -5,7 +5,7 @@ */ #include "lkdtm.h" -void notrace lkdtm_rodata_do_nothing(void) +void noinstr lkdtm_rodata_do_nothing(void) 
{ /* Does nothing. We just want an architecture agnostic "return". */ } -- 2.31.0.rc2.261.g7f71774620-goog
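For context on why the rename trick works, the fix relies purely on
section placement: noinstr (introduced by patch 1/2 of this series,
quoted below) drops the function into .noinstr.text, which the Makefile
can then rename in isolation. A minimal sketch of the contrast (function
names hypothetical):

	/* notrace keeps the function in .text, so the old rename rule
	 * .text=.rodata also swept along anything else the compiler put
	 * in .text -- such as a KASAN-generated module ctor. */
	void notrace old_style_do_nothing(void) { }

	/* noinstr places the function in .noinstr.text, so renaming only
	 * that section to .rodata leaves compiler-generated ctors behind
	 * in .text, where they remain executable. */
	void noinstr new_style_do_nothing(void) { }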
[for-stable-4.19 PATCH v2 1/2] vmlinux.lds.h: Create section for protection against instrumentation
From: Thomas Gleixner

commit 655389433e7efec589838b400a2a652b3ffa upstream.

Some code paths, especially the low level entry code, must be protected
against instrumentation for various reasons:

 - Low level entry code can be a fragile beast, especially on x86.

 - With NO_HZ_FULL RCU state needs to be established before using it.

Having a dedicated section for such code allows validating, with
tooling, that no unsafe functions are invoked.

Add the .noinstr.text section and the noinstr attribute to mark
functions. noinstr implies notrace. Kprobes will gain a section check
later.

Also provide a set of markers: instrumentation_begin()/end()

These are used to mark code inside a noinstr function which calls into
the regular instrumentable text section as safe.

The instrumentation markers are only active when CONFIG_DEBUG_ENTRY is
enabled as the end marker emits a NOP to prevent the compiler from
merging the annotation points. This means the objtool verification
requires a kernel compiled with this option.

Signed-off-by: Thomas Gleixner
Reviewed-by: Alexandre Chartre
Acked-by: Peter Zijlstra
Link: https://lkml.kernel.org/r/20200505134100.075416...@linutronix.de
[Nicolas: Guard noinstr macro in include/linux/compiler_types.h in
__KERNEL__ && !__ASSEMBLY__, otherwise noinstr is expanded in the linker
script construct. Upstream does not have this problem as many macros
were moved by commit 71391bdd2e9a ("include/linux/compiler_types.h:
don't pollute userspace with macro definitions"). We take the minimal
approach here and just guard the new macro. Minor context conflicts in:
  arch/powerpc/kernel/vmlinux.lds.S
  include/asm-generic/vmlinux.lds.h
  include/linux/compiler.h]
Signed-off-by: Nicolas Boichat
---

Technically guarding with !__ASSEMBLY__ should be enough, but there
seems to be no reason to expose this new macro when !__KERNEL__, so
let's just match what upstream does.

Changes in v2:
 - Guard noinstr macro by __KERNEL__ && !__ASSEMBLY__ to prevent
   expansion in linker script and match upstream.

 arch/powerpc/kernel/vmlinux.lds.S | 1 +
 include/asm-generic/sections.h    | 3 ++
 include/asm-generic/vmlinux.lds.h | 10 ++
 include/linux/compiler.h          | 54 +++
 include/linux/compiler_types.h    | 6
 scripts/mod/modpost.c             | 2 +-
 6 files changed, 75 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 695432965f20..9b346f3d2814 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -99,6 +99,7 @@ SECTIONS
 #endif
 		/* careful! __ftr_alt_* sections need to be close to .text */
 		*(.text.hot TEXT_MAIN .text.fixup .text.unlikely .fixup __ftr_alt_* .ref.text);
+		NOINSTR_TEXT
 		SCHED_TEXT
 		CPUIDLE_TEXT
 		LOCK_TEXT
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index 849cd8eb5ca0..ea5987bb0b84 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -53,6 +53,9 @@ extern char __ctors_start[], __ctors_end[];
 /* Start and end of .opd section - used for function descriptors. */
 extern char __start_opd[], __end_opd[];
+/* Start and end of instrumentation protected text section */
+extern char __noinstr_text_start[], __noinstr_text_end[];
+
 extern __visible const void __nosave_begin, __nosave_end;
 
 /* Function descriptor handling (if any).  Override in asm/sections.h */
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 2d632a74cc5e..88484ee023ca 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -482,6 +482,15 @@
 		__security_initcall_end = .;			\
 	}
 
+/*
+ * Non-instrumentable text section
+ */
+#define NOINSTR_TEXT					\
+		ALIGN_FUNCTION();			\
+		__noinstr_text_start = .;		\
+		*(.noinstr.text)			\
+		__noinstr_text_end = .;
+
 /*
  * .text section. Map to function alignment to avoid address changes
  * during second ld run in second ld pass when generating System.map
@@ -496,6 +505,7 @@
 		*(TEXT_MAIN .text.fixup)		\
 		*(.text.unlikely .text.unlikely.*)	\
 		*(.text.unknown .text.unknown.*)	\
+		NOINSTR_TEXT				\
 		*(.text..refcount)			\
 		*(.ref.text)				\
 	MEM_KEEP(init.text*)				\
diff --git a/include/linux/compiler.h b/include/linux/compiler
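Since the compiler.h diff is truncated above, a simplified sketch of the
two pieces the commit message describes may help: roughly what the new
attribute expands to, and how the markers are meant to be used. The
exact upstream expansion differs slightly and also carries the
__KERNEL__ && !__ASSEMBLY__ guard discussed in the backport note; the
function below is hypothetical.

	/* Sketch only -- see include/linux/compiler_types.h for the
	 * real definition: */
	#define noinstr \
		__attribute__((noinline)) notrace \
		__attribute__((section(".noinstr.text")))

	/* Intended usage pattern, per the commit message: */
	noinstr void fragile_entry_work(void)
	{
		/* code here must not be instrumented */
		instrumentation_begin();
		/* calls into regular, instrumentable text are safe here */
		instrumentation_end();
		/* back to non-instrumentable work */
	}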
[for-stable-4.19 PATCH v2 0/2] Backport patches to fix KASAN+LKDTM with recent clang on ARM64
Backport 2 patches that are required to make KASAN+LKDTM work with recent clang (patch 2/2 has a complete description). Tested on our chromeos-4.19 branch. Also compile tested on x86-64 and arm64 with gcc this time around. Patch 1/2 adds a guard around noinstr that matches upstream, to prevent a build issue, and has some minor context conflicts. Patch 2/2 is a clean backport. These patches have been merged to 5.4 stable already. We might need to backport to older stable branches, but this is what I could test for now. Changes in v2: - Guard noinstr macro by __KERNEL__ && !__ASSEMBLY__ to prevent expansion in linker script and match upstream. Mark Rutland (1): lkdtm: don't move ctors to .rodata Thomas Gleixner (1): vmlinux.lds.h: Create section for protection against instrumentation arch/powerpc/kernel/vmlinux.lds.S | 1 + drivers/misc/lkdtm/Makefile | 2 +- drivers/misc/lkdtm/rodata.c | 2 +- include/asm-generic/sections.h| 3 ++ include/asm-generic/vmlinux.lds.h | 10 ++ include/linux/compiler.h | 54 +++ include/linux/compiler_types.h| 6 scripts/mod/modpost.c | 2 +- 8 files changed, 77 insertions(+), 3 deletions(-) -- 2.31.0.rc2.261.g7f71774620-goog
Re: [PATCH 07/10] mm/vmscan: add helper for querying ability to age anonymous pages
Dave Hansen wrote: > From: Dave Hansen > > Anonymous pages are kept on their own LRU(s). These lists could > theoretically always be scanned and maintained. But, without swap, > there is currently nothing the kernel can *do* with the results of a > scanned, sorted LRU for anonymous pages. > > A check for '!total_swap_pages' currently serves as a valid check as > to whether anonymous LRUs should be maintained. However, another > method will be added shortly: page demotion. > > Abstract out the 'total_swap_pages' checks into a helper, give it a > logically significant name, and check for the possibility of page > demotion. Reviewed-by: Greg Thelen > Signed-off-by: Dave Hansen > Cc: David Rientjes > Cc: Huang Ying > Cc: Dan Williams > Cc: David Hildenbrand > Cc: osalvador > --- > > b/mm/vmscan.c | 28 +--- > 1 file changed, 25 insertions(+), 3 deletions(-) > > diff -puN mm/vmscan.c~mm-vmscan-anon-can-be-aged mm/vmscan.c > --- a/mm/vmscan.c~mm-vmscan-anon-can-be-aged 2021-03-04 15:35:58.935806422 > -0800 > +++ b/mm/vmscan.c 2021-03-04 15:35:58.942806422 -0800 > @@ -2517,6 +2517,26 @@ out: > } > } > > +/* > + * Anonymous LRU management is a waste if there is > + * ultimately no way to reclaim the memory. > + */ > +bool anon_should_be_aged(struct lruvec *lruvec) Should this be static? > +{ > + struct pglist_data *pgdat = lruvec_pgdat(lruvec); > + > + /* Aging the anon LRU is valuable if swap is present: */ > + if (total_swap_pages > 0) > + return true; > + > + /* Also valuable if anon pages can be demoted: */ > + if (next_demotion_node(pgdat->node_id) >= 0) > + return true; > + > + /* No way to reclaim anon pages. Should not age anon LRUs: */ > + return false; > +} > + > static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) > { > unsigned long nr[NR_LRU_LISTS]; > @@ -2626,7 +2646,8 @@ static void shrink_lruvec(struct lruvec >* Even if we did not try to evict anon pages at all, we want to >* rebalance the anon lru active/inactive ratio. >*/ > - if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON)) > + if (anon_should_be_aged(lruvec) && > + inactive_is_low(lruvec, LRU_INACTIVE_ANON)) > shrink_active_list(SWAP_CLUSTER_MAX, lruvec, > sc, LRU_ACTIVE_ANON); > } > @@ -3455,10 +3476,11 @@ static void age_active_anon(struct pglis > struct mem_cgroup *memcg; > struct lruvec *lruvec; > > - if (!total_swap_pages) > + lruvec = mem_cgroup_lruvec(NULL, pgdat); > + > + if (!anon_should_be_aged(lruvec)) > return; > > - lruvec = mem_cgroup_lruvec(NULL, pgdat); > if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON)) > return; > > _
Re: [PATCH 10/10] ide: remove the legacy ide driver
On Sat, 20 Mar 2021, Maciej W. Rozycki wrote: > > been scheduled for removal for a while. Finally kill it off so that we > > can start cleaning up various bits of cruft it forced on the block layer. > > You need to adjust Documentation/admin-guide/kernel-parameters.txt too, > i.e. remove all the `ide*' options, possibly more (I haven't checked in > detail). And also Documentation/ide/ide.rst. Also do we have all the necessary `hdparm' features supported in libata nowadays for PATA devices? Maciej
Re: [PATCH 10/10] ide: remove the legacy ide driver
On Thu, 18 Mar 2021, Christoph Hellwig wrote:

> The legay ide driver has been replace with libata startin in 2003 and has

 s/legay/legacy/;s/replace/replaced/;s/startin/starting/ (though I'd say
"back in" instead in the latter case).

> been scheduled for removal for a while. Finally kill it off so that we
> can start cleaning up various bits of cruft it forced on the block layer.

You need to adjust Documentation/admin-guide/kernel-parameters.txt too,
i.e. remove all the `ide*' options, possibly more (I haven't checked in
detail).

  Maciej
Re: [PATCH] Input: ims-pcu - drop redundant driver-data assignment
On Thu, Mar 18, 2021 at 04:55:25PM +0100, Johan Hovold wrote: > The driver data for the data interface has already been set by > usb_driver_claim_interface() so drop the subsequent redundant > assignment. > > Signed-off-by: Johan Hovold Applied, thank you. -- Dmitry
Re: [PATCH] mm: page_alloc: fix memcg accounting leak in speculative cache lookup
On Fri, Mar 19, 2021 at 06:52:58PM -0700, Hugh Dickins wrote: > > + /* > > +* Drop the base reference from __alloc_pages and free. In > > +* case there is an outstanding speculative reference, from > > +* e.g. the page cache, it will put and free the page later. > > +*/ > > + if (likely(put_page_testzero(page))) { > > free_the_page(page, order); > > - else if (!PageHead(page)) > > + return; > > + } > > + > > + /* > > +* The speculative reference will put and free the page. > > +* > > +* However, if the speculation was into a higher-order page > > +* chunk that isn't marked compound, the other side will know > > +* nothing about our buddy pages and only free the order-0 > > +* page at the start of our chunk! We must split off and free > > +* the buddy pages here. > > +* > > +* The buddy pages aren't individually refcounted, so they > > +* can't have any pending speculative references themselves. > > +*/ > > + if (!PageHead(page) && order > 0) { > > The put_page_testzero() has released our reference to the first > subpage of page: it's now under the control of the racing speculative > lookup. So it seems to me unsafe to be checking PageHead(page) here: > if it was actually a compound page, PageHead might already be cleared > by now, and we doubly free its tail pages below? I think we need to > use a "bool compound = PageHead(page)" on entry to __free_pages(). > > Or alternatively, it's wrong to call __free_pages() on a compound > page anyway, so we should not check PageHead at all, except in a > WARN_ON_ONCE(PageCompound(page)) at the start? Alas ... $ git grep '__free_pages\>.*compound' drivers/dma-buf/heaps/system_heap.c:__free_pages(page, compound_order(page)); drivers/dma-buf/heaps/system_heap.c:__free_pages(p, compound_order(p)); drivers/dma-buf/heaps/system_heap.c:__free_pages(page, compound_order(page)); mm/huge_memory.c: __free_pages(zero_page, compound_order(zero_page)); mm/huge_memory.c: __free_pages(zero_page, compound_order(zero_page)); mm/slub.c: __free_pages(page, compound_order(page)); Maybe we should disallow it! There are a few other places to check: $ grep -l __GFP_COMP $(git grep -lw __free_pages) | wc -l 24 (assuming the pages are allocated and freed in the same file, which is a reasonable approximation, but not guaranteed to catch everything. Many of these 24 will be false positives, of course.)
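A minimal sketch of the "bool compound" suggestion above (illustrative
only, not necessarily the committed fix): latch the compound state
before dropping the reference, since once put_page_testzero() fails the
racing speculative lookup owns the page and PG_head may change under us.

	void __free_pages(struct page *page, unsigned int order)
	{
		/* Sample before the put; afterwards the speculative
		 * reference holder controls the page. */
		bool compound = PageHead(page);

		if (put_page_testzero(page))
			free_the_page(page, order);
		else if (!compound && order > 0)
			/* Split off and free the buddy pages the other
			 * side knows nothing about; they carry no
			 * refcounts of their own. */
			while (order-- > 0)
				free_the_page(page + (1UL << order), order);
	}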
[PATCH v2 5/5] iommu/vt-d: Avoid unnecessary cache flush in pasid entry teardown
When a present pasid entry is disassembled, all kinds of pasid related caches need to be flushed. But when a pasid entry is not being used (PRESENT bit not set), we don't need to do this. Check the PRESENT bit in intel_pasid_tear_down_entry() and avoid flushing caches if it's not set. Signed-off-by: Lu Baolu --- drivers/iommu/intel/pasid.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index dd69df5a188a..7a73385edcc0 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -502,6 +502,9 @@ void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, if (WARN_ON(!pte)) return; + if (!(pte->val[0] & PASID_PTE_PRESENT)) + return; + did = pasid_get_domain_id(pte); intel_pasid_clear_entry(dev, pasid, fault_ignore); -- 2.25.1
[PATCH v2 3/5] iommu/vt-d: Invalidate PASID cache when root/context entry changed
When the Intel IOMMU is operating in the scalable mode, some information from the root and context table may be used to tag entries in the PASID cache. Software should invalidate the PASID-cache when changing root or context table entries. Suggested-by: Ashok Raj Fixes: 7373a8cc38197 ("iommu/vt-d: Setup context and enable RID2PASID support") Signed-off-by: Lu Baolu --- drivers/iommu/intel/iommu.c | 18 +- include/linux/intel-iommu.h | 1 + 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 132cbf9f214f..868f195f55ff 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1339,6 +1339,11 @@ static void iommu_set_root_entry(struct intel_iommu *iommu) readl, (sts & DMA_GSTS_RTPS), sts); raw_spin_unlock_irqrestore(&iommu->register_lock, flag); + + iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); + if (sm_supported(iommu)) + qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); } void iommu_flush_write_buffer(struct intel_iommu *iommu) @@ -2422,6 +2427,10 @@ static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL); + + if (sm_supported(iommu)) + qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0); + iommu->flush.flush_iotlb(iommu, did_old, 0, @@ -3267,8 +3276,6 @@ static int __init init_dmars(void) register_pasid_allocator(iommu); #endif iommu_set_root_entry(iommu); - iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); - iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); } #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA @@ -3458,12 +3465,7 @@ static int init_iommu_hw(void) } iommu_flush_write_buffer(iommu); - iommu_set_root_entry(iommu); - - iommu->flush.flush_context(iommu, 0, 0, 0, - DMA_CCMD_GLOBAL_INVL); - iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); iommu_enable_translation(iommu); iommu_disable_protect_mem_regions(iommu); } @@ -3846,8 +3848,6 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) goto disable_iommu; iommu_set_root_entry(iommu); - iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); - iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); iommu_enable_translation(iommu); iommu_disable_protect_mem_regions(iommu); diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index 1732298ce888..76f974da8ca4 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -378,6 +378,7 @@ enum { /* PASID cache invalidation granu */ #define QI_PC_ALL_PASIDS 0 #define QI_PC_PASID_SEL1 +#define QI_PC_GLOBAL 3 #define QI_EIOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK) #define QI_EIOTLB_IH(ih) (((u64)ih) << 6) -- 2.25.1
[PATCH v2 4/5] iommu/vt-d: Use user privilege for RID2PASID translation
When first-level page tables are used for IOVA translation, we use user privilege by setting U/S bit in the page table entry. This is to make it consistent with the second level translation, where the U/S enforcement is not available. Clear the SRE (Supervisor Request Enable) field in the pasid table entry of RID2PASID so that requests requesting the supervisor privilege are blocked and treated as DMA remapping faults. Suggested-by: Jacob Pan Fixes: b802d070a52a1 ("iommu/vt-d: Use iova over first level") Signed-off-by: Lu Baolu --- drivers/iommu/intel/iommu.c | 7 +-- drivers/iommu/intel/pasid.c | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 868f195f55ff..7354f9ce47d8 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -2494,9 +2494,9 @@ static int domain_setup_first_level(struct intel_iommu *iommu, struct device *dev, u32 pasid) { - int flags = PASID_FLAG_SUPERVISOR_MODE; struct dma_pte *pgd = domain->pgd; int agaw, level; + int flags = 0; /* * Skip top levels of page tables for iommu which has @@ -2512,7 +2512,10 @@ static int domain_setup_first_level(struct intel_iommu *iommu, if (level != 4 && level != 5) return -EINVAL; - flags |= (level == 5) ? PASID_FLAG_FL5LP : 0; + if (pasid != PASID_RID2PASID) + flags |= PASID_FLAG_SUPERVISOR_MODE; + if (level == 5) + flags |= PASID_FLAG_FL5LP; return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid, domain->iommu_did[iommu->seq_id], diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 0bf7e0a76890..dd69df5a188a 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -673,7 +673,8 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, * Since it is a second level only translation setup, we should * set SRE bit as well (addresses are expected to be GPAs). */ - pasid_set_sre(pte); + if (pasid != PASID_RID2PASID) + pasid_set_sre(pte); pasid_set_present(pte); pasid_flush_caches(iommu, pte, pasid, did); -- 2.25.1
[PATCH v2 0/5] iommu/vt-d: Several misc fixes
Hi Joerg, This series includes some misc fixes for the VT-d iommu driver. Please help to review and merge. Best regards, baolu Change log: v1->v2: - v1: https://lore.kernel.org/linux-iommu/20210225062654.2864322-1-baolu...@linux.intel.com/ - [PATCH 2/5] iommu/vt-d: Remove WO permissions on second-level paging entries - Refine the commit message to make the intention clear. Lu Baolu (5): iommu/vt-d: Report the right page fault address iommu/vt-d: Remove WO permissions on second-level paging entries iommu/vt-d: Invalidate PASID cache when root/context entry changed iommu/vt-d: Use user privilege for RID2PASID translation iommu/vt-d: Avoid unnecessary cache flush in pasid entry teardown drivers/iommu/intel/iommu.c | 28 drivers/iommu/intel/pasid.c | 6 +- drivers/iommu/intel/svm.c | 2 +- include/linux/intel-iommu.h | 1 + 4 files changed, 23 insertions(+), 14 deletions(-) -- 2.25.1
[PATCH v2 1/5] iommu/vt-d: Report the right page fault address
The Address field of the Page Request Descriptor only keeps bit [63:12] of the offending address. Convert it to a full address before reporting it to device drivers. Fixes: eb8d93ea3c1d3 ("iommu/vt-d: Report page request faults for guest SVA") Signed-off-by: Lu Baolu --- drivers/iommu/intel/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 54db58945c2d..677d7f6b43bb 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -862,7 +862,7 @@ intel_svm_prq_report(struct device *dev, struct page_req_dsc *desc) /* Fill in event data for device specific processing */ memset(&event, 0, sizeof(struct iommu_fault_event)); event.fault.type = IOMMU_FAULT_PAGE_REQ; - event.fault.prm.addr = desc->addr; + event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT; event.fault.prm.pasid = desc->pasid; event.fault.prm.grpid = desc->prg_index; event.fault.prm.perm = prq_to_iommu_prot(desc); -- 2.25.1
[PATCH v2 2/5] iommu/vt-d: Remove WO permissions on second-level paging entries
When the first level page table is used for IOVA translation, it only
supports Read-Only and Read-Write permissions. The Write-Only permission
is not supported as the PRESENT bit (implying Read permission) should
always be set. When using second level, we still allow separate
permissions, including Write-Only, which seems inconsistent and awkward.
We want to have consistent behavior. After moving to 1st level, we don't
want things to work sometimes, and break if we use 2nd level for the
same mappings. Hence remove this configuration.

Suggested-by: Ashok Raj
Fixes: b802d070a52a1 ("iommu/vt-d: Use iova over first level")
Signed-off-by: Lu Baolu
---
 drivers/iommu/intel/iommu.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 167219ea8d70..132cbf9f214f 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -2304,8 +2304,9 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
 		return -EINVAL;
 
 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
+	attr |= DMA_FL_PTE_PRESENT;
 	if (domain_use_first_level(domain)) {
-		attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
+		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
 
 		if (domain->domain.type == IOMMU_DOMAIN_DMA) {
 			attr |= DMA_FL_PTE_ACCESS;
-- 2.25.1
[PATCH v2 1/1] iommu/vt-d: Don't set then clear private data in prq_event_thread()
The VT-d specification (section 7.6) requires that the value in the Private Data field of a Page Group Response Descriptor must match the value in the Private Data field of the respective Page Request Descriptor. The private data field of a page group response descriptor is set then immediately cleared in prq_event_thread(). This breaks the rule defined by the VT-d specification. Fix it by moving clearing code up. Fixes: 5b438f4ba315d ("iommu/vt-d: Support page request in scalable mode") Cc: Jacob Pan Reviewed-by: Liu Yi L Signed-off-by: Lu Baolu --- drivers/iommu/intel/svm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) Log: v1->v2: - v1: https://lore.kernel.org/linux-iommu/20210309004641.3809653-1-baolu...@linux.intel.com/ - Refine the commit title to make the affected field clear. - Refine the commit message to declare why the change matters. diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 677d7f6b43bb..5d590d63ab52 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -1034,12 +1034,12 @@ static irqreturn_t prq_event_thread(int irq, void *d) QI_PGRP_RESP_TYPE; resp.qw1 = QI_PGRP_IDX(req->prg_index) | QI_PGRP_LPIG(req->lpig); + resp.qw2 = 0; + resp.qw3 = 0; if (req->priv_data_present) memcpy(&resp.qw2, req->priv_data, sizeof(req->priv_data)); - resp.qw2 = 0; - resp.qw3 = 0; qi_submit_sync(iommu, &resp, 1, 0); } prq_advance: -- 2.25.1
Re: [PATCH 2/3] mm, dax, pmem: Introduce dev_pagemap_failure()
On Fri, Mar 19, 2021 at 6:47 PM Dave Chinner wrote: [..] > > Now I'm trying to reconcile the fact that platform > > poison handling will hit memory_failure() first and may not > > immediately reach the driver, if ever (see the perennially awkward > > firmware-first-mode error handling: ghes_handle_memory_failure()) . So > > even if the ->memory_failure(dev...) up call exists there is no > > guarantee it can get called for all poison before the memory_failure() > > down call happens. Which means regardless of whether > > ->memory_failure(dev...) exists memory_failure() needs to be able to > > do the right thing. > > I don't see how a poor implementation of memory_failure in a driver > or hardware is even remotely relevant to the interface used to > notify the filesystem of a media or device failure. It sounds like > you are trying to use memory_failure() for something it was never > intended to support and that there's a bunch of driver and > infrastructure work needed to make it work how you want it to work. > And even then it may not work the way we want it to work > > > Combine that with the fact that new buses like CXL might be configured > > in "poison on decode error" mode which means that a memory_failure() > > storm can happen regardless of whether the driver initiates it > > programatically. > > Straw man argument. > > "We can't make this interface a ranged notification because the > hardware might only be able to do page-by-page notification." No, it's "we can't make this interface notify the filesytem that sectors have failed before the memory_failure() (ranged or not) has communicated that pfns have failed." memory_failure() today is the first and sometimes only interface that learns of pfn failures. > > You can do page-by-page notification with a range based interface. > We are talking about how to efficiently and reliably inform the > filesystem that a range of a device is no longer accessible and so > it needs to revoke all mappings over that range of it's address > space. That does not need to be a single page at a time interface. > > If your hardware is configured to do stupid things, then that is not > the fault of the software interface used to communicate faults up > the stack, nor is it something that the notfication interface should > try to fix or mitigate. > > > How about a mechanism to optionally let a filesystem take over memory > > failure handling for a range of pfns that the memory_failure() can > > consult to fail ranges at a time rather than one by one? So a new > > 'struct dax_operations' op (void) (*memory_failure_register(struct > > dax_device *, void *data). Where any agent that claims a dax_dev can > > register to take over memory_failure() handling for any event that > > happens in that range. This would be routed through device-mapper like > > any other 'struct dax_operations' op. I think that meets your > > requirement to get notifications of all the events you want to handle, > > but still allows memory_failure() to be the last resort for everything > > that has not opted into this error handling. > > Which is basically the same as the proposed ->corrupted_range stack, > except it doesn't map the pfns back to LBA addresses the filesystem > needs to make sense of the failure. > > fs-dax filesystems have no clue what pfns are, or how to translate > them to LBAs in their block device address space that the map > everything to. 
The fs-dax infrastructure asks the filesystem for > bdev/sector based mappings, and internally converts them to pfns by > a combination of bdev and daxdev callouts. Hence fs-dax filesystems > never see nor interpret pfns at all. Nor do they have the > capability to convert a PFN to a LBA address. And neither the > underlying block device nor the associated DAX device provide a > method for doing this reverse mapping translation. True. > > So if we have an interface that hands a {daxdev,PFN,len} tuple to > the filesystem, exactly what is the filesystem supposed to do with > it? How do we turn that back into a {bdev,sector,len} tuple so we > can do reverse mapping lookups to find the filesystem objects that > allocated within the notified range? > > I'll point out again that these address space translations were > something that the ->corrupted_range callbacks handled directly - no > layer in the stack was handed a range that it didn't know how to map > to it's own internal structures. By the time it got to the > filesystem, it was a {bdev,sector,len} tuple, and the filesystem > could feed that directly to it's reverse mapping lookups > > Maybe I'm missing something magical about ->memory_failure that does > all this translation for us, but I don't see it in this patchset. I > just don't see how this proposed interface is a usable at the > filesystem level as it stands. So then it's not the filesystem that needs to register for memory_failure() it's the driver in order to translate the failed LBAs up the stack. However, memory_fai
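To make the quoted proposal concrete, a sketch of the op as described
(hypothetical API, nothing merged; how the notified pfns would be
translated back to LBAs is exactly what remains unresolved above):

	struct dax_operations {
		/* ... existing ops elided ... */

		/* An agent that claims a dax_device registers to take
		 * over memory_failure() handling for events in its pfn
		 * range; routed through device-mapper like the other
		 * 'struct dax_operations' ops. */
		void (*memory_failure_register)(struct dax_device *dax_dev,
						void *data);
	};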
Re: [RFC PATCH v3 2/3] regmap-irq: Add support for POLARITY_HI and POLARITY_LO config regs
On Wed, Mar 17, 2021 at 08:42:12PM +, Mark Brown wrote:
> On Mon, Mar 15, 2021 at 01:33:37PM -0700, Guru Das Srinagesh wrote:
>
> > Since I do need to write to two extra registers, I'll need two
> > register_base's and two buffers to hold their data. This can be
> > generalized to "extra config registers" in the framework as follows:
> >
> > - Add these two fields to `struct regmap_irq_chip`:
> >
> >   unsigned int *extra_config_base; /* Points to array of extra regs */
> >   int num_extra_config_regs;       /* = ARRAY_SIZE(array above) */
>
> I'm having a hard time loving this but I'm also not able to think of any
> better ideas so sure. I'd change the name to virtual (or virt) rather
> than extra since that's what they are so it makes it a bit more clear.

Thanks for accepting the first patch in this series. I will test out my
proposed changes and then send a new patchset sometime next week.

Thank you.

Guru Das.
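A sketch of where the discussion lands (field names follow the "virt"
suggestion and are assumptions until the new patchset is posted):

	struct regmap_irq_chip {
		/* ... existing fields elided ... */

		/* Bases of the per-IRQ "virtual" config registers (for
		 * example type polarity hi/lo), one entry per register
		 * that needs writing. */
		unsigned int *virt_reg_base;
		int num_virt_regs;	/* = ARRAY_SIZE(virt_reg_base) */
	};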
Re: [PATCH v6 12/14] drm/bridge: imx: Add LDB support for i.MX8qxp
Hi Liu, Thank you for the patch! Yet something to improve: [auto build test ERROR on shawnguo/for-next] [also build test ERROR on robh/for-next drm-intel/for-linux-next drm-tip/drm-tip drm-exynos/exynos-drm-next tegra-drm/drm/tegra/for-next linus/master v5.12-rc3 next-20210319] [cannot apply to drm/drm-next] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch] url: https://github.com/0day-ci/linux/commits/Liu-Ying/Add-some-DRM-bridge-drivers-support-for-i-MX8qm-qxp-SoCs/20210317-115847 base: https://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux.git for-next config: x86_64-allyesconfig (attached as .config) compiler: gcc-9 (Debian 9.3.0-22) 9.3.0 reproduce (this is a W=1 build): # https://github.com/0day-ci/linux/commit/482264f815494bc2e90bde5f7b47a60331b81817 git remote add linux-review https://github.com/0day-ci/linux git fetch --no-tags linux-review Liu-Ying/Add-some-DRM-bridge-drivers-support-for-i-MX8qm-qxp-SoCs/20210317-115847 git checkout 482264f815494bc2e90bde5f7b47a60331b81817 # save the attached .config to linux build tree make W=1 ARCH=x86_64 If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot All errors (new ones prefixed by >>): drivers/gpu/drm/bridge/imx/imx8qxp-ldb-drv.c:65:16: warning: 'struct phy_configure_opts_lvds' declared inside parameter list will not be visible outside of this definition or declaration 65 | struct phy_configure_opts_lvds *phy_cfg) |^~~ drivers/gpu/drm/bridge/imx/imx8qxp-ldb-drv.c: In function 'imx8qxp_ldb_set_phy_cfg': >> drivers/gpu/drm/bridge/imx/imx8qxp-ldb-drv.c:67:9: error: dereferencing >> pointer to incomplete type 'struct phy_configure_opts_lvds' 67 | phy_cfg->bits_per_lane_and_dclk_cycle = 7; | ^~ drivers/gpu/drm/bridge/imx/imx8qxp-ldb-drv.c: In function 'imx8qxp_ldb_bridge_atomic_check': >> drivers/gpu/drm/bridge/imx/imx8qxp-ldb-drv.c:94:49: error: 'union >> phy_configure_opts' has no member named 'lvds' 94 | struct phy_configure_opts_lvds *phy_cfg = &opts.lvds; | ^ >> drivers/gpu/drm/bridge/imx/imx8qxp-ldb-drv.c:102:57: error: passing argument >> 4 of 'imx8qxp_ldb_set_phy_cfg' from incompatible pointer type >> [-Werror=incompatible-pointer-types] 102 | imx8qxp_ldb_set_phy_cfg(imx8qxp_ldb, di_clk, is_split, phy_cfg); | ^~~ | | | struct phy_configure_opts_lvds * drivers/gpu/drm/bridge/imx/imx8qxp-ldb-drv.c:65:41: note: expected 'struct phy_configure_opts_lvds *' but argument is of type 'struct phy_configure_opts_lvds *' 65 | struct phy_configure_opts_lvds *phy_cfg) | ^~~ drivers/gpu/drm/bridge/imx/imx8qxp-ldb-drv.c: In function 'imx8qxp_ldb_bridge_mode_set': drivers/gpu/drm/bridge/imx/imx8qxp-ldb-drv.c:136:49: error: 'union phy_configure_opts' has no member named 'lvds' 136 | struct phy_configure_opts_lvds *phy_cfg = &opts.lvds; | ^ drivers/gpu/drm/bridge/imx/imx8qxp-ldb-drv.c:162:57: error: passing argument 4 of 'imx8qxp_ldb_set_phy_cfg' from incompatible pointer type [-Werror=incompatible-pointer-types] 162 | imx8qxp_ldb_set_phy_cfg(imx8qxp_ldb, di_clk, is_split, phy_cfg); | ^~~ | | | struct phy_configure_opts_lvds * drivers/gpu/drm/bridge/imx/imx8qxp-ldb-drv.c:65:41: note: expected 'struct phy_configure_opts_lvds *' but argument is of type 'struct phy_configure_opts_lvds *' 65 | struct phy_configure_opts_lvds *phy_cfg) | ^~~ cc1: some warnings being treated as errors vim +67 drivers/gpu/drm/bridge/imx/imx8qxp-ldb-drv.c 62 63 static void 
imx8qxp_ldb_set_phy_cfg(struct imx8qxp_ldb *imx8qxp_ldb, 64 unsigned long di_clk, bool is_split, 65 struct phy_configure_opts_lvds *phy_cfg) 66 { > 67 phy_cfg->bits_per_lane_and_dclk_cycle = 7; 68 phy_cfg->lanes = 4; 69 70 if (i
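The errors indicate the bridge driver was built without the LVDS PHY
configuration support it depends on; inferred purely from the build log
(only the members the driver actually touches are shown), the missing
pieces would look roughly like:

	/* assumed to come from a PHY-side patch this series depends on */
	struct phy_configure_opts_lvds {
		unsigned int bits_per_lane_and_dclk_cycle;
		unsigned int lanes;
		/* ... further LVDS timing fields ... */
	};

	union phy_configure_opts {
		/* ... existing members (e.g. mipi_dphy) ... */
		struct phy_configure_opts_lvds lvds;
	};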
Re: [PATCH v3 2/2] Adding i2c-cp2615: i2c support for Silicon Labs' CP2615 Digital Audio Bridge
Hi "Bence, Thank you for the patch! Yet something to improve: [auto build test ERROR on wsa/i2c/for-next] [also build test ERROR on v5.12-rc3 next-20210319] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch] url: https://github.com/0day-ci/linux/commits/Bence-Cs-k-s/Add-i2c-cp2615/20210318-193822 base: https://git.kernel.org/pub/scm/linux/kernel/git/wsa/linux.git i2c/for-next config: arc-allyesconfig (attached as .config) compiler: arceb-elf-gcc (GCC) 9.3.0 reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # https://github.com/0day-ci/linux/commit/7aa4ceb301ef5116752aef6e09f6ff845dedc106 git remote add linux-review https://github.com/0day-ci/linux git fetch --no-tags linux-review Bence-Cs-k-s/Add-i2c-cp2615/20210318-193822 git checkout 7aa4ceb301ef5116752aef6e09f6ff845dedc106 # save the attached .config to linux build tree COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=arc If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot All errors (new ones prefixed by >>): drivers/i2c/busses/i2c-cp2615.c:78:5: warning: no previous prototype for 'cp2615_init_iop_msg' [-Wmissing-prototypes] 78 | int cp2615_init_iop_msg(struct cp2615_iop_msg *ret, enum cp2615_iop_msg_type msg, | ^~~ drivers/i2c/busses/i2c-cp2615.c:96:5: warning: no previous prototype for 'cp2615_init_i2c_msg' [-Wmissing-prototypes] 96 | int cp2615_init_i2c_msg(struct cp2615_iop_msg *ret, const struct cp2615_i2c_transfer *data) | ^~~ drivers/i2c/busses/i2c-cp2615.c:102:5: warning: no previous prototype for 'cp2615_check_status' [-Wmissing-prototypes] 102 | int cp2615_check_status(enum cp2615_i2c_status status) | ^~~ drivers/i2c/busses/i2c-cp2615.c:266:1: warning: data definition has no type or storage class 266 | MODULE_DEVICE_TABLE(usb, id_table); | ^~~ >> drivers/i2c/busses/i2c-cp2615.c:266:1: error: type defaults to 'int' in >> declaration of 'MODULE_DEVICE_TABLE' [-Werror=implicit-int] drivers/i2c/busses/i2c-cp2615.c:266:1: warning: parameter names (without types) in function declaration In file included from include/linux/device.h:32, from include/linux/acpi.h:15, from include/linux/i2c.h:13, from drivers/i2c/busses/i2c-cp2615.c:9: include/linux/device/driver.h:263:1: warning: data definition has no type or storage class 263 | module_init(__driver##_init); \ | ^~~ include/linux/usb.h:1303:2: note: in expansion of macro 'module_driver' 1303 | module_driver(__usb_driver, usb_register, \ | ^ drivers/i2c/busses/i2c-cp2615.c:275:1: note: in expansion of macro 'module_usb_driver' 275 | module_usb_driver(cp2615_i2c_driver); | ^ >> include/linux/device/driver.h:263:1: error: type defaults to 'int' in >> declaration of 'module_init' [-Werror=implicit-int] 263 | module_init(__driver##_init); \ | ^~~ include/linux/usb.h:1303:2: note: in expansion of macro 'module_driver' 1303 | module_driver(__usb_driver, usb_register, \ | ^ drivers/i2c/busses/i2c-cp2615.c:275:1: note: in expansion of macro 'module_usb_driver' 275 | module_usb_driver(cp2615_i2c_driver); | ^ In file included from include/linux/linkage.h:7, from include/linux/kernel.h:7, from include/linux/list.h:9, from include/linux/kobject.h:19, from include/linux/of.h:17, from include/linux/irqdomain.h:35, from include/linux/acpi.h:13, from include/linux/i2c.h:13, from 
drivers/i2c/busses/i2c-cp2615.c:9: include/linux/export.h:19:30: warning: parameter names (without types) in function declaration 19 | #define THIS_MODULE ((struct module *)0) | ^~ include/linux/usb.h:1290:30: note: in expansion of macro 'THIS_MODULE' 1290 | usb_register_driver(driver, THIS_MODULE, KBUILD_MODNAME) | ^~~ include/linux/device/driver.h:261:9: note: in expansion of macro 'usb_register' 261 | return __register(&(__driver) , ##__VA_ARGS__); \ | ^~ include/linux/usb.h:1303:2: note: in e
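The implicit-int errors around MODULE_DEVICE_TABLE() and
module_usb_driver() are the classic signature of a missing header rather
than anything USB-specific; a likely fix, assuming nothing else in the
driver pulls it in indirectly:

	/* drivers/i2c/busses/i2c-cp2615.c */
	#include <linux/module.h>	/* MODULE_DEVICE_TABLE, module_* */

	/* ... rest of the driver unchanged ... */
	MODULE_DEVICE_TABLE(usb, id_table);
	module_usb_driver(cp2615_i2c_driver);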
Re: [PATCH V2] drm/amdgpu: Fix a typo
On Fri, 19 Mar 2021, Bhaskar Chowdhury wrote: s/traing/training/ ...Plus the entire sentence construction for better readability. Signed-off-by: Bhaskar Chowdhury --- Changes from V1: Alex and Randy's suggestions incorporated. drivers/gpu/drm/amd/amdgpu/psp_v11_0.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c index c325d6f53a71..bf3857867f51 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c @@ -661,10 +661,10 @@ static int psp_v11_0_memory_training(struct psp_context *psp, uint32_t ops) if (ops & PSP_MEM_TRAIN_SEND_LONG_MSG) { /* -* Long traing will encroach certain mount of bottom VRAM, -* saving the content of this bottom VRAM to system memory -* before training, and restoring it after training to avoid -* VRAM corruption. +* Long training will encroach a certain amount on the bottom of VRAM; + * save the content from the bottom VRAM to system memory + * before training, and restore it after training to avoid + * VRAM corruption. These 3 new lines are indented with spaces instead of tabs. Oops. :( (I may be too late with this comment -- sorry about that.) */ sz = GDDR6_MEM_TRAINING_ENCROACHED_SIZE; -- 2.26.2
Re: [PATCH V2] mm: Few spelling fixes
On Fri, 19 Mar 2021, Bhaskar Chowdhury wrote: Few spelling fixes throughout the file. Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap --- Changes from V1: Mentioned suggestion incorporated. include/linux/pgtable.h | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5e772392a379..b58f20226bb9 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -426,7 +426,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres /* * On some architectures hardware does not set page access bit when accessing - * memory page, it is responsibilty of software setting this bit. It brings + * memory page, it is responsibility of software setting this bit. It brings * out extra page fault penalty to track page access bit. For optimization page * access bit can be set during all page fault flow on these arches. * To be differentiate with macro pte_mkyoung, this macro is used on platforms @@ -519,7 +519,7 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); /* * This is an implementation of pmdp_establish() that is only suitable for an * architecture that doesn't have hardware dirty/accessed bits. In this case we - * can't race with CPU which sets these bits and non-atomic aproach is fine. + * can't race with CPU which sets these bits and non-atomic approach is fine. */ static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) @@ -852,7 +852,7 @@ static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma, * updates, but to prevent any updates it may make from being lost. * * This does not protect against other software modifications of the - * pte; the appropriate pte lock must be held over the transation. + * pte; the appropriate pte lock must be held over the transaction. * * Note that this interface is intended to be batchable, meaning that * ptep_modify_prot_commit may not actually update the pte, but merely @@ -1269,13 +1269,13 @@ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) * * The complete check uses is_pmd_migration_entry() in linux/swapops.h * But using that requires moving current function and pmd_trans_unstable() -* to linux/swapops.h to resovle dependency, which is too much code move. +* to linux/swapops.h to resolve dependency, which is too much code move. * * !pmd_present() is equivalent to is_pmd_migration_entry() currently, * because !pmd_present() pages can only be under migration not swapped * out. * -* pmd_none() is preseved for future condition checks on pmd migration +* pmd_none() is preserved for future condition checks on pmd migration * entries and not confusing with this function name, although it is * redundant with !pmd_present(). */ -- 2.26.2
Re: [PATCH V3] ethernet: sun: Fix a typo
On Fri, 19 Mar 2021, Bhaskar Chowdhury wrote: s/serisouly/seriously/ ...plus the sentence construction for better readability. Signed-off-by: Bhaskar Chowdhury --- Changes from V2: Missed the subject line labeling ..so added drivers/net/ethernet/sun/sungem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/sun/sungem.c b/drivers/net/ethernet/sun/sungem.c index 58f142ee78a3..9790656cf970 100644 --- a/drivers/net/ethernet/sun/sungem.c +++ b/drivers/net/ethernet/sun/sungem.c @@ -1674,8 +1674,8 @@ static void gem_init_phy(struct gem *gp) if (gp->pdev->vendor == PCI_VENDOR_ID_APPLE) { int i; - /* Those delay sucks, the HW seem to love them though, I'll -* serisouly consider breaking some locks here to be able + /* Those delays sucks, the HW seems to love them though, I'll Nope: needs subject/verb agreement, e.g. "delays suck". +* seriously consider breaking some locks here to be able * to schedule instead */ for (i = 0; i < 3; i++) { -- 2.26.2
[PATCH v2 1/1] iommu/vt-d: Fix lockdep splat in intel_pasid_get_entry()
The pasid_lock is used to synchronize different threads from modifying a same pasid directory entry at the same time. It causes below lockdep splat. [ 83.296538] [ 83.296538] WARNING: possible irq lock inversion dependency detected [ 83.296539] 5.12.0-rc3+ #25 Tainted: GW [ 83.296539] [ 83.296540] bash/780 just changed the state of lock: [ 83.296540] 82b29c98 (device_domain_lock){..-.}-{2:2}, at: iommu_flush_dev_iotlb.part.0+0x32/0x110 [ 83.296547] but this lock took another, SOFTIRQ-unsafe lock in the past: [ 83.296547] (pasid_lock){+.+.}-{2:2} [ 83.296548] and interrupts could create inverse lock ordering between them. [ 83.296549] other info that might help us debug this: [ 83.296549] Chain exists of: device_domain_lock --> &iommu->lock --> pasid_lock [ 83.296551] Possible interrupt unsafe locking scenario: [ 83.296551]CPU0CPU1 [ 83.296552] [ 83.296552] lock(pasid_lock); [ 83.296553]local_irq_disable(); [ 83.296553]lock(device_domain_lock); [ 83.296554]lock(&iommu->lock); [ 83.296554] [ 83.296554] lock(device_domain_lock); [ 83.296555] *** DEADLOCK *** Fix it by replacing the pasid_lock with an atomic exchange operation. Reported-and-tested-by: Dave Jiang Signed-off-by: Lu Baolu --- drivers/iommu/intel/pasid.c | 21 + 1 file changed, 13 insertions(+), 8 deletions(-) Log: v1->v2: - v1: https://lore.kernel.org/linux-iommu/20210317005834.173503-1-baolu...@linux.intel.com/ - Use retry to make code neat; - Add a comment about no clear case, hence no race. diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 7a73385edcc0..f2c747e62c6a 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -24,7 +24,6 @@ /* * Intel IOMMU system wide PASID name space: */ -static DEFINE_SPINLOCK(pasid_lock); u32 intel_pasid_max_id = PASID_MAX; int vcmd_alloc_pasid(struct intel_iommu *iommu, u32 *pasid) @@ -259,19 +258,25 @@ struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid) dir_index = pasid >> PASID_PDE_SHIFT; index = pasid & PASID_PTE_MASK; - spin_lock(&pasid_lock); +retry: entries = get_pasid_table_from_pde(&dir[dir_index]); if (!entries) { entries = alloc_pgtable_page(info->iommu->node); - if (!entries) { - spin_unlock(&pasid_lock); + if (!entries) return NULL; - } - WRITE_ONCE(dir[dir_index].val, - (u64)virt_to_phys(entries) | PASID_PTE_PRESENT); + /* +* The pasid directory table entry won't be freed after +* allocation. No worry about the race with free and +* clear. However, this entry might be populated by others +* while we are preparing it. Use theirs with a retry. +*/ + if (cmpxchg64(&dir[dir_index].val, 0ULL, + (u64)virt_to_phys(entries) | PASID_PTE_PRESENT)) { + free_pgtable_page(entries); + goto retry; + } } - spin_unlock(&pasid_lock); return &entries[index]; } -- 2.25.1
Re: [PATCH] selftests: net: forwarding: Fix a typo
On Fri, 19 Mar 2021, Bhaskar Chowdhury wrote: s/verfied/verified/ Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap --- tools/testing/selftests/net/forwarding/fib_offload_lib.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/forwarding/fib_offload_lib.sh b/tools/testing/selftests/net/forwarding/fib_offload_lib.sh index 66496659bea7..e134a5f529c9 100644 --- a/tools/testing/selftests/net/forwarding/fib_offload_lib.sh +++ b/tools/testing/selftests/net/forwarding/fib_offload_lib.sh @@ -224,7 +224,7 @@ fib_ipv4_plen_test() ip -n $ns link set dev dummy1 up # Add two routes with the same key and different prefix length and - # make sure both are in hardware. It can be verfied that both are + # make sure both are in hardware. It can be verified that both are # sharing the same leaf by checking the /proc/net/fib_trie ip -n $ns route add 192.0.2.0/24 dev dummy1 ip -n $ns route add 192.0.2.0/25 dev dummy1 -- 2.26.2
[no subject]
Please, I would like to know whether you have received my previous messages.
Re: [PATCH 2/2] usb: dwc3: gadget: Ignore EP queue requests during bus reset
Thinh Nguyen wrote: > Wesley Cheng wrote: >> >> >> On 3/19/2021 5:40 PM, Thinh Nguyen wrote: >>> Hi, >>> >>> Wesley Cheng wrote: The current dwc3_gadget_reset_interrupt() will stop any active transfers, but only addresses blocking of EP queuing for while we are coming from a disconnected scenario, i.e. after receiving the disconnect event. If the host decides to issue a bus reset on the device, the connected parameter will still be set to true, allowing for EP queuing to continue while we are disabling the functions. To avoid this, set the connected flag to false until the stop active transfers is complete. Signed-off-by: Wesley Cheng --- drivers/usb/dwc3/gadget.c | 9 + 1 file changed, 9 insertions(+) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 6e14fdc..d5ed0f69 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -3327,6 +3327,15 @@ static void dwc3_gadget_reset_interrupt(struct dwc3 *dwc) u32 reg; /* + * Ideally, dwc3_reset_gadget() would trigger the function + * drivers to stop any active transfers through ep disable. + * However, for functions which defer ep disable, such as mass + * storage, we will need to rely on the call to stop active + * transfers here, and avoid allowing of request queuing. + */ + dwc->connected = false; + + /* * WORKAROUND: DWC3 revisions <1.88a have an issue which * would cause a missing Disconnect Event if there's a * pending Setup Packet in the FIFO. >>> >>> This doesn't look right. Did you have rebase issue with your local >>> change again? >>> >>> BR, >>> Thinh >>> >> Hi Thinh, >> >> This was rebased on Greg's usb-linus branch, which has commit >> f09ddcfcb8c5 ("usb: dwc3: gadget: Prevent EP queuing while stopping >> transfers") merged. > > Ah I see. > >> >> commit f09ddcfcb8c5 moved the dwc->connected = true to after we have >> finished stop active transfers. However, this change will also ensure >> that the connected flag is set to false to ensure that when we call stop >> active transfers, nothing can prepare TRBs. (previous commit only >> addresses the case where we get the reset interrupt when coming from a >> disconnected state) >> > > That still doesn't address this issue. > > Because: > 1) We're still protected by the spin_lock_irq*(), so this change doesn't > make any difference while handling an event. > 2) We don't enable the interrupt for END_TRANSFER command completion > when doing dwc3_stop_active_transfers(), the > DWC3_EP_END_TRANSFER_PENDING flag will not be set to prevent preparing > new requests. > > We should do dwc->connected = true when we handle connection_done > interrupt instead. The END_TRANSFER command should complete before this. > > Thanks, > Thinh > Just want to clarify, I was referring to your previous commit f09ddcfcb8c5, we'd still need dwc->connected = false when handling reset interrupt as you've done here. BR, Thinh
[tip:x86/seves] BUILD SUCCESS 799de1baaf3509a54ff713efb768020f8defd709
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/seves branch HEAD: 799de1baaf3509a54ff713efb768020f8defd709 x86/sev-es: Optimize __sev_es_ist_enter() for better readability elapsed time: 720m configs tested: 152 configs skipped: 2 The following configs have been built successfully. More configs may be tested in the coming days. gcc tested configs: arm defconfig arm64allyesconfig arm64 defconfig arm allyesconfig arm allmodconfig x86_64 allyesconfig riscvallmodconfig i386 allyesconfig riscvallyesconfig mips tb0287_defconfig powerpc tqm8541_defconfig arm socfpga_defconfig arc alldefconfig powerpc iss476-smp_defconfig sh lboxre2_defconfig sh rsk7203_defconfig sh kfr2r09_defconfig archsdk_defconfig mips rbtx49xx_defconfig powerpc stx_gp3_defconfig sh rts7751r2dplus_defconfig powerpc bluestone_defconfig arm at91_dt_defconfig arm colibri_pxa270_defconfig powerpc pmac32_defconfig powerpc skiroot_defconfig arm sunxi_defconfig arm colibri_pxa300_defconfig armqcom_defconfig powerpc redwood_defconfig nds32 allnoconfig powerpc lite5200b_defconfig powerpc canyonlands_defconfig m68kq40_defconfig arm milbeaut_m10v_defconfig armneponset_defconfig armmulti_v7_defconfig shedosk7705_defconfig powerpc mpc885_ads_defconfig arm cm_x300_defconfig powerpc pq2fads_defconfig mips loongson1c_defconfig m68k sun3_defconfig mipsbcm63xx_defconfig armvt8500_v6_v7_defconfig arm u8500_defconfig arm imote2_defconfig s390 zfcpdump_defconfig powerpc eiger_defconfig powerpc tqm8548_defconfig mips pistachio_defconfig sh rts7751r2d1_defconfig nios2 3c120_defconfig sh polaris_defconfig powerpc ksi8560_defconfig powerpc taishan_defconfig powerpcwarp_defconfig armmvebu_v7_defconfig xtensa audio_kc705_defconfig m68kstmark2_defconfig arm aspeed_g4_defconfig powerpc maple_defconfig arm mainstone_defconfig sparc64 defconfig powerpc acadia_defconfig sh se7750_defconfig mips decstation_defconfig mipsqi_lb60_defconfig powerpc mpc837x_rdb_defconfig sh landisk_defconfig parisc alldefconfig arc axs103_smp_defconfig mips ath25_defconfig powerpccell_defconfig powerpc tqm8560_defconfig ia64zx1_defconfig sh magicpanelr2_defconfig powerpc ps3_defconfig powerpcsam440ep_defconfig m68k atari_defconfig mips xway_defconfig powerpc ppc64e_defconfig powerpc mpc512x_defconfig armclps711x_defconfig powerpc mpc834x_itxgp_defconfig pariscgeneric-64bit_defconfig mipsmaltaup_defconfig ia64 allmodconfig ia64defconfig ia64 allyesconfig m68k allmodconfig m68kdefconfig m68k allyesconfig nios2 defconfig arc allyesconfig nds32 defconfig nios2allyesconfig cskydefconfig alpha defconfig alphaallyesconfig xtensa
[tip:efi/urgent] BUILD SUCCESS 429257a430a0e81e9979256e0db718e35e7d9cee
nfig sparcallyesconfig sparc defconfig i386 tinyconfig i386defconfig mips allyesconfig mips allmodconfig powerpc allyesconfig powerpc allmodconfig powerpc allnoconfig i386 randconfig-a001-20210318 i386 randconfig-a005-20210318 i386 randconfig-a003-20210318 i386 randconfig-a002-20210318 i386 randconfig-a006-20210318 i386 randconfig-a004-20210318 i386 randconfig-a001-20210319 i386 randconfig-a005-20210319 i386 randconfig-a003-20210319 i386 randconfig-a002-20210319 i386 randconfig-a006-20210319 i386 randconfig-a004-20210319 x86_64 randconfig-a011-20210318 x86_64 randconfig-a016-20210318 x86_64 randconfig-a013-20210318 x86_64 randconfig-a015-20210318 x86_64 randconfig-a014-20210318 x86_64 randconfig-a012-20210318 i386 randconfig-a013-20210318 i386 randconfig-a016-20210318 i386 randconfig-a011-20210318 i386 randconfig-a014-20210318 i386 randconfig-a015-20210318 i386 randconfig-a012-20210318 riscvnommu_k210_defconfig riscvnommu_virt_defconfig riscv allnoconfig riscv defconfig riscv rv32_defconfig x86_64rhel-7.6-kselftests x86_64 defconfig x86_64 rhel-8.3 x86_64 rhel-8.3-kbuiltin x86_64 kexec clang tested configs: x86_64 randconfig-a006-20210318 x86_64 randconfig-a001-20210318 x86_64 randconfig-a005-20210318 x86_64 randconfig-a002-20210318 x86_64 randconfig-a003-20210318 x86_64 randconfig-a004-20210318 --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org
[tip:master] BUILD SUCCESS 75e5f8b5058d581688be8098ca0c335780a6d8df
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git master branch HEAD: 75e5f8b5058d581688be8098ca0c335780a6d8df Merge branch 'x86/seves' elapsed time: 721m configs tested: 136 configs skipped: 2 The following configs have been built successfully. More configs may be tested in the coming days. gcc tested configs: arm defconfig arm64allyesconfig arm64 defconfig arm allyesconfig arm allmodconfig x86_64 allyesconfig riscvallmodconfig i386 allyesconfig riscvallyesconfig powerpc walnut_defconfig sh sh03_defconfig sh se7343_defconfig mips pistachio_defconfig powerpc mpc837x_rdb_defconfig sh rsk7203_defconfig sh kfr2r09_defconfig archsdk_defconfig mips rbtx49xx_defconfig powerpc stx_gp3_defconfig sh rts7751r2dplus_defconfig powerpc bluestone_defconfig arm at91_dt_defconfig arm colibri_pxa270_defconfig powerpc pmac32_defconfig powerpc skiroot_defconfig arm sunxi_defconfig arm colibri_pxa300_defconfig armqcom_defconfig powerpc redwood_defconfig nds32 allnoconfig powerpc lite5200b_defconfig powerpc canyonlands_defconfig ia64 allmodconfig m68kq40_defconfig arm milbeaut_m10v_defconfig armneponset_defconfig armmulti_v7_defconfig shedosk7705_defconfig armvt8500_v6_v7_defconfig mips tb0287_defconfig arm u8500_defconfig arm imote2_defconfig sh rts7751r2d1_defconfig nios2 3c120_defconfig sh polaris_defconfig powerpc ksi8560_defconfig powerpc taishan_defconfig powerpcwarp_defconfig armmvebu_v7_defconfig xtensa audio_kc705_defconfig m68kstmark2_defconfig arm aspeed_g4_defconfig powerpc maple_defconfig arm mainstone_defconfig sparc64 defconfig powerpc acadia_defconfig sh se7750_defconfig mips decstation_defconfig mipsqi_lb60_defconfig sh landisk_defconfig parisc alldefconfig arc axs103_smp_defconfig mips ath25_defconfig powerpccell_defconfig powerpc tqm8560_defconfig powerpc tqm5200_defconfig parisc defconfig powerpc linkstation_defconfig s390 allmodconfig arm versatile_defconfig armclps711x_defconfig powerpc mpc834x_itxgp_defconfig pariscgeneric-64bit_defconfig mipsmaltaup_defconfig ia64defconfig ia64 allyesconfig m68k allmodconfig m68kdefconfig m68k allyesconfig nds32 defconfig nios2allyesconfig cskydefconfig alpha defconfig alphaallyesconfig xtensa allyesconfig h8300allyesconfig arc defconfig sh allmodconfig s390 allyesconfig parisc allyesconfig s390defconfig nios2 defconfig arc allyesconfig sparcallyesconfig sparc defconfig i386 tinyconfig i386defconfig mips allyesconfig mips allmodconfig powerpc allyesconfig powerpc allmodconfig powerpc
Re: [PATCH v4 01/25] mm: Introduce struct folio
On Fri, Mar 19, 2021 at 01:25:27AM +, Matthew Wilcox wrote: > On Fri, Mar 19, 2021 at 10:56:45AM +1100, Balbir Singh wrote: > > On Fri, Mar 05, 2021 at 04:18:37AM +, Matthew Wilcox (Oracle) wrote: > > > A struct folio refers to an entire (possibly compound) page. A function > > > which takes a struct folio argument declares that it will operate on the > > > entire compound page, not just PAGE_SIZE bytes. In return, the caller > > > guarantees that the pointer it is passing does not point to a tail page. > > > > > > > Is this a part of a larger use case or general cleanup/refactor where > > the split between page and folio simplify programming? > > The goal here is to manage memory in larger chunks. Pages are now too > small for just about every workload. Even compiling the kernel sees a 7% > performance improvement just by doing readahead using relatively small > THPs (16k-256k). You can see that work here: > https://git.infradead.org/users/willy/pagecache.git/shortlog/refs/heads/master > > I think Kirill, Hugh and others have done a fantastic job stretching > the page struct to work in shmem, but we really need a different type > to avoid people writing code that _looks_ right but is actually buggy. > So I'm starting again, this time with the folio metaphor. Thanks, makes sense, I'll take a look. Balbir Singh.
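For readers joining the thread here, the type under discussion is small;
a simplified sketch of the core definitions from the series (details of
the final layout may differ):

	/* A folio is a page that is guaranteed not to be a tail page. */
	struct folio {
		struct page page;
	};

	/* The only way in: any tail page is resolved to its head first,
	 * so a struct folio * can never point into the middle of a
	 * compound page. */
	static inline struct folio *page_folio(struct page *page)
	{
		return (struct folio *)compound_head(page);
	}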
Re: [PATCH v4 1/4] sched/fair: Introduce primitives for CFS bandwidth burst
> On Mar 19, 2021, at 8:39 PM, changhuaixin > wrote: > > > >> On Mar 18, 2021, at 11:05 PM, Peter Zijlstra wrote: >> >> On Thu, Mar 18, 2021 at 09:26:58AM +0800, changhuaixin wrote: On Mar 17, 2021, at 4:06 PM, Peter Zijlstra wrote: >> So what is the typical avg,stdev,max and mode for the workloads where you find you need this? I would really like to put a limit on the burst. IMO a workload that has a burst many times longer than the quota is plain broken. >>> >>> I see. Then the problem comes down to how large the limit on burst shall be. >>> >>> I have sampled the CPU usage of a bursty container in 100ms periods. The >>> statistics are: >> >> So CPU usage isn't exactly what is required, job execution time is what >> you're after. Assuming there is a relation... >> > > Yes, job execution time is important. To be specific, it is to improve the > CPU usage of the whole > system to reduce the total cost of ownership, while not damaging job > execution time. This > requires lower the average CPU resource of underutilized cgroups, and > allowing their bursts > at the same time. > >>> average : 42.2% >>> stddev : 81.5% >>> max : 844.5% >>> P95 : 183.3% >>> P99 : 437.0% >> >> Then your WCET is 844% of 100ms ? , which is .84s. >> >> But you forgot your mode; what is the most common duration, given P95 is >> so high, I doubt that avg is representative of the most common duration. >> > > It is true. > >>> If quota is 10ms, burst buffer needs to be 8 times more in order >>> for this workload not to be throttled. >> >> Where does that 100s come from? And an 800s burst is bizarre. >> >> Did you typo [us] as [ms] ? >> > > Sorry, it should be 10us. > >>> I can't say this is typical, but these workloads exist. On a machine >>> running Kubernetes containers, where there is often room for such >>> burst and the interference is hard to notice, users would prefer >>> allowing such burst to being throttled occasionally. >> >> Users also want ponies. I've no idea what kubernetes actually is or what >> it has to do with containers. That's all just word salad. >> >>> In this sense, I suggest limit burst buffer to 16 times of quota or >>> around. That should be enough for users to improve tail latency caused >>> by throttling. And users might choose a smaller one or even none, if >>> the interference is unacceptable. What do you think? >> >> Well, normal RT theory would suggest you pick your runtime around 200% >> to get that P95 and then allow a full period burst to get your P99, but >> that same RT theory would also have you calculate the resulting >> interference and see if that works with the rest of the system... >> > > I am sorry that I don't know much about the RT theory you mentioned, and > can't provide > the desired calculation now. But I'd like to try and do some reading if that > is needed. > >> 16 times is horrific. > > So can we decide on a more relative value now? Or is the interference > probabilities still the > missing piece? A more [realistic] value, I mean. > > Is the paper you mentioned about called "Insensitivity results in statistical > bandwidth sharing", > or some related ones on statistical bandwidth results under some kind of > fairness?
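To make the burst semantics under discussion concrete, here is a toy model in plain C; this is not the proposed kernel code, and the names and capping rule are assumptions based on the description in this thread. Each period refills the group with quota, and runtime left unused in idle periods may be carried over, up to an extra burst on top of the quota:

struct toy_bandwidth {
	unsigned long long quota;	/* runtime granted per period */
	unsigned long long burst;	/* max carried-over surplus */
	unsigned long long runtime;	/* runtime currently available */
};

static void toy_refill(struct toy_bandwidth *b)
{
	unsigned long long cap = b->quota + b->burst;

	b->runtime += b->quota;
	if (b->runtime > cap)	/* cap how much idle time can be banked */
		b->runtime = cap;
}

Under this model, the objection above is that a large burst lets a group consume many periods' worth of CPU back to back, which is exactly the interference that needs to be bounded.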
Re: [PATCH 2/2] usb: dwc3: gadget: Ignore EP queue requests during bus reset
Wesley Cheng wrote: > > > On 3/19/2021 5:40 PM, Thinh Nguyen wrote: >> Hi, >> >> Wesley Cheng wrote: >>> The current dwc3_gadget_reset_interrupt() will stop any active >>> transfers, but only addresses blocking of EP queuing for while we are >>> coming from a disconnected scenario, i.e. after receiving the disconnect >>> event. If the host decides to issue a bus reset on the device, the >>> connected parameter will still be set to true, allowing for EP queuing >>> to continue while we are disabling the functions. To avoid this, set the >>> connected flag to false until the stop active transfers is complete. >>> >>> Signed-off-by: Wesley Cheng >>> --- >>> drivers/usb/dwc3/gadget.c | 9 + >>> 1 file changed, 9 insertions(+) >>> >>> diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c >>> index 6e14fdc..d5ed0f69 100644 >>> --- a/drivers/usb/dwc3/gadget.c >>> +++ b/drivers/usb/dwc3/gadget.c >>> @@ -3327,6 +3327,15 @@ static void dwc3_gadget_reset_interrupt(struct dwc3 >>> *dwc) >>> u32 reg; >>> >>> /* >>> +* Ideally, dwc3_reset_gadget() would trigger the function >>> +* drivers to stop any active transfers through ep disable. >>> +* However, for functions which defer ep disable, such as mass >>> +* storage, we will need to rely on the call to stop active >>> +* transfers here, and avoid allowing of request queuing. >>> +*/ >>> + dwc->connected = false; >>> + >>> + /* >>> * WORKAROUND: DWC3 revisions <1.88a have an issue which >>> * would cause a missing Disconnect Event if there's a >>> * pending Setup Packet in the FIFO. >>> >> >> This doesn't look right. Did you have rebase issue with your local >> change again? >> >> BR, >> Thinh >> > Hi Thinh, > > This was rebased on Greg's usb-linus branch, which has commit > f09ddcfcb8c5 ("usb: dwc3: gadget: Prevent EP queuing while stopping > transfers") merged. Ah I see. > > commit f09ddcfcb8c5 moved the dwc->connected = true to after we have > finished stop active transfers. However, this change will also ensure > that the connected flag is set to false to ensure that when we call stop > active transfers, nothing can prepare TRBs. (previous commit only > addresses the case where we get the reset interrupt when coming from a > disconnected state) > That still doesn't address this issue. Because: 1) We're still protected by the spin_lock_irq*(), so this change doesn't make any difference while handling an event. 2) We don't enable the interrupt for END_TRANSFER command completion when doing dwc3_stop_active_transfers(), the DWC3_EP_END_TRANSFER_PENDING flag will not be set to prevent preparing new requests. We should do dwc->connected = true when we handle connection_done interrupt instead. The END_TRANSFER command should complete before this. Thanks, Thinh
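A sketch of the ordering Thinh suggests: keep dwc->connected false across the reset handling and only set it once the connection-done event arrives, by which point any END_TRANSFER commands issued from the reset handler have completed. This is illustrative only, not the applied patch:

static void dwc3_gadget_conndone_interrupt(struct dwc3 *dwc)
{
	/* safe to accept ep queuing again: END_TRANSFER issued from the
	 * preceding bus reset has completed by this point */
	dwc->connected = true;

	/* ... existing speed detection and endpoint reconfiguration ... */
}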
[PATCH v4 9/9] dt-bindings: serial: stm32: Use 'unevaluatedProperties' instead of 'additionalProperties'
From: dillon min

To allow the additional 'bluetooth' property, use unevaluatedProperties to fix this dtbs_check warning:

arch/arm/boot/dts/stm32h750i-art-pi.dt.yaml: serial@40004800: 'bluetooth' does not match any of the regexes: 'pinctrl-[0-9]+'

Reported-by: kernel test robot
Fixes: af1c2d81695b ("dt-bindings: serial: Convert STM32 UART to json-schema")
Signed-off-by: dillon min
---
v4:
- add Reported-by and Fixes tags
- use unevaluatedProperties: false to fix the dtbs_check warning instead of adding 'bluetooth' to st,stm32-uart.yaml

 Documentation/devicetree/bindings/serial/st,stm32-uart.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/serial/st,stm32-uart.yaml b/Documentation/devicetree/bindings/serial/st,stm32-uart.yaml
index 8631678283f9..305941b1d5a0 100644
--- a/Documentation/devicetree/bindings/serial/st,stm32-uart.yaml
+++ b/Documentation/devicetree/bindings/serial/st,stm32-uart.yaml
@@ -80,7 +80,7 @@ required:
   - interrupts
   - clocks

-additionalProperties: false
+unevaluatedProperties: false

 examples:
   - |
--
1.9.1
[PATCH v4 6/9] ARM: dts: stm32: add support for art-pi board based on stm32h750xbh6
From: dillon min

This patch has the following changes:

- introduce stm32h750.dtsi to support the stm32h750 value line
- add stm32h750i-art-pi.dtb (arch/arm/boot/dts/Makefile)
- add dts binding usart3 for bt, uart4 for console
  usart3/uart4 pinctrl in stm32h7-pinctrl.dtsi
  usart3/uart4 register in stm32h743.dtsi
- add dts binding sdmmc2 for wifi
  sdmmc2 pinctrl in stm32h7-pinctrl.dtsi
  sdmmc2 register in stm32h743.dtsi
- add spi1 pinctrl in stm32h7-pinctrl.dtsi for spi flash
- add stm32h750i-art-pi.dts to support the art-pi board
- add pinctrl: pin-controller@58020000 {} to fix dtbs_check warnings

art-pi board components:
- 8MiB qspi flash
- 16MiB spi flash
- 32MiB sdram
- ap6212 wifi&bt&fm

Detailed board information can be found at:
https://art-pi.gitee.io/website/

Reported-by: kernel test robot
Fixes: 500cdb23d608 ("ARM: dts: stm32: Add STM32H743 MCU and STM32H743i-EVAL board")
Signed-off-by: dillon min
---
v4:
- add Reported-by and Fixes tags
- replace 'C' with 'c' in hex addresses to fix dtbs_check warnings

 arch/arm/boot/dts/Makefile              |   1 +
 arch/arm/boot/dts/stm32h743.dtsi        | 153 -
 arch/arm/boot/dts/stm32h750.dtsi        |   6 +
 arch/arm/boot/dts/stm32h750i-art-pi.dts | 229 
 4 files changed, 387 insertions(+), 2 deletions(-)
 create mode 100644 arch/arm/boot/dts/stm32h750.dtsi
 create mode 100644 arch/arm/boot/dts/stm32h750i-art-pi.dts

diff --git a/arch/arm/boot/dts/Makefile b/arch/arm/boot/dts/Makefile
index 8e5d4ab4e75e..a19c5ab9df84 100644
--- a/arch/arm/boot/dts/Makefile
+++ b/arch/arm/boot/dts/Makefile
@@ -1071,6 +1071,7 @@ dtb-$(CONFIG_ARCH_STM32) += \
 	stm32746g-eval.dtb \
 	stm32h743i-eval.dtb \
 	stm32h743i-disco.dtb \
+	stm32h750i-art-pi.dtb \
 	stm32mp153c-dhcom-drc02.dtb \
 	stm32mp157a-avenger96.dtb \
 	stm32mp157a-dhcor-avenger96.dtb \
diff --git a/arch/arm/boot/dts/stm32h743.dtsi b/arch/arm/boot/dts/stm32h743.dtsi
index 4ebffb0a45a3..4379063d36a2 100644
--- a/arch/arm/boot/dts/stm32h743.dtsi
+++ b/arch/arm/boot/dts/stm32h743.dtsi
@@ -135,6 +135,22 @@
 			clocks = <&rcc USART2_CK>;
 		};

+		usart3: serial@40004800 {
+			compatible = "st,stm32h7-uart";
+			reg = <0x40004800 0x400>;
+			interrupts = <39>;
+			status = "disabled";
+			clocks = <&rcc USART3_CK>;
+		};
+
+		uart4: serial@40004c00 {
+			compatible = "st,stm32h7-uart";
+			reg = <0x40004c00 0x400>;
+			interrupts = <52>;
+			status = "disabled";
+			clocks = <&rcc UART4_CK>;
+		};
+
 		i2c1: i2c@40005400 {
 			compatible = "st,stm32f7-i2c";
 			#address-cells = <1>;
@@ -159,7 +175,7 @@
 			status = "disabled";
 		};

-		i2c3: i2c@40005C00 {
+		i2c3: i2c@40005c00 {
 			compatible = "st,stm32f7-i2c";
 			#address-cells = <1>;
 			#size-cells = <0>;
@@ -368,6 +384,20 @@
 			max-frequency = <120000000>;
 		};

+		sdmmc2: mmc@48022400 {
+			compatible = "arm,pl18x", "arm,primecell";
+			arm,primecell-periphid = <0x10153180>;
+			reg = <0x48022400 0x400>;
+			interrupts = <124>;
+			interrupt-names = "cmd_irq";
+			clocks = <&rcc SDMMC2_CK>;
+			clock-names = "apb_pclk";
+			resets = <&rcc STM32H7_AHB2_RESET(SDMMC2)>;
+			cap-sd-highspeed;
+			cap-mmc-highspeed;
+			max-frequency = <120000000>;
+		};
+
 		exti: interrupt-controller@58000000 {
 			compatible = "st,stm32h7-exti";
 			interrupt-controller;
@@ -392,7 +422,7 @@
 			status = "disabled";
 		};

-		i2c4: i2c@58001C00 {
+		i2c4: i2c@58001c00 {
 			compatible = "st,stm32f7-i2c";
 			#address-cells = <1>;
 			#size-cells = <0>;
@@ -555,6 +585,125 @@
 			snps,pbl = <8>;
 			status = "disabled";
 		};
+
+		pinctrl: pin-controller@58020000 {
+			#address-cells = <1>;
+			#size-cells = <1>;
+			ranges = <0 0x58020000 0x3000>;
+			interrupt-parent = <&exti>;
+			st,syscfg = <&syscfg 0x8>;
+			pins-are-numbered;
+
+			gpioa: gpio@58020000 {
+				gp
[PATCH v4 5/9] ARM: dts: stm32: add stm32h750-pinctrl.dtsi
From: dillon min

This patch adds stm32h750-pinctrl.dtsi, which just references stm32h7-pinctrl.dtsi.

Signed-off-by: dillon min
---
v4: no changes

 arch/arm/boot/dts/stm32h750-pinctrl.dtsi | 12 
 1 file changed, 12 insertions(+)
 create mode 100644 arch/arm/boot/dts/stm32h750-pinctrl.dtsi

diff --git a/arch/arm/boot/dts/stm32h750-pinctrl.dtsi b/arch/arm/boot/dts/stm32h750-pinctrl.dtsi
new file mode 100644
index ..ef8c4d881dba
--- /dev/null
+++ b/arch/arm/boot/dts/stm32h750-pinctrl.dtsi
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause)
+/*
+ * Copyright (C) STMicroelectronics 2021 - All Rights Reserved
+ * Author: Dillon Min for STMicroelectronics.
+ */
+
+#include "stm32h7-pinctrl.dtsi"
+
+&pinctrl{
+	compatible = "st,stm32h750-pinctrl";
+};
+
--
1.9.1
[PATCH v4 4/9] ARM: dts: stm32: introduce stm32h7-pinctrl.dtsi to support stm32h750
From: dillon min This patch is intend to add support stm32h750 value line, just add stm32h7-pinctrl.dtsi for extending, with following changes: - rename stm32h743-pinctrl.dtsi to stm32h7-pinctrl.dtsi - move compatible string "st,stm32h743-pinctrl" from stm32h7-pinctrl.dtsi to stm32h743-pinctrl.dtsi - move 'pin-controller' from stm32h7-pinctrl.dtsi to stm32h743.dtsi, to fix make dtbs_check warrnings arch/arm/boot/dts/stm32h750i-art-pi.dt.yaml: soc: 'i2c@40005C00', 'i2c@58001C00' do not match any of the regexes: '@(0|[1-9a-f][0-9a-f]*)$', '^[^@]+$', 'pinctrl-[0-9]+' Reported-by: kernel test robot Signed-off-by: dillon min --- v4: add Reported-by tag arch/arm/boot/dts/stm32h7-pinctrl.dtsi | 274 +++ arch/arm/boot/dts/stm32h743-pinctrl.dtsi | 307 +-- 2 files changed, 280 insertions(+), 301 deletions(-) create mode 100644 arch/arm/boot/dts/stm32h7-pinctrl.dtsi diff --git a/arch/arm/boot/dts/stm32h7-pinctrl.dtsi b/arch/arm/boot/dts/stm32h7-pinctrl.dtsi new file mode 100644 index ..fbab41a01af5 --- /dev/null +++ b/arch/arm/boot/dts/stm32h7-pinctrl.dtsi @@ -0,0 +1,274 @@ +/* + * Copyright 2017 - Alexandre Torgue + * + * This file is dual-licensed: you can use it either under the terms + * of the GPL or the X11 license, at your option. Note that this dual + * licensing only applies to this file, and not this project as a + * whole. + * + * a) This file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This file is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Or, alternatively, + * + * b) Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include + +&pinctrl { + i2c1_pins_a: i2c1-0 { + pins { + pinmux = , /* I2C1_SCL */ +; /* I2C1_SDA */ + bias-disable; + drive-open-drain; + slew-rate = <0>; + }; + }; + + ethernet_rmii: rmii-0 { + pins { + pinmux = , +, +, +, +, +, +, +, +; + slew-rate = <2>; + }; + }; + + sdmmc1_b4_pins_a: sdmmc1-b4-0 { + pins { + pinmux = , /* SDMMC1_D0 */ +, /* SDMMC1_D1 */ +, /* SDMMC1_D2 */ +, /* SDMMC1_D3 */ +, /* SDMMC1_CK */ +; /* SDMMC1_CMD */ + slew-rate = <3>; + drive-push-pull; + bias-disable; + }; + }; + + sdmmc1_b4_od_pins_a: sdmmc1-b4-od-0 { + pins1 { + pinmux = , /* SDMMC1_D0 */ +, /* SDMMC1_D1 */ +, /* SDMMC1_D2 */ +, /* SDMMC1_D3 */ +; /* SDMMC1_CK */ + slew-rate = <3>; + drive-push-pull; + bias-disable; + }; + pins2{ +
[PATCH v4 8/9] pinctrl: stm32: Add STM32H750 MCU pinctrl support
From: dillon min This patch adds STM32H750 pinctrl and GPIO support since stm32h750 has the same pin alternate functions with stm32h743, so just reuse the stm32h743's pinctrl driver Signed-off-by: dillon min --- v4: no changes drivers/pinctrl/stm32/Kconfig | 2 +- drivers/pinctrl/stm32/pinctrl-stm32h743.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/pinctrl/stm32/Kconfig b/drivers/pinctrl/stm32/Kconfig index f36f29113370..fb1ffc94c57f 100644 --- a/drivers/pinctrl/stm32/Kconfig +++ b/drivers/pinctrl/stm32/Kconfig @@ -35,7 +35,7 @@ config PINCTRL_STM32F769 select PINCTRL_STM32 config PINCTRL_STM32H743 - bool "STMicroelectronics STM32H743 pin control" if COMPILE_TEST && !MACH_STM32H743 + bool "STMicroelectronics STM32H743/STM32H750 pin control" if COMPILE_TEST && !MACH_STM32H743 depends on OF && HAS_IOMEM default MACH_STM32H743 select PINCTRL_STM32 diff --git a/drivers/pinctrl/stm32/pinctrl-stm32h743.c b/drivers/pinctrl/stm32/pinctrl-stm32h743.c index ffe7b5271506..700206c7bc11 100644 --- a/drivers/pinctrl/stm32/pinctrl-stm32h743.c +++ b/drivers/pinctrl/stm32/pinctrl-stm32h743.c @@ -1966,6 +1966,9 @@ .compatible = "st,stm32h743-pinctrl", .data = &stm32h743_match_data, }, + { .compatible = "st,stm32h750-pinctrl", + .data = &stm32h743_match_data, + }, { } }; -- 1.9.1
[PATCH v4 7/9] ARM: stm32: Add a new SOC - STM32H750
From: dillon min The STM32H750 is a Cortex-M7 MCU running at 480MHz and containing 128KBytes internal flash, 1MiB SRAM. Signed-off-by: dillon min --- v4: no changes arch/arm/mach-stm32/board-dt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/mach-stm32/board-dt.c b/arch/arm/mach-stm32/board-dt.c index 011d57b488c2..a766310d8dca 100644 --- a/arch/arm/mach-stm32/board-dt.c +++ b/arch/arm/mach-stm32/board-dt.c @@ -17,6 +17,7 @@ "st,stm32f746", "st,stm32f769", "st,stm32h743", + "st,stm32h750", "st,stm32mp157", NULL }; -- 1.9.1
[PATCH v4 1/9] Documentation: arm: stm32: Add stm32h750 value line doc
From: dillon min

This patchset adds support for the SoC stm32h750; stm32h750 differs slightly from stm32h743:

 item         stm32h743   stm32h750
 flash size:  2MiB        128KiB
 adc:         none        3
 crypto-hash: none        aes/hmac/des/tdes/md5/sha

Detailed information can be found at:
https://www.st.com/en/microcontrollers-microprocessors/stm32h750-value-line.html

Signed-off-by: dillon min
---
v4: no changes

 Documentation/arm/index.rst                    |  1 +
 Documentation/arm/stm32/stm32h750-overview.rst | 34 ++
 2 files changed, 35 insertions(+)
 create mode 100644 Documentation/arm/stm32/stm32h750-overview.rst

diff --git a/Documentation/arm/index.rst b/Documentation/arm/index.rst
index b4bea32472b6..d4f34ae9e6f4 100644
--- a/Documentation/arm/index.rst
+++ b/Documentation/arm/index.rst
@@ -52,6 +52,7 @@ SoC-specific documents
    stm32/stm32f746-overview
    stm32/overview
    stm32/stm32h743-overview
+   stm32/stm32h750-overview
    stm32/stm32f769-overview
    stm32/stm32f429-overview
    stm32/stm32mp157-overview
diff --git a/Documentation/arm/stm32/stm32h750-overview.rst b/Documentation/arm/stm32/stm32h750-overview.rst
new file mode 100644
index ..0e51235c9547
--- /dev/null
+++ b/Documentation/arm/stm32/stm32h750-overview.rst
@@ -0,0 +1,34 @@
+==================
+STM32H750 Overview
+==================
+
+Introduction
+============
+
+The STM32H750 is a Cortex-M7 MCU aimed at various applications.
+It features:
+
+- Cortex-M7 core running up to 480MHz
+- 128K internal flash, 1MBytes internal RAM
+- FMC controller to connect SDRAM, NOR and NAND memories
+- Dual mode QSPI
+- SD/MMC/SDIO support
+- Ethernet controller
+- USB OTG FS & HS controllers
+- I2C, SPI, CAN busses support
+- Several 16 & 32 bits general purpose timers
+- Serial Audio interface
+- LCD controller
+- HDMI-CEC
+- SPDIFRX
+- DFSDM
+
+Resources
+---------
+
+Datasheet and reference manual are publicly available on ST website (STM32H750_).
+
+.. _STM32H750: https://www.st.com/en/microcontrollers-microprocessors/stm32h750-value-line.html
+
+:Authors: Dillon Min
+
--
1.9.1
[PATCH v4 0/9] ARM: STM32: add art-pi(stm32h750xbh6) board support
From: dillon min

This patchset intends to add art-pi board support; the board is developed by rt-thread (https://www.rt-thread.org/).

Board resources:
- 8MiB QSPI flash
- 16MiB SPI flash
- 32MiB SDRAM
- AP6212 wifi/bt/fm combo

sw context:
- as stm32h750 has just 128KiB of internal flash, a firmware runs from internal flash to download u-boot/kernel to qspi flash, and u-boot/kernel then boot from qspi flash. this fw is based on rt-thread.
- kernel can be xip on qspi flash or loaded into sdram
- root filesystem is jffs2 (created by buildroot), stored on spi flash

To support the board, add the following changes:
- fix r0-r3, r12 register restore failure after svc call
- add dts binding
- update yaml doc

changes in v4:
- use unevaluatedProperties: false to fix dtbs_check warnings instead of adding 'bluetooth' to st,stm32-uart.yaml

changes in v3:
- fix dtbs_check warnings: (8002cbd78fd5 and 4bc21d3dd678)
>> arch/arm/boot/dts/stm32h743i-eval.dt.yaml: soc: pin-controller: {'type': 'object'} is not allowed for {'#address-cells': [[1]], '#size-cells': [[1]], 'ranges': [[0,
>> arch/arm/boot/dts/stm32h743i-eval.dt.yaml: soc: 'i2c@40005C00', 'i2c@58001C00' do not match any of the regexes: '@(0|[1-9a-f][0-9a-f]*)$', '^[^@]+$', 'pinctrl-[0-9]+'
>> arch/arm/boot/dts/stm32h750i-art-pi.dt.yaml: serial@40004800: 'bluetooth' does not match any of the regexes: 'pinctrl-[0-9]+'

changes in v2:
- reorganize the pinctrl device tree files stm32h7-pinctrl/stm32h743/750-pinctrl:

  stm32h7-pinctrl.dtsi --> stm32h743-pinctrl.dtsi --> stm32h743i-disco.dts
                       |                          |-> stm32h743i-eval.dts
                       |-> stm32h750-pinctrl.dtsi --> stm32h750i-art-pi.dts

  same as the stm32f7/f4 pinctrl style
- fix author name/copyright mistake
- add compatible string st,stm32h750-pinctrl to pinctrl-stm32h743.c, as they have the same pin alternate functions; update Kconfig description
- sort items in stm32h750i-art-pi.dts alphabetically

dillon min (9):
  Documentation: arm: stm32: Add stm32h750 value line doc
  dt-bindings: arm: stm32: Add compatible strings for ART-PI board
  dt-bindings: pinctrl: stm32: Add stm32h750 pinctrl
  ARM: dts: stm32: introduce stm32h7-pinctrl.dtsi to support stm32h750
  ARM: dts: stm32: add stm32h750-pinctrl.dtsi
  ARM: dts: stm32: add support for art-pi board based on stm32h750xbh6
  ARM: stm32: Add a new SOC - STM32H750
  pinctrl: stm32: Add STM32H750 MCU pinctrl support
  dt-bindings: serial: stm32: Use 'unevaluatedProperties' instead of 'additionalProperties'

 Documentation/arm/index.rst                        |   1 +
 Documentation/arm/stm32/stm32h750-overview.rst     |  34 +++
 .../devicetree/bindings/arm/stm32/stm32.yaml       |   4 +
 .../bindings/pinctrl/st,stm32-pinctrl.yaml         |   1 +
 .../devicetree/bindings/serial/st,stm32-uart.yaml  |   2 +-
 arch/arm/boot/dts/Makefile                         |   1 +
 arch/arm/boot/dts/stm32h7-pinctrl.dtsi             | 274 ++
 arch/arm/boot/dts/stm32h743-pinctrl.dtsi           | 307 +
 arch/arm/boot/dts/stm32h743.dtsi                   | 153 +-
 arch/arm/boot/dts/stm32h750-pinctrl.dtsi           |  12 +
 arch/arm/boot/dts/stm32h750.dtsi                   |   6 +
 arch/arm/boot/dts/stm32h750i-art-pi.dts            | 229 +++
 arch/arm/mach-stm32/board-dt.c                     |   1 +
 drivers/pinctrl/stm32/Kconfig                      |   2 +-
 drivers/pinctrl/stm32/pinctrl-stm32h743.c          |   3 +
 15 files changed, 725 insertions(+), 305 deletions(-)
 create mode 100644 Documentation/arm/stm32/stm32h750-overview.rst
 create mode 100644 arch/arm/boot/dts/stm32h7-pinctrl.dtsi
 create mode 100644 arch/arm/boot/dts/stm32h750-pinctrl.dtsi
 create mode 100644 arch/arm/boot/dts/stm32h750.dtsi
 create mode 100644 arch/arm/boot/dts/stm32h750i-art-pi.dts

--
1.9.1
Re: [PATCH] mm: page_alloc: fix memcg accounting leak in speculative cache lookup
On Fri, 19 Mar 2021, Johannes Weiner wrote: > When the freeing of a higher-order page block (non-compound) races > with a speculative page cache lookup, __free_pages() needs to leave > the first order-0 page in the chunk to the lookup but free the buddy > pages that the lookup doesn't know about separately. > > However, if such a higher-order page is charged to a memcg (e.g. !vmap > kernel stack)), only the first page of the block has page->memcg > set. That means we'll uncharge only one order-0 page from the entire > block, and leak the remainder. > > Add a split_page_memcg() to __free_pages() right before it starts > taking the higher-order page apart and freeing its individual > constituent pages. This ensures all of them will have the memcg > linkage set up for correct uncharging. Also update the comments a bit > to clarify what exactly is happening to the page during that race. > > This bug is old and has its roots in the speculative page cache patch > and adding cgroup accounting of kernel pages. There are no known user > reports. A backport to stable is therefor not warranted. > > Reported-by: Matthew Wilcox > Signed-off-by: Johannes Weiner Acked-by: Hugh Dickins to the split_page_memcg() addition etc, but a doubt just hit me on the original e320d3012d25 ("mm/page_alloc.c: fix freeing non-compound pages"): see comment below. > --- > mm/page_alloc.c | 33 +++-- > 1 file changed, 27 insertions(+), 6 deletions(-) > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index c53fe4fa10bf..f4bd56656402 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -5112,10 +5112,9 @@ static inline void free_the_page(struct page *page, > unsigned int order) > * the allocation, so it is easy to leak memory. Freeing more memory > * than was allocated will probably emit a warning. > * > - * If the last reference to this page is speculative, it will be released > - * by put_page() which only frees the first page of a non-compound > - * allocation. To prevent the remaining pages from being leaked, we free > - * the subsequent pages here. If you want to use the page's reference > + * This function isn't a put_page(). Don't let the put_page_testzero() > + * fool you, it's only to deal with speculative cache references. It > + * WILL free pages directly. If you want to use the page's reference > * count to decide when to free the allocation, you should allocate a > * compound page, and use put_page() instead of __free_pages(). > * > @@ -5124,11 +5123,33 @@ static inline void free_the_page(struct page *page, > unsigned int order) > */ > void __free_pages(struct page *page, unsigned int order) > { > - if (put_page_testzero(page)) > + /* > + * Drop the base reference from __alloc_pages and free. In > + * case there is an outstanding speculative reference, from > + * e.g. the page cache, it will put and free the page later. > + */ > + if (likely(put_page_testzero(page))) { > free_the_page(page, order); > - else if (!PageHead(page)) > + return; > + } > + > + /* > + * The speculative reference will put and free the page. > + * > + * However, if the speculation was into a higher-order page > + * chunk that isn't marked compound, the other side will know > + * nothing about our buddy pages and only free the order-0 > + * page at the start of our chunk! We must split off and free > + * the buddy pages here. > + * > + * The buddy pages aren't individually refcounted, so they > + * can't have any pending speculative references themselves. 
> + */ > + if (!PageHead(page) && order > 0) { The put_page_testzero() has released our reference to the first subpage of page: it's now under the control of the racing speculative lookup. So it seems to me unsafe to be checking PageHead(page) here: if it was actually a compound page, PageHead might already be cleared by now, and we doubly free its tail pages below? I think we need to use a "bool compound = PageHead(page)" on entry to __free_pages(). Or alternatively, it's wrong to call __free_pages() on a compound page anyway, so we should not check PageHead at all, except in a WARN_ON_ONCE(PageCompound(page)) at the start? And would it be wrong to fix that too in this patch? Though it ought then to be backported to 5.10 stable. > + split_page_memcg(page, 1 << order); > while (order-- > 0) > free_the_page(page + (1 << order), order); > + } > } > EXPORT_SYMBOL(__free_pages); > > -- > 2.30.1
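A sketch of the variant Hugh is suggesting (not the committed patch): latch PageHead() before put_page_testzero() can hand the page over to the racing lookup, so a compound head freed by the other side is never mistaken for a non-compound higher-order block:

void __free_pages(struct page *page, unsigned int order)
{
	bool compound = PageHead(page);	/* sample before dropping our ref */

	if (likely(put_page_testzero(page))) {
		free_the_page(page, order);
		return;
	}

	if (!compound && order > 0) {
		split_page_memcg(page, 1 << order);
		while (order-- > 0)
			free_the_page(page + (1 << order), order);
	}
}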
Re: [PATCH 2/3] mm, dax, pmem: Introduce dev_pagemap_failure()
On Thu, Mar 18, 2021 at 12:20:35PM -0700, Dan Williams wrote: > On Wed, Mar 17, 2021 at 9:58 PM Dave Chinner wrote: > > > > On Wed, Mar 17, 2021 at 09:08:23PM -0700, Dan Williams wrote: > > > Jason wondered why the get_user_pages_fast() path takes references on a > > > @pgmap object. The rationale was to protect against accessing a 'struct > > > page' that might be in the process of being removed by the driver, but > > > he rightly points out that should be solved the same way all gup-fast > > > synchronization is solved which is invalidate the mapping and let the > > > gup slow path do @pgmap synchronization [1]. > > > > > > To achieve that it means that new user mappings need to stop being > > > created and all existing user mappings need to be invalidated. > > > > > > For device-dax this is already the case as kill_dax() prevents future > > > faults from installing a pte, and the single device-dax inode > > > address_space can be trivially unmapped. > > > > > > The situation is different for filesystem-dax where device pages could > > > be mapped by any number of inode address_space instances. An initial > > > thought was to treat the device removal event like a drop_pagecache_sb() > > > event that walks superblocks and unmaps all inodes. However, Dave points > > > out that it is not just the filesystem user-mappings that need to react > > > to global DAX page-unmap events, it is also filesystem metadata > > > (proposed DAX metadata access), and other drivers (upstream > > > DM-writecache) that need to react to this event [2]. > > > > > > The only kernel facility that is meant to globally broadcast the loss of > > > a page (via corruption or surprise remove) is memory_failure(). The > > > downside of memory_failure() is that it is a pfn-at-a-time interface. > > > However, the events that would trigger the need to call memory_failure() > > > over a full PMEM device should be rare. > > > > This is a highly suboptimal design. Filesystems only need a single > > callout to trigger a shutdown that unmaps every active mapping in > > the filesystem - we do not need a page-by-page error notification > > which results in 250 million hwposion callouts per TB of pmem to do > > this. > > > > Indeed, the moment we get the first hwpoison from this patch, we'll > > map it to the primary XFS superblock and we'd almost certainly > > consider losing the storage behind that block to be a shut down > > trigger. During the shutdown, the filesystem should unmap all the > > active mappings (we already need to add this to shutdown on DAX > > regardless of this device remove issue) and so we really don't need > > a page-by-page notification of badness. > > XFS doesn't, but what about device-mapper and other agents? Even if > the driver had a callback up the stack memory_failure() still needs to > be able to trigger failures down the stack for CPU consumed poison. If the device is gone, then they don't need page by page notifucation, either. Tell them the entire device is gone so they can do what they need (like pass it up to the filesystem as ranges of badness!). > > AFAICT, it's going to take minutes, maybe hours for do the page-by-page > > iteration to hwposion every page. It's going to take a few seconds > > for the filesystem shutdown to run a device wide invalidation. 
> > > > SO, yeah, I think this should simply be a single ranged call to the > > filesystem like: > > > > ->memory_failure(dev, 0, -1ULL) > > > > to tell the filesystem that the entire backing device has gone away, > > and leave the filesystem to handle failure entirely at the > > filesystem level. > > So I went with memory_failure() after our discussion of all the other > agents in the system that might care about these pfns going offline > and relying on memory_failure() to route down to each of those. I.e. > the "reuse the drop_pagecache_sb() model" idea was indeed > insufficient. Using drop_pagecache_sb is insufficient because filesystems have more than just inode indexed caches that pmem failure may affect. This is not an argument against a "knock everything down at once" notification model, just that drop_pagecache_sb() is ... insufficient to do what we need... > Now I'm trying to reconcile the fact that platform > poison handling will hit memory_failure() first and may not > immediately reach the driver, if ever (see the perennially awkward > firmware-first-mode error handling: ghes_handle_memory_failure()) . So > even if the ->memory_failure(dev...) up call exists there is no > guarantee it can get called for all poison before the memory_failure() > down call happens. Which means regardless of whether > ->memory_failure(dev...) exists memory_failure() needs to be able to > do the right thing. I don't see how a poor implementation of memory_failure in a driver or hardware is even remotely relevant to the interface used to notify the filesystem of a media or device failure. It sounds like you are trying to use memory_failure() for somethin
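For concreteness, the ranged callout Dave is arguing for could be shaped roughly like this; the struct and member names are invented here, and no such interface exists in the tree at this point:

struct dax_holder_operations {
	/* one call covering [offset, offset + len) of the dax device */
	int (*memory_failure)(struct dax_device *dax_dev,
			      u64 offset, u64 len, int flags);
};

static int pmem_notify_device_gone(struct dax_device *dax_dev,
				   const struct dax_holder_operations *ops)
{
	/* whole-device loss, as in Dave's ->memory_failure(dev, 0, -1ULL) */
	return ops->memory_failure(dax_dev, 0, -1ULL, 0);
}

The holder (filesystem, DM target, ...) then decides whether that range maps to metadata, user mappings, or both, and fans the failure out itself.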
Re: [PATCH v11 1/6] kasan: allow an architecture to disable inline instrumentation
On Sat, Mar 20, 2021 at 01:40:53AM +1100, Daniel Axtens wrote:
> For annoying architectural reasons, it's very difficult to support inline
> instrumentation on powerpc64.

I think we can expand here and talk about how, in hash mode, the vmalloc address space is in a region of memory different from where kernel virtual addresses are mapped. Did I recall the reason correctly?

>
> Add a Kconfig flag to allow an arch to disable inline. (It's a bit
> annoying to be 'backwards', but I'm not aware of any way to have
> an arch force a symbol to be 'n', rather than 'y'.)
>
> We also disable stack instrumentation in this case as it does things that
> are functionally equivalent to inline instrumentation, namely adding
> code that touches the shadow directly without going through a C helper.
>
> Signed-off-by: Daniel Axtens
> ---
> lib/Kconfig.kasan | 8 
> 1 file changed, 8 insertions(+)
>
> diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
> index cffc2ebbf185..7e237dbb6df3 100644
> --- a/lib/Kconfig.kasan
> +++ b/lib/Kconfig.kasan
> @@ -12,6 +12,9 @@ config HAVE_ARCH_KASAN_HW_TAGS
> config HAVE_ARCH_KASAN_VMALLOC
> 	bool
>
> +config ARCH_DISABLE_KASAN_INLINE
> +	def_bool n
> +

Some comments on which archs want to disable KASAN inline, and why, would be helpful.

Balbir Singh.
Re: [RFC PATCH v1 0/4] vfio: Add IOPF support for VFIO passthrough
On 3/19/21 9:30 AM, Keqian Zhu wrote: Hi Baolu, On 2021/3/19 8:33, Lu Baolu wrote: On 3/18/21 7:53 PM, Shenming Lu wrote: On 2021/3/18 17:07, Tian, Kevin wrote: From: Shenming Lu Sent: Thursday, March 18, 2021 3:53 PM On 2021/2/4 14:52, Tian, Kevin wrote:>>> In reality, many devices allow I/O faulting only in selective contexts. However, there is no standard way (e.g. PCISIG) for the device to report whether arbitrary I/O fault is allowed. Then we may have to maintain device specific knowledge in software, e.g. in an opt-in table to list devices which allows arbitrary faults. For devices which only support selective faulting, a mediator (either through vendor extensions on vfio-pci-core or a mdev wrapper) might be necessary to help lock down non-faultable mappings and then enable faulting on the rest mappings. For devices which only support selective faulting, they could tell it to the IOMMU driver and let it filter out non-faultable faults? Do I get it wrong? Not exactly to IOMMU driver. There is already a vfio_pin_pages() for selectively page-pinning. The matter is that 'they' imply some device specific logic to decide which pages must be pinned and such knowledge is outside of VFIO. From enabling p.o.v we could possibly do it in phased approach. First handles devices which tolerate arbitrary DMA faults, and then extends to devices with selective-faulting. The former is simpler, but with one main open whether we want to maintain such device IDs in a static table in VFIO or rely on some hints from other components (e.g. PF driver in VF assignment case). Let's see how Alex thinks about it. Hi Kevin, You mentioned selective-faulting some time ago. I still have some doubt about it: There is already a vfio_pin_pages() which is used for limiting the IOMMU group dirty scope to pinned pages, could it also be used for indicating the faultable scope is limited to the pinned pages and the rest mappings is non-faultable that should be pinned and mapped immediately? But it seems to be a little weird and not exactly to what you meant... I will be grateful if you can help to explain further.:-) The opposite, i.e. the vendor driver uses vfio_pin_pages to lock down pages that are not faultable (based on its specific knowledge) and then the rest memory becomes faultable. Ahh... Thus, from the perspective of VFIO IOMMU, if IOPF enabled for such device, only the page faults within the pinned range are valid in the registered iommu fault handler... Isn't it opposite? The pinned pages will never generate any page faults. I might miss some contexts here. It seems that vfio_pin_pages() just pin some pages and record the pinned scope to pfn_list of vfio_dma. No mapping is established, so we still has page faults. Make sense. Thanks a lot for the explanation. IIUC, vfio_pin_pages() is used to 1. pin pages for non-iommu backed devices. 2. mark dirty scope for non-iommu backed devices and iommu backed devices. Best regards, baolu
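As a sketch of the selective-faulting lock-down step discussed above, assuming the vfio_pin_pages() prototype of this era; the device-specific policy of which ranges must never fault is exactly the part that lives outside VFIO:

static int lock_down_nonfaultable(struct device *dev,
				  unsigned long *user_pfns, int npage)
{
	unsigned long phys_pfn;
	int i, ret;

	for (i = 0; i < npage; i++) {
		/* pin and map up front; unpinned ranges stay faultable */
		ret = vfio_pin_pages(dev, &user_pfns[i], 1,
				     IOMMU_READ | IOMMU_WRITE, &phys_pfn);
		if (ret != 1)
			return ret < 0 ? ret : -EFAULT;
	}

	return 0;
}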
Re: [unixbhas...@gmail.com: [PATCH] lib: Fix a typo]
On Sat, 20 Mar 2021, Bhaskar Chowdhury wrote: - Forwarded message from Bhaskar Chowdhury - Hi Dave, Can you please take it, as I haven't found anybody attached with this file,but found you made some entry(I am trying to take advantage of you!! :) ) .. Thanks, Bhaskar Date: Sat, 20 Mar 2021 02:31:55 +0530 From: Bhaskar Chowdhury To: unixbhas...@gmail.com, linux-kernel@vger.kernel.org Subject: [PATCH] lib: Fix a typo X-Mailer: git-send-email 2.26.2 s/funtion/function/ Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap --- lib/list_sort.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/list_sort.c b/lib/list_sort.c index 52f0c258c895..282fe269f16a 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -107,7 +107,7 @@ static void merge_final(void *priv, cmp_func cmp, struct list_head *head, * @head: the list to sort * @cmp: the elements comparison function * - * The comparison funtion @cmp must return > 0 if @a should sort after + * The comparison function @cmp must return > 0 if @a should sort after * @b ("@a > @b" if you want an ascending sort), and <= 0 if @a should * sort before @b *or* their original order should be preserved. It is * always called with the element that came first in the input in @a, -- 2.26.2 - End forwarded message -
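As a usage illustration of the contract documented in the comment above, a comparison callback for an ascending sort could look like this; the element struct is hypothetical, and the signature matches the list_sort() of this time:

struct item {
	struct list_head list;
	int key;
};

static int item_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	const struct item *ia = list_entry(a, struct item, list);
	const struct item *ib = list_entry(b, struct item, list);

	/* > 0: @a sorts after @b; <= 0: @a stays before @b (stable) */
	return ia->key > ib->key ? 1 : ia->key < ib->key ? -1 : 0;
}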
Re: [PATCH v11 0/6] KASAN for powerpc64 radix
On Sat, Mar 20, 2021 at 01:40:52AM +1100, Daniel Axtens wrote: > Building on the work of Christophe, Aneesh and Balbir, I've ported > KASAN to 64-bit Book3S kernels running on the Radix MMU. > > v11 applies to next-20210317. I had hoped to have it apply to > powerpc/next but once again there are changes in the kasan core that > clash. Also, thanks to mpe for fixing a build break with KASAN off. > > I'm not sure how best to progress this towards actually being merged > when it has impacts across subsystems. I'd appreciate any input. Maybe > the first four patches could go in via the kasan tree, that should > make things easier for powerpc in a future cycle? > > v10 rebases on top of next-20210125, fixing things up to work on top > of the latest changes, and fixing some review comments from > Christophe. I have tested host and guest with 64k pages for this spin. > > There is now only 1 failing KUnit test: kasan_global_oob - gcc puts > the ASAN init code in a section called '.init_array'. Powerpc64 module > loading code goes through and _renames_ any section beginning with > '.init' to begin with '_init' in order to avoid some complexities > around our 24-bit indirect jumps. This means it renames '.init_array' > to '_init_array', and the generic module loading code then fails to > recognise the section as a constructor and thus doesn't run it. This > hack dates back to 2003 and so I'm not going to try to unpick it in > this series. (I suspect this may have previously worked if the code > ended up in .ctors rather than .init_array but I don't keep my old > binaries around so I have no real way of checking.) > > (The previously failing stack tests are now skipped due to more > accurate configuration settings.) > > Details from v9: This is a significant reworking of the previous > versions. Instead of the previous approach which supported inline > instrumentation, this series provides only outline instrumentation. > > To get around the problem of accessing the shadow region inside code we run > with translations off (in 'real mode'), we we restrict checking to when > translations are enabled. This is done via a new hook in the kasan core and > by excluding larger quantites of arch code from instrumentation. The upside > is that we no longer require that you be able to specify the amount of > physically contiguous memory on the system at compile time. Hopefully this > is a better trade-off. More details in patch 6. > > kexec works. Both 64k and 4k pages work. Running as a KVM host works, but > nothing in arch/powerpc/kvm is instrumented. It's also potentially a bit > fragile - if any real mode code paths call out to instrumented code, things > will go boom. > The last time I checked, the changes for real mode, made the code hard to review/maintain. I am happy to see that we've decided to leave that off the table for now, reviewing the series Balbir Singh.
Re: [PATCH] mm: kmemleak: Fix a typo
On Sat, 20 Mar 2021, Bhaskar Chowdhury wrote: s/interruptable/interruptible/ Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index c0014d3b91c1..f09f1a9402a9 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1202,7 +1202,7 @@ static void update_refs(struct kmemleak_object *object) } /* - * Memory scanning is a long process and it needs to be interruptable. This + * Memory scanning is a long process and it needs to be interruptible. This * function checks whether such interrupt condition occurred. */ static int scan_should_stop(void) -- 2.26.2
Re: [PATCH] arm64: cpuinfo: Fix a typo
On Sat, 20 Mar 2021, Bhaskar Chowdhury wrote: s/acurate/accurate/ Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap --- arch/arm64/kernel/cpuinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 77605aec25fe..51fcf99d5351 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -353,7 +353,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) * with the CLIDR_EL1 fields to avoid triggering false warnings * when there is a mismatch across the CPUs. Keep track of the * effective value of the CTR_EL0 in our internal records for -* acurate sanity check and feature enablement. +* accurate sanity check and feature enablement. */ info->reg_ctr = read_cpuid_effective_cachetype(); info->reg_dczid = read_cpuid(DCZID_EL0); -- 2.26.2
drivers/media/platform/allegro-dvt/allegro-core.c:3206:34: warning: unused variable 'allegro_dt_ids'
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master
head:   8b12a62a4e3ed4ae99c715034f557eb391d6b196
commit: d74d4e2359ec7985831192f9b5ee22ed5e55b81c media: allegro: move driver out of staging
date:   3 months ago
config: x86_64-randconfig-a002-20210320 (attached as .config)
compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project fcc1ce00931751ac02498986feb37744e9ace8de)
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install x86_64 cross compiling tool for clang build
        # apt-get install binutils-x86-64-linux-gnu
        # https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d74d4e2359ec7985831192f9b5ee22ed5e55b81c
        git remote add linus https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
        git fetch --no-tags linus master
        git checkout d74d4e2359ec7985831192f9b5ee22ed5e55b81c
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot

All warnings (new ones prefixed by >>):

>> drivers/media/platform/allegro-dvt/allegro-core.c:3206:34: warning: unused variable 'allegro_dt_ids' [-Wunused-const-variable]
   static const struct of_device_id allegro_dt_ids[] = {
                                     ^
   1 warning generated.

vim +/allegro_dt_ids +3206 drivers/media/platform/allegro-dvt/allegro-core.c

f20387dfd06569 drivers/staging/media/allegro-dvt/allegro-core.c Michael Tretter 2019-05-28  3205
f20387dfd06569 drivers/staging/media/allegro-dvt/allegro-core.c Michael Tretter 2019-05-28 @3206  static const struct of_device_id allegro_dt_ids[] = {
f20387dfd06569 drivers/staging/media/allegro-dvt/allegro-core.c Michael Tretter 2019-05-28  3207  	{ .compatible = "allegro,al5e-1.1" },
f20387dfd06569 drivers/staging/media/allegro-dvt/allegro-core.c Michael Tretter 2019-05-28  3208  	{ /* sentinel */ }
f20387dfd06569 drivers/staging/media/allegro-dvt/allegro-core.c Michael Tretter 2019-05-28  3209  };
f20387dfd06569 drivers/staging/media/allegro-dvt/allegro-core.c Michael Tretter 2019-05-28  3210

:: The code at line 3206 was first introduced by commit
:: f20387dfd065693ba7ea2788a2f893bf653c9cb8 media: allegro: add Allegro DVT video IP core driver

:: TO: Michael Tretter
:: CC: Mauro Carvalho Chehab

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org
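Two conventional remedies for this class of warning, shown as sketches; not necessarily the fix the driver ended up with:

/* 1) compile the table out together with of_match_ptr(): */
#ifdef CONFIG_OF
static const struct of_device_id allegro_dt_ids[] = {
	{ .compatible = "allegro,al5e-1.1" },
	{ /* sentinel */ }
};
MODULE_DEVICE_TABLE(of, allegro_dt_ids);
#endif

/* 2) or keep the table unconditional but annotate it (a distinct name
 * is used here only so both variants can appear side by side): */
static const struct of_device_id allegro_dt_ids_alt[] __maybe_unused = {
	{ .compatible = "allegro,al5e-1.1" },
	{ /* sentinel */ }
};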
Re: [PATCH] ipc/sem.c: Couple of spelling fixes
On Sat, 20 Mar 2021, Bhaskar Chowdhury wrote: s/purpuse/purpose/ s/seperately/separately/ Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap --- ipc/sem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ipc/sem.c b/ipc/sem.c index f6c30a85dadf..780d75eff8c6 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -786,7 +786,7 @@ static inline void wake_up_sem_queue_prepare(struct sem_queue *q, int error, { get_task_struct(q->sleeper); - /* see SEM_BARRIER_2 for purpuse/pairing */ + /* see SEM_BARRIER_2 for purpose/pairing */ smp_store_release(&q->status, error); wake_q_add_safe(wake_q, q->sleeper); @@ -821,7 +821,7 @@ static inline int check_restart(struct sem_array *sma, struct sem_queue *q) /* It is impossible that someone waits for the new value: * - complex operations always restart. -* - wait-for-zero are handled seperately. +* - wait-for-zero are handled separately. * - q is a previously sleeping simple operation that * altered the array. It must be a decrement, because * simple increments never sleep. -- 2.26.2
Re: [PATCH] binfmt_misc: Trivial spello fix
On Sat, 20 Mar 2021, Bhaskar Chowdhury wrote: s/delimeter/delimiter/ Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap --- Al, please don't fret over this trivialities. I am trying to make sense the change I am making. fs/binfmt_misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index e1eae7ea823a..1e4a59af41eb 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -297,7 +297,7 @@ static Node *create_entry(const char __user *buffer, size_t count) if (copy_from_user(buf, buffer, count)) goto efault; - del = *p++; /* delimeter */ + del = *p++; /* delimiter */ pr_debug("register: delim: %#x {%c}\n", del, del); -- 2.26.2
Re: [PATCH] perf/builtin-c2c.c: Fix a punctuation
On Sat, 20 Mar 2021, Bhaskar Chowdhury wrote: s/dont/don\'t/ Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap --- tools/perf/builtin-c2c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index e3b9d63077ef..af1c1b89e769 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -1813,7 +1813,7 @@ static int hpp_list__parse(struct perf_hpp_list *hpp_list, perf_hpp__setup_output_field(hpp_list); /* -* We dont need other sorting keys other than those +* We don't need other sorting keys other than those * we already specified. It also really slows down * the processing a lot with big number of output * fields, so switching this off for c2c. -- 2.26.2
Re: [PATCH 2/2] usb: dwc3: gadget: Ignore EP queue requests during bus reset
On 3/19/2021 5:40 PM, Thinh Nguyen wrote: > Hi, > > Wesley Cheng wrote: >> The current dwc3_gadget_reset_interrupt() will stop any active >> transfers, but only addresses blocking of EP queuing for while we are >> coming from a disconnected scenario, i.e. after receiving the disconnect >> event. If the host decides to issue a bus reset on the device, the >> connected parameter will still be set to true, allowing for EP queuing >> to continue while we are disabling the functions. To avoid this, set the >> connected flag to false until the stop active transfers is complete. >> >> Signed-off-by: Wesley Cheng >> --- >> drivers/usb/dwc3/gadget.c | 9 + >> 1 file changed, 9 insertions(+) >> >> diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c >> index 6e14fdc..d5ed0f69 100644 >> --- a/drivers/usb/dwc3/gadget.c >> +++ b/drivers/usb/dwc3/gadget.c >> @@ -3327,6 +3327,15 @@ static void dwc3_gadget_reset_interrupt(struct dwc3 >> *dwc) >> u32 reg; >> >> /* >> + * Ideally, dwc3_reset_gadget() would trigger the function >> + * drivers to stop any active transfers through ep disable. >> + * However, for functions which defer ep disable, such as mass >> + * storage, we will need to rely on the call to stop active >> + * transfers here, and avoid allowing of request queuing. >> + */ >> +dwc->connected = false; >> + >> +/* >> * WORKAROUND: DWC3 revisions <1.88a have an issue which >> * would cause a missing Disconnect Event if there's a >> * pending Setup Packet in the FIFO. >> > > This doesn't look right. Did you have rebase issue with your local > change again? > > BR, > Thinh > Hi Thinh, This was rebased on Greg's usb-linus branch, which has commit f09ddcfcb8c5 ("usb: dwc3: gadget: Prevent EP queuing while stopping transfers") merged. commit f09ddcfcb8c5 moved the dwc->connected = true to after we have finished stop active transfers. However, this change will also ensure that the connected flag is set to false to ensure that when we call stop active transfers, nothing can prepare TRBs. (previous commit only addresses the case where we get the reset interrupt when coming from a disconnected state) Thanks Wesley Cheng -- The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project
Re: [PATCH -tip v3 07/11] ia64: Add instruction_pointer_set() API
On Fri, 19 Mar 2021 21:23:01 +0900 Masami Hiramatsu wrote:

> Add instruction_pointer_set() API for ia64.
>
> Signed-off-by: Masami Hiramatsu
> ---
> arch/ia64/include/asm/ptrace.h |8 +++-
> 1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h
> index b3aa46090101..e382f1a6bff3 100644
> --- a/arch/ia64/include/asm/ptrace.h
> +++ b/arch/ia64/include/asm/ptrace.h
> @@ -45,6 +45,7 @@
> #include
> #include
>
> +# define ia64_psr(regs) ((struct ia64_psr *) &(regs)->cr_ipsr)
> /*
>  * We use the ia64_psr(regs)->ri to determine which of the three
>  * instructions in bundle (16 bytes) took the sample. Generate
> @@ -71,6 +72,12 @@ static inline long regs_return_value(struct pt_regs *regs)
> 	return -regs->r8;
> }
>
> +static inline void instruction_pointer_set(struct pt_regs *regs, unsigned long val)
> +{
> +	ia64_psr(regs)->ri = (val & 0xf);
> +	regs->cr_iip = (val & ~0xfULL);
> +}

Oops, this caused a build error. Thanks to the kernel test bot.

It seems that all code accessing "struct ia64_psr" in asm/ptrace.h has to be a macro, because "struct ia64_psr" is defined in asm/processor.h, which includes asm/ptrace.h (for pt_regs?). If the code is defined as an inline function, "struct ia64_psr" is evaluated at that point, which causes the build error:

arch/ia64/include/asm/ptrace.h:77:16: error: dereferencing pointer to incomplete type 'struct ia64_psr'

But macro evaluation is postponed until the macro is used...

Let me update it.

Thank you,

--
Masami Hiramatsu
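One way the macro form could look, as a sketch of the fix Masami describes (the actual respin may differ): the ia64_psr() dereference is then only expanded at the use site, after asm/processor.h has defined struct ia64_psr.

#define instruction_pointer_set(regs, val)		\
({							\
	unsigned long __ip = (val);			\
	ia64_psr(regs)->ri = (__ip & 0xf);		\
	(regs)->cr_iip = (__ip & ~0xfULL);		\
})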
RE: [PATCH v5 2/3] x86/bus_lock: Handle #DB for bus lock
On Fri, Mar 19 2021 at 21:50, Tony Luck wrote: >> What is the justifucation for making this rate limit per UID and not >> per task, per process or systemwide? > > The concern is that a malicious user is running a workload that loops > obtaining the buslock. This brings the whole system to its knees. > > Limiting per task doesn't help. The user can just fork(2) a whole bunch > of tasks for a distributed buslock attack.. Fair enough. > Systemwide might be an interesting alternative. Downside would be accidental > rate limit of non-malicious tasks that happen to grab a bus lock periodically > but in the same window with other buslocks from other users. > > Do you think that a risk worth taking to make the code simpler? I'd consider it low risk, but I just looked for the usage of the existing ratelimit in struct user and the related commit. Nw it's dawns on me where you are coming from. So that seems to become a pattern ... so the uncompiled thing below might solve that. Yes, it makes the efivars thingy slower, but do we care? We neither care about efivars performance nor about the buslock performance. But I pretty much care about no having to stare at code which gets the fundamental refcounting wrong. Thanks, tglx --- fs/efivarfs/Kconfig |1 + fs/efivarfs/file.c| 11 +-- include/linux/ratelimit.h |1 + include/linux/ratelimit_uid.h | 26 ++ include/linux/sched/user.h|4 ++-- kernel/user.c |7 --- lib/Kconfig |3 +++ lib/ratelimit.c | 41 + 8 files changed, 87 insertions(+), 7 deletions(-) --- a/fs/efivarfs/Kconfig +++ b/fs/efivarfs/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config EFIVAR_FS tristate "EFI Variable filesystem" + select UID_RATELIMIT depends on EFI default m help --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -63,6 +63,14 @@ static ssize_t efivarfs_file_write(struc return bytes; } +static const struct uid_ratelimit_cfg efivars_rl = { + .which = UID_RATELIMIT_EFIVARS, + .interval = HZ, + .burst = 100, + .flags = RATELIMIT_MSG_ON_RELEASE, + .delay = 50, +}; + static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { @@ -73,8 +81,7 @@ static ssize_t efivarfs_file_read(struct ssize_t size = 0; int err; - while (!__ratelimit(&file->f_cred->user->ratelimit)) - msleep(50); + uid_ratelimit(&efivars_rl); err = efivar_entry_size(var, &datasize); --- a/include/linux/ratelimit.h +++ b/include/linux/ratelimit.h @@ -3,6 +3,7 @@ #define _LINUX_RATELIMIT_H #include +#include #include #include --- /dev/null +++ b/include/linux/ratelimit_uid.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_RATELIMIT_UID_H +#define _LINUX_RATELIMIT_UID_H + +/* Per UID ratelimits */ +enum uid_ratelimits { +#ifdef CONFIG_EFIVAR_FS + UID_RATELIMIT_EFIVARS, +#endif + UID_RATELIMIT_MAX, +}; + +#define UID_RATELIMIT_NODELAY ULONG_MAX + +struct uid_ratelimit_cfg { + enum uid_ratelimits which; + int interval; + int burst; + unsigned long flags; + unsigned long delay; +}; + +extern int __uid_ratelimit(const struct uid_ratelimit_cfg *cfg, const void *func); +#define uid_ratelimit(cfg) __uid_ratelimit(cfg, __func__) + +#endif --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -40,8 +40,8 @@ struct user_struct { atomic_t nr_watches;/* The number of watches this user currently has */ #endif - /* Miscellaneous per-user rate limit */ - struct ratelimit_state ratelimit; + /* Miscellaneous per-user rate limits storage */ + struct ratelimit_state *ratelimits[UID_RATELIMIT_MAX]; }; extern int uids_sysfs_init(void); --- 
a/kernel/user.c +++ b/kernel/user.c @@ -102,7 +102,6 @@ struct user_struct root_user = { .sigpending = ATOMIC_INIT(0), .locked_shm = 0, .uid= GLOBAL_ROOT_UID, - .ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0), }; /* @@ -139,8 +138,12 @@ static struct user_struct *uid_hash_find static void free_user(struct user_struct *up, unsigned long flags) __releases(&uidhash_lock) { + unsigned int i; + uid_hash_remove(up); spin_unlock_irqrestore(&uidhash_lock, flags); + for (i = 0; i < UID_RATELIMIT_MAX; i++) + kfree(up->ratelimits[i]); kmem_cache_free(uid_cachep, up); } @@ -188,8 +191,6 @@ struct user_struct *alloc_uid(kuid_t uid new->uid = uid; refcount_set(&new->__count, 1); - ratelimit_state_init
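To tie this back to the bus-lock rate limit the thread started from, a consumer of the sketched facility might look like the following; the UID_RATELIMIT_BUSLOCK enum entry and all numbers are invented for illustration and would need to be added to ratelimit_uid.h:

static const struct uid_ratelimit_cfg buslock_rl = {
	.which		= UID_RATELIMIT_BUSLOCK,
	.interval	= HZ,
	.burst		= 5,
	.flags		= RATELIMIT_MSG_ON_RELEASE,
	.delay		= UID_RATELIMIT_NODELAY,	/* #DB context: count, never sleep */
};

static void handle_bus_lock(struct pt_regs *regs)
{
	uid_ratelimit(&buslock_rl);
}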
[tip:x86/cleanups] BUILD SUCCESS 21d6a7dcbfba5e7b31f4e9d555a9be362578bfc3
allyesconfig
mips       allmodconfig
powerpc    allyesconfig
powerpc    allmodconfig
powerpc    allnoconfig
i386       randconfig-a001-20210318
i386       randconfig-a005-20210318
i386       randconfig-a003-20210318
i386       randconfig-a002-20210318
i386       randconfig-a006-20210318
i386       randconfig-a004-20210318
i386       randconfig-a001-20210319
i386       randconfig-a005-20210319
i386       randconfig-a003-20210319
i386       randconfig-a002-20210319
i386       randconfig-a006-20210319
i386       randconfig-a004-20210319
x86_64     randconfig-a011-20210318
x86_64     randconfig-a016-20210318
x86_64     randconfig-a013-20210318
x86_64     randconfig-a015-20210318
x86_64     randconfig-a014-20210318
x86_64     randconfig-a012-20210318
i386       randconfig-a013-20210318
i386       randconfig-a016-20210318
i386       randconfig-a011-20210318
i386       randconfig-a014-20210318
i386       randconfig-a015-20210318
i386       randconfig-a012-20210318
riscv      nommu_k210_defconfig
riscv      nommu_virt_defconfig
riscv      allnoconfig
riscv      defconfig
x86_64     rhel-7.6-kselftests
x86_64     defconfig
x86_64     rhel-8.3
x86_64     rhel-8.3-kbuiltin
x86_64     kexec

clang tested configs:
x86_64     randconfig-a006-20210318
x86_64     randconfig-a001-20210318
x86_64     randconfig-a005-20210318
x86_64     randconfig-a002-20210318
x86_64     randconfig-a003-20210318
x86_64     randconfig-a004-20210318

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org
[tip:locking/core] BUILD SUCCESS 8c52cca04f97a4c09ec2f0bd8fe6d0cdf49834e4
m68k defconfig nds32 defconfig nios2 allyesconfig csky defconfig alpha defconfig alpha allyesconfig xtensa allyesconfig h8300 allyesconfig arc defconfig nios2 defconfig arc allyesconfig parisc defconfig s390 allyesconfig s390 allmodconfig parisc allyesconfig s390 defconfig sparc allyesconfig sparc defconfig i386 tinyconfig i386 defconfig mips allyesconfig mips allmodconfig powerpc allyesconfig powerpc allmodconfig powerpc allnoconfig i386 randconfig-a001-20210318 i386 randconfig-a005-20210318 i386 randconfig-a003-20210318 i386 randconfig-a002-20210318 i386 randconfig-a006-20210318 i386 randconfig-a004-20210318 i386 randconfig-a001-20210319 i386 randconfig-a005-20210319 i386 randconfig-a003-20210319 i386 randconfig-a002-20210319 i386 randconfig-a006-20210319 i386 randconfig-a004-20210319 x86_64 randconfig-a011-20210318 x86_64 randconfig-a016-20210318 x86_64 randconfig-a013-20210318 x86_64 randconfig-a015-20210318 x86_64 randconfig-a014-20210318 x86_64 randconfig-a012-20210318 i386 randconfig-a013-20210318 i386 randconfig-a016-20210318 i386 randconfig-a011-20210318 i386 randconfig-a014-20210318 i386 randconfig-a015-20210318 i386 randconfig-a012-20210318 riscv nommu_k210_defconfig riscv nommu_virt_defconfig riscv allnoconfig riscv defconfig x86_64 rhel-7.6-kselftests x86_64 defconfig x86_64 rhel-8.3 x86_64 rhel-8.3-kbuiltin x86_64 kexec clang tested configs: x86_64 randconfig-a006-20210318 x86_64 randconfig-a001-20210318 x86_64 randconfig-a005-20210318 x86_64 randconfig-a002-20210318 x86_64 randconfig-a003-20210318 x86_64 randconfig-a004-20210318 --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org
Re: [PATCH 2/2] usb: dwc3: gadget: Ignore EP queue requests during bus reset
Hi, Wesley Cheng wrote: > The current dwc3_gadget_reset_interrupt() will stop any active > transfers, but only addresses blocking of EP queuing while we are > coming from a disconnected scenario, i.e. after receiving the disconnect > event. If the host decides to issue a bus reset on the device, the > connected parameter will still be set to true, allowing for EP queuing > to continue while we are disabling the functions. To avoid this, set the > connected flag to false until stopping of active transfers is complete. > > Signed-off-by: Wesley Cheng > --- > drivers/usb/dwc3/gadget.c | 9 + > 1 file changed, 9 insertions(+) > > diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c > index 6e14fdc..d5ed0f69 100644 > --- a/drivers/usb/dwc3/gadget.c > +++ b/drivers/usb/dwc3/gadget.c > @@ -3327,6 +3327,15 @@ static void dwc3_gadget_reset_interrupt(struct dwc3 > *dwc) > u32 reg; > > /* > + * Ideally, dwc3_reset_gadget() would trigger the function > + * drivers to stop any active transfers through ep disable. > + * However, for functions which defer ep disable, such as mass > + * storage, we will need to rely on the call to stop active > + * transfers here, and avoid allowing of request queuing. > + */ > + dwc->connected = false; > + > + /* >* WORKAROUND: DWC3 revisions <1.88a have an issue which >* would cause a missing Disconnect Event if there's a >* pending Setup Packet in the FIFO. > This doesn't look right. Did you have a rebase issue with your local change again? BR, Thinh
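For context, a minimal sketch of the guard this flag enables in the queue path. This is a simplification, not the literal dwc3 queue function; only dwc->connected comes from the patch above, and the wrapper name is invented for illustration.

/*
 * Simplified sketch: with dwc->connected cleared at the top of the
 * bus-reset handler, request submission can be refused while active
 * transfers are being stopped.
 */
static int ep_queue_sketch(struct dwc3_ep *dep, struct dwc3_request *req)
{
	struct dwc3 *dwc = dep->dwc;

	if (!dwc->connected)
		return -ESHUTDOWN;	/* disconnect or bus reset in progress */

	return __dwc3_gadget_ep_queue(dep, req);
}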
Re: [PATCH 2/2] mm: memcontrol: deprecate swapaccounting=0 mode
On Fri, 19 Mar 2021, Johannes Weiner wrote: > The swapaccounting= commandline option already does very little > today. To close a trivial containment failure case, the swap ownership > tracking part of the swap controller has recently become mandatory > (see commit 2d1c498072de ("mm: memcontrol: make swap tracking an > integral part of memory control") for details), which makes up the > majority of the work during swapout, swapin, and the swap slot map. > > The only thing left under this flag is the page_counter operations and > the visibility of the swap control files in the first place, which are > rather meager savings. There also aren't many scenarios, if any, where > controlling the memory of a cgroup while allowing it unlimited access > to a global swap space is a workable resource isolation strategy. > > On the other hand, there have been several bugs and confusion around > the many possible swap controller states (cgroup1 vs cgroup2 behavior, > memory accounting without swap accounting, memcg runtime disabled). > > This puts the maintenance overhead of retaining the toggle above its > practical benefits. Deprecate it. > > Suggested-by: Shakeel Butt > Signed-off-by: Johannes Weiner This crashes, and needs a fix: see below (plus some nits). But it's a very welcome cleanup: just getting rid of all those !cgroup_memory_noswap double negatives is a relief in itself. It does suggest eliminating CONFIG_MEMCG_SWAP altogether (just using #ifdef CONFIG_SWAP instead, in those parts of CONFIG_MEMCG code); but you're right that's a separate cleanup, and not nearly so worthwhile as this one (I notice CONFIG_MEMCG_SWAP in some of the arch defconfigs, and don't know whether whoever removes CONFIG_MEMCG_SWAP would be obligated to remove those too). > --- > .../admin-guide/kernel-parameters.txt | 5 -- > include/linux/memcontrol.h| 4 -- > mm/memcontrol.c | 48 ++- > 3 files changed, 15 insertions(+), 42 deletions(-) > > diff --git a/Documentation/admin-guide/kernel-parameters.txt > b/Documentation/admin-guide/kernel-parameters.txt > index 942bbef8f128..986d45dd8c37 100644 > --- a/Documentation/admin-guide/kernel-parameters.txt > +++ b/Documentation/admin-guide/kernel-parameters.txt > @@ -5322,11 +5322,6 @@ > This parameter controls use of the Protected > Execution Facility on pSeries. > > - swapaccount=[0|1] > - [KNL] Enable accounting of swap in memory resource > - controller if no parameter or 1 is given or disable > - it if 0 is given (See > Documentation/admin-guide/cgroup-v1/memory.rst) > - > swiotlb=[ARM,IA-64,PPC,MIPS,X86] > Format: { | force | noforce } >-- Number of I/O TLB slabs > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index 4064c9dda534..ef9613538d36 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -874,10 +874,6 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct > task_struct *victim, > struct mem_cgroup *oom_domain); > void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); > > -#ifdef CONFIG_MEMCG_SWAP > -extern bool cgroup_memory_noswap; > -#endif > - > void lock_page_memcg(struct page *page); > void unlock_page_memcg(struct page *page); > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 49bdcf603af1..b036c4fb0fa7 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -85,13 +85,6 @@ static bool cgroup_memory_nosocket; > /* Kernel memory accounting disabled? 
*/ > static bool cgroup_memory_nokmem; > > -/* Whether the swap controller is active */ > -#ifdef CONFIG_MEMCG_SWAP > -bool cgroup_memory_noswap __read_mostly; > -#else > -#define cgroup_memory_noswap 1 > -#endif > - > #ifdef CONFIG_CGROUP_WRITEBACK > static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); > #endif > @@ -99,7 +92,11 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); > /* Whether legacy memory+swap accounting is active */ > static bool do_memsw_account(void) > { > - return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && > !cgroup_memory_noswap; > + /* cgroup2 doesn't do mem+swap accounting */ > + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) > + return false; > + > + return true; Nit: I'm not fond of the "if (boolean()) return true; else return false;" codestyle, and would prefer the straightforward return !cgroup_subsys_on_dfl(memory_cgrp_subsys); but you've chosen otherwise, so, okay. > } > > #define THRESHOLDS_EVENTS_TARGET 128 > @@ -7019,7 +7016,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t > entry) > if (!mem_cgroup_is_root(memcg)) > page_counter_uncharge(&memcg->memory, nr_entries); > > - if (!cgroup_memory_noswap && memcg != swap_me
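Written out, the one-line form Hugh prefers would read as follows (this simply spells out his suggestion; it is not a posted revision of the patch):

/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
	/* cgroup2 doesn't do mem+swap accounting */
	return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
}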
Re: [PATCH v3] mm/gup: check page poison status for coredump.
On Fri, Mar 19, 2021 at 10:44:37AM +0800, Aili Yao wrote: > +++ b/mm/gup.c > @@ -1536,6 +1536,10 @@ struct page *get_dump_page(unsigned long addr) > FOLL_FORCE | FOLL_DUMP | FOLL_GET); > if (locked) > mmap_read_unlock(mm); > + > + if (ret == 1 && is_page_poisoned(page)) > + return NULL; > + > return (ret == 1) ? page : NULL; > } > #endif /* CONFIG_ELF_CORE */ > diff --git a/mm/internal.h b/mm/internal.h > index 25d2b2439..902d993 100644 > --- a/mm/internal.h > +++ b/mm/internal.h > @@ -97,6 +97,27 @@ static inline void set_page_refcounted(struct page *page) > set_page_count(page, 1); > } > > +/* > + * When the kernel touches a user page, the page may have been marked > + * poisoned while still being mapped in user space. If the kernel can > + * guarantee data integrity and operation success without this page, it > + * is better to check the poison status and avoid touching the page than > + * to panic; a coredump taken on a fatal process signal is one case > + * matching this scenario. If the kernel cannot guarantee data integrity, > + * it is better not to call this function and to let the kernel touch the > + * poison page and panic. > + */ > +static inline bool is_page_poisoned(struct page *page) > +{ > + if (page != NULL) { Why are you checking page for NULL here? How can it possibly be NULL? > + if (PageHWPoison(page)) > + return true; > + else if (PageHuge(page) && PageHWPoison(compound_head(page))) > + return true; > + } > + return 0; > +} > + > extern unsigned long highest_memmap_pfn; > > /* > -- > 1.8.3.1 > >
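For reference, a version with the review comment applied might look like the sketch below. This is not the merged helper: the NULL check goes away because get_dump_page() only calls it when a page was actually returned, and a bool function should return false rather than 0.

/*
 * Sketch with the review addressed: callers guarantee @page is
 * non-NULL, and the function returns bool consistently.
 */
static inline bool is_page_poisoned(struct page *page)
{
	if (PageHWPoison(page))
		return true;
	if (PageHuge(page) && PageHWPoison(compound_head(page)))
		return true;
	return false;
}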
Re: [RFC] btrfs: Allow read-only mount with corrupted extent tree
On 2021/3/19 at 11:34 PM, Dāvis Mosāns wrote: On Thursday, 18 March 2021 at 01:49, Qu Wenruo () wrote: On 2021/3/18 at 5:03 AM, Dāvis Mosāns wrote: On Wednesday, 17 March 2021 at 12:28, Qu Wenruo () wrote: On 2021/3/17 at 9:29 AM, Dāvis Mosāns wrote: On Wednesday, 17 March 2021 at 03:18, Dāvis Mosāns () wrote: Currently if there's any corruption at all in the extent tree (e.g. even a single bit) then mounting will fail with: "failed to read block groups: -5" (-EIO) It happens because we immediately abort on the first error when searching the extent tree for block groups. Now with this patch, if the `ignorebadroots` option is specified, we handle such a case and continue by removing already created block groups and creating dummy block groups. Signed-off-by: Dāvis Mosāns --- fs/btrfs/block-group.c | 14 ++ fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/disk-io.h | 2 ++ 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 48ebc106a606..827a977614b3 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2048,6 +2048,20 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) ret = check_chunk_block_group_mappings(info); error: btrfs_free_path(path); + + if (ret == -EIO && btrfs_test_opt(info, IGNOREBADROOTS)) { + btrfs_put_block_group_cache(info); + btrfs_stop_all_workers(info); + btrfs_free_block_groups(info); + ret = btrfs_init_workqueues(info, NULL); + if (ret) + return ret; + ret = btrfs_init_space_info(info); + if (ret) + return ret; + return fill_dummy_bgs(info); When we hit bad things in the extent tree, we should ensure we're mounting the fs RO, or we can't continue. And we should also refuse to mount back to RW if we hit such a case, so that we don't need anything complex; just ignore the whole extent tree and create the dummy block groups. That's what we're doing here: `ignorebadroots` implies an RO mount, and without specifying it the fs doesn't mount at all. This isn't that nice, but I don't really know how to properly clean up everything related to already created block groups so this was the easiest way. It seems to work fine. But it looks like we need to do something about log replay as well, because if it's not disabled then it fails with: [ 1397.246869] BTRFS info (device sde): start tree-log replay [ 1398.218685] BTRFS warning (device sde): sde checksum verify failed on 21057127661568 wanted 0xd1506ed9 found 0x22ab750a level 0 [ 1398.218803] BTRFS warning (device sde): sde checksum verify failed on 21057127661568 wanted 0xd1506ed9 found 0x7dd54bb9 level 0 [ 1398.218813] BTRFS: error (device sde) in __btrfs_free_extent:3054: errno=-5 IO failure [ 1398.218828] BTRFS: error (device sde) in btrfs_run_delayed_refs:2124: errno=-5 IO failure [ 1398.219002] BTRFS: error (device sde) in btrfs_replay_log:2254: errno=-5 IO failure (Failed to recover log tree) [ 1398.229048] BTRFS error (device sde): open_ctree failed This is because we shouldn't allow anything to be written to the fs if we have anything wrong in the extent tree. This is happening when mounting read-only. My assumption is that it only tries to replay in memory without writing anything to disk. We lack the check on the log tree. Normally for such a forced RO mount, log replay is not allowed. We should output a warning to prompt the user to use nologreplay, and reject the mount. I'm not familiar with log replay but couldn't there be something useful (ignoring ref counts) that would still be worth replaying in memory? Log replay means metadata write. 
Any write needs a valid extent tree to find out free space for new metadata/data. So no, we can't do anything but completely ignore the log. Thanks, Qu
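A sketch of the mount-time rejection Qu describes might look like the fragment below, placed in open_ctree(). This is uncompiled and not from a posted patch; the helper names follow existing btrfs conventions (btrfs_super_log_root(), btrfs_test_opt(), btrfs_err()) but the exact check is an assumption.

/*
 * Uncompiled sketch: if the extent tree is being ignored, a log tree
 * must not be replayed, since replay writes metadata and needs a
 * valid extent tree to allocate from.
 */
if (btrfs_super_log_root(fs_info->super_copy) &&
    btrfs_test_opt(fs_info, IGNOREBADROOTS) &&
    !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
	btrfs_err(fs_info,
		  "cannot replay log with a corrupted extent tree, mount with rescue=nologreplay");
	return -EINVAL;
}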
[tip:locking/urgent] BUILD SUCCESS 38c93587375053c5b9ef093f4a5ea754538cba32
allmodconfig m68k defconfig nios2 defconfig arc allyesconfig nds32 defconfig nios2 allyesconfig csky defconfig alpha defconfig alpha allyesconfig xtensa allyesconfig arc defconfig h8300 allyesconfig s390 allyesconfig parisc allyesconfig s390 defconfig sparc allyesconfig sparc defconfig i386 tinyconfig i386 defconfig mips allyesconfig mips allmodconfig powerpc allyesconfig powerpc allmodconfig powerpc allnoconfig i386 randconfig-a001-20210318 i386 randconfig-a005-20210318 i386 randconfig-a003-20210318 i386 randconfig-a002-20210318 i386 randconfig-a006-20210318 i386 randconfig-a004-20210318 i386 randconfig-a001-20210319 i386 randconfig-a005-20210319 i386 randconfig-a003-20210319 i386 randconfig-a002-20210319 i386 randconfig-a006-20210319 i386 randconfig-a004-20210319 x86_64 randconfig-a011-20210318 x86_64 randconfig-a016-20210318 x86_64 randconfig-a013-20210318 x86_64 randconfig-a015-20210318 x86_64 randconfig-a014-20210318 x86_64 randconfig-a012-20210318 i386 randconfig-a013-20210318 i386 randconfig-a016-20210318 i386 randconfig-a011-20210318 i386 randconfig-a014-20210318 i386 randconfig-a015-20210318 i386 randconfig-a012-20210318 riscv nommu_k210_defconfig riscv nommu_virt_defconfig riscv allnoconfig riscv defconfig x86_64 rhel-7.6-kselftests x86_64 defconfig x86_64 rhel-8.3 x86_64 rhel-8.3-kbuiltin x86_64 kexec clang tested configs: x86_64 randconfig-a006-20210318 x86_64 randconfig-a001-20210318 x86_64 randconfig-a005-20210318 x86_64 randconfig-a002-20210318 x86_64 randconfig-a003-20210318 x86_64 randconfig-a004-20210318 --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org
[PATCH] HID: do not use down_interruptible() when unbinding devices
The action of unbinding a driver from a device is not cancellable and should not fail, and the driver core does not pay attention to the result of the "remove" method; therefore using down_interruptible() in hid_device_remove() does not make sense. Signed-off-by: Dmitry Torokhov --- drivers/hid/hid-core.c | 10 +++--- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index 56172fe6995c..ec63a9ff40dc 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -2300,12 +2300,8 @@ static int hid_device_remove(struct device *dev) { struct hid_device *hdev = to_hid_device(dev); struct hid_driver *hdrv; - int ret = 0; - if (down_interruptible(&hdev->driver_input_lock)) { - ret = -EINTR; - goto end; - } + down(&hdev->driver_input_lock); hdev->io_started = false; hdrv = hdev->driver; @@ -2320,8 +2316,8 @@ static int hid_device_remove(struct device *dev) if (!hdev->io_started) up(&hdev->driver_input_lock); -end: - return ret; + + return 0; } static ssize_t modalias_show(struct device *dev, struct device_attribute *a, -- 2.31.0.rc2.261.g7f71774620-goog -- Dmitry