[PATCH v2 12/16] mm: multigenerational lru: eviction

2021-04-12 Thread Yu Zhao
The eviction consumes old generations. Given an lruvec, the eviction
scans the pages on the per-zone lists indexed by either of min_seq[2].
It first tries to select a type based on the values of min_seq[2].
When anon and file types are both available from the same generation,
it selects the one that has a lower refault rate.

During a scan, the eviction sorts pages according to their generation
numbers, if the aging has found them referenced. It also moves pages
from the tiers that have higher refault rates than tier 0 to the next
generation. When it finds all the per-zone lists of a selected type
are empty, the eviction increments min_seq[2] indexed by this selected
type.

Signed-off-by: Yu Zhao 
---
 mm/vmscan.c | 341 
 1 file changed, 341 insertions(+)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 31e1b4155677..6239b1acd84f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -5468,6 +5468,347 @@ static bool walk_mm_list(struct lruvec *lruvec, 
unsigned long max_seq,
return true;
 }
 
+/**
+ *  the eviction
+ 
**/
+
+static bool sort_page(struct page *page, struct lruvec *lruvec, int 
tier_to_isolate)
+{
+   bool success;
+   int gen = page_lru_gen(page);
+   int file = page_is_file_lru(page);
+   int zone = page_zonenum(page);
+   int tier = lru_tier_from_usage(page_tier_usage(page));
+   struct lrugen *lrugen = &lruvec->evictable;
+
+   VM_BUG_ON_PAGE(gen == -1, page);
+   VM_BUG_ON_PAGE(tier_to_isolate < 0, page);
+
+   /* a lazy-free page that has been written into? */
+   if (file && PageDirty(page) && PageAnon(page)) {
+   success = lru_gen_deletion(page, lruvec);
+   VM_BUG_ON_PAGE(!success, page);
+   SetPageSwapBacked(page);
+   add_page_to_lru_list_tail(page, lruvec);
+   return true;
+   }
+
+   /* page_update_gen() has updated the page? */
+   if (gen != lru_gen_from_seq(lrugen->min_seq[file])) {
+   list_move(&page->lru, &lrugen->lists[gen][file][zone]);
+   return true;
+   }
+
+   /* activate the page if its tier has a higher refault rate */
+   if (tier_to_isolate < tier) {
+   int sid = sid_from_seq_or_gen(gen);
+
+   page_inc_gen(page, lruvec, false);
+   WRITE_ONCE(lrugen->activated[sid][file][tier - 1],
+  lrugen->activated[sid][file][tier - 1] + 
thp_nr_pages(page));
+   inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file);
+   return true;
+   }
+
+   /*
+* A page can't be immediately evicted, and page_inc_gen() will mark it
+* for reclaim and hopefully writeback will write it soon if it's dirty.
+*/
+   if (PageLocked(page) || PageWriteback(page) || (file && 
PageDirty(page))) {
+   page_inc_gen(page, lruvec, true);
+   return true;
+   }
+
+   return false;
+}
+
+static bool should_skip_page(struct page *page, struct scan_control *sc)
+{
+   if (!sc->may_unmap && page_mapped(page))
+   return true;
+
+   if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
+   (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page))))
+   return true;
+
+   if (!get_page_unless_zero(page))
+   return true;
+
+   if (!TestClearPageLRU(page)) {
+   put_page(page);
+   return true;
+   }
+
+   return false;
+}
+
+static void isolate_page(struct page *page, struct lruvec *lruvec)
+{
+   bool success;
+
+   success = lru_gen_deletion(page, lruvec);
+   VM_BUG_ON_PAGE(!success, page);
+
+   if (PageActive(page)) {
+   ClearPageActive(page);
+   /* make sure shrink_page_list() rejects this page */
+   SetPageReferenced(page);
+   return;
+   }
+
+   /* make sure shrink_page_list() doesn't try to write this page */
+   ClearPageReclaim(page);
+   /* make sure shrink_page_list() doesn't reject this page */
+   ClearPageReferenced(page);
+}
+
+static int scan_lru_gen_pages(struct lruvec *lruvec, struct scan_control *sc,
+ long *nr_to_scan, int file, int tier,
+ struct list_head *list)
+{
+   bool success;
+   int gen, zone;
+   enum vm_event_item item;
+   int sorted = 0;
+   int scanned = 0;
+   int isolated = 0;
+   int batch_size = 0;
+   struct lrugen *lrugen = &lruvec->evictable;
+
+   VM_BUG_ON(!list_empty(list));
+
+   if (get_nr_gens(lruvec, file) == MIN_NR_GENS)
+   return -ENOENT;
+
+   gen = lru_gen_from_seq(lrugen->min_seq[file]);
+
+   for (zone = sc->reclaim_idx; zone >= 0; 

[PATCH v2 10/16] mm: multigenerational lru: mm_struct list

2021-04-12 Thread Yu Zhao
In order to scan page tables, we add an infrastructure to maintain
either a system-wide mm_struct list or per-memcg mm_struct lists.
Multiple threads can concurrently work on the same mm_struct list, and
each of them will be given a different mm_struct.

This infrastructure also tracks whether an mm_struct is being used on
any CPU or has been used since the last time a worker looked at it.
In other words, workers will not be given an mm_struct that belongs to
a process that has been sleeping.

Signed-off-by: Yu Zhao 
---
 fs/exec.c  |   2 +
 include/linux/memcontrol.h |   6 +
 include/linux/mm_types.h   | 117 ++
 include/linux/mmzone.h |   2 -
 kernel/exit.c  |   1 +
 kernel/fork.c  |  10 ++
 kernel/kthread.c   |   1 +
 kernel/sched/core.c|   2 +
 mm/memcontrol.c|  28 
 mm/vmscan.c| 316 +
 10 files changed, 483 insertions(+), 2 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 18594f11c31f..c691d4d7720c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1008,6 +1008,7 @@ static int exec_mmap(struct mm_struct *mm)
active_mm = tsk->active_mm;
tsk->active_mm = mm;
tsk->mm = mm;
+   lru_gen_add_mm(mm);
/*
 * This prevents preemption while active_mm is being loaded and
 * it and mm are being updated, which could cause problems for
@@ -1018,6 +1019,7 @@ static int exec_mmap(struct mm_struct *mm)
if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
local_irq_enable();
activate_mm(active_mm, mm);
+   lru_gen_switch_mm(active_mm, mm);
if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
local_irq_enable();
tsk->mm->vmacache_seqnum = 0;
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f13dc02cf277..cff95ed1ee2b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -212,6 +212,8 @@ struct obj_cgroup {
};
 };
 
+struct lru_gen_mm_list;
+
 /*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
@@ -335,6 +337,10 @@ struct mem_cgroup {
struct deferred_split deferred_split_queue;
 #endif
 
+#ifdef CONFIG_LRU_GEN
+   struct lru_gen_mm_list *mm_list;
+#endif
+
struct mem_cgroup_per_node *nodeinfo[0];
/* WARNING: nodeinfo must be the last member here */
 };
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6613b26a8894..f8a239fbb958 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -15,6 +15,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 
@@ -383,6 +385,8 @@ struct core_state {
struct completion startup;
 };
 
+#define ANON_AND_FILE 2
+
 struct kioctx_table;
 struct mm_struct {
struct {
@@ -561,6 +565,22 @@ struct mm_struct {
 
 #ifdef CONFIG_IOMMU_SUPPORT
u32 pasid;
+#endif
+#ifdef CONFIG_LRU_GEN
+   struct {
+   /* the node of a global or per-memcg mm_struct list */
+   struct list_head list;
+#ifdef CONFIG_MEMCG
+   /* points to memcg of the owner task above */
+   struct mem_cgroup *memcg;
+#endif
+   /* whether this mm_struct has been used since the last 
walk */
+   nodemask_t nodes[ANON_AND_FILE];
+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+   /* the number of CPUs using this mm_struct */
+   atomic_t nr_cpus;
+#endif
+   } lrugen;
 #endif
} __randomize_layout;
 
@@ -588,6 +608,103 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
return (struct cpumask *)&mm->cpu_bitmap;
 }
 
+#ifdef CONFIG_LRU_GEN
+
+void lru_gen_init_mm(struct mm_struct *mm);
+void lru_gen_add_mm(struct mm_struct *mm);
+void lru_gen_del_mm(struct mm_struct *mm);
+#ifdef CONFIG_MEMCG
+int lru_gen_alloc_mm_list(struct mem_cgroup *memcg);
+void lru_gen_free_mm_list(struct mem_cgroup *memcg);
+void lru_gen_migrate_mm(struct mm_struct *mm);
+#endif
+
+/*
+ * Track the usage so mm_struct's that haven't been used since the last walk 
can
+ * be skipped. This function adds a theoretical overhead to each context 
switch,
+ * which hasn't been measurable.
+ */
+static inline void lru_gen_switch_mm(struct mm_struct *old, struct mm_struct 
*new)
+{
+   int file;
+
+   /* exclude init_mm, efi_mm, etc. */
+   if (!core_kernel_data((unsigned long)old)) {
+   VM_BUG_ON(old == &init_mm);
+
+   for (file = 0; file < ANON_AND_FILE; file++)
+   nodes_setall(old->lrugen.nodes[file]);
+
+#ifndef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+   atomic_dec(&old->lrugen.nr_cpus);
+   VM_BUG_ON_MM(atomic_read(&old->lrugen.nr_cpus) < 0, old);
+#endif

[PATCH v2 11/16] mm: multigenerational lru: aging

2021-04-12 Thread Yu Zhao
The aging produces young generations. Given an lruvec, the aging walks
the mm_struct list associated with this lruvec to scan page tables for
referenced pages. Upon finding one, the aging updates the generation
number of this page to max_seq. After each round of scan, the aging
increments max_seq. The aging is due when both min_seq[2] values reach
max_seq-1, assuming both anon and file types are reclaimable.

The aging uses the following optimizations when scanning page tables:
  1) It will not scan page tables from processes that have been
  sleeping since the last scan.
  2) It will not scan PTE tables under non-leaf PMD entries that do
  not have the accessed bit set, when
  CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG=y.
  3) It will not zigzag between the PGD table and the same PMD or PTE
  table spanning multiple VMAs. In other words, it finishes all the
  VMAs within the range of the same PMD or PTE table before it returns
  to the PGD table. This optimizes workloads that have large numbers
  of tiny VMAs, especially when CONFIG_PGTABLE_LEVELS=5.

Signed-off-by: Yu Zhao 
---
 mm/vmscan.c | 700 
 1 file changed, 700 insertions(+)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index d67dfd1e3930..31e1b4155677 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -50,6 +50,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -4771,6 +4772,702 @@ static bool get_next_mm(struct mm_walk_args *args, int 
swappiness, struct mm_str
return last;
 }
 
+/**
+ *  the aging
+ 
**/
+
+static void update_batch_size(struct page *page, int old_gen, int new_gen,
+ struct mm_walk_args *args)
+{
+   int file = page_is_file_lru(page);
+   int zone = page_zonenum(page);
+   int delta = thp_nr_pages(page);
+
+   VM_BUG_ON(old_gen >= MAX_NR_GENS);
+   VM_BUG_ON(new_gen >= MAX_NR_GENS);
+
+   args->batch_size++;
+
+   args->nr_pages[old_gen][file][zone] -= delta;
+   args->nr_pages[new_gen][file][zone] += delta;
+}
+
+static void reset_batch_size(struct lruvec *lruvec, struct mm_walk_args *args)
+{
+   int gen, file, zone;
+   struct lrugen *lrugen = &lruvec->evictable;
+
+   args->batch_size = 0;
+
+   spin_lock_irq(&lruvec->lru_lock);
+
+   for_each_gen_type_zone(gen, file, zone) {
+   enum lru_list lru = LRU_FILE * file;
+   int total = args->nr_pages[gen][file][zone];
+
+   if (!total)
+   continue;
+
+   args->nr_pages[gen][file][zone] = 0;
+   WRITE_ONCE(lrugen->sizes[gen][file][zone],
+  lrugen->sizes[gen][file][zone] + total);
+
+   if (lru_gen_is_active(lruvec, gen))
+   lru += LRU_ACTIVE;
+   update_lru_size(lruvec, lru, zone, total);
+   }
+
+   spin_unlock_irq(&lruvec->lru_lock);
+}
+
+static int page_update_gen(struct page *page, int new_gen)
+{
+   int old_gen;
+   unsigned long old_flags, new_flags;
+
+   VM_BUG_ON(new_gen >= MAX_NR_GENS);
+
+   do {
+   old_flags = READ_ONCE(page->flags);
+
+   old_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+   if (old_gen < 0)
+   new_flags = old_flags | BIT(PG_referenced);
+   else
+   new_flags = (old_flags & ~(LRU_GEN_MASK | 
LRU_USAGE_MASK |
+LRU_TIER_FLAGS)) | ((new_gen + 1UL) << 
LRU_GEN_PGOFF);
+
+   if (old_flags == new_flags)
+   break;
+   } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+
+   return old_gen;
+}
+
+static int should_skip_vma(unsigned long start, unsigned long end, struct 
mm_walk *walk)
+{
+   struct vm_area_struct *vma = walk->vma;
+   struct mm_walk_args *args = walk->private;
+
+   if (!vma_is_accessible(vma) || is_vm_hugetlb_page(vma) ||
+   (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)))
+   return true;
+
+   if (vma_is_anonymous(vma))
+   return !args->should_walk[0];
+
+   if (vma_is_shmem(vma))
+   return !args->should_walk[0] ||
+  mapping_unevictable(vma->vm_file->f_mapping);
+
+   return !args->should_walk[1] || vma_is_dax(vma) ||
+  vma == get_gate_vma(vma->vm_mm) ||
+  mapping_unevictable(vma->vm_file->f_mapping);
+}
+
+/*
+ * Some userspace memory allocators create many single-page VMAs. So instead of
+ * returning back to the PGD table for each of such VMAs, we finish at least an
+ * entire PMD table and therefore avoid many zigzags. This optimizes page table
+ * walks for workloads that have large numbers of tiny VMAs.
+ *
+ * We scan PMD tables in two pass. Th

[PATCH v2 08/16] mm: multigenerational lru: groundwork

2021-04-12 Thread Yu Zhao
For each lruvec, evictable pages are divided into multiple
generations. The youngest generation number is stored in max_seq for
both anon and file types as they are aged on an equal footing. The
oldest generation numbers are stored in min_seq[2] separately for anon
and file types as clean file pages can be evicted regardless of
may_swap or may_writepage. Generation numbers are truncated into
order_base_2(MAX_NR_GENS+1) bits in order to fit into page->flags. The
sliding window technique is used to prevent truncated generation
numbers from overlapping. Each truncated generation number is an index
to lruvec->evictable.lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES].
Evictable pages are added to the per-zone lists indexed by max_seq or
min_seq[2] (modulo MAX_NR_GENS), depending on whether they are being
faulted in.

The workflow comprises two conceptually independent functions: the
aging and the eviction. The aging produces young generations. Given an
lruvec, the aging scans page tables for referenced pages of this
lruvec. Upon finding one, the aging updates its generation number to
max_seq. After each round of scan, the aging increments max_seq. The
aging is due when both min_seq[2] values reach max_seq-1, assuming both
anon and file types are reclaimable.

The eviction consumes old generations. Given an lruvec, the eviction
scans the pages on the per-zone lists indexed by either of min_seq[2].
It tries to select a type based on the values of min_seq[2] and
swappiness. During a scan, the eviction sorts pages according to their
generation numbers, if the aging has found them referenced. When it
finds all the per-zone lists of a selected type are empty, the
eviction increments min_seq[2] indexed by this selected type.

Signed-off-by: Yu Zhao 
---
 fs/fuse/dev.c |   3 +-
 include/linux/mm.h|   2 +
 include/linux/mm_inline.h | 193 +++
 include/linux/mmzone.h| 110 +++
 include/linux/page-flags-layout.h |  20 +-
 include/linux/page-flags.h|   4 +-
 kernel/bounds.c   |   6 +
 mm/huge_memory.c  |   3 +-
 mm/mm_init.c  |  16 +-
 mm/mmzone.c   |   2 +
 mm/swapfile.c |   4 +
 mm/vmscan.c   | 305 ++
 12 files changed, 656 insertions(+), 12 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index c0fee830a34e..27c83f557794 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -784,7 +784,8 @@ static int fuse_check_page(struct page *page)
   1 << PG_lru |
   1 << PG_active |
   1 << PG_reclaim |
-  1 << PG_waiters))) {
+  1 << PG_waiters |
+  LRU_GEN_MASK | LRU_USAGE_MASK))) {
dump_page(page, "fuse: trying to steal weird page");
return 1;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8ba434287387..2c8a2db78ce9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1070,6 +1070,8 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf);
 #define ZONES_PGOFF(NODES_PGOFF - ZONES_WIDTH)
 #define LAST_CPUPID_PGOFF  (ZONES_PGOFF - LAST_CPUPID_WIDTH)
 #define KASAN_TAG_PGOFF(LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
+#define LRU_GEN_PGOFF  (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
+#define LRU_USAGE_PGOFF(LRU_GEN_PGOFF - LRU_USAGE_WIDTH)
 
 /*
  * Define the bit shifts to access each section.  For non-existent
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 355ea1ee32bd..2bf910eb3dd7 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -79,11 +79,198 @@ static __always_inline enum lru_list page_lru(struct page 
*page)
return lru;
 }
 
+#ifdef CONFIG_LRU_GEN
+
+#ifdef CONFIG_LRU_GEN_ENABLED
+DECLARE_STATIC_KEY_TRUE(lru_gen_static_key);
+#define lru_gen_enabled() static_branch_likely(&lru_gen_static_key)
+#else
+DECLARE_STATIC_KEY_FALSE(lru_gen_static_key);
+#define lru_gen_enabled() static_branch_unlikely(&lru_gen_static_key)
+#endif
+
+/* We track at most MAX_NR_GENS generations using the sliding window 
technique. */
+static inline int lru_gen_from_seq(unsigned long seq)
+{
+   return seq % MAX_NR_GENS;
+}
+
+/* Return a proper index regardless whether we keep a full history of stats. */
+static inline int sid_from_seq_or_gen(int seq_or_gen)
+{
+   return seq_or_gen % NR_STAT_GENS;
+}
+
+/* The youngest and the second youngest generations are considered active. */
+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
+{
+   unsigned long max_seq = READ_ONCE(lruvec->evictable.max_seq);
+
+   VM_BUG_ON(!max_seq);
+   VM_BUG_ON(gen >= MAX_NR_GENS);
+
+   return gen == lru_gen_from_seq(max_seq) || gen == 
lru_gen_from_seq(max_seq - 1);
+}
+
+/* Update the sizes of the multigenerational lru. */
+static inline void lru_gen_update_size(struct page

[PATCH v2 09/16] mm: multigenerational lru: activation

2021-04-12 Thread Yu Zhao
For pages accessed multiple times via file descriptors, instead of
activating them upon the second accesses, we activate them based on
the refault rates of their tiers. Pages accessed N times via file
descriptors belong to tier order_base_2(N). Pages from tier 0, i.e.,
those read ahead, accessed once via file descriptors, or accessed only
via page tables, are evicted regardless of the refault rate. Pages
from other tiers will be moved to the next generation, i.e.,
activated, if the refault rates of their tiers are higher than that of
tier 0. Each generation contains at most MAX_NR_TIERS tiers, and they
require additional MAX_NR_TIERS-2 bits in page->flags. This feedback
model has a few advantages over the current feedforward model:
  1) It has a negligible overhead in the access path because
  activations are done in the reclaim path.
  2) It takes mapped pages into account and avoids overprotecting
  pages accessed multiple times via file descriptors.
  3) More tiers offer better protection to pages accessed more than
  twice when buffered-I/O-intensive workloads are under memory
  pressure.

For pages mapped upon page faults, the accessed bit is set and they
must be properly aged. We add them to the per-zone lists indexed by
max_seq, i.e., the youngest generation. For pages not in page cache
or swap cache, this can be done easily in the page fault path: we
rename lru_cache_add_inactive_or_unevictable() to
lru_cache_add_page_vma() and add a new parameter, which is set to true
for pages mapped upon page faults. For pages in page cache or swap
cache, we cannot differentiate the page fault path from the read ahead
path at the time we call lru_cache_add() in add_to_page_cache_lru()
and __read_swap_cache_async(). So we add a new function
lru_gen_activation(), which is essentially activate_page(), to move
pages to the per-zone lists indexed by max_seq at a later time.
Hopefully we would find those pages in lru_pvecs.lru_add and simply
set PageActive() on them without having to actually move them.

Finally, we need to be compatible with the existing notion of active
and inactive. We cannot use PageActive() because it is not set on
active pages unless they are isolated, in order to spare the aging the
trouble of clearing it when an active generation becomes inactive. A
new function page_is_active() compares the generation number of a page
with max_seq and max_seq-1 (modulo MAX_NR_GENS), which are considered
active and protected from the eviction. Other generations, which may
or may not exist, are considered inactive.

Signed-off-by: Yu Zhao 
---
 fs/proc/task_mmu.c|   3 +-
 include/linux/mm_inline.h | 101 +
 include/linux/swap.h  |   4 +-
 kernel/events/uprobes.c   |   2 +-
 mm/huge_memory.c  |   2 +-
 mm/khugepaged.c   |   2 +-
 mm/memory.c   |  14 +--
 mm/migrate.c  |   2 +-
 mm/swap.c |  26 +++---
 mm/swapfile.c |   2 +-
 mm/userfaultfd.c  |   2 +-
 mm/vmscan.c   |  91 ++-
 mm/workingset.c   | 179 +++---
 13 files changed, 371 insertions(+), 59 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e862cab69583..d292f20c4e3d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -1718,7 +1719,7 @@ static void gather_stats(struct page *page, struct 
numa_maps *md, int pte_dirty,
if (PageSwapCache(page))
md->swapcache += nr_pages;
 
-   if (PageActive(page) || PageUnevictable(page))
+   if (PageUnevictable(page) || page_is_active(compound_head(page), NULL))
md->active += nr_pages;
 
if (PageWriteback(page))
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 2bf910eb3dd7..5eb4b12972ec 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -95,6 +95,12 @@ static inline int lru_gen_from_seq(unsigned long seq)
return seq % MAX_NR_GENS;
 }
 
+/* Convert the level of usage to a tier. See the comment on MAX_NR_TIERS. */
+static inline int lru_tier_from_usage(int usage)
+{
+   return order_base_2(usage + 1);
+}
+
 /* Return a proper index regardless whether we keep a full history of stats. */
 static inline int sid_from_seq_or_gen(int seq_or_gen)
 {
@@ -238,12 +244,93 @@ static inline bool lru_gen_deletion(struct page *page, 
struct lruvec *lruvec)
return true;
 }
 
+/* Activate a page from page cache or swap cache after it's mapped. */
+static inline void lru_gen_activation(struct page *page, struct vm_area_struct 
*vma)
+{
+   if (!lru_gen_enabled())
+   return;
+
+   if (PageActive(page) || PageUnevictable(page) || vma_is_dax(vma) ||
+   (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)))
+   return;
+   /*
+* TODO: pass vm_fault to add_to_page_cache_lru() and
+* __read_swa

[PATCH v2 01/16] include/linux/memcontrol.h: do not warn in page_memcg_rcu() if !CONFIG_MEMCG

2021-04-12 Thread Yu Zhao
page_memcg_rcu() warns on !rcu_read_lock_held() regardless of
CONFIG_MEMCG. The following code is legit, but it triggers the warning
when !CONFIG_MEMCG, since lock_page_memcg() and unlock_page_memcg()
are empty for this config.

  memcg = lock_page_memcg(page1)
(rcu_read_lock() if CONFIG_MEMCG=y)

  do something to page1

  if (page_memcg_rcu(page2) == memcg)
do something to page2 too as it cannot be migrated away from the
memcg either.

  unlock_page_memcg(page1)
(rcu_read_unlock() if CONFIG_MEMCG=y)

Locking/unlocking rcu consistently for both configs is rigorous but it
also forces unnecessary locking upon users who have no interest in
CONFIG_MEMCG.

This patch removes the assertion for !CONFIG_MEMCG, because
page_memcg_rcu() has only a few callers and there are no concerns regarding
their correctness at the moment.

Signed-off-by: Yu Zhao 
---
 include/linux/memcontrol.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0c04d39a7967..f13dc02cf277 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1077,7 +1077,6 @@ static inline struct mem_cgroup *page_memcg(struct page 
*page)
 
 static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
 {
-   WARN_ON_ONCE(!rcu_read_lock_held());
return NULL;
 }
 
-- 
2.31.1.295.g9ea45b61b8-goog



[PATCH v2 04/16] include/linux/cgroup.h: export cgroup_mutex

2021-04-12 Thread Yu Zhao
cgroup_mutex is needed to synchronize with memcg creations.

Signed-off-by: Yu Zhao 
---
 include/linux/cgroup.h | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 4f2f79de083e..bd5744360cfa 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgroup *cgrp)
css_put(&cgrp->self);
 }
 
+extern struct mutex cgroup_mutex;
+
+static inline void cgroup_lock(void)
+{
+   mutex_lock(&cgroup_mutex);
+}
+
+static inline void cgroup_unlock(void)
+{
+   mutex_unlock(&cgroup_mutex);
+}
+
 /**
  * task_css_set_check - obtain a task's css_set with extra access conditions
  * @task: the task to obtain css_set for
@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgroup *cgrp)
  * as locks used during the cgroup_subsys::attach() methods.
  */
 #ifdef CONFIG_PROVE_RCU
-extern struct mutex cgroup_mutex;
 extern spinlock_t css_set_lock;
 #define task_css_set_check(task, __c)  \
rcu_dereference_check((task)->cgroups,  \
@@ -704,6 +715,8 @@ struct cgroup;
 static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
 static inline void css_get(struct cgroup_subsys_state *css) {}
 static inline void css_put(struct cgroup_subsys_state *css) {}
+static inline void cgroup_lock(void) {}
+static inline void cgroup_unlock(void) {}
 static inline int cgroup_attach_task_all(struct task_struct *from,
 struct task_struct *t) { return 0; }
 static inline int cgroupstats_build(struct cgroupstats *stats,
-- 
2.31.1.295.g9ea45b61b8-goog



[PATCH v2 06/16] mm, x86: support the access bit on non-leaf PMD entries

2021-04-12 Thread Yu Zhao
Some architectures support the accessed bit on non-leaf PMD entries
(parents) in addition to leaf PTE entries (children) where pages are
mapped, e.g., x86_64 sets the accessed bit on a parent when using it
as part of linear-address translation [1]. Page table walkers who are
interested in the accessed bit on children can take advantage of this:
they do not need to search the children when the accessed bit is not
set on a parent, given that they have previously cleared the accessed
bit on this parent.

[1]: Intel 64 and IA-32 Architectures Software Developer's Manual
 Volume 3 (October 2019), section 4.8

Signed-off-by: Yu Zhao 
---
 arch/Kconfig   | 9 +
 arch/x86/Kconfig   | 1 +
 arch/x86/include/asm/pgtable.h | 2 +-
 arch/x86/mm/pgtable.c  | 5 -
 include/linux/pgtable.h| 4 ++--
 5 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index ecfd3520b676..cbd7f66734ee 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -782,6 +782,15 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE
 config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
bool
 
+config HAVE_ARCH_PARENT_PMD_YOUNG
+   bool
+   depends on PGTABLE_LEVELS > 2
+   help
+ Architectures that select this are able to set the accessed bit on
+ non-leaf PMD entries in addition to leaf PTE entries where pages are
+ mapped. For them, page table walkers that clear the accessed bit may
+ stop at non-leaf PMD entries when they do not see the accessed bit.
+
 config HAVE_ARCH_HUGE_VMAP
bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2792879d398e..b5972eb82337 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -163,6 +163,7 @@ config X86
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
+   select HAVE_ARCH_PARENT_PMD_YOUNG   if X86_64
select HAVE_ARCH_USERFAULTFD_WP if X86_64 && USERFAULTFD
select HAVE_ARCH_VMAP_STACK if X86_64
select HAVE_ARCH_WITHIN_STACK_FRAMES
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a02c67291cfc..a6b5cfe1fc5a 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -846,7 +846,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
 
 static inline int pmd_bad(pmd_t pmd)
 {
-   return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
+   return ((pmd_flags(pmd) | _PAGE_ACCESSED) & ~_PAGE_USER) != 
_KERNPG_TABLE;
 }
 
 static inline unsigned long pages_to_mb(unsigned long npg)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index f6a9e2e36642..1c27e6f43f80 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
return ret;
 }
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || 
defined(CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG)
 int pmdp_test_and_clear_young(struct vm_area_struct *vma,
  unsigned long addr, pmd_t *pmdp)
 {
@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 
return ret;
 }
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 int pudp_test_and_clear_young(struct vm_area_struct *vma,
  unsigned long addr, pud_t *pudp)
 {
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 5e772392a379..08dd9b8c055a 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -193,7 +193,7 @@ static inline int ptep_test_and_clear_young(struct 
vm_area_struct *vma,
 #endif
 
 #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || 
defined(CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG)
 static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address,
pmd_t *pmdp)
@@ -214,7 +214,7 @@ static inline int pmdp_test_and_clear_young(struct 
vm_area_struct *vma,
BUILD_BUG();
return 0;
 }
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG */
 #endif
 
 #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-- 
2.31.1.295.g9ea45b61b8-goog



[PATCH v2 05/16] mm/swap.c: export activate_page()

2021-04-12 Thread Yu Zhao
activate_page() is needed to activate pages that are already on lru or
queued in lru_pvecs.lru_add. The exported function is a merger between
the existing activate_page() and __lru_cache_activate_page().

Signed-off-by: Yu Zhao 
---
 include/linux/swap.h |  1 +
 mm/swap.c| 28 +++-
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4cc6ec3bf0ab..de2bbbf181ba 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -344,6 +344,7 @@ extern void lru_add_drain_cpu(int cpu);
 extern void lru_add_drain_cpu_zone(struct zone *zone);
 extern void lru_add_drain_all(void);
 extern void rotate_reclaimable_page(struct page *page);
+extern void activate_page(struct page *page);
 extern void deactivate_file_page(struct page *page);
 extern void deactivate_page(struct page *page);
 extern void mark_page_lazyfree(struct page *page);
diff --git a/mm/swap.c b/mm/swap.c
index 31b844d4ed94..f20ed56ebbbf 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -334,7 +334,7 @@ static bool need_activate_page_drain(int cpu)
return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
 }
 
-static void activate_page(struct page *page)
+static void activate_page_on_lru(struct page *page)
 {
page = compound_head(page);
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
@@ -354,7 +354,7 @@ static inline void activate_page_drain(int cpu)
 {
 }
 
-static void activate_page(struct page *page)
+static void activate_page_on_lru(struct page *page)
 {
struct lruvec *lruvec;
 
@@ -368,11 +368,22 @@ static void activate_page(struct page *page)
 }
 #endif
 
-static void __lru_cache_activate_page(struct page *page)
+/*
+ * If the page is on the LRU, queue it for activation via
+ * lru_pvecs.activate_page. Otherwise, assume the page is on a
+ * pagevec, mark it active and it'll be moved to the active
+ * LRU on the next drain.
+ */
+void activate_page(struct page *page)
 {
struct pagevec *pvec;
int i;
 
+   if (PageLRU(page)) {
+   activate_page_on_lru(page);
+   return;
+   }
+
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_add);
 
@@ -421,16 +432,7 @@ void mark_page_accessed(struct page *page)
 * evictable page accessed has no effect.
 */
} else if (!PageActive(page)) {
-   /*
-* If the page is on the LRU, queue it for activation via
-* lru_pvecs.activate_page. Otherwise, assume the page is on a
-* pagevec, mark it active and it'll be moved to the active
-* LRU on the next drain.
-*/
-   if (PageLRU(page))
-   activate_page(page);
-   else
-   __lru_cache_activate_page(page);
+   activate_page(page);
ClearPageReferenced(page);
workingset_activation(page);
}
-- 
2.31.1.295.g9ea45b61b8-goog



[PATCH v2 03/16] include/linux/huge_mm.h: define is_huge_zero_pmd() if !CONFIG_TRANSPARENT_HUGEPAGE

2021-04-12 Thread Yu Zhao
Currently is_huge_zero_pmd() only exists when
CONFIG_TRANSPARENT_HUGEPAGE=y. This patch adds the function for
!CONFIG_TRANSPARENT_HUGEPAGE.

Signed-off-by: Yu Zhao 
---
 include/linux/huge_mm.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index ba973efcd369..0ba7b3f9029c 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -443,6 +443,11 @@ static inline bool is_huge_zero_page(struct page *page)
return false;
 }
 
+static inline bool is_huge_zero_pmd(pmd_t pmd)
+{
+   return false;
+}
+
 static inline bool is_huge_zero_pud(pud_t pud)
 {
return false;
-- 
2.31.1.295.g9ea45b61b8-goog



[PATCH v2 02/16] include/linux/nodemask.h: define next_memory_node() if !CONFIG_NUMA

2021-04-12 Thread Yu Zhao
Currently next_memory_node only exists when CONFIG_NUMA=y. This patch
adds the macro for !CONFIG_NUMA.

Signed-off-by: Yu Zhao 
---
 include/linux/nodemask.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index ac398e143c9a..89fe4e3592f9 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -486,6 +486,7 @@ static inline int num_node_state(enum node_states state)
 #define first_online_node  0
 #define first_memory_node  0
 #define next_online_node(nid)  (MAX_NUMNODES)
+#define next_memory_node(nid)  (MAX_NUMNODES)
 #define nr_node_ids1U
 #define nr_online_nodes1U
 
-- 
2.31.1.295.g9ea45b61b8-goog



[PATCH v2 07/16] mm/vmscan.c: refactor shrink_node()

2021-04-12 Thread Yu Zhao
Heuristics that determine scan balance between anon and file LRUs are
rather independent. Move them into a separate function to improve
readability.

Signed-off-by: Yu Zhao 
---
 mm/vmscan.c | 186 +++-
 1 file changed, 98 insertions(+), 88 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 562e87cbd7a1..1a24d2e0a4cb 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2224,6 +2224,103 @@ enum scan_balance {
SCAN_FILE,
 };
 
+static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
+{
+   unsigned long file;
+   struct lruvec *target_lruvec;
+
+   target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+
+   /*
+* Determine the scan balance between anon and file LRUs.
+*/
+   spin_lock_irq(&target_lruvec->lru_lock);
+   sc->anon_cost = target_lruvec->anon_cost;
+   sc->file_cost = target_lruvec->file_cost;
+   spin_unlock_irq(&target_lruvec->lru_lock);
+
+   /*
+* Target desirable inactive:active list ratios for the anon
+* and file LRU lists.
+*/
+   if (!sc->force_deactivate) {
+   unsigned long refaults;
+
+   refaults = lruvec_page_state(target_lruvec,
+   WORKINGSET_ACTIVATE_ANON);
+   if (refaults != target_lruvec->refaults[0] ||
+   inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
+   sc->may_deactivate |= DEACTIVATE_ANON;
+   else
+   sc->may_deactivate &= ~DEACTIVATE_ANON;
+
+   /*
+* When refaults are being observed, it means a new
+* workingset is being established. Deactivate to get
+* rid of any stale active pages quickly.
+*/
+   refaults = lruvec_page_state(target_lruvec,
+   WORKINGSET_ACTIVATE_FILE);
+   if (refaults != target_lruvec->refaults[1] ||
+   inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
+   sc->may_deactivate |= DEACTIVATE_FILE;
+   else
+   sc->may_deactivate &= ~DEACTIVATE_FILE;
+   } else
+   sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
+
+   /*
+* If we have plenty of inactive file pages that aren't
+* thrashing, try to reclaim those first before touching
+* anonymous pages.
+*/
+   file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
+   if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
+   sc->cache_trim_mode = 1;
+   else
+   sc->cache_trim_mode = 0;
+
+   /*
+* Prevent the reclaimer from falling into the cache trap: as
+* cache pages start out inactive, every cache fault will tip
+* the scan balance towards the file LRU.  And as the file LRU
+* shrinks, so does the window for rotation from references.
+* This means we have a runaway feedback loop where a tiny
+* thrashing file LRU becomes infinitely more attractive than
+* anon pages.  Try to detect this based on file LRU size.
+*/
+   if (!cgroup_reclaim(sc)) {
+   unsigned long total_high_wmark = 0;
+   unsigned long free, anon;
+   int z;
+
+   free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+   file = node_page_state(pgdat, NR_ACTIVE_FILE) +
+  node_page_state(pgdat, NR_INACTIVE_FILE);
+
+   for (z = 0; z < MAX_NR_ZONES; z++) {
+   struct zone *zone = &pgdat->node_zones[z];
+
+   if (!managed_zone(zone))
+   continue;
+
+   total_high_wmark += high_wmark_pages(zone);
+   }
+
+   /*
+* Consider anon: if that's low too, this isn't a
+* runaway file reclaim problem, but rather just
+* extreme pressure. Reclaim as per usual then.
+*/
+   anon = node_page_state(pgdat, NR_INACTIVE_ANON);
+
+   sc->file_is_tiny =
+   file + free <= total_high_wmark &&
+   !(sc->may_deactivate & DEACTIVATE_ANON) &&
+   anon >> sc->priority;
+   }
+}
+
 /*
  * Determine how aggressively the anon and file LRU lists should be
  * scanned.  The relative value of each set of LRU lists is determined
@@ -2669,7 +2766,6 @@ static void shrink_node(pg_data_t *pgdat, struct 
scan_control *sc)
unsigned long nr_reclaimed, nr_scanned;
struct lruvec *target_lruvec;
bool reclaimable = false;
-   unsigned long file;
 
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
 
@@ -2679,93 +2775,7 @@ static void shrink_node(pg_data_t *pgdat, struct 
scan_control 

[PATCH v2 00/16] Multigenerational LRU Framework

2021-04-12 Thread Yu Zhao
What's new in v2

Special thanks to Jens Axboe for reporting a regression in buffered
I/O and helping test the fix.

This version includes the support of tiers, which represent levels of
usage from file descriptors only. Pages accessed N times via file
descriptors belong to tier order_base_2(N). Each generation contains
at most MAX_NR_TIERS tiers, and they require additional MAX_NR_TIERS-2
bits in page->flags. In contrast to moving across generations which
requires the lru lock, moving across tiers only involves an atomic
operation on page->flags and therefore has a negligible cost. A
feedback loop modeled after the well-known PID controller monitors the
refault rates across all tiers and decides when to activate pages from
which tiers, on the reclaim path.

This feedback model has a few advantages over the current feedforward
model:
1) It has a negligible overhead in the buffered I/O access path
   because activations are done in the reclaim path.
2) It takes mapped pages into account and avoids overprotecting pages
   accessed multiple times via file descriptors.
3) More tiers offer better protection to pages accessed more than
   twice when buffered-I/O-intensive workloads are under memory
   pressure.

The fio/io_uring benchmark shows 14% improvement in IOPS when randomly
accessing Samsung PM981a in the buffered I/O mode.

Highlights from the discussions on v1
=====================================
Thanks to Ying Huang and Dave Hansen for the comments and suggestions
on page table scanning.

A simple worst-case scenario test did not find page table scanning
underperforms the rmap because of the following optimizations:
1) It will not scan page tables from processes that have been sleeping
   since the last scan.
2) It will not scan PTE tables under non-leaf PMD entries that do not
   have the accessed bit set, when
   CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG=y.
3) It will not zigzag between the PGD table and the same PMD or PTE
   table spanning multiple VMAs. In other words, it finishes all the
   VMAs with the range of the same PMD or PTE table before it returns
   to the PGD table. This optimizes workloads that have large numbers
   of tiny VMAs, especially when CONFIG_PGTABLE_LEVELS=5.

TLDR

The current page reclaim is too expensive in terms of CPU usage and
often makes poor choices about what to evict. We would like to offer
an alternative framework that is performant, versatile and
straightforward.

Repo

git fetch https://linux-mm.googlesource.com/page-reclaim refs/changes/73/1173/1

Gerrit https://linux-mm-review.googlesource.com/c/page-reclaim/+/1173

Background
==========
DRAM is a major factor in total cost of ownership, and improving
memory overcommit brings a high return on investment. Over the past
decade of research and experimentation in memory overcommit, we
observed a distinct trend across millions of servers and clients: the
size of page cache has been decreasing because of the growing
popularity of cloud storage. Nowadays anon pages account for more than
90% of our memory consumption and page cache contains mostly
executable pages.

Problems

Notion of active/inactive
-------------------------
For servers equipped with hundreds of gigabytes of memory, the
granularity of the active/inactive is too coarse to be useful for job
scheduling. False active/inactive rates are relatively high, and thus
the assumed savings may not materialize.

For phones and laptops, executable pages are frequently evicted
despite the fact that there are many less recently used anon pages.
Major faults on executable pages cause "janks" (slow UI renderings)
and negatively impact user experience.

For lruvecs from different memcgs or nodes, comparisons are impossible
due to the lack of a common frame of reference.

Incremental scans via rmap
--------------------------
Each incremental scan picks up where the last scan left off and
stops after it has found a handful of unreferenced pages. For
workloads using a large amount of anon memory, incremental scans lose
the advantage under sustained memory pressure due to high ratios of
the number of scanned pages to the number of reclaimed pages. In our
case, the average ratio of pgscan to pgsteal is above 7.

On top of that, the rmap has poor memory locality due to its complex
data structures. The combined effects typically result in a high
amount of CPU usage in the reclaim path. For example, with zram, a
typical kswapd profile on v5.11 looks like:
  31.03%  page_vma_mapped_walk
  25.59%  lzo1x_1_do_compress
   4.63%  do_raw_spin_lock
   3.89%  vma_interval_tree_iter_next
   3.33%  vma_interval_tree_subtree_search

And with real swap, it looks like:
  45.16%  page_vma_mapped_walk
   7.61%  do_raw_spin_lock
   5.69%  vma_interval_tree_iter_next
   4.91%  vma_interval_tree_subtree_search
   3.71%  page_referenced_one

Solutions
=========
Notion of generation numbers

The notion of generation numbers introduces a q

Re: [PATCH 4/7] mm: Introduce verify_page_range()

2021-04-12 Thread Peter Zijlstra
On Mon, Apr 12, 2021 at 01:05:09PM -0700, Kees Cook wrote:
> On Mon, Apr 12, 2021 at 10:00:16AM +0200, Peter Zijlstra wrote:
> > +struct vpr_data {
> > +   int (*fn)(pte_t pte, unsigned long addr, void *data);
> > +   void *data;
> > +};
> 
> Eeerg. This is likely to become an attack target itself. Stored function
> pointer with stored (3rd) argument.

You got some further reading on that? How exactly are those exploited?


Re: Re: [PATCH] phy: nxp-c45: add driver for tja1103

2021-04-12 Thread Christian Herber

Hi Andrew,

On 4/12/2021 6:52 PM, Andrew Lunn wrote:


So what you are saying is, you don't care if the IP is completely
different, it all goes in one driver. So let's put this driver into
nxp-tja11xx.c. And then we avoid all the naming issues.

  Andrew



As this seems to be a key question, let me try and shed some more light 
on this.
The original series of BASE-T1 PHYs includes TJA1100, TJA1101, and 
TJA1102. They are covered by the existing driver, which has the 
unfortunate naming TJA11xx. Unfortunate, because the use of wildcards is 
a bit too generous. E.g. the naming would also include a TJA1145, which 
is a high-speed CAN transceiver. The truth is, extrapolating wildcards 
in product names doesn't work as there is no guarantee of future 
product names.
The mentioned TJA1100/1/2 are *fairly* software-compatible, which is why 
it makes sense to have a shared driver. When it gets to TJA1103, there 
is no SW compatibility, which is why we decided to create a new driver.
We want to support all future Ethernet PHY devices with this codebase, 
and that is why the naming is that generic. The common denominator of 
the devices is that they are NXP products and use clause 45 addressing. 
When you say we don't care that the IP is different, that doesn't quite 
fit. Just because the MDI is different, the register map does not need 
to change much, so it will be easy to support future PHYs also when 
using different PHY technology.
Moving the code into TJA11xx is creating more issues, as it assumes that 
the devices which are managed by the driver are always TJA... devices 
which may not be true.


Christian


Re: [PATCH] mm: optimize memory allocation

2021-04-12 Thread Michal Hocko
On Mon 12-04-21 15:49:53, ultrac...@163.com wrote:
> From: Chen Xiaoguang 
> 
> Check memory cgroup limit before allocating real memory. This may
> improve performance especially in slow path when memory allocation
> exceeds cgroup limitation.

I would be really curious about any actual numbers because I have a really
hard time seeing scenarios where this would lead to an improvement.
Effectively only non-oom allocations would benefit theoretically (e.g.
atomic or GFP_NORETRY etc). All others will trigger the memcg oom killer
to help forward progress.

Besides that I really dislike kmem and LRU pages to be handled
differently so for that reason
Nacked-by: Michal Hocko 

If the optimization really can be proven then the patch would need
to be much more invasive.

> Signed-off-by: Chen Xiaoguang 
> Signed-off-by: Chen He 
> ---
>  include/linux/memcontrol.h | 30 ++
>  mm/memcontrol.c| 34 --
>  mm/page_alloc.c| 24 +---
>  3 files changed, 55 insertions(+), 33 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 0c04d39..59bb3ba 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -1583,8 +1583,9 @@ static inline void memcg_set_shrinker_bit(struct 
> mem_cgroup *memcg,
>  #endif
>  
>  #ifdef CONFIG_MEMCG_KMEM
> -int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
> -void __memcg_kmem_uncharge_page(struct page *page, int order);
> +int __memcg_kmem_charge_page(struct mem_cgroup *memcg, gfp_t gfp, int order);
> +void __memcg_kmem_uncharge_page(struct page *page, int order,
> + struct mem_cgroup *memcg);
>  
>  struct obj_cgroup *get_obj_cgroup_from_current(void);
>  
> @@ -1610,18 +1611,30 @@ static inline bool memcg_kmem_enabled(void)
>   return static_branch_likely(&memcg_kmem_enabled_key);
>  }
>  
> +extern struct mem_cgroup *get_mem_cgroup_from_current(void);
> +
>  static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
>int order)
>  {
> - if (memcg_kmem_enabled())
> - return __memcg_kmem_charge_page(page, gfp, order);
> - return 0;
> + struct mem_cgroup *memcg;
> + int ret = 0;
> +
> + memcg = get_mem_cgroup_from_current();
> + if (memcg && memcg_kmem_enabled() && !mem_cgroup_is_root(memcg)) {
> + ret = __memcg_kmem_charge_page(memcg, gfp, order);
> + if (!ret) {
> + page->memcg_data = (unsigned long)memcg | 
> MEMCG_DATA_KMEM;
> + return 0;
> + }
> + css_put(&memcg->css);
> + }
> + return ret;
>  }
>  
>  static inline void memcg_kmem_uncharge_page(struct page *page, int order)
>  {
>   if (memcg_kmem_enabled())
> - __memcg_kmem_uncharge_page(page, order);
> + __memcg_kmem_uncharge_page(page, order, NULL);
>  }
>  
>  /*
> @@ -1647,13 +1660,14 @@ static inline void memcg_kmem_uncharge_page(struct 
> page *page, int order)
>  {
>  }
>  
> -static inline int __memcg_kmem_charge_page(struct page *page, gfp_t gfp,
> +static inline int __memcg_kmem_charge_page(struct mem_cgroup *memcg, gfp_t 
> gfp,
>  int order)
>  {
>   return 0;
>  }
>  
> -static inline void __memcg_kmem_uncharge_page(struct page *page, int order)
> +static inline void __memcg_kmem_uncharge_page(struct page *page, int order,
> + struct mem_cgroup *memcg)
>  {
>  }
>  
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e064ac0d..8df57b7 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1085,7 +1085,7 @@ static __always_inline bool memcg_kmem_bypass(void)
>  /**
>   * If active memcg is set, do not fallback to current->mm->memcg.
>   */
> -static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
> +struct mem_cgroup *get_mem_cgroup_from_current(void)
>  {
>   if (memcg_kmem_bypass())
>   return NULL;
> @@ -3113,21 +3113,11 @@ static void __memcg_kmem_uncharge(struct mem_cgroup 
> *memcg, unsigned int nr_page
>   *
>   * Returns 0 on success, an error code on failure.
>   */
> -int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
> +int __memcg_kmem_charge_page(struct mem_cgroup *memcg, gfp_t gfp, int order)
>  {
> - struct mem_cgroup *memcg;
> - int ret = 0;
> + int ret;
>  
> - memcg = get_mem_cgroup_from_current();
> - if (memcg && !mem_cgroup_is_root(memcg)) {
> - ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
> - if (!ret) {
> - page->memcg_data = (unsigned long)memcg |
> - MEMCG_DATA_KMEM;
> - return 0;
> - }
> - css_put(&memcg->css);
> - }
> + ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
>   

Re: [PATCH] kunit: add unit test for filtering suites by names

2021-04-12 Thread kernel test robot
Hi Daniel,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on 1678e493d530e7977cce34e59a86bb86f3c5631e]

url:
https://github.com/0day-ci/linux/commits/Daniel-Latypov/kunit-add-unit-test-for-filtering-suites-by-names/20210413-080913
base:   1678e493d530e7977cce34e59a86bb86f3c5631e
config: microblaze-randconfig-r014-20210413 (attached as .config)
compiler: microblaze-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# 
https://github.com/0day-ci/linux/commit/756df216f1586cecdf02f278fbed232fb25fa3f7
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review 
Daniel-Latypov/kunit-add-unit-test-for-filtering-suites-by-names/20210413-080913
git checkout 756df216f1586cecdf02f278fbed232fb25fa3f7
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross 
ARCH=microblaze 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All warnings (new ones prefixed by >>):

   In file included from lib/kunit/executor.c:119:
   lib/kunit/executor_test.c: In function 'alloc_fake_suite':
>> lib/kunit/executor_test.c:129:2: warning: 'strncpy' specified bound 256 
>> equals destination size [-Wstringop-truncation]
 129 |  strncpy((char *)suite->name, suite_name, sizeof(suite->name));
 |  ^


vim +/strncpy +129 lib/kunit/executor_test.c

   121  
   122  static struct kunit_suite *alloc_fake_suite(struct kunit *test,
   123  const char *suite_name)
   124  {
   125  struct kunit_suite *suite;
   126  
   127  /* We normally never expect to allocate suites, hence the 
non-const cast. */
   128  suite = kunit_kzalloc(test, sizeof(*suite), GFP_KERNEL);
 > 129  strncpy((char *)suite->name, suite_name, sizeof(suite->name));

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


.config.gz
Description: application/gzip


Re: [PATCH 6/7] i915: Convert to verify_page_range()

2021-04-12 Thread Peter Zijlstra
On Mon, Apr 12, 2021 at 01:08:38PM -0700, Kees Cook wrote:
> On Mon, Apr 12, 2021 at 10:00:18AM +0200, Peter Zijlstra wrote:
> > @@ -1249,14 +1249,14 @@ static int check_absent_pte(pte_t *pte,
> >  
> >  static int check_present(unsigned long addr, unsigned long len)
> >  {
> > -   return apply_to_page_range(current->mm, addr, len,
> > -  check_present_pte, (void *)addr);
> > +   return verify_page_range(current->mm, addr, len,
> > +check_present_pte, (void *)addr);
> 
> For example, switch to returning bad addr through verify_page_range(),
> or have a by-reference value, etc:
> 
>   unsigned long failed;
> 
> >   failed = verify_page_range(current->mm, addr, len, check_present_pte);
>   if (failed) {
>   pr_err("missing PTE:%lx\n",
>  (addr - failed) >> PAGE_SHIFT);

OK, lemme try that.


Re: [Outreachy kernel] Subject: [PATCH v2] staging: media: meson: vdec: declare u32 as static const appropriately

2021-04-12 Thread Julia Lawall



On Tue, 13 Apr 2021, Mitali Borkar wrote:

> Declared 32 bit unsigned int as static constant inside a function
> appropriately.

I don't think that the description matches what is done.  Perhaps all the
meaning is intended to be in the word "appropriately", but that is not
very clear.  The message makes it look like static const is the new part,
but it is already there.

julia

>
> Reported-by: kernel test robot 
> Signed-off-by: Mitali Borkar 
> ---
>
> Changes from v1:- Rectified the mistake by declaring u32 as static const
> properly.
>
>  drivers/staging/media/meson/vdec/codec_h264.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/staging/media/meson/vdec/codec_h264.c 
> b/drivers/staging/media/meson/vdec/codec_h264.c
> index ea86e9e1c447..80141b89a9f6 100644
> --- a/drivers/staging/media/meson/vdec/codec_h264.c
> +++ b/drivers/staging/media/meson/vdec/codec_h264.c
> @@ -287,8 +287,8 @@ static void codec_h264_resume(struct amvdec_session *sess)
>   struct amvdec_core *core = sess->core;
>   struct codec_h264 *h264 = sess->priv;
>   u32 mb_width, mb_height, mb_total;
> - static const u32[] canvas3 = { ANCO_CANVAS_ADDR, 0 };
> - static const u32[] canvas4 = { 24, 0 };
> + static const u32 canvas3[] = { ANCO_CANVAS_ADDR, 0 };
> + static const u32 canvas4[] = { 24, 0 };
>
>   amvdec_set_canvases(sess, canvas3, canvas4);
>
> --
> 2.30.2
>
> --
> You received this message because you are subscribed to the Google Groups 
> "outreachy-kernel" group.
> To unsubscribe from this group and stop receiving emails from it, send an 
> email to outreachy-kernel+unsubscr...@googlegroups.com.
> To view this discussion on the web visit 
> https://groups.google.com/d/msgid/outreachy-kernel/YHU56OM%2BC2zY34VP%40kali.
>


Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

2021-04-12 Thread Peter Zijlstra
On Mon, Apr 12, 2021 at 06:22:42PM +0100, Valentin Schneider wrote:
> On 12/04/21 14:03, Peter Zijlstra wrote:
> > On Thu, Mar 11, 2021 at 03:13:04PM +, Valentin Schneider wrote:
> >> Peter Zijlstra  writes:
> >> > @@ -7910,6 +7908,14 @@ int sched_cpu_deactivate(unsigned int cp
> >> >}
> >> >rq_unlock_irqrestore(rq, &rf);
> >> >
> >> > +/*
> >> > + * From this point forward, this CPU will refuse to run any 
> >> > task that
> >> > + * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will 
> >> > actively
> >> > + * push those tasks away until this gets cleared, see
> >> > + * sched_cpu_dying().
> >> > + */
> >> > +balance_push_set(cpu, true);
> >> > +
> >>
> >> AIUI with cpu_dying_mask being flipped before even entering
> >> sched_cpu_deactivate(), we don't need this to be before the
> >> synchronize_rcu() anymore; is there more than that to why you're punting it
> >> back this side of it?
> >
> > > I think it does need to be like this, we need to clearly separate
> > the active=true and balance_push_set(). If we were to somehow observe
> > both balance_push_set() and active==false, we'd be in trouble.
> >
> 
> I'm afraid I don't follow; we're replacing a read of rq->balance_push with
> cpu_dying(), and those are still written on the same side of the
> synchronize_rcu(). What am I missing?

Yeah, I'm not sure anymore either; I tried to work out why I'd done
that but upon closer examination everything fell flat.

Let me try again today :-)

> Oooh, I can't read, only the boot CPU gets its callback uninstalled in
> sched_init()! So secondaries keep push_callback installed up until
> sched_cpu_activate(), but as you said it's not effective unless a rollback
> happens.
> 
> Now, doesn't that mean we should *not* uninstall the callback in
> sched_cpu_dying()? AFAIK it's possible for the initial secondary CPU
> boot to go fine, but the next offline+online cycle fails while going up -
> that would need to rollback with push_callback installed.

Quite; I removed that shortly after sending this; when I tried to write
a comment and found it.


Re: [PATCH] MIPS: fix memory reservation for non-usermem setups

2021-04-12 Thread Ilya Lipnitskiy
On Mon, Apr 12, 2021 at 11:45 PM Ilya Lipnitskiy
 wrote:
>
> Hi Thomas,
>
> On Tue, Apr 6, 2021 at 6:18 AM Thomas Bogendoerfer
>  wrote:
> >
> > On Sat, Apr 03, 2021 at 07:02:13PM -0700, Ilya Lipnitskiy wrote:
> > > Hi Mike,
> > >
> > > On Tue, Mar 16, 2021 at 11:33 PM Mike Rapoport  wrote:
> > > >
> > > > Hi Ilya,
> > > >
> > > > On Tue, Mar 16, 2021 at 10:10:09PM -0700, Ilya Lipnitskiy wrote:
> > > > > Hi Thomas,
> > > > >
> > > > > On Fri, Mar 12, 2021 at 7:19 AM Thomas Bogendoerfer
> > > > >  wrote:
> > > > > >
> > > > > > On Sun, Mar 07, 2021 at 11:40:30AM -0800, Ilya Lipnitskiy wrote:
> > > > > > > From: Tobias Wolf 
> > > > > > >
> > > > > > > Commit 67a3ba25aa95 ("MIPS: Fix incorrect mem=X@Y handling") 
> > > > > > > introduced a new
> > > > > > > issue for rt288x where "PHYS_OFFSET" is 0x0 but the calculated 
> > > > > > > "ramstart" is
> > > > > > > not. As the prerequisite of custom memory map has been removed, 
> > > > > > > this results
> > > > > > > in the full memory range of 0x0 - 0x800 to be marked as 
> > > > > > > reserved for this
> > > > > > > platform.
> > > > > >
> > > > > > and where is the problem here ?
> > > > > Turns out this was already attempted to be upstreamed - not clear why
> > > > > it wasn't merged. Context:
> > > > > https://lore.kernel.org/linux-mips/6504517.U6H5IhoIOn@loki/
> > > > >
> > > > > I hope the thread above helps you understand the problem.
> > > >
> > > > The memory initialization was a bit different then. Do you still see the
> > > > same problem?
> > > Thanks for asking. I obtained a RT2880 device and gave it a try. It
> > > hangs at boot without this patch, however selecting
> >
> > can you provide debug logs with memblock=debug for both good and bad
> > kernels ? I'm curious what's the reason for failing allocation...
> Sorry for taking a while to respond. See attached.
> FWIW, it seems these are the lines that stand out in hang.log:
> [0.00] memblock_reserve: [0x-0x07ff] 
> setup_arch+0x214/0x5d8
> [0.00] Wasting 1048576 bytes for tracking 32768 unused pages
> ...
> [0.00]  reserved[0x0][0x-0x087137aa], 0x087137ab
> bytes flags: 0x0
Just to be clear, good.log is mips-next tip (dbd815c0dcca) and
hang.log is the same with MIPS_AUTO_PFN_OFFSET _NOT_ selected.

Ilya


Re: [PATCH] vfio/pci: Add missing range check in vfio_pci_mmap

2021-04-12 Thread Cornelia Huck
On Mon, 12 Apr 2021 23:41:24 +0200
"Christian A. Ehrhardt"  wrote:

> When mmaping an extra device region verify that the region index
> derived from the mmap offset is valid.
> 
> Fixes: a15b1883fee1 ("vfio_pci: Allow mapping extra regions")
> Cc: sta...@vger.kernel.org
> Signed-off-by: Christian A. Ehrhardt 
> ---
>  drivers/vfio/pci/vfio_pci.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)

Reviewed-by: Cornelia Huck 



Re: [PATCH v1 6/7] mfd: lpc_ich: Add support for pinctrl in non-ACPI system

2021-04-12 Thread Henning Schild
Am Mon, 12 Apr 2021 20:34:45 +0300
schrieb Andy Shevchenko :

> On Mon, Apr 12, 2021 at 07:16:53PM +0200, Henning Schild wrote:
> > Am Mon, 12 Apr 2021 19:59:05 +0300
> > schrieb Andy Shevchenko :  
> > > On Mon, Apr 12, 2021 at 06:40:01PM +0200, Henning Schild wrote:  
> > > > Tan or Andy,
> > > > 
> > > > maybe you can point me to a user of that patch. I guess there
> > > > might be an out-of-tree driver or userland code on how to use
> > > > the GPIOs from there.
> > > 
> > > I'm confused. User of this patch is pinctrl-broxton driver.
> > > It's in upstream.  
> > 
> > Should this appear in /sys/class/gpio as chip so that pins can be
> > exported?  
> 
> No. Sysfs interface is deprecated. It should appear as /dev/gpiochip0
> or so.

Ok, just found that there is a null pointer deref in the probe function
of the pinctrl driver, looking into that.

Meanwhile i think i will need a similar patch for
pinctrl-sunrisepoint.c for that wdt, do you happen to have that as
well? Or a spec where to find all the magic numbers.

regards,
Henning

> 
> > That is what i tried and failed with.
> >   
> > > Using GPIOs from it is something as done in a few drivers already
> > > (Assuming we have no resources described in the ACPI). I.e. you
> > > need to register in board file the GPIO mapping table with help of
> > > devm_acpi_dev_add_driver_gpios() and use one of gpiod_get()
> > > family of functions to request it.
> > > 
> > > In case of LEDs you simple describe GPIO device name in lookup
> > > table and that's it. The drivers/platform/x86/pcengines-apuv2.c
> > > not the best but will give you an idea how to use "leds-gpio"
> > > driver in board files.  
> > 
> > I am aware of that driver and had a look at it. In order to figure
> > out the arguments for the macros/functions i was hoping for
> > userland gpio "export", but maybe that does not work here ...
> > For now i will assume that it does not show up in sysfs and can
> > maybe still be used, and try to build on top.  
> 
> Just switch to use libgpiod and associated tools / bindings in user
> space. Sysfs ABI is not being developed anymore.
> 



Re: [PATCH] MIPS: fix memory reservation for non-usermem setups

2021-04-12 Thread Ilya Lipnitskiy
Hi Thomas,

On Tue, Apr 6, 2021 at 6:18 AM Thomas Bogendoerfer
 wrote:
>
> On Sat, Apr 03, 2021 at 07:02:13PM -0700, Ilya Lipnitskiy wrote:
> > Hi Mike,
> >
> > On Tue, Mar 16, 2021 at 11:33 PM Mike Rapoport  wrote:
> > >
> > > Hi Ilya,
> > >
> > > On Tue, Mar 16, 2021 at 10:10:09PM -0700, Ilya Lipnitskiy wrote:
> > > > Hi Thomas,
> > > >
> > > > On Fri, Mar 12, 2021 at 7:19 AM Thomas Bogendoerfer
> > > >  wrote:
> > > > >
> > > > > On Sun, Mar 07, 2021 at 11:40:30AM -0800, Ilya Lipnitskiy wrote:
> > > > > > From: Tobias Wolf 
> > > > > >
> > > > > > Commit 67a3ba25aa95 ("MIPS: Fix incorrect mem=X@Y handling") 
> > > > > > introduced a new
> > > > > > issue for rt288x where "PHYS_OFFSET" is 0x0 but the calculated 
> > > > > > "ramstart" is
> > > > > > not. As the prerequisite of custom memory map has been removed, 
> > > > > > this results
> > > > > > in the full memory range of 0x0 - 0x800 to be marked as 
> > > > > > reserved for this
> > > > > > platform.
> > > > >
> > > > > and where is the problem here ?
> > > > Turns out this was already attempted to be upstreamed - not clear why
> > > > it wasn't merged. Context:
> > > > https://lore.kernel.org/linux-mips/6504517.U6H5IhoIOn@loki/
> > > >
> > > > I hope the thread above helps you understand the problem.
> > >
> > > The memory initialization was a bit different then. Do you still see the
> > > same problem?
> > Thanks for asking. I obtained a RT2880 device and gave it a try. It
> > hangs at boot without this patch, however selecting
>
> can you provide debug logs with memblock=debug for both good and bad
> kernels ? I'm curious what's the reason for failing allocation...
Sorry for taking a while to respond. See attached.
FWIW, it seems these are the lines that stand out in hang.log:
[0.00] memblock_reserve: [0x-0x07ff] setup_arch+0x214/0x5d8
[0.00] Wasting 1048576 bytes for tracking 32768 unused pages
...
[0.00]  reserved[0x0][0x-0x087137aa], 0x087137ab
bytes flags: 0x0

Ilya
[0.00] Linux version 5.12.0-rc2+ (builder@buildhost) (mipsel-openwrt-linux-musl-gcc (OpenWrt GCC 7.5.0 r4-7145a72d3ce2) 7.5.0, GNU ld (GNU Binutils) 2.31.1) #4 Mon Apr 12 23:41:18 PDT 2021
[0.00] SoC Type: Ralink RT2880 id:2 rev:1
[0.00] printk: bootconsole [early0] enabled
[0.00] CPU0 revision is: 0001906c (MIPS 4KEc)
[0.00] MIPS: machine is Belkin F5D8235 v1
[0.00] memblock_reserve: [0x085d84a8-0x085d9f5e] setup_arch+0x14c/0x5c0
[0.00] memblock_reserve: [0x0800-0x0871378f] setup_arch+0x220/0x5c0
[0.00] Initrd not found or empty - disabling initrd
[0.00] memblock_alloc_try_nid: 6839 bytes align=0x40 nid=-1 from=0x max_addr=0x early_init_dt_alloc_memory_arch+0x40/0x84
[0.00] memblock_reserve: [0x087137c0-0x08715276] memblock_alloc_range_nid+0xf0/0x184
[0.00] memblock_alloc_try_nid: 21180 bytes align=0x4 nid=-1 from=0x max_addr=0x early_init_dt_alloc_memory_arch+0x40/0x84
[0.00] memblock_reserve: [0x08715278-0x0871a533] memblock_alloc_range_nid+0xf0/0x184
[0.00] memblock_alloc_try_nid: 27 bytes align=0x4 nid=-1 from=0x max_addr=0x early_init_dt_alloc_memory_arch+0x40/0x84
[0.00] memblock_reserve: [0x08713790-0x087137aa] memblock_alloc_range_nid+0xf0/0x184
[0.00] memblock_reserve: [0x08526000-0x08525fff] setup_arch+0x390/0x5c0
[0.00] memblock_alloc_try_nid: 32 bytes align=0x10 nid=-1 from=0x max_addr=0x setup_arch+0x4ec/0x5c0
[0.00] memblock_reserve: [0x0871a540-0x0871a55f] memblock_alloc_range_nid+0xf0/0x184
[0.00] Primary instruction cache 16kB, VIPT, 4-way, linesize 16 bytes.
[0.00] Primary data cache 16kB, 4-way, VIPT, no aliases, linesize 16 bytes
[0.00] Zone ranges:
[0.00]   Normal   [mem 0x0800-0x09ff]
[0.00] Movable zone start for each node
[0.00] Early memory node ranges
[0.00]   node   0: [mem 0x0800-0x09ff]
[0.00] Initmem setup node 0 [mem 0x0800-0x09ff]
[0.00] memblock_alloc_try_nid: 262144 bytes align=0x10 nid=0 from=0x max_addr=0x alloc_node_mem_map.constprop.145+0x6c/0xd0
[0.00] memblock_reserve: [0x0871a560-0x0875a55f] memblock_alloc_range_nid+0xf0/0x184
[0.00] memblock_alloc_try_nid: 4 bytes align=0x10 nid=0 from=0x max_addr=0x setup_usemap+0x64/0x98
[0.00] memblock_reserve: [0x087137b0-0x087137b3] memblock_alloc_range_nid+0xf0/0x184
[0.00] MEMBLOCK configuration:
[0.00]  memory size = 0x0200 reserved size = 0x0075b542
[0.00]  memory.cnt  = 0x1
[0.00]  memory[0x0]	[0x0800-0x09ff], 0x0200 bytes flags: 0x0
[0.00]  reserved.cnt  = 0x6
[0.00]  reserved[0x0]	[0x-0x0fff], 0x1000 bytes flags: 0x0
[0.00]  reserved[0x1]	[0x0800-0x08

Re: [PATCH v2 resend] mm/memory_hotplug: Make unpopulated zones PCP structures unreachable during hot remove

2021-04-12 Thread Michal Hocko
On Mon 12-04-21 14:40:18, Vlastimil Babka wrote:
> On 4/12/21 2:08 PM, Mel Gorman wrote:
> > zone_pcp_reset allegedly protects against a race with drain_pages
> > using local_irq_save but this is bogus. local_irq_save only operates
> > on the local CPU. If memory hotplug is running on CPU A and drain_pages
> > is running on CPU B, disabling IRQs on CPU A does not affect CPU B and
> > offers no protection.
> > 
> > This patch deletes IRQ disable/enable on the grounds that IRQs protect
> > nothing and assumes the existing hotplug paths guarantees the PCP cannot be
> > used after zone_pcp_enable(). That should be the case already because all
> > the pages have been freed and there is no page to put on the PCP lists.
> > 
> > Signed-off-by: Mel Gorman 
> 
> Yeah the irq disabling here is clearly bogus, so:
> 
> Acked-by: Vlastimil Babka 
> 
> But I think Michal has a point that we might best leave the pagesets around, 
> by
> a future change. I have some doubts that even with your reordering of the
> reset/destroy after zonelist rebuild in v1 they can't be reachable. We have 
> no
> protection between zonelist rebuild and zonelist traversal, and that's why we
> just leave pgdats around.
> 
> So I can imagine a task racing with memory hotremove might see watermarks as 
> ok
> in get_page_from_freelist() for the zone and proceeds to try_this_zone:, then
> gets stalled/scheduled out while hotremove rebuilds the zonelist and destroys
> the pcplists, then the first task is resumed and proceeds with 
> rmqueue_pcplist().
> 
> So that's very rare thus not urgent, and this patch doesn't make it less rare 
> so
> not a reason to block it.

Completely agreed here. Not an urgent thing to work on but something to
look into long term.

-- 
Michal Hocko
SUSE Labs


Re: [PATCH v5] lib: add basic KUnit test for lib/math

2021-04-12 Thread David Gow
On Tue, Apr 13, 2021 at 3:07 AM Daniel Latypov  wrote:
>
> Add basic test coverage for files that don't require any config options:
> * part of math.h (what seem to be the most commonly used macros)
> * gcd.c
> * lcm.c
> * int_sqrt.c
> * reciprocal_div.c
> (Ignored int_pow.c since it's a simple textbook algorithm.)
>
> These tests aren't particularly interesting, but they
> * provide short and simple examples of parameterized tests
> * provide a place to add tests for any new files in this dir
> * are written so adding new test cases to cover edge cases should be easy
>   * looking at code coverage, we hit all the branches in the .c files
>
> Signed-off-by: Daniel Latypov 

This looks good to me. A few comments/observations below, but nothing
that I think should actually block this.

Reviewed-by: David Gow 

-- David

> ---
> Changes since v4:
> * add in test cases for some math.h macros (abs, round_up/round_down,
>   div_round_down/closest)
> * use parameterized testing less to keep things terser
>
> Changes since v3:
> * fix `checkpatch.pl --strict` warnings
> * add test cases for gcd(0,0) and lcm(0,0)
> * minor: don't test both gcd(a,b) and gcd(b,a) when a == b
>
> Changes since v2: mv math_test.c => math_kunit.c
>
> Changes since v1:
> * Rebase and rewrite to use the new parameterized testing support.
> * misc: fix overflow in literal and inline int_sqrt format string.
> * related: commit 1f0e943df68a ("Documentation: kunit: provide guidance
> for testing many inputs") was merged explaining the patterns shown here.
>   * there's an in-flight patch to update it for parameterized testing.
>
> v1: https://lore.kernel.org/lkml/20201019224556.3536790-1-dlaty...@google.com/
> ---
>  lib/math/Kconfig  |   5 +
>  lib/math/Makefile |   2 +
>  lib/math/math_kunit.c | 264 ++
>  3 files changed, 271 insertions(+)
>  create mode 100644 lib/math/math_kunit.c
>
> diff --git a/lib/math/Kconfig b/lib/math/Kconfig
> index f19bc9734fa7..6ba8680439c1 100644
> --- a/lib/math/Kconfig
> +++ b/lib/math/Kconfig
> @@ -15,3 +15,8 @@ config PRIME_NUMBERS
>
>  config RATIONAL
> bool
> +
> +config MATH_KUNIT_TEST
> +   tristate "KUnit test for lib/math" if !KUNIT_ALL_TESTS
> +   default KUNIT_ALL_TESTS
> +   depends on KUNIT

This could have a description of the test and KUnit here, as mentioned
in the style guide doc:
https://www.kernel.org/doc/html/latest/dev-tools/kunit/style.html#test-kconfig-entries

(I think it's sufficiently self explanatory that it's not essential,
but it could be nice to have a more detailed description of the things
being tested than just "lib/math".)

> diff --git a/lib/math/Makefile b/lib/math/Makefile
> index be6909e943bd..30abb7a8d564 100644
> --- a/lib/math/Makefile
> +++ b/lib/math/Makefile
> @@ -4,3 +4,5 @@ obj-y += div64.o gcd.o lcm.o int_pow.o int_sqrt.o 
> reciprocal_div.o
>  obj-$(CONFIG_CORDIC)   += cordic.o
>  obj-$(CONFIG_PRIME_NUMBERS)+= prime_numbers.o
>  obj-$(CONFIG_RATIONAL) += rational.o
> +
> +obj-$(CONFIG_MATH_KUNIT_TEST)  += math_kunit.o
> diff --git a/lib/math/math_kunit.c b/lib/math/math_kunit.c
> new file mode 100644
> index ..80a087a32884
> --- /dev/null
> +++ b/lib/math/math_kunit.c
> @@ -0,0 +1,264 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Simple KUnit suite for math helper funcs that are always enabled.
> + *
> + * Copyright (C) 2020, Google LLC.
> + * Author: Daniel Latypov 
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +static void abs_test(struct kunit *test)
> +{

There's something weird about taking the absolute values of char
literals. I'm not sure if it's better to cast integer literals (like
with 'short' below), or keep it as-is.
> +   KUNIT_EXPECT_EQ(test, abs('\0'), '\0');
> +   KUNIT_EXPECT_EQ(test, abs('a'), 'a');
> +   KUNIT_EXPECT_EQ(test, abs(-'a'), 'a');
> +
> +   /* The expression in the macro is actually promoted to an int. */
> +   KUNIT_EXPECT_EQ(test, abs((short)0),  0);
> +   KUNIT_EXPECT_EQ(test, abs((short)42),  42);
> +   KUNIT_EXPECT_EQ(test, abs((short)-42),  42);
> +
> +   KUNIT_EXPECT_EQ(test, abs(0),  0);
> +   KUNIT_EXPECT_EQ(test, abs(42),  42);
> +   KUNIT_EXPECT_EQ(test, abs(-42),  42);
> +
> +   KUNIT_EXPECT_EQ(test, abs(0L), 0L);
> +   KUNIT_EXPECT_EQ(test, abs(42L), 42L);
> +   KUNIT_EXPECT_EQ(test, abs(-42L), 42L);
> +
> +   KUNIT_EXPECT_EQ(test, abs(0LL), 0LL);
> +   KUNIT_EXPECT_EQ(test, abs(42LL), 42LL);
> +   KUNIT_EXPECT_EQ(test, abs(-42LL), 42LL);
> +
> +   /* Unsigned types get casted to signed. */
> +   KUNIT_EXPECT_EQ(test, abs(0ULL), 0LL);
> +   KUNIT_EXPECT_EQ(test, abs(42ULL), 42LL);

A part of me is curious what the result is for -0x8000, but I
guess that's not defined, so shouldn't be tested. :-)
> +}
> +
> +static void int_sqrt_test(struct kunit *test)
> +{
> +   KUNIT_EXPECT_EQ(test, int_sqrt(0U

Re: [PATCH v2 resend] mm/memory_hotplug: Make unpopulated zones PCP structures unreachable during hot remove

2021-04-12 Thread Michal Hocko
On Mon 12-04-21 13:08:42, Mel Gorman wrote:
> zone_pcp_reset allegedly protects against a race with drain_pages
> using local_irq_save but this is bogus. local_irq_save only operates
> on the local CPU. If memory hotplug is running on CPU A and drain_pages
> is running on CPU B, disabling IRQs on CPU A does not affect CPU B and
> offers no protection.
> 
> This patch deletes IRQ disable/enable on the grounds that IRQs protect
> nothing and assumes the existing hotplug paths guarantees the PCP cannot be
> used after zone_pcp_enable(). That should be the case already because all
> the pages have been freed and there is no page to put on the PCP lists.

Yes, that is the case since ec6e8c7e0314 ("mm, page_alloc: disable
pcplists during memory offline"). Prior to this commit the behavior was
undefined but full zone/node hotremove is rare enough that an existing
race was likely never observed.

Acked-by: Michal Hocko 

Thanks!
 
> Signed-off-by: Mel Gorman 
> ---
> Resending for email address correction and adding lists
> 
> Changelog since v1
> o Minimal fix
> 
>  mm/page_alloc.c | 4 
>  1 file changed, 4 deletions(-)
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 5e8aedb64b57..9bf0db982f14 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -8952,12 +8952,9 @@ void zone_pcp_enable(struct zone *zone)
>  
>  void zone_pcp_reset(struct zone *zone)
>  {
> - unsigned long flags;
>   int cpu;
>   struct per_cpu_pageset *pset;
>  
> - /* avoid races with drain_pages()  */
> - local_irq_save(flags);
>   if (zone->pageset != &boot_pageset) {
>   for_each_online_cpu(cpu) {
>   pset = per_cpu_ptr(zone->pageset, cpu);
> @@ -8966,7 +8963,6 @@ void zone_pcp_reset(struct zone *zone)
>   free_percpu(zone->pageset);
>   zone->pageset = &boot_pageset;
>   }
> - local_irq_restore(flags);
>  }
>  
>  #ifdef CONFIG_MEMORY_HOTREMOVE

-- 
Michal Hocko
SUSE Labs


[PATCH] stm class: remove useless function

2021-04-12 Thread Jiapeng Chong
Fix the following clang warning:

drivers/hwtracing/stm/policy.c:60:21: warning: unused function
'stp_policy_node_name' [-Wunused-function].

Reported-by: Abaci Robot 
Signed-off-by: Jiapeng Chong 
---
 drivers/hwtracing/stm/policy.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/drivers/hwtracing/stm/policy.c b/drivers/hwtracing/stm/policy.c
index 603b4a99..42103c3 100644
--- a/drivers/hwtracing/stm/policy.c
+++ b/drivers/hwtracing/stm/policy.c
@@ -57,11 +57,6 @@ void stp_policy_node_get_ranges(struct stp_policy_node 
*policy_node,
*cend   = policy_node->last_channel;
 }
 
-static inline char *stp_policy_node_name(struct stp_policy_node *policy_node)
-{
-   return policy_node->group.cg_item.ci_name ? : "";
-}
-
 static inline struct stp_policy *to_stp_policy(struct config_item *item)
 {
return item ?
-- 
1.8.3.1



RE: [PATCH v7 1/2] platform/x86: dell-privacy: Add support for Dell hardware privacy

2021-04-12 Thread Yuan, Perry
Hi ,
> -Original Message-
> From: Amadeusz Sławiński 
> Sent: 2021年4月12日 18:40
> To: Yuan, Perry; po...@protonmail.com; pierre-
> louis.boss...@linux.intel.com; oder_ch...@realtek.com; pe...@perex.cz;
> ti...@suse.com; hdego...@redhat.com; mgr...@linux.intel.com
> Cc: alsa-de...@alsa-project.org; linux-kernel@vger.kernel.org;
> lgirdw...@gmail.com; platform-driver-...@vger.kernel.org;
> broo...@kernel.org; Dell Client Kernel; mario.limoncie...@outlook.com
> Subject: Re: [PATCH v7 1/2] platform/x86: dell-privacy: Add support for Dell
> hardware privacy
> 
> 
> [EXTERNAL EMAIL]
> 
> On 4/12/2021 11:19 AM, Perry Yuan wrote:
> > From: Perry Yuan 
> >
> 
> (...)
> 
> > diff --git a/drivers/platform/x86/dell/dell-laptop.c
> > b/drivers/platform/x86/dell/dell-laptop.c
> > index 70edc5bb3a14..e7ffc0b81208 100644
> > --- a/drivers/platform/x86/dell/dell-laptop.c
> > +++ b/drivers/platform/x86/dell/dell-laptop.c
> > @@ -31,6 +31,8 @@
> >   #include "dell-rbtn.h"
> >   #include "dell-smbios.h"
> >
> > +#include "dell-privacy-wmi.h"
> > +
> >   struct quirk_entry {
> > bool touchpad_led;
> > bool kbd_led_not_present;
> > @@ -90,6 +92,7 @@ static struct rfkill *wifi_rfkill;
> >   static struct rfkill *bluetooth_rfkill;
> >   static struct rfkill *wwan_rfkill;
> >   static bool force_rfkill;
> > +static bool has_privacy;
> >
> >   module_param(force_rfkill, bool, 0444);
> >   MODULE_PARM_DESC(force_rfkill, "enable rfkill on non whitelisted
> > models"); @@ -2206,10 +2209,16 @@ static int __init dell_init(void)
> >
> > if (dell_smbios_find_token(GLOBAL_MIC_MUTE_DISABLE) &&
> > dell_smbios_find_token(GLOBAL_MIC_MUTE_ENABLE)) {
> > -   micmute_led_cdev.brightness =
> ledtrig_audio_get(LED_AUDIO_MICMUTE);
> > -   ret = led_classdev_register(&platform_device->dev,
> &micmute_led_cdev);
> > -   if (ret < 0)
> > -   goto fail_led;
> > +   if (dell_privacy_present())
> > +   has_privacy = true;
> > +   else
> > +   has_privacy = false;
> 
> A bit of nitpicking, but you can write the above shorter:
> has_privacy = dell_privacy_present();

Good point, changed the code as you suggested.
Thank you.
Perry.


[PATCH v8] RISC-V: enable XIP

2021-04-12 Thread Alexandre Ghiti
From: Vitaly Wool 

Introduce XIP (eXecute In Place) support for RISC-V platforms.
It allows code to be executed directly from non-volatile storage
directly addressable by the CPU, such as QSPI NOR flash which can
be found on many RISC-V platforms. This makes way for significant
optimization of RAM footprint. The XIP kernel is not compressed
since it has to run directly from flash, so it will occupy more
space on the non-volatile storage. The physical flash address used
to link the kernel object files and for storing it has to be known
at compile time and is represented by a Kconfig option.

XIP on RISC-V will for the time being only work on MMU-enabled
kernels.

Signed-off-by: Alexandre Ghiti  [ Rebase on top of "Move
kernel mapping outside the linear mapping" ]
Signed-off-by: Vitaly Wool 
---
 arch/riscv/Kconfig  |  55 +++-
 arch/riscv/Makefile |   8 +-
 arch/riscv/boot/Makefile|  13 +++
 arch/riscv/include/asm/page.h   |  21 +
 arch/riscv/include/asm/pgtable.h|  25 +-
 arch/riscv/kernel/head.S|  46 +-
 arch/riscv/kernel/head.h|   3 +
 arch/riscv/kernel/setup.c   |  10 ++-
 arch/riscv/kernel/vmlinux-xip.lds.S | 133 
 arch/riscv/kernel/vmlinux.lds.S |   6 ++
 arch/riscv/mm/init.c| 115 ++--
 11 files changed, 418 insertions(+), 17 deletions(-)
 create mode 100644 arch/riscv/kernel/vmlinux-xip.lds.S

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 8ea60a0a19ae..7c7efdd67a10 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -28,7 +28,7 @@ config RISCV
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SET_DIRECT_MAP
select ARCH_HAS_SET_MEMORY
-   select ARCH_HAS_STRICT_KERNEL_RWX if MMU
+   select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT
@@ -441,7 +441,7 @@ config EFI_STUB
 
 config EFI
bool "UEFI runtime support"
-   depends on OF
+   depends on OF && !XIP_KERNEL
select LIBFDT
select UCS2_STRING
select EFI_PARAMS_FROM_FDT
@@ -465,11 +465,60 @@ config STACKPROTECTOR_PER_TASK
def_bool y
depends on STACKPROTECTOR && CC_HAVE_STACKPROTECTOR_TLS
 
+config PHYS_RAM_BASE_FIXED
+   bool "Explicitly specified physical RAM address"
+   default n
+
+config PHYS_RAM_BASE
+   hex "Platform Physical RAM address"
+   depends on PHYS_RAM_BASE_FIXED
+   default "0x8000"
+   help
+ This is the physical address of RAM in the system. It has to be
+ explicitly specified to run early relocations of read-write data
+ from flash to RAM.
+
+config XIP_KERNEL
+   bool "Kernel Execute-In-Place from ROM"
+   depends on MMU && SPARSEMEM
+   select PHYS_RAM_BASE_FIXED
+   help
+ Execute-In-Place allows the kernel to run from non-volatile storage
+ directly addressable by the CPU, such as NOR flash. This saves RAM
+ space since the text section of the kernel is not loaded from flash
+ to RAM.  Read-write sections, such as the data section and stack,
+ are still copied to RAM.  The XIP kernel is not compressed since
+ it has to run directly from flash, so it will take more space to
+ store it.  The flash address used to link the kernel object files,
+ and for storing it, is configuration dependent. Therefore, if you
+ say Y here, you must know the proper physical address where to
+ store the kernel image depending on your own flash memory usage.
+
+ Also note that the make target becomes "make xipImage" rather than
+ "make zImage" or "make Image".  The final kernel binary to put in
+ ROM memory will be arch/riscv/boot/xipImage.
+
+ SPARSEMEM is required because the kernel text and rodata that are
+ flash resident are not backed by memmap, then any attempt to get
+ a struct page on those regions will trigger a fault.
+
+ If unsure, say N.
+
+config XIP_PHYS_ADDR
+   hex "XIP Kernel Physical Location"
+   depends on XIP_KERNEL
+   default "0x2100"
+   help
+ This is the physical address in your flash memory the kernel will
+ be linked for and stored to.  This address is dependent on your
+ own flash usage.
+
 endmenu
 
 config BUILTIN_DTB
-   def_bool n
+   bool
depends on OF
+   default y if XIP_KERNEL
 
 menu "Power management options"
 
diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
index 1368d943f1f3..8fcbec03974d 100644
--- a/arch/riscv/Makefile
+++ b/arch/riscv/Makefile
@@ -82,7 +82,11 @@ CHECKFLAGS += -D__riscv -D__riscv_xlen=$(BITS)
 
 # Default target when executing plain make
 boot   :=

Re: [PATCH RESEND v1 0/4] powerpc/vdso: Add support for time namespaces

2021-04-12 Thread Michael Ellerman
Thomas Gleixner  writes:
> On Wed, Mar 31 2021 at 16:48, Christophe Leroy wrote:
>> [Sorry, resending with complete destination list, I used the wrong script on 
>> the first delivery]
>>
>> This series adds support for time namespaces on powerpc.
>>
>> All timens selftests are successful.
>
> If PPC people want to pick up the whole lot, no objections from my side.

Thanks, will do.

cheers


Re: [PATCH for-next v3 0/2] Introduce rdma_set_min_rnr_timer() and use it in RDS

2021-04-12 Thread Leon Romanovsky
On Mon, Apr 12, 2021 at 07:58:47PM -0300, Jason Gunthorpe wrote:
> On Wed, Mar 31, 2021 at 08:43:12PM +0200, Håkon Bugge wrote:
> > ib_modify_qp() is an expensive operation on some HCAs running
> > virtualized. This series removes two ib_modify_qp() calls from RDS.
> > 
> > I am sending this as a v3, even though it is the first sent to
> > net. This is because the IB Core commit has reached v3.
> > 
> > Håkon Bugge (2):
> >   IB/cma: Introduce rdma_set_min_rnr_timer()
> >   rds: ib: Remove two ib_modify_qp() calls
> 
> Applied to rdma for-next, thanks

Jason,

It should be 
+   WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT);

and not
+   if (WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT))
+   return -EINVAL;

Thanks

> 
> Jason


Re: [PATCH 1/4] dt-bindings: Add bindings for aspeed pwm-tach.

2021-04-12 Thread Billy Tsai
Hi,

Best Regards,
Billy Tsai

On 2021/4/12, 8:55 PM,Uwe Kleine-Königwrote:

> Hello,

On Mon, Apr 12, 2021 at 05:54:54PM +0800, Billy Tsai wrote:
> +  - Billy Tsai 

> I object because the MTA at aspeedtech.com doesn't know this email
> address.

This is a typo; my email address is billy_t...@aspeedtech.com
I will fix it in v2.

> Best regards
> Uwe

> -- 
> Pengutronix e.K.   | Uwe Kleine-König|
> Industrial Linux Solutions | https://www.pengutronix.de/ |



arch/mips/n64/init.c:57:38: sparse: sparse: incorrect type in argument 2 (different address spaces)

2021-04-12 Thread kernel test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 
master
head:   89698becf06d341a700913c3d89ce2a914af69a2
commit: baec970aa5ba11099ad7a91773350c91fb2113f0 mips: Add N64 machine type
date:   3 months ago
config: mips-randconfig-s032-20210413 (attached as .config)
compiler: mips64-linux-gcc (GCC) 9.3.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# apt-get install sparse
# sparse version: v0.6.3-280-g2cd6d34e-dirty
# 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=baec970aa5ba11099ad7a91773350c91fb2113f0
git remote add linus 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
git fetch --no-tags linus master
git checkout baec970aa5ba11099ad7a91773350c91fb2113f0
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross C=1 
CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=mips 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 


sparse warnings: (new ones prefixed by >>)
   command-line: note: in included file:
   builtin:1:9: sparse: sparse: preprocessor token __ATOMIC_ACQUIRE redefined
   builtin:0:0: sparse: this was the original definition
   builtin:1:9: sparse: sparse: preprocessor token __ATOMIC_SEQ_CST redefined
   builtin:0:0: sparse: this was the original definition
   builtin:1:9: sparse: sparse: preprocessor token __ATOMIC_ACQ_REL redefined
   builtin:0:0: sparse: this was the original definition
   builtin:1:9: sparse: sparse: preprocessor token __ATOMIC_RELEASE redefined
   builtin:0:0: sparse: this was the original definition
>> arch/mips/n64/init.c:57:38: sparse: sparse: incorrect type in argument 2 
>> (different address spaces) @@ expected void volatile [noderef] __iomem 
>> *mem @@ got unsigned int [usertype] * @@
   arch/mips/n64/init.c:57:38: sparse: expected void volatile [noderef] 
__iomem *mem
   arch/mips/n64/init.c:57:38: sparse: got unsigned int [usertype] *

vim +57 arch/mips/n64/init.c

54  
55  static void __init n64rdp_write_reg(const u8 reg, const u32 value)
56  {
  > 57  __raw_writel(value, REG_BASE + reg);
58  }
59  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


.config.gz
Description: application/gzip


Subject: [PATCH v2] staging: media: meson: vdec: declare u32 as static const appropriately

2021-04-12 Thread Mitali Borkar
Declared 32 bit unsigned int as static constant inside a function
appropriately.

Reported-by: kernel test robot 
Signed-off-by: Mitali Borkar 
---

Changes from v1:- Rectified the mistake by declaring u32 as static const
properly.

 drivers/staging/media/meson/vdec/codec_h264.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/media/meson/vdec/codec_h264.c 
b/drivers/staging/media/meson/vdec/codec_h264.c
index ea86e9e1c447..80141b89a9f6 100644
--- a/drivers/staging/media/meson/vdec/codec_h264.c
+++ b/drivers/staging/media/meson/vdec/codec_h264.c
@@ -287,8 +287,8 @@ static void codec_h264_resume(struct amvdec_session *sess)
struct amvdec_core *core = sess->core;
struct codec_h264 *h264 = sess->priv;
u32 mb_width, mb_height, mb_total;
-   static const u32[] canvas3 = { ANCO_CANVAS_ADDR, 0 };
-   static const u32[] canvas4 = { 24, 0 };
+   static const u32 canvas3[] = { ANCO_CANVAS_ADDR, 0 };
+   static const u32 canvas4[] = { 24, 0 };
 
amvdec_set_canvases(sess, canvas3, canvas4);
 
-- 
2.30.2



[PATCH] kernel:irq:manage: request threaded irq with a specified priority

2021-04-12 Thread Song Chen
In general, irq handler thread will be assigned a default priority which
is MAX_RT_PRIO/2, as a result, no one can preempt others.

Here is the case I found in a real project, an interrupt int_a is
coming, wakes up its handler handler_a and handler_a wakes up a
userspace RT process task_a.

However, if another irq handler handler_b which has nothing to do
with any RT tasks is running when int_a is coming, handler_a can't
preempt handler_b; as a result, task_a can't be woken up immediately
as expected until handler_b gives up cpu voluntarily. In this case,
determinism breaks.

Therefore, this patch introduces a new API to give drivers a chance to
assign expected priorities to their irq handler thread.

Signed-off-by: Song Chen 
---
 include/linux/interrupt.h  |  7 +
 include/linux/sched.h  |  1 +
 include/linux/sched/prio.h |  1 +
 kernel/irq/manage.c| 64 +++---
 kernel/sched/core.c| 11 
 5 files changed, 80 insertions(+), 4 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 967e257..5ab9169 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -121,6 +121,7 @@ struct irqaction {
unsigned long   thread_mask;
const char  *name;
struct proc_dir_entry   *dir;
+   int prio;
 } cacheline_internodealigned_in_smp;
 
 extern irqreturn_t no_action(int cpl, void *dev_id);
@@ -136,6 +137,12 @@ extern irqreturn_t no_action(int cpl, void *dev_id);
 #define IRQ_NOTCONNECTED   (1U << 31)
 
 extern int __must_check
+request_threaded_irq_with_prio(unsigned int irq, irq_handler_t handler,
+irq_handler_t thread_fn,
+unsigned long flags, const char *name, void *dev,
+int prio);
+
+extern int __must_check
 request_threaded_irq(unsigned int irq, irq_handler_t handler,
 irq_handler_t thread_fn,
 unsigned long flags, const char *name, void *dev);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ef00bb2..50edae9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1711,6 +1711,7 @@ extern int sched_setscheduler(struct task_struct *, int, 
const struct sched_para
 extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct 
sched_param *);
 extern void sched_set_fifo(struct task_struct *p);
 extern void sched_set_fifo_low(struct task_struct *p);
+extern void sched_set_fifo_with_prio(struct task_struct *p, int prio);
 extern void sched_set_normal(struct task_struct *p, int nice);
 extern int sched_setattr(struct task_struct *, const struct sched_attr *);
 extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr 
*);
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
index ab83d85..1e1186e 100644
--- a/include/linux/sched/prio.h
+++ b/include/linux/sched/prio.h
@@ -15,6 +15,7 @@
 
 #define MAX_RT_PRIO100
 
+#define DEFAULT_RT_PRIO(MAX_RT_PRIO / 2)
 #define MAX_PRIO   (MAX_RT_PRIO + NICE_WIDTH)
 #define DEFAULT_PRIO   (MAX_RT_PRIO + NICE_WIDTH / 2)
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 21ea370..111b8ce 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1394,7 +1394,7 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, 
bool secondary)
if (IS_ERR(t))
return PTR_ERR(t);
 
-   sched_set_fifo(t);
+   sched_set_fifo_with_prio(t, new->prio);
 
/*
 * We keep the reference to the task struct even if
@@ -2032,7 +2032,7 @@ const void *free_nmi(unsigned int irq, void *dev_id)
 }
 
 /**
- * request_threaded_irq - allocate an interrupt line
+ * request_threaded_irq_with_prio - allocate an interrupt line
  * @irq: Interrupt line to allocate
  * @handler: Function to be called when the IRQ occurs.
  *   Primary handler for threaded interrupts
@@ -2043,6 +2043,7 @@ const void *free_nmi(unsigned int irq, void *dev_id)
  * @irqflags: Interrupt type flags
  * @devname: An ascii name for the claiming device
  * @dev_id: A cookie passed back to the handler function
+ * @prio: priority of the irq handler thread
  *
  * This call allocates interrupt resources and enables the
  * interrupt line and IRQ handling. From the point this
@@ -2067,15 +2068,18 @@ const void *free_nmi(unsigned int irq, void *dev_id)
  * If your interrupt is shared you must pass a non NULL dev_id
  * as this is required when freeing the interrupt.
  *
+ * If you want to assign a priority for your irq handler thread
+ * instead of default value, you need to supply @prio.
+ *
  * Flags:
  *
  * IRQF_SHARED Interrupt is shared
  * IRQF_TRIGGER_*  Specify active edge(s) or level
  *
  */
-int request_threaded_irq(unsigned int irq, irq_handler_t handler,
+int request_threaded_irq_with_prio(uns

[PATCH] x86: Accelerate copy_page with non-temporal in X86

2021-04-12 Thread Kemeng Shi
I'm using AEP with the dax_kmem driver, and AEP is exported as a NUMA node in
my system. I move cold pages from the DRAM node to the AEP node with the
move_pages system call. With the old "rep movsq", it costs 2030ms to move
1 GB of pages. With "movnti", it only costs about 890ms to move 1 GB of pages.
I also tested moving 1 GB of pages from the AEP node to the DRAM node, but the
result is unexpected: "rep movsq" costs about 372ms while "movnti" costs about
477ms. As said in X86 , "movnti" could avoid "polluting the caches" in
this situation. I don't know if it's a general result or just happening
on my machine. Hardware information is as follows:
CPU:
Intel(R) Xeon(R) Gold 6266C CPU @ 3.00GHz
DRAM:
Memory Device
Array Handle: 0x0035
Error Information Handle: Not Provided
Total Width: 72 bits
Data Width: 64 bits
Size: 64 GB
Form Factor: DIMM
Set: None
Locator: DIMM130 J40
Bank Locator: _Node1_Channel3_Dimm0
Type: DDR4
Type Detail: Synchronous Registered (Buffered)
Speed: 2933 MT/s
Manufacturer: Samsung
Serial Number: 03B71EB0
Asset Tag: 1950
Part Number: M393A8G40MB2-CVF
Rank: 2
Configured Memory Speed: 2666 MT/s
Minimum Voltage: 1.2 V
Maximum Voltage: 1.2 V
Configured Voltage: 1.2 V
Memory Technology: DRAM
Memory Operating Mode Capability: Volatile memory
Firmware Version: 
Module Manufacturer ID: Bank 1, Hex 0xCE
Module Product ID: Unknown
Memory Subsystem Controller Manufacturer ID: Unknown
Memory Subsystem Controller Product ID: Unknown
Non-Volatile Size: None
Volatile Size: 64 GB
Cache Size: None
Logical Size: None
AEP:
Memory Device
Array Handle: 0x0035
Error Information Handle: Not Provided
Total Width: 72 bits
Data Width: 64 bits
Size: 128 GB
Form Factor: DIMM
Set: None
Locator: DIMM131 J41
Bank Locator: _Node1_Channel3_Dimm1
Type: Logical non-volatile device
Type Detail: Synchronous Non-Volatile LRDIMM
Speed: 2666 MT/s
Manufacturer: Intel
Serial Number: 6803
Asset Tag: 1949
Part Number: NMA1XXD128GPS
Rank: 1
Configured Memory Speed: 2666 MT/s
Minimum Voltage: 1.2 V
Maximum Voltage: 1.2 V
Configured Voltage: 1.2 V
Memory Technology: Intel persistent memory
Memory Operating Mode Capability: Volatile memory
Byte-accessible persistent memory
Firmware Version: 5355
Module Manufacturer ID: Bank 1, Hex 0x89
Module Product ID: 0x0556
Memory Subsystem Controller Manufacturer ID: Bank 1, Hex 0x89
Memory Subsystem Controller Product ID: 0x097A
Non-Volatile Size: 126 GB
Volatile Size: None
Cache Size: None
Logical Size: None
Memory DIMM topology:
AEP
 |
DRAMDRAMDRAM
 |   |   |
 |---|---|
CPU
 |---|---|
 |   |   |
DRAMDRAMDRAM

Signed-off-by: Kemeng Shi 
---
 arch/x86/lib/copy_page_64.S | 73 -
 1 file changed, 72 insertions(+), 1 deletion(-)

diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 2402d4c489d2..69389b4aeeed 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -14,7 +14,8 @@
  */
ALIGN
 SYM_FUNC_START(copy_page)
-   ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
+   ALTERNATIVE_2 "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD, \
+  "jmp copy_page_nt", X86_FEATURE_XMM2
movl$4096/8, %ecx
rep movsq
ret
@@ -87,3 +88,73 @@ SYM_FUNC_START_LOCAL(copy_page_regs)
addq$2*8, %rsp
ret
 SYM_FUNC_END(copy_page_regs)
+
+SYM_FUNC_START_LOCAL(copy_page_nt)
+   subq$2*8,   %rsp
+   movq%rbx,   (%rsp)
+   movq%r12,   1*8(%rsp)
+
+   movl$(4096/64)-5, %ecx
+   .p2align 4
+.LoopNT64:
+   decl%ecx
+
+   movq0x8*0(%rsi), %rax
+   movq0x8*1(%rsi), %rbx
+   movq0x8*2(%rsi), %rdx
+   movq0x8*3(%rsi), %r8
+   movq0x8*4(%rsi), %r9
+   movq0x8*5(%rsi), %r10
+   movq0x8*6(%rsi), %r11
+   movq0x8*7(%rsi), %r12
+
+   prefetcht0 5*64(%rsi)
+
+   movnti  %rax, 0x8*0(%rdi)
+   movnti  %rbx, 0x8*1(%rdi)
+   movnti  %rdx, 0x8*2(%rdi)
+   movnti  %r8,  0x8*3(%rdi)
+   movnti  %r9,  0x8*4(%rdi)
+   movnti  %r10, 0x8*5(%rdi)
+   movnti  %r11, 0x8*6(%rdi)
+   movnti  %r12, 0x8*7(%rdi)
+
+   leaq64(%rdi), %rdi
+   leaq64(%rsi), %rsi
+   jnz .LoopNT64
+
+   movl$5, %ecx
+   .p2align 4
+.LoopNT2:
+   decl%ecx
+
+   movq0x8*0(%rsi), %rax
+   movq0x8*1(%rsi), %rbx
+   movq0x8*2(%rsi), %rdx
+   movq0x8*3(%rsi), %r8

[PATCH 5/8] MIPS: pci-legacy: stop using of_pci_range_to_resource

2021-04-12 Thread Ilya Lipnitskiy
Mirror commit aeba3731b150 ("powerpc/pci: Fix IO space breakage after
of_pci_range_to_resource() change").

Most MIPS platforms do not define PCI_IOBASE, nor implement
pci_address_to_pio(). Moreover, IO_SPACE_LIMIT is 0xffff for most MIPS
platforms. of_pci_range_to_resource passes the _start address_ of the IO
range into pci_address_to_pio, which then checks it against
IO_SPACE_LIMIT and fails, because for MIPS platforms that use
pci-legacy (pci-lantiq, pci-rt3883, pci-mt7620), IO ranges start much
higher than 0xffff.

In fact, pci-mt7621 in staging already works around this problem, see
commit 09dd629eeabb ("staging: mt7621-pci: fix io space and properly set
resource limits")

So just stop using of_pci_range_to_resource, which does not work for
MIPS.

Fixes PCI errors like:
  pci_bus 0000:00: root bus resource [io  0xffffffff]

Fixes: 0b0b0893d49b ("of/pci: Fix the conversion of IO ranges into IO 
resources")
Signed-off-by: Ilya Lipnitskiy 
Cc: Liviu Dudau 
---
 arch/mips/pci/pci-legacy.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/mips/pci/pci-legacy.c b/arch/mips/pci/pci-legacy.c
index 39052de915f3..3a909194284a 100644
--- a/arch/mips/pci/pci-legacy.c
+++ b/arch/mips/pci/pci-legacy.c
@@ -166,8 +166,13 @@ void pci_load_of_ranges(struct pci_controller *hose, 
struct device_node *node)
res = hose->mem_resource;
break;
}
-   if (res != NULL)
-   of_pci_range_to_resource(&range, node, res);
+   if (res != NULL) {
+   res->name = node->full_name;
+   res->flags = range.flags;
+   res->start = range.cpu_addr;
+   res->end = range.cpu_addr + range.size - 1;
+   res->parent = res->child = res->sibling = NULL;
+   }
}
 }
 
-- 
2.31.1



[PATCH 8/8] MIPS: pci-legacy: use generic pci_enable_resources

2021-04-12 Thread Ilya Lipnitskiy
Follow the reasoning from commit 842de40d93e0 ("PCI: add generic
pci_enable_resources()"):

  The only functional difference from the MIPS version is that the
  generic one uses "!r->parent" to check for resource collisions
  instead of "!r->start && r->end".

That should have no effect on any pci-legacy driver.

Suggested-by: Bjorn Helgaas 
Signed-off-by: Ilya Lipnitskiy 
---
 arch/mips/pci/pci-legacy.c | 40 ++
 1 file changed, 2 insertions(+), 38 deletions(-)

diff --git a/arch/mips/pci/pci-legacy.c b/arch/mips/pci/pci-legacy.c
index 78c22987bef0..c24226ea0a6e 100644
--- a/arch/mips/pci/pci-legacy.c
+++ b/arch/mips/pci/pci-legacy.c
@@ -241,47 +241,11 @@ static int __init pcibios_init(void)
 
 subsys_initcall(pcibios_init);
 
-static int pcibios_enable_resources(struct pci_dev *dev, int mask)
-{
-   u16 cmd, old_cmd;
-   int idx;
-   struct resource *r;
-
-   pci_read_config_word(dev, PCI_COMMAND, &cmd);
-   old_cmd = cmd;
-   for (idx=0; idx < PCI_NUM_RESOURCES; idx++) {
-   /* Only set up the requested stuff */
-   if (!(mask & (1<<idx)))
-   continue;
-
-   r = &dev->resource[idx];
-   if (!(r->flags & (IORESOURCE_IO | IORESOURCE_MEM)))
-   continue;
-   if ((idx == PCI_ROM_RESOURCE) &&
-   (!(r->flags & IORESOURCE_ROM_ENABLE)))
-   continue;
-   if (!r->start && r->end) {
-   pci_err(dev,
-   "can't enable device: resource collisions\n");
-   return -EINVAL;
-   }
-   if (r->flags & IORESOURCE_IO)
-   cmd |= PCI_COMMAND_IO;
-   if (r->flags & IORESOURCE_MEM)
-   cmd |= PCI_COMMAND_MEMORY;
-   }
-   if (cmd != old_cmd) {
-   pci_info(dev, "enabling device (%04x -> %04x)\n", old_cmd, cmd);
-   pci_write_config_word(dev, PCI_COMMAND, cmd);
-   }
-   return 0;
-}
-
 int pcibios_enable_device(struct pci_dev *dev, int mask)
 {
-   int err;
+   int err = pci_enable_resources(dev, mask);
 
-   if ((err = pcibios_enable_resources(dev, mask)) < 0)
+   if (err < 0)
return err;
 
return pcibios_plat_dev_init(dev);
-- 
2.31.1



[PATCH 1/8] MIPS: pci-rt2880: fix slot 0 configuration

2021-04-12 Thread Ilya Lipnitskiy
pci_fixup_irqs() used to call pcibios_map_irq on every PCI device, which
for RT2880 included bus 0 slot 0. After pci_fixup_irqs() got removed,
only slots/funcs with devices attached would be called. While arguably
the right thing, that left no chance for this driver to ever initialize
slot 0, effectively bricking PCI and USB on RT2880 devices such as the
Belkin F5D8235-4 v1.

Slot 0 configuration needs to happen after PCI bus enumeration, but
before any device at slot 0x11 (func 0 or 1) is talked to. That was
determined empirically by testing on a Belkin F5D8235-4 v1 device. A
minimal BAR 0 config write followed by read, then setting slot 0
PCI_COMMAND to MASTER | IO | MEMORY is all that seems to be required for
proper functionality.

Tested by ensuring that full- and high-speed USB devices get enumerated
on the Belkin F5D8235-4 v1 (with an out of tree DTS file from OpenWrt).

Fixes: 04c81c7293df ("MIPS: PCI: Replace pci_fixup_irqs() call with host bridge 
IRQ mapping hooks")
Signed-off-by: Ilya Lipnitskiy 
Cc: Lorenzo Pieralisi 
Cc: Tobias Wolf 
Cc: <stable@vger.kernel.org> # v4.14+
---
 arch/mips/pci/pci-rt2880.c | 50 +-
 1 file changed, 33 insertions(+), 17 deletions(-)

diff --git a/arch/mips/pci/pci-rt2880.c b/arch/mips/pci/pci-rt2880.c
index e1f12e398136..19f7860fb28b 100644
--- a/arch/mips/pci/pci-rt2880.c
+++ b/arch/mips/pci/pci-rt2880.c
@@ -66,9 +66,13 @@ static int rt2880_pci_config_read(struct pci_bus *bus, 
unsigned int devfn,
unsigned long flags;
u32 address;
u32 data;
+   int busn = 0;
 
-   address = rt2880_pci_get_cfgaddr(bus->number, PCI_SLOT(devfn),
-PCI_FUNC(devfn), where);
+   if (bus)
+   busn = bus->number;
+
+   address = rt2880_pci_get_cfgaddr(busn, PCI_SLOT(devfn), PCI_FUNC(devfn),
+where);
 
spin_lock_irqsave(&rt2880_pci_lock, flags);
rt2880_pci_reg_write(address, RT2880_PCI_REG_CONFIG_ADDR);
@@ -96,9 +100,13 @@ static int rt2880_pci_config_write(struct pci_bus *bus, 
unsigned int devfn,
unsigned long flags;
u32 address;
u32 data;
+   int busn = 0;
+
+   if (bus)
+   busn = bus->number;
 
-   address = rt2880_pci_get_cfgaddr(bus->number, PCI_SLOT(devfn),
-PCI_FUNC(devfn), where);
+   address = rt2880_pci_get_cfgaddr(busn, PCI_SLOT(devfn), PCI_FUNC(devfn),
+where);
 
spin_lock_irqsave(&rt2880_pci_lock, flags);
rt2880_pci_reg_write(address, RT2880_PCI_REG_CONFIG_ADDR);
@@ -180,7 +188,6 @@ static inline void rt2880_pci_write_u32(unsigned long reg, 
u32 val)
 
 int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
 {
-   u16 cmd;
int irq = -1;
 
if (dev->bus->number != 0)
@@ -188,8 +195,6 @@ int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 
pin)
 
switch (PCI_SLOT(dev->devfn)) {
case 0x00:
-   rt2880_pci_write_u32(PCI_BASE_ADDRESS_0, 0x08000000);
-   (void) rt2880_pci_read_u32(PCI_BASE_ADDRESS_0);
break;
case 0x11:
irq = RT288X_CPU_IRQ_PCI;
@@ -201,16 +206,6 @@ int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 
pin)
break;
}
 
-   pci_write_config_byte((struct pci_dev *) dev,
-   PCI_CACHE_LINE_SIZE, 0x14);
-   pci_write_config_byte((struct pci_dev *) dev, PCI_LATENCY_TIMER, 0xFF);
-   pci_read_config_word((struct pci_dev *) dev, PCI_COMMAND, &cmd);
-   cmd |= PCI_COMMAND_MASTER | PCI_COMMAND_IO | PCI_COMMAND_MEMORY |
-   PCI_COMMAND_INVALIDATE | PCI_COMMAND_FAST_BACK |
-   PCI_COMMAND_SERR | PCI_COMMAND_WAIT | PCI_COMMAND_PARITY;
-   pci_write_config_word((struct pci_dev *) dev, PCI_COMMAND, cmd);
-   pci_write_config_byte((struct pci_dev *) dev, PCI_INTERRUPT_LINE,
- dev->irq);
return irq;
 }
 
@@ -251,6 +246,27 @@ static int rt288x_pci_probe(struct platform_device *pdev)
 
 int pcibios_plat_dev_init(struct pci_dev *dev)
 {
+   static bool slot0_init;
+
+   /*
+* Nobody seems to initialize slot 0, but this platform requires it, so
+* do it once when some other slot is being enabled. The PCI subsystem
+* should configure other slots properly, so no need to do anything
+* special for those.
+*/
+   if (!slot0_init) {
+   u32 cmd;
+
+   slot0_init = true;
+
+   rt2880_pci_write_u32(PCI_BASE_ADDRESS_0, 0x08000000);
+   (void) rt2880_pci_read_u32(PCI_BASE_ADDRESS_0);
+
+   rt2880_pci_config_read(NULL, 0, PCI_COMMAND, 2, &cmd);
+   cmd |= PCI_COMMAND_MASTER | PCI_COMMAND_IO | PCI_COMMAND_MEMORY;
+   rt2880_pci_config_write(NULL, 0, PCI_COMMAND, 2, cmd);
+   }
+
return 0;
 }
 
-- 
2.31.1



[PATCH 2/8] MIPS: pci-rt2880: remove unneeded locks

2021-04-12 Thread Ilya Lipnitskiy
Mirror pci-rt3883 fix from commit e5067c718b3a ("MIPS: pci-rt3883:
Remove odd locking in PCI config space access code"). pci-rt2880 shares
the driver layout with pci-rt3883 and the same reasons apply.

Caller (generic PCI code) already does proper locking, so no need to add
another one here. Local PCI read/write functions are never called
simultaneously, also they do not require synchronization with the PCI
controller ops, since they are used before the controller registration.

Suggested-by: Sergey Ryazanov 
Signed-off-by: Ilya Lipnitskiy 
---
 arch/mips/pci/pci-rt2880.c | 13 -
 1 file changed, 13 deletions(-)

diff --git a/arch/mips/pci/pci-rt2880.c b/arch/mips/pci/pci-rt2880.c
index 19f7860fb28b..b4ee07cbcf2a 100644
--- a/arch/mips/pci/pci-rt2880.c
+++ b/arch/mips/pci/pci-rt2880.c
@@ -41,7 +41,6 @@
 #define RT2880_PCI_REG_ARBCTL  0x80
 
 static void __iomem *rt2880_pci_base;
-static DEFINE_SPINLOCK(rt2880_pci_lock);
 
 static u32 rt2880_pci_reg_read(u32 reg)
 {
@@ -63,7 +62,6 @@ static inline u32 rt2880_pci_get_cfgaddr(unsigned int bus, 
unsigned int slot,
 static int rt2880_pci_config_read(struct pci_bus *bus, unsigned int devfn,
  int where, int size, u32 *val)
 {
-   unsigned long flags;
u32 address;
u32 data;
int busn = 0;
@@ -74,10 +72,8 @@ static int rt2880_pci_config_read(struct pci_bus *bus, 
unsigned int devfn,
address = rt2880_pci_get_cfgaddr(busn, PCI_SLOT(devfn), PCI_FUNC(devfn),
 where);
 
-   spin_lock_irqsave(&rt2880_pci_lock, flags);
rt2880_pci_reg_write(address, RT2880_PCI_REG_CONFIG_ADDR);
data = rt2880_pci_reg_read(RT2880_PCI_REG_CONFIG_DATA);
-   spin_unlock_irqrestore(&rt2880_pci_lock, flags);
 
switch (size) {
case 1:
@@ -97,7 +93,6 @@ static int rt2880_pci_config_read(struct pci_bus *bus, 
unsigned int devfn,
 static int rt2880_pci_config_write(struct pci_bus *bus, unsigned int devfn,
   int where, int size, u32 val)
 {
-   unsigned long flags;
u32 address;
u32 data;
int busn = 0;
@@ -108,7 +103,6 @@ static int rt2880_pci_config_write(struct pci_bus *bus, 
unsigned int devfn,
address = rt2880_pci_get_cfgaddr(busn, PCI_SLOT(devfn), PCI_FUNC(devfn),
 where);
 
-   spin_lock_irqsave(&rt2880_pci_lock, flags);
rt2880_pci_reg_write(address, RT2880_PCI_REG_CONFIG_ADDR);
data = rt2880_pci_reg_read(RT2880_PCI_REG_CONFIG_DATA);
 
@@ -127,7 +121,6 @@ static int rt2880_pci_config_write(struct pci_bus *bus, 
unsigned int devfn,
}
 
rt2880_pci_reg_write(data, RT2880_PCI_REG_CONFIG_DATA);
-   spin_unlock_irqrestore(&rt2880_pci_lock, flags);
 
return PCIBIOS_SUCCESSFUL;
 }
@@ -159,31 +152,25 @@ static struct pci_controller rt2880_pci_controller = {
 
 static inline u32 rt2880_pci_read_u32(unsigned long reg)
 {
-   unsigned long flags;
u32 address;
u32 ret;
 
address = rt2880_pci_get_cfgaddr(0, 0, 0, reg);
 
-   spin_lock_irqsave(&rt2880_pci_lock, flags);
rt2880_pci_reg_write(address, RT2880_PCI_REG_CONFIG_ADDR);
ret = rt2880_pci_reg_read(RT2880_PCI_REG_CONFIG_DATA);
-   spin_unlock_irqrestore(&rt2880_pci_lock, flags);
 
return ret;
 }
 
 static inline void rt2880_pci_write_u32(unsigned long reg, u32 val)
 {
-   unsigned long flags;
u32 address;
 
address = rt2880_pci_get_cfgaddr(0, 0, 0, reg);
 
-   spin_lock_irqsave(&rt2880_pci_lock, flags);
rt2880_pci_reg_write(address, RT2880_PCI_REG_CONFIG_ADDR);
rt2880_pci_reg_write(val, RT2880_PCI_REG_CONFIG_DATA);
-   spin_unlock_irqrestore(&rt2880_pci_lock, flags);
 }
 
 int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
-- 
2.31.1



Re: linux-next: Tree for Apr 9 (x86 boot problem)

2021-04-12 Thread Randy Dunlap
On 4/12/21 11:06 PM, Mike Rapoport wrote:
> Hi Randy,
> 
> On Mon, Apr 12, 2021 at 01:53:34PM -0700, Randy Dunlap wrote:
>> On 4/12/21 10:01 AM, Mike Rapoport wrote:
>>> On Mon, Apr 12, 2021 at 08:49:49AM -0700, Randy Dunlap wrote:
>>>  
>>> I thought about adding some prints to see what's causing the hang, the
>>> reservations or their absence. Can you replace the debug patch with this
>>> one:
>>>
>>> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
>>> index 776fc9b3fafe..a10ac252dbcc 100644
>>> --- a/arch/x86/kernel/setup.c
>>> +++ b/arch/x86/kernel/setup.c
>>> @@ -600,10 +600,13 @@ static bool __init snb_gfx_workaround_needed(void)
>>> return false;
>>>  
>>> vendor = read_pci_config_16(0, 2, 0, PCI_VENDOR_ID);
>>> +   devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID);
>>> +
>>> +   pr_info("%s: vendor: %x, device: %x\n", __func__, vendor, device);
>>
>> s/device)/devid)/
>  
> Oh, sorry.
> 
>>> +
>>> if (vendor != 0x8086)
>>> return false;
>>>  
>>> -   devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID);
>>> for (i = 0; i < ARRAY_SIZE(snb_ids); i++)
>>> if (devid == snb_ids[i])
>>> return true;
>>
>> That prints:
>>
>> [    0.000000] snb_gfx_workaround_needed: vendor: 8086, device: 126
>> [    0.000000] early_reserve_memory: snb_gfx: 1
>> ...
>> [0.014061] snb_gfx_workaround_needed: vendor: 8086, device: 126
>> [0.014064] reserving inaccessible SNB gfx pages
>>
>>
>> The full boot log is attached.
>  
> Can you please send the log with memblock=debug added to the kernel command
> line?
> 
> Probably should have started from this...
> 

It's attached.

-- 
~Randy
{bedtime}



boot0409-memblk-debug.log.gz
Description: application/gzip


[PATCH 3/8] MIPS: pci-rt3883: trivial: remove unused variable

2021-04-12 Thread Ilya Lipnitskiy
Fixes the following compiler warning:
  warning: unused variable 'flags' [-Wunused-variable]

Fixes: e5067c718b3a ("MIPS: pci-rt3883: Remove odd locking in PCI config space 
access code")
Signed-off-by: Ilya Lipnitskiy 
Cc: Sergey Ryazanov 
Cc: triv...@kernel.org
---
 arch/mips/pci/pci-rt3883.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/arch/mips/pci/pci-rt3883.c b/arch/mips/pci/pci-rt3883.c
index 0ac6346026d0..e422f78db5bc 100644
--- a/arch/mips/pci/pci-rt3883.c
+++ b/arch/mips/pci/pci-rt3883.c
@@ -100,7 +100,6 @@ static u32 rt3883_pci_read_cfg32(struct 
rt3883_pci_controller *rpc,
   unsigned bus, unsigned slot,
   unsigned func, unsigned reg)
 {
-   unsigned long flags;
u32 address;
u32 ret;
 
@@ -116,7 +115,6 @@ static void rt3883_pci_write_cfg32(struct 
rt3883_pci_controller *rpc,
 unsigned bus, unsigned slot,
 unsigned func, unsigned reg, u32 val)
 {
-   unsigned long flags;
u32 address;
 
address = rt3883_pci_get_cfgaddr(bus, slot, func, reg);
@@ -229,7 +227,6 @@ static int rt3883_pci_config_read(struct pci_bus *bus, 
unsigned int devfn,
  int where, int size, u32 *val)
 {
struct rt3883_pci_controller *rpc;
-   unsigned long flags;
u32 address;
u32 data;
 
@@ -263,7 +260,6 @@ static int rt3883_pci_config_write(struct pci_bus *bus, 
unsigned int devfn,
   int where, int size, u32 val)
 {
struct rt3883_pci_controller *rpc;
-   unsigned long flags;
u32 address;
u32 data;
 
-- 
2.31.1



[PATCH 4/8] MIPS: pci-rt3883: more accurate DT error messages

2021-04-12 Thread Ilya Lipnitskiy
Existing strings do not make sense: one is always NULL and the other
refers to the wrong parent node.

Signed-off-by: Ilya Lipnitskiy 
---
 arch/mips/pci/pci-rt3883.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/mips/pci/pci-rt3883.c b/arch/mips/pci/pci-rt3883.c
index e422f78db5bc..aebd4964ea34 100644
--- a/arch/mips/pci/pci-rt3883.c
+++ b/arch/mips/pci/pci-rt3883.c
@@ -431,8 +431,7 @@ static int rt3883_pci_probe(struct platform_device *pdev)
 
if (!rpc->intc_of_node) {
dev_err(dev, "%pOF has no %s child node",
-   rpc->intc_of_node,
-   "interrupt controller");
+   np, "interrupt controller");
return -EINVAL;
}
 
@@ -446,8 +445,7 @@ static int rt3883_pci_probe(struct platform_device *pdev)
 
if (!rpc->pci_controller.of_node) {
dev_err(dev, "%pOF has no %s child node",
-   rpc->intc_of_node,
-   "PCI host bridge");
+   np, "PCI host bridge");
err = -EINVAL;
goto err_put_intc_node;
}
-- 
2.31.1



[PATCH 7/8] MIPS: pci-legacy: remove busn_resource field

2021-04-12 Thread Ilya Lipnitskiy
No drivers set the busn_resource field in the pci_controller struct.
Commit 7ee214b540d9 ("MIPS: PCI: Remove unused busn_offset") almost
removed it over 3 years ago. Remove it for good to free up memory and
eliminate messages like:
  pci_bus 0000:00: root bus resource [??? 0x00000000 flags 0x0]

Signed-off-by: Ilya Lipnitskiy 
Cc: Bjorn Helgaas 
---
 arch/mips/include/asm/pci.h | 1 -
 arch/mips/pci/pci-legacy.c  | 1 -
 2 files changed, 2 deletions(-)

diff --git a/arch/mips/include/asm/pci.h b/arch/mips/include/asm/pci.h
index 6f48649201c5..9ffc8192adae 100644
--- a/arch/mips/include/asm/pci.h
+++ b/arch/mips/include/asm/pci.h
@@ -38,7 +38,6 @@ struct pci_controller {
struct resource *io_resource;
unsigned long io_offset;
unsigned long io_map_base;
-   struct resource *busn_resource;
 
 #ifndef CONFIG_PCI_DOMAINS_GENERIC
unsigned int index;
diff --git a/arch/mips/pci/pci-legacy.c b/arch/mips/pci/pci-legacy.c
index ec3f52ade72d..78c22987bef0 100644
--- a/arch/mips/pci/pci-legacy.c
+++ b/arch/mips/pci/pci-legacy.c
@@ -89,7 +89,6 @@ static void pcibios_scanbus(struct pci_controller *hose)
hose->mem_resource, hose->mem_offset);
pci_add_resource_offset(&resources,
hose->io_resource, hose->io_offset);
-   pci_add_resource(&resources, hose->busn_resource);
list_splice_init(&resources, &bridge->windows);
bridge->dev.parent = NULL;
bridge->sysdata = hose;
-- 
2.31.1



[PATCH 6/8] MIPS: pci-legacy: remove redundant info messages

2021-04-12 Thread Ilya Lipnitskiy
Remove the following pci-legacy message:
  PCI host bridge /pci@440000/host-bridge ranges:
   MEM 0x0000000020000000..0x000000002fffffff
    IO 0x0000000000460000..0x000000000046ffff

It is followed shortly by the same data from pci_register_host_bridge:
  PCI host bridge to bus 0000:00
  pci_bus 0000:00: root bus resource [mem 0x20000000-0x2fffffff]
  pci_bus 0000:00: root bus resource [io  0x460000-0x46ffff]

Signed-off-by: Ilya Lipnitskiy 
---
 arch/mips/pci/pci-legacy.c | 7 ---
 1 file changed, 7 deletions(-)

diff --git a/arch/mips/pci/pci-legacy.c b/arch/mips/pci/pci-legacy.c
index 3a909194284a..ec3f52ade72d 100644
--- a/arch/mips/pci/pci-legacy.c
+++ b/arch/mips/pci/pci-legacy.c
@@ -140,7 +140,6 @@ void pci_load_of_ranges(struct pci_controller *hose, struct 
device_node *node)
struct of_pci_range range;
struct of_pci_range_parser parser;
 
-   pr_info("PCI host bridge %pOF ranges:\n", node);
hose->of_node = node;
 
if (of_pci_range_parser_init(&parser, node))
@@ -151,18 +150,12 @@ void pci_load_of_ranges(struct pci_controller *hose, 
struct device_node *node)
 
switch (range.flags & IORESOURCE_TYPE_BITS) {
case IORESOURCE_IO:
-   pr_info("  IO 0x%016llx..0x%016llx\n",
-   range.cpu_addr,
-   range.cpu_addr + range.size - 1);
hose->io_map_base =
(unsigned long)ioremap(range.cpu_addr,
   range.size);
res = hose->io_resource;
break;
case IORESOURCE_MEM:
-   pr_info(" MEM 0x%016llx..0x%016llx\n",
-   range.cpu_addr,
-   range.cpu_addr + range.size - 1);
res = hose->mem_resource;
break;
}
-- 
2.31.1



[PATCH 0/8] MIPS: Fixes for PCI legacy drivers (rt2880, rt3883)

2021-04-12 Thread Ilya Lipnitskiy
One major fix for rt2880-pci in the first patch - fixes breakage that
existed since v4.14.

Other more minor fixes, cleanups, and improvements that either free up
memory, make dmesg messages clearer, or remove redundant dmesg output.

Ilya Lipnitskiy (8):
  MIPS: pci-rt2880: fix slot 0 configuration
  MIPS: pci-rt2880: remove unneeded locks
  MIPS: pci-rt3883: trivial: remove unused variable
  MIPS: pci-rt3883: more accurate DT error messages
  MIPS: pci-legacy: stop using of_pci_range_to_resource
  MIPS: pci-legacy: remove redundant info messages
  MIPS: pci-legacy: remove busn_resource field
  MIPS: pci-legacy: use generic pci_enable_resources

 arch/mips/include/asm/pci.h |  1 -
 arch/mips/pci/pci-legacy.c  | 57 ++---
 arch/mips/pci/pci-rt2880.c  | 63 +++--
 arch/mips/pci/pci-rt3883.c  | 10 ++
 4 files changed, 44 insertions(+), 87 deletions(-)

-- 
2.31.1



Re: [PATCH 1/1] arm: topology: parse the topology from the dt

2021-04-12 Thread Ruifeng Zhang
Dietmar Eggemann  于2021年4月12日周一 下午8:40写道:
>
> On 12/04/2021 14:20, Ruifeng Zhang wrote:
> > Valentin Schneider  于2021年4月12日周一 下午7:32写道:
> >>
> >>
> >> Hi,
> >>
> >> On 12/04/21 15:08, Ruifeng Zhang wrote:
> >>> From: Ruifeng Zhang 
> >>>
> >>> The arm topology still parse from the MPIDR, but it is incomplete.  When
> >>> the armv8.3 cpu runs in aarch32 mode, it will parse out the wrong 
> >>> topology.
> >>>
> >>> armv7 (A7) mpidr is:
> >>> [11:8]  [7:2]   [1:0]
> >>> cluster reservedcpu
> >>>
> >>> armv8.3 (A55) mpidr is:
> >>> [23:16] [15:8]  [7:0]
> >>> cluster cpu thread
> >>>
> >>> For compatibility to keep the function of get capacity from default
> >>> cputype, renamed arm parse_dt_topology to get_cputype_capacity and delete
> >>> related logic of parse from dt.
> >>> Arm using the same parse_dt_topology function as arm64.
> >>>
> >>> The arm device boot step is to look for the default cputype and get cpu
> >>> capacity firstly. Then parse the topology and capacity from dt to replace
> >>> default values.
> >>>
> >>
> >> I'm afraid I don't get it.
> >>
> >> CONFIG_COMPAT lets you run 32-bit stuff at EL0, but the kernel is still
> >> arm64. So if you take your armv8.3 system, the topology parsed by the
> >> kernel will be the same regardless of CONFIG_COMPAT.
> >>
> >> Could you elaborate on what problem you are trying to fix here?
> >
> > There is a armv8.3 cpu which should work normally both on aarch64 and 
> > aarch32.
> > The MPIDR has been written to the chip register in armv8.3 format.
> > For example,
> > core0: 80000000
> > core1: 80000100
> > core2: 80000200
> > ...
> >
> > Its cpu topology can be parsed normally on aarch64 mode (both
> > userspace and kernel work on arm64).
> >
> > The problem is when it working on aarch32 mode (both userspace and
> > kernel work on arm 32-bit), the cpu topology
> > will parse error because of the format is different between armv7 and 
> > armv8.3.
> > The arm 32-bit driver, arch/arm/kernel/topology will parse the MPIDR
> > and store to the topology with armv7,
> > and the result is all cpu core_id is 0, the bit[1:0] of armv7 MPIDR format.
> >
> > In addition, I think arm should also allow customers to configure cpu
> > topologies via DT.
>
> This patch ruins the CPU capacity detection based on capacity-dmips-mhz
> (Documentation/devicetree/bindings/arm/cpu-capacity.txt) on my TC2 [L B
> B L L] (armv7).
>
> tip/sched/core with *mainline* multi_v7_defconfig:
>
> root@linaro-nano:~# cat /sys/devices/system/cpu/cpu*/cpu_capacity
> 516
> 1024
> 1024
> 516
> 516
>
> your patch with mainline multi_v7_defconfig:
>
> root@linaro-nano:~#  cat /sys/devices/system/cpu/cpu*/cpu_capacity
> 1024
> 1024
> 1024
> 1024
> 1024
>
>
> There are 2 capacity detection mechanism in arch/arm/kernel/topology.c:
>
> (1) cpu_efficiency (only for armv7 a15 and a7) based, relies on
> clock-frequency dt property
>
> (2) capacity-dmips-mhz dt property based
>
> I currently don't see how this different MPIDR layout leads to your code
> changes.

Thanks for your test, I will update patch-V2 to solve this problem.
>
>


[PATCH] ASoC: ak5558: correct reset polarity

2021-04-12 Thread Shengjiu Wang
Reset (aka power off) happens when the reset gpio is made active.
Change function name to ak5558_reset to match devicetree property
"reset-gpios".

Signed-off-by: Shengjiu Wang 
---
 sound/soc/codecs/ak5558.c | 30 ++
 1 file changed, 10 insertions(+), 20 deletions(-)

diff --git a/sound/soc/codecs/ak5558.c b/sound/soc/codecs/ak5558.c
index 8e4dca753f0b..5c3f15827423 100644
--- a/sound/soc/codecs/ak5558.c
+++ b/sound/soc/codecs/ak5558.c
@@ -318,29 +318,19 @@ static struct snd_soc_dai_driver ak5552_dai = {
.ops = &ak5558_dai_ops,
 };
 
-static void ak5558_power_off(struct ak5558_priv *ak5558)
+static void ak5558_reset(struct ak5558_priv *ak5558, bool active)
 {
-   if (!ak5558->reset_gpiod)
-   return;
-
-   gpiod_set_value_cansleep(ak5558->reset_gpiod, 0);
-   usleep_range(1000, 2000);
-}
-
-static void ak5558_power_on(struct ak5558_priv *ak5558)
-{
-   if (!ak5558->reset_gpiod)
-   return;
-
-   gpiod_set_value_cansleep(ak5558->reset_gpiod, 1);
-   usleep_range(1000, 2000);
+   if (ak5558->reset_gpiod) {
+   gpiod_set_value_cansleep(ak5558->reset_gpiod, active);
+   usleep_range(1000, 2000);
+   }
 }
 
 static int ak5558_probe(struct snd_soc_component *component)
 {
struct ak5558_priv *ak5558 = snd_soc_component_get_drvdata(component);
 
-   ak5558_power_on(ak5558);
+   ak5558_reset(ak5558, false);
return ak5558_set_mcki(component);
 }
 
@@ -348,7 +338,7 @@ static void ak5558_remove(struct snd_soc_component 
*component)
 {
struct ak5558_priv *ak5558 = snd_soc_component_get_drvdata(component);
 
-   ak5558_power_off(ak5558);
+   ak5558_reset(ak5558, true);
 }
 
 static int __maybe_unused ak5558_runtime_suspend(struct device *dev)
@@ -356,7 +346,7 @@ static int __maybe_unused ak5558_runtime_suspend(struct 
device *dev)
struct ak5558_priv *ak5558 = dev_get_drvdata(dev);
 
regcache_cache_only(ak5558->regmap, true);
-   ak5558_power_off(ak5558);
+   ak5558_reset(ak5558, true);
 
regulator_bulk_disable(ARRAY_SIZE(ak5558->supplies),
   ak5558->supplies);
@@ -375,8 +365,8 @@ static int __maybe_unused ak5558_runtime_resume(struct 
device *dev)
return ret;
}
 
-   ak5558_power_off(ak5558);
-   ak5558_power_on(ak5558);
+   ak5558_reset(ak5558, true);
+   ak5558_reset(ak5558, false);
 
regcache_cache_only(ak5558->regmap, false);
regcache_mark_dirty(ak5558->regmap);
-- 
2.27.0



[PATCH] drm/i915/gvt: remove useless function

2021-04-12 Thread Jiapeng Chong
Fix the following clang warning:

drivers/gpu/drm/i915/gvt/gtt.c:590:20: warning: unused function
'ppgtt_set_guest_root_entry' [-Wunused-function].

Reported-by: Abaci Robot 
Signed-off-by: Jiapeng Chong 
---
 drivers/gpu/drm/i915/gvt/gtt.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c
index 897c007..a01ff44 100644
--- a/drivers/gpu/drm/i915/gvt/gtt.c
+++ b/drivers/gpu/drm/i915/gvt/gtt.c
@@ -587,12 +587,6 @@ static void _ppgtt_set_root_entry(struct intel_vgpu_mm *mm,
   entry, index, false, 0, mm->vgpu);
 }
 
-static inline void ppgtt_set_guest_root_entry(struct intel_vgpu_mm *mm,
-   struct intel_gvt_gtt_entry *entry, unsigned long index)
-{
-   _ppgtt_set_root_entry(mm, entry, index, true);
-}
-
 static inline void ppgtt_set_shadow_root_entry(struct intel_vgpu_mm *mm,
struct intel_gvt_gtt_entry *entry, unsigned long index)
 {
-- 
1.8.3.1



[PATCH] irq: Fix missing IRQF_ONESHOT as only threaded handler

2021-04-12 Thread zhuguangqing83
From: Guangqing Zhu 

Coccinelle noticed:
  kernel/irq/manage.c:2199:8-28: ERROR: Threaded IRQ with no primary
handler requested without IRQF_ONESHOT.

Signed-off-by: Guangqing Zhu 
---
 kernel/irq/manage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 4c14356543d9..222816750048 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -2197,7 +2197,7 @@ int request_any_context_irq(unsigned int irq, 
irq_handler_t handler,
 
if (irq_settings_is_nested_thread(desc)) {
ret = request_threaded_irq(irq, NULL, handler,
-  flags, name, dev_id);
+  flags | IRQF_ONESHOT, name, dev_id);
return !ret ? IRQC_IS_NESTED : ret;
}
 
-- 
2.17.1



Re: [PATCH 5.10 000/188] 5.10.30-rc1 review

2021-04-12 Thread Samuel Zou




On 2021/4/12 16:38, Greg Kroah-Hartman wrote:

This is the start of the stable review cycle for the 5.10.30 release.
There are 188 patches in this series, all will be posted as a response
to this one.  If anyone has any issues with these being applied, please
let me know.

Responses should be made by Wed, 14 Apr 2021 08:39:44 +0000.
Anything received after that time might be too late.

The whole patch series can be found in one patch at:

https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.10.30-rc1.gz
or in the git tree and branch at:

git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
linux-5.10.y
and the diffstat can be found below.

thanks,

greg k-h



Tested on arm64 and x86 for 5.10.30-rc1,

Kernel repo:
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
Branch: linux-5.10.y
Version: 5.10.30-rc1
Commit: 8ac4b1deedaa507b5d0f46316e7f32004dd99cd1
Compiler: gcc version 7.3.0 (GCC)

arm64:

Testcase Result Summary:
total: 5264
passed: 5264
failed: 0
timeout: 0


x86:

Testcase Result Summary:
total: 5264
passed: 5264
failed: 0
timeout: 0


Tested-by: Hulk Robot 



[RFC PATCH] delayacct: delayacct_stats[] can be static

2021-04-12 Thread kernel test robot


Reported-by: kernel test robot 
Signed-off-by: kernel test robot 
---
 delayacct.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index b8d719fbfc404..2505aa9f87f61 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -32,7 +32,7 @@ struct delayacct_stat {
unsigned int idx;
 };
 
-struct delayacct_stat delayacct_stats[] = {
+static struct delayacct_stat delayacct_stats[] = {
{"blkio", DELAYACCT_BLKIO},
{"swapin", DELAYACCT_SWAPIN},
{"pagecache_thrashing", DELAYACCT_THRASHING},


Re: [RESEND PATCH 2/2] delayacct: Add a proc file to dump the delay info

2021-04-12 Thread kernel test robot
Hi brookxu,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[also build test WARNING on v5.12-rc7 next-20210412]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:
https://github.com/0day-ci/linux/commits/brookxu/delayacct-refactor-the-code-to-simplify-the-implementation/20210413-093934
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 
89698becf06d341a700913c3d89ce2a914af69a2
config: x86_64-randconfig-s021-20210413 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce:
# apt-get install sparse
# sparse version: v0.6.3-280-g2cd6d34e-dirty
# 
https://github.com/0day-ci/linux/commit/7023a409dec95195a0e3360a36e8cb66363a9457
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review 
brookxu/delayacct-refactor-the-code-to-simplify-the-implementation/20210413-093934
git checkout 7023a409dec95195a0e3360a36e8cb66363a9457
# save the attached .config to linux build tree
make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 


sparse warnings: (new ones prefixed by >>)
>> kernel/delayacct.c:35:23: sparse: sparse: symbol 'delayacct_stats' was not 
>> declared. Should it be static?

Please review and possibly fold the followup patch.

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


.config.gz
Description: application/gzip


[tip:x86/sgx] BUILD SUCCESS 523caed9efbb049339706b124185c9358c1b6477

2021-04-12 Thread kernel test robot
   defconfig
alphaallyesconfig
xtensa   allyesconfig
h8300allyesconfig
arc defconfig
sh   allmodconfig
parisc  defconfig
s390 allyesconfig
s390 allmodconfig
parisc   allyesconfig
s390defconfig
sparcallyesconfig
sparc   defconfig
i386defconfig
mips allyesconfig
mips allmodconfig
powerpc  allyesconfig
powerpc  allmodconfig
powerpc   allnoconfig
i386 randconfig-a003-20210412
i386 randconfig-a001-20210412
i386 randconfig-a006-20210412
i386 randconfig-a005-20210412
i386 randconfig-a004-20210412
i386 randconfig-a002-20210412
i386 randconfig-a003-20210413
i386 randconfig-a001-20210413
i386 randconfig-a006-20210413
i386 randconfig-a005-20210413
i386 randconfig-a004-20210413
i386 randconfig-a002-20210413
x86_64   randconfig-a014-20210412
x86_64   randconfig-a015-20210412
x86_64   randconfig-a011-20210412
x86_64   randconfig-a013-20210412
x86_64   randconfig-a012-20210412
x86_64   randconfig-a016-20210412
i386 randconfig-a015-20210412
i386 randconfig-a014-20210412
i386 randconfig-a013-20210412
i386 randconfig-a012-20210412
i386 randconfig-a016-20210412
i386 randconfig-a011-20210412
riscvnommu_k210_defconfig
riscvnommu_virt_defconfig
riscv   defconfig
riscv  rv32_defconfig
umallnoconfig
um   allyesconfig
um  defconfig
x86_64rhel-8.3-kselftests
x86_64  defconfig
x86_64   rhel-8.3
x86_64  rhel-8.3-kbuiltin
x86_64  kexec

clang tested configs:
x86_64   randconfig-a003-20210412
x86_64   randconfig-a002-20210412
x86_64   randconfig-a001-20210412
x86_64   randconfig-a005-20210412
x86_64   randconfig-a006-20210412
x86_64   randconfig-a004-20210412

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


[tip:x86/platform] BUILD SUCCESS 8f2aca40dd077f74e62982cd2669845f41ed0ac6

2021-04-12 Thread kernel test robot
multi_v5_defconfig
powerpc pq2fads_defconfig
sh   se7751_defconfig
m68k  amiga_defconfig
arm vf610m4_defconfig
arm lpc32xx_defconfig
h8300alldefconfig
powerpc   ebony_defconfig
pariscgeneric-32bit_defconfig
ia64 allmodconfig
ia64 allyesconfig
m68k allmodconfig
m68kdefconfig
m68k allyesconfig
nios2   defconfig
arc  allyesconfig
nds32 allnoconfig
nds32   defconfig
cskydefconfig
alpha   defconfig
alphaallyesconfig
xtensa   allyesconfig
h8300allyesconfig
arc defconfig
sh   allmodconfig
parisc  defconfig
s390 allyesconfig
s390 allmodconfig
parisc   allyesconfig
s390defconfig
sparcallyesconfig
sparc   defconfig
i386defconfig
mips allyesconfig
mips allmodconfig
powerpc  allyesconfig
powerpc  allmodconfig
powerpc   allnoconfig
x86_64   randconfig-a003-20210413
x86_64   randconfig-a002-20210413
x86_64   randconfig-a001-20210413
x86_64   randconfig-a005-20210413
x86_64   randconfig-a006-20210413
x86_64   randconfig-a004-20210413
i386 randconfig-a003-20210412
i386 randconfig-a001-20210412
i386 randconfig-a006-20210412
i386 randconfig-a005-20210412
i386 randconfig-a004-20210412
i386 randconfig-a002-20210412
i386 randconfig-a003-20210413
i386 randconfig-a001-20210413
i386 randconfig-a006-20210413
i386 randconfig-a005-20210413
i386 randconfig-a004-20210413
i386 randconfig-a002-20210413
x86_64   randconfig-a014-20210412
x86_64   randconfig-a015-20210412
x86_64   randconfig-a011-20210412
x86_64   randconfig-a013-20210412
x86_64   randconfig-a012-20210412
x86_64   randconfig-a016-20210412
i386 randconfig-a015-20210412
i386 randconfig-a014-20210412
i386 randconfig-a013-20210412
i386 randconfig-a012-20210412
i386 randconfig-a016-20210412
i386 randconfig-a011-20210412
riscvnommu_k210_defconfig
riscvnommu_virt_defconfig
riscv   defconfig
riscv  rv32_defconfig
um   allyesconfig
x86_64rhel-8.3-kselftests
x86_64  defconfig
x86_64   rhel-8.3
x86_64  rhel-8.3-kbuiltin
x86_64  kexec

clang tested configs:
x86_64   randconfig-a003-20210412
x86_64   randconfig-a002-20210412
x86_64   randconfig-a001-20210412
x86_64   randconfig-a005-20210412
x86_64   randconfig-a006-20210412
x86_64   randconfig-a004-20210412

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


Re: [PATCH 1/1] arm: topology: parse the topology from the dt

2021-04-12 Thread Ruifeng Zhang
Valentin Schneider  于2021年4月12日周一 下午11:33写道:
>
> On 12/04/21 20:20, Ruifeng Zhang wrote:
> > There is a armv8.3 cpu which should work normally both on aarch64 and 
> > aarch32.
> > The MPIDR has been written to the chip register in armv8.3 format.
> > For example,
> > core0: 80000000
> > core1: 80000100
> > core2: 80000200
> > ...
> >
> > Its cpu topology can be parsed normally on aarch64 mode (both
> > userspace and kernel work on arm64).
> >
> > The problem is when it working on aarch32 mode (both userspace and
> > kernel work on arm 32-bit),
>
> I didn't know using aarch32 elsewhere than EL0 was something actually being
> used. Do you deploy this somewhere, or do you use it for testing purposes?

At Unisoc, the sc9863a SoC, which uses Cortex-A55, has two software
versions; in one of them the kernel runs at EL1 in aarch32:
              user(EL0)   kernel(EL1)
sc9863a_go    aarch32     aarch32
sc9863a       aarch64     aarch64
>
> > the cpu topology
> > will parse error because of the format is different between armv7 and 
> > armv8.3.
> > The arm 32-bit driver, arch/arm/kernel/topology will parse the MPIDR
> > and store to the topology with armv7,
> > and the result is all cpu core_id is 0, the bit[1:0] of armv7 MPIDR format.
> >
>
> I'm not fluent at all in armv7 (or most aarch32 compat mode stuff), but
> I couldn't find anything about MPIDR format differences:
>
>   DDI 0487G.a G8.2.113
>   """
>   AArch32 System register MPIDR bits [31:0] are architecturally mapped to
>   AArch64 System register MPIDR_EL1[31:0].
>   """
>
> Peeking at some armv7 doc and arm/kernel/topology.c the layout really looks
> just the same, i.e. for both of them, with your example of:

The cortex-a7 spec DDI0464F 4.3.5
https://developer.arm.com/documentation/ddi0464/f/?lang=en

The current arch/arm/kernel/topology code parses the MPIDR using the armv7 format.
The parsing code is:
void store_cpu_topology(unsigned int cpuid)
{
...
cpuid_topo->thread_id = -1;
cpuid_topo->core_id = MPIDR_AFFINITY_LEVEL(mpidr, 0);
cpuid_topo->package_id = MPIDR_AFFINITY_LEVEL(mpidr, 1);
...
}
>
>   core0: 80000000
>   core1: 80000100
>   core2: 80000200
>   ...
>
> we'll get:
>
>   |   | aff2 | aff1 | aff0 |
>   |---+--+--+--|
>   | Core0 |0 |0 |0 |
>   | Core1 |0 |1 |0 |
>   | Core2 |0 |2 |0 |
>   ...
>
> Now, arm64 doesn't fallback to MPIDR for topology information anymore since
>
>   3102bc0e6ac7 ("arm64: topology: Stop using MPIDR for topology information")
>
> so without DT we would get:
>   |   | package_id | core_id |
>   |---++-|
>   | Core0 |  0 |   0 |
>   | Core1 |  0 |   1 |
>   | Core2 |  0 |   2 |
>
> Whereas with an arm kernel we'll end up parsing MPIDR as:
>   |   | package_id | core_id |
>   |---++-|
>   | Core0 |  0 |   0 |
>   | Core1 |  1 |   0 |
>   | Core2 |  2 |   0 |
>
> Did I get this right? Is this what you're observing?

Yes, this is a problem if an armv8.2 or above cpu is running a 32-bit
kernel on EL1.
>
> > In addition, I think arm should also allow customers to configure cpu
> > topologies via DT.


Re: [syzbot] KASAN: slab-out-of-bounds Read in reiserfs_xattr_get

2021-04-12 Thread Dmitry Vyukov
On Tue, Apr 13, 2021 at 7:55 AM syzbot
 wrote:
>
> Hello,
>
> syzbot found the following issue on:
>
> HEAD commit:3a229812 Merge tag 'arm-fixes-5.11-2' of git://git.kernel...
> git tree:   upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=16b4d196d0
> kernel config:  https://syzkaller.appspot.com/x/.config?x=f91155ccddaf919c
> dashboard link: https://syzkaller.appspot.com/bug?extid=72ba979b6681c3369db4
> compiler:   Debian clang version 11.0.1-2
>
> Unfortunately, I don't have any reproducer for this issue yet.
>
> IMPORTANT: if you fix the issue, please add the following tag to the commit:
> Reported-by: syzbot+72ba979b6681c3369...@syzkaller.appspotmail.com

Maybe related to:
https://lore.kernel.org/lkml/5f397905ba42a...@google.com/
? there are some uninits involved in reiserfs attrs.

> loop3: detected capacity change from 0 to 65534
> ==
> BUG: KASAN: slab-out-of-bounds in reiserfs_xattr_get+0xe0/0x590 
> fs/reiserfs/xattr.c:681
> Read of size 8 at addr 888028983198 by task syz-executor.3/4211
>
> CPU: 1 PID: 4211 Comm: syz-executor.3 Not tainted 5.12.0-rc6-syzkaller #0
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
> Google 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:79 [inline]
>  dump_stack+0x176/0x24e lib/dump_stack.c:120
>  print_address_description+0x5f/0x3a0 mm/kasan/report.c:232
>  __kasan_report mm/kasan/report.c:399 [inline]
>  kasan_report+0x15c/0x200 mm/kasan/report.c:416
>  reiserfs_xattr_get+0xe0/0x590 fs/reiserfs/xattr.c:681
>  reiserfs_get_acl+0x63/0x670 fs/reiserfs/xattr_acl.c:211
>  get_acl+0x152/0x2e0 fs/posix_acl.c:141
>  check_acl fs/namei.c:294 [inline]
>  acl_permission_check fs/namei.c:339 [inline]
>  generic_permission+0x2ed/0x5b0 fs/namei.c:392
>  do_inode_permission fs/namei.c:446 [inline]
>  inode_permission+0x28e/0x500 fs/namei.c:513
>  may_open+0x228/0x3e0 fs/namei.c:2985
>  do_open fs/namei.c:3365 [inline]
>  path_openat+0x2697/0x3860 fs/namei.c:3500
>  do_filp_open+0x1a3/0x3b0 fs/namei.c:3527
>  do_sys_openat2+0xba/0x380 fs/open.c:1187
>  do_sys_open fs/open.c:1203 [inline]
>  __do_sys_openat fs/open.c:1219 [inline]
>  __se_sys_openat fs/open.c:1214 [inline]
>  __x64_sys_openat+0x1c8/0x1f0 fs/open.c:1214
>  do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
>  entry_SYSCALL_64_after_hwframe+0x44/0xae
> RIP: 0033:0x419544
> Code: 84 00 00 00 00 00 44 89 54 24 0c e8 96 f9 ff ff 44 8b 54 24 0c 44 89 e2 
> 48 89 ee 41 89 c0 bf 9c ff ff ff b8 01 01 00 00 0f 05 <48> 3d 00 f0 ff ff 77 
> 34 44 89 c7 89 44 24 0c e8 c8 f9 ff ff 8b 44
> RSP: 002b:7fa357a03f30 EFLAGS: 0293 ORIG_RAX: 0101
> RAX: ffda RBX: 2200 RCX: 00419544
> RDX: 0001 RSI: 2100 RDI: ff9c
> RBP: 2100 R08:  R09: 2000
> R10:  R11: 0293 R12: 0001
> R13: 2100 R14: 7fa357a04000 R15: 20065600
>
> Allocated by task 4210:
>  kasan_save_stack mm/kasan/common.c:38 [inline]
>  kasan_set_track mm/kasan/common.c:46 [inline]
>  set_alloc_info mm/kasan/common.c:427 [inline]
>  kasan_kmalloc+0xc2/0xf0 mm/kasan/common.c:506
>  kasan_kmalloc include/linux/kasan.h:233 [inline]
>  kmem_cache_alloc_trace+0x21b/0x350 mm/slub.c:2934
>  kmalloc include/linux/slab.h:554 [inline]
>  kzalloc include/linux/slab.h:684 [inline]
>  smk_fetch security/smack/smack_lsm.c:288 [inline]
>  smack_d_instantiate+0x65c/0xcc0 security/smack/smack_lsm.c:3411
>  security_d_instantiate+0xa5/0x100 security/security.c:1987
>  d_instantiate_new+0x61/0x110 fs/dcache.c:2025
>  ext4_add_nondir+0x22b/0x290 fs/ext4/namei.c:2590
>  ext4_symlink+0x8ce/0xe90 fs/ext4/namei.c:3417
>  vfs_symlink+0x3a0/0x540 fs/namei.c:4178
>  do_symlinkat+0x1c9/0x440 fs/namei.c:4208
>  do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
>  entry_SYSCALL_64_after_hwframe+0x44/0xae
>
> Freed by task 4210:
>  kasan_save_stack mm/kasan/common.c:38 [inline]
>  kasan_set_track+0x3d/0x70 mm/kasan/common.c:46
>  kasan_set_free_info+0x1f/0x40 mm/kasan/generic.c:357
>  kasan_slab_free+0x100/0x140 mm/kasan/common.c:360
>  kasan_slab_free include/linux/kasan.h:199 [inline]
>  slab_free_hook mm/slub.c:1562 [inline]
>  slab_free_freelist_hook+0x171/0x270 mm/slub.c:1600
>  slab_free mm/slub.c:3161 [inline]
>  kfree+0xcf/0x2d0 mm/slub.c:4213
>  smk_fetch security/smack/smack_lsm.c:300 [inline]
>  smack_d_instantiate+0x6db/0xcc0 security/smack/smack_lsm.c:3411
>  security_d_instantiate+0xa5/0x100 security/security.c:1987
>  d_instantiate_new+0x61/0x110 fs/dcache.c:2025
>  ext4_add_nondir+0x22b/0x290 fs/ext4/namei.c:2590
>  ext4_symlink+0x8ce/0xe90 fs/ext4/namei.c:3417
>  vfs_symlink+0x3a0/0x540 fs/namei.c:4178
>  do_symlinkat+0x1c9/0x440 fs/namei.c:4208
>  do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
>  entry_SYSCALL_64_after_hwframe+0x44/0xae
>
> Las

Re: [PATCH v14 4/6] locking/qspinlock: Introduce starvation avoidance into CNA

2021-04-12 Thread Andi Kleen
Andi Kleen  writes:

> Alex Kogan  writes:
>>  
>> +numa_spinlock_threshold=[NUMA, PV_OPS]
>> +Set the time threshold in milliseconds for the
>> +number of intra-node lock hand-offs before the
>> +NUMA-aware spinlock is forced to be passed to
>> +a thread on another NUMA node.  Valid values
>> +are in the [1..100] range. Smaller values result
>> +in a more fair, but less performant spinlock,
>> +and vice versa. The default value is 10.
>
> ms granularity seems very coarse grained for this. Surely
> at some point of spinning you can afford a ktime_get? But ok.

Actually, thinking about it more, using jiffies is likely broken
anyway: if interrupts are disabled and the CPU is the one running
the main timer interrupts, jiffies won't increase.

cpu_clock (better than ktime_get) or sched_clock would work.

-Andi


/usr/bin/ld: ll_temac_main.c:undefined reference to `devm_of_iomap'

2021-04-12 Thread kernel test robot
Hi Andre,

FYI, the error/warning still remains.

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 
master
head:   89698becf06d341a700913c3d89ce2a914af69a2
commit: e8b6c54f6d57822e228027d41a1edb317034a08c net: xilinx: temac: Relax 
Kconfig dependencies
date:   1 year, 1 month ago
config: um-randconfig-r026-20210413 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce (this is a W=1 build):
# 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e8b6c54f6d57822e228027d41a1edb317034a08c
git remote add linus 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
git fetch --no-tags linus master
git checkout e8b6c54f6d57822e228027d41a1edb317034a08c
# save the attached .config to linux build tree
make W=1 ARCH=um 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All errors (new ones prefixed by >>):

   /usr/bin/ld: drivers/net/ethernet/xilinx/ll_temac_main.o: in function 
`temac_probe':
   ll_temac_main.c:(.text+0xe9d): undefined reference to `devm_ioremap'
>> /usr/bin/ld: ll_temac_main.c:(.text+0xf90): undefined reference to 
>> `devm_of_iomap'
   /usr/bin/ld: ll_temac_main.c:(.text+0x1159): undefined reference to 
`devm_ioremap'
   /usr/bin/ld: drivers/misc/altera-stapl/altera-lpt.o:(.altinstructions+0x8): 
undefined reference to `X86_FEATURE_XMM2'
   /usr/bin/ld: drivers/misc/altera-stapl/altera-lpt.o:(.altinstructions+0x15): 
undefined reference to `X86_FEATURE_XMM'
   /usr/bin/ld: drivers/misc/altera-stapl/altera-lpt.o:(.altinstructions+0x22): 
undefined reference to `X86_FEATURE_XMM'
   /usr/bin/ld: drivers/misc/altera-stapl/altera-lpt.o:(.altinstructions+0x2f): 
undefined reference to `X86_FEATURE_XMM2'
   /usr/bin/ld: drivers/misc/altera-stapl/altera-lpt.o:(.altinstructions+0x3c): 
undefined reference to `X86_FEATURE_XMM'
   /usr/bin/ld: drivers/misc/altera-stapl/altera-lpt.o:(.altinstructions+0x49): 
undefined reference to `X86_FEATURE_XMM'
   collect2: error: ld returned 1 exit status

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


.config.gz
Description: application/gzip


Re: [PATCH][next] scsi: aacraid: Replace one-element array with flexible-array member

2021-04-12 Thread Gustavo A. R. Silva
Hi Martin,

On 4/12/21 23:52, Martin K. Petersen wrote:

> Silencing analyzer warnings shouldn't be done at the expense of human
> readers. If it is imperative to switch to flex_array_size() to quiesce
> checker warnings, please add a comment in the code explaining that the
> size evaluates to nseg_new-1 sge_ieee1212 structs.

Done:
https://lore.kernel.org/lkml/20210413054032.GA276102@embeddedor/

Thanks!
--
Gustavo


Re: [PATCH] KVM: arm/arm64: Fix KVM_VGIC_V3_ADDR_TYPE_REDIST read

2021-04-12 Thread Keqian Zhu


On 2021/4/12 23:00, Eric Auger wrote:
> When reading the base address of a REDIST region
> through KVM_VGIC_V3_ADDR_TYPE_REDIST we expect the
> redistributor region list to be populated with a single
> element.
> 
> However list_first_entry() expects the list to be non empty.
Indeed, list_first_entry() always returns a non-NULL pointer. If the list
is empty, it will mistake the list head for the first element.

> Instead we should use list_first_entry_or_null which effectively
> returns NULL if the list is empty.
> 
> Fixes: dbd9733ab674 ("KVM: arm/arm64: Replace the single rdist region by a 
> list")
> Cc:  # v4.18+
> Signed-off-by: Eric Auger 
> Reported-by: Gavin Shan 
> ---
>  arch/arm64/kvm/vgic/vgic-kvm-device.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c 
> b/arch/arm64/kvm/vgic/vgic-kvm-device.c
> index 44419679f91a..5eaede3e3b5a 100644
> --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c
> +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c
> @@ -87,8 +87,8 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 
> *addr, bool write)
>   r = vgic_v3_set_redist_base(kvm, 0, *addr, 0);
>   goto out;
>   }
> - rdreg = list_first_entry(&vgic->rd_regions,
> -  struct vgic_redist_region, list);
> + rdreg = list_first_entry_or_null(&vgic->rd_regions,
> +  struct vgic_redist_region, 
> list);
>   if (!rdreg)
>   addr_ptr = &undef_value;
>   else
> 


Re: [PATCH v5 04/16] memory: mtk-smi: Add device-link between smi-larb and smi-common

2021-04-12 Thread Yong Wu
On Sat, 2021-04-10 at 14:40 +0200, Krzysztof Kozlowski wrote:
> On 10/04/2021 11:11, Yong Wu wrote:
> > Normally, If the smi-larb HW need work, we should enable the smi-common
> > HW power and clock firstly.
> > This patch adds device-link between the smi-larb dev and the smi-common
> > dev. then If pm_runtime_get_sync(smi-larb-dev), the pm_runtime_get_sync
> > (smi-common-dev) will be called automatically.
> > 
> > Also, Add DL_FLAG_STATELESS to avoid the smi-common clocks be gated when
> > probe.
> > 
> > CC: Matthias Brugger 
> > Suggested-by: Tomasz Figa 
> > Signed-off-by: Yong Wu 
> > ---
> >  drivers/memory/mtk-smi.c | 19 ++-
> >  1 file changed, 10 insertions(+), 9 deletions(-)
> 
> I understood this is a dependency for other patches, so:
> Acked-by: Krzysztof Kozlowski 
> 
> If I am wrong and I can take it via memory tree, let me know.

Hi Krzysztof,

Thanks very much for your quick review.

I think it is OK for it to go through the memory tree. In the original patch, we
called pm_runtime_get(smi-common-dev) in the smi-larb's pm resume callback.
This patch only uses a device-link to do this, so it makes no
functional change; it only adjusts the SMI internal code flow.

In addition, [14/16] needs your Acked-by, and that one should be
merged with the others.

As for the other patches, I'm not sure which tree they should go
through. They cross several trees: dt-binding/iommu/media/drm/dts.

I'm not sure whether Matthias will have time to review and give some suggestions.

> 
> Best regards,
> Krzysztof
> 
> ___
> Linux-mediatek mailing list
> linux-media...@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-mediatek



Re: linux-next: Tree for Apr 9 (x86 boot problem)

2021-04-12 Thread Mike Rapoport
Hi Randy,

On Mon, Apr 12, 2021 at 01:53:34PM -0700, Randy Dunlap wrote:
> On 4/12/21 10:01 AM, Mike Rapoport wrote:
> > On Mon, Apr 12, 2021 at 08:49:49AM -0700, Randy Dunlap wrote:
> >  
> > I thought about adding some prints to see what's causing the hang, the
> > reservations or their absence. Can you replace the debug patch with this
> > one:
> > 
> > diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> > index 776fc9b3fafe..a10ac252dbcc 100644
> > --- a/arch/x86/kernel/setup.c
> > +++ b/arch/x86/kernel/setup.c
> > @@ -600,10 +600,13 @@ static bool __init snb_gfx_workaround_needed(void)
> > return false;
> >  
> > vendor = read_pci_config_16(0, 2, 0, PCI_VENDOR_ID);
> > +   devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID);
> > +
> > +   pr_info("%s: vendor: %x, device: %x\n", __func__, vendor, device);
> 
> s/device)/devid)/
 
Oh, sorry.

> > +
> > if (vendor != 0x8086)
> > return false;
> >  
> > -   devid = read_pci_config_16(0, 2, 0, PCI_DEVICE_ID);
> > for (i = 0; i < ARRAY_SIZE(snb_ids); i++)
> > if (devid == snb_ids[i])
> > return true;
> 
> That prints:
> 
> [0.00] snb_gfx_workaround_needed: vendor: 8086, device: 126
> [0.00] early_reserve_memory: snb_gfx: 1
> ...
> [0.014061] snb_gfx_workaround_needed: vendor: 8086, device: 126
> [0.014064] reserving inaccessible SNB gfx pages
> 
> 
> The full boot log is attached.
 
Can you please send the log with memblock=debug added to the kernel command
line?

Probably should have started from this...

-- 
Sincerely yours,
Mike.


[PATCH] hwmon: (nct6683) remove useless function

2021-04-12 Thread Jiapeng Chong
Fix the following clang warning:

drivers/hwmon/nct6683.c:491:19: warning: unused function 'in_to_reg'
[-Wunused-function].

Reported-by: Abaci Robot 
Signed-off-by: Jiapeng Chong 
---
 drivers/hwmon/nct6683.c | 11 ---
 1 file changed, 11 deletions(-)

diff --git a/drivers/hwmon/nct6683.c b/drivers/hwmon/nct6683.c
index a23047a..b886cf0 100644
--- a/drivers/hwmon/nct6683.c
+++ b/drivers/hwmon/nct6683.c
@@ -488,17 +488,6 @@ static inline long in_from_reg(u16 reg, u8 src)
return reg * scale;
 }
 
-static inline u16 in_to_reg(u32 val, u8 src)
-{
-   int scale = 16;
-
-   if (src == MON_SRC_VCC || src == MON_SRC_VSB || src == MON_SRC_AVSB ||
-   src == MON_SRC_VBAT)
-   scale <<= 1;
-
-   return clamp_val(DIV_ROUND_CLOSEST(val, scale), 0, 127);
-}
-
 static u16 nct6683_read(struct nct6683_data *data, u16 reg)
 {
int res;
-- 
1.8.3.1



Re: [PATCH v14 4/6] locking/qspinlock: Introduce starvation avoidance into CNA

2021-04-12 Thread Andi Kleen
Alex Kogan  writes:
>  
> + numa_spinlock_threshold=[NUMA, PV_OPS]
> + Set the time threshold in milliseconds for the
> + number of intra-node lock hand-offs before the
> + NUMA-aware spinlock is forced to be passed to
> + a thread on another NUMA node.  Valid values
> + are in the [1..100] range. Smaller values result
> + in a more fair, but less performant spinlock,
> + and vice versa. The default value is 10.

ms granularity seems very coarse grained for this. Surely
at some point of spinning you can afford a ktime_get? But ok.

Could you turn that into a module parameter that can be changed at runtime?
It would be strange to have to reboot just to play with this parameter.

This would also make the code a lot shorter I guess.

-Andi


[syzbot] KASAN: null-ptr-deref Write in rhashtable_free_and_destroy (2)

2021-04-12 Thread syzbot
Hello,

syzbot found the following issue on:

HEAD commit:d93a0d43 Merge tag 'block-5.12-2021-04-02' of git://git.ke..
git tree:   upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=12d81cfcd0
kernel config:  https://syzkaller.appspot.com/x/.config?x=71a75beb62b62a34
dashboard link: https://syzkaller.appspot.com/bug?extid=860268315ba86ea6b96b
compiler:   Debian clang version 11.0.1-2

Unfortunately, I don't have any reproducer for this issue yet.

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+860268315ba86ea6b...@syzkaller.appspotmail.com

==
BUG: KASAN: null-ptr-deref in instrument_atomic_read_write 
include/linux/instrumented.h:101 [inline]
BUG: KASAN: null-ptr-deref in test_and_set_bit 
include/asm-generic/bitops/instrumented-atomic.h:70 [inline]
BUG: KASAN: null-ptr-deref in try_to_grab_pending+0xee/0xa50 
kernel/workqueue.c:1257
Write of size 8 at addr 0088 by task kworker/0:3/4787

CPU: 0 PID: 4787 Comm: kworker/0:3 Not tainted 5.12.0-rc5-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
01/01/2011
Workqueue: events cfg80211_destroy_iface_wk
Call Trace:
 __dump_stack lib/dump_stack.c:79 [inline]
 dump_stack+0x176/0x24e lib/dump_stack.c:120
 __kasan_report mm/kasan/report.c:403 [inline]
 kasan_report+0x152/0x200 mm/kasan/report.c:416
 check_region_inline mm/kasan/generic.c:135 [inline]
 kasan_check_range+0x2b5/0x2f0 mm/kasan/generic.c:186
 instrument_atomic_read_write include/linux/instrumented.h:101 [inline]
 test_and_set_bit include/asm-generic/bitops/instrumented-atomic.h:70 [inline]
 try_to_grab_pending+0xee/0xa50 kernel/workqueue.c:1257
 __cancel_work_timer+0x81/0x5b0 kernel/workqueue.c:3098
 rhashtable_free_and_destroy+0x25/0x8b0 lib/rhashtable.c:1137
 mesh_table_free net/mac80211/mesh_pathtbl.c:70 [inline]
 mesh_pathtbl_unregister+0x4b/0xa0 net/mac80211/mesh_pathtbl.c:812
 unregister_netdevice_many+0x12ea/0x18e0 net/core/dev.c:10951
 unregister_netdevice_queue+0x2a9/0x300 net/core/dev.c:10868
 unregister_netdevice include/linux/netdevice.h:2884 [inline]
 _cfg80211_unregister_wdev+0x17b/0x5b0 net/wireless/core.c:1127
 ieee80211_if_remove+0x1cc/0x250 net/mac80211/iface.c:2020
 ieee80211_del_iface+0x12/0x20 net/mac80211/cfg.c:144
 rdev_del_virtual_intf net/wireless/rdev-ops.h:57 [inline]
 cfg80211_destroy_ifaces+0x182/0x250 net/wireless/core.c:341
 cfg80211_destroy_iface_wk+0x30/0x40 net/wireless/core.c:354
 process_one_work+0x789/0xfd0 kernel/workqueue.c:2275
 worker_thread+0xac1/0x1300 kernel/workqueue.c:2421
 kthread+0x39a/0x3c0 kernel/kthread.c:292
 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294
==


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkal...@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.


[PATCH v4 3/4] pinctrl: add drive for I2C related pins on MT8195

2021-04-12 Thread Zhiyong Tao
This patch adds raw-data variants of the advanced drive get/set helpers
for the pins used by I2C on MT8195.

Signed-off-by: Zhiyong Tao 
---
 drivers/pinctrl/mediatek/pinctrl-mt8195.c | 22 +++
 .../pinctrl/mediatek/pinctrl-mtk-common-v2.c  | 14 
 .../pinctrl/mediatek/pinctrl-mtk-common-v2.h  |  5 +
 3 files changed, 41 insertions(+)

diff --git a/drivers/pinctrl/mediatek/pinctrl-mt8195.c 
b/drivers/pinctrl/mediatek/pinctrl-mt8195.c
index 063f164d7c9b..a7500e18bb1d 100644
--- a/drivers/pinctrl/mediatek/pinctrl-mt8195.c
+++ b/drivers/pinctrl/mediatek/pinctrl-mt8195.c
@@ -760,6 +760,25 @@ static const struct mtk_pin_field_calc 
mt8195_pin_drv_range[] = {
PIN_FIELD_BASE(143, 143, 1, 0x020, 0x10, 24, 3),
 };
 
+static const struct mtk_pin_field_calc mt8195_pin_drv_adv_range[] = {
+   PIN_FIELD_BASE(8, 8, 4, 0x020, 0x10, 15, 3),
+   PIN_FIELD_BASE(9, 9, 4, 0x020, 0x10, 0, 3),
+   PIN_FIELD_BASE(10, 10, 4, 0x020, 0x10, 18, 3),
+   PIN_FIELD_BASE(11, 11, 4, 0x020, 0x10, 3, 3),
+   PIN_FIELD_BASE(12, 12, 4, 0x020, 0x10, 21, 3),
+   PIN_FIELD_BASE(13, 13, 4, 0x020, 0x10, 6, 3),
+   PIN_FIELD_BASE(14, 14, 4, 0x020, 0x10, 24, 3),
+   PIN_FIELD_BASE(15, 15, 4, 0x020, 0x10, 9, 3),
+   PIN_FIELD_BASE(16, 16, 4, 0x020, 0x10, 27, 3),
+   PIN_FIELD_BASE(17, 17, 4, 0x020, 0x10, 12, 3),
+   PIN_FIELD_BASE(29, 29, 2, 0x020, 0x10, 0, 3),
+   PIN_FIELD_BASE(30, 30, 2, 0x020, 0x10, 3, 3),
+   PIN_FIELD_BASE(34, 34, 1, 0x040, 0x10, 0, 3),
+   PIN_FIELD_BASE(35, 35, 1, 0x040, 0x10, 3, 3),
+   PIN_FIELD_BASE(44, 44, 1, 0x040, 0x10, 6, 3),
+   PIN_FIELD_BASE(45, 45, 1, 0x040, 0x10, 9, 3),
+};
+
 static const struct mtk_pin_reg_calc mt8195_reg_cals[PINCTRL_PIN_REG_MAX] = {
[PINCTRL_PIN_REG_MODE] = MTK_RANGE(mt8195_pin_mode_range),
[PINCTRL_PIN_REG_DIR] = MTK_RANGE(mt8195_pin_dir_range),
@@ -773,6 +792,7 @@ static const struct mtk_pin_reg_calc 
mt8195_reg_cals[PINCTRL_PIN_REG_MAX] = {
[PINCTRL_PIN_REG_PUPD] = MTK_RANGE(mt8195_pin_pupd_range),
[PINCTRL_PIN_REG_R0] = MTK_RANGE(mt8195_pin_r0_range),
[PINCTRL_PIN_REG_R1] = MTK_RANGE(mt8195_pin_r1_range),
+   [PINCTRL_PIN_REG_DRV_ADV] = MTK_RANGE(mt8195_pin_drv_adv_range),
 };
 
 static const char * const mt8195_pinctrl_register_base_names[] = {
@@ -801,6 +821,8 @@ static const struct mtk_pin_soc mt8195_data = {
.bias_get_combo = mtk_pinconf_bias_get_combo,
.drive_set = mtk_pinconf_drive_set_rev1,
.drive_get = mtk_pinconf_drive_get_rev1,
+   .adv_drive_get = mtk_pinconf_adv_drive_get_raw,
+   .adv_drive_set = mtk_pinconf_adv_drive_set_raw,
 };
 
 static const struct of_device_id mt8195_pinctrl_of_match[] = {
diff --git a/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.c 
b/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.c
index 72f17f26acd8..2b51f4a9b860 100644
--- a/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.c
+++ b/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.c
@@ -1027,6 +1027,20 @@ int mtk_pinconf_adv_drive_get(struct mtk_pinctrl *hw,
 }
 EXPORT_SYMBOL_GPL(mtk_pinconf_adv_drive_get);
 
+int mtk_pinconf_adv_drive_set_raw(struct mtk_pinctrl *hw,
+ const struct mtk_pin_desc *desc, u32 arg)
+{
+   return mtk_hw_set_value(hw, desc, PINCTRL_PIN_REG_DRV_ADV, arg);
+}
+EXPORT_SYMBOL_GPL(mtk_pinconf_adv_drive_set_raw);
+
+int mtk_pinconf_adv_drive_get_raw(struct mtk_pinctrl *hw,
+ const struct mtk_pin_desc *desc, u32 *val)
+{
+   return mtk_hw_get_value(hw, desc, PINCTRL_PIN_REG_DRV_ADV, val);
+}
+EXPORT_SYMBOL_GPL(mtk_pinconf_adv_drive_get_raw);
+
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Sean Wang ");
 MODULE_DESCRIPTION("Pin configuration library module for mediatek SoCs");
diff --git a/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.h 
b/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.h
index e2aae285b5fc..fd5ce9c5dcbd 100644
--- a/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.h
+++ b/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.h
@@ -66,6 +66,7 @@ enum {
PINCTRL_PIN_REG_DRV_EN,
PINCTRL_PIN_REG_DRV_E0,
PINCTRL_PIN_REG_DRV_E1,
+   PINCTRL_PIN_REG_DRV_ADV,
PINCTRL_PIN_REG_MAX,
 };
 
@@ -314,6 +315,10 @@ int mtk_pinconf_adv_drive_set(struct mtk_pinctrl *hw,
  const struct mtk_pin_desc *desc, u32 arg);
 int mtk_pinconf_adv_drive_get(struct mtk_pinctrl *hw,
  const struct mtk_pin_desc *desc, u32 *val);
+int mtk_pinconf_adv_drive_set_raw(struct mtk_pinctrl *hw,
+ const struct mtk_pin_desc *desc, u32 arg);
+int mtk_pinconf_adv_drive_get_raw(struct mtk_pinctrl *hw,
+ const struct mtk_pin_desc *desc, u32 *val);
 
 bool mtk_is_virt_gpio(struct mtk_pinctrl *hw, unsigned int gpio_n);
 #endif /* __PINCTRL_MTK_COMMON_V2_H */
-- 
2.18.0



[syzbot] KASAN: slab-out-of-bounds Read in __xfrm_decode_session (2)

2021-04-12 Thread syzbot
Hello,

syzbot found the following issue on:

HEAD commit:1678e493 Merge tag 'lto-v5.12-rc6' of git://git.kernel.org..
git tree:   upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=1565bf7cd0
kernel config:  https://syzkaller.appspot.com/x/.config?x=71a75beb62b62a34
dashboard link: https://syzkaller.appspot.com/bug?extid=518a7b845c0083047e9c
compiler:   Debian clang version 11.0.1-2

Unfortunately, I don't have any reproducer for this issue yet.

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+518a7b845c0083047...@syzkaller.appspotmail.com

==
BUG: KASAN: slab-out-of-bounds in decode_session6 net/xfrm/xfrm_policy.c:3403 
[inline]
BUG: KASAN: slab-out-of-bounds in __xfrm_decode_session+0x1ba4/0x2720 
net/xfrm/xfrm_policy.c:3495
Read of size 1 at addr 888013104540 by task syz-executor.3/16514

CPU: 0 PID: 16514 Comm: syz-executor.3 Not tainted 5.12.0-rc5-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
01/01/2011
Call Trace:
 
 __dump_stack lib/dump_stack.c:79 [inline]
 dump_stack+0x176/0x24e lib/dump_stack.c:120
 print_address_description+0x5f/0x3a0 mm/kasan/report.c:232
 __kasan_report mm/kasan/report.c:399 [inline]
 kasan_report+0x15c/0x200 mm/kasan/report.c:416
 decode_session6 net/xfrm/xfrm_policy.c:3403 [inline]
 __xfrm_decode_session+0x1ba4/0x2720 net/xfrm/xfrm_policy.c:3495
 vti_tunnel_xmit+0x1ea/0x1510 net/ipv4/ip_vti.c:286
 __netdev_start_xmit include/linux/netdevice.h:4825 [inline]
 netdev_start_xmit include/linux/netdevice.h:4839 [inline]
 xmit_one net/core/dev.c:3605 [inline]
 dev_hard_start_xmit+0x20b/0x450 net/core/dev.c:3621
 sch_direct_xmit+0x1f0/0xd30 net/sched/sch_generic.c:313
 qdisc_restart net/sched/sch_generic.c:376 [inline]
 __qdisc_run+0xa4d/0x1a90 net/sched/sch_generic.c:384
 __dev_xmit_skb net/core/dev.c:3855 [inline]
 __dev_queue_xmit+0x1141/0x2a50 net/core/dev.c:4162
 neigh_output include/net/neighbour.h:510 [inline]
 ip6_finish_output2+0x10be/0x1460 net/ipv6/ip6_output.c:117
 dst_output include/net/dst.h:448 [inline]
 NF_HOOK include/linux/netfilter.h:301 [inline]
 ndisc_send_skb+0x93b/0xd50 net/ipv6/ndisc.c:508
 addrconf_rs_timer+0x242/0x6f0 net/ipv6/addrconf.c:3877
 call_timer_fn+0x91/0x160 kernel/time/timer.c:1431
 expire_timers kernel/time/timer.c:1476 [inline]
 __run_timers+0x6c0/0x8a0 kernel/time/timer.c:1745
 run_timer_softirq+0x63/0xf0 kernel/time/timer.c:1758
 __do_softirq+0x318/0x714 kernel/softirq.c:345
 invoke_softirq kernel/softirq.c:221 [inline]
 __irq_exit_rcu+0x1d8/0x200 kernel/softirq.c:422
 irq_exit_rcu+0x5/0x20 kernel/softirq.c:434
 sysvec_apic_timer_interrupt+0x91/0xb0 arch/x86/kernel/apic/apic.c:1100
 
 asm_sysvec_apic_timer_interrupt+0x12/0x20 arch/x86/include/asm/idtentry.h:632
RIP: 0010:__sanitizer_cov_trace_pc+0x56/0x60 kernel/kcov.c:205
Code: 2c 8b 91 10 15 00 00 83 fa 02 75 21 48 8b 91 18 15 00 00 48 8b 32 48 8d 
7e 01 8b 89 14 15 00 00 48 39 cf 73 08 48 89 44 f2 08 <48> 89 3a c3 66 0f 1f 44 
00 00 4c 8b 04 24 65 48 8b 14 25 80 ef 01
RSP: 0018:c90001acf9f0 EFLAGS: 0283
RAX: 821506a4 RBX:  RCX: 0004
RDX: c9000f2df000 RSI: 2928 RDI: 2929
RBP: 192000359f57 R08: dc00 R09: f52000359f5e
R10: f52000359f5e R11:  R12: 111029006027
R13: 888034b67020 R14: 192000359f98 R15: 888034b67018
 ext4_match fs/ext4/namei.c:1364 [inline]
 ext4_search_dir+0x2f4/0xa10 fs/ext4/namei.c:1395
 search_dirblock fs/ext4/namei.c:1199 [inline]
 __ext4_find_entry+0x121c/0x1790 fs/ext4/namei.c:1553
 ext4_find_entry fs/ext4/namei.c:1602 [inline]
 ext4_rmdir+0x347/0x1180 fs/ext4/namei.c:3132
 vfs_rmdir+0x20a/0x3f0 fs/namei.c:3899
 ovl_remove_upper fs/overlayfs/dir.c:825 [inline]
 ovl_do_remove+0x4d2/0xbe0 fs/overlayfs/dir.c:904
 vfs_rmdir+0x20a/0x3f0 fs/namei.c:3899
 do_rmdir+0x2a5/0x560 fs/namei.c:3962
 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
 entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x466459
Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 
89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 
c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48
RSP: 002b:7f08cdd4a188 EFLAGS: 0246 ORIG_RAX: 0054
RAX: ffda RBX: 0056c008 RCX: 00466459
RDX:  RSI:  RDI: 20c0
RBP: 004bf9fb R08:  R09: 
R10:  R11: 0246 R12: 0056c008
R13: 7ffefaa401bf R14: 7f08cdd4a300 R15: 00022000

Allocated by task 8393:
 kasan_save_stack mm/kasan/common.c:38 [inline]
 kasan_set_track mm/kasan/common.c:46 [inline]
 set_alloc_info mm/kasan/common.c:427 [inline]
 kasan_kmalloc+0xc2/0xf0 mm/kasan/common.c:506
 kasan_kmalloc include/linux/kasan.h:233 [inline]
 __k

[PATCH v4 2/4] pinctrl: add pinctrl driver on mt8195

2021-04-12 Thread Zhiyong Tao
This commit includes pinctrl driver for mt8195.

Signed-off-by: Zhiyong Tao 
---
 drivers/pinctrl/mediatek/Kconfig  |6 +
 drivers/pinctrl/mediatek/Makefile |1 +
 drivers/pinctrl/mediatek/pinctrl-mt8195.c |  828 
 drivers/pinctrl/mediatek/pinctrl-mtk-mt8195.h | 1669 +
 4 files changed, 2504 insertions(+)
 create mode 100644 drivers/pinctrl/mediatek/pinctrl-mt8195.c
 create mode 100644 drivers/pinctrl/mediatek/pinctrl-mtk-mt8195.h

diff --git a/drivers/pinctrl/mediatek/Kconfig b/drivers/pinctrl/mediatek/Kconfig
index eef17f228669..90f0c8255eaf 100644
--- a/drivers/pinctrl/mediatek/Kconfig
+++ b/drivers/pinctrl/mediatek/Kconfig
@@ -147,6 +147,12 @@ config PINCTRL_MT8192
default ARM64 && ARCH_MEDIATEK
select PINCTRL_MTK_PARIS
 
+config PINCTRL_MT8195
+   bool "Mediatek MT8195 pin control"
+   depends on OF
+   depends on ARM64 || COMPILE_TEST
+   select PINCTRL_MTK_PARIS
+
 config PINCTRL_MT8516
bool "Mediatek MT8516 pin control"
depends on OF
diff --git a/drivers/pinctrl/mediatek/Makefile 
b/drivers/pinctrl/mediatek/Makefile
index 01218bf4dc30..06fde993ace2 100644
--- a/drivers/pinctrl/mediatek/Makefile
+++ b/drivers/pinctrl/mediatek/Makefile
@@ -21,5 +21,6 @@ obj-$(CONFIG_PINCTRL_MT8167)  += pinctrl-mt8167.o
 obj-$(CONFIG_PINCTRL_MT8173)   += pinctrl-mt8173.o
 obj-$(CONFIG_PINCTRL_MT8183)   += pinctrl-mt8183.o
 obj-$(CONFIG_PINCTRL_MT8192)   += pinctrl-mt8192.o
+obj-$(CONFIG_PINCTRL_MT8195)+= pinctrl-mt8195.o
 obj-$(CONFIG_PINCTRL_MT8516)   += pinctrl-mt8516.o
 obj-$(CONFIG_PINCTRL_MT6397)   += pinctrl-mt6397.o
diff --git a/drivers/pinctrl/mediatek/pinctrl-mt8195.c 
b/drivers/pinctrl/mediatek/pinctrl-mt8195.c
new file mode 100644
index ..063f164d7c9b
--- /dev/null
+++ b/drivers/pinctrl/mediatek/pinctrl-mt8195.c
@@ -0,0 +1,828 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020 MediaTek Inc.
+ *
+ * Author: Zhiyong Tao 
+ *
+ */
+
+#include "pinctrl-mtk-mt8195.h"
+#include "pinctrl-paris.h"
+
+/* MT8195 have multiple bases to program pin configuration listed as the below:
+ * iocfg[0]:0x10005000, iocfg[1]:0x11d1, iocfg[2]:0x11d3,
+ * iocfg[3]:0x11d4, iocfg[4]:0x11e2, iocfg[5]:0x11eb,
+ * iocfg[6]:0x11f4.
+ * _i_based could be used to indicate what base the pin should be mapped into.
+ */
+
+#define PIN_FIELD_BASE(s_pin, e_pin, i_base, s_addr, x_addrs, s_bit, x_bits) \
+   PIN_FIELD_CALC(s_pin, e_pin, i_base, s_addr, x_addrs, s_bit, x_bits, \
+  32, 0)
+
+#define PINS_FIELD_BASE(s_pin, e_pin, i_base, s_addr, x_addrs, s_bit, x_bits) \
+   PIN_FIELD_CALC(s_pin, e_pin, i_base, s_addr, x_addrs, s_bit, x_bits,  \
+  32, 1)
+
+static const struct mtk_pin_field_calc mt8195_pin_mode_range[] = {
+   PIN_FIELD(0, 144, 0x300, 0x10, 0, 4),
+};
+
+static const struct mtk_pin_field_calc mt8195_pin_dir_range[] = {
+   PIN_FIELD(0, 144, 0x0, 0x10, 0, 1),
+};
+
+static const struct mtk_pin_field_calc mt8195_pin_di_range[] = {
+   PIN_FIELD(0, 144, 0x200, 0x10, 0, 1),
+};
+
+static const struct mtk_pin_field_calc mt8195_pin_do_range[] = {
+   PIN_FIELD(0, 144, 0x100, 0x10, 0, 1),
+};
+
+static const struct mtk_pin_field_calc mt8195_pin_ies_range[] = {
+   PIN_FIELD_BASE(0, 0, 4, 0x040, 0x10, 0, 1),
+   PIN_FIELD_BASE(1, 1, 4, 0x040, 0x10, 1, 1),
+   PIN_FIELD_BASE(2, 2, 4, 0x040, 0x10, 2, 1),
+   PIN_FIELD_BASE(3, 3, 4, 0x040, 0x10, 3, 1),
+   PIN_FIELD_BASE(4, 4, 4, 0x040, 0x10, 4, 1),
+   PIN_FIELD_BASE(5, 5, 4, 0x040, 0x10, 5, 1),
+   PIN_FIELD_BASE(6, 6, 4, 0x040, 0x10, 6, 1),
+   PIN_FIELD_BASE(7, 7, 4, 0x040, 0x10, 7, 1),
+   PIN_FIELD_BASE(8, 8, 4, 0x040, 0x10, 13, 1),
+   PIN_FIELD_BASE(9, 9, 4, 0x040, 0x10, 8, 1),
+   PIN_FIELD_BASE(10, 10, 4, 0x040, 0x10, 14, 1),
+   PIN_FIELD_BASE(11, 11, 4, 0x040, 0x10, 9, 1),
+   PIN_FIELD_BASE(12, 12, 4, 0x040, 0x10, 15, 1),
+   PIN_FIELD_BASE(13, 13, 4, 0x040, 0x10, 10, 1),
+   PIN_FIELD_BASE(14, 14, 4, 0x040, 0x10, 16, 1),
+   PIN_FIELD_BASE(15, 15, 4, 0x040, 0x10, 11, 1),
+   PIN_FIELD_BASE(16, 16, 4, 0x040, 0x10, 17, 1),
+   PIN_FIELD_BASE(17, 17, 4, 0x040, 0x10, 12, 1),
+   PIN_FIELD_BASE(18, 18, 2, 0x040, 0x10, 5, 1),
+   PIN_FIELD_BASE(19, 19, 2, 0x040, 0x10, 12, 1),
+   PIN_FIELD_BASE(20, 20, 2, 0x040, 0x10, 11, 1),
+   PIN_FIELD_BASE(21, 21, 2, 0x040, 0x10, 10, 1),
+   PIN_FIELD_BASE(22, 22, 2, 0x040, 0x10, 0, 1),
+   PIN_FIELD_BASE(23, 23, 2, 0x040, 0x10, 1, 1),
+   PIN_FIELD_BASE(24, 24, 2, 0x040, 0x10, 2, 1),
+   PIN_FIELD_BASE(25, 25, 2, 0x040, 0x10, 4, 1),
+   PIN_FIELD_BASE(26, 26, 2, 0x040, 0x10, 3, 1),
+   PIN_FIELD_BASE(27, 27, 2, 0x040, 0x10, 6, 1),
+   PIN_FIELD_BASE(28, 28, 2, 0x040, 0x10, 7, 1),
+   PIN_FIELD_BASE(29, 29, 2, 0x040, 0x10, 8, 1),
+   PIN_FIELD_BASE(30, 30, 2, 0x040, 0x10, 9, 1),
+   PIN_FIELD_BAS

[PATCH v4 4/4] pinctrl: add rsel setting on MT8195

2021-04-12 Thread Zhiyong Tao
This patch provides rsel setting on MT8195.

Signed-off-by: Zhiyong Tao 
---
 drivers/pinctrl/mediatek/pinctrl-mt8195.c | 22 +++
 .../pinctrl/mediatek/pinctrl-mtk-common-v2.c  | 14 
 .../pinctrl/mediatek/pinctrl-mtk-common-v2.h  | 10 +
 drivers/pinctrl/mediatek/pinctrl-paris.c  | 16 ++
 4 files changed, 62 insertions(+)

diff --git a/drivers/pinctrl/mediatek/pinctrl-mt8195.c 
b/drivers/pinctrl/mediatek/pinctrl-mt8195.c
index a7500e18bb1d..66608b8d346a 100644
--- a/drivers/pinctrl/mediatek/pinctrl-mt8195.c
+++ b/drivers/pinctrl/mediatek/pinctrl-mt8195.c
@@ -779,6 +779,25 @@ static const struct mtk_pin_field_calc 
mt8195_pin_drv_adv_range[] = {
PIN_FIELD_BASE(45, 45, 1, 0x040, 0x10, 9, 3),
 };
 
+static const struct mtk_pin_field_calc mt8195_pin_rsel_range[] = {
+   PIN_FIELD_BASE(8, 8, 4, 0x0c0, 0x10, 15, 3),
+   PIN_FIELD_BASE(9, 9, 4, 0x0c0, 0x10, 0, 3),
+   PIN_FIELD_BASE(10, 10, 4, 0x0c0, 0x10, 18, 3),
+   PIN_FIELD_BASE(11, 11, 4, 0x0c0, 0x10, 3, 3),
+   PIN_FIELD_BASE(12, 12, 4, 0x0c0, 0x10, 21, 3),
+   PIN_FIELD_BASE(13, 13, 4, 0x0c0, 0x10, 6, 3),
+   PIN_FIELD_BASE(14, 14, 4, 0x0c0, 0x10, 24, 3),
+   PIN_FIELD_BASE(15, 15, 4, 0x0c0, 0x10, 9, 3),
+   PIN_FIELD_BASE(16, 16, 4, 0x0c0, 0x10, 27, 3),
+   PIN_FIELD_BASE(17, 17, 4, 0x0c0, 0x10, 12, 3),
+   PIN_FIELD_BASE(29, 29, 2, 0x080, 0x10, 0, 3),
+   PIN_FIELD_BASE(30, 30, 2, 0x080, 0x10, 3, 3),
+   PIN_FIELD_BASE(34, 34, 1, 0x0e0, 0x10, 0, 3),
+   PIN_FIELD_BASE(35, 35, 1, 0x0e0, 0x10, 3, 3),
+   PIN_FIELD_BASE(44, 44, 1, 0x0e0, 0x10, 6, 3),
+   PIN_FIELD_BASE(45, 45, 1, 0x0e0, 0x10, 9, 3),
+};
+
 static const struct mtk_pin_reg_calc mt8195_reg_cals[PINCTRL_PIN_REG_MAX] = {
[PINCTRL_PIN_REG_MODE] = MTK_RANGE(mt8195_pin_mode_range),
[PINCTRL_PIN_REG_DIR] = MTK_RANGE(mt8195_pin_dir_range),
@@ -793,6 +812,7 @@ static const struct mtk_pin_reg_calc 
mt8195_reg_cals[PINCTRL_PIN_REG_MAX] = {
[PINCTRL_PIN_REG_R0] = MTK_RANGE(mt8195_pin_r0_range),
[PINCTRL_PIN_REG_R1] = MTK_RANGE(mt8195_pin_r1_range),
[PINCTRL_PIN_REG_DRV_ADV] = MTK_RANGE(mt8195_pin_drv_adv_range),
+   [PINCTRL_PIN_REG_RSEL] = MTK_RANGE(mt8195_pin_rsel_range),
 };
 
 static const char * const mt8195_pinctrl_register_base_names[] = {
@@ -823,6 +843,8 @@ static const struct mtk_pin_soc mt8195_data = {
.drive_get = mtk_pinconf_drive_get_rev1,
.adv_drive_get = mtk_pinconf_adv_drive_get_raw,
.adv_drive_set = mtk_pinconf_adv_drive_set_raw,
+   .rsel_set = mtk_pinconf_rsel_set,
+   .rsel_get = mtk_pinconf_rsel_get,
 };
 
 static const struct of_device_id mt8195_pinctrl_of_match[] = {
diff --git a/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.c 
b/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.c
index 2b51f4a9b860..d1526d0c6248 100644
--- a/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.c
+++ b/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.c
@@ -1041,6 +1041,20 @@ int mtk_pinconf_adv_drive_get_raw(struct mtk_pinctrl *hw,
 }
 EXPORT_SYMBOL_GPL(mtk_pinconf_adv_drive_get_raw);
 
+int mtk_pinconf_rsel_set(struct mtk_pinctrl *hw,
+const struct mtk_pin_desc *desc, u32 arg)
+{
+   return mtk_hw_set_value(hw, desc, PINCTRL_PIN_REG_RSEL, arg);
+}
+EXPORT_SYMBOL_GPL(mtk_pinconf_rsel_set);
+
+int mtk_pinconf_rsel_get(struct mtk_pinctrl *hw,
+const struct mtk_pin_desc *desc, u32 *val)
+{
+   return mtk_hw_get_value(hw, desc, PINCTRL_PIN_REG_RSEL, val);
+}
+EXPORT_SYMBOL_GPL(mtk_pinconf_rsel_get);
+
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Sean Wang ");
 MODULE_DESCRIPTION("Pin configuration library module for mediatek SoCs");
diff --git a/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.h 
b/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.h
index fd5ce9c5dcbd..570e8da7bf38 100644
--- a/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.h
+++ b/drivers/pinctrl/mediatek/pinctrl-mtk-common-v2.h
@@ -67,6 +67,7 @@ enum {
PINCTRL_PIN_REG_DRV_E0,
PINCTRL_PIN_REG_DRV_E1,
PINCTRL_PIN_REG_DRV_ADV,
+   PINCTRL_PIN_REG_RSEL,
PINCTRL_PIN_REG_MAX,
 };
 
@@ -237,6 +238,10 @@ struct mtk_pin_soc {
 const struct mtk_pin_desc *desc, u32 arg);
int (*adv_drive_get)(struct mtk_pinctrl *hw,
 const struct mtk_pin_desc *desc, u32 *val);
+   int (*rsel_set)(struct mtk_pinctrl *hw,
+   const struct mtk_pin_desc *desc, u32 arg);
+   int (*rsel_get)(struct mtk_pinctrl *hw,
+   const struct mtk_pin_desc *desc, u32 *val);
 
/* Specific driver data */
void*driver_data;
@@ -320,5 +325,10 @@ int mtk_pinconf_adv_drive_set_raw(struct mtk_pinctrl *hw,
 int mtk_pinconf_adv_drive_get_raw(struct mtk_pinctrl *hw,
  const struct mtk_pin_desc *desc, u32 *val);
 
+int mtk_pinconf_rse

[PATCH v4 1/4] dt-bindings: pinctrl: mt8195: add pinctrl file and binding document

2021-04-12 Thread Zhiyong Tao
1. This patch adds pinctrl file for mt8195.
2. This patch adds mt8195 compatible node in binding document.

Signed-off-by: Zhiyong Tao 
---
 .../bindings/pinctrl/pinctrl-mt8195.yaml  | 151 +++
 include/dt-bindings/pinctrl/mt8195-pinfunc.h  | 962 ++
 2 files changed, 1113 insertions(+)
 create mode 100644 
Documentation/devicetree/bindings/pinctrl/pinctrl-mt8195.yaml
 create mode 100644 include/dt-bindings/pinctrl/mt8195-pinfunc.h

diff --git a/Documentation/devicetree/bindings/pinctrl/pinctrl-mt8195.yaml 
b/Documentation/devicetree/bindings/pinctrl/pinctrl-mt8195.yaml
new file mode 100644
index ..2f12ec59eee5
--- /dev/null
+++ b/Documentation/devicetree/bindings/pinctrl/pinctrl-mt8195.yaml
@@ -0,0 +1,151 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/pinctrl/pinctrl-mt8195.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Mediatek MT8195 Pin Controller
+
+maintainers:
+  - Sean Wang 
+
+description: |
+  The Mediatek's Pin controller is used to control SoC pins.
+
+properties:
+  compatible:
+const: mediatek,mt8195-pinctrl
+
+  gpio-controller: true
+
+  '#gpio-cells':
+description: |
+  Number of cells in GPIO specifier. Since the generic GPIO binding is 
used,
+  the amount of cells must be specified as 2. See the below
+  mentioned gpio binding representation for description of particular 
cells.
+const: 2
+
+  gpio-ranges:
+description: gpio valid number range.
+maxItems: 1
+
+  reg:
+description: |
+  Physical address base for gpio base registers. There are 8 GPIO
+  physical address bases in mt8195.
+maxItems: 8
+
+  reg-names:
+description: |
+  Gpio base register names.
+maxItems: 8
+
+  interrupt-controller: true
+
+  '#interrupt-cells':
+const: 2
+
+  interrupts:
+description: The interrupt outputs to sysirq.
+maxItems: 1
+
+#PIN CONFIGURATION NODES
+patternProperties:
+  '-pins$':
+type: object
+description: |
+  A pinctrl node should contain at least one subnode representing the
+  pinctrl groups available on the machine. Each subnode will list the
+  pins it needs, and how they should be configured, with regard to muxer
+  configuration, pullups, drive strength, input enable/disable and
+  input schmitt.
+  An example of using macro:
+  pincontroller {
+/* GPIO0 set as multifunction GPIO0 */
+gpio_pin {
+  pinmux = ;
+};
+/* GPIO8 set as multifunction SDA0 */
+i2c0_pin {
+  pinmux = ;
+};
+  };
+$ref: "pinmux-node.yaml"
+
+properties:
+  pinmux:
+description: |
+  Integer array, represents gpio pin number and mux setting.
+  Supported pin number and mux varies for different SoCs, and are 
defined
+  as macros in dt-bindings/pinctrl/-pinfunc.h directly.
+
+  drive-strength:
+description: |
+  It can support some arguments which is from 0 to 7. It can only 
support
+  2/4/6/8/10/12/14/16mA in mt8195.
+enum: [0, 1, 2, 3, 4, 5, 6, 7]
+
+  bias-pull-down: true
+
+  bias-pull-up: true
+
+  bias-disable: true
+
+  output-high: true
+
+  output-low: true
+
+  input-enable: true
+
+  input-disable: true
+
+  input-schmitt-enable: true
+
+  input-schmitt-disable: true
+
+required:
+  - pinmux
+
+additionalProperties: false
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - interrupt-controller
+  - '#interrupt-cells'
+  - gpio-controller
+  - '#gpio-cells'
+  - gpio-ranges
+
+additionalProperties: false
+
+examples:
+  - |
+#include 
+#include 
+pio: pinctrl@10005000 {
+compatible = "mediatek,mt8195-pinctrl";
+reg = <0x10005000 0x1000>,
+  <0x11d1 0x1000>,
+  <0x11d3 0x1000>,
+  <0x11d4 0x1000>,
+  <0x11e2 0x1000>,
+  <0x11eb 0x1000>,
+  <0x11f4 0x1000>,
+  <0x1000b000 0x1000>;
+reg-names = "iocfg0", "iocfg_bm", "iocfg_bl",
+  "iocfg_br", "iocfg_lm", "iocfg_rb",
+  "iocfg_tl", "eint";
+gpio-controller;
+#gpio-cells = <2>;
+gpio-ranges = <&pio 0 0 144>;
+interrupt-controller;
+interrupts = ;
+#interrupt-cells = <2>;
+
+pio-pins {
+  pinmux = ;
+  output-low;
+};
+};
diff --git a/include/dt-bindings/pinctrl/mt8195-pinfunc.h 
b/include/dt-bindings/pinctrl/mt8195-pinfunc.h
new file mode 100644
index ..666331bb9b40
--- /dev/null
+++ b

[PATCH v4 0/4] Mediatek pinctrl patch on mt8195

2021-04-12 Thread Zhiyong Tao
This series includes 4 patches:
1.add pinctrl file and binding document on mt8195.
2.add pinctrl driver on MT8195.
3.add pinctrl drive for I2C related pins on MT8195.
4.add pinctrl rsel setting on MT8195.

Changes in patch v4:
1)fix pinctrl-mt8195.yaml warning error.
2)remove pinctrl device node patch which is based on "mt8195.dtsi".

Changes in patch v3:
1)change '^pins' to '-pins$'.
2)change 'state_0_node_a' to 'gpio_pin' which is defined in dts.
3)change 'state_0_node_b' to 'i2c0_pin' which is defined in dts.
4)reorder this series patches. change pinctrl file and binding document
together in one patch.

There are no changes in v1 & v2.

Zhiyong Tao (4):
  dt-bindings: pinctrl: mt8195: add pinctrl file and binding document
  pinctrl: add pinctrl driver on mt8195
  pinctrl: add drive for I2C related pins on MT8195
  pinctrl: add rsel setting on MT8195

 .../bindings/pinctrl/pinctrl-mt8195.yaml  |  151 ++
 drivers/pinctrl/mediatek/Kconfig  |6 +
 drivers/pinctrl/mediatek/Makefile |1 +
 drivers/pinctrl/mediatek/pinctrl-mt8195.c |  872 +
 .../pinctrl/mediatek/pinctrl-mtk-common-v2.c  |   28 +
 .../pinctrl/mediatek/pinctrl-mtk-common-v2.h  |   15 +
 drivers/pinctrl/mediatek/pinctrl-mtk-mt8195.h | 1669 +
 drivers/pinctrl/mediatek/pinctrl-paris.c  |   16 +
 include/dt-bindings/pinctrl/mt8195-pinfunc.h  |  962 ++
 9 files changed, 3720 insertions(+)
 create mode 100644 
Documentation/devicetree/bindings/pinctrl/pinctrl-mt8195.yaml
 create mode 100644 drivers/pinctrl/mediatek/pinctrl-mt8195.c
 create mode 100644 drivers/pinctrl/mediatek/pinctrl-mtk-mt8195.h
 create mode 100644 include/dt-bindings/pinctrl/mt8195-pinfunc.h

--
2.18.0




[syzbot] BUG: unable to handle kernel NULL pointer dereference in __lookup_slow (2)

2021-04-12 Thread syzbot
Hello,

syzbot found the following issue on:

HEAD commit:d93a0d43 Merge tag 'block-5.12-2021-04-02' of git://git.ke..
git tree:   upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=16519431d0
kernel config:  https://syzkaller.appspot.com/x/.config?x=71a75beb62b62a34
dashboard link: https://syzkaller.appspot.com/bug?extid=11c49ce9d4e7896f3406
compiler:   Debian clang version 11.0.1-2

Unfortunately, I don't have any reproducer for this issue yet.

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+11c49ce9d4e7896f3...@syzkaller.appspotmail.com

REISERFS (device loop4): Using r5 hash to sort names
BUG: kernel NULL pointer dereference, address: 
#PF: supervisor instruction fetch in kernel mode
#PF: error_code(0x0010) - not-present page
PGD 6bb82067 P4D 6bb82067 PUD 6bb81067 PMD 0 
Oops: 0010 [#1] PREEMPT SMP KASAN
CPU: 0 PID: 11072 Comm: syz-executor.4 Not tainted 5.12.0-rc5-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
01/01/2011
RIP: 0010:0x0
Code: Unable to access opcode bytes at RIP 0xffd6.
RSP: 0018:c90008f8fa20 EFLAGS: 00010246
RAX: 113872e8 RBX: dc00 RCX: 0004
RDX:  RSI: 88802e9d9490 RDI: 88807f140190
RBP: 89c39740 R08: 81c9d4de R09: fbfff200a946
R10: fbfff200a946 R11:  R12: 
R13: 88807f140190 R14: 111005d3b292 R15: 88802e9d9490
FS:  7f894af88700() GS:8880b9c0() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: ffd6 CR3: 6bb83000 CR4: 001506f0
DR0:  DR1:  DR2: 
DR3:  DR6: fffe0ff0 DR7: 0400
Call Trace:
 __lookup_slow+0x240/0x370 fs/namei.c:1626
 lookup_one_len+0x10e/0x200 fs/namei.c:2649
 reiserfs_lookup_privroot+0x85/0x1e0 fs/reiserfs/xattr.c:980
 reiserfs_fill_super+0x2a69/0x3160 fs/reiserfs/super.c:2176
 mount_bdev+0x26c/0x3a0 fs/super.c:1367
 legacy_get_tree+0xea/0x180 fs/fs_context.c:592
 vfs_get_tree+0x86/0x270 fs/super.c:1497
 do_new_mount fs/namespace.c:2903 [inline]
 path_mount+0x188a/0x29a0 fs/namespace.c:3233
 do_mount fs/namespace.c:3246 [inline]
 __do_sys_mount fs/namespace.c:3454 [inline]
 __se_sys_mount+0x28c/0x320 fs/namespace.c:3431
 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
 entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x46797a
Code: 48 c7 c2 bc ff ff ff f7 d8 64 89 02 b8 ff ff ff ff eb d2 e8 b8 04 00 00 
0f 1f 84 00 00 00 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 
c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48
RSP: 002b:7f894af87fa8 EFLAGS: 0206 ORIG_RAX: 00a5
RAX: ffda RBX: 2200 RCX: 0046797a
RDX: 2000 RSI: 2100 RDI: 7f894af88000
RBP: 7f894af88040 R08: 7f894af88040 R09: 2000
R10:  R11: 0206 R12: 2000
R13: 2100 R14: 7f894af88000 R15: 20011500
Modules linked in:
CR2: 
---[ end trace a1b8dbb111baf993 ]---
RIP: 0010:0x0
Code: Unable to access opcode bytes at RIP 0xffd6.
RSP: 0018:c90008f8fa20 EFLAGS: 00010246
RAX: 113872e8 RBX: dc00 RCX: 0004
RDX:  RSI: 88802e9d9490 RDI: 88807f140190
RBP: 89c39740 R08: 81c9d4de R09: fbfff200a946
R10: fbfff200a946 R11:  R12: 
R13: 88807f140190 R14: 111005d3b292 R15: 88802e9d9490
FS:  7f894af88700() GS:8880b9c0() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: ffd6 CR3: 6bb83000 CR4: 001506f0
DR0:  DR1:  DR2: 
DR3:  DR6: fffe0ff0 DR7: 0400


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkal...@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.


[syzbot] KASAN: slab-out-of-bounds Read in reiserfs_xattr_get

2021-04-12 Thread syzbot
Hello,

syzbot found the following issue on:

HEAD commit:3a229812 Merge tag 'arm-fixes-5.11-2' of git://git.kernel...
git tree:   upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=16b4d196d0
kernel config:  https://syzkaller.appspot.com/x/.config?x=f91155ccddaf919c
dashboard link: https://syzkaller.appspot.com/bug?extid=72ba979b6681c3369db4
compiler:   Debian clang version 11.0.1-2

Unfortunately, I don't have any reproducer for this issue yet.

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+72ba979b6681c3369...@syzkaller.appspotmail.com

loop3: detected capacity change from 0 to 65534
==
BUG: KASAN: slab-out-of-bounds in reiserfs_xattr_get+0xe0/0x590 
fs/reiserfs/xattr.c:681
Read of size 8 at addr 888028983198 by task syz-executor.3/4211

CPU: 1 PID: 4211 Comm: syz-executor.3 Not tainted 5.12.0-rc6-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:79 [inline]
 dump_stack+0x176/0x24e lib/dump_stack.c:120
 print_address_description+0x5f/0x3a0 mm/kasan/report.c:232
 __kasan_report mm/kasan/report.c:399 [inline]
 kasan_report+0x15c/0x200 mm/kasan/report.c:416
 reiserfs_xattr_get+0xe0/0x590 fs/reiserfs/xattr.c:681
 reiserfs_get_acl+0x63/0x670 fs/reiserfs/xattr_acl.c:211
 get_acl+0x152/0x2e0 fs/posix_acl.c:141
 check_acl fs/namei.c:294 [inline]
 acl_permission_check fs/namei.c:339 [inline]
 generic_permission+0x2ed/0x5b0 fs/namei.c:392
 do_inode_permission fs/namei.c:446 [inline]
 inode_permission+0x28e/0x500 fs/namei.c:513
 may_open+0x228/0x3e0 fs/namei.c:2985
 do_open fs/namei.c:3365 [inline]
 path_openat+0x2697/0x3860 fs/namei.c:3500
 do_filp_open+0x1a3/0x3b0 fs/namei.c:3527
 do_sys_openat2+0xba/0x380 fs/open.c:1187
 do_sys_open fs/open.c:1203 [inline]
 __do_sys_openat fs/open.c:1219 [inline]
 __se_sys_openat fs/open.c:1214 [inline]
 __x64_sys_openat+0x1c8/0x1f0 fs/open.c:1214
 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
 entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x419544
Code: 84 00 00 00 00 00 44 89 54 24 0c e8 96 f9 ff ff 44 8b 54 24 0c 44 89 e2 
48 89 ee 41 89 c0 bf 9c ff ff ff b8 01 01 00 00 0f 05 <48> 3d 00 f0 ff ff 77 34 
44 89 c7 89 44 24 0c e8 c8 f9 ff ff 8b 44
RSP: 002b:7fa357a03f30 EFLAGS: 0293 ORIG_RAX: 0101
RAX: ffda RBX: 2200 RCX: 00419544
RDX: 0001 RSI: 2100 RDI: ff9c
RBP: 2100 R08:  R09: 2000
R10:  R11: 0293 R12: 0001
R13: 2100 R14: 7fa357a04000 R15: 20065600

Allocated by task 4210:
 kasan_save_stack mm/kasan/common.c:38 [inline]
 kasan_set_track mm/kasan/common.c:46 [inline]
 set_alloc_info mm/kasan/common.c:427 [inline]
 kasan_kmalloc+0xc2/0xf0 mm/kasan/common.c:506
 kasan_kmalloc include/linux/kasan.h:233 [inline]
 kmem_cache_alloc_trace+0x21b/0x350 mm/slub.c:2934
 kmalloc include/linux/slab.h:554 [inline]
 kzalloc include/linux/slab.h:684 [inline]
 smk_fetch security/smack/smack_lsm.c:288 [inline]
 smack_d_instantiate+0x65c/0xcc0 security/smack/smack_lsm.c:3411
 security_d_instantiate+0xa5/0x100 security/security.c:1987
 d_instantiate_new+0x61/0x110 fs/dcache.c:2025
 ext4_add_nondir+0x22b/0x290 fs/ext4/namei.c:2590
 ext4_symlink+0x8ce/0xe90 fs/ext4/namei.c:3417
 vfs_symlink+0x3a0/0x540 fs/namei.c:4178
 do_symlinkat+0x1c9/0x440 fs/namei.c:4208
 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
 entry_SYSCALL_64_after_hwframe+0x44/0xae

Freed by task 4210:
 kasan_save_stack mm/kasan/common.c:38 [inline]
 kasan_set_track+0x3d/0x70 mm/kasan/common.c:46
 kasan_set_free_info+0x1f/0x40 mm/kasan/generic.c:357
 kasan_slab_free+0x100/0x140 mm/kasan/common.c:360
 kasan_slab_free include/linux/kasan.h:199 [inline]
 slab_free_hook mm/slub.c:1562 [inline]
 slab_free_freelist_hook+0x171/0x270 mm/slub.c:1600
 slab_free mm/slub.c:3161 [inline]
 kfree+0xcf/0x2d0 mm/slub.c:4213
 smk_fetch security/smack/smack_lsm.c:300 [inline]
 smack_d_instantiate+0x6db/0xcc0 security/smack/smack_lsm.c:3411
 security_d_instantiate+0xa5/0x100 security/security.c:1987
 d_instantiate_new+0x61/0x110 fs/dcache.c:2025
 ext4_add_nondir+0x22b/0x290 fs/ext4/namei.c:2590
 ext4_symlink+0x8ce/0xe90 fs/ext4/namei.c:3417
 vfs_symlink+0x3a0/0x540 fs/namei.c:4178
 do_symlinkat+0x1c9/0x440 fs/namei.c:4208
 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
 entry_SYSCALL_64_after_hwframe+0x44/0xae

Last potentially related work creation:
 kasan_save_stack+0x27/0x50 mm/kasan/common.c:38
 kasan_record_aux_stack+0xee/0x120 mm/kasan/generic.c:345
 __call_rcu kernel/rcu/tree.c:3039 [inline]
 call_rcu+0x130/0x8e0 kernel/rcu/tree.c:3114
 fib6_info_release include/net/ip6_fib.h:337 [inline]
 nsim_rt6_release drivers/net/netdevsim/fib.c:507 [inline]
 nsim_fib6_event_fini+0x100/0x1f0 drivers

Re: [PATCH] message/fusion: Use BUG_ON instead of if condition followed by BUG.

2021-04-12 Thread Martin K. Petersen
On Tue, 30 Mar 2021 05:46:01 -0700, zhouchuangao wrote:

> BUG_ON() uses unlikely in if(), which can be optimized at compile time.

Applied to 5.13/scsi-queue, thanks!

[1/1] message/fusion: Use BUG_ON instead of if condition followed by BUG.
  https://git.kernel.org/mkp/scsi/c/4dec8004de29

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH] scsi: bfa: Remove unnecessary struct declaration

2021-04-12 Thread Martin K. Petersen
On Thu, 1 Apr 2021 14:35:34 +0800, Wan Jiabing wrote:

> struct bfa_fcs_s is declared twice. One is declared
> at 50th line. Remove the duplicate.
> struct bfa_fcs_fabric_s is defined at 175th line.
> Remove unnecessary declaration.

Applied to 5.13/scsi-queue, thanks!

[1/1] scsi: bfa: Remove unnecessary struct declaration
  https://git.kernel.org/mkp/scsi/c/c3b0d087763f

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH][next] scsi: ufs: Fix out-of-bounds warnings in ufshcd_exec_raw_upiu_cmd

2021-04-12 Thread Martin K. Petersen
On Wed, 31 Mar 2021 17:43:38 -0500, Gustavo A. R. Silva wrote:

> Fix the following out-of-bounds warnings by enclosing
> some structure members into new structure objects upiu_req
> and upiu_rsp:
> 
> include/linux/fortify-string.h:20:29: warning: '__builtin_memcpy' offset [29, 
> 48] from the object at 'treq' is out of the bounds of referenced subobject 
> 'req_header' with type 'struct utp_upiu_header' at offset 16 [-Warray-bounds]
> include/linux/fortify-string.h:20:29: warning: '__builtin_memcpy' offset [61, 
> 80] from the object at 'treq' is out of the bounds of referenced subobject 
> 'rsp_header' with type 'struct utp_upiu_header' at offset 48 [-Warray-bounds]
> arch/m68k/include/asm/string.h:72:25: warning: '__builtin_memcpy' offset [29, 
> 48] from the object at 'treq' is out of the bounds of referenced subobject 
> 'req_header' with type 'struct utp_upiu_header' at offset 16 [-Warray-bounds]
> arch/m68k/include/asm/string.h:72:25: warning: '__builtin_memcpy' offset [61, 
> 80] from the object at 'treq' is out of the bounds of referenced subobject 
> 'rsp_header' with type 'struct utp_upiu_header' at offset 48 [-Warray-bounds]
> 
> [...]

Applied to 5.13/scsi-queue, thanks!

[1/1] scsi: ufs: Fix out-of-bounds warnings in ufshcd_exec_raw_upiu_cmd
  https://git.kernel.org/mkp/scsi/c/1352eec8c0da

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH -next] scsi: fnic: remove unnecessary spin_lock_init() and INIT_LIST_HEAD()

2021-04-12 Thread Martin K. Petersen
On Tue, 30 Mar 2021 20:59:11 +0800, Yang Yingliang wrote:

> The spinlock and list head of fnic_list is initialized statically.
> It is unnecessary to initialize by spin_lock_init() and INIT_LIST_HEAD().

Applied to 5.13/scsi-queue, thanks!

[1/1] scsi: fnic: remove unnecessary spin_lock_init() and INIT_LIST_HEAD()
  https://git.kernel.org/mkp/scsi/c/aa6f2fccd711

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH v7 3/4] spmi: mediatek: Add support for MT6873/8192

2021-04-12 Thread Hsin-hsiung Wang
Hi Maintainers,
Gentle ping for this patch.

Thanks.

On Sun, 2021-03-14 at 02:00 +0800, Hsin-Hsiung Wang wrote:
> Add spmi support for MT6873/8192.
> 
> Signed-off-by: Hsin-Hsiung Wang 
> ---
> changes since v6:
> - remove unused spinlock.
> - remove redundant check for slave id.
> ---
>  drivers/spmi/Kconfig |  10 +
>  drivers/spmi/Makefile|   2 +
>  drivers/spmi/spmi-mtk-pmif.c | 465 +++
>  3 files changed, 477 insertions(+)
>  create mode 100644 drivers/spmi/spmi-mtk-pmif.c
> 
> diff --git a/drivers/spmi/Kconfig b/drivers/spmi/Kconfig
> index a53bad541f1a..692bac98a120 100644
> --- a/drivers/spmi/Kconfig
> +++ b/drivers/spmi/Kconfig
> @@ -25,4 +25,14 @@ config SPMI_MSM_PMIC_ARB
> This is required for communicating with Qualcomm PMICs and
> other devices that have the SPMI interface.
>  
> +config SPMI_MTK_PMIF
> + tristate "Mediatek SPMI Controller (PMIC Arbiter)"
> + help
> +   If you say yes to this option, support will be included for the
> +   built-in SPMI PMIC Arbiter interface on Mediatek family
> +   processors.
> +
> +   This is required for communicating with Mediatek PMICs and
> +   other devices that have the SPMI interface.
> +
>  endif
> diff --git a/drivers/spmi/Makefile b/drivers/spmi/Makefile
> index 55a94cadeffe..76fb3b3ab510 100644
> --- a/drivers/spmi/Makefile
> +++ b/drivers/spmi/Makefile
> @@ -5,3 +5,5 @@
>  obj-$(CONFIG_SPMI)   += spmi.o
>  
>  obj-$(CONFIG_SPMI_MSM_PMIC_ARB)  += spmi-pmic-arb.o
> +obj-$(CONFIG_SPMI_MTK_PMIF)  += spmi-mtk-pmif.o
> +
> diff --git a/drivers/spmi/spmi-mtk-pmif.c b/drivers/spmi/spmi-mtk-pmif.c
> new file mode 100644
> index ..94c45d46ab0c
> --- /dev/null
> +++ b/drivers/spmi/spmi-mtk-pmif.c
> @@ -0,0 +1,465 @@
> +// SPDX-License-Identifier: GPL-2.0
> +//
> +// Copyright (c) 2021 MediaTek Inc.
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#define SWINF_IDLE   0x00
> +#define SWINF_WFVLDCLR   0x06
> +
> +#define GET_SWINF(x) (((x) >> 1) & 0x7)
> +
> +#define PMIF_CMD_REG_0   0
> +#define PMIF_CMD_REG 1
> +#define PMIF_CMD_EXT_REG 2
> +#define PMIF_CMD_EXT_REG_LONG3
> +
> +#define PMIF_DELAY_US   10
> +#define PMIF_TIMEOUT_US (10 * 1000)
> +
> +#define PMIF_CHAN_OFFSET 0x5
> +
> +#define PMIF_MAX_CLKS3
> +
> +#define SPMI_OP_ST_BUSY 1
> +
> +struct ch_reg {
> + u32 ch_sta;
> + u32 wdata;
> + u32 rdata;
> + u32 ch_send;
> + u32 ch_rdy;
> +};
> +
> +struct pmif_data {
> + const u32   *regs;
> + const u32   *spmimst_regs;
> + u32 soc_chan;
> +};
> +
> +struct pmif {
> + void __iomem*base;
> + void __iomem*spmimst_base;
> + struct ch_reg   chan;
> + struct clk_bulk_data clks[PMIF_MAX_CLKS];
> + u32 nclks;
> + const struct pmif_data *data;
> +};
> +
> +static const char * const pmif_clock_names[] = {
> + "pmif_sys_ck", "pmif_tmr_ck", "spmimst_clk_mux",
> +};
> +
> +enum pmif_regs {
> + PMIF_INIT_DONE,
> + PMIF_INF_EN,
> + PMIF_ARB_EN,
> + PMIF_CMDISSUE_EN,
> + PMIF_TIMER_CTRL,
> + PMIF_SPI_MODE_CTRL,
> + PMIF_IRQ_EVENT_EN_0,
> + PMIF_IRQ_FLAG_0,
> + PMIF_IRQ_CLR_0,
> + PMIF_IRQ_EVENT_EN_1,
> + PMIF_IRQ_FLAG_1,
> + PMIF_IRQ_CLR_1,
> + PMIF_IRQ_EVENT_EN_2,
> + PMIF_IRQ_FLAG_2,
> + PMIF_IRQ_CLR_2,
> + PMIF_IRQ_EVENT_EN_3,
> + PMIF_IRQ_FLAG_3,
> + PMIF_IRQ_CLR_3,
> + PMIF_IRQ_EVENT_EN_4,
> + PMIF_IRQ_FLAG_4,
> + PMIF_IRQ_CLR_4,
> + PMIF_WDT_EVENT_EN_0,
> + PMIF_WDT_FLAG_0,
> + PMIF_WDT_EVENT_EN_1,
> + PMIF_WDT_FLAG_1,
> + PMIF_SWINF_0_STA,
> + PMIF_SWINF_0_WDATA_31_0,
> + PMIF_SWINF_0_RDATA_31_0,
> + PMIF_SWINF_0_ACC,
> + PMIF_SWINF_0_VLD_CLR,
> + PMIF_SWINF_1_STA,
> + PMIF_SWINF_1_WDATA_31_0,
> + PMIF_SWINF_1_RDATA_31_0,
> + PMIF_SWINF_1_ACC,
> + PMIF_SWINF_1_VLD_CLR,
> + PMIF_SWINF_2_STA,
> + PMIF_SWINF_2_WDATA_31_0,
> + PMIF_SWINF_2_RDATA_31_0,
> + PMIF_SWINF_2_ACC,
> + PMIF_SWINF_2_VLD_CLR,
> + PMIF_SWINF_3_STA,
> + PMIF_SWINF_3_WDATA_31_0,
> + PMIF_SWINF_3_RDATA_31_0,
> + PMIF_SWINF_3_ACC,
> + PMIF_SWINF_3_VLD_CLR,
> +};
> +
> +static const u32 mt6873_regs[] = {
> + [PMIF_INIT_DONE] =  0x,
> + [PMIF_INF_EN] = 0x0024,
> + [PMIF_ARB_EN] = 0x0150,
> + [PMIF_CMDISSUE_EN] =0x03B4,
> + [PMIF_TIMER_CTRL] = 0x03E0,
> + [PMIF_SPI_MODE_CTRL] =  0x0400,
> + [PMIF_IRQ_EVENT_EN_0] = 0x0418,
> + [PMIF_IRQ_FLAG_0] = 0x0420,
> + [PMIF_IRQ_CLR_0] =  0x0424,
> + [PMIF_IRQ_EVENT_EN_1] = 0x0428,
> + [PMIF_IRQ_FLAG_1] = 0x0430,
> + [PMIF_IRQ_CLR_1] =  0x0434,
> + [PMIF_IRQ_EVENT_EN_2] = 0x0438,
> + [PMIF_IRQ_FLAG_2] = 0x0440,
> + [PMIF_IRQ_CLR_2] =  0x0444,
> + [PMIF_IRQ_EVENT_EN_3] = 0x0448,
> + [PMIF_IRQ_FLAG_3] = 0x0

[PATCH RFC v2 0/4] virtio net: spurious interrupt related fixes

2021-04-12 Thread Michael S. Tsirkin
With the implementation of napi-tx in virtio driver, we clean tx
descriptors from rx napi handler, for the purpose of reducing tx
complete interrupts. But this introduces a race where tx complete
interrupt has been raised, but the handler finds there is no work to do
because we have done the work in the previous rx interrupt handler.
A similar issue exists with polling from start_xmit, it is however
less common because of the delayed cb optimization of the split ring -
but will likely affect the packed ring once that is more common.

In particular, this was reported to lead to the following warning msg:
[ 3588.010778] irq 38: nobody cared (try booting with the
"irqpoll" option)
[ 3588.017938] CPU: 4 PID: 0 Comm: swapper/4 Not tainted
5.3.0-19-generic #20~18.04.2-Ubuntu
[ 3588.017940] Call Trace:
[ 3588.017942]  
[ 3588.017951]  dump_stack+0x63/0x85
[ 3588.017953]  __report_bad_irq+0x35/0xc0
[ 3588.017955]  note_interrupt+0x24b/0x2a0
[ 3588.017956]  handle_irq_event_percpu+0x54/0x80
[ 3588.017957]  handle_irq_event+0x3b/0x60
[ 3588.017958]  handle_edge_irq+0x83/0x1a0
[ 3588.017961]  handle_irq+0x20/0x30
[ 3588.017964]  do_IRQ+0x50/0xe0
[ 3588.017966]  common_interrupt+0xf/0xf
[ 3588.017966]  
[ 3588.017989] handlers:
[ 3588.020374] [<1b9f1da8>] vring_interrupt
[ 3588.025099] Disabling IRQ #38

This patchset attempts to fix this by cleaning up a bunch of races
related to the handling of sq callbacks (aka tx interrupts).
Very lightly tested, sending out for help with testing, early feedback
and flames. Thanks!

Michael S. Tsirkin (4):
  virtio: fix up virtio_disable_cb
  virtio_net: disable cb aggressively
  virtio_net: move tx vq operation under tx queue lock
  virtio_net: move txq wakeups under tx q lock

 drivers/net/virtio_net.c | 35 +--
 drivers/virtio/virtio_ring.c | 26 +-
 2 files changed, 54 insertions(+), 7 deletions(-)

-- 
MST



Re: [PATCH v1 0/2] scsi: libsas: few clean up patches

2021-04-12 Thread Martin K. Petersen
On Thu, 25 Mar 2021 20:29:54 +0800, Luo Jiaxing wrote:

> Two types of errors are detected by the checkpatch.
> 1. Alignment between switches and cases
> 2. Improper use of some spaces
> 
> Here are the clean up patches.
> 
> Luo Jiaxing (2):
>   scsi: libsas: make switch and case at the same indent in
> sas_to_ata_err()
>   scsi: libsas: clean up for white spaces
> 
> [...]

Applied to 5.13/scsi-queue, thanks!

[1/2] scsi: libsas: make switch and case at the same indent in sas_to_ata_err()
  https://git.kernel.org/mkp/scsi/c/c03f2422b9f5
[2/2] scsi: libsas: clean up for white spaces
  https://git.kernel.org/mkp/scsi/c/857a80bbd732

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH v2] scsi: libsas: Reset num_scatter if libata mark qc as NODATA

2021-04-12 Thread Martin K. Petersen
On Thu, 18 Mar 2021 15:56:32 -0700, Jolly Shah wrote:

> When the cache_type for the scsi device is changed, the scsi layer
> issues a MODE_SELECT command. The caching mode details are communicated
> via a request buffer associated with the scsi command with data
> direction set as DMA_TO_DEVICE (scsi_mode_select). When this command
> reaches the libata layer, as a part of generic initial setup, libata
> layer sets up the scatterlist for the command using the scsi command
> (ata_scsi_qc_new). This command is then translated by the libata layer
> into ATA_CMD_SET_FEATURES (ata_scsi_mode_select_xlat). The libata layer
> treats this as a non data command (ata_mselect_caching), since it only
> needs an ata taskfile to pass the caching on/off information to the
> device. It does not need the scatterlist that has been setup, so it does
> not perform dma_map_sg on the scatterlist (ata_qc_issue). Unfortunately,
> when this command reaches the libsas layer(sas_ata_qc_issue), libsas
> layer sees it as a non data command with a scatterlist. It cannot
> extract the correct dma length, since the scatterlist has not been
> mapped with dma_map_sg for a DMA operation. When this partially
> constructed SAS task reaches pm80xx LLDD, it results in below warning.
> 
> [...]

Applied to 5.12/scsi-fixes, thanks!

[1/1] scsi: libsas: Reset num_scatter if libata mark qc as NODATA
  https://git.kernel.org/mkp/scsi/c/176ddd89171d

-- 
Martin K. Petersen  Oracle Linux Engineering


[PATCH RFC v2 4/4] virtio_net: move txq wakeups under tx q lock

2021-04-12 Thread Michael S. Tsirkin
We currently check num_free outside tx q lock
which is unsafe: new packets can arrive meanwhile
and there won't be space in the queue.
The result is a spurious queue wakeup, causing overhead
and even packet drops.

Move the check under the lock to fix that.

Signed-off-by: Michael S. Tsirkin 
---
 drivers/net/virtio_net.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 460ccdbb840e..febaf55ec1f6 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1431,11 +1431,12 @@ static void virtnet_poll_cleantx(struct receive_queue 
*rq)
if (__netif_tx_trylock(txq)) {
virtqueue_disable_cb(sq->vq);
free_old_xmit_skbs(sq, true);
+
+   if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
+   netif_tx_wake_queue(txq);
+
__netif_tx_unlock(txq);
}
-
-   if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
-   netif_tx_wake_queue(txq);
 }
 
 static int virtnet_poll(struct napi_struct *napi, int budget)
@@ -1519,6 +1520,9 @@ static int virtnet_poll_tx(struct napi_struct *napi, int 
budget)
virtqueue_disable_cb(sq->vq);
free_old_xmit_skbs(sq, true);
 
+   if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
+   netif_tx_wake_queue(txq);
+
opaque = virtqueue_enable_cb_prepare(sq->vq);
 
done = napi_complete_done(napi, 0);
@@ -1539,9 +1543,6 @@ static int virtnet_poll_tx(struct napi_struct *napi, int 
budget)
}
}
 
-   if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
-   netif_tx_wake_queue(txq);
-
return 0;
 }
 
-- 
MST



[PATCH RFC v2 3/4] virtio_net: move tx vq operation under tx queue lock

2021-04-12 Thread Michael S. Tsirkin
It's unsafe to operate a vq from multiple threads.
Unfortunately this is exactly what we do when invoking
clean tx poll from rx napi.
As a fix, move everything that deals with the vq under the tx lock.

Signed-off-by: Michael S. Tsirkin 
---
 drivers/net/virtio_net.c | 22 +-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 16d5abed582c..460ccdbb840e 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1505,6 +1505,8 @@ static int virtnet_poll_tx(struct napi_struct *napi, int 
budget)
struct virtnet_info *vi = sq->vq->vdev->priv;
unsigned int index = vq2txq(sq->vq);
struct netdev_queue *txq;
+   int opaque;
+   bool done;
 
if (unlikely(is_xdp_raw_buffer_queue(vi, index))) {
/* We don't need to enable cb for XDP */
@@ -1514,10 +1516,28 @@ static int virtnet_poll_tx(struct napi_struct *napi, 
int budget)
 
txq = netdev_get_tx_queue(vi->dev, index);
__netif_tx_lock(txq, raw_smp_processor_id());
+   virtqueue_disable_cb(sq->vq);
free_old_xmit_skbs(sq, true);
+
+   opaque = virtqueue_enable_cb_prepare(sq->vq);
+
+   done = napi_complete_done(napi, 0);
+
+   if (!done)
+   virtqueue_disable_cb(sq->vq);
+
__netif_tx_unlock(txq);
 
-   virtqueue_napi_complete(napi, sq->vq, 0);
+   if (done) {
+   if (unlikely(virtqueue_poll(sq->vq, opaque))) {
+   if (napi_schedule_prep(napi)) {
+   __netif_tx_lock(txq, raw_smp_processor_id());
+   virtqueue_disable_cb(sq->vq);
+   __netif_tx_unlock(txq);
+   __napi_schedule(napi);
+   }
+   }
+   }
 
if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS)
netif_tx_wake_queue(txq);
-- 
MST



[PATCH RFC v2 2/4] virtio_net: disable cb aggressively

2021-04-12 Thread Michael S. Tsirkin
There are currently two cases where we poll TX vq not in response to a
callback: start xmit and rx napi.  We currently do this with callbacks
enabled, which can cause extra interrupts from the card.  This used not to be
a big issue as we ran with interrupts disabled, but that is no longer the
case, and in some cases the rate of spurious interrupts is so high
linux detects this and actually kills the interrupt.

Fix up by disabling the callbacks before polling the tx vq.

Signed-off-by: Michael S. Tsirkin 
---
 drivers/net/virtio_net.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 82e520d2cb12..16d5abed582c 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1429,6 +1429,7 @@ static void virtnet_poll_cleantx(struct receive_queue *rq)
return;
 
if (__netif_tx_trylock(txq)) {
+   virtqueue_disable_cb(sq->vq);
free_old_xmit_skbs(sq, true);
__netif_tx_unlock(txq);
}
@@ -1582,6 +1583,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct 
net_device *dev)
bool use_napi = sq->napi.weight;
 
/* Free up any pending old buffers before queueing new ones. */
+   virtqueue_disable_cb(sq->vq);
free_old_xmit_skbs(sq, false);
 
if (use_napi && kick)
-- 
MST



[PATCH RFC v2 1/4] virtio: fix up virtio_disable_cb

2021-04-12 Thread Michael S. Tsirkin
virtio_disable_cb is currently a nop for split ring with event index.
This is because it used to be always called from a callback when we know
device won't trigger more events until we update the index.  However,
now that we run with interrupts enabled a lot we also poll without a
callback so that is different: disabling callbacks will help reduce the
number of spurious interrupts.
Further, if using event index with a packed ring, and if being called
from a callback, we actually do disable interrupts which is unnecessary.

Fix both issues by tracking whenever we get a callback. If that is
the case disabling interrupts with event index can be a nop.
If not the case disable interrupts. Note: with a split ring
there's no explicit "no interrupts" value. For now we write
a fixed value so our chance of triggering an interrupt
is 1/ring size. It's probably better to write something
related to the last used index there to reduce the chance
even further. For now I'm keeping it simple.

Signed-off-by: Michael S. Tsirkin 
---
 drivers/virtio/virtio_ring.c | 26 +-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 71e16b53e9c1..88f0b16b11b8 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -113,6 +113,9 @@ struct vring_virtqueue {
/* Last used index we've seen. */
u16 last_used_idx;
 
+   /* Hint for event idx: already triggered no need to disable. */
+   bool event_triggered;
+
union {
/* Available for split ring */
struct {
@@ -739,7 +742,10 @@ static void virtqueue_disable_cb_split(struct virtqueue 
*_vq)
 
if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
vq->split.avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
-   if (!vq->event)
+   if (vq->event)
+   /* TODO: this is a hack. Figure out a cleaner value to 
write. */
+   vring_used_event(&vq->split.vring) = 0x0;
+   else
vq->split.vring.avail->flags =
cpu_to_virtio16(_vq->vdev,
vq->split.avail_flags_shadow);
@@ -1605,6 +1611,7 @@ static struct virtqueue *vring_create_virtqueue_packed(
vq->weak_barriers = weak_barriers;
vq->broken = false;
vq->last_used_idx = 0;
+   vq->event_triggered = false;
vq->num_added = 0;
vq->packed_ring = true;
vq->use_dma_api = vring_use_dma_api(vdev);
@@ -1919,6 +1926,12 @@ void virtqueue_disable_cb(struct virtqueue *_vq)
 {
struct vring_virtqueue *vq = to_vvq(_vq);
 
+   /* If device triggered an event already it won't trigger one again:
+* no need to disable.
+*/
+   if (vq->event_triggered)
+   return;
+
if (vq->packed_ring)
virtqueue_disable_cb_packed(_vq);
else
@@ -1942,6 +1955,9 @@ unsigned virtqueue_enable_cb_prepare(struct virtqueue 
*_vq)
 {
struct vring_virtqueue *vq = to_vvq(_vq);
 
+   if (vq->event_triggered)
+   vq->event_triggered = false;
+
return vq->packed_ring ? virtqueue_enable_cb_prepare_packed(_vq) :
 virtqueue_enable_cb_prepare_split(_vq);
 }
@@ -2005,6 +2021,9 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
 {
struct vring_virtqueue *vq = to_vvq(_vq);
 
+   if (vq->event_triggered)
+   vq->event_triggered = false;
+
return vq->packed_ring ? virtqueue_enable_cb_delayed_packed(_vq) :
 virtqueue_enable_cb_delayed_split(_vq);
 }
@@ -2044,6 +2063,10 @@ irqreturn_t vring_interrupt(int irq, void *_vq)
if (unlikely(vq->broken))
return IRQ_HANDLED;
 
+   /* Just a hint for performance: so it's ok that this can be racy! */
+   if (vq->event)
+   vq->event_triggered = true;
+
pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback);
if (vq->vq.callback)
vq->vq.callback(&vq->vq);
@@ -2083,6 +2106,7 @@ struct virtqueue *__vring_new_virtqueue(unsigned int 
index,
vq->weak_barriers = weak_barriers;
vq->broken = false;
vq->last_used_idx = 0;
+   vq->event_triggered = false;
vq->num_added = 0;
vq->use_dma_api = vring_use_dma_api(vdev);
 #ifdef DEBUG
-- 
MST



[syzbot] KASAN: use-after-free Read in skcipher_walk_next

2021-04-12 Thread syzbot
Hello,

syzbot found the following issue on:

HEAD commit:4fa56ad0 Merge tag 'for-linus' of git://git.kernel.org/pub..
git tree:   upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=17dbd09ad0
kernel config:  https://syzkaller.appspot.com/x/.config?x=9320464bf47598bd
dashboard link: https://syzkaller.appspot.com/bug?extid=4061a98a8ab454dde8ff

Unfortunately, I don't have any reproducer for this issue yet.

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+4061a98a8ab454dde...@syzkaller.appspotmail.com

==
BUG: KASAN: use-after-free in memcpy include/linux/fortify-string.h:191 [inline]
BUG: KASAN: use-after-free in skcipher_next_copy crypto/skcipher.c:292 [inline]
BUG: KASAN: use-after-free in skcipher_walk_next+0xb69/0x1680 
crypto/skcipher.c:379
Read of size 2785 at addr 8880781c by task kworker/u4:3/204

CPU: 0 PID: 204 Comm: kworker/u4:3 Not tainted 5.12.0-rc6-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
01/01/2011
Workqueue: pencrypt_parallel padata_parallel_worker
Call Trace:
 __dump_stack lib/dump_stack.c:79 [inline]
 dump_stack+0x141/0x1d7 lib/dump_stack.c:120
 print_address_description.constprop.0.cold+0x5b/0x2f8 mm/kasan/report.c:232
 __kasan_report mm/kasan/report.c:399 [inline]
 kasan_report.cold+0x7c/0xd8 mm/kasan/report.c:416
 check_region_inline mm/kasan/generic.c:180 [inline]
 kasan_check_range+0x13d/0x180 mm/kasan/generic.c:186
 memcpy+0x20/0x60 mm/kasan/shadow.c:65
 memcpy include/linux/fortify-string.h:191 [inline]
 skcipher_next_copy crypto/skcipher.c:292 [inline]
 skcipher_walk_next+0xb69/0x1680 crypto/skcipher.c:379
 skcipher_walk_done+0x7a3/0xf00 crypto/skcipher.c:159
 gcmaes_crypt_by_sg+0x377/0x8a0 arch/x86/crypto/aesni-intel_glue.c:694

The buggy address belongs to the page:
page:ea0001e07000 refcount:0 mapcount:-128 mapping: 
index:0x1 pfn:0x781c0
flags: 0xfff000()
raw: 00fff000 ea0001e06808 ea0001c67008 
raw: 0001 0004 ff7f 
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 8880781bff00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 8880781bff80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>8880781c: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
   ^
 8880781c0080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
 8880781c0100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
==


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkal...@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.


linux-next: manual merge of the kvm-arm tree with the arm64 tree

2021-04-12 Thread Stephen Rothwell
Hi all,

Today's linux-next merge of the kvm-arm tree got a conflict in:

  arch/arm64/include/asm/assembler.h

between commits:

  27248fe1abb2 ("arm64: assembler: remove conditional NEON yield macros")
  13150149aa6d ("arm64: fpsimd: run kernel mode NEON with softirqs disabled")

from the arm64 tree and commits:

  8f4de66e247b ("arm64: asm: Provide set_sctlr_el2 macro")
  755db23420a1 ("KVM: arm64: Generate final CTR_EL0 value when running in 
Protected mode")

from the kvm-arm tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc arch/arm64/include/asm/assembler.h
index ab569b0b45fc,34ddd8a0f3dd..
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@@ -15,7 -15,7 +15,8 @@@
  #include 
  
  #include 
 +#include 
+ #include 
  #include 
  #include 
  #include 
@@@ -701,25 -705,95 +714,33 @@@ USER(\label, ic ivau, \tmp2)
// inval
isb
  .endm
  
+ .macro set_sctlr_el1, reg
+   set_sctlr sctlr_el1, \reg
+ .endm
+ 
+ .macro set_sctlr_el2, reg
+   set_sctlr sctlr_el2, \reg
+ .endm
+ 
 -/*
 - * Check whether to yield to another runnable task from kernel mode NEON code
 - * (which runs with preemption disabled).
 - *
 - * if_will_cond_yield_neon
 - *// pre-yield patchup code
 - * do_cond_yield_neon
 - *// post-yield patchup code
 - * endif_yield_neon
 - *
 - * where  is optional, and marks the point where execution will resume
 - * after a yield has been performed. If omitted, execution resumes right after
 - * the endif_yield_neon invocation. Note that the entire sequence, including
 - * the provided patchup code, will be omitted from the image if
 - * CONFIG_PREEMPTION is not defined.
 - *
 - * As a convenience, in the case where no patchup code is required, the above
 - * sequence may be abbreviated to
 - *
 - * cond_yield_neon 
 - *
 - * Note that the patchup code does not support assembler directives that 
change
 - * the output section, any use of such directives is undefined.
 - *
 - * The yield itself consists of the following:
 - * - Check whether the preempt count is exactly 1 and a reschedule is also
 - *   needed. If so, calling of preempt_enable() in kernel_neon_end() will
 - *   trigger a reschedule. If it is not the case, yielding is pointless.
 - * - Disable and re-enable kernel mode NEON, and branch to the yield fixup
 - *   code.
 - *
 - * This macro sequence may clobber all CPU state that is not guaranteed by the
 - * AAPCS to be preserved across an ordinary function call.
 - */
 -
 -  .macro  cond_yield_neon, lbl
 -  if_will_cond_yield_neon
 -  do_cond_yield_neon
 -  endif_yield_neon\lbl
 -  .endm
 -
 -  .macro  if_will_cond_yield_neon
 -#ifdef CONFIG_PREEMPTION
 -  get_current_taskx0
 -  ldr x0, [x0, #TSK_TI_PREEMPT]
 -  sub x0, x0, #PREEMPT_DISABLE_OFFSET
 -  cbz x0, .Lyield_\@
 -  /* fall through to endif_yield_neon */
 -  .subsection 1
 -.Lyield_\@ :
 -#else
 -  .section".discard.cond_yield_neon", "ax"
 -#endif
 -  .endm
 -
 -  .macro  do_cond_yield_neon
 -  bl  kernel_neon_end
 -  bl  kernel_neon_begin
 -  .endm
 -
 -  .macro  endif_yield_neon, lbl
 -  .ifnb   \lbl
 -  b   \lbl
 -  .else
 -  b   .Lyield_out_\@
 -  .endif
 -  .previous
 -.Lyield_out_\@ :
 -  .endm
 -
/*
 -   * Check whether preempt-disabled code should yield as soon as it
 -   * is able. This is the case if re-enabling preemption a single
 -   * time results in a preempt count of zero, and the TIF_NEED_RESCHED
 -   * flag is set. (Note that the latter is stored negated in the
 -   * top word of the thread_info::preempt_count field)
 +   * Check whether preempt/bh-disabled asm code should yield as soon as
 +   * it is able. This is the case if we are currently running in task
 +   * context, and either a softirq is pending, or the TIF_NEED_RESCHED
 +   * flag is set and re-enabling preemption a single time would result in
 +   * a preempt count of zero. (Note that the TIF_NEED_RESCHED flag is
 +   * stored negated in the top word of the thread_info::preempt_count
 +   * field)
 */
 -  .macro  cond_yield, lbl:req, tmp:req
 -#ifdef CONFIG_PREEMPTION
 +  .macro  cond_yield, lbl:req, tmp:req, tmp2:req
get_current_task \tmp
ldr \tmp, [\tmp, #TSK_TI_PREEMPT]
 +  /*
 +   * If we are serving a softirq, t

Re: [RFC] mm: activate access-more-than-once page via NUMA balancing

2021-04-12 Thread Huang, Ying
Yu Zhao  writes:

> On Fri, Mar 26, 2021 at 12:21 AM Huang, Ying  wrote:
>>
>> Mel Gorman  writes:
>>
>> > On Thu, Mar 25, 2021 at 12:33:45PM +0800, Huang, Ying wrote:
>> >> > I caution against this patch.
>> >> >
>> >> > It's non-deterministic for a number of reasons. As it requires NUMA
>> >> > balancing to be enabled, the pageout behaviour of a system changes when
>> >> > NUMA balancing is active. If this led to pages being artificially and
>> >> > inappropriately preserved, NUMA balancing could be disabled for the
>> >> > wrong reasons.  It only applies to pages that have no target node so
>> >> > memory policies affect which pages are activated differently. Similarly,
>> >> > NUMA balancing does not scan all VMAs and some pages may never trap a
>> >> > NUMA fault as a result. The timing of when an address space gets scanned
>> >> > is driven by the locality of pages and so the timing of page activation
>> >> > potentially becomes linked to whether pages are local or need to migrate
>> >> > (although not right now for this patch as it only affects pages with a
>> >> > target nid of NUMA_NO_NODE). In other words, changes in NUMA balancing
>> >> > that affect migration potentially affect the aging rate.  Similarly,
>> >> > a process with a single thread and a process with multiple threads
>> >> > potentially have different activation rates.
>> >> >
>> >> > Finally, the NUMA balancing scan algorithm is sub-optimal. It 
>> >> > potentially
>> >> > scans the entire address space even though only a small number of pages
>> >> > are scanned. This is particularly problematic when a process has a lot
>> >> > of threads because threads are redundantly scanning the same regions. If
>> >> > NUMA balancing ever introduced range tracking of faulted pages to limit
>> >> > how much scanning it has to do, it would inadvertently cause a change in
>> >> > page activation rate.
>> >> >
>> >> > NUMA balancing is about page locality, it should not get conflated with
>> >> > page aging.
>> >>
>> >> I understand your concerns about binding the NUMA balancing and page
>> >> reclaiming.  The requirement of the page locality and page aging is
>> >> different, so the policies need to be different.  This is the wrong part
>> >> of the patch.
>> >>
>> >> From another point of view, it's still possible to share some underlying
>> >> mechanisms (and code) between them.  That is, scanning the page tables
>> >> to make pages inaccessible and capture the page accesses via the page
>> >> fault.
>> >
>> > Potentially yes but not necessarily recommended for page aging. NUMA
>> > balancing has to be careful about the rate it scans pages to avoid
>> > excessive overhead so it's driven by locality. The scanning happens
>> > within a tasks context so during that time, the task is not executing
>> > its normal work and it incurs the overhead for faults. Generally, this
>> > is not too much overhead because pages get migrated locally, the scan
>> > rate drops and so does the overhead.
>> >
>> > However, if you want to drive page aging, that is constant so the rate
>> > could not be easily adapted in a way that would be deterministic.
>> >
>> >> Now this page access information is used for the page
>> >> locality.  Do you think it's a good idea to use this information for
>> >> the page aging too (but with a different policy as you pointed out)?
>> >>
>> >
>> > I'm not completely opposed to it but I think the overhead it would
>> > introduce could be severe. Worse, if a workload fits in memory and there
>> > is limited to no memory pressure, it's all overhead for no gain. Early
>> > generations of NUMA balancing had to find a balance to ensure the gains
>> > from locality exceeded the cost of measuring locality and doing the same
>> > for page aging in some ways is even more challenging.
>>
>> Yes.  I will think more about it from the overhead vs. gain point of
>> view.  Thanks a lot for your sharing on that.
>>
>> >> From yet another point of view :-), in current NUMA balancing
>> >> implementation, it's assumed that the node private pages can fit in the
>> >> accessing node.  But this may not always be true.  Is it a valid
>> >> optimization to migrate the hot private pages first?
>> >>
>> >
>> > I'm not sure how the hotness of pages could be ranked. At the time of a
>> > hinting fault, the page is by definition active now because it has been
>> > accessed. Prioritising what pages to migrate based on the number of faults
>> > that have been trapped would have to be stored somewhere.
>>
>> Yes.  We need to store some information about that.  In an old version
>> of the patchset which uses NUMA balancing to promote hot pages from the
>> PMEM to DRAM, we have designed a method to measure the hotness of the
>> pages.  The basic idea is as follows,
>>
>> - When the page table of a process is scanned, the latest N scanning
>>   address ranges and scan times are recorded in a ring buffer of
>>   mm_struct.
>>
>> - In hint page fault handler, 

Re: [PATCH 4.19 00/66] 4.19.187-rc1 review

2021-04-12 Thread Naresh Kamboju
On Mon, 12 Apr 2021 at 14:13, Greg Kroah-Hartman
 wrote:
>
> This is the start of the stable review cycle for the 4.19.187 release.
> There are 66 patches in this series, all will be posted as a response
> to this one.  If anyone has any issues with these being applied, please
> let me know.
>
> Responses should be made by Wed, 14 Apr 2021 08:39:44 +0000.
> Anything received after that time might be too late.
>
> The whole patch series can be found in one patch at:
> 
> https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.19.187-rc1.gz
> or in the git tree and branch at:
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
> linux-4.19.y
> and the diffstat can be found below.
>
> thanks,
>
> greg k-h

Results from Linaro’s test farm.
No regressions on arm64, arm, x86_64, and i386.

Tested-by: Linux Kernel Functional Testing 

## Build
* kernel: 4.19.187-rc1
* git: 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
* git branch: linux-4.19.y
* git commit: 85bc28045cdbb9576907965c761445aaece4f5ad
* git describe: v4.19.186-67-g85bc28045cdb
* test details:
https://qa-reports.linaro.org/lkft/linux-stable-rc-linux-4.19.y/build/v4.19.186-67-g85bc28045cdb

## No regressions (compared to v4.19.185-19-g6aba908ea95f)

## No fixes (compared to v4.19.185-19-g6aba908ea95f)

## Test result summary
 total: 65010, pass: 52744, fail: 1575, skip: 10433, xfail: 258,

## Build Summary
* arm: 97 total, 96 passed, 1 failed
* arm64: 25 total, 24 passed, 1 failed
* dragonboard-410c: 1 total, 1 passed, 0 failed
* hi6220-hikey: 1 total, 1 passed, 0 failed
* i386: 15 total, 13 passed, 2 failed
* juno-r2: 1 total, 1 passed, 0 failed
* mips: 39 total, 39 passed, 0 failed
* s390: 9 total, 9 passed, 0 failed
* sparc: 9 total, 9 passed, 0 failed
* x15: 2 total, 1 passed, 1 failed
* x86: 1 total, 1 passed, 0 failed
* x86_64: 15 total, 14 passed, 1 failed

## Test suites summary
* fwts
* igt-gpu-tools
* install-android-platform-tools-r2600
* kselftest-
* kselftest-android
* kselftest-bpf
* kselftest-capabilities
* kselftest-cgroup
* kselftest-clone3
* kselftest-core
* kselftest-cpu-hotplug
* kselftest-cpufreq
* kselftest-efivarfs
* kselftest-filesystems
* kselftest-firmware
* kselftest-fpu
* kselftest-futex
* kselftest-gpio
* kselftest-intel_pstate
* kselftest-ipc
* kselftest-ir
* kselftest-kcmp
* kselftest-kexec
* kselftest-kvm
* kselftest-lib
* kselftest-livepatch
* kselftest-lkdtm
* kselftest-membarrier
* kselftest-memfd
* kselftest-memory-hotplug
* kselftest-mincore
* kselftest-mount
* kselftest-mqueue
* kselftest-net
* kselftest-netfilter
* kselftest-nsfs
* kselftest-openat2
* kselftest-pid_namespace
* kselftest-pidfd
* kselftest-proc
* kselftest-pstore
* kselftest-ptrace
* kselftest-rseq
* kselftest-rtc
* kselftest-seccomp
* kselftest-sigaltstack
* kselftest-size
* kselftest-splice
* kselftest-static_keys
* kselftest-sync
* kselftest-sysctl
* kselftest-tc-testing
* kselftest-timens
* kselftest-timers
* kselftest-tmpfs
* kselftest-tpm2
* kselftest-user
* kselftest-vm
* kselftest-vsyscall-mode-native-
* kselftest-vsyscall-mode-none-
* kselftest-x86
* kselftest-zram
* kvm-unit-tests
* libhugetlbfs
* linux-log-parser
* ltp-cap_bounds-tests
* ltp-commands-tests
* ltp-containers-tests
* ltp-controllers-tests
* ltp-cpuhotplug-tests
* ltp-crypto-tests
* ltp-cve-tests
* ltp-dio-tests
* ltp-fcntl-locktests-tests
* ltp-filecaps-tests
* ltp-fs-tests
* ltp-fs_bind-tests
* ltp-fs_perms_simple-tests
* ltp-fsx-tests
* ltp-hugetlb-tests
* ltp-io-tests
* ltp-ipc-tests
* ltp-math-tests
* ltp-mm-tests
* ltp-nptl-tests
* ltp-open-posix-tests
* ltp-pty-tests
* ltp-sched-tests
* ltp-securebits-tests
* ltp-syscalls-tests
* ltp-tracing-tests
* network-basic-tests
* perf
* rcutorture
* ssuite
* v4l2-compliance

--
Linaro LKFT
https://lkft.linaro.org


Re: [PATCH][next] KEYS: trusted: Fix missing null return from kzalloc call

2021-04-12 Thread Sumit Garg
On Mon, 12 Apr 2021 at 22:34, Colin Ian King  wrote:
>
> On 12/04/2021 17:48, James Bottomley wrote:
> > On Mon, 2021-04-12 at 17:01 +0100, Colin King wrote:
> >> From: Colin Ian King 
> >>
> >> The kzalloc call can return null with the GFP_KERNEL flag so
> >> add a null check and exit via a new error exit label. Use the
> >> same exit error label for another error path too.
> >>
> >> Addresses-Coverity: ("Dereference null return value")
> >> Fixes: 830027e2cb55 ("KEYS: trusted: Add generic trusted keys
> >> framework")
> >> Signed-off-by: Colin Ian King 
> >> ---
> >>  security/keys/trusted-keys/trusted_core.c | 6 --
> >>  1 file changed, 4 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/security/keys/trusted-keys/trusted_core.c
> >> b/security/keys/trusted-keys/trusted_core.c
> >> index ec3a066a4b42..90774793f0b1 100644
> >> --- a/security/keys/trusted-keys/trusted_core.c
> >> +++ b/security/keys/trusted-keys/trusted_core.c
> >> @@ -116,11 +116,13 @@ static struct trusted_key_payload
> >> *trusted_payload_alloc(struct key *key)
> >>
> >>  ret = key_payload_reserve(key, sizeof(*p));
> >>  if (ret < 0)
> >> -return p;
> >> +goto err;
> >>  p = kzalloc(sizeof(*p), GFP_KERNEL);
> >> +if (!p)
> >> +goto err;
> >>
> >>  p->migratable = migratable;
> >> -
> >> +err:
> >>  return p;
> >
> > This is clearly a code migration bug in
> >
> > commit 251c85bd106099e6f388a89e88e12d14de2c9cda
> > Author: Sumit Garg 
> > Date:   Mon Mar 1 18:41:24 2021 +0530
> >
> > KEYS: trusted: Add generic trusted keys framework
> >
> > Which has for addition to trusted_core.c:
> >
> > +static struct trusted_key_payload *trusted_payload_alloc(struct key
> > *key)
> > +{
> > +   struct trusted_key_payload *p = NULL;
> > +   int ret;
> > +
> > +   ret = key_payload_reserve(key, sizeof(*p));
> > +   if (ret < 0)
> > +   return p;
> > +   p = kzalloc(sizeof(*p), GFP_KERNEL);
> > +
> > +   p->migratable = migratable;
> > +
> > +   return p;
> > +}
> >
> > And for trusted_tpm1.c:
> >
> > -static struct trusted_key_payload *trusted_payload_alloc(struct key
> > *key)
> > -{
> > -   struct trusted_key_payload *p = NULL;
> > -   int ret;
> > -
> > -   ret = key_payload_reserve(key, sizeof *p);
> > -   if (ret < 0)
> > -   return p;
> > -   p = kzalloc(sizeof *p, GFP_KERNEL);
> > -   if (p)
> > -   p->migratable = 1; /* migratable by default */
> > -   return p;
> > -}
> >
> > The trusted_tpm1.c code was correct and we got this bug introduced by
> > what should have been a simple cut and paste ... how did that happen?

It was a little more than just cut and paste, as I generalized the
"migratable" flag to be provided by the corresponding trust source's
ops struct.

> > And therefore, how safe is the rest of the extraction into
> > trusted_core.c?
> >
>
> fortunately it gets caught by static analysis, but it does make me also
> concerned about what else has changed and how this gets through review.
>

I agree that extraction into trusted_core.c was a complex change but
this patch has been up for review for almost 2 years [1]. And
extensive testing can't catch this sort of bug as allocation wouldn't
normally fail.

[1] https://lwn.net/Articles/795416/

-Sumit

> > James
> >
> >
>


Re: [PATCH 0/1] Use of /sys/bus/pci/devices/…/index for non-SMBIOS platforms

2021-04-12 Thread Leon Romanovsky
On Mon, Apr 12, 2021 at 03:59:04PM +0200, Niklas Schnelle wrote:
> Hi Narendra, Hi All,
> 
> According to Documentation/ABI/testing/sysfs-bus-pci you are responsible
> for the index device attribute that is used by systemd to create network
> interface names.
> 
> Now we would like to reuse this attribute for firmware provided PCI
> device index numbers on the s390 architecture which doesn't have
> SMBIOS/DMI nor ACPI. All code changes are within our architecture
> specific code but I'd like to get some Acks for this reuse. I've sent an
> RFC version of this patch on 15th of March with the subject:
> 
>s390/pci: expose a PCI device's UID as its index
> 
> but got no response. Would it be okay to re-use this attribute for
> essentially the same purpose but with index numbers provided by
> a different platform mechanism? I think this would be cleaner than
> further proliferation of /sys/bus/pci/devices/<dev>/xyz_index
> attributes and allows re-use of the existing userspace infrastructure.

I'm missing an explanation that this change is safe for systemd and
they don't have some hard-coded assumption about the meaning of existing
index on s390.

Thanks


[PATCH v2][next] scsi: aacraid: Replace one-element array with flexible-array member

2021-04-12 Thread Gustavo A. R. Silva
There is a regular need in the kernel to provide a way to declare having
a dynamically sized set of trailing elements in a structure. Kernel code
should always use “flexible array members”[1] for these cases. The older
style of one-element or zero-length arrays should no longer be used[2].

Refactor the code according to the use of a flexible-array member in
struct aac_raw_io2 instead of one-element array, and use the
struct_size() and flex_array_size() helpers.

Also, this helps with the ongoing efforts to enable -Warray-bounds by
fixing the following warnings:

drivers/scsi/aacraid/aachba.c: In function ‘aac_build_sgraw2’:
drivers/scsi/aacraid/aachba.c:3970:18: warning: array subscript 1 is above 
array bounds of ‘struct sge_ieee1212[1]’ [-Warray-bounds]
 3970 | if (rio2->sge[j].length % (i*PAGE_SIZE)) {
  | ~^~~
drivers/scsi/aacraid/aachba.c:3974:27: warning: array subscript 1 is above 
array bounds of ‘struct sge_ieee1212[1]’ [-Warray-bounds]
 3974 | nseg_new += (rio2->sge[j].length / (i*PAGE_SIZE));
  |  ~^~~
drivers/scsi/aacraid/aachba.c:4011:28: warning: array subscript 1 is above 
array bounds of ‘struct sge_ieee1212[1]’ [-Warray-bounds]
 4011 |   for (j = 0; j < rio2->sge[i].length / (pages * PAGE_SIZE); ++j) {
  |   ~^~~
drivers/scsi/aacraid/aachba.c:4012:24: warning: array subscript 1 is above 
array bounds of ‘struct sge_ieee1212[1]’ [-Warray-bounds]
 4012 |addr_low = rio2->sge[i].addrLow + j * pages * PAGE_SIZE;
  |   ~^~~
drivers/scsi/aacraid/aachba.c:4014:33: warning: array subscript 1 is above 
array bounds of ‘struct sge_ieee1212[1]’ [-Warray-bounds]
 4014 |sge[pos].addrHigh = rio2->sge[i].addrHigh;
  |~^~~
drivers/scsi/aacraid/aachba.c:4015:28: warning: array subscript 1 is above 
array bounds of ‘struct sge_ieee1212[1]’ [-Warray-bounds]
 4015 |if (addr_low < rio2->sge[i].addrLow)
  |   ~^~~

[1] https://en.wikipedia.org/wiki/Flexible_array_member
[2] 
https://www.kernel.org/doc/html/v5.9/process/deprecated.html#zero-length-and-one-element-arrays

Link: https://github.com/KSPP/linux/issues/79
Link: https://github.com/KSPP/linux/issues/109
Build-tested-by: kernel test robot 
Link: https://lore.kernel.org/lkml/60414244.ur4%2fki+fbf1ohkzs%25...@intel.com/
Signed-off-by: Gustavo A. R. Silva 
---
Changes in v2:
 - Add code comment for clarification.

 drivers/scsi/aacraid/aachba.c  | 17 +++--
 drivers/scsi/aacraid/aacraid.h |  2 +-
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/scsi/aacraid/aachba.c b/drivers/scsi/aacraid/aachba.c
index 8e06604370c4..2816a15d5633 100644
--- a/drivers/scsi/aacraid/aachba.c
+++ b/drivers/scsi/aacraid/aachba.c
@@ -1235,8 +1235,8 @@ static int aac_read_raw_io(struct fib * fib, struct 
scsi_cmnd * cmd, u64 lba, u3
if (ret < 0)
return ret;
command = ContainerRawIo2;
-   fibsize = sizeof(struct aac_raw_io2) +
-   ((le32_to_cpu(readcmd2->sgeCnt)-1) * sizeof(struct 
sge_ieee1212));
+   fibsize = struct_size(readcmd2, sge,
+le32_to_cpu(readcmd2->sgeCnt));
} else {
struct aac_raw_io *readcmd;
readcmd = (struct aac_raw_io *) fib_data(fib);
@@ -1366,8 +1366,8 @@ static int aac_write_raw_io(struct fib * fib, struct 
scsi_cmnd * cmd, u64 lba, u
if (ret < 0)
return ret;
command = ContainerRawIo2;
-   fibsize = sizeof(struct aac_raw_io2) +
-   ((le32_to_cpu(writecmd2->sgeCnt)-1) * sizeof(struct 
sge_ieee1212));
+   fibsize = struct_size(writecmd2, sge,
+ le32_to_cpu(writecmd2->sgeCnt));
} else {
struct aac_raw_io *writecmd;
writecmd = (struct aac_raw_io *) fib_data(fib);
@@ -4003,7 +4003,7 @@ static int aac_convert_sgraw2(struct aac_raw_io2 *rio2, 
int pages, int nseg, int
if (aac_convert_sgl == 0)
return 0;
 
-   sge = kmalloc_array(nseg_new, sizeof(struct sge_ieee1212), GFP_ATOMIC);
+   sge = kmalloc_array(nseg_new, sizeof(*sge), GFP_ATOMIC);
if (sge == NULL)
return -ENOMEM;
 
@@ -4020,7 +4020,12 @@ static int aac_convert_sgraw2(struct aac_raw_io2 *rio2, 
int pages, int nseg, int
}
}
sge[pos] = rio2->sge[nseg-1];
-   memcpy(&rio2->sge[1], &sge[1], (nseg_new-1)*sizeof(struct 
sge_ieee1212));
+   /*
+* Notice that, in this case, flex_array_size() evaluates to
+* (nseg_new - 1) number of sge objects of type struct sge_ieee1212.
+*/
+   memcpy(&rio2->sge[1], &sge[1],
+  flex_array_size(rio2, sge, nseg_new - 1));
 
kfree(sge);
rio2->sgeCnt = cpu_to_le32(nseg_new);
dif

Re: [RESEND,v5,1/2] bio: limit bio max size

2021-04-12 Thread Christoph Hellwig
And more importantly please test with a file system that uses the
iomap direct I/O code (btrfs, gfs2, ext4, xfs, zonefs) as we should
never just work around a legacy codebase that should go away in the
block layer.


[PATCH v2 2/2] x86/tsc: skip tsc watchdog checking for qualified platforms

2021-04-12 Thread Feng Tang
There are cases that tsc clocksources are wrongly judged as unstable by
clocksource watchdogs like hpet, acpi_pm or 'refined-jiffies'. While
there is hardly a general reliable way to check the validity of a
watchdog, and to protect the innocent tsc, Thomas Gleixner proposed [1]:

"I'm inclined to lift that requirement when the CPU has:

1) X86_FEATURE_CONSTANT_TSC
2) X86_FEATURE_NONSTOP_TSC
3) X86_FEATURE_NONSTOP_TSC_S3
4) X86_FEATURE_TSC_ADJUST
5) At max. 4 sockets

 After two decades of horrors we're finally at a point where TSC seems
 to be halfway reliable and less abused by BIOS tinkerers. TSC_ADJUST
 was really key as we can now detect even small modifications reliably
 and the important point is that we can cure them as well (not pretty
 but better than all other options)."

As feature #3 X86_FEATURE_NONSTOP_TSC_S3 only exists on several generations
of Atom processor, and is always coupled with X86_FEATURE_CONSTANT_TSC
and X86_FEATURE_NONSTOP_TSC, skip checking it, and also be more defensive
to use a maximum of 2 sockets.

The check is done inside tsc_init() before registering 'tsc-early' and
'tsc' clocksources, as there were cases that both of them had been
wrongly judged as unreliable.

[1]. https://lore.kernel.org/lkml/87eekfk8bd@nanos.tec.linutronix.de/
Suggested-by: Thomas Gleixner 
Signed-off-by: Feng Tang 
---
Change log:

  v2:
* Directly skip watchdog check without messing flag
  'tsc_clocksource_reliable' (Thomas)

 arch/x86/kernel/tsc.c | 22 ++
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index f70dffc..bfd013b 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1177,6 +1177,12 @@ void mark_tsc_unstable(char *reason)
 
 EXPORT_SYMBOL_GPL(mark_tsc_unstable);
 
+static void __init tsc_skip_watchdog_verify(void)
+{
+   clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+   clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+}
+
 static void __init check_system_tsc_reliable(void)
 {
 #if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || 
defined(CONFIG_X86_GENERIC)
@@ -1193,6 +1199,17 @@ static void __init check_system_tsc_reliable(void)
 #endif
if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
tsc_clocksource_reliable = 1;
+
+   /*
+* Ideally the socket number should be checked, but this is called
+* by tsc_init() which is in early boot phase and the socket numbers
+* may not be available. Use 'nr_online_nodes' as a fallback solution
+*/
+   if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
+   boot_cpu_has(X86_FEATURE_NONSTOP_TSC) &&
+   boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
+   nr_online_nodes <= 2)
+   tsc_skip_watchdog_verify();
 }
 
 /*
@@ -1384,9 +1401,6 @@ static int __init init_tsc_clocksource(void)
if (tsc_unstable)
goto unreg;
 
-   if (tsc_clocksource_reliable || no_tsc_watchdog)
-   clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
-
if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
 
@@ -1524,7 +1538,7 @@ void __init tsc_init(void)
}
 
if (tsc_clocksource_reliable || no_tsc_watchdog)
-   clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+   tsc_skip_watchdog_verify();
 
clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
detect_art();
-- 
2.7.4



[PATCH v2 1/2] x86/tsc: add a timer to make sure tsc_adjust is always checked

2021-04-12 Thread Feng Tang
Normally the tsc_sync will get checked every time system enters idle state,
but Thomas Gleixner mentioned there is still a caveat that a system won't
enter idle [1], either because it's too busy or configured purposely to not
enter idle. Setup a periodic timer to make sure the check is always on.

[1]. https://lore.kernel.org/lkml/875z286xtk@nanos.tec.linutronix.de/
Signed-off-by: Feng Tang 
---
Change log:
  
  v2:
 * skip timer setup when tsc_clocksource_reliable==1 (Thomas)
 * refine comment and code format (Thomas) 

 arch/x86/kernel/tsc_sync.c | 39 +++
 1 file changed, 39 insertions(+)

diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 3d3c761..39f18fa 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -30,6 +30,7 @@ struct tsc_adjust {
 };
 
 static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
+static struct timer_list tsc_sync_check_timer;
 
 /*
  * TSC's on different sockets may be reset asynchronously.
@@ -77,6 +78,44 @@ void tsc_verify_tsc_adjust(bool resume)
}
 }
 
+/*
+ * Normally the tsc_sync will be checked every time system enters idle state,
+ * but there is still caveat that a system won't enter idle, either because
+ * it's too busy or configured purposely to not enter idle.
+ *
+ * So setup a periodic timer to make sure the check is always on.
+ */
+
+#define SYNC_CHECK_INTERVAL (HZ * 600)
+
+static void tsc_sync_check_timer_fn(struct timer_list *unused)
+{
+   int next_cpu;
+
+   tsc_verify_tsc_adjust(false);
+
+   /* Run the check for all onlined CPUs in turn */
+   next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+   if (next_cpu >= nr_cpu_ids)
+   next_cpu = cpumask_first(cpu_online_mask);
+
+   tsc_sync_check_timer.expires += SYNC_CHECK_INTERVAL;
+   add_timer_on(&tsc_sync_check_timer, next_cpu);
+}
+
+static int __init start_sync_check_timer(void)
+{
+   if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST) || tsc_clocksource_reliable)
+   return 0;
+
+   timer_setup(&tsc_sync_check_timer, tsc_sync_check_timer_fn, 0);
+   tsc_sync_check_timer.expires = jiffies + SYNC_CHECK_INTERVAL;
+   add_timer(&tsc_sync_check_timer);
+
+   return 0;
+}
+late_initcall(start_sync_check_timer);
+
 static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
   unsigned int cpu, bool bootcpu)
 {
-- 
2.7.4



Re: [PATCH][next] KEYS: trusted: Fix missing null return from kzalloc call

2021-04-12 Thread Sumit Garg
On Mon, 12 Apr 2021 at 21:31, Colin King  wrote:
>
> From: Colin Ian King 
>
> The kzalloc call can return null with the GFP_KERNEL flag so
> add a null check and exit via a new error exit label. Use the
> same exit error label for another error path too.
>
> Addresses-Coverity: ("Dereference null return value")
> Fixes: 830027e2cb55 ("KEYS: trusted: Add generic trusted keys framework")
> Signed-off-by: Colin Ian King 
> ---
>  security/keys/trusted-keys/trusted_core.c | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
>

Ah, it's my bad. Thanks for fixing this issue.

Reviewed-by: Sumit Garg 

-Sumit

> diff --git a/security/keys/trusted-keys/trusted_core.c 
> b/security/keys/trusted-keys/trusted_core.c
> index ec3a066a4b42..90774793f0b1 100644
> --- a/security/keys/trusted-keys/trusted_core.c
> +++ b/security/keys/trusted-keys/trusted_core.c
> @@ -116,11 +116,13 @@ static struct trusted_key_payload 
> *trusted_payload_alloc(struct key *key)
>
> ret = key_payload_reserve(key, sizeof(*p));
> if (ret < 0)
> -   return p;
> +   goto err;
> p = kzalloc(sizeof(*p), GFP_KERNEL);
> +   if (!p)
> +   goto err;
>
> p->migratable = migratable;
> -
> +err:
> return p;
>  }
>
> --
> 2.30.2
>


Re: [PATCH] ibmvfc: Fix invalid state machine BUG_ON

2021-04-12 Thread Martin K. Petersen


Tyrel,

> This fixes an issue hitting the BUG_ON in ibmvfc_do_work. When going
> through a host action of IBMVFC_HOST_ACTION_RESET, we change the
> action to IBMVFC_HOST_ACTION_TGT_DEL, then drop the host lock, and
> reset the CRQ, which changes the host state to IBMVFC_NO_CRQ.

[...]

Applied to 5.13/scsi-staging, thanks!

-- 
Martin K. Petersen  Oracle Linux Engineering


[PATCH v2 6/9] userfaultfd/selftests: create alias mappings in the shmem test

2021-04-12 Thread Axel Rasmussen
Previously, we just allocated two shm areas: area_src and area_dst. With
this commit, change this so we also allocate area_src_alias, and
area_dst_alias.

area_*_alias and area_* (respectively) point to the same underlying
physical pages, but are different VMAs. In a future commit in this
series, we'll leverage this setup to exercise minor fault handling
support for shmem, just like we do in the hugetlb_shared test.

Signed-off-by: Axel Rasmussen 
---
 tools/testing/selftests/vm/userfaultfd.c | 22 +++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/vm/userfaultfd.c 
b/tools/testing/selftests/vm/userfaultfd.c
index fc40831f818f..1f65c4ab7994 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -278,13 +278,29 @@ static void shmem_release_pages(char *rel_area)
 
 static void shmem_allocate_area(void **alloc_area)
 {
-   unsigned long offset =
-   alloc_area == (void **)&area_src ? 0 : nr_pages * page_size;
+   void *area_alias = NULL;
+   bool is_src = alloc_area == (void **)&area_src;
+   unsigned long offset = is_src ? 0 : nr_pages * page_size;
 
*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
   MAP_SHARED, shm_fd, offset);
if (*alloc_area == MAP_FAILED)
err("mmap of memfd failed");
+
+   area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, shm_fd, offset);
+   if (area_alias == MAP_FAILED)
+   err("mmap of memfd alias failed");
+
+   if (is_src)
+   area_src_alias = area_alias;
+   else
+   area_dst_alias = area_alias;
+}
+
+static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+   *start = (unsigned long)area_dst_alias + offset;
 }
 
 struct uffd_test_ops {
@@ -314,7 +330,7 @@ static struct uffd_test_ops shmem_uffd_test_ops = {
.expected_ioctls = SHMEM_EXPECTED_IOCTLS,
.allocate_area  = shmem_allocate_area,
.release_pages  = shmem_release_pages,
-   .alias_mapping = noop_alias_mapping,
+   .alias_mapping = shmem_alias_mapping,
 };
 
 static struct uffd_test_ops hugetlb_uffd_test_ops = {
-- 
2.31.1.295.g9ea45b61b8-goog



  1   2   3   4   5   6   7   8   9   10   >