Re: [PATCH v3 0/5] mm/memcg: Reduce kmemcache memory accounting overhead

2021-04-15 Thread Masayoshi Mizuma
On Thu, Apr 15, 2021 at 09:17:37AM -0400, Waiman Long wrote:
> I was focusing on your kernel module benchmark in testing my patch. I will
> try out your pgbench benchmark to see if there can be other tuning that can
> be done.

Thanks a lot!

> BTW, how many numa nodes does your test machine have? I did my testing with a
> 2-socket system. The vmstat caching part may be less effective on systems
> with more numa nodes. I will try to find a larger 4-socket system for
> testing.

The test machine has one node.

- Masa


Re: [PATCH v3 5/5] mm/memcg: Optimize user context object stock access

2021-04-15 Thread Masayoshi Mizuma
On Thu, Apr 15, 2021 at 11:44:55AM +0200, Christoph Lameter wrote:
> Would you please stop quoting the whole patch when you have nothing to say
> about the details? It is enough to just respond without quoting. I was
> looking through this trying to find something you said about individual
> sections of code but there was nothing.

Thank you for pointing it out and sorry about that.
I'll do that next time.

- Masa


Re: [PATCH v3 5/5] mm/memcg: Optimize user context object stock access

2021-04-14 Thread Masayoshi Mizuma
14 @@ static bool consume_obj_stock(struct obj_cgroup 
> *objcg, unsigned int nr_bytes)
>   unsigned long flags;
>   bool ret = false;
>  
> - local_irq_save(flags);
> + stock = get_obj_stock(flags);
>  
> - stock = current_obj_stock();
>   if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
>   stock->nr_bytes -= nr_bytes;
>   ret = true;
>   }
>  
> - local_irq_restore(flags);
> + put_obj_stock(flags);
>  
>   return ret;
>  }
> @@ -3254,8 +3293,13 @@ static bool obj_stock_flush_required(struct 
> memcg_stock_pcp *stock,
>  {
>   struct mem_cgroup *memcg;
>  
> - if (stock->obj.cached_objcg) {
> - memcg = obj_cgroup_memcg(stock->obj.cached_objcg);
> + if (in_task() && stock->task_obj.cached_objcg) {
> + memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg);
> + if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
> + return true;
> + }
> + if (stock->irq_obj.cached_objcg) {
> + memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg);
>   if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
>   return true;
>   }
> @@ -3283,9 +3327,9 @@ static void refill_obj_stock(struct obj_cgroup *objcg, 
> unsigned int nr_bytes)
>  {
>   unsigned long flags;
>  
> - local_irq_save(flags);
> + get_obj_stock(flags);
>   __refill_obj_stock(objcg, nr_bytes);
> - local_irq_restore(flags);
> + put_obj_stock(flags);
>  }
>  
>  static void __mod_obj_stock_state(struct obj_cgroup *objcg,
> @@ -3325,9 +3369,9 @@ void mod_obj_stock_state(struct obj_cgroup *objcg, 
> struct pglist_data *pgdat,
>  {
>   unsigned long flags;
>  
> - local_irq_save(flags);
> + get_obj_stock(flags);
>   __mod_obj_stock_state(objcg, pgdat, idx, nr);
> - local_irq_restore(flags);
> + put_obj_stock(flags);
>  }
>  
>  int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
> @@ -3380,10 +3424,10 @@ void obj_cgroup_uncharge_mod_state(struct obj_cgroup 
> *objcg, size_t size,
>  {
>   unsigned long flags;
>  
> - local_irq_save(flags);
> + get_obj_stock(flags);
>   __refill_obj_stock(objcg, size);
>   __mod_obj_stock_state(objcg, pgdat, idx, -(int)size);
> - local_irq_restore(flags);
> + put_obj_stock(flags);
>  }
>  
>  #endif /* CONFIG_MEMCG_KMEM */
> -- 
> 2.18.1
> 

Please feel free to add:

Tested-by: Masayoshi Mizuma 

Thanks!
Masa


Re: [PATCH v3 4/5] mm/memcg: Separate out object stock data into its own struct

2021-04-14 Thread Masayoshi Mizuma
c void refill_obj_stock(struct obj_cgroup *objcg, 
> unsigned int nr_bytes)
>  static void __mod_obj_stock_state(struct obj_cgroup *objcg,
> struct pglist_data *pgdat, int idx, int nr)
>  {
> - struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
> + struct obj_stock *stock = current_obj_stock();
>  
>   if (stock->cached_objcg != objcg) {
>   /* Output the current data as is */
> -- 
> 2.18.1
> 
Please feel free to add:

Tested-by: Masayoshi Mizuma 

Thanks!
Masa


Re: [PATCH v3 3/5] mm/memcg: Cache vmstat data in percpu memcg_stock_pcp

2021-04-14 Thread Masayoshi Mizuma
data *pgdat,
> +  int idx, int nr)
> +{
> + unsigned long flags;
> +
> + local_irq_save(flags);
> + __mod_obj_stock_state(objcg, pgdat, idx, nr);
> + local_irq_restore(flags);
> +}
> +
>  int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
>  {
>   struct mem_cgroup *memcg;
> @@ -3300,18 +3368,10 @@ void obj_cgroup_uncharge_mod_state(struct obj_cgroup 
> *objcg, size_t size,
>  struct pglist_data *pgdat, int idx)
>  {
>   unsigned long flags;
> - struct mem_cgroup *memcg;
> - struct lruvec *lruvec = NULL;
>  
>   local_irq_save(flags);
>   __refill_obj_stock(objcg, size);
> -
> - rcu_read_lock();
> - memcg = obj_cgroup_memcg(objcg);
> - if (pgdat)
> - lruvec = mem_cgroup_lruvec(memcg, pgdat);
> - __mod_memcg_lruvec_state(memcg, lruvec, idx, -(int)size);
> - rcu_read_unlock();
> + __mod_obj_stock_state(objcg, pgdat, idx, -(int)size);
>   local_irq_restore(flags);
>  }
>  
> diff --git a/mm/slab.h b/mm/slab.h
> index 677cdc52e641..03bd9813422b 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -239,6 +239,8 @@ static inline bool kmem_cache_debug_flags(struct 
> kmem_cache *s, slab_flags_t fla
>  #ifdef CONFIG_MEMCG_KMEM
>  int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
>gfp_t gfp, bool new_page);
> +void mod_obj_stock_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
> +  int idx, int nr);
>  
>  static inline void memcg_free_page_obj_cgroups(struct page *page)
>  {
> @@ -283,20 +285,6 @@ static inline bool memcg_slab_pre_alloc_hook(struct 
> kmem_cache *s,
>   return true;
>  }
>  
> -static inline void mod_objcg_state(struct obj_cgroup *objcg,
> -struct pglist_data *pgdat,
> -enum node_stat_item idx, int nr)
> -{
> - struct mem_cgroup *memcg;
> - struct lruvec *lruvec;
> -
> - rcu_read_lock();
> - memcg = obj_cgroup_memcg(objcg);
> - lruvec = mem_cgroup_lruvec(memcg, pgdat);
> - mod_memcg_lruvec_state(memcg, lruvec, idx, nr);
> - rcu_read_unlock();
> -}
> -
>  static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
> struct obj_cgroup *objcg,
> gfp_t flags, size_t size,
> @@ -324,8 +312,9 @@ static inline void memcg_slab_post_alloc_hook(struct 
> kmem_cache *s,
>   off = obj_to_index(s, page, p[i]);
>   obj_cgroup_get(objcg);
>   page_objcgs(page)[off] = objcg;
> - mod_objcg_state(objcg, page_pgdat(page),
> - cache_vmstat_idx(s), obj_full_size(s));
> + mod_obj_stock_state(objcg, page_pgdat(page),
> + cache_vmstat_idx(s),
> +     obj_full_size(s));
>   } else {
>   obj_cgroup_uncharge(objcg, obj_full_size(s));
>   }
> @@ -408,6 +397,11 @@ static inline void memcg_slab_free_hook(struct 
> kmem_cache *s,
>   void **p, int objects)
>  {
>  }
> +
> +static inline void mod_obj_stock_state(struct obj_cgroup *objcg,
> +struct pglist_data *pgdat, int idx, int 
> nr)
> +{
> +}
>  #endif /* CONFIG_MEMCG_KMEM */
>  
>  static inline struct kmem_cache *virt_to_cache(const void *obj)
> -- 
> 2.18.1
> 

Please feel free to add:

Tested-by: Masayoshi Mizuma 

Thanks!
Masa


Re: [PATCH v3 2/5] mm/memcg: Introduce obj_cgroup_uncharge_mod_state()

2021-04-14 Thread Masayoshi Mizuma
On Tue, Apr 13, 2021 at 09:20:24PM -0400, Waiman Long wrote:
> In memcg_slab_free_hook()/pcpu_memcg_free_hook(), obj_cgroup_uncharge()
> is followed by mod_objcg_state()/mod_memcg_state(). Each of these
> function call goes through a separate irq_save/irq_restore cycle. That
> is inefficient.  Introduce a new function obj_cgroup_uncharge_mod_state()
> that combines them with a single irq_save/irq_restore cycle.
> 
> Signed-off-by: Waiman Long 
> Reviewed-by: Shakeel Butt 
> Acked-by: Roman Gushchin 
> ---
>  include/linux/memcontrol.h |  2 ++
>  mm/memcontrol.c| 31 +++
>  mm/percpu.c|  9 ++---
>  mm/slab.h  |  6 +++---
>  4 files changed, 34 insertions(+), 14 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 95f12996e66c..6890f999c1a3 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -1592,6 +1592,8 @@ struct obj_cgroup *get_obj_cgroup_from_current(void);
>  
>  int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);
>  void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);
> +void obj_cgroup_uncharge_mod_state(struct obj_cgroup *objcg, size_t size,
> +struct pglist_data *pgdat, int idx);
>  
>  extern struct static_key_false memcg_kmem_enabled_key;
>  
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index d66e1e38f8ac..b19100c68aa0 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3225,12 +3225,9 @@ static bool obj_stock_flush_required(struct 
> memcg_stock_pcp *stock,
>   return false;
>  }
>  
> -static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
> +static void __refill_obj_stock(struct obj_cgroup *objcg, unsigned int 
> nr_bytes)
>  {
>   struct memcg_stock_pcp *stock;
> - unsigned long flags;
> -
> - local_irq_save(flags);
>  
>   stock = this_cpu_ptr(&memcg_stock);
>   if (stock->cached_objcg != objcg) { /* reset if necessary */
> @@ -3243,7 +3240,14 @@ static void refill_obj_stock(struct obj_cgroup *objcg, 
> unsigned int nr_bytes)
>  
>   if (stock->nr_bytes > PAGE_SIZE)
>   drain_obj_stock(stock);
> +}
> +
> +static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
> +{
> + unsigned long flags;
>  
> + local_irq_save(flags);
> + __refill_obj_stock(objcg, nr_bytes);
>   local_irq_restore(flags);
>  }
>  
> @@ -3292,6 +3296,25 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, 
> size_t size)
>   refill_obj_stock(objcg, size);
>  }
>  
> +void obj_cgroup_uncharge_mod_state(struct obj_cgroup *objcg, size_t size,
> +struct pglist_data *pgdat, int idx)
> +{
> + unsigned long flags;
> + struct mem_cgroup *memcg;
> + struct lruvec *lruvec = NULL;
> +
> + local_irq_save(flags);
> + __refill_obj_stock(objcg, size);
> +
> + rcu_read_lock();
> + memcg = obj_cgroup_memcg(objcg);
> + if (pgdat)
> + lruvec = mem_cgroup_lruvec(memcg, pgdat);
> + __mod_memcg_lruvec_state(memcg, lruvec, idx, -(int)size);
> + rcu_read_unlock();
> + local_irq_restore(flags);
> +}
> +
>  #endif /* CONFIG_MEMCG_KMEM */
>  
>  /*
> diff --git a/mm/percpu.c b/mm/percpu.c
> index 23308113a5ff..fd7aad6d7f90 100644
> --- a/mm/percpu.c
> +++ b/mm/percpu.c
> @@ -1631,13 +1631,8 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk 
> *chunk, int off, size_t size)
>   objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT];
>   chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL;
>  
> - obj_cgroup_uncharge(objcg, size * num_possible_cpus());
> -
> - rcu_read_lock();
> - mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
> - -(size * num_possible_cpus()));
> - rcu_read_unlock();
> -
> + obj_cgroup_uncharge_mod_state(objcg, size * num_possible_cpus(),
> +   NULL, MEMCG_PERCPU_B);
>   obj_cgroup_put(objcg);
>  }
>  
> diff --git a/mm/slab.h b/mm/slab.h
> index bc6c7545e487..677cdc52e641 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -366,9 +366,9 @@ static inline void memcg_slab_free_hook(struct kmem_cache 
> *s_orig,
>   continue;
>  
>   objcgs[off] = NULL;
> - obj_cgroup_uncharge(objcg, obj_full_size(s));
> - mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s),
> - -obj_full_size(s));
> + obj_cgroup_uncharge_mod_state(objcg, obj_full_size(s),
> +   page_pgdat(page),
> +   cache_vmstat_idx(s));
>   obj_cgroup_put(objcg);
>   }
>  }
> -- 
> 2.18.1
> 

Please feel free to add:

Tested-by: Masayoshi Mizuma 

Thanks!
Masa


Re: [PATCH v3 1/5] mm/memcg: Pass both memcg and lruvec to mod_memcg_lruvec_state()

2021-04-14 Thread Masayoshi Mizuma
On Tue, Apr 13, 2021 at 09:20:23PM -0400, Waiman Long wrote:
> The caller of mod_memcg_lruvec_state() has both memcg and lruvec readily
> available. So both of them are now passed to mod_memcg_lruvec_state()
> and __mod_memcg_lruvec_state(). The __mod_memcg_lruvec_state() is
> updated to allow either of the two parameters to be set to null. This
> makes mod_memcg_lruvec_state() equivalent to mod_memcg_state() if lruvec
> is null.
> 
> The new __mod_memcg_lruvec_state() function will be used in the next
> patch as a replacement of mod_memcg_state() in mm/percpu.c for the
> consolidation of the memory uncharge and vmstat update functions in
> the kmem_cache_free() path.
> 
> Signed-off-by: Waiman Long 
> Acked-by: Roman Gushchin 
> Reviewed-by: Shakeel Butt 
> ---
>  include/linux/memcontrol.h | 12 +++-
>  mm/memcontrol.c| 19 +--
>  mm/slab.h  |  2 +-
>  3 files changed, 21 insertions(+), 12 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 0c04d39a7967..95f12996e66c 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -955,8 +955,8 @@ static inline unsigned long 
> lruvec_page_state_local(struct lruvec *lruvec,
>   return x;
>  }
>  
> -void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
> -   int val);
> +void __mod_memcg_lruvec_state(struct mem_cgroup *memcg, struct lruvec 
> *lruvec,
> +   enum node_stat_item idx, int val);
>  void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val);
>  
>  static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
> @@ -969,13 +969,14 @@ static inline void mod_lruvec_kmem_state(void *p, enum 
> node_stat_item idx,
>   local_irq_restore(flags);
>  }
>  
> -static inline void mod_memcg_lruvec_state(struct lruvec *lruvec,
> +static inline void mod_memcg_lruvec_state(struct mem_cgroup *memcg,
> +   struct lruvec *lruvec,
> enum node_stat_item idx, int val)
>  {
>   unsigned long flags;
>  
>   local_irq_save(flags);
> - __mod_memcg_lruvec_state(lruvec, idx, val);
> + __mod_memcg_lruvec_state(memcg, lruvec, idx, val);
>   local_irq_restore(flags);
>  }
>  
> @@ -1369,7 +1370,8 @@ static inline unsigned long 
> lruvec_page_state_local(struct lruvec *lruvec,
>   return node_page_state(lruvec_pgdat(lruvec), idx);
>  }
>  
> -static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec,
> +static inline void __mod_memcg_lruvec_state(struct mem_cgroup *memcg,
> + struct lruvec *lruvec,
>   enum node_stat_item idx, int val)
>  {
>  }
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e064ac0d850a..d66e1e38f8ac 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -799,20 +799,27 @@ parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
>   return mem_cgroup_nodeinfo(parent, nid);
>  }
>  
> -void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
> -   int val)
> +/*
> + * Either one of memcg or lruvec can be NULL, but not both.
> + */
> +void __mod_memcg_lruvec_state(struct mem_cgroup *memcg, struct lruvec 
> *lruvec,
> +   enum node_stat_item idx, int val)
>  {
>   struct mem_cgroup_per_node *pn;
> - struct mem_cgroup *memcg;
>   long x, threshold = MEMCG_CHARGE_BATCH;
>  
> + /* Update lruvec */
>   pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> - memcg = pn->memcg;
> +
> + if (!memcg)
> + memcg = pn->memcg;
>  
>   /* Update memcg */
>   __mod_memcg_state(memcg, idx, val);
>  
> - /* Update lruvec */
> + if (!lruvec)
> + return;
> +
>   __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
>  
>   if (vmstat_item_in_bytes(idx))
> @@ -848,7 +855,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum 
> node_stat_item idx,
>  
>   /* Update memcg and lruvec */
>   if (!mem_cgroup_disabled())
> - __mod_memcg_lruvec_state(lruvec, idx, val);
> + __mod_memcg_lruvec_state(NULL, lruvec, idx, val);
>  }
>  
>  void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
> diff --git a/mm/slab.h b/mm/slab.h
> index 076582f58f68..bc6c7545e487 100644
> --- a/mm/slab.h
> +++ b/mm/slab.h
> @@ -293,7 +293,7 @@ static inline void mod_objcg_state(struct obj_cgroup 
> *objcg,
>   rcu_read_lock();
>   memcg = obj_cgroup_memcg(objcg);
>   lruvec = mem_cgroup_lruvec(memcg, pgdat);
> - mod_memcg_lruvec_state(lruvec, idx, nr);
> + mod_memcg_lruvec_state(memcg, lruvec, idx, nr);
>   rcu_read_unlock();
>  }
>  
> -- 
> 2.18.1
> 

Please feel free to add:

Tested-by: Masayoshi Mizuma 

Thanks!
Masa


Re: [PATCH v3 0/5] mm/memcg: Reduce kmemcache memory accounting overhead

2021-04-14 Thread Masayoshi Mizuma
On Tue, Apr 13, 2021 at 09:20:22PM -0400, Waiman Long wrote:
>  v3:
>   - Add missing "inline" qualifier to the alternate mod_obj_stock_state()
> in patch 3.
>   - Remove redundant current_obj_stock() call in patch 5.
> 
>  v2:
>   - Fix bug found by test robot in patch 5.
>   - Update cover letter and commit logs.
> 
> With the recent introduction of the new slab memory controller, we
> eliminate the need for having separate kmemcaches for each memory
> cgroup and reduce overall kernel memory usage. However, we also add
> additional memory accounting overhead to each call of kmem_cache_alloc()
> and kmem_cache_free().
> 
> For workloads that require a lot of kmemcache allocations and
> de-allocations, they may experience performance regression as illustrated
> in [1] and [2].
> 
> A simple kernel module that performs repeated loop of 100,000,000
> kmem_cache_alloc() and kmem_cache_free() of a 64-byte object at module
> init time is used for benchmarking. The test was run on a CascadeLake
> server with turbo-boosting disable to reduce run-to-run variation.
> 
> With memory accounting disable, the run time was 2.848s. With memory
> accounting enabled, the run times with the application of various
> patches in the patchset were:
> 
>   Applied patches   Run time   Accounting overhead   Overhead %age
>   ---      ---   -
>None  10.800s 7.952s  100.0%
> 1-2   9.140s 6.292s   79.1%
> 1-3   7.641s 4.793s   60.3%
> 1-5   6.801s 3.953s   49.7%
> 
> Note that this is the best case scenario where most updates happen only
> to the percpu stocks. Real workloads will likely have a certain amount
> of updates to the memcg charges and vmstats. So the performance benefit
> will be less.
> 
> It was found that a big part of the memory accounting overhead
> was caused by the local_irq_save()/local_irq_restore() sequences in
> updating local stock charge bytes and vmstat array, at least in x86
> systems. There are two such sequences in kmem_cache_alloc() and two
> in kmem_cache_free(). This patchset tries to reduce the use of such
> sequences as much as possible. In fact, it eliminates them in the common
> case. Another part of this patchset to cache the vmstat data update in
> the local stock as well which also helps.
> 
> [1] 
> https://lore.kernel.org/linux-mm/20210408193948.vfktg3azh2wrt56t@gabell/T/#u

Hi Longman,

Thank you for your patches.
I rerun the benchmark with your patches, it seems that the reduction
is small... The total durations of the sendto() and recvfrom() system calls
during the benchmark are as follows.

- sendto
  - v5.8 vanilla:  2576.056 msec (100%)
  - v5.12-rc7 vanilla: 2988.911 msec (116%)
  - v5.12-rc7 with your patches (1-5): 2984.307 msec (115%)

- recvfrom
  - v5.8 vanilla:  2113.156 msec (100%)
  - v5.12-rc7 vanilla: 2305.810 msec (109%)
  - v5.12-rc7 with your patches (1-5): 2287.351 msec (108%)

kmem_cache_alloc()/kmem_cache_free() are called around 1,400,000 times during
the benchmark. I ran a loop in a kernel module as following. The duration
is reduced by your patches actually.

  ---
  dummy_cache = KMEM_CACHE(dummy, SLAB_ACCOUNT);
  for (i = 0; i < 1400000; i++) {
p = kmem_cache_alloc(dummy_cache, GFP_KERNEL);
kmem_cache_free(dummy_cache, p);
  }
  ---

- v5.12-rc7 vanilla: 110 msec (100%)
- v5.12-rc7 with your patches (1-5):  85 msec (77%)

It seems that the reduction is small for the benchmark though...
Anyway, I can see your patches reduce the overhead.
Please feel free to add:

Tested-by: Masayoshi Mizuma 

Thanks!
Masa

> [2] https://lore.kernel.org/lkml/20210114025151.GA22932@xsang-OptiPlex-9020/
> 
> Waiman Long (5):
>   mm/memcg: Pass both memcg and lruvec to mod_memcg_lruvec_state()
>   mm/memcg: Introduce obj_cgroup_uncharge_mod_state()
>   mm/memcg: Cache vmstat data in percpu memcg_stock_pcp
>   mm/memcg: Separate out object stock data into its own struct
>   mm/memcg: Optimize user context object stock access
> 
>  include/linux/memcontrol.h |  14 ++-
>  mm/memcontrol.c| 199 -
>  mm/percpu.c|   9 +-
>  mm/slab.h  |  32 +++---
>  4 files changed, 196 insertions(+), 58 deletions(-)
> 
> -- 
> 2.18.1
> 


Re: [PATCH v6 08/10] perf: arm64: Add test for userspace counter access on heterogeneous systems

2021-03-15 Thread Masayoshi Mizuma
On Wed, Mar 10, 2021 at 05:08:35PM -0700, Rob Herring wrote:
> Userspace counter access only works on heterogeneous systems with some
> restrictions. The userspace process must be pinned to a homogeneous
> subset of CPUs and must open the corresponding PMU for those CPUs. This
> commit adds a test implementing these requirements.
> 
> Signed-off-by: Rob Herring 
> ---
> v6:
>  - Add a check on cap_user_rdpmc
> v5:
>  - Adapt to libperf mmap API changes
> v4:
>  - Update perf_evsel__mmap params
> v2:
>  - Drop all but heterogeneous test as others covered by libperf tests
>  - Rework to use libperf
> ---
>  tools/perf/arch/arm64/include/arch-tests.h |   7 +
>  tools/perf/arch/arm64/tests/Build  |   1 +
>  tools/perf/arch/arm64/tests/arch-tests.c   |   4 +
>  tools/perf/arch/arm64/tests/user-events.c  | 177 +
>  4 files changed, 189 insertions(+)
>  create mode 100644 tools/perf/arch/arm64/tests/user-events.c
> 
> diff --git a/tools/perf/arch/arm64/include/arch-tests.h 
> b/tools/perf/arch/arm64/include/arch-tests.h
> index 90ec4c8cb880..380ad34a3f09 100644
> --- a/tools/perf/arch/arm64/include/arch-tests.h
> +++ b/tools/perf/arch/arm64/include/arch-tests.h
> @@ -2,11 +2,18 @@
>  #ifndef ARCH_TESTS_H
>  #define ARCH_TESTS_H
>  
> +#include 
> +
>  #ifdef HAVE_DWARF_UNWIND_SUPPORT
>  struct thread;
>  struct perf_sample;
> +int test__arch_unwind_sample(struct perf_sample *sample,
> +  struct thread *thread);
>  #endif

I got the following compile error with aarch64:

# make tools/perf
...
  GEN  /root/linux/tools/perf/python/perf.so
  CC   /root/linux/tools/perf/arch/arm64/tests/user-events.o
  CC   /root/linux/tools/perf/arch/arm64/tests/arch-tests.o
In file included from arch/arm64/tests/arch-tests.c:4:
/root/linux/tools/perf/arch/arm64/include/arch-tests.h:10:5: error: 
redundant redeclaration of 'test__arch_unwind_sample' [-Werror=redundant-decls]
 int test__arch_unwind_sample(struct perf_sample *sample,
 ^~~~
In file included from arch/arm64/tests/arch-tests.c:3:
/root/linux/tools/perf/tests/tests.h:140:5: note: previous declaration of 
'test__arch_unwind_sample' was here
 int test__arch_unwind_sample(struct perf_sample *sample,
 ^~~~
cc1: all warnings being treated as errors
make[8]: *** [/root/linux/tools/build/Makefile.build:97: 
/root/linux/tools/perf/arch/arm64/tests/arch-tests.o] Error 1

That's because test__arch_unwind_sample() is in tools/perf/tests/tests.h as 
well.

tools/perf/tests/tests.h:
...
#if defined(__arm__) || defined(__aarch64__)
#ifdef HAVE_DWARF_UNWIND_SUPPORT
struct thread;
struct perf_sample;
int test__arch_unwind_sample(struct perf_sample *sample,
 struct thread *thread);
#endif
#endif

I'm not sure the best way to resolve the error, but the error is gone
with the following additional patch.
Could you take a look it?

diff --git a/tools/perf/arch/arm64/include/arch-tests.h 
b/tools/perf/arch/arm64/include/arch-tests.h
index ddfa7460e1e1..7ff2e29bdc1c 100644
--- a/tools/perf/arch/arm64/include/arch-tests.h
+++ b/tools/perf/arch/arm64/include/arch-tests.h
@@ -4,13 +4,6 @@
 
 #include 
 
-#ifdef HAVE_DWARF_UNWIND_SUPPORT
-struct thread;
-struct perf_sample;
-int test__arch_unwind_sample(struct perf_sample *sample,
-struct thread *thread);
-#endif
-
 extern struct test arch_tests[];
 int test__rd_pinned(struct test __maybe_unused *test,
   int __maybe_unused subtest);

Thanks!
Masa

>  
>  extern struct test arch_tests[];
> +int test__rd_pinned(struct test __maybe_unused *test,
> +int __maybe_unused subtest);
> +
>  
>  #endif
> diff --git a/tools/perf/arch/arm64/tests/Build 
> b/tools/perf/arch/arm64/tests/Build
> index a61c06bdb757..3f9a20c17fc6 100644
> --- a/tools/perf/arch/arm64/tests/Build
> +++ b/tools/perf/arch/arm64/tests/Build
> @@ -1,4 +1,5 @@
>  perf-y += regs_load.o
>  perf-$(CONFIG_DWARF_UNWIND) += dwarf-unwind.o
>  
> +perf-y += user-events.o
>  perf-y += arch-tests.o
> diff --git a/tools/perf/arch/arm64/tests/arch-tests.c 
> b/tools/perf/arch/arm64/tests/arch-tests.c
> index 5b1543c98022..80ce7bd3c16d 100644
> --- a/tools/perf/arch/arm64/tests/arch-tests.c
> +++ b/tools/perf/arch/arm64/tests/arch-tests.c
> @@ -10,6 +10,10 @@ struct test arch_tests[] = {
>   .func = test__dwarf_unwind,
>   },
>  #endif
> + {
> + .desc = "Pinned CPU user counter access",
> + .func = test__rd_pinned,
> + },
>   {
>   .func = NULL,
>   },
> diff --git a/tools/perf/arch/arm64/tests/user-events.c 
> b/tools/perf/arch/arm64/tests/user-events.c
> new file mode 100644
> index ..c8efc6b369e6
> --- /dev/null
> +++ b/tools/perf/arch/arm64/tests/user-events.c
> @@ -0,0 +1,177 @@
> +// SPDX-License-Identifier: GPL-2.0
> 

Re: [PATCH v9 2/2] perf vendor events arm64: Add Fujitsu A64FX pmu event

2021-03-09 Thread Masayoshi Mizuma
On Mon, Mar 08, 2021 at 07:53:41PM +0900, Shunsuke Nakamura wrote:
> Add pmu events for A64FX.
> 
> Documentation source:
> https://github.com/fujitsu/A64FX/blob/master/doc/A64FX_PMU_Events_v1.2.pdf

The PMU events described in above document seems to work well
with this patch!
Please feel free to add:

Tested-by: Masayoshi Mizuma 

Thanks!
Masa

> 
> Signed-off-by: Shunsuke Nakamura 
> Reviewed-by: John Garry 
> ---
>  .../arch/arm64/fujitsu/a64fx/branch.json  |   8 +
>  .../arch/arm64/fujitsu/a64fx/bus.json |  62 ++
>  .../arch/arm64/fujitsu/a64fx/cache.json   | 128 
>  .../arch/arm64/fujitsu/a64fx/cycle.json   |   5 +
>  .../arch/arm64/fujitsu/a64fx/exception.json   |  29 +++
>  .../arch/arm64/fujitsu/a64fx/instruction.json | 131 
>  .../arch/arm64/fujitsu/a64fx/memory.json  |   8 +
>  .../arch/arm64/fujitsu/a64fx/other.json   | 188 +
>  .../arch/arm64/fujitsu/a64fx/pipeline.json| 194 ++
>  .../arch/arm64/fujitsu/a64fx/sve.json | 110 ++
>  tools/perf/pmu-events/arch/arm64/mapfile.csv  |   1 +
>  11 files changed, 864 insertions(+)
>  create mode 100644 tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/branch.json
>  create mode 100644 tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/bus.json
>  create mode 100644 tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cache.json
>  create mode 100644 tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/cycle.json
>  create mode 100644 
> tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/exception.json
>  create mode 100644 
> tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/instruction.json
>  create mode 100644 tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/memory.json
>  create mode 100644 tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/other.json
>  create mode 100644 
> tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/pipeline.json
>  create mode 100644 tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/sve.json
> 
> diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/branch.json 
> b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/branch.json
> new file mode 100644
> index ..b011af11bf94
> --- /dev/null
> +++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/branch.json
> @@ -0,0 +1,8 @@
> +[
> +  {
> +"ArchStdEvent": "BR_MIS_PRED"
> +  },
> +  {
> +"ArchStdEvent": "BR_PRED"
> +  }
> +]
> diff --git a/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/bus.json 
> b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/bus.json
> new file mode 100644
> index ..084e88d7df73
> --- /dev/null
> +++ b/tools/perf/pmu-events/arch/arm64/fujitsu/a64fx/bus.json
> @@ -0,0 +1,62 @@
> +[
> +  {
> +"PublicDescription": "This event counts read transactions from tofu 
> controller to measured CMG.",
> +"EventCode": "0x314",
> +"EventName": "BUS_READ_TOTAL_TOFU",
> +"BriefDescription": "This event counts read transactions from tofu 
> controller to measured CMG."
> +  },
> +  {
> +"PublicDescription": "This event counts read transactions from PCI 
> controller to measured CMG.",
> +"EventCode": "0x315",
> +"EventName": "BUS_READ_TOTAL_PCI",
> +"BriefDescription": "This event counts read transactions from PCI 
> controller to measured CMG."
> +  },
> +  {
> +"PublicDescription": "This event counts read transactions from measured 
> CMG local memory to measured CMG.",
> +"EventCode": "0x316",
> +"EventName": "BUS_READ_TOTAL_MEM",
> +"BriefDescription": "This event counts read transactions from measured 
> CMG local memory to measured CMG."
> +  },
> +  {
> +"PublicDescription": "This event counts write transactions from measured 
> CMG to CMG0, if measured CMG is not CMG0.",
> +"EventCode": "0x318",
> +"EventName": "BUS_WRITE_TOTAL_CMG0",
> +"BriefDescription": "This event counts write transactions from measured 
> CMG to CMG0, if measured CMG is not CMG0."
> +  },
> +  {
> +"PublicDescription": "This event counts write transactions from measured 
> CMG to CMG1, if measured CMG is not CMG1.",
> +"EventCode": "0x319",
> +"EventName": "BUS_WRITE_TOTAL_CMG1",
> +"BriefDescription": "This event counts write transactions from measured 
> CMG to CMG1, if measured CMG is not CMG1

Re: [RFC PATCH v3 1/2] topology: Represent clusters of CPUs within a die.

2021-02-09 Thread Masayoshi Mizuma
On Wed, Jan 06, 2021 at 09:30:25PM +1300, Barry Song wrote:
> From: Jonathan Cameron 
> 
> Both ACPI and DT provide the ability to describe additional layers of
> topology between that of individual cores and higher level constructs
> such as the level at which the last level cache is shared.
> In ACPI this can be represented in PPTT as a Processor Hierarchy
> Node Structure [1] that is the parent of the CPU cores and in turn
> has a parent Processor Hierarchy Nodes Structure representing
> a higher level of topology.
> 
> For example Kunpeng 920 has 6 clusters in each NUMA node, and each
> cluster has 4 cpus. All clusters share L3 cache data, but each cluster
> has local L3 tag. On the other hand, each clusters will share some
> internal system bus.
> 
> +---+  +-+
> |  +--++--++---+ |
> |  | CPU0 || cpu1 | |+---+ | |
> |  +--++--+ ||   | | |
> |   ++L3 | | |
> |  +--++--+   cluster   ||tag| | |
> |  | CPU2 || CPU3 | ||   | | |
> |  +--++--+ |+---+ | |
> |   |  | |
> +---+  | |
> +---+  | |
> |  +--++--+ +--+ |
> |  |  ||  | |+---+ | |
> |  +--++--+ ||   | | |
> |   ||L3 | | |
> |  +--++--+ ++tag| | |
> |  |  ||  | ||   | | |
> |  +--++--+ |+---+ | |
> |   |  | |
> +---+  |   L3|
>|   data  |
> +---+  | |
> |  +--++--+ |+---+ | |
> |  |  ||  | ||   | | |
> |  +--++--+ ++L3 | | |
> |   ||tag| | |
> |  +--++--+ ||   | | |
> |  |  ||  |+++---+ | |
> |  +--++--+|---+ |
> +---|  | |
> +---|  | |
> |  +--++--++---+ |
> |  |  ||  | |+---+ | |
> |  +--++--+ ||   | | |
> |   ++L3 | | |
> |  +--++--+ ||tag| | |
> |  |  ||  | ||   | | |
> |  +--++--+ |+---+ | |
> |   |  | |
> +---+  | |
> +---+  | |
> |  +--++--+ +--+ |
> |  |  ||  | |   +---+  | |
> |  +--++--+ |   |   |  | |
> |   |   |L3 |  | |
> |  +--++--+ +---+tag|  | |
> |  |  ||  | |   |   |  | |
> |  +--++--+ |   +---+  | |
> |   |  | |
> +---+  | |
> +---+ ++ |
> |  +--++--+ +--+ |
> |  |  ||  | |  +---+   | |
> |  +--++--+ |  |   |   | |
> |   |  |L3 |   | |

Re: [PATCH 1/1] ktest.pl: Fix incorrect reboot for grub2bls

2020-11-29 Thread Masayoshi Mizuma
On Fri, Nov 20, 2020 at 06:12:43PM -0800, Libo Chen wrote:
> This issue was first noticed when I was testing different kernels on
> Oracle Linux 8 which as Fedora 30+ adopts BLS as default. Even though a
> kernel entry was added successfully and the index of that kernel entry was
> retrieved correctly, ktest still wouldn't reboot the system into
> user-specified kernel.
> 
> The bug was spotted in subroutine reboot_to where the if-statement never
> checks for REBOOT_TYPE "grub2bls", therefore the desired entry will not be
> set for the next boot.
> 
> Add a check for "grub2bls" so that $grub_reboot $grub_number can
> be run before a reboot if REBOOT_TYPE is "grub2bls" then we can boot to
> the correct kernel.
> 
> Fixes: ac2466456eaa ("ktest: introduce grub2bls REBOOT_TYPE option")
> 
> Signed-off-by: Libo Chen 

Thank you for the fix!
I tested the patch with fedora33. It works well.

Please feel free to add:

Tested-by: Masayoshi Mizuma 

Thanks!
Masa

> ---
>  tools/testing/ktest/ktest.pl | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
> index cb16d2aac51c..54188ee16c48 100755
> --- a/tools/testing/ktest/ktest.pl
> +++ b/tools/testing/ktest/ktest.pl
> @@ -2040,7 +2040,7 @@ sub reboot_to {
>  
>  if ($reboot_type eq "grub") {
>   run_ssh "'(echo \"savedefault --default=$grub_number --once\" | grub 
> --batch)'";
> -} elsif ($reboot_type eq "grub2") {
> +} elsif (($reboot_type eq "grub2") or ($reboot_type eq "grub2bls")) {
>   run_ssh "$grub_reboot $grub_number";
>  } elsif ($reboot_type eq "syslinux") {
>   run_ssh "$syslinux --once \\\"$syslinux_label\\\" $syslinux_path";
> -- 
> 2.27.0
> 


Re: [PATCH v2 1/3] irqchip/gic-v3: Enable support for SGIs to act as NMIs

2020-11-16 Thread Masayoshi Mizuma
Hi Yuichi-san,

This patch is under review here:
https://lore.kernel.org/linux-arm-kernel/1604317487-14543-3-git-send-email-sumit.g...@linaro.org/

So, it would be great if you could send your feedback to the
thread; testing, code review.

Thanks,
Masa

On Wed, Nov 04, 2020 at 05:05:37PM +0900, Yuichi Ito wrote:
> From: From: Sumit Garg 
> 
> Add support to handle SGIs as pseudo NMIs. As SGIs or IPIs default to a
> special flow handler: handle_percpu_devid_fasteoi_ipi(), so skip NMI
> handler update in case of SGIs.
> 
> Also, enable NMI support prior to gic_smp_init() as allocation of SGIs
> as IRQs/NMIs happen as part of this routine.
> 
> Signed-off-by: Sumit Garg 
> ---
>  drivers/irqchip/irq-gic-v3.c | 29 +
>  1 file changed, 21 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
> index 16fecc0..7010ae2 100644
> --- a/drivers/irqchip/irq-gic-v3.c
> +++ b/drivers/irqchip/irq-gic-v3.c
> @@ -461,6 +461,7 @@ static u32 gic_get_ppi_index(struct irq_data *d)
>  static int gic_irq_nmi_setup(struct irq_data *d)
>  {
>   struct irq_desc *desc = irq_to_desc(d->irq);
> + u32 idx;
>  
>   if (!gic_supports_nmi())
>   return -EINVAL;
> @@ -478,16 +479,22 @@ static int gic_irq_nmi_setup(struct irq_data *d)
>   return -EINVAL;
>  
>   /* desc lock should already be held */
> - if (gic_irq_in_rdist(d)) {
> - u32 idx = gic_get_ppi_index(d);
> + switch (get_intid_range(d)) {
> + case SGI_RANGE:
> + break;
> + case PPI_RANGE:
> + case EPPI_RANGE:
> + idx = gic_get_ppi_index(d);
>  
>   /* Setting up PPI as NMI, only switch handler for first NMI */
>   if (!refcount_inc_not_zero(_nmi_refs[idx])) {
>   refcount_set(_nmi_refs[idx], 1);
>   desc->handle_irq = handle_percpu_devid_fasteoi_nmi;
>   }
> - } else {
> + break;
> + default:
>   desc->handle_irq = handle_fasteoi_nmi;
> + break;
>   }
>  
>   gic_irq_set_prio(d, GICD_INT_NMI_PRI);
> @@ -498,6 +505,7 @@ static int gic_irq_nmi_setup(struct irq_data *d)
>  static void gic_irq_nmi_teardown(struct irq_data *d)
>  {
>   struct irq_desc *desc = irq_to_desc(d->irq);
> + u32 idx;
>  
>   if (WARN_ON(!gic_supports_nmi()))
>   return;
> @@ -515,14 +523,20 @@ static void gic_irq_nmi_teardown(struct irq_data *d)
>   return;
>  
>   /* desc lock should already be held */
> - if (gic_irq_in_rdist(d)) {
> - u32 idx = gic_get_ppi_index(d);
> + switch (get_intid_range(d)) {
> + case SGI_RANGE:
> + break;
> + case PPI_RANGE:
> + case EPPI_RANGE:
> + idx = gic_get_ppi_index(d);
>  
>   /* Tearing down NMI, only switch handler for last NMI */
>   if (refcount_dec_and_test(_nmi_refs[idx]))
>   desc->handle_irq = handle_percpu_devid_irq;
> - } else {
> + break;
> + default:
>   desc->handle_irq = handle_fasteoi_irq;
> + break;
>   }
>  
>   gic_irq_set_prio(d, GICD_INT_DEF_PRI);
> @@ -1708,6 +1722,7 @@ static int __init gic_init_bases(void __iomem 
> *dist_base,
>  
>   gic_dist_init();
>   gic_cpu_init();
> + gic_enable_nmi_support();
>   gic_smp_init();
>   gic_cpu_pm_init();
>  
> @@ -1719,8 +1734,6 @@ static int __init gic_init_bases(void __iomem 
> *dist_base,
>   gicv2m_init(handle, gic_data.domain);
>   }
>  
> - gic_enable_nmi_support();
> -
>   return 0;
>  
>  out_free:
> -- 
> 1.8.3.1
> 
> 
> ___
> linux-arm-kernel mailing list
> linux-arm-ker...@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel


Re: [PATCH v7 5/7] arm64: ipi_nmi: Add support for NMI backtrace

2020-11-02 Thread Masayoshi Mizuma
On Mon, Nov 02, 2020 at 05:14:45PM +0530, Sumit Garg wrote:
> Enable NMI backtrace support on arm64 using IPI turned as an NMI
> leveraging pseudo NMIs support. It is now possible for users to get a
> backtrace of a CPU stuck in hard-lockup using magic SYSRQ.
> 
> Signed-off-by: Sumit Garg 
> ---
>  arch/arm64/include/asm/irq.h |  6 ++
>  arch/arm64/kernel/ipi_nmi.c  | 18 --
>  2 files changed, 22 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h
> index b2b0c64..ef018a8 100644
> --- a/arch/arm64/include/asm/irq.h
> +++ b/arch/arm64/include/asm/irq.h
> @@ -6,6 +6,12 @@
>  
>  #include 
>  
> +#ifdef CONFIG_SMP
> +extern bool arch_trigger_cpumask_backtrace(const cpumask_t *mask,
> +bool exclude_self);
> +#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
> +#endif
> +
>  struct pt_regs;
>  
>  static inline int nr_legacy_irqs(void)
> diff --git a/arch/arm64/kernel/ipi_nmi.c b/arch/arm64/kernel/ipi_nmi.c
> index a945dcf..597dcf7 100644
> --- a/arch/arm64/kernel/ipi_nmi.c
> +++ b/arch/arm64/kernel/ipi_nmi.c
> @@ -8,6 +8,7 @@
>  
>  #include 
>  #include 
> +#include 
>  #include 
>  
>  #include 
> @@ -31,11 +32,24 @@ void arm64_send_nmi(cpumask_t *mask)
>   __ipi_send_mask(ipi_nmi_desc, mask);
>  }
>  
> +bool arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
> +{
> + if (!ipi_nmi_desc)
> + return false;
> +
> + nmi_trigger_cpumask_backtrace(mask, exclude_self, arm64_send_nmi);
> +
> + return true;
> +}
> +
>  static irqreturn_t ipi_nmi_handler(int irq, void *data)
>  {
> - /* nop, NMI handlers for special features can be added here. */
> + irqreturn_t ret = IRQ_NONE;
> +
> + if (nmi_cpu_backtrace(get_irq_regs()))
> + ret = IRQ_HANDLED;
>  
> - return IRQ_NONE;
> + return ret;
>  }
>  
>  void dynamic_ipi_setup(int cpu)
> -- 

It works well with sysrq l trigger.
Please feel free to add:

Tested-by: Masayoshi Mizuma 

Thanks!
Masa


Re: [PATCH v7 3/7] arm64: smp: Assign and setup an IPI as NMI

2020-11-02 Thread Masayoshi Mizuma
On Mon, Nov 02, 2020 at 05:14:43PM +0530, Sumit Garg wrote:
> Assign an unused IPI which can be turned as NMI using ipi_nmi framework.
> Also, invoke corresponding dynamic IPI setup/teardown APIs.
> 
> Signed-off-by: Sumit Garg 
> ---
>  arch/arm64/kernel/smp.c | 8 
>  1 file changed, 8 insertions(+)
> 
> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> index 82e75fc..2e118e2 100644
> --- a/arch/arm64/kernel/smp.c
> +++ b/arch/arm64/kernel/smp.c
> @@ -43,6 +43,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -962,6 +963,8 @@ static void ipi_setup(int cpu)
>  
>   for (i = 0; i < nr_ipi; i++)
>   enable_percpu_irq(ipi_irq_base + i, 0);
> +
> + dynamic_ipi_setup(cpu);
>  }
>  
>  #ifdef CONFIG_HOTPLUG_CPU
> @@ -974,6 +977,8 @@ static void ipi_teardown(int cpu)
>  
>   for (i = 0; i < nr_ipi; i++)
>   disable_percpu_irq(ipi_irq_base + i);
> +
> + dynamic_ipi_teardown(cpu);
>  }
>  #endif
>  
> @@ -995,6 +1000,9 @@ void __init set_smp_ipi_range(int ipi_base, int n)
>   irq_set_status_flags(ipi_base + i, IRQ_HIDDEN);
>   }
>  
> + if (n > nr_ipi)
> + set_smp_dynamic_ipi(ipi_base + nr_ipi);
> +
>   ipi_irq_base = ipi_base;
>  
>   /* Setup the boot CPU immediately */
> -- 

Looks good to me. Please feel free to add:

Reviewed-by: Masayoshi Mizuma 

Thanks!
Masa


Re: [PATCH v7 2/7] irqchip/gic-v3: Enable support for SGIs to act as NMIs

2020-11-02 Thread Masayoshi Mizuma
On Mon, Nov 02, 2020 at 05:14:42PM +0530, Sumit Garg wrote:
> Add support to handle SGIs as pseudo NMIs. As SGIs or IPIs default to a
> special flow handler: handle_percpu_devid_fasteoi_ipi(), so skip NMI
> handler update in case of SGIs.
> 
> Also, enable NMI support prior to gic_smp_init() as allocation of SGIs
> as IRQs/NMIs happen as part of this routine.
> 
> Signed-off-by: Sumit Garg 
> ---
>  drivers/irqchip/irq-gic-v3.c | 29 +
>  1 file changed, 21 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
> index 16fecc0..7010ae2 100644
> --- a/drivers/irqchip/irq-gic-v3.c
> +++ b/drivers/irqchip/irq-gic-v3.c
> @@ -461,6 +461,7 @@ static u32 gic_get_ppi_index(struct irq_data *d)
>  static int gic_irq_nmi_setup(struct irq_data *d)
>  {
>   struct irq_desc *desc = irq_to_desc(d->irq);
> + u32 idx;
>  
>   if (!gic_supports_nmi())
>   return -EINVAL;
> @@ -478,16 +479,22 @@ static int gic_irq_nmi_setup(struct irq_data *d)
>   return -EINVAL;
>  
>   /* desc lock should already be held */
> - if (gic_irq_in_rdist(d)) {
> - u32 idx = gic_get_ppi_index(d);
> + switch (get_intid_range(d)) {
> + case SGI_RANGE:
> + break;
> + case PPI_RANGE:
> + case EPPI_RANGE:
> + idx = gic_get_ppi_index(d);
>  
>   /* Setting up PPI as NMI, only switch handler for first NMI */
>   if (!refcount_inc_not_zero(_nmi_refs[idx])) {
>   refcount_set(_nmi_refs[idx], 1);
>   desc->handle_irq = handle_percpu_devid_fasteoi_nmi;
>   }
> - } else {
> + break;
> + default:
>   desc->handle_irq = handle_fasteoi_nmi;
> + break;
>   }
>  
>   gic_irq_set_prio(d, GICD_INT_NMI_PRI);
> @@ -498,6 +505,7 @@ static int gic_irq_nmi_setup(struct irq_data *d)
>  static void gic_irq_nmi_teardown(struct irq_data *d)
>  {
>   struct irq_desc *desc = irq_to_desc(d->irq);
> + u32 idx;
>  
>   if (WARN_ON(!gic_supports_nmi()))
>   return;
> @@ -515,14 +523,20 @@ static void gic_irq_nmi_teardown(struct irq_data *d)
>   return;
>  
>   /* desc lock should already be held */
> - if (gic_irq_in_rdist(d)) {
> - u32 idx = gic_get_ppi_index(d);
> + switch (get_intid_range(d)) {
> + case SGI_RANGE:
> + break;
> + case PPI_RANGE:
> + case EPPI_RANGE:
> + idx = gic_get_ppi_index(d);
>  
>   /* Tearing down NMI, only switch handler for last NMI */
>   if (refcount_dec_and_test(_nmi_refs[idx]))
>   desc->handle_irq = handle_percpu_devid_irq;
> - } else {
> + break;
> + default:
>   desc->handle_irq = handle_fasteoi_irq;
> + break;
>   }
>  
>   gic_irq_set_prio(d, GICD_INT_DEF_PRI);
> @@ -1708,6 +1722,7 @@ static int __init gic_init_bases(void __iomem 
> *dist_base,
>  
>   gic_dist_init();
>   gic_cpu_init();
> + gic_enable_nmi_support();
>   gic_smp_init();
>   gic_cpu_pm_init();
>  
> @@ -1719,8 +1734,6 @@ static int __init gic_init_bases(void __iomem 
> *dist_base,
>   gicv2m_init(handle, gic_data.domain);
>   }
>  
> - gic_enable_nmi_support();
> -
>   return 0;
>  
>  out_free:
> -- 

Looks good to me. Please feel free to add:

Reviewed-by: Masayoshi Mizuma 

Thanks!
Masa


Re: [PATCH v7 1/7] arm64: Add framework to turn IPI as NMI

2020-11-02 Thread Masayoshi Mizuma
On Mon, Nov 02, 2020 at 05:14:41PM +0530, Sumit Garg wrote:
> Introduce framework to turn an IPI as NMI using pseudo NMIs. The main
> motivation for this feature is to have an IPI that can be leveraged to
> invoke NMI functions on other CPUs.
> 
> And current prospective users are NMI backtrace and KGDB CPUs round-up
> whose support is added via future patches.
> 
> Signed-off-by: Sumit Garg 
> ---
>  arch/arm64/include/asm/nmi.h | 17 
>  arch/arm64/kernel/Makefile   |  2 +-
>  arch/arm64/kernel/ipi_nmi.c  | 65 
> 
>  3 files changed, 83 insertions(+), 1 deletion(-)
>  create mode 100644 arch/arm64/include/asm/nmi.h
>  create mode 100644 arch/arm64/kernel/ipi_nmi.c
> 
> diff --git a/arch/arm64/include/asm/nmi.h b/arch/arm64/include/asm/nmi.h
> new file mode 100644
> index 000..4cd14b6
> --- /dev/null
> +++ b/arch/arm64/include/asm/nmi.h
> @@ -0,0 +1,17 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __ASM_NMI_H
> +#define __ASM_NMI_H
> +
> +#ifndef __ASSEMBLER__
> +
> +#include 
> +
> +extern bool arm64_supports_nmi(void);
> +extern void arm64_send_nmi(cpumask_t *mask);
> +
> +void set_smp_dynamic_ipi(int ipi);
> +void dynamic_ipi_setup(int cpu);
> +void dynamic_ipi_teardown(int cpu);
> +
> +#endif /* !__ASSEMBLER__ */
> +#endif
> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
> index bbaf0bc..525a1e0 100644
> --- a/arch/arm64/kernel/Makefile
> +++ b/arch/arm64/kernel/Makefile
> @@ -17,7 +17,7 @@ obj-y   := debug-monitors.o entry.o 
> irq.o fpsimd.o  \
>  return_address.o cpuinfo.o cpu_errata.o  
> \
>  cpufeature.o alternative.o cacheinfo.o   
> \
>  smp.o smp_spin_table.o topology.o smccc-call.o   
> \
> -syscall.o proton-pack.o
> +syscall.o proton-pack.o ipi_nmi.o
>  
>  targets  += efi-entry.o
>  
> diff --git a/arch/arm64/kernel/ipi_nmi.c b/arch/arm64/kernel/ipi_nmi.c
> new file mode 100644
> index 000..a945dcf
> --- /dev/null
> +++ b/arch/arm64/kernel/ipi_nmi.c
> @@ -0,0 +1,65 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * NMI support for IPIs
> + *
> + * Copyright (C) 2020 Linaro Limited
> + * Author: Sumit Garg 
> + */
> +
> +#include 
> +#include 
> +#include 
> +
> +#include 
> +
> +static struct irq_desc *ipi_nmi_desc __read_mostly;
> +static int ipi_nmi_id __read_mostly;
> +
> +bool arm64_supports_nmi(void)
> +{
> + if (ipi_nmi_desc)
> + return true;
> +
> + return false;
> +}
> +
> +void arm64_send_nmi(cpumask_t *mask)
> +{
> + if (WARN_ON_ONCE(!ipi_nmi_desc))
> + return;
> +
> + __ipi_send_mask(ipi_nmi_desc, mask);
> +}
> +
> +static irqreturn_t ipi_nmi_handler(int irq, void *data)
> +{
> + /* nop, NMI handlers for special features can be added here. */
> +
> + return IRQ_NONE;
> +}
> +

> +void dynamic_ipi_setup(int cpu)

cpu isn't used here, so void is better?

void dynamic_ipi_setup(void)

> +{
> + if (!ipi_nmi_desc)
> + return;
> +
> + if (!prepare_percpu_nmi(ipi_nmi_id))
> + enable_percpu_nmi(ipi_nmi_id, IRQ_TYPE_NONE);
> +}
> +

> +void dynamic_ipi_teardown(int cpu)

Same as here:
void dynamic_ipi_teardown(void)

> +{
> + if (!ipi_nmi_desc)
> + return;
> +
> + disable_percpu_nmi(ipi_nmi_id);
> +     teardown_percpu_nmi(ipi_nmi_id);
> +}
> +
> +void __init set_smp_dynamic_ipi(int ipi)
> +{
> + if (!request_percpu_nmi(ipi, ipi_nmi_handler, "IPI", _number)) {
> + ipi_nmi_desc = irq_to_desc(ipi);
> + ipi_nmi_id = ipi;
> + }
> +}
> -- 

Otherwise, looks good to me. Please feel free to add:

Reviewed-by: Masayoshi Mizuma 

Thanks,
Masa


Re: [PATCH v5 1/5] arm64: Add framework to turn IPI as NMI

2020-10-14 Thread Masayoshi Mizuma
On Wed, Oct 14, 2020 at 04:42:07PM +0530, Sumit Garg wrote:
> Introduce framework to turn an IPI as NMI using pseudo NMIs. In case a
> particular platform doesn't support pseudo NMIs, then request IPI as a
> regular IRQ.
> 
> The main motivation for this feature is to have an IPI that can be
> leveraged to invoke NMI functions on other CPUs. And current prospective
> users are NMI backtrace and KGDB CPUs round-up whose support is added
> via future patches.
> 
> Signed-off-by: Sumit Garg 
> ---
>  arch/arm64/include/asm/nmi.h | 16 +
>  arch/arm64/kernel/Makefile   |  2 +-
>  arch/arm64/kernel/ipi_nmi.c  | 77 
> 
>  3 files changed, 94 insertions(+), 1 deletion(-)
>  create mode 100644 arch/arm64/include/asm/nmi.h
>  create mode 100644 arch/arm64/kernel/ipi_nmi.c
> 
> diff --git a/arch/arm64/include/asm/nmi.h b/arch/arm64/include/asm/nmi.h
> new file mode 100644
> index 000..3433c55
> --- /dev/null
> +++ b/arch/arm64/include/asm/nmi.h
> @@ -0,0 +1,16 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __ASM_NMI_H
> +#define __ASM_NMI_H
> +
> +#ifndef __ASSEMBLER__
> +
> +#include 
> +
> +extern void arch_send_call_nmi_func_ipi_mask(cpumask_t *mask);
> +
> +void set_smp_ipi_nmi(int ipi);
> +void ipi_nmi_setup(int cpu);
> +void ipi_nmi_teardown(int cpu);
> +
> +#endif /* !__ASSEMBLER__ */
> +#endif
> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
> index bbaf0bc..525a1e0 100644
> --- a/arch/arm64/kernel/Makefile
> +++ b/arch/arm64/kernel/Makefile
> @@ -17,7 +17,7 @@ obj-y   := debug-monitors.o entry.o 
> irq.o fpsimd.o  \
>  return_address.o cpuinfo.o cpu_errata.o  
> \
>  cpufeature.o alternative.o cacheinfo.o   
> \
>  smp.o smp_spin_table.o topology.o smccc-call.o   
> \
> -syscall.o proton-pack.o
> +syscall.o proton-pack.o ipi_nmi.o
>  
>  targets  += efi-entry.o
>  
> diff --git a/arch/arm64/kernel/ipi_nmi.c b/arch/arm64/kernel/ipi_nmi.c
> new file mode 100644
> index 000..a959256
> --- /dev/null
> +++ b/arch/arm64/kernel/ipi_nmi.c
> @@ -0,0 +1,77 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * NMI support for IPIs
> + *
> + * Copyright (C) 2020 Linaro Limited
> + * Author: Sumit Garg 
> + */
> +
> +#include 
> +#include 
> +#include 
> +
> +#include 
> +
> +static struct irq_desc *ipi_desc __read_mostly;
> +static int ipi_id __read_mostly;
> +static bool is_nmi __read_mostly;
> +
> +void arch_send_call_nmi_func_ipi_mask(cpumask_t *mask)
> +{
> + if (WARN_ON_ONCE(!ipi_desc))
> + return;
> +
> + __ipi_send_mask(ipi_desc, mask);
> +}
> +
> +static irqreturn_t ipi_nmi_handler(int irq, void *data)
> +{
> + /* nop, NMI handlers for special features can be added here. */
> +
> + return IRQ_HANDLED;
> +}
> +
> +void ipi_nmi_setup(int cpu)
> +{
> + if (!ipi_desc)
> + return;
> +
> + if (is_nmi) {
> + if (!prepare_percpu_nmi(ipi_id))
> + enable_percpu_nmi(ipi_id, IRQ_TYPE_NONE);
> + } else {
> + enable_percpu_irq(ipi_id, IRQ_TYPE_NONE);
> + }
> +}
> +
> +void ipi_nmi_teardown(int cpu)
> +{
> + if (!ipi_desc)
> + return;
> +
> + if (is_nmi) {
> + disable_percpu_nmi(ipi_id);
> + teardown_percpu_nmi(ipi_id);
> + } else {
> + disable_percpu_irq(ipi_id);
> + }
> +}
> +
> +void __init set_smp_ipi_nmi(int ipi)
> +{
> + int err;
> +
> + err = request_percpu_nmi(ipi, ipi_nmi_handler, "IPI", _number);
> + if (err) {
> + err = request_percpu_irq(ipi, ipi_nmi_handler, "IPI",
> +  _number);
> + WARN_ON(err);
> + is_nmi = false;
> + } else {
> + is_nmi = true;
> + }
> +
> + ipi_desc = irq_to_desc(ipi);
> + irq_set_status_flags(ipi, IRQ_HIDDEN);
> + ipi_id = ipi;
> +}
> -- 

Looks good to me. Please feel free to add:

Reviewed-by: Masayoshi Mizuma 

Thanks!
Masa


Re: [PATCH v5 2/5] irqchip/gic-v3: Enable support for SGIs to act as NMIs

2020-10-14 Thread Masayoshi Mizuma
On Wed, Oct 14, 2020 at 04:42:08PM +0530, Sumit Garg wrote:
> Add support to handle SGIs as regular NMIs. As SGIs or IPIs default to a
> special flow handler: handle_percpu_devid_fasteoi_ipi(), so skip NMI
> handler update in case of SGIs.
> 
> Also, enable NMI support prior to gic_smp_init() as allocation of SGIs
> as IRQs/NMIs happen as part of this routine.
> 
> Signed-off-by: Sumit Garg 
> ---
>  drivers/irqchip/irq-gic-v3.c | 13 +++--
>  1 file changed, 11 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
> index 16fecc0..5efc865 100644
> --- a/drivers/irqchip/irq-gic-v3.c
> +++ b/drivers/irqchip/irq-gic-v3.c
> @@ -477,6 +477,11 @@ static int gic_irq_nmi_setup(struct irq_data *d)
>   if (WARN_ON(gic_irq(d) >= 8192))
>   return -EINVAL;
>  
> + if (get_intid_range(d) == SGI_RANGE) {
> + gic_irq_set_prio(d, GICD_INT_NMI_PRI);
> + return 0;
> + }
> +
>   /* desc lock should already be held */
>   if (gic_irq_in_rdist(d)) {
>   u32 idx = gic_get_ppi_index(d);
> @@ -514,6 +519,11 @@ static void gic_irq_nmi_teardown(struct irq_data *d)
>   if (WARN_ON(gic_irq(d) >= 8192))
>   return;
>  
> + if (get_intid_range(d) == SGI_RANGE) {
> + gic_irq_set_prio(d, GICD_INT_DEF_PRI);
> + return;
> + }
> +
>   /* desc lock should already be held */
>   if (gic_irq_in_rdist(d)) {
>   u32 idx = gic_get_ppi_index(d);
> @@ -1708,6 +1718,7 @@ static int __init gic_init_bases(void __iomem 
> *dist_base,
>  
>   gic_dist_init();
>   gic_cpu_init();
> + gic_enable_nmi_support();
>   gic_smp_init();
>   gic_cpu_pm_init();
>  
> @@ -1719,8 +1730,6 @@ static int __init gic_init_bases(void __iomem 
> *dist_base,
>   gicv2m_init(handle, gic_data.domain);
>   }
>  
> - gic_enable_nmi_support();
> -
>   return 0;
>  
>  out_free:
> -- 

Looks good to me. Please feel free to add:

Reviewed-by: Masayoshi Mizuma 

Thanks!
Masa


Re: [PATCH v5 5/5] arm64: ipi_nmi: Add support for NMI backtrace

2020-10-14 Thread Masayoshi Mizuma
On Wed, Oct 14, 2020 at 04:42:11PM +0530, Sumit Garg wrote:
> Enable NMI backtrace support on arm64 using IPI turned as an NMI
> leveraging pseudo NMIs support. It is now possible for users to get a
> backtrace of a CPU stuck in hard-lockup using magic SYSRQ.
> 
> Signed-off-by: Sumit Garg 
> ---
>  arch/arm64/include/asm/irq.h |  6 ++
>  arch/arm64/kernel/ipi_nmi.c  | 12 +++-
>  2 files changed, 17 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h
> index b2b0c64..e840bf1 100644
> --- a/arch/arm64/include/asm/irq.h
> +++ b/arch/arm64/include/asm/irq.h
> @@ -6,6 +6,12 @@
>  
>  #include 
>  
> +#ifdef CONFIG_SMP
> +extern void arch_trigger_cpumask_backtrace(const cpumask_t *mask,
> +bool exclude_self);
> +#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
> +#endif
> +
>  struct pt_regs;
>  
>  static inline int nr_legacy_irqs(void)
> diff --git a/arch/arm64/kernel/ipi_nmi.c b/arch/arm64/kernel/ipi_nmi.c
> index e0a9e03..e1dc19d 100644
> --- a/arch/arm64/kernel/ipi_nmi.c
> +++ b/arch/arm64/kernel/ipi_nmi.c
> @@ -9,6 +9,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  
>  #include 
> @@ -25,12 +26,21 @@ void arch_send_call_nmi_func_ipi_mask(cpumask_t *mask)
>   __ipi_send_mask(ipi_desc, mask);
>  }
>  
> +void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
> +{
> + nmi_trigger_cpumask_backtrace(mask, exclude_self,
> +   arch_send_call_nmi_func_ipi_mask);
> +}
> +
>  static irqreturn_t ipi_nmi_handler(int irq, void *data)
>  {
>   unsigned int cpu = smp_processor_id();
>  
> - ipi_kgdb_nmicallback(cpu, get_irq_regs());
> + if (nmi_cpu_backtrace(get_irq_regs()))
> + goto out;
>  
> +     ipi_kgdb_nmicallback(cpu, get_irq_regs());
> +out:
>   return IRQ_HANDLED;
>  }
>  
> -- 

It works well. Please feel free to add:

Tested-by: Masayoshi Mizuma 

Thanks!
Masa


Re: [PATCH v5 3/5] arm64: smp: Allocate and setup IPI as NMI

2020-10-14 Thread Masayoshi Mizuma
On Wed, Oct 14, 2020 at 04:42:09PM +0530, Sumit Garg wrote:
> Allocate an unused IPI that can be turned as NMI using ipi_nmi framework.
> Also, invoke corresponding NMI setup/teardown APIs.
> 
> Signed-off-by: Sumit Garg 
> ---
>  arch/arm64/kernel/smp.c | 8 
>  1 file changed, 8 insertions(+)
> 
> diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
> index 82e75fc..129ebfb 100644
> --- a/arch/arm64/kernel/smp.c
> +++ b/arch/arm64/kernel/smp.c
> @@ -43,6 +43,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -962,6 +963,8 @@ static void ipi_setup(int cpu)
>  
>   for (i = 0; i < nr_ipi; i++)
>   enable_percpu_irq(ipi_irq_base + i, 0);
> +
> + ipi_nmi_setup(cpu);
>  }
>  
>  #ifdef CONFIG_HOTPLUG_CPU
> @@ -974,6 +977,8 @@ static void ipi_teardown(int cpu)
>  
>   for (i = 0; i < nr_ipi; i++)
>   disable_percpu_irq(ipi_irq_base + i);
> +
> + ipi_nmi_teardown(cpu);
>  }
>  #endif
>  
> @@ -995,6 +1000,9 @@ void __init set_smp_ipi_range(int ipi_base, int n)
>   irq_set_status_flags(ipi_base + i, IRQ_HIDDEN);
>   }
>  
> + if (n > nr_ipi)
> + set_smp_ipi_nmi(ipi_base + nr_ipi);
> +
>   ipi_irq_base = ipi_base;
>  
>   /* Setup the boot CPU immediately */
> -- 

Looks good to me. Please feel free to add:

Reviewed-by: Masayoshi Mizuma 

Thanks!
Masa



Re: [PATCH v4 1/5] arm64: Add framework to turn IPI as NMI

2020-10-12 Thread Masayoshi Mizuma
On Mon, Oct 12, 2020 at 05:49:09PM +0530, Sumit Garg wrote:
> Hi Masa,
> 
> On Sat, 10 Oct 2020 at 20:43, Masayoshi Mizuma  wrote:
> >
> > On Sat, Oct 10, 2020 at 10:34:04AM +0100, Marc Zyngier wrote:
> > > On Sat, 10 Oct 2020 02:58:55 +0100,
> > > Masayoshi Mizuma  wrote:
> > >
> > > [...]
> > >
> > > > > +void ipi_nmi_setup(int cpu)
> > > > > +{
> > > > > + if (!ipi_desc)
> > > > > + return;
> > > >
> > > > ipi_nmi_setup() may be called twice for CPU0:
> > > >
> > > >   set_smp_ipi_range => set_smp_ipi_nmi => ipi_nmi_setup
> > > > => ipi_setup => ipi_nmi_setup
> > > >
> > > > Actually, I got the following error message via the second 
> > > > ipi_nmi_setup():
> > > >
> > > >   GICv3: Pseudo-NMIs enabled using relaxed ICC_PMR_EL1 synchronisation
> > > >   GICv3: Cannot set NMI property of enabled IRQ 8
> > > >   genirq: Failed to setup NMI delivery: irq 8
> > > >
> 
> Ah, thanks for catching this which I missed during my testing.
> 
> > > > Why don't we have a check to prevent that? Like as:
> > > >
> > > >if (cpumask_test_cpu(cpu, ipi_desc->percpu_enabled))
> > > >return;
> > >
> > > That's definitely the wrong thing to do. prepare_nmi_setup() shouldn't
> > > be called twice, and papering over it isn't acceptable.
> >
> > Got it. How about moving ipi_nmi_setup() from ipi_setup() to
> > secondary_start_kernel() ? so that CPU0 can call ipi_nmi_setup() only
> > from set_smp_ipi_nmi().
> >
> > --- a/arch/arm64/kernel/smp.c
> > +++ b/arch/arm64/kernel/smp.c
> > @@ -245,6 +245,7 @@ asmlinkage notrace void secondary_start_kernel(void)
> > notify_cpu_starting(cpu);
> >
> > ipi_setup(cpu);
> > +   ipi_nmi_setup(cpu);
> >
> > store_cpu_topology(cpu);
> > numa_add_cpu(cpu);
> > @@ -966,8 +967,6 @@ static void ipi_setup(int cpu)
> >
> > for (i = 0; i < nr_ipi; i++)
> > enable_percpu_irq(ipi_irq_base + i, 0);
> > -
> > -   ipi_nmi_setup(cpu);
> >  }
> >
> >  #ifdef CONFIG_HOTPLUG_CPU
> >
> 
> IMO, it would be more consistent to keep invocation of ipi_nmi_setup()
> from ipi_setup(). So let me remove other invocation from
> set_smp_ipi_nmi():
> 
> diff --git a/arch/arm64/kernel/ipi_nmi.c b/arch/arm64/kernel/ipi_nmi.c
> index d3aa430..000e457 100644
> --- a/arch/arm64/kernel/ipi_nmi.c
> +++ b/arch/arm64/kernel/ipi_nmi.c
> @@ -87,7 +87,4 @@ void __init set_smp_ipi_nmi(int ipi)
> ipi_desc = irq_to_desc(ipi);
> irq_set_status_flags(ipi, IRQ_HIDDEN);
> ipi_id = ipi;
> -
> -   /* Setup the boot CPU immediately */
> -   ipi_nmi_setup(smp_processor_id());
>  }
> 
> Do let me know if this works for you?

Yes, make sense to me.

Thanks!
Masa


Re: [PATCH v4 1/5] arm64: Add framework to turn IPI as NMI

2020-10-10 Thread Masayoshi Mizuma
On Sat, Oct 10, 2020 at 10:34:04AM +0100, Marc Zyngier wrote:
> On Sat, 10 Oct 2020 02:58:55 +0100,
> Masayoshi Mizuma  wrote:
> 
> [...]
> 
> > > +void ipi_nmi_setup(int cpu)
> > > +{
> > > + if (!ipi_desc)
> > > + return;
> > 
> > ipi_nmi_setup() may be called twice for CPU0:
> > 
> >   set_smp_ipi_range => set_smp_ipi_nmi => ipi_nmi_setup
> > => ipi_setup => ipi_nmi_setup
> > 
> > Actually, I got the following error message via the second ipi_nmi_setup():
> > 
> >   GICv3: Pseudo-NMIs enabled using relaxed ICC_PMR_EL1 synchronisation
> >   GICv3: Cannot set NMI property of enabled IRQ 8
> >   genirq: Failed to setup NMI delivery: irq 8
> > 
> > Why don't we have a check to prevent that? Like as:
> > 
> >if (cpumask_test_cpu(cpu, ipi_desc->percpu_enabled))
> >return;
> 
> That's definitely the wrong thing to do. prepare_nmi_setup() shouldn't
> be called twice, and papering over it isn't acceptable.

Got it. How about moving ipi_nmi_setup() from ipi_setup() to
secondary_start_kernel() ? so that CPU0 can call ipi_nmi_setup() only
from set_smp_ipi_nmi().

--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -245,6 +245,7 @@ asmlinkage notrace void secondary_start_kernel(void)
notify_cpu_starting(cpu);
 
ipi_setup(cpu);
+   ipi_nmi_setup(cpu);
 
store_cpu_topology(cpu);
numa_add_cpu(cpu);
@@ -966,8 +967,6 @@ static void ipi_setup(int cpu)
 
for (i = 0; i < nr_ipi; i++)
enable_percpu_irq(ipi_irq_base + i, 0);
-
-   ipi_nmi_setup(cpu);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU

Thanks,
Masa


Re: [PATCH v4 1/5] arm64: Add framework to turn IPI as NMI

2020-10-09 Thread Masayoshi Mizuma
Hi Sumit,

On Fri, Sep 11, 2020 at 06:58:40PM +0530, Sumit Garg wrote:
> Introduce framework to turn an IPI as NMI using pseudo NMIs. In case a
> particular platform doesn't support pseudo NMIs, then request IPI as a
> regular IRQ.
> 
> The main motivation for this feature is to have an IPI that can be
> leveraged to invoke NMI functions on other CPUs. And current prospective
> users are NMI backtrace and KGDB CPUs round-up whose support is added
> via future patches.
> 
> Signed-off-by: Sumit Garg 
> ---
>  arch/arm64/include/asm/nmi.h | 16 +
>  arch/arm64/kernel/Makefile   |  2 +-
>  arch/arm64/kernel/ipi_nmi.c  | 80 
> 
>  3 files changed, 97 insertions(+), 1 deletion(-)
>  create mode 100644 arch/arm64/include/asm/nmi.h
>  create mode 100644 arch/arm64/kernel/ipi_nmi.c
> 
> diff --git a/arch/arm64/include/asm/nmi.h b/arch/arm64/include/asm/nmi.h
> new file mode 100644
> index 000..3433c55
> --- /dev/null
> +++ b/arch/arm64/include/asm/nmi.h
> @@ -0,0 +1,16 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __ASM_NMI_H
> +#define __ASM_NMI_H
> +
> +#ifndef __ASSEMBLER__
> +
> +#include 
> +
> +extern void arch_send_call_nmi_func_ipi_mask(cpumask_t *mask);
> +
> +void set_smp_ipi_nmi(int ipi);
> +void ipi_nmi_setup(int cpu);
> +void ipi_nmi_teardown(int cpu);
> +
> +#endif /* !__ASSEMBLER__ */
> +#endif
> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
> index a561cbb..022c26b 100644
> --- a/arch/arm64/kernel/Makefile
> +++ b/arch/arm64/kernel/Makefile
> @@ -19,7 +19,7 @@ obj-y   := debug-monitors.o entry.o 
> irq.o fpsimd.o  \
>  return_address.o cpuinfo.o cpu_errata.o  
> \
>  cpufeature.o alternative.o cacheinfo.o   
> \
>  smp.o smp_spin_table.o topology.o smccc-call.o   
> \
> -syscall.o
> +syscall.o ipi_nmi.o
>  
>  targets  += efi-entry.o
>  
> diff --git a/arch/arm64/kernel/ipi_nmi.c b/arch/arm64/kernel/ipi_nmi.c
> new file mode 100644
> index 000..355ef92
> --- /dev/null
> +++ b/arch/arm64/kernel/ipi_nmi.c
> @@ -0,0 +1,80 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * NMI support for IPIs
> + *
> + * Copyright (C) 2020 Linaro Limited
> + * Author: Sumit Garg 
> + */
> +
> +#include 
> +#include 
> +#include 
> +
> +#include 
> +
> +static struct irq_desc *ipi_desc __read_mostly;
> +static int ipi_id __read_mostly;
> +static bool is_nmi __read_mostly;
> +
> +void arch_send_call_nmi_func_ipi_mask(cpumask_t *mask)
> +{
> + if (WARN_ON_ONCE(!ipi_desc))
> + return;
> +
> + __ipi_send_mask(ipi_desc, mask);
> +}
> +
> +static irqreturn_t ipi_nmi_handler(int irq, void *data)
> +{
> + /* nop, NMI handlers for special features can be added here. */
> +
> + return IRQ_HANDLED;
> +}
> +
> +void ipi_nmi_setup(int cpu)
> +{
> + if (!ipi_desc)
> + return;

ipi_nmi_setup() may be called twice for CPU0:

  set_smp_ipi_range => set_smp_ipi_nmi => ipi_nmi_setup
=> ipi_setup => ipi_nmi_setup

Actually, I got the following error message via the second ipi_nmi_setup():

  GICv3: Pseudo-NMIs enabled using relaxed ICC_PMR_EL1 synchronisation
  GICv3: Cannot set NMI property of enabled IRQ 8
  genirq: Failed to setup NMI delivery: irq 8

Why don't we have a check to prevent that? Like as:

   if (cpumask_test_cpu(cpu, ipi_desc->percpu_enabled))
   return;

> +
> + if (is_nmi) {
> + if (!prepare_percpu_nmi(ipi_id))
> + enable_percpu_nmi(ipi_id, 0);

It would be better to use IRQ_TYPE_NONE instead of 0.

enable_percpu_nmi(ipi_id, IRQ_TYPE_NONE);

> + } else {
> + enable_percpu_irq(ipi_id, 0);

Same as here:
enable_percpu_irq(ipi_id, IRQ_TYPE_NONE);

Thanks,
Masa

> + }
> +}
> +
> +void ipi_nmi_teardown(int cpu)
> +{
> + if (!ipi_desc)
> + return;
> +
> + if (is_nmi) {
> + disable_percpu_nmi(ipi_id);
> + teardown_percpu_nmi(ipi_id);
> + } else {
> + disable_percpu_irq(ipi_id);
> + }
> +}
> +
> +void __init set_smp_ipi_nmi(int ipi)
> +{
> + int err;
> +
> + err = request_percpu_nmi(ipi, ipi_nmi_handler, "IPI", _number);
> + if (err) {
> + err = request_percpu_irq(ipi, ipi_nmi_handler, "IPI",
> +  _number);
> + WARN_ON(err);
> + is_nmi = false;
> + } else {
> + is_nmi = true;
> + }
> +
> + ipi_desc = irq_to_desc(ipi);
> + irq_set_status_flags(ipi, IRQ_HIDDEN);
> + ipi_id = ipi;
> +
> + /* Setup the boot CPU immediately */
> + ipi_nmi_setup(smp_processor_id());
> +}
> -- 
> 2.7.4
> 
> 
> ___
> linux-arm-kernel mailing list
> 

[PATCH v3] arm64/sve: Fix wrong free for task->thread.sve_state

2019-09-30 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

The system which has the SVE feature crashed because
the memory pointed to by task->thread.sve_state was destroyed
by someone.

That is because sve_state is freed while forking the
child process. The child process has the pointer of sve_state
which is same as the parent's because the child's task_struct
is copied from the parent's one. If the copy_process()
fails as an error on somewhere, for example, copy_creds(),
then the sve_state is freed even if the parent is alive.
The flow is as follows.

copy_process
p = dup_task_struct
=> arch_dup_task_struct
*dst = *src;  // copy the entire region.
:
retval = copy_creds
if (retval < 0)
goto bad_fork_free;
:
bad_fork_free:
...
delayed_free_task(p);
  => free_task
 => arch_release_task_struct
=> fpsimd_release_task
   => __sve_free
  => kfree(task->thread.sve_state);
 // free the parent's sve_state

Move child's sve_state = NULL and clearing TIF_SVE flag
to arch_dup_task_struct() so that the child doesn't free the
parent's one.
There is no need to wait until copy_process() to clear TIF_SVE for
dst, because the thread flags for dst are initialized already by
copying the src task_struct.
This change simplifies the code, so get rid of comments that are no
longer needed.

As a note, arm64 used to have thread_info on the stack. So it
would not be possible to clear TIF_SVE until the stack is initialized.
From commit c02433dd6de3 ("arm64: split thread_info from task stack"),
the thread_info is part of the task, so it should be valid to modify
the flag from arch_dup_task_struct().

Cc: sta...@vger.kernel.org # 4.15.x-
Fixes: bc0ee4760364 ("arm64/sve: Core task context handling")
Signed-off-by: Masayoshi Mizuma 
Reported-by: Hidetoshi Seto 
Suggested-by: Dave Martin 
Tested-by: Julien Grall 
---
 arch/arm64/kernel/process.c | 32 +++-
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index a47462def04b..ef7aa909bfda 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -332,22 +332,27 @@ void arch_release_task_struct(struct task_struct *tsk)
fpsimd_release_task(tsk);
 }
 
-/*
- * src and dst may temporarily have aliased sve_state after task_struct
- * is copied.  We cannot fix this properly here, because src may have
- * live SVE state and dst's thread_info may not exist yet, so tweaking
- * either src's or dst's TIF_SVE is not safe.
- *
- * The unaliasing is done in copy_thread() instead.  This works because
- * dst is not schedulable or traceable until both of these functions
- * have been called.
- */
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
if (current->mm)
fpsimd_preserve_current_state();
*dst = *src;
 
+   /* We rely on the above assignment to initialize dst's thread_flags: */
+   BUILD_BUG_ON(!IS_ENABLED(CONFIG_THREAD_INFO_IN_TASK));
+
+   /*
+* Detach src's sve_state (if any) from dst so that it does not
+* get erroneously used or freed prematurely.  dst's sve_state
+* will be allocated on demand later on if dst uses SVE.
+* For consistency, also clear TIF_SVE here: this could be done
+* later in copy_process(), but to avoid tripping up future
+* maintainers it is best not to leave TIF_SVE and sve_state in
+* an inconsistent state, even temporarily.
+*/
+   dst->thread.sve_state = NULL;
+   clear_tsk_thread_flag(dst, TIF_SVE);
+
return 0;
 }
 
@@ -360,13 +365,6 @@ int copy_thread(unsigned long clone_flags, unsigned long 
stack_start,
 
memset(>thread.cpu_context, 0, sizeof(struct cpu_context));
 
-   /*
-* Unalias p->thread.sve_state (if any) from the parent task
-* and disable discard SVE state for p:
-*/
-   clear_tsk_thread_flag(p, TIF_SVE);
-   p->thread.sve_state = NULL;
-
/*
 * In case p was allocated the same task_struct pointer as some
 * other recently-exited task, make sure p is disassociated from
-- 
2.18.1



Re: [PATCH v2] arm64/sve: Fix wrong free for task->thread.sve_state

2019-09-30 Thread Masayoshi Mizuma
Hi Julien and Dave,

On Mon, Sep 30, 2019 at 02:02:46PM +0100, Dave Martin wrote:
> On Mon, Sep 30, 2019 at 01:23:18PM +0100, Julien Grall wrote:
> > Hi,
> > 
> > On 27/09/2019 16:39, Masayoshi Mizuma wrote:
> > >From: Masayoshi Mizuma 
> > >
> > >The system which has SVE feature crashed because of
> > >the memory pointed by task->thread.sve_state was destroyed
> > >by someone.
> > >
> > >That is because sve_state is freed while the forking the
> > >child process. The child process has the pointer of sve_state
> > >which is same as the parent's because the child's task_struct
> > >is copied from the parent's one. If the copy_process()
> > >fails as an error on somewhere, for example, copy_creds(),
> > >then the sve_state is freed even if the parent is alive.
> > >The flow is as follows.
> > >
> > >copy_process
> > > p = dup_task_struct
> > > => arch_dup_task_struct
> > > *dst = *src;  // copy the entire region.
> > >:
> > > retval = copy_creds
> > > if (retval < 0)
> > > goto bad_fork_free;
> > >:
> > >bad_fork_free:
> > >...
> > > delayed_free_task(p);
> > >   => free_task
> > >  => arch_release_task_struct
> > > => fpsimd_release_task
> > >=> __sve_free
> > >   => kfree(task->thread.sve_state);
> > >  // free the parent's sve_state
> > >
> > >Move child's sve_state = NULL and clearing TIF_SVE flag
> > >to arch_dup_task_struct() so that the child doesn't free the
> > >parent's one.
> > >
> > >Cc: sta...@vger.kernel.org
> > >Fixes: bc0ee4760364 ("arm64/sve: Core task context handling")
> > 
> > Looking at the log, it looks like THREAD_INFO_IN_TASK was selected before
> > the bc0ee4760364. So it should be fine to backport for all the Linux tree
> > contain this commit.

I think this patch is needed for the kernel has SVE support.
I'll add the Cc tag as Dave said:

Cc: sta...@vger.kernel.org # 4.15+

So, I suppose this patch will be backported to stables 5.3.X,
5.2.X and longterm 4.19.X.
Does this make sense?

> > 
> > >Signed-off-by: Masayoshi Mizuma 
> > >Reported-by: Hidetoshi Seto 
> > >Suggested-by: Dave Martin 
> > 
> > I have tested the patch and can confirm that double-free disappeared after
> > the patch is applied:
> > 
> > Tested-by: Julien Grall 

Thank you so much!

> 
> Good to have that confirmed -- thanks for verifying.
> 
> [...]
> 
> > >---
> > >  arch/arm64/kernel/process.c | 21 -
> > >  1 file changed, 4 insertions(+), 17 deletions(-)
> > >
> > >diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
> > >index f674f28df..6937f5935 100644
> > >--- a/arch/arm64/kernel/process.c
> > >+++ b/arch/arm64/kernel/process.c
> > >@@ -323,22 +323,16 @@ void arch_release_task_struct(struct task_struct 
> > >*tsk)
> > >   fpsimd_release_task(tsk);
> > >  }
> > >-/*
> > >- * src and dst may temporarily have aliased sve_state after task_struct
> > >- * is copied.  We cannot fix this properly here, because src may have
> > >- * live SVE state and dst's thread_info may not exist yet, so tweaking
> > >- * either src's or dst's TIF_SVE is not safe.
> > >- *
> > >- * The unaliasing is done in copy_thread() instead.  This works because
> > >- * dst is not schedulable or traceable until both of these functions
> > >- * have been called.
> > >- */
> > 
> > It would be good to explain in the commit message why tweaking "dst" in
> > arch_dup_task_struct() is fine.
> > 
> > From my understanding, Arm64 used to have thread_info on the stack. So it
> > would not be possible to clear TIF_SVE until the stack is initialized.
> > 
> > Now that the thread_info is part of the task, it should be valid to modify
> > the flag from arch_dup_task_struct().
> > 
> > Note that technically, TIF_SVE does not need to be cleared from
> > arch_dup_task_struct(). It could also be done from copy_thread(). But it is
> > easier to keep the both changes together.

Thanks, let me add some comments to the commit log.

> > 
> > >  int arch_dup_task_struct(struct task_struct *dst, struct task_struct 
> > > *src)
>

[PATCH v2] arm64/sve: Fix wrong free for task->thread.sve_state

2019-09-27 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

The system which has SVE feature crashed because of
the memory pointed by task->thread.sve_state was destroyed
by someone.

That is because sve_state is freed while the forking the
child process. The child process has the pointer of sve_state
which is same as the parent's because the child's task_struct
is copied from the parent's one. If the copy_process()
fails as an error on somewhere, for example, copy_creds(),
then the sve_state is freed even if the parent is alive.
The flow is as follows.

copy_process
p = dup_task_struct
=> arch_dup_task_struct
*dst = *src;  // copy the entire region.
:
retval = copy_creds
if (retval < 0)
goto bad_fork_free;
:
bad_fork_free:
...
delayed_free_task(p);
  => free_task
 => arch_release_task_struct
=> fpsimd_release_task
   => __sve_free
  => kfree(task->thread.sve_state);
 // free the parent's sve_state

Move child's sve_state = NULL and clearing TIF_SVE flag
to arch_dup_task_struct() so that the child doesn't free the
parent's one.

Cc: sta...@vger.kernel.org
Fixes: bc0ee4760364 ("arm64/sve: Core task context handling")
Signed-off-by: Masayoshi Mizuma 
Reported-by: Hidetoshi Seto 
Suggested-by: Dave Martin 
---
 arch/arm64/kernel/process.c | 21 -
 1 file changed, 4 insertions(+), 17 deletions(-)

diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index f674f28df..6937f5935 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -323,22 +323,16 @@ void arch_release_task_struct(struct task_struct *tsk)
fpsimd_release_task(tsk);
 }
 
-/*
- * src and dst may temporarily have aliased sve_state after task_struct
- * is copied.  We cannot fix this properly here, because src may have
- * live SVE state and dst's thread_info may not exist yet, so tweaking
- * either src's or dst's TIF_SVE is not safe.
- *
- * The unaliasing is done in copy_thread() instead.  This works because
- * dst is not schedulable or traceable until both of these functions
- * have been called.
- */
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
if (current->mm)
fpsimd_preserve_current_state();
*dst = *src;
 
+   BUILD_BUG_ON(!IS_ENABLED(CONFIG_THREAD_INFO_IN_TASK));
+   dst->thread.sve_state = NULL;
+   clear_tsk_thread_flag(dst, TIF_SVE);
+
return 0;
 }
 
@@ -351,13 +345,6 @@ int copy_thread(unsigned long clone_flags, unsigned long 
stack_start,
 
memset(>thread.cpu_context, 0, sizeof(struct cpu_context));
 
-   /*
-* Unalias p->thread.sve_state (if any) from the parent task
-* and disable discard SVE state for p:
-*/
-   clear_tsk_thread_flag(p, TIF_SVE);
-   p->thread.sve_state = NULL;
-
/*
 * In case p was allocated the same task_struct pointer as some
 * other recently-exited task, make sure p is disassociated from
-- 
2.18.1



Re: [PATCH 1/1] arm64/sve: Fix wrong free for task->thread.sve_state

2019-09-27 Thread Masayoshi Mizuma
Hi Julien and Dave,

Thank you for your comments!
Dave's suggestion looks good for me, many thanks!
I'll post it as v2.

- Masa

On Fri, Sep 27, 2019 at 01:52:30PM +0100, Dave Martin wrote:
> On Thu, Sep 26, 2019 at 03:08:46PM -0400, Masayoshi Mizuma wrote:
> > From: Masayoshi Mizuma 
> > 
> > The system which has SVE feature crashed because of
> > the memory pointed by task->thread.sve_state was destroyed
> > by someone.
> > 
> > That is because sve_state is freed while the forking the
> > child process. The child process has the pointer of sve_state
> > which is same as the parent's because the child's task_struct
> > is copied from the parent's one. If the copy_process()
> > fails as an error on somewhere, for example, copy_creds(),
> > then the sve_state is freed even if the parent is alive.
> > The flow is as follows.
> > 
> > copy_process
> > p = dup_task_struct
> > => arch_dup_task_struct
> > *dst = *src;  // copy the entire region.
> > :
> > retval = copy_creds
> > if (retval < 0)
> > goto bad_fork_free;
> > :
> > bad_fork_free:
> > ...
> > delayed_free_task(p);
> >   => free_task
> >  => arch_release_task_struct
> > => fpsimd_release_task
> >=> __sve_free
> >   => kfree(task->thread.sve_state);
> >  // free the parent's sve_state
> > 
> > Add a flag in task->thread which shows the fork is in progress.
> > If the fork is in progress, that means the child has the pointer
> > to the parent's sve_state, doesn't free the sve_state.
> > 
> > Signed-off-by: Masayoshi Mizuma 
> > Reported-by: Hidetoshi Seto 
> > ---
> >  arch/arm64/include/asm/processor.h | 1 +
> >  arch/arm64/kernel/fpsimd.c | 6 --
> >  arch/arm64/kernel/process.c| 2 ++
> >  3 files changed, 7 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/arm64/include/asm/processor.h 
> > b/arch/arm64/include/asm/processor.h
> > index 5623685c7d13..3ca3e350145a 100644
> > --- a/arch/arm64/include/asm/processor.h
> > +++ b/arch/arm64/include/asm/processor.h
> > @@ -143,6 +143,7 @@ struct thread_struct {
> > unsigned long   fault_address;  /* fault info */
> > unsigned long   fault_code; /* ESR_EL1 value */
> > struct debug_info   debug;  /* debugging */
> > +   unsigned intfork_in_progress;
> >  #ifdef CONFIG_ARM64_PTR_AUTH
> > struct ptrauth_keys keys_user;
> >  #endif
> > diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
> > index 37d3912cfe06..8641db4cb062 100644
> > --- a/arch/arm64/kernel/fpsimd.c
> > +++ b/arch/arm64/kernel/fpsimd.c
> > @@ -202,8 +202,10 @@ static bool have_cpu_fpsimd_context(void)
> >   */
> >  static void __sve_free(struct task_struct *task)
> >  {
> > -   kfree(task->thread.sve_state);
> > -   task->thread.sve_state = NULL;
> > +   if (!task->thread.fork_in_progress) {
> > +   kfree(task->thread.sve_state);
> > +   task->thread.sve_state = NULL;
> > +   }
> >  }
> >  
> >  static void sve_free(struct task_struct *task)
> > diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
> > index a47462def04b..8ac0ee4e5f76 100644
> > --- a/arch/arm64/kernel/process.c
> > +++ b/arch/arm64/kernel/process.c
> > @@ -347,6 +347,7 @@ int arch_dup_task_struct(struct task_struct *dst, 
> > struct task_struct *src)
> > if (current->mm)
> > fpsimd_preserve_current_state();
> > *dst = *src;
> > +   dst->thread.fork_in_progress = 1;
> >  
> > return 0;
> >  }
> > @@ -365,6 +366,7 @@ int copy_thread(unsigned long clone_flags, unsigned 
> > long stack_start,
> >  * and disable discard SVE state for p:
> >  */
> > clear_tsk_thread_flag(p, TIF_SVE);
> > +   p->thread.fork_in_progress = 0;
> > p->thread.sve_state = NULL;
> 
> There's definitely a problem here, but a simpler fix is probably to
> NULL sve_state and clear TIF_SVE for dst at the same time.
> 
> Once upon a time, I had to cope with the thread_flags not being copied
> as part of task_struct here, which is one reason why the code is the
> (broken) way it is, but this is ancient history...
> 
> Commit c02433dd6de3 ("arm64: split thread_info from task stack") was
> merged in v4.10 a

[PATCH 0/1] arm64/sve: Fix wrong free for task->thread.sve_state

2019-09-26 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

The system which has SVE feature crashed for an unknown
reason. According to the memory dump, the panic happened
because the memory data pointed by task->thread.sve_state was
destroyed by someone.

We tried to reproduce the crash, however, it's hard to do that.
But, we found a potential issue by reviewing the code.

In copy_process(), the child process has the pointer of sve_state
which is same as the parent's because the child's task_struct is
copied from the parent's one. If the copy_process() fails as an
error on somewhere, for example, copy_creds(), then the sve_state
is freed even if the parent is alive. The flow is as follows.

copy_process
p = dup_task_struct
=> arch_dup_task_struct
*dst = *src;  // copy the entire region.
:
retval = copy_creds
if (retval < 0)
goto bad_fork_free;
:
bad_fork_free:
...
delayed_free_task(p);
=> free_task
   => arch_release_task_struct
  => fpsimd_release_task
 => __sve_free
=> kfree(task->thread.sve_state);
   // free the parent's sve_state

To fix that, add a flag in task->thread which shows the fork
is in progress. If the fork is in progress, that means the
child has the pointer to the parent's sve_state, doesn't
free the sve_state.

Masayoshi Mizuma (1):
  arm64/sve: Fix wrong free for task->thread.sve_state

 arch/arm64/include/asm/processor.h | 1 +
 arch/arm64/kernel/fpsimd.c | 6 --
 arch/arm64/kernel/process.c| 2 ++
 3 files changed, 7 insertions(+), 2 deletions(-)

-- 
2.18.1



[PATCH 1/1] arm64/sve: Fix wrong free for task->thread.sve_state

2019-09-26 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

The system which has SVE feature crashed because of
the memory pointed by task->thread.sve_state was destroyed
by someone.

That is because sve_state is freed while the forking the
child process. The child process has the pointer of sve_state
which is same as the parent's because the child's task_struct
is copied from the parent's one. If the copy_process()
fails as an error on somewhere, for example, copy_creds(),
then the sve_state is freed even if the parent is alive.
The flow is as follows.

copy_process
p = dup_task_struct
=> arch_dup_task_struct
*dst = *src;  // copy the entire region.
:
retval = copy_creds
if (retval < 0)
goto bad_fork_free;
:
bad_fork_free:
...
delayed_free_task(p);
  => free_task
 => arch_release_task_struct
=> fpsimd_release_task
   => __sve_free
  => kfree(task->thread.sve_state);
 // free the parent's sve_state

Add a flag in task->thread which shows the fork is in progress.
If the fork is in progress, that means the child has the pointer
to the parent's sve_state, doesn't free the sve_state.

Signed-off-by: Masayoshi Mizuma 
Reported-by: Hidetoshi Seto 
---
 arch/arm64/include/asm/processor.h | 1 +
 arch/arm64/kernel/fpsimd.c | 6 --
 arch/arm64/kernel/process.c| 2 ++
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/processor.h 
b/arch/arm64/include/asm/processor.h
index 5623685c7d13..3ca3e350145a 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -143,6 +143,7 @@ struct thread_struct {
unsigned long   fault_address;  /* fault info */
unsigned long   fault_code; /* ESR_EL1 value */
struct debug_info   debug;  /* debugging */
+   unsigned intfork_in_progress;
 #ifdef CONFIG_ARM64_PTR_AUTH
struct ptrauth_keys keys_user;
 #endif
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 37d3912cfe06..8641db4cb062 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -202,8 +202,10 @@ static bool have_cpu_fpsimd_context(void)
  */
 static void __sve_free(struct task_struct *task)
 {
-   kfree(task->thread.sve_state);
-   task->thread.sve_state = NULL;
+   if (!task->thread.fork_in_progress) {
+   kfree(task->thread.sve_state);
+   task->thread.sve_state = NULL;
+   }
 }
 
 static void sve_free(struct task_struct *task)
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index a47462def04b..8ac0ee4e5f76 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -347,6 +347,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct 
task_struct *src)
if (current->mm)
fpsimd_preserve_current_state();
*dst = *src;
+   dst->thread.fork_in_progress = 1;
 
return 0;
 }
@@ -365,6 +366,7 @@ int copy_thread(unsigned long clone_flags, unsigned long 
stack_start,
 * and disable discard SVE state for p:
 */
clear_tsk_thread_flag(p, TIF_SVE);
+   p->thread.fork_in_progress = 0;
p->thread.sve_state = NULL;
 
/*
-- 
2.18.1



[PATCH v3 4/5] x86/mm/KASLR: Cleanup calculation for direct mapping size

2019-08-30 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

Cleanup calculation for direct mapping size.

Signed-off-by: Baoquan He 
Signed-off-by: Masayoshi Mizuma 
---
 arch/x86/mm/kaslr.c | 50 +++--
 1 file changed, 35 insertions(+), 15 deletions(-)

diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index dc6182eec..8e5f3642e 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -70,15 +70,45 @@ static inline bool kaslr_memory_enabled(void)
return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN);
 }
 
+/*
+ * Even though a huge virtual address space is reserved for the direct
+ * mapping of physical memory, e.g in 4-level paging mode, it's 64TB,
+ * rare system can own enough physical memory to use it up, most are
+ * even less than 1TB. So with KASLR enabled, we adapt the size of
+ * direct mapping area to the size of actual physical memory plus the
+ * configured padding CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING.
+ * The left part will be taken out to join memory randomization.
+ */
+static inline unsigned long calc_direct_mapping_size(void)
+{
+   unsigned long size_tb, memory_tb;
+
+   /*
+* Update Physical memory mapping to available and
+* add padding if needed (especially for memory hotplug support).
+*/
+   memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
+   CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
+
+   size_tb = 1 << (MAX_PHYSMEM_BITS - TB_SHIFT);
+
+   /*
+* Adapt physical memory region size based on available memory
+*/
+   if (memory_tb < size_tb)
+   size_tb = memory_tb;
+
+   return size_tb;
+}
+
 /* Initialize base and padding for each memory region randomized with KASLR */
 void __init kernel_randomize_memory(void)
 {
-   size_t i;
-   unsigned long vaddr_start, vaddr;
-   unsigned long rand, memory_tb;
-   struct rnd_state rand_state;
+   unsigned long vaddr_start, vaddr, rand;
unsigned long remain_entropy;
unsigned long vmemmap_size;
+   struct rnd_state rand_state;
+   size_t i;
 
vaddr_start = pgtable_l5_enabled() ? __PAGE_OFFSET_BASE_L5 : 
__PAGE_OFFSET_BASE_L4;
vaddr = vaddr_start;
@@ -95,20 +125,10 @@ void __init kernel_randomize_memory(void)
if (!kaslr_memory_enabled())
return;
 
-   kaslr_regions[0].size_tb = 1 << (MAX_PHYSMEM_BITS - TB_SHIFT);
+   kaslr_regions[0].size_tb = calc_direct_mapping_size();
kaslr_regions[1].size_tb = VMALLOC_SIZE_TB;
 
-   /*
-* Update Physical memory mapping to available and
-* add padding if needed (especially for memory hotplug support).
-*/
BUG_ON(kaslr_regions[0].base != _offset_base);
-   memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
-   CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
-
-   /* Adapt phyiscal memory region size based on available memory */
-   if (memory_tb < kaslr_regions[0].size_tb)
-   kaslr_regions[0].size_tb = memory_tb;
 
/*
 * Calculate the vmemmap region size in TBs, aligned to a TB
-- 
2.18.1



[PATCH v3 0/5] Adjust the padding size for KASLR

2019-08-30 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

The system sometimes crashes while memory hot-adding on KASLR
enabled system. The crash happens because the regions pointed by
kaslr_regions[].base are overwritten by the hot-added memory.

It happens because of the padding size for kaslr_regions[].base isn't
enough for the system whose physical memory layout has huge space for
memory hotplug. kaslr_regions[].base points "actual installed
memory size + padding" or higher address. So, if the "actual + padding"
is lower address than the maximum memory address, which means the memory
address reachable by memory hot-add, kaslr_regions[].base is destroyed by
the overwrite.

  address
^
|--- maximum memory address (Hotplug)
|^
|--- kaslr_regions[0].base   | Hotadd-able region
| ^  |
| | padding  |
| V  V
|--- actual memory address (Installed on boot)
|

Fix it by getting the maximum memory address from SRAT and store
the value in boot_param, then set the padding size while KASLR
initializing if the default padding size isn't enough.

Masayoshi Mizuma (5):
  x86/boot: Wrap up the SRAT traversing code into subtable_parse()
  x86/boot: Add max_addr field in struct boot_params
  x86/boot: Get the max address from SRAT
  x86/mm/KASLR: Cleanup calculation for direct mapping size
  x86/mm/KASLR: Adjust the padding size for the direct mapping.

 Documentation/x86/zero-page.rst   |  4 ++
 arch/x86/boot/compressed/acpi.c   | 33 +---
 arch/x86/include/uapi/asm/bootparam.h |  2 +-
 arch/x86/mm/kaslr.c   | 77 +--
 4 files changed, 93 insertions(+), 23 deletions(-)

-- 
2.18.1



[PATCH v3 5/5] x86/mm/KASLR: Adjust the padding size for the direct mapping.

2019-08-30 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

The system sometimes crashes while memory hot-adding on KASLR
enabled system. The crash happens because the regions pointed by
kaslr_regions[].base are overwritten by the hot-added memory.

It happens because of the padding size for kaslr_regions[].base isn't
enough for the system whose physical memory layout has huge space for
memory hotplug. kaslr_regions[].base points "actual installed
memory size + padding" or higher address. So, if the "actual + padding"
is lower address than the maximum memory address, which means the memory
address reachable by memory hot-add, kaslr_regions[].base is destroyed by
the overwrite.

  address
^
|--- maximum memory address (Hotplug)
|^
|--- kaslr_regions[0].base   | Hotadd-able region
| ^  |
| | padding  |
| V  V
|--- actual memory address (Installed on boot)
|

Fix it by getting the maximum memory address from SRAT and store
the value in boot_param, then set the padding size while KASLR
initializing if the default padding size isn't enough.

Signed-off-by: Masayoshi Mizuma 
---
 arch/x86/mm/kaslr.c | 31 +--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 8e5f3642e..a78844c57 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -70,6 +70,34 @@ static inline bool kaslr_memory_enabled(void)
return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN);
 }
 
+static inline unsigned long phys_memmap_size(void)
+{
+   unsigned long padding = CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
+#ifdef CONFIG_MEMORY_HOTPLUG
+   unsigned long actual, maximum, base;
+
+   if (!boot_params.max_addr)
+   goto out;
+
+   /*
+* The padding size should set to get for kaslr_regions[].base
+* bigger address than the maximum memory address the system can
+* have. kaslr_regions[].base points "actual size + padding" or
+* higher address. If "actual size + padding" points the lower
+* address than the maximum memory size, fix the padding size.
+*/
+   actual = roundup(PFN_PHYS(max_pfn), 1UL << TB_SHIFT);
+   maximum = roundup(boot_params.max_addr, 1UL << TB_SHIFT);
+   base = actual + (padding << TB_SHIFT);
+
+   if (maximum > base)
+   padding = (maximum - actual) >> TB_SHIFT;
+out:
+#endif
+   return DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
+   padding;
+}
+
 /*
  * Even though a huge virtual address space is reserved for the direct
  * mapping of physical memory, e.g in 4-level paging mode, it's 64TB,
@@ -87,8 +115,7 @@ static inline unsigned long calc_direct_mapping_size(void)
 * Update Physical memory mapping to available and
 * add padding if needed (especially for memory hotplug support).
 */
-   memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
-   CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
+   memory_tb = phys_memmap_size();
 
size_tb = 1 << (MAX_PHYSMEM_BITS - TB_SHIFT);
 
-- 
2.18.1



[PATCH v3 2/5] x86/boot: Add max_addr field in struct boot_params

2019-08-30 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

Add max_addr field in struct boot_params. max_addr shows the
maximum memory address to be reachable by memory hot-add.
max_addr is set by parsing ACPI SRAT.

Signed-off-by: Masayoshi Mizuma 
---
 Documentation/x86/zero-page.rst   | 4 
 arch/x86/include/uapi/asm/bootparam.h | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/Documentation/x86/zero-page.rst b/Documentation/x86/zero-page.rst
index f088f5881..cc3938d68 100644
--- a/Documentation/x86/zero-page.rst
+++ b/Documentation/x86/zero-page.rst
@@ -19,6 +19,7 @@ Offset/Size   Proto   NameMeaning
 058/008ALL tboot_addr  Physical address of 
tboot shared page
 060/010ALL ist_infoIntel SpeedStep (IST) 
BIOS support information
(struct ist_info)
+078/010ALL max_addrThe possible maximum 
physical memory address [1]_
 080/010ALL hd0_infohd0 disk parameter, 
OBSOLETE!!
 090/010ALL hd1_infohd1 disk parameter, 
OBSOLETE!!
 0A0/010ALL sys_desc_table  System description 
table (struct sys_desc_table),
@@ -43,3 +44,6 @@ Offset/Size   Proto   NameMeaning
(array of struct e820_entry)
 D00/1ECALL eddbuf  EDD data (array of 
struct edd_info)
 ====   === 
=
+
+.. [1] max_addr shows the maximum memory address to be reachable by memory
+   hot-add. max_addr is set by parsing ACPI SRAT.
diff --git a/arch/x86/include/uapi/asm/bootparam.h 
b/arch/x86/include/uapi/asm/bootparam.h
index c895df548..6efad338b 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -158,7 +158,7 @@ struct boot_params {
__u64  tboot_addr;  /* 0x058 */
struct ist_info ist_info;   /* 0x060 */
__u64 acpi_rsdp_addr;   /* 0x070 */
-   __u8  _pad3[8]; /* 0x078 */
+   __u64 max_addr; /* 0x078 */
__u8  hd0_info[16]; /* obsolete! */ /* 0x080 */
__u8  hd1_info[16]; /* obsolete! */ /* 0x090 */
struct sys_desc_table sys_desc_table; /* obsolete! */   /* 0x0a0 */
-- 
2.18.1



[PATCH v3 1/5] x86/boot: Wrap up the SRAT traversing code into subtable_parse()

2019-08-30 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

Wrap up the SRAT traversing code into subtable_parse().

Signed-off-by: Masayoshi Mizuma 
---
 arch/x86/boot/compressed/acpi.c | 21 ++---
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
index 149795c36..908a1bfab 100644
--- a/arch/x86/boot/compressed/acpi.c
+++ b/arch/x86/boot/compressed/acpi.c
@@ -362,6 +362,19 @@ static unsigned long get_acpi_srat_table(void)
return 0;
 }
 
+static void subtable_parse(struct acpi_subtable_header *sub_table, int *num)
+{
+   struct acpi_srat_mem_affinity *ma;
+
+   ma = (struct acpi_srat_mem_affinity *)sub_table;
+
+   if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && ma->length) {
+   immovable_mem[*num].start = ma->base_address;
+   immovable_mem[*num].size = ma->length;
+   (*num)++;
+   }
+}
+
 /**
  * count_immovable_mem_regions - Parse SRAT and cache the immovable
  * memory regions into the immovable_mem array.
@@ -395,14 +408,8 @@ int count_immovable_mem_regions(void)
while (table + sizeof(struct acpi_subtable_header) < table_end) {
sub_table = (struct acpi_subtable_header *)table;
if (sub_table->type == ACPI_SRAT_TYPE_MEMORY_AFFINITY) {
-   struct acpi_srat_mem_affinity *ma;
 
-   ma = (struct acpi_srat_mem_affinity *)sub_table;
-   if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && 
ma->length) {
-   immovable_mem[num].start = ma->base_address;
-   immovable_mem[num].size = ma->length;
-   num++;
-   }
+   subtable_parse(sub_table, );
 
if (num >= MAX_NUMNODES*2) {
debug_putstr("Too many immovable memory 
regions, aborting.\n");
-- 
2.18.1



[PATCH v3 3/5] x86/boot: Get the max address from SRAT

2019-08-30 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

Get the max address from SRAT and write it into boot_params->max_addr.

Signed-off-by: Masayoshi Mizuma 
---
 arch/x86/boot/compressed/acpi.c | 24 ++--
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
index 908a1bfab..ba2bc5ab9 100644
--- a/arch/x86/boot/compressed/acpi.c
+++ b/arch/x86/boot/compressed/acpi.c
@@ -362,16 +362,24 @@ static unsigned long get_acpi_srat_table(void)
return 0;
 }
 
-static void subtable_parse(struct acpi_subtable_header *sub_table, int *num)
+static void subtable_parse(struct acpi_subtable_header *sub_table, int *num,
+   unsigned long *max_addr)
 {
struct acpi_srat_mem_affinity *ma;
+   unsigned long addr;
 
ma = (struct acpi_srat_mem_affinity *)sub_table;
 
-   if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && ma->length) {
-   immovable_mem[*num].start = ma->base_address;
-   immovable_mem[*num].size = ma->length;
-   (*num)++;
+   if (ma->length) {
+   if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
+   addr = ma->base_address + ma->length;
+   if (addr > *max_addr)
+   *max_addr = addr;
+   } else {
+   immovable_mem[*num].start = ma->base_address;
+   immovable_mem[*num].size = ma->length;
+   (*num)++;
+   }
}
 }
 
@@ -391,6 +399,7 @@ int count_immovable_mem_regions(void)
struct acpi_subtable_header *sub_table;
struct acpi_table_header *table_header;
char arg[MAX_ACPI_ARG_LENGTH];
+   unsigned long max_addr = 0;
int num = 0;
 
if (cmdline_find_option("acpi", arg, sizeof(arg)) == 3 &&
@@ -409,7 +418,7 @@ int count_immovable_mem_regions(void)
sub_table = (struct acpi_subtable_header *)table;
if (sub_table->type == ACPI_SRAT_TYPE_MEMORY_AFFINITY) {
 
-   subtable_parse(sub_table, );
+   subtable_parse(sub_table, , _addr);
 
if (num >= MAX_NUMNODES*2) {
debug_putstr("Too many immovable memory 
regions, aborting.\n");
@@ -418,6 +427,9 @@ int count_immovable_mem_regions(void)
}
table += sub_table->length;
}
+
+   boot_params->max_addr = max_addr;
+
return num;
 }
 #endif /* CONFIG_RANDOMIZE_BASE && CONFIG_MEMORY_HOTREMOVE */
-- 
2.18.1



[PATCH v2] arm64/mm: Correct the cache line size warning with non coherent device

2019-06-14 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

If the cache line size is greater than ARCH_DMA_MINALIGN (128),
the warning shows and it's tainted as TAINT_CPU_OUT_OF_SPEC.

However, it's not good because as discussed in the thread [1], the cpu
cache line size will be problem only on non-coherent devices.

Since the coherent flag is already introduced to struct device,
show the warning only if the device is non-coherent device and
ARCH_DMA_MINALIGN is smaller than the cpu cache size.

[1] 
https://lore.kernel.org/linux-arm-kernel/20180514145703.celnlobzn3uh5tc2@localhost/

Signed-off-by: Masayoshi Mizuma 
Reviewed-by: Hidetoshi Seto 
Tested-by: Zhang Lei 
---
 arch/arm64/include/asm/cache.h |  7 +++
 arch/arm64/kernel/cacheinfo.c  |  4 +---
 arch/arm64/mm/dma-mapping.c| 14 ++
 3 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index 758af6340314..d24b7c1ecd9b 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -91,6 +91,13 @@ static inline u32 cache_type_cwg(void)
 
 #define __read_mostly __attribute__((__section__(".data..read_mostly")))
 
+static inline int cache_line_size_of_cpu(void)
+{
+   u32 cwg = cache_type_cwg();
+
+   return cwg ? 4 << cwg : ARCH_DMA_MINALIGN;
+}
+
 int cache_line_size(void);
 
 /*
diff --git a/arch/arm64/kernel/cacheinfo.c b/arch/arm64/kernel/cacheinfo.c
index 6eaf1c07aa4e..7fa6828bb488 100644
--- a/arch/arm64/kernel/cacheinfo.c
+++ b/arch/arm64/kernel/cacheinfo.c
@@ -19,12 +19,10 @@
 
 int cache_line_size(void)
 {
-   u32 cwg = cache_type_cwg();
-
if (coherency_max_size != 0)
return coherency_max_size;
 
-   return cwg ? 4 << cwg : ARCH_DMA_MINALIGN;
+   return cache_line_size_of_cpu();
 }
 EXPORT_SYMBOL_GPL(cache_line_size);
 
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 1669618db08a..379589dc7113 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -38,10 +38,6 @@ void arch_dma_prep_coherent(struct page *page, size_t size)
 
 static int __init arm64_dma_init(void)
 {
-   WARN_TAINT(ARCH_DMA_MINALIGN < cache_line_size(),
-  TAINT_CPU_OUT_OF_SPEC,
-  "ARCH_DMA_MINALIGN smaller than CTR_EL0.CWG (%d < %d)",
-  ARCH_DMA_MINALIGN, cache_line_size());
return dma_atomic_pool_init(GFP_DMA32, __pgprot(PROT_NORMAL_NC));
 }
 arch_initcall(arm64_dma_init);
@@ -56,7 +52,17 @@ void arch_teardown_dma_ops(struct device *dev)
 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
const struct iommu_ops *iommu, bool coherent)
 {
+   int cls = cache_line_size_of_cpu();
+
dev->dma_coherent = coherent;
+
+   if (!coherent)
+   WARN_TAINT(cls > ARCH_DMA_MINALIGN,
+   TAINT_CPU_OUT_OF_SPEC,
+   "%s %s: ARCH_DMA_MINALIGN smaller than CTR_EL0.CWG (%d 
< %d)",
+   dev_driver_string(dev), dev_name(dev),
+   ARCH_DMA_MINALIGN, cls);
+
if (iommu)
iommu_setup_dma_ops(dev, dma_base, size);
 
-- 
2.20.1



Re: [PATCH 1/2] arm64/mm: check cpu cache line size with non-coherent device

2019-06-11 Thread Masayoshi Mizuma
On Tue, Jun 11, 2019 at 07:00:07PM +0100, Catalin Marinas wrote:
> On Tue, Jun 11, 2019 at 11:17:30AM -0400, Masayoshi Mizuma wrote:
> > --- a/arch/arm64/mm/dma-mapping.c
> > +++ b/arch/arm64/mm/dma-mapping.c
> > @@ -91,10 +91,6 @@ static int __swiotlb_mmap_pfn(struct vm_area_struct *vma,
> >  
> >  static int __init arm64_dma_init(void)
> >  {
> > -   WARN_TAINT(ARCH_DMA_MINALIGN < cache_line_size(),
> > -  TAINT_CPU_OUT_OF_SPEC,
> > -  "ARCH_DMA_MINALIGN smaller than CTR_EL0.CWG (%d < %d)",
> > -  ARCH_DMA_MINALIGN, cache_line_size());
> > return dma_atomic_pool_init(GFP_DMA32, __pgprot(PROT_NORMAL_NC));
> >  }
> >  arch_initcall(arm64_dma_init);
> > @@ -473,6 +469,11 @@ void arch_setup_dma_ops(struct device *dev, u64 
> > dma_base, u64 size,
> > const struct iommu_ops *iommu, bool coherent)
> >  {
> > dev->dma_coherent = coherent;
> > +
> > +   if (!coherent && (cache_line_size() > ARCH_DMA_MINALIGN))
> > +   dev_WARN(dev, "ARCH_DMA_MINALIGN smaller than CTR_EL0.CWG (%d < 
> > %d)",
> > +   ARCH_DMA_MINALIGN, cache_line_size());
> 
> I'm ok in principle with this patch, with the minor issue that since
> commit 7b8c87b297a7 ("arm64: cacheinfo: Update cache_line_size detected
> from DT or PPTT") queued for 5.3 cache_line_size() gets the information
> from DT or ACPI. The reason for this change is that the information is
> used for performance tuning rather than DMA coherency.
> 
> You can go for a direct cache_type_cwg() check in here, unless Robin
> (cc'ed) has a better idea.

Got it, thanks.
I believe coherency_max_size is zero in case of coherent is false,
so I'll modify the patch as following. Does it make sense?

@@ -57,6 +53,11 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, 
u64 size,
const struct iommu_ops *iommu, bool coherent)
 {
dev->dma_coherent = coherent;
+
+   if (!coherent && (cache_line_size() > ARCH_DMA_MINALIGN))
+   dev_WARN(dev, "ARCH_DMA_MINALIGN smaller than CTR_EL0.CWG (%d < 
%d)",
+   ARCH_DMA_MINALIGN, (4 << cache_type_cwg()));
+
if (iommu)
iommu_setup_dma_ops(dev, dma_base, size);

Thanks,
Masa


Re: [PATCH 2/2] arm64/mm: show TAINT_CPU_OUT_OF_SPEC warning if the cache size is over the spec.

2019-06-11 Thread Masayoshi Mizuma
On Tue, Jun 11, 2019 at 04:41:06PM +0100, Catalin Marinas wrote:
> On Tue, Jun 11, 2019 at 11:17:31AM -0400, Masayoshi Mizuma wrote:
> > From: Masayoshi Mizuma 
> > 
> > Show the warning and taints as TAINT_CPU_OUT_OF_SPEC if the cache line
> > size is greater than the maximum.
> 
> In general the "out of spec" part is a misnomer, we tend to apply it to
> CPU features that are not supported by the kernel rather than some CPU
> feature not compliant with the architecture (we call the latter errata).
> 
> I suggest you drop this patch.

Thank you for your comments. I agree with you, so I drop this
patch.

Thanks,
Masa


[PATCH 2/2] arm64/mm: show TAINT_CPU_OUT_OF_SPEC warning if the cache size is over the spec.

2019-06-11 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

Show the warning and taints as TAINT_CPU_OUT_OF_SPEC if the cache line
size is greater than the maximum.

Signed-off-by: Masayoshi Mizuma 
Reviewed-by: Hidetoshi Seto 
Tested-by: Zhang Lei 
---
 arch/arm64/include/asm/cache.h | 2 ++
 arch/arm64/mm/init.c   | 5 +
 2 files changed, 7 insertions(+)

diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
index 926434f413fa..636e277fefc9 100644
--- a/arch/arm64/include/asm/cache.h
+++ b/arch/arm64/include/asm/cache.h
@@ -91,6 +91,8 @@ static inline u32 cache_type_cwg(void)
 
 #define __read_mostly __attribute__((__section__(".data..read_mostly")))
 
+#define ARM64_MAX_CACHE_LINE_SIZE  2048
+
 static inline int cache_line_size(void)
 {
u32 cwg = cache_type_cwg();
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index d2adffb81b5d..df621d90b19c 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -562,6 +562,11 @@ void __init mem_init(void)
 */
sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
}
+
+   WARN_TAINT(cache_line_size() > ARM64_MAX_CACHE_LINE_SIZE,
+  TAINT_CPU_OUT_OF_SPEC,
+  "CTR_EL0.CWG is greater than the spec (%d > %d)",
+  cache_line_size(), ARM64_MAX_CACHE_LINE_SIZE);
 }
 
 void free_initmem(void)
-- 
2.20.1



[PATCH 1/2] arm64/mm: check cpu cache line size with non-coherent device

2019-06-11 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

As discussed in the thread [1], the cpu cache line size will be a problem
only on non-coherent devices. And, the coherent flag is already introduced
to struct device.

Show the warning only if the device is non-coherent device and
ARCH_DMA_MINALIGN is smaller than the cpu cache size.

[1] 
https://lore.kernel.org/linux-arm-kernel/20180514145703.celnlobzn3uh5tc2@localhost/

Signed-off-by: Masayoshi Mizuma 
Reviewed-by: Hidetoshi Seto 
Tested-by: Zhang Lei 
---
 arch/arm64/mm/dma-mapping.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 674860e3e478..c0c09890c845 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -91,10 +91,6 @@ static int __swiotlb_mmap_pfn(struct vm_area_struct *vma,
 
 static int __init arm64_dma_init(void)
 {
-   WARN_TAINT(ARCH_DMA_MINALIGN < cache_line_size(),
-  TAINT_CPU_OUT_OF_SPEC,
-  "ARCH_DMA_MINALIGN smaller than CTR_EL0.CWG (%d < %d)",
-  ARCH_DMA_MINALIGN, cache_line_size());
return dma_atomic_pool_init(GFP_DMA32, __pgprot(PROT_NORMAL_NC));
 }
 arch_initcall(arm64_dma_init);
@@ -473,6 +469,11 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, 
u64 size,
const struct iommu_ops *iommu, bool coherent)
 {
dev->dma_coherent = coherent;
+
+   if (!coherent && (cache_line_size() > ARCH_DMA_MINALIGN))
+   dev_WARN(dev, "ARCH_DMA_MINALIGN smaller than CTR_EL0.CWG (%d < 
%d)",
+   ARCH_DMA_MINALIGN, cache_line_size());
+
__iommu_setup_dma_ops(dev, dma_base, size, iommu);
 
 #ifdef CONFIG_XEN
-- 
2.20.1



[PATCH 0/2] Correct the cache line size warning

2019-06-11 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

If the cache line size is greater than ARCH_DMA_MINALIGN (128),
the warning shows and it's tainted as TAINT_CPU_OUT_OF_SPEC.

However, it's not good for two reasons.
First, as discussed in the thread [1], the cpu cache line size will be
a problem only on non-coherent devices.
Second, it should not be tainted as TAINT_CPU_OUT_OF_SPEC because,
according to the specification of CTR_EL0.CWG, the maximum cache
writeback granule is 2048 bytes (CWG == 0b1001).

This patch series try to:

- Show the warning only if the device is non-coherent device and
  ARCH_DMA_MINALIGN is smaller than the cpu cache size.

- Show the warning and taints as TAINT_CPU_OUT_OF_SPEC if the cache line
  size is greater than the maximum.

[1] 
https://lore.kernel.org/linux-arm-kernel/20180514145703.celnlobzn3uh5tc2@localhost/

Masayoshi Mizuma (2):
  arm64/mm: check cpu cache line size with non-coherent device
  arm64/mm: show TAINT_CPU_OUT_OF_SPEC warning if the cache size is over
the spec.

 arch/arm64/include/asm/cache.h | 2 ++
 arch/arm64/mm/dma-mapping.c| 9 +
 arch/arm64/mm/init.c   | 5 +
 3 files changed, 12 insertions(+), 4 deletions(-)

-- 
2.20.1



[PATCH v2 6/6] ktest: update sample.conf for grub2bls

2019-05-09 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

Update sample.conf for grub2bls

Signed-off-by: Masayoshi Mizuma 
---
 tools/testing/ktest/sample.conf | 20 ++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/tools/testing/ktest/sample.conf b/tools/testing/ktest/sample.conf
index 8c893a58b68e..c3bc933d437b 100644
--- a/tools/testing/ktest/sample.conf
+++ b/tools/testing/ktest/sample.conf
@@ -349,13 +349,13 @@
 # option to boot to with GRUB_REBOOT
 #GRUB_FILE = /boot/grub2/grub.cfg
 
-# The tool for REBOOT_TYPE = grub2 to set the next reboot kernel
+# The tool for REBOOT_TYPE = grub2 or grub2bls to set the next reboot kernel
 # to boot into (one shot mode).
 # (default grub2_reboot)
 #GRUB_REBOOT = grub2_reboot
 
 # The grub title name for the test kernel to boot
-# (Only mandatory if REBOOT_TYPE = grub or grub2)
+# (Only mandatory if REBOOT_TYPE = grub or grub2 or grub2bls)
 #
 # Note, ktest.pl will not update the grub menu.lst, you need to
 # manually add an option for the test. ktest.pl will search
@@ -374,6 +374,10 @@
 # do a: GRUB_MENU = 'Test Kernel'
 # For customizing, add your entry in /etc/grub.d/40_custom.
 #
+# For grub2bls, a search of "title"s are done. The menu is found
+# by searching for the contents of GRUB_MENU in the line that starts
+# with "title".
+#
 #GRUB_MENU = Test Kernel
 
 # For REBOOT_TYPE = syslinux, the name of the syslinux executable
@@ -479,6 +483,11 @@
 # default (undefined)
 #POST_KTEST = ${SSH} ~/dismantle_test
 
+# If you want to remove the kernel entry in Boot Loader Specification (BLS)
+# environment, use kernel-install command.
+# Here's the example:
+#POST_KTEST = ssh root@Test "/usr/bin/kernel-install remove $KERNEL_VERSION"
+
 # The default test type (default test)
 # The test types may be:
 #   build   - only build the kernel, do nothing else
@@ -530,6 +539,11 @@
 # or on some systems:
 #POST_INSTALL = ssh user@target /sbin/dracut -f /boot/initramfs-test.img 
$KERNEL_VERSION
 
+# If you want to add the kernel entry in Boot Loader Specification (BLS)
+# environment, use kernel-install command.
+# Here's the example:
+#POST_INSTALL = ssh root@Test "/usr/bin/kernel-install add $KERNEL_VERSION 
/boot/vmlinuz-$KERNEL_VERSION"
+
 # If for some reason you just want to boot the kernel and you do not
 # want the test to install anything new. For example, you may just want
 # to boot test the same kernel over and over and do not want to go through
@@ -593,6 +607,8 @@
 # For REBOOT_TYPE = grub2, you must define both GRUB_MENU and
 # GRUB_FILE.
 #
+# For REBOOT_TYPE = grub2bls, you must define GRUB_MENU.
+#
 # For REBOOT_TYPE = syslinux, you must define SYSLINUX_LABEL, and
 # perhaps modify SYSLINUX (default extlinux) and SYSLINUX_PATH
 # (default /boot/extlinux)
-- 
2.20.1



[PATCH v2 5/6] ktest: remove get_grub2_index

2019-05-09 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

Remove get_grub2_index() because it isn't used anywhere.

Signed-off-by: Masayoshi Mizuma 
---
 tools/testing/ktest/ktest.pl | 36 
 1 file changed, 36 deletions(-)

diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index abd6f37b0561..4711f57e809a 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -1916,42 +1916,6 @@ sub _get_grub_index {
 $last_machine = $machine;
 }
 
-sub get_grub2_index {
-
-return if (defined($grub_number) && defined($last_grub_menu) &&
-  $last_grub_menu eq $grub_menu && defined($last_machine) &&
-  $last_machine eq $machine);
-
-doprint "Find grub2 menu ... ";
-$grub_number = -1;
-
-my $ssh_grub = $ssh_exec;
-$ssh_grub =~ s,\$SSH_COMMAND,cat $grub_file,g;
-
-open(IN, "$ssh_grub |")
-   or dodie "unable to get $grub_file";
-
-my $found = 0;
-my $grub_menu_qt = quotemeta($grub_menu);
-
-while () {
-   if (/^menuentry.*$grub_menu_qt/) {
-   $grub_number++;
-   $found = 1;
-   last;
-   } elsif (/^menuentry\s|^submenu\s/) {
-   $grub_number++;
-   }
-}
-close(IN);
-
-dodie "Could not find '$grub_menu' in $grub_file on $machine"
-   if (!$found);
-doprint "$grub_number\n";
-$last_grub_menu = $grub_menu;
-$last_machine = $machine;
-}
-
 sub get_grub_index {
 
 my $command;
-- 
2.20.1



[PATCH v2 4/6] ktest: pass KERNEL_VERSION to POST_KTEST

2019-05-09 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

For BLS, kernel entry is added by kernel-install command through
POST_INSTALL, for example,

POST_INSTALL = ssh root@Test "/usr/bin/kernel-install \
add $KERNEL_VERSION /boot/vmlinuz-$KERNEL_VERSION"

The entry is removed by kernel-install command and the kernel
version is needed for the argument. 

Pass KERNEL_VERSION variable to POST_KTEST so that kernel-install
command can remove the entry like as follows:

POST_KTEST = ssh root@Test "/usr/bin/kernel-install remove $KERNEL_VERSION"

Signed-off-by: Masayoshi Mizuma 
---
 tools/testing/ktest/ktest.pl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index df0c609c7c50..abd6f37b0561 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -4456,7 +4456,9 @@ for (my $i = 1; $i <= $opt{"NUM_TESTS"}; $i++) {
 }
 
 if (defined($final_post_ktest)) {
-run_command $final_post_ktest;
+
+my $cp_final_post_ktest = eval_kernel_version $final_post_ktest;
+run_command $cp_final_post_ktest;
 }
 
 if ($opt{"POWEROFF_ON_SUCCESS"}) {
-- 
2.20.1



[PATCH v2 1/6] ktest: introduce _get_grub_index

2019-05-09 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

Introduce _get_grub_index() to deal with Boot Loader
Specification (BLS) and cleanup.

Signed-off-by: Masayoshi Mizuma 
---
 tools/testing/ktest/ktest.pl | 37 
 1 file changed, 37 insertions(+)

diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index 275ad8ac8872..43868ee07e17 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -1871,6 +1871,43 @@ sub run_scp_mod {
 return run_scp($src, $dst, $cp_scp);
 }
 
+sub _get_grub_index {
+
+my ($command, $target, $skip) = @_;
+
+return if (defined($grub_number) && defined($last_grub_menu) &&
+  $last_grub_menu eq $grub_menu && defined($last_machine) &&
+  $last_machine eq $machine);
+
+doprint "Find $reboot_type menu ... ";
+$grub_number = -1;
+
+my $ssh_grub = $ssh_exec;
+$ssh_grub =~ s,\$SSH_COMMAND,$command,g;
+
+open(IN, "$ssh_grub |")
+   or dodie "unable to execute $command";
+
+my $found = 0;
+
+while () {
+   if (/$target/) {
+   $grub_number++;
+   $found = 1;
+   last;
+   } elsif (/$skip/) {
+   $grub_number++;
+   }
+}
+close(IN);
+
+dodie "Could not find '$grub_menu' through $command on $machine"
+   if (!$found);
+doprint "$grub_number\n";
+$last_grub_menu = $grub_menu;
+$last_machine = $machine;
+}
+
 sub get_grub2_index {
 
 return if (defined($grub_number) && defined($last_grub_menu) &&
-- 
2.20.1



[PATCH v2 3/6] ktest: introduce grub2bls REBOOT_TYPE option

2019-05-09 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

Fedora 30 introduces the Boot Loader Specification (BLS),
which changes the grub entry configuration.

kernel entries aren't in grub.cfg. We can get the entries
by "grubby --info=ALL" command.

Introduce grub2bls as REBOOT_TYPE option for BLS.

Signed-off-by: Masayoshi Mizuma 
---
 tools/testing/ktest/ktest.pl | 18 +++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index ff43f8336da1..df0c609c7c50 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -64,6 +64,7 @@ my %default = (
 "STOP_TEST_AFTER"  => 600,
 "MAX_MONITOR_WAIT" => 1800,
 "GRUB_REBOOT"  => "grub2-reboot",
+"GRUB_BLS_GET" => "grubby --info=ALL",
 "SYSLINUX" => "extlinux",
 "SYSLINUX_PATH"=> "/boot/extlinux",
 "CONNECT_TIMEOUT"  => 25,
@@ -125,6 +126,7 @@ my $last_grub_menu;
 my $grub_file;
 my $grub_number;
 my $grub_reboot;
+my $grub_bls_get;
 my $syslinux;
 my $syslinux_path;
 my $syslinux_label;
@@ -295,6 +297,7 @@ my %option_map = (
 "GRUB_MENU"=> \$grub_menu,
 "GRUB_FILE"=> \$grub_file,
 "GRUB_REBOOT"  => \$grub_reboot,
+"GRUB_BLS_GET" => \$grub_bls_get,
 "SYSLINUX" => \$syslinux,
 "SYSLINUX_PATH"=> \$syslinux_path,
 "SYSLINUX_LABEL"   => \$syslinux_label,
@@ -440,7 +443,7 @@ EOF
 ;
 $config_help{"REBOOT_TYPE"} = << "EOF"
  Way to reboot the box to the test kernel.
- Only valid options so far are "grub", "grub2", "syslinux", and "script".
+ Only valid options so far are "grub", "grub2", "grub2bls", "syslinux", and 
"script".
 
  If you specify grub, it will assume grub version 1
  and will search in /boot/grub/menu.lst for the title \$GRUB_MENU
@@ -454,6 +457,8 @@ $config_help{"REBOOT_TYPE"} = << "EOF"
  If you specify grub2, then you also need to specify both \$GRUB_MENU
  and \$GRUB_FILE.
 
+ If you specify grub2bls, then you also need to specify \$GRUB_MENU.
+
  If you specify syslinux, then you may use SYSLINUX to define the syslinux
  command (defaults to extlinux), and SYSLINUX_PATH to specify the path to
  the syslinux install (defaults to /boot/extlinux). But you have to specify
@@ -479,6 +484,9 @@ $config_help{"GRUB_MENU"} = << "EOF"
  menu must be a non-nested menu. Add the quotes used in the menu
  to guarantee your selection, as the first menuentry with the content
  of \$GRUB_MENU that is found will be used.
+
+ For grub2bls, \$GRUB_MENU is searched on the result of \$GRUB_BLS_GET
+ command for the lines that begin with "title".
 EOF
 ;
 $config_help{"GRUB_FILE"} = << "EOF"
@@ -695,7 +703,7 @@ sub get_mandatory_configs {
}
 }
 
-if ($rtype eq "grub") {
+if (($rtype eq "grub") or ($rtype eq "grub2bls")) {
get_mandatory_config("GRUB_MENU");
 }
 
@@ -1965,6 +1973,10 @@ sub get_grub_index {
$command = "cat $grub_file";
$target = '^menuentry.*' . $grub_menu_qt;
$skip = '^menuentry\s|^submenu\s';
+} elsif ($reboot_type eq "grub2bls") {
+$command = $grub_bls_get;
+$target = '^title=.*' . $grub_menu_qt;
+$skip = '^title=';
 } else {
return;
 }
@@ -4324,7 +4336,7 @@ for (my $i = 1; $i <= $opt{"NUM_TESTS"}; $i++) {
 
 if (!$buildonly) {
$target = "$ssh_user\@$machine";
-   if ($reboot_type eq "grub") {
+   if (($reboot_type eq "grub") or ($reboot_type eq "grub2bls")) {
dodie "GRUB_MENU not defined" if (!defined($grub_menu));
} elsif ($reboot_type eq "grub2") {
dodie "GRUB_MENU not defined" if (!defined($grub_menu));
-- 
2.20.1



[PATCH v2 2/6] ktest: cleanup get_grub_index

2019-05-09 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

Cleanup get_grub_index().

Signed-off-by: Masayoshi Mizuma 
---
 tools/testing/ktest/ktest.pl | 50 
 1 file changed, 17 insertions(+), 33 deletions(-)

diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index 43868ee07e17..ff43f8336da1 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -1946,46 +1946,30 @@ sub get_grub2_index {
 
 sub get_grub_index {
 
-if ($reboot_type eq "grub2") {
-   get_grub2_index;
-   return;
-}
+my $command;
+my $target;
+my $skip;
+my $grub_menu_qt;
 
-if ($reboot_type ne "grub") {
+if ($reboot_type !~ /^grub/) {
return;
 }
-return if (defined($grub_number) && defined($last_grub_menu) &&
-  $last_grub_menu eq $grub_menu && defined($last_machine) &&
-  $last_machine eq $machine);
-
-doprint "Find grub menu ... ";
-$grub_number = -1;
 
-my $ssh_grub = $ssh_exec;
-$ssh_grub =~ s,\$SSH_COMMAND,cat /boot/grub/menu.lst,g;
-
-open(IN, "$ssh_grub |")
-   or dodie "unable to get menu.lst";
+$grub_menu_qt = quotemeta($grub_menu);
 
-my $found = 0;
-my $grub_menu_qt = quotemeta($grub_menu);
-
-while () {
-   if (/^\s*title\s+$grub_menu_qt\s*$/) {
-   $grub_number++;
-   $found = 1;
-   last;
-   } elsif (/^\s*title\s/) {
-   $grub_number++;
-   }
+if ($reboot_type eq "grub") {
+   $command = "cat /boot/grub/menu.lst";
+   $target = '^\s*title\s+' . $grub_menu_qt . '\s*$';
+   $skip = '^\s*title\s';
+} elsif ($reboot_type eq "grub2") {
+   $command = "cat $grub_file";
+   $target = '^menuentry.*' . $grub_menu_qt;
+   $skip = '^menuentry\s|^submenu\s';
+} else {
+   return;
 }
-close(IN);
 
-dodie "Could not find '$grub_menu' in /boot/grub/menu on $machine"
-   if (!$found);
-doprint "$grub_number\n";
-$last_grub_menu = $grub_menu;
-$last_machine = $machine;
+_get_grub_index($command, $target, $skip);
 }
 
 sub wait_for_input
-- 
2.20.1



[PATCH v2 0/6] ktest: support for Boot Loader Specification

2019-05-09 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

Fedora 30 introduces the Boot Loader Specification (BLS) [1],
which changes the grub entry configuration.

This patch series deals with the new configuration.

- Add grub2bls option as REBOOT_TYPE to deal with BLS.
- Some cleanup around getting kernel entries.

To use ktest.pl to BLS environment,

- Set REBOOT_TYPE = grub2bls.
- Set POST_INSTALL to add the kernel entry like as follows.

  POST_INSTALL = ssh root@Test "/usr/bin/kernel-install add \
 $KERNEL_VERSION /boot/vmlinuz-$KERNEL_VERSION"

- Set POST_KTEST to remove the kernel entry (optional).

  POST_KTEST = ssh root@Test "/usr/bin/kernel-install remove $KERNEL_VERSION"

[1] https://fedoraproject.org/wiki/Changes/BootLoaderSpecByDefault

Masayoshi Mizuma (6):
  ktest: introduce _get_grub_index
  ktest: cleanup get_grub_index
  ktest: introduce grub2bls REBOOT_TYPE option
  ktest: pass KERNEL_VERSION to POST_KTEST
  ktest: remove get_grub2_index
  ktest: update sample.conf for grub2bls

 tools/testing/ktest/ktest.pl| 89 -
 tools/testing/ktest/sample.conf | 20 +++-
 2 files changed, 62 insertions(+), 47 deletions(-)

-- 
2.20.1



Re: [PATCH 2/5] ktest: cleanup get_grub_index

2019-05-09 Thread Masayoshi Mizuma
On Thu, May 09, 2019 at 01:57:21PM -0400, Steven Rostedt wrote:
> On Thu,  9 May 2019 13:46:27 -0400
> Masayoshi Mizuma  wrote:
> 
> > From: Masayoshi Mizuma 
> > 
> > Cleanup get_grub_index().
> 
> Hi Masayoshi,
> 
> Thanks for the patches, quick comment below.
> 
> > 
> > Signed-off-by: Masayoshi Mizuma 
> > ---
> >  tools/testing/ktest/ktest.pl | 50 +++-
> >  1 file changed, 15 insertions(+), 35 deletions(-)
> > 
> > diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
> > index 3862b23672f7..1255ea0d9df4 100755
> > --- a/tools/testing/ktest/ktest.pl
> > +++ b/tools/testing/ktest/ktest.pl
> > @@ -1934,46 +1934,26 @@ sub get_grub2_index {
> >  
> >  sub get_grub_index {
> >  
> > -if ($reboot_type eq "grub2") {
> > -   get_grub2_index;
> > -   return;
> > -}
> > -
> > -if ($reboot_type ne "grub") {
> > -   return;
> > -}
> 
> We still need something like:
> 
>   if ($reboot_type !~ /^grub/) {
>   return;
>   }
> 
> Because I believe this will run (and probably error) for syslinux boot
> systems. I have a couple, I could test it and find out ;-)

Thank you for your review! I'll add the check.

Thanks!
Masa

> 
> -- Steve
> 
> > -return if (defined($grub_number) && defined($last_grub_menu) &&
> > -  $last_grub_menu eq $grub_menu && defined($last_machine) &&
> > -  $last_machine eq $machine);
> > -
> > -doprint "Find grub menu ... ";
> > -$grub_number = -1;
> > +my $command;
> > +my $target;
> > +my $skip;
> > +my $grub_menu_qt;
> >  
> > -my $ssh_grub = $ssh_exec;
> > -$ssh_grub =~ s,\$SSH_COMMAND,cat /boot/grub/menu.lst,g;
> > +return if ($reboot_type ne "grub") and ($reboot_type ne "grub2");
> >  
> > -open(IN, "$ssh_grub |")
> > -   or dodie "unable to get menu.lst";
> > -
> > -my $found = 0;
> > -my $grub_menu_qt = quotemeta($grub_menu);
> > +$grub_menu_qt = quotemeta($grub_menu);
> >  
> > -while () {
> > -   if (/^\s*title\s+$grub_menu_qt\s*$/) {
> > -   $grub_number++;
> > -   $found = 1;
> > -   last;
> > -   } elsif (/^\s*title\s/) {
> > -   $grub_number++;
> > -   }
> > +if ($reboot_type eq "grub") {
> > +   $command = "cat /boot/grub/menu.lst";
> > +   $target = '^\s*title\s+' . $grub_menu_qt . '\s*$';
> > +   $skip = '^\s*title\s';
> > +} elsif ($reboot_type eq "grub2") {
> > +   $command = "cat $grub_file";
> > +   $target = '^menuentry.*' . $grub_menu_qt;
> > +   $skip = '^menuentry\s|^submenu\s';
> >  }
> > -close(IN);
> >  
> > -dodie "Could not find '$grub_menu' in /boot/grub/menu on $machine"
> > -   if (!$found);
> > -doprint "$grub_number\n";
> > -$last_grub_menu = $grub_menu;
> > -$last_machine = $machine;
> > +_get_grub_index($command, $target, $skip);
> >  }
> >  
> >  sub wait_for_input
> 


[PATCH 0/5] ktest: support for Boot Loader Specification

2019-05-09 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

Fedora 30 introduces the Boot Loader Specification (BLS) [1],
which changes the grub entry configuration.

This patchset deals with the new configuration.

- Add grub2bls option as REBOOT_TYPE to deal with BLS.
- Some cleanup around getting kernel entries.

[1] https://fedoraproject.org/wiki/Changes/BootLoaderSpecByDefault

Masayoshi Mizuma (5):
  ktest: introduce _get_grub_index
  ktest: cleanup get_grub_index
  ktest: introduce grub2bls REBOOT_TYPE option
  ktest: remove get_grub2_index
  ktest: update sample.conf for grub2bls

 tools/testing/ktest/ktest.pl| 86 +++--
 tools/testing/ktest/sample.conf | 10 +++-
 2 files changed, 48 insertions(+), 48 deletions(-)

-- 
2.20.1



[PATCH 5/5] ktest: update sample.conf for grub2bls

2019-05-09 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

Update sample.conf for grub2bls.

Signed-off-by: Masayoshi Mizuma 
---
 tools/testing/ktest/sample.conf | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tools/testing/ktest/sample.conf b/tools/testing/ktest/sample.conf
index 8c893a58b68e..5ac32ab1f3bc 100644
--- a/tools/testing/ktest/sample.conf
+++ b/tools/testing/ktest/sample.conf
@@ -349,13 +349,13 @@
 # option to boot to with GRUB_REBOOT
 #GRUB_FILE = /boot/grub2/grub.cfg
 
-# The tool for REBOOT_TYPE = grub2 to set the next reboot kernel
+# The tool for REBOOT_TYPE = grub2 or grub2bls to set the next reboot kernel
 # to boot into (one shot mode).
 # (default grub2_reboot)
 #GRUB_REBOOT = grub2_reboot
 
 # The grub title name for the test kernel to boot
-# (Only mandatory if REBOOT_TYPE = grub or grub2)
+# (Only mandatory if REBOOT_TYPE = grub or grub2 or grub2bls)
 #
 # Note, ktest.pl will not update the grub menu.lst, you need to
 # manually add an option for the test. ktest.pl will search
@@ -374,6 +374,10 @@
 # do a: GRUB_MENU = 'Test Kernel'
 # For customizing, add your entry in /etc/grub.d/40_custom.
 #
+# For grub2bls, a search of "title"s are done. The menu is found
+# by searching for the contents of GRUB_MENU in the line that starts
+# with "title".
+#
 #GRUB_MENU = Test Kernel
 
 # For REBOOT_TYPE = syslinux, the name of the syslinux executable
@@ -593,6 +597,8 @@
 # For REBOOT_TYPE = grub2, you must define both GRUB_MENU and
 # GRUB_FILE.
 #
+# For REBOOT_TYPE = grub2bls, you must define GRUB_MENU.
+#
 # For REBOOT_TYPE = syslinux, you must define SYSLINUX_LABEL, and
 # perhaps modify SYSLINUX (default extlinux) and SYSLINUX_PATH
 # (default /boot/extlinux)
-- 
2.20.1



[PATCH 2/5] ktest: cleanup get_grub_index

2019-05-09 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

Cleanup get_grub_index().

Signed-off-by: Masayoshi Mizuma 
---
 tools/testing/ktest/ktest.pl | 50 +++-
 1 file changed, 15 insertions(+), 35 deletions(-)

diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index 3862b23672f7..1255ea0d9df4 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -1934,46 +1934,26 @@ sub get_grub2_index {
 
 sub get_grub_index {
 
-if ($reboot_type eq "grub2") {
-   get_grub2_index;
-   return;
-}
-
-if ($reboot_type ne "grub") {
-   return;
-}
-return if (defined($grub_number) && defined($last_grub_menu) &&
-  $last_grub_menu eq $grub_menu && defined($last_machine) &&
-  $last_machine eq $machine);
-
-doprint "Find grub menu ... ";
-$grub_number = -1;
+my $command;
+my $target;
+my $skip;
+my $grub_menu_qt;
 
-my $ssh_grub = $ssh_exec;
-$ssh_grub =~ s,\$SSH_COMMAND,cat /boot/grub/menu.lst,g;
+return if ($reboot_type ne "grub") and ($reboot_type ne "grub2");
 
-open(IN, "$ssh_grub |")
-   or dodie "unable to get menu.lst";
-
-my $found = 0;
-my $grub_menu_qt = quotemeta($grub_menu);
+$grub_menu_qt = quotemeta($grub_menu);
 
-while () {
-   if (/^\s*title\s+$grub_menu_qt\s*$/) {
-   $grub_number++;
-   $found = 1;
-   last;
-   } elsif (/^\s*title\s/) {
-   $grub_number++;
-   }
+if ($reboot_type eq "grub") {
+   $command = "cat /boot/grub/menu.lst";
+   $target = '^\s*title\s+' . $grub_menu_qt . '\s*$';
+   $skip = '^\s*title\s';
+} elsif ($reboot_type eq "grub2") {
+   $command = "cat $grub_file";
+   $target = '^menuentry.*' . $grub_menu_qt;
+   $skip = '^menuentry\s|^submenu\s';
 }
-close(IN);
 
-dodie "Could not find '$grub_menu' in /boot/grub/menu on $machine"
-   if (!$found);
-doprint "$grub_number\n";
-$last_grub_menu = $grub_menu;
-$last_machine = $machine;
+_get_grub_index($command, $target, $skip);
 }
 
 sub wait_for_input
-- 
2.20.1



[PATCH 4/5] ktest: remove get_grub2_index

2019-05-09 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

Remove get_grub2_index() because it isn't used anywhere.

Signed-off-by: Masayoshi Mizuma 
---
 tools/testing/ktest/ktest.pl | 36 
 1 file changed, 36 deletions(-)

diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index c910d7921f48..e965751ad2da 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -1904,42 +1904,6 @@ sub _get_grub_index {
 $last_machine = $machine;
 }
 
-sub get_grub2_index {
-
-return if (defined($grub_number) && defined($last_grub_menu) &&
-  $last_grub_menu eq $grub_menu && defined($last_machine) &&
-  $last_machine eq $machine);
-
-doprint "Find grub2 menu ... ";
-$grub_number = -1;
-
-my $ssh_grub = $ssh_exec;
-$ssh_grub =~ s,\$SSH_COMMAND,cat $grub_file,g;
-
-open(IN, "$ssh_grub |")
-   or dodie "unable to get $grub_file";
-
-my $found = 0;
-my $grub_menu_qt = quotemeta($grub_menu);
-
-while () {
-   if (/^menuentry.*$grub_menu_qt/) {
-   $grub_number++;
-   $found = 1;
-   last;
-   } elsif (/^menuentry\s|^submenu\s/) {
-   $grub_number++;
-   }
-}
-close(IN);
-
-dodie "Could not find '$grub_menu' in $grub_file on $machine"
-   if (!$found);
-doprint "$grub_number\n";
-$last_grub_menu = $grub_menu;
-$last_machine = $machine;
-}
-
 sub get_grub_index {
 
 my $command;
-- 
2.20.1



[PATCH 3/5] ktest: introduce grub2bls REBOOT_TYPE option

2019-05-09 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

Fedora 30 introduces the Boot Loader Specification (BLS),
which changes the grub entry configuration.

kernel entries aren't in grub.cfg. We can get the entries
by "grubby --info=ALL" command.

Introduce grub2bls as REBOOT_TYPE option for BLS.

Signed-off-by: Masayoshi Mizuma 
---
 tools/testing/ktest/ktest.pl | 21 +
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index 1255ea0d9df4..c910d7921f48 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -64,6 +64,7 @@ my %default = (
 "STOP_TEST_AFTER"  => 600,
 "MAX_MONITOR_WAIT" => 1800,
 "GRUB_REBOOT"  => "grub2-reboot",
+"GRUB_BLS_GET" => "grubby --info=ALL",
 "SYSLINUX" => "extlinux",
 "SYSLINUX_PATH"=> "/boot/extlinux",
 "CONNECT_TIMEOUT"  => 25,
@@ -125,6 +126,7 @@ my $last_grub_menu;
 my $grub_file;
 my $grub_number;
 my $grub_reboot;
+my $grub_bls_get;
 my $syslinux;
 my $syslinux_path;
 my $syslinux_label;
@@ -295,6 +297,7 @@ my %option_map = (
 "GRUB_MENU"=> \$grub_menu,
 "GRUB_FILE"=> \$grub_file,
 "GRUB_REBOOT"  => \$grub_reboot,
+"GRUB_BLS_GET" => \$grub_bls_get,
 "SYSLINUX" => \$syslinux,
 "SYSLINUX_PATH"=> \$syslinux_path,
 "SYSLINUX_LABEL"   => \$syslinux_label,
@@ -440,7 +443,7 @@ EOF
 ;
 $config_help{"REBOOT_TYPE"} = << "EOF"
  Way to reboot the box to the test kernel.
- Only valid options so far are "grub", "grub2", "syslinux", and "script".
+ Only valid options so far are "grub", "grub2", "grub2bls", "syslinux", and 
"script".
 
  If you specify grub, it will assume grub version 1
  and will search in /boot/grub/menu.lst for the title \$GRUB_MENU
@@ -454,6 +457,8 @@ $config_help{"REBOOT_TYPE"} = << "EOF"
  If you specify grub2, then you also need to specify both \$GRUB_MENU
  and \$GRUB_FILE.
 
+ If you specify grub2bls, then you also need to specify \$GRUB_MENU.
+
  If you specify syslinux, then you may use SYSLINUX to define the syslinux
  command (defaults to extlinux), and SYSLINUX_PATH to specify the path to
  the syslinux install (defaults to /boot/extlinux). But you have to specify
@@ -479,6 +484,9 @@ $config_help{"GRUB_MENU"} = << "EOF"
  menu must be a non-nested menu. Add the quotes used in the menu
  to guarantee your selection, as the first menuentry with the content
  of \$GRUB_MENU that is found will be used.
+
+ For grub2bls, \$GRUB_MENU is searched on the result of \$GRUB_BLS_GET
+ command for the lines that begin with "title".
 EOF
 ;
 $config_help{"GRUB_FILE"} = << "EOF"
@@ -695,7 +703,7 @@ sub get_mandatory_configs {
}
 }
 
-if ($rtype eq "grub") {
+if (($rtype eq "grub") or ($rtype eq "grub2bls")) {
get_mandatory_config("GRUB_MENU");
 }
 
@@ -1939,7 +1947,8 @@ sub get_grub_index {
 my $skip;
 my $grub_menu_qt;
 
-return if ($reboot_type ne "grub") and ($reboot_type ne "grub2");
+return if ($reboot_type ne "grub") and ($reboot_type ne "grub2") and
+   ($reboot_type ne "grub2bls");
 
 $grub_menu_qt = quotemeta($grub_menu);
 
@@ -1951,6 +1960,10 @@ sub get_grub_index {
$command = "cat $grub_file";
$target = '^menuentry.*' . $grub_menu_qt;
$skip = '^menuentry\s|^submenu\s';
+} elsif ($reboot_type eq "grub2bls") {
+$command = $grub_bls_get;
+$target = '^title=.*' . $grub_menu_qt;
+$skip = '^title=';
 }
 
 _get_grub_index($command, $target, $skip);
@@ -4306,7 +4319,7 @@ for (my $i = 1; $i <= $opt{"NUM_TESTS"}; $i++) {
 
 if (!$buildonly) {
$target = "$ssh_user\@$machine";
-   if ($reboot_type eq "grub") {
+   if (($reboot_type eq "grub") or ($reboot_type eq "grub2bls")) {
dodie "GRUB_MENU not defined" if (!defined($grub_menu));
} elsif ($reboot_type eq "grub2") {
dodie "GRUB_MENU not defined" if (!defined($grub_menu));
-- 
2.20.1



[PATCH 1/5] ktest: introduce _get_grub_index

2019-05-09 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

Introduce _get_grub_index() to deal with the Boot Loader
Specification (BLS) and as a cleanup.

Signed-off-by: Masayoshi Mizuma 
---
 tools/testing/ktest/ktest.pl | 37 
 1 file changed, 37 insertions(+)

diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index 751e32a31ed4..3862b23672f7 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -1859,6 +1859,43 @@ sub run_scp_mod {
 return run_scp($src, $dst, $cp_scp);
 }
 
+sub _get_grub_index {
+
+my ($command, $target, $skip) = @_;
+
+return if (defined($grub_number) && defined($last_grub_menu) &&
+  $last_grub_menu eq $grub_menu && defined($last_machine) &&
+  $last_machine eq $machine);
+
+doprint "Find $reboot_type menu ... ";
+$grub_number = -1;
+
+my $ssh_grub = $ssh_exec;
+$ssh_grub =~ s,\$SSH_COMMAND,$command,g;
+
+open(IN, "$ssh_grub |")
+   or dodie "unable to execute $command";
+
+my $found = 0;
+
+while () {
+   if (/$target/) {
+   $grub_number++;
+   $found = 1;
+   last;
+   } elsif (/$skip/) {
+   $grub_number++;
+   }
+}
+close(IN);
+
+dodie "Could not find '$grub_menu' through $command on $machine"
+   if (!$found);
+doprint "$grub_number\n";
+$last_grub_menu = $grub_menu;
+$last_machine = $machine;
+}
+
 sub get_grub2_index {
 
 return if (defined($grub_number) && defined($last_grub_menu) &&
-- 
2.20.1



[PATCH v2] ktest: introduce REBOOT_RETURN_CODE to confirm the result of REBOOT

2019-04-18 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

An unexpected power cycle occurs during the installation of the
kernel.

   ssh root@Test sync ... [0 seconds] SUCCESS
   ssh root@Test reboot ... [1 second] FAILED!
   virsh destroy Test; sleep 5; virsh start Test ... [6 seconds] SUCCESS

That is because REBOOT, the default is "ssh $SSH_USER@$MACHINE
reboot", exits as 255 even if the reboot is successfully done,
like as:

   ]# ssh root@Test reboot
   Connection to Test closed by remote host.
   ]# echo $?
   255
   ]#

To avoid the unexpected power cycle, introduce a new parameter,
REBOOT_RETURN_CODE to judge whether REBOOT is successfully done
or not.

Signed-off-by: Masayoshi Mizuma 
---
 tools/testing/ktest/ktest.pl| 9 +
 tools/testing/ktest/sample.conf | 4 
 2 files changed, 13 insertions(+)

diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index fc6140d45280..751e32a31ed4 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -58,6 +58,7 @@ my %default = (
 "SCP_TO_TARGET"=> "scp \$SRC_FILE 
\$SSH_USER\@\$MACHINE:\$DST_FILE",
 "SCP_TO_TARGET_INSTALL"=> "\${SCP_TO_TARGET}",
 "REBOOT"   => "ssh \$SSH_USER\@\$MACHINE reboot",
+"REBOOT_RETURN_CODE"   => 255,
 "STOP_AFTER_SUCCESS"   => 10,
 "STOP_AFTER_FAILURE"   => 60,
 "STOP_TEST_AFTER"  => 600,
@@ -105,6 +106,7 @@ my $reboot_type;
 my $reboot_script;
 my $power_cycle;
 my $reboot;
+my $reboot_return_code;
 my $reboot_on_error;
 my $switch_to_good;
 my $switch_to_test;
@@ -278,6 +280,7 @@ my %option_map = (
 "POST_BUILD_DIE"   => \$post_build_die,
 "POWER_CYCLE"  => \$power_cycle,
 "REBOOT"   => \$reboot,
+"REBOOT_RETURN_CODE"   => \$reboot_return_code,
 "BUILD_NOCLEAN"=> \$noclean,
 "MIN_CONFIG"   => \$minconfig,
 "OUTPUT_MIN_CONFIG"=> \$output_minconfig,
@@ -1737,6 +1740,7 @@ sub run_command {
 my $dord = 0;
 my $dostdout = 0;
 my $pid;
+my $command_orig = $command;
 
 $command =~ s/\$SSH_USER/$ssh_user/g;
 $command =~ s/\$MACHINE/$machine/g;
@@ -1791,6 +1795,11 @@ sub run_command {
 # shift 8 for real exit status
 $run_command_status = $? >> 8;
 
+if ($command_orig eq $default{REBOOT} &&
+   $run_command_status == $reboot_return_code) {
+   $run_command_status = 0;
+}
+
 close(CMD);
 close(LOG) if ($dolog);
 close(RD)  if ($dord);
diff --git a/tools/testing/ktest/sample.conf b/tools/testing/ktest/sample.conf
index 6ca6ca0ce695..8c893a58b68e 100644
--- a/tools/testing/ktest/sample.conf
+++ b/tools/testing/ktest/sample.conf
@@ -887,6 +887,10 @@
 # The variables SSH_USER and MACHINE are defined.
 #REBOOT = ssh $SSH_USER@$MACHINE reboot
 
+# The return code of REBOOT
+# (default 255)
+#REBOOT_RETURN_CODE = 255
+
 # The way triple faults are detected is by testing the kernel
 # banner. If the kernel banner for the kernel we are testing is
 # found, and then later a kernel banner for another kernel version
-- 
2.20.1



[PATCH v2] ktest: Add support for meta characters in GRUB_MENU

2019-04-17 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

ktest fails if meta characters are in GRUB_MENU, for example
GRUB_MENU = 'Fedora (test)'

The failure happens because the meta characters are not escaped,
so the menu doesn't match in any entries in GRUB_FILE.

Use quotemeta() to escape the meta characters.

Signed-off-by: Masayoshi Mizuma 
---
 tools/testing/ktest/ktest.pl | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index 87af8a68ab25..fc6140d45280 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -1866,9 +1866,10 @@ sub get_grub2_index {
or dodie "unable to get $grub_file";
 
 my $found = 0;
+my $grub_menu_qt = quotemeta($grub_menu);
 
 while () {
-   if (/^menuentry.*$grub_menu/) {
+   if (/^menuentry.*$grub_menu_qt/) {
$grub_number++;
$found = 1;
last;
@@ -1909,9 +1910,10 @@ sub get_grub_index {
or dodie "unable to get menu.lst";
 
 my $found = 0;
+my $grub_menu_qt = quotemeta($grub_menu);
 
 while () {
-   if (/^\s*title\s+$grub_menu\s*$/) {
+   if (/^\s*title\s+$grub_menu_qt\s*$/) {
$grub_number++;
$found = 1;
last;
-- 
2.20.1



Re: [PATCH] ktest: Add workaround to avoid unexpected power cycle

2019-04-17 Thread Masayoshi Mizuma
On Wed, Apr 17, 2019 at 04:29:42PM -0400, Steven Rostedt wrote:
> On Wed, 17 Apr 2019 16:14:42 -0400
> Masayoshi Mizuma  wrote:
> 
> > From: Masayoshi Mizuma 
> > 
> > Unexpected power cycle occurs while the installation of the
> > kernel.
> > 
> > That is because the default reboot command, "ssh $SSH_USER@$MACHINE
> > reboot" exits as 255 even if the reboot is successfully done,
> > like as:
> > 
> >   ]# ssh root@Test reboot
> >   Connection to Test closed by remote host.
> >   ]# echo $?
> >   255
> >   ]#
> > 
> > To avoid the unexpected power cycle, the reboot is considered as
> > successfully done if the reboot is the default command and the
> > return code is 255.
> > 
> 
> Ah that explains why I've been seeing this :-)
> 
> Can we add a config modifying variable called:
> 
> REBOOT_RETURN_CODE
> 
> that is by default 255 and can be changed by the config file?
> 
> You just need to add in %default:
> 
>   "REBOOT_RETURN_CODE"=> 255,
> 
> my $reboot_return_code;
> 
> 
> and in %option_map:
> 
>   "REBOOT_RETURN_CODE"=> \$reboot_return_code,

Great idea, thanks! I'll add thease.

> 
> 
> > Signed-off-by: Masayoshi Mizuma 
> > ---
> >  tools/testing/ktest/ktest.pl | 9 +
> >  1 file changed, 9 insertions(+)
> > 
> > diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
> > index ea07d43856b8..765c6bc83ab4 100755
> > --- a/tools/testing/ktest/ktest.pl
> > +++ b/tools/testing/ktest/ktest.pl
> > @@ -1737,6 +1737,11 @@ sub run_command {
> >  my $dord = 0;
> >  my $dostdout = 0;
> >  my $pid;
> > +my $is_default_reboot = 0;
> > +
> > +if ($command eq $default{REBOOT}) {
> > +   $is_default_reboot = 1;
> > +}
> 
> Do we really need to add this variable?
> 

> >  
> >  $command =~ s/\$SSH_USER/$ssh_user/g;
> >  $command =~ s/\$MACHINE/$machine/g;

$command is modified here, so...

> > @@ -1791,6 +1796,10 @@ sub run_command {
> >  # shift 8 for real exit status
> >  $run_command_status = $? >> 8;
> >  
> > +if ($run_command_status == 255 && $is_default_reboot) {
> 
> Instead can we have:
> 
>   if ($command eq $default{REBOOT} &&
>   $run_command_status == $reboot_return_code) {
> 
> ?

How about the following?

@@ -1737,6 +1740,7 @@ sub run_command {
 my $dord = 0;
 my $dostdout = 0;
 my $pid;
+my $command_orig = $command;

 $command =~ s/\$SSH_USER/$ssh_user/g;
 $command =~ s/\$MACHINE/$machine/g;
@@ -1791,6 +1795,11 @@ sub run_command {
 # shift 8 for real exit status
 $run_command_status = $? >> 8;

+if ($command_orig eq $default{REBOOT} &&
+   $run_command_status == $reboot_return_code) {
+   $run_command_status = 0;
+}
+
 close(CMD);
 close(LOG) if ($dolog);
 close(RD)  if ($dord);

Thanks!
Masa


Re: [PATCH] ktest: Add support for meta characters in GRUB_MENU

2019-04-17 Thread Masayoshi Mizuma
On Wed, Apr 17, 2019 at 04:17:39PM -0400, Steven Rostedt wrote:
> On Wed, 17 Apr 2019 16:11:14 -0400
> Masayoshi Mizuma  wrote:
> 
> > From: Masayoshi Mizuma 
> > 
> > ktest fails if meta characters are in GRUB_MENU, for example
> > GRUB_MENU = 'Fedora (test)'
> > 
> 
> Thanks for the patch! One little nit below though.
> 
> > The failure happens because the meta characters are not escaped,
> > so the menu doesn't match in any entries in GRUB_FILE.
> > 
> > Use quotemeta() to escape the meta characters.
> > 
> > Signed-off-by: Masayoshi Mizuma 
> > ---
> >  tools/testing/ktest/ktest.pl | 6 --
> >  1 file changed, 4 insertions(+), 2 deletions(-)
> > 
> > diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
> > index 87af8a68ab25..ea07d43856b8 100755
> > --- a/tools/testing/ktest/ktest.pl
> > +++ b/tools/testing/ktest/ktest.pl
> > @@ -1866,9 +1866,10 @@ sub get_grub2_index {
> > or dodie "unable to get $grub_file";
> >  
> >  my $found = 0;
> > +my $_grub_menu = quotemeta($grub_menu);
> 
> I'd like to avoid names that start with underscore. Could you call this:
> 
>   $grub_menu_qt
> 
> or something similar, to be a bit more descriptive of what the variable
> is.

Thank you for your review! I'll fix it and post the v2.

Thanks!
Masa


[PATCH] ktest: Add workaround to avoid unexpected power cycle

2019-04-17 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

An unexpected power cycle occurs during the installation of the
kernel.

That is because the default reboot command, "ssh $SSH_USER@$MACHINE
reboot" exits as 255 even if the reboot is successfully done,
like as:

  ]# ssh root@Test reboot
  Connection to Test closed by remote host.
  ]# echo $?
  255
  ]#

To avoid the unexpected power cycle, the reboot is considered as
successfully done if the reboot is the default command and the
return code is 255.

Signed-off-by: Masayoshi Mizuma 
---
 tools/testing/ktest/ktest.pl | 9 +
 1 file changed, 9 insertions(+)

diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index ea07d43856b8..765c6bc83ab4 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -1737,6 +1737,11 @@ sub run_command {
 my $dord = 0;
 my $dostdout = 0;
 my $pid;
+my $is_default_reboot = 0;
+
+if ($command eq $default{REBOOT}) {
+   $is_default_reboot = 1;
+}
 
 $command =~ s/\$SSH_USER/$ssh_user/g;
 $command =~ s/\$MACHINE/$machine/g;
@@ -1791,6 +1796,10 @@ sub run_command {
 # shift 8 for real exit status
 $run_command_status = $? >> 8;
 
+if ($run_command_status == 255 && $is_default_reboot) {
+   $run_command_status = 0;
+}
+
 close(CMD);
 close(LOG) if ($dolog);
 close(RD)  if ($dord);
-- 
2.20.1



[PATCH] ktest: Add support for meta characters in GRUB_MENU

2019-04-17 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

ktest fails if meta characters are in GRUB_MENU, for example
GRUB_MENU = 'Fedora (test)'

The failure happens because the meta characters are not escaped,
so the menu doesn't match in any entries in GRUB_FILE.

Use quotemeta() to escape the meta characters.

Signed-off-by: Masayoshi Mizuma 
---
 tools/testing/ktest/ktest.pl | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/testing/ktest/ktest.pl b/tools/testing/ktest/ktest.pl
index 87af8a68ab25..ea07d43856b8 100755
--- a/tools/testing/ktest/ktest.pl
+++ b/tools/testing/ktest/ktest.pl
@@ -1866,9 +1866,10 @@ sub get_grub2_index {
or dodie "unable to get $grub_file";
 
 my $found = 0;
+my $_grub_menu = quotemeta($grub_menu);
 
 while () {
-   if (/^menuentry.*$grub_menu/) {
+   if (/^menuentry.*$_grub_menu/) {
$grub_number++;
$found = 1;
last;
@@ -1909,9 +1910,10 @@ sub get_grub_index {
or dodie "unable to get menu.lst";
 
 my $found = 0;
+my $_grub_menu = quotemeta($grub_menu);
 
 while () {
-   if (/^\s*title\s+$grub_menu\s*$/) {
+   if (/^\s*title\s+$_grub_menu\s*$/) {
$grub_number++;
$found = 1;
last;
-- 
2.20.1



Re: [PATCH v2] x86/mm: Adjust the padding size for KASLR

2019-02-14 Thread Masayoshi Mizuma
Hi Baoquan,

Thank you for your review.

On Thu, Feb 14, 2019 at 06:12:36PM +0800, Baoquan He wrote:
> Hi Masa,
> 
> On 02/11/19 at 08:31pm, Masayoshi Mizuma wrote:
> > From: Masayoshi Mizuma 
> > 
> > The system sometimes crashes while memory hot-adding on KASLR
> > enabled system. The crash happens because the regions pointed by
> > kaslr_regions[].base are overwritten by the hot-added memory.
> > 
> > It happens because of the padding size for kaslr_regions[].base isn't
> > enough for the system whose physical memory layout has huge space for
> > memory hotplug. kaslr_regions[].base points "actual installed
> > memory size + padding" or higher address. So, if the "actual + padding"
> > is lower address than the maximum memory address, which means the memory
> > address reachable by memory hot-add, kaslr_regions[].base is destroyed by
> > the overwritten.
> > 
> >   address
> > ^
> > |--- maximum memory address (Hotplug)
> > |^
> > |--- kaslr_regions[0].base   | Hotadd-able region
> > | ^  |
> > | | padding  |
> > | V  V
> > |--- actual memory address (Installed on boot)
> > |
> > 
> > Fix it by getting the maximum memory address from SRAT and store
> > the value in boot_param, then set the padding size while kaslr
> > initializing if the default padding size isn't enough.
> 
> Thanks for the effort on fixing this KASLR conflict issue.
> I roughly go through this patch, seems three parts are contained:
>  
> 1) Wrap up the SRAT travesing code into subtable_parse();
> 2) Add a field max_addr in struct boot_params, and get the max address
>from SRAT and write it into boot_params->max_addr;
> 3) Add kaslr_padding() to adjust the padding size for the direct
> mapping. 
> 
> So could you split them into three patches for better reviewing?

Yes, I will split into the three.

> 
> Another thing is for the 3rd part, I also queued several patches in my
> local branch, they are code bug fix patches, and several clean up
> patches suggested by Ingo and Kirill. They can be found here:
> 
> https://github.com/baoquan-he/linux/commits/kaslar-mm-bug-fix
> 
> In my local patches, Ingo suggested opening code get_padding(), and
> about the SGI UV bug, he suggested adding another function to calculate
> the needed size for the direct mapping region. So I am wondering if you
> can rebase the part 3 on top of it, or you add a new function to
> calculate the size for the direct mapping region so that I can rebase on
> top of your patch and reuse it.
> 
> What do you think about it?

OK, I will rebase my patches on top of your patch.
Could you add CCing me when you post your patches?

Thanks!
Masa


[PATCH v2] x86/mm: Adjust the padding size for KASLR

2019-02-11 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 

The system sometimes crashes while memory hot-adding on KASLR
enabled system. The crash happens because the regions pointed by
kaslr_regions[].base are overwritten by the hot-added memory.

It happens because the padding size for kaslr_regions[].base isn't
enough for a system whose physical memory layout has huge space for
memory hotplug. kaslr_regions[].base points to "actual installed
memory size + padding" or a higher address. So, if "actual + padding"
is a lower address than the maximum memory address, which means the memory
address reachable by memory hot-add, kaslr_regions[].base is destroyed by
the overwrite.

  address
^
|--- maximum memory address (Hotplug)
|^
|--- kaslr_regions[0].base   | Hotadd-able region
| ^  |
| | padding  |
| V  V
|--- actual memory address (Installed on boot)
|

Fix it by getting the maximum memory address from SRAT and store
the value in boot_param, then set the padding size while kaslr
initializing if the default padding size isn't enough.

Signed-off-by: Masayoshi Mizuma 
---
 Documentation/x86/zero-page.txt   |  4 
 arch/x86/boot/compressed/acpi.c   | 30 ---
 arch/x86/include/uapi/asm/bootparam.h |  2 +-
 arch/x86/mm/kaslr.c   | 29 +-
 4 files changed, 56 insertions(+), 9 deletions(-)

diff --git a/Documentation/x86/zero-page.txt b/Documentation/x86/zero-page.txt
index 68aed077f..6c107816c 100644
--- a/Documentation/x86/zero-page.txt
+++ b/Documentation/x86/zero-page.txt
@@ -15,6 +15,7 @@ OffsetProto   NameMeaning
 058/008ALL tboot_addr  Physical address of tboot shared page
 060/010ALL ist_infoIntel SpeedStep (IST) BIOS support 
information
(struct ist_info)
+078/010ALL max_addrThe possible maximum physical memory 
address[*].
 080/010ALL hd0_infohd0 disk parameter, OBSOLETE!!
 090/010ALL hd1_infohd1 disk parameter, OBSOLETE!!
 0A0/010ALL sys_desc_table  System description table (struct 
sys_desc_table),
@@ -38,3 +39,6 @@ OffsetProto   NameMeaning
 2D0/A00ALL e820_table  E820 memory map table
(array of struct e820_entry)
 D00/1ECALL eddbuf  EDD data (array of struct edd_info)
+
+[*]: max_addr shows the maximum memory address to be reachable by memory
+ hot-add. max_addr is set by parsing ACPI SRAT.
diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
index c5a949335..3247c7153 100644
--- a/arch/x86/boot/compressed/acpi.c
+++ b/arch/x86/boot/compressed/acpi.c
@@ -272,6 +272,26 @@ static unsigned long get_acpi_srat_table(void)
return 0;
 }
 
+static void subtable_parse(struct acpi_subtable_header *sub_table, int *num,
+   unsigned long *max_addr)
+{
+   struct acpi_srat_mem_affinity *ma;
+   unsigned long addr;
+
+   ma = (struct acpi_srat_mem_affinity *)sub_table;
+   if (ma->length) {
+   if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
+   addr = ma->base_address + ma->length;
+   if (addr > *max_addr)
+   *max_addr = addr;
+   } else {
+   immovable_mem[*num].start = ma->base_address;
+   immovable_mem[*num].size = ma->length;
+   (*num)++;
+   }
+   }
+}
+
 /**
  * count_immovable_mem_regions - Parse SRAT and cache the immovable
  * memory regions into the immovable_mem array.
@@ -288,6 +308,7 @@ int count_immovable_mem_regions(void)
struct acpi_subtable_header *sub_table;
struct acpi_table_header *table_header;
char arg[MAX_ACPI_ARG_LENGTH];
+   unsigned long max_addr = 0;
int num = 0;
 
if (cmdline_find_option("acpi", arg, sizeof(arg)) == 3 &&
@@ -305,14 +326,8 @@ int count_immovable_mem_regions(void)
while (table + sizeof(struct acpi_subtable_header) < table_end) {
sub_table = (struct acpi_subtable_header *)table;
if (sub_table->type == ACPI_SRAT_TYPE_MEMORY_AFFINITY) {
-   struct acpi_srat_mem_affinity *ma;
 
-   ma = (struct acpi_srat_mem_affinity *)sub_table;
-   if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && 
ma->length) {
-   immovable_mem[num].start = ma->base_address;
-   immovable_mem[num].size = ma->length;
-   num++;
-   }
+ 

Re: [PATCH v8 0/3] x86/boot/KASLR: Parse ACPI table and limit kaslr in immovable memory

2019-02-11 Thread Masayoshi Mizuma
Hi Chao,

Thank you for your review.

On Mon, Feb 11, 2019 at 09:46:05AM +0800, Chao Fan wrote:
> On Tue, Feb 05, 2019 at 10:05:16AM -0500, Masayoshi Mizuma wrote:
> [...]
> 
> Hi Masa,
> 
> Sorry for delay, since last days were Chinese holiday.
> 
> >diff --git a/arch/x86/boot/compressed/acpi.c 
> >b/arch/x86/boot/compressed/acpi.c
> >index c5a949335..7dd61b943 100644
> >--- a/arch/x86/boot/compressed/acpi.c
> >+++ b/arch/x86/boot/compressed/acpi.c
> >@@ -288,6 +288,7 @@ int count_immovable_mem_regions(void)
> > struct acpi_subtable_header *sub_table;
> > struct acpi_table_header *table_header;
> > char arg[MAX_ACPI_ARG_LENGTH];
> >+unsigned long long possible_addr, max_possible_addr = 0;
> 
> This line is so long that it should be added in first line.

Thanks. I will simplify around the local variables.

> 
> > int num = 0;
> > 
> > if (cmdline_find_option("acpi", arg, sizeof(arg)) == 3 &&
> >@@ -308,10 +309,19 @@ int count_immovable_mem_regions(void)
> > struct acpi_srat_mem_affinity *ma;
> > 
> > ma = (struct acpi_srat_mem_affinity *)sub_table;
> >-if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && 
> >ma->length) {
> >-immovable_mem[num].start = ma->base_address;
> >-immovable_mem[num].size = ma->length;
> >-num++;
> >+if (ma->length) {
> >+if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
> >+possible_addr =
> >+ma->base_address + ma->length;
> >+if (possible_addr > max_possible_addr)
> >+max_possible_addr =
> >+possible_addr;
> >+} else {
> >+immovable_mem[num].start =
> >+ma->base_address;
> >+immovable_mem[num].size = ma->length;
> >+num++;
> >+}
> > }
> 
> It looks better in another mail where you add a new function.

Thanks!

- Masa


Re: [PATCH v8 0/3] x86/boot/KASLR: Parse ACPI table and limit kaslr in immovable memory

2019-02-08 Thread Masayoshi Mizuma
Hi Boris,

Thank you for your review.

On Fri, Feb 08, 2019 at 07:26:00PM +0100, Borislav Petkov wrote:
> On Tue, Feb 05, 2019 at 10:05:16AM -0500, Masayoshi Mizuma wrote:
> > From: Masayoshi Mizuma 
> > Date: Tue, 5 Feb 2019 10:00:59 -0500
> > Subject: [PATCH] x86/mm: Introduce adjustment the padding size for KASLR
> 
>   "Adjust the padding size for KASLR"
> 
> > If the physical memory layout has huge space for hotplug, the padding
> > used for the physical memory mapping section is not enough.
> > So, such system may crash while memory hot-adding on KASLR enabled system.
> 
> Crash why?
> 
> Why is the padding not enough?
> 
> > For example, SRAT has the following layout, the maximum possible memory
> > size is 32TB, and the memory is installed as 2TB actually, then the padding
> > size should set 30TB (== possible memory size - actual memory size).
> > 
> >   SRAT: Node 3 PXM 7 [mem 0x1c00-0x1fff] hotplug
> 
> What is that supposed to exemplify: that range is 3T not 2 and that
> range start is not at 2T but 28T. So I have absolutely no clue what
> you're trying to explain here.
> 
> Please go back, take your time and structure your commit message like
> this:
> 
> Problem is A.
> 
> It happens because of B.
> 
> Fix it by doing C.
> 
> (Potentially do D).
> 
> For more detailed info, see
> Documentation/process/submitting-patches.rst, Section "2) Describe your
> changes".

Got it, thanks.

> 
> > This patch introduces adjustment the padding size if the default
> 
> Avoid having "This patch" or "This commit" in the commit message. It is
> tautologically useless.
> 
> Also, do
> 
> $ git grep 'This patch' Documentation/process
> 
> for more details.

Thanks. I see.

> 
> > padding size isn't enough.
> > 
> > Signed-off-by: Masayoshi Mizuma 
> > ---
> >  Documentation/x86/zero-page.txt   |  1 +
> >  arch/x86/boot/compressed/acpi.c   | 19 +++
> >  arch/x86/include/uapi/asm/bootparam.h |  2 +-
> >  arch/x86/mm/kaslr.c   | 26 +-
> >  4 files changed, 42 insertions(+), 6 deletions(-)
> > 
> > diff --git a/Documentation/x86/zero-page.txt 
> > b/Documentation/x86/zero-page.txt
> > index 68aed077f..343fe1a90 100644
> > --- a/Documentation/x86/zero-page.txt
> > +++ b/Documentation/x86/zero-page.txt
> > @@ -15,6 +15,7 @@ OffsetProto   NameMeaning
> >  058/008ALL tboot_addr  Physical address of tboot shared page
> >  060/010ALL ist_infoIntel SpeedStep (IST) BIOS support 
> > information
> > (struct ist_info)
> > +078/010ALL possible_mem_addr The possible maximum physical memory 
> > address.
> 
> Why isn't this called max_phys_addr then?
> 
> Also, please explain what it means at the end of this file.
> 
> >  080/010ALL hd0_infohd0 disk parameter, OBSOLETE!!
> >  090/010ALL hd1_infohd1 disk parameter, OBSOLETE!!
> >  0A0/010ALL sys_desc_table  System description table (struct 
> > sys_desc_table),
> > diff --git a/arch/x86/boot/compressed/acpi.c 
> > b/arch/x86/boot/compressed/acpi.c
> > index c5a949335..7dd61b943 100644
> > --- a/arch/x86/boot/compressed/acpi.c
> > +++ b/arch/x86/boot/compressed/acpi.c
> > @@ -288,6 +288,7 @@ int count_immovable_mem_regions(void)
> > struct acpi_subtable_header *sub_table;
> > struct acpi_table_header *table_header;
> > char arg[MAX_ACPI_ARG_LENGTH];
> > +   unsigned long long possible_addr, max_possible_addr = 0;
> > int num = 0;
> >  
> > if (cmdline_find_option("acpi", arg, sizeof(arg)) == 3 &&
> > @@ -308,10 +309,19 @@ int count_immovable_mem_regions(void)
> > struct acpi_srat_mem_affinity *ma;
> >  
> > ma = (struct acpi_srat_mem_affinity *)sub_table;
> > -   if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && 
> > ma->length) {
> > -   immovable_mem[num].start = ma->base_address;
> > -   immovable_mem[num].size = ma->length;
> > -   num++;
> > +   if (ma->length) {
> > +   if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
> > +   possible_addr =
> > +   ma->base_address + ma->length;
> > +   

Re: [PATCH v8 0/3] x86/boot/KASLR: Parse ACPI table and limit kaslr in immovable memory

2019-02-05 Thread Masayoshi Mizuma
Hi Boris and all,

On Sun, Nov 11, 2018 at 08:45:57AM -0500, Masayoshi Mizuma wrote:
> On Sat, Nov 10, 2018 at 11:54:22AM +0100, Borislav Petkov wrote:
> > On Thu, Nov 08, 2018 at 11:51:29AM +0100, Borislav Petkov wrote:
> > > A global definition which doesn't need allocation?
> > > 
> > > Maybe hpa would have another, better idea...
> > 
> > ...and he has: just put that address in a new field in struct
> > boot_params by converting one of the padding arrays there.
> > 
> > Don't forget to document it in Documentation/x86/zero-page.txt
> > 
> > This way you don't need any of the allocation fun or to use setup_data
> > at all.
> 
> Thanks!
> I have the prototype patch to use boot_params [1].
> I will try to brush up it.
> 
> [1] https://lore.kernel.org/lkml/20181016151353.punyk7exekut2543@gabell

Chao's patches are included in the tip tree, so I modified the patch.

Could you review the following patch?

From: Masayoshi Mizuma 
Date: Tue, 5 Feb 2019 10:00:59 -0500
Subject: [PATCH] x86/mm: Introduce adjustment the padding size for KASLR

If the physical memory layout has huge space for hotplug, the padding
used for the physical memory mapping section is not enough.
So, such a system may crash during memory hot-add on a KASLR-enabled system.

For example, SRAT has the following layout, the maximum possible memory
size is 32TB, and the memory is installed as 2TB actually, then the padding
size should set 30TB (== possible memory size - actual memory size).

  SRAT: Node 3 PXM 7 [mem 0x1c00-0x1fff] hotplug

This patch introduces an adjustment of the padding size if the default
padding size isn't enough.

Signed-off-by: Masayoshi Mizuma 
---
 Documentation/x86/zero-page.txt   |  1 +
 arch/x86/boot/compressed/acpi.c   | 19 +++
 arch/x86/include/uapi/asm/bootparam.h |  2 +-
 arch/x86/mm/kaslr.c   | 26 +-
 4 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/Documentation/x86/zero-page.txt b/Documentation/x86/zero-page.txt
index 68aed077f..343fe1a90 100644
--- a/Documentation/x86/zero-page.txt
+++ b/Documentation/x86/zero-page.txt
@@ -15,6 +15,7 @@ OffsetProto   NameMeaning
 058/008ALL tboot_addr  Physical address of tboot shared page
 060/010ALL ist_infoIntel SpeedStep (IST) BIOS support 
information
(struct ist_info)
+078/010ALL possible_mem_addr The possible maximum physical memory 
address.
 080/010ALL hd0_infohd0 disk parameter, OBSOLETE!!
 090/010ALL hd1_infohd1 disk parameter, OBSOLETE!!
 0A0/010ALL sys_desc_table  System description table (struct 
sys_desc_table),
diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
index c5a949335..7dd61b943 100644
--- a/arch/x86/boot/compressed/acpi.c
+++ b/arch/x86/boot/compressed/acpi.c
@@ -288,6 +288,7 @@ int count_immovable_mem_regions(void)
struct acpi_subtable_header *sub_table;
struct acpi_table_header *table_header;
char arg[MAX_ACPI_ARG_LENGTH];
+   unsigned long long possible_addr, max_possible_addr = 0;
int num = 0;
 
if (cmdline_find_option("acpi", arg, sizeof(arg)) == 3 &&
@@ -308,10 +309,19 @@ int count_immovable_mem_regions(void)
struct acpi_srat_mem_affinity *ma;
 
ma = (struct acpi_srat_mem_affinity *)sub_table;
-   if (!(ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && 
ma->length) {
-   immovable_mem[num].start = ma->base_address;
-   immovable_mem[num].size = ma->length;
-   num++;
+   if (ma->length) {
+   if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
+   possible_addr =
+   ma->base_address + ma->length;
+   if (possible_addr > max_possible_addr)
+   max_possible_addr =
+   possible_addr;
+   } else {
+   immovable_mem[num].start =
+   ma->base_address;
+   immovable_mem[num].size = ma->length;
+   num++;
+   }
}
 
if (num >= MAX_NUMNODES*2) {
@@ -320,6 +330,7 @@ int count_immovable_mem_regions(void)
}
}
table += sub_table->length;
+   boot_params->p

Re: [PATCH] sched/debug: initialize sd_sysctl_cpus if !CONFIG_CPUMASK_OFFSTACK

2019-01-31 Thread Masayoshi Mizuma
On Wed, Jan 30, 2019 at 09:14:00PM +0100, Peter Zijlstra wrote:
> On Tue, Jan 29, 2019 at 10:12:45AM -0500, Masayoshi Mizuma wrote:
> > From: Hidetoshi Seto 
> > 
> > register_sched_domain_sysctl() copies the cpu_possible_mask into
> > sd_sysctl_cpus, but only if sd_sysctl_cpus hasn't already been
> > allocated (ie, CONFIG_CPUMASK_OFFSTACK is set).  However, when
> > CONFIG_CPUMASK_OFFSTACK is not set, sd_sysctl_cpus is left uninitialized
> > (all zeroes) and the kernel may fail to initialize sched_domain sysctl
> > entries for all possible cpus.
> > 
> > This is visible to the user if the kernel is booted with maxcpus=n, or
> > if ACPI tables have been modified to leave cpus offline, and then
> > checking for missing /proc/sys/kernel/sched_domain/cpu* entries.
> > 
> > Fix this by separating the allocataion and initialization, and adding
> > a flag to initialize the possible cpu entries while system booting only.
> > 
> > Signed-off-by: Hidetoshi Seto 
> > Reviewed-by: Masayoshi Mizuma 
> > Tested-by: Syuuichirou Ishii 
> > Tested-by: Tarumizu, Kohei 
> > ---
> >  kernel/sched/debug.c | 4 
> >  1 file changed, 4 insertions(+)
> > 
> > diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> > index de3de997e245..9c6637f3e21d 100644
> > --- a/kernel/sched/debug.c
> > +++ b/kernel/sched/debug.c
> > @@ -310,6 +310,7 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
> >  
> >  static cpumask_var_t   sd_sysctl_cpus;
> >  static struct ctl_table_header *sd_sysctl_header;
> > +static int register_sched_domain_sysctl_on_boot = 1;
> >  
> >  void register_sched_domain_sysctl(void)
> >  {
> > @@ -344,9 +345,12 @@ void register_sched_domain_sysctl(void)
> > if (!cpumask_available(sd_sysctl_cpus)) {
> > if (!alloc_cpumask_var(_sysctl_cpus, GFP_KERNEL))
> > return;
> > +   }
> >  
> > +   if (register_sched_domain_sysctl_on_boot) {
> > /* init to possible to not have holes in @cpu_entries */
> > cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
> > +   register_sched_domain_sysctl_on_boot = 0;
> > }
> >  
> > for_each_cpu(i, sd_sysctl_cpus) {
> 
> I change it like the below. By keeping the initial value 0 it can go
> into .bss instead of .data.

Great, thanks!
Should I re-post the patch as v2?

- Masa

> 
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -315,6 +315,7 @@ void register_sched_domain_sysctl(void)
>  {
>   static struct ctl_table *cpu_entries;
>   static struct ctl_table **cpu_idx;
> + static bool init_done = false;
>   char buf[32];
>   int i;
>  
> @@ -344,7 +345,10 @@ void register_sched_domain_sysctl(void)
>   if (!cpumask_available(sd_sysctl_cpus)) {
>   if (!alloc_cpumask_var(_sysctl_cpus, GFP_KERNEL))
>   return;
> + }
>  
> + if (!init_done) {
> + init_done = true;
>   /* init to possible to not have holes in @cpu_entries */
>   cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
>   }


[PATCH] sched/debug: initialize sd_sysctl_cpus if !CONFIG_CPUMASK_OFFSTACK

2019-01-29 Thread Masayoshi Mizuma
From: Hidetoshi Seto 

register_sched_domain_sysctl() copies the cpu_possible_mask into
sd_sysctl_cpus, but only if sd_sysctl_cpus hasn't already been
allocated (ie, CONFIG_CPUMASK_OFFSTACK is set).  However, when
CONFIG_CPUMASK_OFFSTACK is not set, sd_sysctl_cpus is left uninitialized
(all zeroes) and the kernel may fail to initialize sched_domain sysctl
entries for all possible cpus.

This is visible to the user if the kernel is booted with maxcpus=n, or
if ACPI tables have been modified to leave cpus offline, and then
checking for missing /proc/sys/kernel/sched_domain/cpu* entries.

Fix this by separating the allocation and initialization, and adding
a flag to initialize the possible cpu entries while system booting only.

Signed-off-by: Hidetoshi Seto 
Reviewed-by: Masayoshi Mizuma 
Tested-by: Syuuichirou Ishii 
Tested-by: Tarumizu, Kohei 
---
 kernel/sched/debug.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index de3de997e245..9c6637f3e21d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -310,6 +310,7 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
 
 static cpumask_var_t   sd_sysctl_cpus;
 static struct ctl_table_header *sd_sysctl_header;
+static int register_sched_domain_sysctl_on_boot = 1;
 
 void register_sched_domain_sysctl(void)
 {
@@ -344,9 +345,12 @@ void register_sched_domain_sysctl(void)
if (!cpumask_available(sd_sysctl_cpus)) {
               if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
return;
+   }
 
+   if (register_sched_domain_sysctl_on_boot) {
/* init to possible to not have holes in @cpu_entries */
cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
+   register_sched_domain_sysctl_on_boot = 0;
}
 
for_each_cpu(i, sd_sysctl_cpus) {
-- 
2.20.1



Re: [PATCH v13 3/6] x86/boot: Introduce efi_get_rsdp_addr() to find RSDP from EFI table

2018-12-17 Thread Masayoshi Mizuma
On Wed, Dec 12, 2018 at 11:10:50AM +0800, Chao Fan wrote:
> Memory information in SRAT is necessary to fix the conflict between
> KASLR and memory-hotremove. So RSDP and SRAT should be parsed.
> 
> When booting form KEXEC/EFI/BIOS, the methods to compute RSDP
> are different. When booting from EFI, EFI table points to RSDP.
> So parse the EFI table and find the RSDP.
> 
> Signed-off-by: Chao Fan 
> ---
>  arch/x86/boot/compressed/acpi.c | 79 +
>  1 file changed, 79 insertions(+)
> 
> diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
> index cad15686f82c..c96008712ec9 100644
> --- a/arch/x86/boot/compressed/acpi.c
> +++ b/arch/x86/boot/compressed/acpi.c
> @@ -28,3 +28,82 @@ static acpi_physical_address get_acpi_rsdp(void)
>   return 0;
>  #endif
>  }
> +
> +/* Search EFI table for RSDP. */
> +static acpi_physical_address efi_get_rsdp_addr(void)
> +{
> +#ifdef CONFIG_EFI
> + acpi_physical_address rsdp_addr = 0;
> + efi_system_table_t *systab;
> + struct efi_info *e;
> + bool efi_64;
> + char *sig;
> + int size;
> + int i;
> +
> + e = _params->efi_info;
> + sig = (char *)>efi_loader_signature;
> +
> + if (!strncmp(sig, EFI64_LOADER_SIGNATURE, 4))
> + efi_64 = true;
> + else if (!strncmp(sig, EFI32_LOADER_SIGNATURE, 4))
> + efi_64 = false;
> + else {
> + debug_putstr("Wrong EFI loader signature.\n");
> + return 0;
> + }
> +
> + /* Get systab from boot params. Based on efi_init(). */
> +#ifdef CONFIG_X86_64
> + systab = (efi_system_table_t *)(e->efi_systab | 
> ((__u64)e->efi_systab_hi<<32));
> +#else
> + if (e->efi_systab_hi || e->efi_memmap_hi) {
> + debug_putstr("Error getting RSDP address: EFI system table 
> located above 4GB.\n");
> + return 0;
> + }
> + systab = (efi_system_table_t *)e->efi_systab;
> +#endif
> +
> + if (!systab)
> + return 0;
> +
> + /*
> +  * Get EFI tables from systab. Based on efi_config_init() and
> +  * efi_config_parse_tables().
> +  */
> + size = efi_64 ? sizeof(efi_config_table_64_t) :
> + sizeof(efi_config_table_32_t);
> +
> + for (i = 0; i < systab->nr_tables; i++) {
> + void *config_tables;

> + unsigned long table;

u64 table;

Otherwise, the following build warning happen in ARCH=i386.
===
arch/x86/boot/compressed/acpi.c:96:44: warning: right shift count >= width of 
type [-Wshift-count-overflow]
if (!IS_ENABLED(CONFIG_X86_64) && table >> 32) {
===

Thanks,
Masa


Re: [PATCH v13 2/6] x86/boot: Introduce get_acpi_rsdp() to parse RSDP in cmdline from KEXEC

2018-12-13 Thread Masayoshi Mizuma
On Thu, Dec 13, 2018 at 08:29:50PM +0100, Borislav Petkov wrote:
> On Thu, Dec 13, 2018 at 02:25:30PM -0500, Masayoshi Mizuma wrote:
> > > + len = cmdline_find_option("acpi_rsdp", val, MAX_ADDRESS_LENGTH+1);
> > 
> > sizeof() is better here.
> > 
> > len = cmdline_find_option("acpi_rsdp", val, sizeof(var));
> 
> Why is it better?
> 
> That makes you go look for the "val" variable and see what it's size is.
> MAX_ADDRESS_LENGTH+1 is OTOH explicit.

Ah, thanks, that makes sense.

- Masa


Re: [PATCH v13 2/6] x86/boot: Introduce get_acpi_rsdp() to parse RSDP in cmdline from KEXEC

2018-12-13 Thread Masayoshi Mizuma
Hi Chao,

Great work! Let me say some trivial comments.

On Wed, Dec 12, 2018 at 11:10:49AM +0800, Chao Fan wrote:
> Memory information in SRAT is necessary to fix the conflict between
> KASLR and memory-hotremove.
> 
> ACPI SRAT (System/Static Resource Affinity Table) shows the details
> about memory ranges, including ranges of memory provided by hot-added
> memory devices. SRAT is introduced by Root System Description
> Pointer(RSDP). So RSDP should be found firstly.
> 
> When booting form KEXEC/EFI/BIOS, the methods to find RSDP
> are different. When booting from KEXEC, 'acpi_rsdp' may have been
> added to cmdline, so parse cmdline to find RSDP.
> 
> Since 'RANDOMIZE_BASE' && 'MEMORY_HOTREMOVE' is needed, introduce
> 'CONFIG_EARLY_PARSE_RSDP' to make ifdeffery clear.
> 
> Signed-off-by: Chao Fan 
> ---
>  arch/x86/Kconfig| 10 ++
>  arch/x86/boot/compressed/acpi.c | 30 ++
>  arch/x86/boot/compressed/misc.h |  6 ++
>  3 files changed, 46 insertions(+)
>  create mode 100644 arch/x86/boot/compressed/acpi.c
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index ba7e3464ee92..455da382fa9e 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -2149,6 +2149,16 @@ config X86_NEED_RELOCS
>   def_bool y
>   depends on RANDOMIZE_BASE || (X86_32 && RELOCATABLE)
>  
> +config EARLY_PARSE_RSDP
> + bool "Parse RSDP pointer on compressed period for KASLR"
> + def_bool y
> + depends on RANDOMIZE_BASE && MEMORY_HOTREMOVE
> + help
> +   This option parses RSDP in compressed period. Works
> +   for KASLR to get memory information from SRAT table and choose
> +   immovable memory to extract kernel.
> +   Say Y if you want to use both KASLR and memory-hotremove.
> +
>  config PHYSICAL_ALIGN
>   hex "Alignment value to which kernel should be aligned"
>   default "0x20"
> diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
> new file mode 100644
> index ..cad15686f82c
> --- /dev/null
> +++ b/arch/x86/boot/compressed/acpi.c
> @@ -0,0 +1,30 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define BOOT_CTYPE_H
> +#include "misc.h"
> +#include "error.h"
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#define STATIC
> +#include 
> +
> +#include "../string.h"
> +
> +static acpi_physical_address get_acpi_rsdp(void)
> +{
> +#ifdef CONFIG_KEXEC
> + unsigned long long res;
> + int len = 0;
> + char val[MAX_ADDRESS_LENGTH+1];
> +

> + len = cmdline_find_option("acpi_rsdp", val, MAX_ADDRESS_LENGTH+1);

sizeof() is better here.

len = cmdline_find_option("acpi_rsdp", val, sizeof(var));

> + if (len > 0) {
> + val[len] = 0;

'\0' should be fine here not 0.

val[len] = '\0';

> + return (acpi_physical_address)kstrtoull(val, 16, );
> + }
> + return 0;
> +#endif
> +}
> diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
> index a1d5918765f3..72fcfbfec3c6 100644
> --- a/arch/x86/boot/compressed/misc.h
> +++ b/arch/x86/boot/compressed/misc.h
> @@ -116,3 +116,9 @@ static inline void console_init(void)
>  void set_sev_encryption_mask(void);
>  
>  #endif
> +
> +/* acpi.c */
> +#ifdef CONFIG_EARLY_PARSE_RSDP
> +/* Max length of 64-bit hex address string is 18, prefix "0x" + 16 hex 
> digit. */
> +#define MAX_ADDRESS_LENGTH 18
> +#endif
> -- 
> 2.19.2
> 
> 


Re: [PATCH v12 1/5] x86/boot: Add get_acpi_rsdp() to parse RSDP in cmdline from KEXEC

2018-11-30 Thread Masayoshi Mizuma
On Fri, Nov 30, 2018 at 10:43:47AM +0800, Chao Fan wrote:
...
> >]$ make arch/x86/boot/compressed/misc.o
> >  CALLscripts/checksyscalls.sh
> >  DESCEND  objtool
> >  CC  arch/x86/boot/compressed/misc.o
> >ld: -r and -pie may not be used together
> >make[1]: *** [scripts/Makefile.build:294: arch/x86/boot/compressed/misc.o] 
> >Error 1
> >make: *** [Makefile:1715: arch/x86/boot/compressed/misc.o] Error 2
> >]$
> 
> Hi Masa,
> 
> So many thanks for your test.
> 
> Could you give me more details about this error? More error message.
> Just on the first commit or the whole PATCHSET?
> Cause I didn't get error both on this commit and on the whole PATCHSET.

I built your whole patchset and got the error.
The error depends on CONFIG_MODVERSIONS.
If CONFIG_MODVERSIONS=y, you will get the build error.

Thanks,
Masa


Re: [PATCH v12 1/5] x86/boot: Add get_acpi_rsdp() to parse RSDP in cmdline from KEXEC

2018-11-30 Thread Masayoshi Mizuma
On Fri, Nov 30, 2018 at 10:43:47AM +0800, Chao Fan wrote:
...
> >]$ make arch/x86/boot/compressed/misc.o
> >  CALLscripts/checksyscalls.sh
> >  DESCEND  objtool
> >  CC  arch/x86/boot/compressed/misc.o
> >ld: -r and -pie may not be used together
> >make[1]: *** [scripts/Makefile.build:294: arch/x86/boot/compressed/misc.o] 
> >Error 1
> >make: *** [Makefile:1715: arch/x86/boot/compressed/misc.o] Error 2
> >]$
> 
> Hi Masa,
> 
> So many thanks for your test.
> 
> Could you give me more details about this error? More error message.
> Just on the first commit or the whole PATCHSET?
> Cause I didn't get error both on this commit and on the whole PATCHSET.

I built your whole patchset and got the error.
The error depends on CONFIG_MODVERSIONS.
If CONFIG_MODVERSIONS=y, you will get the build error.

Thanks,
Masa


Re: [PATCH v12 4/5] x86/boot: Parse SRAT table from RSDP and store immovable memory

2018-11-30 Thread Masayoshi Mizuma
On Fri, Nov 30, 2018 at 09:24:54AM +0800, Chao Fan wrote:
> On Thu, Nov 29, 2018 at 12:55:21PM -0500, Masayoshi Mizuma wrote:
> >On Thu, Nov 29, 2018 at 04:16:30PM +0800, Chao Fan wrote:
> >> To fix the conflict between KASLR and memory-hotremove, SRAT table
> >> should be parsed by RSDP pointer, then find the immovable
> >> memory regions and store them in an array called immovable_mem[].
> >> The array called immovable_mem[] will extern to KASLR, then
> >> KASLR will avoid to extract kernel to these regions.
> >> 
> >> Add 'CONFIG_EARLY_PARSE_RSDP' which depends on RANDOMIZE_BASE &&
> >> MEMORY_HOTREMOVE, cause only when both KASLR and memory-hotremove
> >> are enabled, RSDP needs to be parsed in compressed period.
> >> 
> >> Signed-off-by: Chao Fan 
> >> ---
> >>  arch/x86/Kconfig  |  10 +++
> >>  arch/x86/boot/compressed/Makefile |   2 +
> >>  arch/x86/boot/compressed/acpitb.c | 125 ++
> >>  arch/x86/boot/compressed/kaslr.c  |   4 -
> >>  arch/x86/boot/compressed/misc.h   |  20 +
> >>  5 files changed, 157 insertions(+), 4 deletions(-)
> >> 
> >> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> >> index a29d49ef4d56..bc775968557b 100644
> >> --- a/arch/x86/Kconfig
> >> +++ b/arch/x86/Kconfig
> >> @@ -2146,6 +2146,16 @@ config X86_NEED_RELOCS
> >>def_bool y
> >>depends on RANDOMIZE_BASE || (X86_32 && RELOCATABLE)
> >>  
> >
> >> +config CONFIG_EARLY_PARSE_RSDP
> >
> >config EARLY_PARSE_RSDP
> >
> >> +  bool "Parse RSDP pointer on compressed period for KASLR"
> >> +  def_bool n
> >
> >Should be def_bool y?
> 
> I will change it to y.
> 
> >It is better to enable EARLY_PARSE_RSDP by default if 
> >RANDOMIZE_BASE and MEMORY_HOTREMOVE are enabled.
> >
> >> +  depends on RANDOMIZE_BASE && MEMORY_HOTREMOVE
> >> +  help
> >> +This option parses RSDP pointer in compressed period. Works
> >> +for KASLR to get memory information by SRAT table and choose
> >> +immovable memory to extract kernel.
> >> +Say Y if you want to use both KASLR and memory-hotremove.
> >> +
> >>  config PHYSICAL_ALIGN
> >>hex "Alignment value to which kernel should be aligned"
> >>default "0x20"
> >> diff --git a/arch/x86/boot/compressed/Makefile 
> >> b/arch/x86/boot/compressed/Makefile
> >> index 466f66c8a7f8..4cbfb58bf083 100644
> >> --- a/arch/x86/boot/compressed/Makefile
> >> +++ b/arch/x86/boot/compressed/Makefile
> >> @@ -84,6 +84,8 @@ ifdef CONFIG_X86_64
> >>vmlinux-objs-y += $(obj)/pgtable_64.o
> >>  endif
> >>  
> >> +vmlinux-objs-$(CONFIG_EARLY_PARSE_RSDP) += $(obj)/acpitb.o
> >> +
> >>  $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
> >>  
> >>  vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o 
> >> $(obj)/efi_stub_$(BITS).o \
> >> diff --git a/arch/x86/boot/compressed/acpitb.c 
> >> b/arch/x86/boot/compressed/acpitb.c
> >> index 82d27c4b8978..023b33d0cd3b 100644
> >> --- a/arch/x86/boot/compressed/acpitb.c
> >> +++ b/arch/x86/boot/compressed/acpitb.c
> >> @@ -195,3 +195,128 @@ static acpi_physical_address bios_get_rsdp_addr(void)
> >>return (acpi_physical_address)address;
> >>}
> >>  }
> >> +
> >> +/* Used to determine RSDP table, based on acpi_os_get_root_pointer(). */
> >> +static acpi_physical_address get_rsdp_addr(void)
> >> +{
> >> +  acpi_physical_address pa = 0;
> >> +
> >> +  pa = get_acpi_rsdp();
> >> +
> >> +  if (!pa)
> >> +  pa = efi_get_rsdp_addr();
> >> +
> >> +  if (!pa)
> >> +  pa = bios_get_rsdp_addr();
> >> +
> >> +  return pa;
> >> +}
> >> +
> >> +/* Compute SRAT table from RSDP. */
> >> +static struct acpi_table_header *get_acpi_srat_table(void)
> >> +{
> >> +  acpi_physical_address acpi_table;
> >> +  acpi_physical_address root_table;
> >> +  struct acpi_table_header *header;
> >> +  struct acpi_table_rsdp *rsdp;
> >> +  int num_entries;
> >> +  char arg[10];
> >> +  u8 *entry;
> >> +  u32 size;
> >> +  u32 len;
> >> +
> >> +  rsdp = (struct acpi_table_rsdp *)get_rsdp_addr();
> >> +  i

Re: [PATCH v12 4/5] x86/boot: Parse SRAT table from RSDP and store immovable memory

2018-11-30 Thread Masayoshi Mizuma
On Fri, Nov 30, 2018 at 09:24:54AM +0800, Chao Fan wrote:
> On Thu, Nov 29, 2018 at 12:55:21PM -0500, Masayoshi Mizuma wrote:
> >On Thu, Nov 29, 2018 at 04:16:30PM +0800, Chao Fan wrote:
> >> To fix the conflict between KASLR and memory-hotremove, SRAT table
> >> should be parsed by RSDP pointer, then find the immovable
> >> memory regions and store them in an array called immovable_mem[].
> >> The array called immovable_mem[] will extern to KASLR, then
> >> KASLR will avoid to extract kernel to these regions.
> >> 
> >> Add 'CONFIG_EARLY_PARSE_RSDP' which depends on RANDOMIZE_BASE &&
> >> MEMORY_HOTREMOVE, cause only when both KASLR and memory-hotremove
> >> are enabled, RSDP needs to be parsed in compressed period.
> >> 
> >> Signed-off-by: Chao Fan 
> >> ---
> >>  arch/x86/Kconfig  |  10 +++
> >>  arch/x86/boot/compressed/Makefile |   2 +
> >>  arch/x86/boot/compressed/acpitb.c | 125 ++
> >>  arch/x86/boot/compressed/kaslr.c  |   4 -
> >>  arch/x86/boot/compressed/misc.h   |  20 +
> >>  5 files changed, 157 insertions(+), 4 deletions(-)
> >> 
> >> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> >> index a29d49ef4d56..bc775968557b 100644
> >> --- a/arch/x86/Kconfig
> >> +++ b/arch/x86/Kconfig
> >> @@ -2146,6 +2146,16 @@ config X86_NEED_RELOCS
> >>def_bool y
> >>depends on RANDOMIZE_BASE || (X86_32 && RELOCATABLE)
> >>  
> >
> >> +config CONFIG_EARLY_PARSE_RSDP
> >
> >config EARLY_PARSE_RSDP
> >
> >> +  bool "Parse RSDP pointer on compressed period for KASLR"
> >> +  def_bool n
> >
> >Should be def_bool y?
> 
> I will change it to y.
> 
> >It is better to enable EARLY_PARSE_RSDP by default if 
> >RANDOMIZE_BASE and MEMORY_HOTREMOVE are enabled.
> >
> >> +  depends on RANDOMIZE_BASE && MEMORY_HOTREMOVE
> >> +  help
> >> +This option parses RSDP pointer in compressed period. Works
> >> +for KASLR to get memory information by SRAT table and choose
> >> +immovable memory to extract kernel.
> >> +Say Y if you want to use both KASLR and memory-hotremove.
> >> +
> >>  config PHYSICAL_ALIGN
> >>hex "Alignment value to which kernel should be aligned"
> >>default "0x20"
> >> diff --git a/arch/x86/boot/compressed/Makefile 
> >> b/arch/x86/boot/compressed/Makefile
> >> index 466f66c8a7f8..4cbfb58bf083 100644
> >> --- a/arch/x86/boot/compressed/Makefile
> >> +++ b/arch/x86/boot/compressed/Makefile
> >> @@ -84,6 +84,8 @@ ifdef CONFIG_X86_64
> >>vmlinux-objs-y += $(obj)/pgtable_64.o
> >>  endif
> >>  
> >> +vmlinux-objs-$(CONFIG_EARLY_PARSE_RSDP) += $(obj)/acpitb.o
> >> +
> >>  $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
> >>  
> >>  vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o 
> >> $(obj)/efi_stub_$(BITS).o \
> >> diff --git a/arch/x86/boot/compressed/acpitb.c 
> >> b/arch/x86/boot/compressed/acpitb.c
> >> index 82d27c4b8978..023b33d0cd3b 100644
> >> --- a/arch/x86/boot/compressed/acpitb.c
> >> +++ b/arch/x86/boot/compressed/acpitb.c
> >> @@ -195,3 +195,128 @@ static acpi_physical_address bios_get_rsdp_addr(void)
> >>return (acpi_physical_address)address;
> >>}
> >>  }
> >> +
> >> +/* Used to determine RSDP table, based on acpi_os_get_root_pointer(). */
> >> +static acpi_physical_address get_rsdp_addr(void)
> >> +{
> >> +  acpi_physical_address pa = 0;
> >> +
> >> +  pa = get_acpi_rsdp();
> >> +
> >> +  if (!pa)
> >> +  pa = efi_get_rsdp_addr();
> >> +
> >> +  if (!pa)
> >> +  pa = bios_get_rsdp_addr();
> >> +
> >> +  return pa;
> >> +}
> >> +
> >> +/* Compute SRAT table from RSDP. */
> >> +static struct acpi_table_header *get_acpi_srat_table(void)
> >> +{
> >> +  acpi_physical_address acpi_table;
> >> +  acpi_physical_address root_table;
> >> +  struct acpi_table_header *header;
> >> +  struct acpi_table_rsdp *rsdp;
> >> +  int num_entries;
> >> +  char arg[10];
> >> +  u8 *entry;
> >> +  u32 size;
> >> +  u32 len;
> >> +
> >> +  rsdp = (struct acpi_table_rsdp *)get_rsdp_addr();
> >> +  i

Re: [PATCH v12 1/5] x86/boot: Add get_acpi_rsdp() to parse RSDP in cmdline from KEXEC

2018-11-29 Thread Masayoshi Mizuma
On Thu, Nov 29, 2018 at 04:16:27PM +0800, Chao Fan wrote:
> To fix the conflict between KASLR and memory-hotremove, memory
> information in SRAT table is necessary.
> 
> ACPI SRAT (System/Static Resource Affinity Table) can show the details
> about memory ranges, including ranges of memory provided by hot-added
> memory devices. SRAT table must be introduced by RSDP pointer (Root
> System Description Pointer). So RSDP should be found firstly.
> 
> When booting form KEXEC/EFI/BIOS, the methods to find RSDP pointer
> are different. When booting from KEXEC, 'acpi_rsdp' may have been
> added to cmdline, so parse the cmdline and find the RSDP pointer.
> 
> Signed-off-by: Chao Fan 
> ---
>  arch/x86/boot/compressed/acpitb.c | 33 +++
>  arch/x86/boot/compressed/misc.c   |  5 +
>  arch/x86/boot/compressed/misc.h   |  4 
>  lib/kstrtox.c |  5 +
>  4 files changed, 47 insertions(+)
>  create mode 100644 arch/x86/boot/compressed/acpitb.c
> 
> diff --git a/arch/x86/boot/compressed/acpitb.c 
> b/arch/x86/boot/compressed/acpitb.c
> new file mode 100644
> index ..614c45655cff
> --- /dev/null
> +++ b/arch/x86/boot/compressed/acpitb.c
> @@ -0,0 +1,33 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define BOOT_CTYPE_H
> +#include "misc.h"
> +#include "error.h"
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#define STATIC
> +#include 
> +
> +/* Store the immovable memory regions. */
> +struct mem_vector immovable_mem[MAX_NUMNODES*2];
> +#endif
> +
> +static acpi_physical_address get_acpi_rsdp(void)
> +{
> +#ifdef CONFIG_KEXEC
> + unsigned long long res;
> + int len = 0;
> + char *val;
> +
> + val = malloc(19);
> + len = cmdline_find_option("acpi_rsdp", val, 19);
> + if (len > 0) {
> + val[len] = 0;
> + return (acpi_physical_address)kstrtoull(val, 16, );
> + }
> + return 0;
> +#endif
> +}
> diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
> index 8dd1d5ccae58..e51713fe3add 100644
> --- a/arch/x86/boot/compressed/misc.c
> +++ b/arch/x86/boot/compressed/misc.c
> @@ -12,6 +12,7 @@
>   * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
>   */
>  
> +#define BOOT_CTYPE_H
>  #include "misc.h"
>  #include "error.h"
>  #include "pgtable.h"
> @@ -426,3 +427,7 @@ void fortify_panic(const char *name)
>  {
>   error("detected buffer overflow");
>  }
> +
> +#ifdef BOOT_STRING
> +#include "../../../../lib/kstrtox.c"
> +#endif
> diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
> index a1d5918765f3..809c31effa4b 100644
> --- a/arch/x86/boot/compressed/misc.h
> +++ b/arch/x86/boot/compressed/misc.h
> @@ -116,3 +116,7 @@ static inline void console_init(void)
>  void set_sev_encryption_mask(void);
>  
>  #endif
> +
> +/* acpitb.c */
> +#define BOOT_STRING
> +extern int kstrtoull(const char *s, unsigned int base, unsigned long long 
> *res);
> diff --git a/lib/kstrtox.c b/lib/kstrtox.c
> index 1006bf70bf74..a0ac1b2257b8 100644
> --- a/lib/kstrtox.c
> +++ b/lib/kstrtox.c
> @@ -126,6 +126,9 @@ int kstrtoull(const char *s, unsigned int base, unsigned 
> long long *res)
>  }
>  EXPORT_SYMBOL(kstrtoull);
>  
> +/* Make compressed period code be able to use kstrtoull(). */
> +#ifndef BOOT_STRING

I got the following build error.

]$ make arch/x86/boot/compressed/misc.o
  CALLscripts/checksyscalls.sh
  DESCEND  objtool
  CC  arch/x86/boot/compressed/misc.o
ld: -r and -pie may not be used together
make[1]: *** [scripts/Makefile.build:294: arch/x86/boot/compressed/misc.o] 
Error 1
make: *** [Makefile:1715: arch/x86/boot/compressed/misc.o] Error 2
]$

I think this error gets fixed by moving the "#ifndef BOOT_STRING" guard
so that it comes before the EXPORT_SYMBOL, like this:

#ifndef BOOT_STRING
EXPORT_SYMBOL(kstrtoull);

I'm not sure this change is a good way...

Thanks,
Masa


Re: [PATCH v12 1/5] x86/boot: Add get_acpi_rsdp() to parse RSDP in cmdline from KEXEC

2018-11-29 Thread Masayoshi Mizuma
On Thu, Nov 29, 2018 at 04:16:27PM +0800, Chao Fan wrote:
> To fix the conflict between KASLR and memory-hotremove, memory
> information in SRAT table is necessary.
> 
> ACPI SRAT (System/Static Resource Affinity Table) can show the details
> about memory ranges, including ranges of memory provided by hot-added
> memory devices. SRAT table must be introduced by RSDP pointer (Root
> System Description Pointer). So RSDP should be found firstly.
> 
> When booting form KEXEC/EFI/BIOS, the methods to find RSDP pointer
> are different. When booting from KEXEC, 'acpi_rsdp' may have been
> added to cmdline, so parse the cmdline and find the RSDP pointer.
> 
> Signed-off-by: Chao Fan 
> ---
>  arch/x86/boot/compressed/acpitb.c | 33 +++
>  arch/x86/boot/compressed/misc.c   |  5 +
>  arch/x86/boot/compressed/misc.h   |  4 
>  lib/kstrtox.c |  5 +
>  4 files changed, 47 insertions(+)
>  create mode 100644 arch/x86/boot/compressed/acpitb.c
> 
> diff --git a/arch/x86/boot/compressed/acpitb.c 
> b/arch/x86/boot/compressed/acpitb.c
> new file mode 100644
> index ..614c45655cff
> --- /dev/null
> +++ b/arch/x86/boot/compressed/acpitb.c
> @@ -0,0 +1,33 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define BOOT_CTYPE_H
> +#include "misc.h"
> +#include "error.h"
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#define STATIC
> +#include 
> +
> +/* Store the immovable memory regions. */
> +struct mem_vector immovable_mem[MAX_NUMNODES*2];
> +#endif
> +
> +static acpi_physical_address get_acpi_rsdp(void)
> +{
> +#ifdef CONFIG_KEXEC
> + unsigned long long res;
> + int len = 0;
> + char *val;
> +
> + val = malloc(19);
> + len = cmdline_find_option("acpi_rsdp", val, 19);
> + if (len > 0) {
> + val[len] = 0;
> + return (acpi_physical_address)kstrtoull(val, 16, );
> + }
> + return 0;
> +#endif
> +}
> diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
> index 8dd1d5ccae58..e51713fe3add 100644
> --- a/arch/x86/boot/compressed/misc.c
> +++ b/arch/x86/boot/compressed/misc.c
> @@ -12,6 +12,7 @@
>   * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
>   */
>  
> +#define BOOT_CTYPE_H
>  #include "misc.h"
>  #include "error.h"
>  #include "pgtable.h"
> @@ -426,3 +427,7 @@ void fortify_panic(const char *name)
>  {
>   error("detected buffer overflow");
>  }
> +
> +#ifdef BOOT_STRING
> +#include "../../../../lib/kstrtox.c"
> +#endif
> diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
> index a1d5918765f3..809c31effa4b 100644
> --- a/arch/x86/boot/compressed/misc.h
> +++ b/arch/x86/boot/compressed/misc.h
> @@ -116,3 +116,7 @@ static inline void console_init(void)
>  void set_sev_encryption_mask(void);
>  
>  #endif
> +
> +/* acpitb.c */
> +#define BOOT_STRING
> +extern int kstrtoull(const char *s, unsigned int base, unsigned long long 
> *res);
> diff --git a/lib/kstrtox.c b/lib/kstrtox.c
> index 1006bf70bf74..a0ac1b2257b8 100644
> --- a/lib/kstrtox.c
> +++ b/lib/kstrtox.c
> @@ -126,6 +126,9 @@ int kstrtoull(const char *s, unsigned int base, unsigned 
> long long *res)
>  }
>  EXPORT_SYMBOL(kstrtoull);
>  
> +/* Make compressed period code be able to use kstrtoull(). */
> +#ifndef BOOT_STRING

I got the following build error.

]$ make arch/x86/boot/compressed/misc.o
  CALLscripts/checksyscalls.sh
  DESCEND  objtool
  CC  arch/x86/boot/compressed/misc.o
ld: -r and -pie may not be used together
make[1]: *** [scripts/Makefile.build:294: arch/x86/boot/compressed/misc.o] 
Error 1
make: *** [Makefile:1715: arch/x86/boot/compressed/misc.o] Error 2
]$

I think this error gets fixed by moving the "#ifndef BOOT_STRING" guard
so that it comes before the EXPORT_SYMBOL, like this:

#ifndef BOOT_STRING
EXPORT_SYMBOL(kstrtoull);

I'm not sure whether this change is a good way to fix it, though...

Thanks,
Masa


Re: [PATCH v12 4/5] x86/boot: Parse SRAT table from RSDP and store immovable memory

2018-11-29 Thread Masayoshi Mizuma
On Thu, Nov 29, 2018 at 04:16:30PM +0800, Chao Fan wrote:
> To fix the conflict between KASLR and memory-hotremove, SRAT table
> should be parsed by RSDP pointer, then find the immovable
> memory regions and store them in an array called immovable_mem[].
> The array called immovable_mem[] will extern to KASLR, then
> KASLR will avoid to extract kernel to these regions.
> 
> Add 'CONFIG_EARLY_PARSE_RSDP' which depends on RANDOMIZE_BASE &&
> MEMORY_HOTREMOVE, cause only when both KASLR and memory-hotremove
> are enabled, RSDP needs to be parsed in compressed period.
> 
> Signed-off-by: Chao Fan 
> ---
>  arch/x86/Kconfig  |  10 +++
>  arch/x86/boot/compressed/Makefile |   2 +
>  arch/x86/boot/compressed/acpitb.c | 125 ++
>  arch/x86/boot/compressed/kaslr.c  |   4 -
>  arch/x86/boot/compressed/misc.h   |  20 +
>  5 files changed, 157 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index a29d49ef4d56..bc775968557b 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -2146,6 +2146,16 @@ config X86_NEED_RELOCS
>   def_bool y
>   depends on RANDOMIZE_BASE || (X86_32 && RELOCATABLE)
>  

> +config CONFIG_EARLY_PARSE_RSDP

config EARLY_PARSE_RSDP

> + bool "Parse RSDP pointer on compressed period for KASLR"
> + def_bool n

Should be def_bool y?
It is better to enable EARLY_PARSE_RSDP by default if 
RANDOMIZE_BASE and MEMORY_HOTREMOVE are enabled.

> + depends on RANDOMIZE_BASE && MEMORY_HOTREMOVE
> + help
> +   This option parses RSDP pointer in compressed period. Works
> +   for KASLR to get memory information by SRAT table and choose
> +   immovable memory to extract kernel.
> +   Say Y if you want to use both KASLR and memory-hotremove.
> +
>  config PHYSICAL_ALIGN
>   hex "Alignment value to which kernel should be aligned"
>   default "0x20"
> diff --git a/arch/x86/boot/compressed/Makefile 
> b/arch/x86/boot/compressed/Makefile
> index 466f66c8a7f8..4cbfb58bf083 100644
> --- a/arch/x86/boot/compressed/Makefile
> +++ b/arch/x86/boot/compressed/Makefile
> @@ -84,6 +84,8 @@ ifdef CONFIG_X86_64
>   vmlinux-objs-y += $(obj)/pgtable_64.o
>  endif
>  
> +vmlinux-objs-$(CONFIG_EARLY_PARSE_RSDP) += $(obj)/acpitb.o
> +
>  $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
>  
>  vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \
> diff --git a/arch/x86/boot/compressed/acpitb.c 
> b/arch/x86/boot/compressed/acpitb.c
> index 82d27c4b8978..023b33d0cd3b 100644
> --- a/arch/x86/boot/compressed/acpitb.c
> +++ b/arch/x86/boot/compressed/acpitb.c
> @@ -195,3 +195,128 @@ static acpi_physical_address bios_get_rsdp_addr(void)
>   return (acpi_physical_address)address;
>   }
>  }
> +
> +/* Used to determine RSDP table, based on acpi_os_get_root_pointer(). */
> +static acpi_physical_address get_rsdp_addr(void)
> +{
> + acpi_physical_address pa = 0;
> +
> + pa = get_acpi_rsdp();
> +
> + if (!pa)
> + pa = efi_get_rsdp_addr();
> +
> + if (!pa)
> + pa = bios_get_rsdp_addr();
> +
> + return pa;
> +}
> +
> +/* Compute SRAT table from RSDP. */
> +static struct acpi_table_header *get_acpi_srat_table(void)
> +{
> + acpi_physical_address acpi_table;
> + acpi_physical_address root_table;
> + struct acpi_table_header *header;
> + struct acpi_table_rsdp *rsdp;
> + int num_entries;
> + char arg[10];
> + u8 *entry;
> + u32 size;
> + u32 len;
> +
> + rsdp = (struct acpi_table_rsdp *)get_rsdp_addr();
> + if (!rsdp)
> + return NULL;
> +
> + /* Get RSDT or XSDT from RSDP. */

> + if (!(cmdline_find_option("acpi", arg, sizeof(arg)) == 4 &&
> + !strncmp(arg, "rsdt", 4)) &&
> + rsdp->xsdt_physical_address &&
> + rsdp->revision > 1) {
> + root_table = rsdp->xsdt_physical_address;
> + size = ACPI_XSDT_ENTRY_SIZE;
> + } else {
> + root_table = rsdp->rsdt_physical_address;
> + size = ACPI_RSDT_ENTRY_SIZE;
> + }
> +
> + /* Get ACPI root table from RSDT or XSDT.*/
> + header = (struct acpi_table_header *)root_table;
> + if (!header)
> + return NULL;
> +
> + len = header->length;
> + num_entries = (u32)((len - sizeof(struct acpi_table_header)) / size);

> + if (num_entries > MAX_ACPI_SIG)
> + return NULL;

I think this check isn't needed...

> +
> + entry = ACPI_ADD_PTR(u8, header, sizeof(struct acpi_table_header));
> +
> + while (num_entries--) {
> + u64 address64;
> +
> + if (size == ACPI_RSDT_ENTRY_SIZE)
> + acpi_table = ((acpi_physical_address)
> +   (*ACPI_CAST_PTR(u32, entry)));
> + else {
> + *(u64 *)(void *) = *(u64 *)(void *)entry;
> + acpi_table = (acpi_physical_address) address64;
> +  

Re: [PATCH v12 4/5] x86/boot: Parse SRAT table from RSDP and store immovable memory

2018-11-29 Thread Masayoshi Mizuma
On Thu, Nov 29, 2018 at 04:16:30PM +0800, Chao Fan wrote:
> To fix the conflict between KASLR and memory-hotremove, SRAT table
> should be parsed by RSDP pointer, then find the immovable
> memory regions and store them in an array called immovable_mem[].
> The array called immovable_mem[] will extern to KASLR, then
> KASLR will avoid to extract kernel to these regions.
> 
> Add 'CONFIG_EARLY_PARSE_RSDP' which depends on RANDOMIZE_BASE &&
> MEMORY_HOTREMOVE, cause only when both KASLR and memory-hotremove
> are enabled, RSDP needs to be parsed in compressed period.
> 
> Signed-off-by: Chao Fan 
> ---
>  arch/x86/Kconfig  |  10 +++
>  arch/x86/boot/compressed/Makefile |   2 +
>  arch/x86/boot/compressed/acpitb.c | 125 ++
>  arch/x86/boot/compressed/kaslr.c  |   4 -
>  arch/x86/boot/compressed/misc.h   |  20 +
>  5 files changed, 157 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index a29d49ef4d56..bc775968557b 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -2146,6 +2146,16 @@ config X86_NEED_RELOCS
>   def_bool y
>   depends on RANDOMIZE_BASE || (X86_32 && RELOCATABLE)
>  

> +config CONFIG_EARLY_PARSE_RSDP

config EARLY_PARSE_RSDP

> + bool "Parse RSDP pointer on compressed period for KASLR"
> + def_bool n

Should be def_bool y?
It is better to enable EARLY_PARSE_RSDP by default if 
RANDOMIZE_BASE and MEMORY_HOTREMOVE are enabled.

> + depends on RANDOMIZE_BASE && MEMORY_HOTREMOVE
> + help
> +   This option parses RSDP pointer in compressed period. Works
> +   for KASLR to get memory information by SRAT table and choose
> +   immovable memory to extract kernel.
> +   Say Y if you want to use both KASLR and memory-hotremove.
> +
>  config PHYSICAL_ALIGN
>   hex "Alignment value to which kernel should be aligned"
>   default "0x20"
> diff --git a/arch/x86/boot/compressed/Makefile 
> b/arch/x86/boot/compressed/Makefile
> index 466f66c8a7f8..4cbfb58bf083 100644
> --- a/arch/x86/boot/compressed/Makefile
> +++ b/arch/x86/boot/compressed/Makefile
> @@ -84,6 +84,8 @@ ifdef CONFIG_X86_64
>   vmlinux-objs-y += $(obj)/pgtable_64.o
>  endif
>  
> +vmlinux-objs-$(CONFIG_EARLY_PARSE_RSDP) += $(obj)/acpitb.o
> +
>  $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
>  
>  vmlinux-objs-$(CONFIG_EFI_STUB) += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o \
> diff --git a/arch/x86/boot/compressed/acpitb.c 
> b/arch/x86/boot/compressed/acpitb.c
> index 82d27c4b8978..023b33d0cd3b 100644
> --- a/arch/x86/boot/compressed/acpitb.c
> +++ b/arch/x86/boot/compressed/acpitb.c
> @@ -195,3 +195,128 @@ static acpi_physical_address bios_get_rsdp_addr(void)
>   return (acpi_physical_address)address;
>   }
>  }
> +
> +/* Used to determine RSDP table, based on acpi_os_get_root_pointer(). */
> +static acpi_physical_address get_rsdp_addr(void)
> +{
> + acpi_physical_address pa = 0;
> +
> + pa = get_acpi_rsdp();
> +
> + if (!pa)
> + pa = efi_get_rsdp_addr();
> +
> + if (!pa)
> + pa = bios_get_rsdp_addr();
> +
> + return pa;
> +}
> +
> +/* Compute SRAT table from RSDP. */
> +static struct acpi_table_header *get_acpi_srat_table(void)
> +{
> + acpi_physical_address acpi_table;
> + acpi_physical_address root_table;
> + struct acpi_table_header *header;
> + struct acpi_table_rsdp *rsdp;
> + int num_entries;
> + char arg[10];
> + u8 *entry;
> + u32 size;
> + u32 len;
> +
> + rsdp = (struct acpi_table_rsdp *)get_rsdp_addr();
> + if (!rsdp)
> + return NULL;
> +
> + /* Get RSDT or XSDT from RSDP. */

> + if (!(cmdline_find_option("acpi", arg, sizeof(arg)) == 4 &&
> + !strncmp(arg, "rsdt", 4)) &&
> + rsdp->xsdt_physical_address &&
> + rsdp->revision > 1) {
> + root_table = rsdp->xsdt_physical_address;
> + size = ACPI_XSDT_ENTRY_SIZE;
> + } else {
> + root_table = rsdp->rsdt_physical_address;
> + size = ACPI_RSDT_ENTRY_SIZE;
> + }
> +
> + /* Get ACPI root table from RSDT or XSDT.*/
> + header = (struct acpi_table_header *)root_table;
> + if (!header)
> + return NULL;
> +
> + len = header->length;
> + num_entries = (u32)((len - sizeof(struct acpi_table_header)) / size);

> + if (num_entries > MAX_ACPI_SIG)
> + return NULL;

I think this check isn't needed...

> +
> + entry = ACPI_ADD_PTR(u8, header, sizeof(struct acpi_table_header));
> +
> + while (num_entries--) {
> + u64 address64;
> +
> + if (size == ACPI_RSDT_ENTRY_SIZE)
> + acpi_table = ((acpi_physical_address)
> +   (*ACPI_CAST_PTR(u32, entry)));
> + else {
> + *(u64 *)(void *) = *(u64 *)(void *)entry;
> + acpi_table = (acpi_physical_address) address64;
> +  

Re: [PATCH v12 1/5] x86/boot: Add get_acpi_rsdp() to parse RSDP in cmdline from KEXEC

2018-11-29 Thread Masayoshi Mizuma
On Thu, Nov 29, 2018 at 04:16:27PM +0800, Chao Fan wrote:
> To fix the conflict between KASLR and memory-hotremove, memory
> information in SRAT table is necessary.
> 
> ACPI SRAT (System/Static Resource Affinity Table) can show the details
> about memory ranges, including ranges of memory provided by hot-added
> memory devices. SRAT table must be introduced by RSDP pointer (Root
> System Description Pointer). So RSDP should be found firstly.
> 
> When booting form KEXEC/EFI/BIOS, the methods to find RSDP pointer
> are different. When booting from KEXEC, 'acpi_rsdp' may have been
> added to cmdline, so parse the cmdline and find the RSDP pointer.
> 
> Signed-off-by: Chao Fan 
> ---
>  arch/x86/boot/compressed/acpitb.c | 33 +++
>  arch/x86/boot/compressed/misc.c   |  5 +
>  arch/x86/boot/compressed/misc.h   |  4 
>  lib/kstrtox.c |  5 +
>  4 files changed, 47 insertions(+)
>  create mode 100644 arch/x86/boot/compressed/acpitb.c
> 
> diff --git a/arch/x86/boot/compressed/acpitb.c 
> b/arch/x86/boot/compressed/acpitb.c
> new file mode 100644
> index ..614c45655cff
> --- /dev/null
> +++ b/arch/x86/boot/compressed/acpitb.c
> @@ -0,0 +1,33 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define BOOT_CTYPE_H
> +#include "misc.h"
> +#include "error.h"
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#define STATIC
> +#include 
> +
> +/* Store the immovable memory regions. */
> +struct mem_vector immovable_mem[MAX_NUMNODES*2];

> +#endif

Remove this #endif...

Thanks,
Masa

> +
> +static acpi_physical_address get_acpi_rsdp(void)
> +{
> +#ifdef CONFIG_KEXEC
> + unsigned long long res;
> + int len = 0;
> + char *val;
> +
> + val = malloc(19);
> + len = cmdline_find_option("acpi_rsdp", val, 19);
> + if (len > 0) {
> + val[len] = 0;
> + return (acpi_physical_address)kstrtoull(val, 16, );
> + }
> + return 0;
> +#endif
> +}
> diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
> index 8dd1d5ccae58..e51713fe3add 100644
> --- a/arch/x86/boot/compressed/misc.c
> +++ b/arch/x86/boot/compressed/misc.c
> @@ -12,6 +12,7 @@
>   * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
>   */
>  
> +#define BOOT_CTYPE_H
>  #include "misc.h"
>  #include "error.h"
>  #include "pgtable.h"
> @@ -426,3 +427,7 @@ void fortify_panic(const char *name)
>  {
>   error("detected buffer overflow");
>  }
> +
> +#ifdef BOOT_STRING
> +#include "../../../../lib/kstrtox.c"
> +#endif
> diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
> index a1d5918765f3..809c31effa4b 100644
> --- a/arch/x86/boot/compressed/misc.h
> +++ b/arch/x86/boot/compressed/misc.h
> @@ -116,3 +116,7 @@ static inline void console_init(void)
>  void set_sev_encryption_mask(void);
>  
>  #endif
> +
> +/* acpitb.c */
> +#define BOOT_STRING
> +extern int kstrtoull(const char *s, unsigned int base, unsigned long long 
> *res);
> diff --git a/lib/kstrtox.c b/lib/kstrtox.c
> index 1006bf70bf74..a0ac1b2257b8 100644
> --- a/lib/kstrtox.c
> +++ b/lib/kstrtox.c
> @@ -126,6 +126,9 @@ int kstrtoull(const char *s, unsigned int base, unsigned 
> long long *res)
>  }
>  EXPORT_SYMBOL(kstrtoull);
>  
> +/* Make compressed period code be able to use kstrtoull(). */
> +#ifndef BOOT_STRING
> +
>  /**
>   * kstrtoll - convert a string to a long long
>   * @s: The start of the string. The string must be null-terminated, and may 
> also
> @@ -408,3 +411,5 @@ kstrto_from_user(kstrtou16_from_user, kstrtou16,  
> u16);
>  kstrto_from_user(kstrtos16_from_user,kstrtos16,  s16);
>  kstrto_from_user(kstrtou8_from_user, kstrtou8,   u8);
>  kstrto_from_user(kstrtos8_from_user, kstrtos8,   s8);
> +
> +#endif /* BOOT_STRING */
> -- 
> 2.19.1
> 
> 
> 


Re: [PATCH v12 1/5] x86/boot: Add get_acpi_rsdp() to parse RSDP in cmdline from KEXEC

2018-11-29 Thread Masayoshi Mizuma
On Thu, Nov 29, 2018 at 04:16:27PM +0800, Chao Fan wrote:
> To fix the conflict between KASLR and memory-hotremove, memory
> information in SRAT table is necessary.
> 
> ACPI SRAT (System/Static Resource Affinity Table) can show the details
> about memory ranges, including ranges of memory provided by hot-added
> memory devices. SRAT table must be introduced by RSDP pointer (Root
> System Description Pointer). So RSDP should be found firstly.
> 
> When booting form KEXEC/EFI/BIOS, the methods to find RSDP pointer
> are different. When booting from KEXEC, 'acpi_rsdp' may have been
> added to cmdline, so parse the cmdline and find the RSDP pointer.
> 
> Signed-off-by: Chao Fan 
> ---
>  arch/x86/boot/compressed/acpitb.c | 33 +++
>  arch/x86/boot/compressed/misc.c   |  5 +
>  arch/x86/boot/compressed/misc.h   |  4 
>  lib/kstrtox.c |  5 +
>  4 files changed, 47 insertions(+)
>  create mode 100644 arch/x86/boot/compressed/acpitb.c
> 
> diff --git a/arch/x86/boot/compressed/acpitb.c 
> b/arch/x86/boot/compressed/acpitb.c
> new file mode 100644
> index ..614c45655cff
> --- /dev/null
> +++ b/arch/x86/boot/compressed/acpitb.c
> @@ -0,0 +1,33 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define BOOT_CTYPE_H
> +#include "misc.h"
> +#include "error.h"
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#define STATIC
> +#include 
> +
> +/* Store the immovable memory regions. */
> +struct mem_vector immovable_mem[MAX_NUMNODES*2];

> +#endif

Remove this #endif...

Thanks,
Masa

> +
> +static acpi_physical_address get_acpi_rsdp(void)
> +{
> +#ifdef CONFIG_KEXEC
> + unsigned long long res;
> + int len = 0;
> + char *val;
> +
> + val = malloc(19);
> + len = cmdline_find_option("acpi_rsdp", val, 19);
> + if (len > 0) {
> + val[len] = 0;
> + return (acpi_physical_address)kstrtoull(val, 16, );
> + }
> + return 0;
> +#endif
> +}
> diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
> index 8dd1d5ccae58..e51713fe3add 100644
> --- a/arch/x86/boot/compressed/misc.c
> +++ b/arch/x86/boot/compressed/misc.c
> @@ -12,6 +12,7 @@
>   * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
>   */
>  
> +#define BOOT_CTYPE_H
>  #include "misc.h"
>  #include "error.h"
>  #include "pgtable.h"
> @@ -426,3 +427,7 @@ void fortify_panic(const char *name)
>  {
>   error("detected buffer overflow");
>  }
> +
> +#ifdef BOOT_STRING
> +#include "../../../../lib/kstrtox.c"
> +#endif
> diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
> index a1d5918765f3..809c31effa4b 100644
> --- a/arch/x86/boot/compressed/misc.h
> +++ b/arch/x86/boot/compressed/misc.h
> @@ -116,3 +116,7 @@ static inline void console_init(void)
>  void set_sev_encryption_mask(void);
>  
>  #endif
> +
> +/* acpitb.c */
> +#define BOOT_STRING
> +extern int kstrtoull(const char *s, unsigned int base, unsigned long long 
> *res);
> diff --git a/lib/kstrtox.c b/lib/kstrtox.c
> index 1006bf70bf74..a0ac1b2257b8 100644
> --- a/lib/kstrtox.c
> +++ b/lib/kstrtox.c
> @@ -126,6 +126,9 @@ int kstrtoull(const char *s, unsigned int base, unsigned 
> long long *res)
>  }
>  EXPORT_SYMBOL(kstrtoull);
>  
> +/* Make compressed period code be able to use kstrtoull(). */
> +#ifndef BOOT_STRING
> +
>  /**
>   * kstrtoll - convert a string to a long long
>   * @s: The start of the string. The string must be null-terminated, and may 
> also
> @@ -408,3 +411,5 @@ kstrto_from_user(kstrtou16_from_user, kstrtou16,  
> u16);
>  kstrto_from_user(kstrtos16_from_user,kstrtos16,  s16);
>  kstrto_from_user(kstrtou8_from_user, kstrtou8,   u8);
>  kstrto_from_user(kstrtos8_from_user, kstrtos8,   s8);
> +
> +#endif /* BOOT_STRING */
> -- 
> 2.19.1
> 
> 
> 


Re: [PATCH v12 0/5] x86/boot/KASLR: Parse ACPI table and limit KASLR to choosing immovable memory

2018-11-29 Thread Masayoshi Mizuma
Hi Chao,

Thank you for your continued work on this.

Could you please build your patches before sending them?
Your patches depend on the following kconfig options,
so please build them with that config combination enabled.

RANDOMIZE_BASE
MEMORY_HOTREMOVE
EARLY_PARSE_RSDP
KEXEC
EFI

Thanks,
Masa

On Thu, Nov 29, 2018 at 04:16:26PM +0800, Chao Fan wrote:
> ***Background:
> People reported that KASLR may randomly choose some positions
> which are located in movable memory regions. This will break memory
> hotplug feature and make the movable memory chosen by KASLR can't be
> removed.
> 
> ***Solutions:
> Get the information of memory hot-remove, then KASLR will know the
> right regions. Information about memory hot-remove is in ACPI
> tables, which will be parsed after start_kernel(), so that KASLR
> can't get the information.
> 
> Somebody suggest to add a kernel parameter to specify the
> immovable memory so that limit KASLR in these regions. Then I make
> a patchset. After several versions, Ingo gave a suggestion:
> https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1634024.html
> Follow Ingo's suggestion, imitate the ACPI code to parse the ACPI
> tables, so that the kaslr can get necessary memory information in
> ACPI tables.
> I think ACPI code is an independent part, so imitate the codes
> and functions to 'compressed/' directory, so that kaslr won't
> influence the initialization of ACPI.
> 
> PATCH 1/5 Add get_acpi_rsdp() to parse RSDP in cmdline from KEXEC
> PATCH 2/5 Add efi_get_rsdp_addr() to find RSDP from EFI table when
>   booting from EFI.
> PATCH 3/5 Add bios_get_rsdp_addr() to search RSDP in memory when EFI
>   table not found.
> PATCH 4/5 Compute SRAT table from RSDP and walk SRAT table to store
>   the immovable memory regions.
> PATCH 5/5 Calculate the intersection between memory regions from e820/efi
>   memory table and immovable memory regions. Limit KASLR to
>   choosing these regions for randomization.
> 
> v1->v2:
>  -  Simplify some code.
> Follow Baoquan He's suggestion:
>  - Reuse the head file of acpi code.
> 
> v2->v3:
>  - Test in more conditions, so remove the 'RFC' tag.
>  - Change some comments.
> 
> v3->v4:
> Follow Thomas Gleixner's suggetsion:
>  - Put the whole efi related function into #define CONFIG_EFI and return
>false in the other stub.
> 
> v4->v5:
> Follow Dou Liyang's suggestion:
>  - Add more comments about some functions based on kernel code.
>  - Change some typo in comments.
>  - Clean useless variable.
>  - Add check for the boundary of array.
>  - Add check for 'movable_node' parameter
> 
> v5->v6:
> Follow Baoquan He's suggestion:
>  - Change some log.
>  - Add the check for acpi_rsdp
>  - Change some code logical to make code clear
> 
> v6->v7:
> Follow Rafael's suggestion:
>  - Add more comments and patch log.
> Follow test robot's suggestion:
>  - Add "static" tag for function
> 
> v7-v8:
> Follow Kees Cook's suggestion:
>  - Use mem_overlaps() to check memory region.
>  - Use #ifdef in the definition of function.
> 
> v8-v9:
> Follow Boris' suggestion:
>  - Change code style.
>  - Splite PATCH 1/3 to more path.
>  - Introduce some new function
>  - Use existing function to rework some code
> Follow Masayoshi's suggetion:
>  - Make code more readable
> 
> v9->v10:
> Follow Baoquan's suggestion:
>  - Change some log
>  - Merge last two patch together.
> 
> v10->v11:
> Follow Boris' suggestion:
>  - Link kstrtoull() instead of copying it.
>  - Drop the useless wrapped function.
> 
> v11->v12:
> Follow Boris' suggestion:
>  - Change patch log and code comments.
>  - Add 'CONFIG_EARLY_PARSE_RSDP' to make code easy to read
>  - Put strtoull() to misc.c
> Follow Masa's suggestion:
>  - Remove the detection for 'movable_node'
>  - Change the code logical about cmdline_find_option()
> 
> Any comments will be welcome.
> 
> 
> Chao Fan (5):
>   x86/boot: Add get_acpi_rsdp() to parse RSDP in cmdline from KEXEC
>   x86/boot: Add efi_get_rsdp_addr() to find RSDP from EFI table
>   x86/boot: Add bios_get_rsdp_addr() to search RSDP in memory
>   x86/boot: Parse SRAT table from RSDP and store immovable memory
>   x86/boot/KASLR: Limit KASLR to extracting kernel in immovable memory
> 
>  arch/x86/Kconfig  |  10 +
>  arch/x86/boot/compressed/Makefile |   2 +
>  arch/x86/boot/compressed/acpitb.c | 322 ++
>  arch/x86/boot/compressed/kaslr.c  |  79 ++--
>  arch/x86/boot/compressed/misc.c   |   5 +
>  arch/x86/boot/compressed/misc.h   |  24 +++
>  lib/kstrtox.c |   5 +
>  7 files changed, 432 insertions(+), 15 deletions(-)
>  create mode 100644 arch/x86/boot/compressed/acpitb.c
> 
> -- 
> 2.19.1
> 
> 
> 


Re: [PATCH v12 0/5] x86/boot/KASLR: Parse ACPI table and limit KASLR to choosing immovable memory

2018-11-29 Thread Masayoshi Mizuma
Hi Chao,

Thank you for your continued work on this.

Could you please build your patches before sending them?
Your patches depend on the following kconfig options,
so please build them with that config combination enabled.

RANDOMIZE_BASE
MEMORY_HOTREMOVE
EARLY_PARSE_RSDP
KEXEC
EFI

Thanks,
Masa

On Thu, Nov 29, 2018 at 04:16:26PM +0800, Chao Fan wrote:
> ***Background:
> People reported that KASLR may randomly choose some positions
> which are located in movable memory regions. This will break memory
> hotplug feature and make the movable memory chosen by KASLR can't be
> removed.
> 
> ***Solutions:
> Get the information of memory hot-remove, then KASLR will know the
> right regions. Information about memory hot-remove is in ACPI
> tables, which will be parsed after start_kernel(), so that KASLR
> can't get the information.
> 
> Somebody suggest to add a kernel parameter to specify the
> immovable memory so that limit KASLR in these regions. Then I make
> a patchset. After several versions, Ingo gave a suggestion:
> https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1634024.html
> Follow Ingo's suggestion, imitate the ACPI code to parse the ACPI
> tables, so that the kaslr can get necessary memory information in
> ACPI tables.
> I think ACPI code is an independent part, so imitate the codes
> and functions to 'compressed/' directory, so that kaslr won't
> influence the initialization of ACPI.
> 
> PATCH 1/5 Add get_acpi_rsdp() to parse RSDP in cmdline from KEXEC
> PATCH 2/5 Add efi_get_rsdp_addr() to find RSDP from EFI table when
>   booting from EFI.
> PATCH 3/5 Add bios_get_rsdp_addr() to search RSDP in memory when EFI
>   table not found.
> PATCH 4/5 Compute SRAT table from RSDP and walk SRAT table to store
>   the immovable memory regions.
> PATCH 5/5 Calculate the intersection between memory regions from e820/efi
>   memory table and immovable memory regions. Limit KASLR to
>   choosing these regions for randomization.
> 
> v1->v2:
>  -  Simplify some code.
> Follow Baoquan He's suggestion:
>  - Reuse the head file of acpi code.
> 
> v2->v3:
>  - Test in more conditions, so remove the 'RFC' tag.
>  - Change some comments.
> 
> v3->v4:
> Follow Thomas Gleixner's suggetsion:
>  - Put the whole efi related function into #define CONFIG_EFI and return
>false in the other stub.
> 
> v4->v5:
> Follow Dou Liyang's suggestion:
>  - Add more comments about some functions based on kernel code.
>  - Change some typo in comments.
>  - Clean useless variable.
>  - Add check for the boundary of array.
>  - Add check for 'movable_node' parameter
> 
> v5->v6:
> Follow Baoquan He's suggestion:
>  - Change some log.
>  - Add the check for acpi_rsdp
>  - Change some code logical to make code clear
> 
> v6->v7:
> Follow Rafael's suggestion:
>  - Add more comments and patch log.
> Follow test robot's suggestion:
>  - Add "static" tag for function
> 
> v7-v8:
> Follow Kees Cook's suggestion:
>  - Use mem_overlaps() to check memory region.
>  - Use #ifdef in the definition of function.
> 
> v8-v9:
> Follow Boris' suggestion:
>  - Change code style.
>  - Splite PATCH 1/3 to more path.
>  - Introduce some new function
>  - Use existing function to rework some code
> Follow Masayoshi's suggetion:
>  - Make code more readable
> 
> v9->v10:
> Follow Baoquan's suggestion:
>  - Change some log
>  - Merge last two patch together.
> 
> v10->v11:
> Follow Boris' suggestion:
>  - Link kstrtoull() instead of copying it.
>  - Drop the useless wrapped function.
> 
> v11->v12:
> Follow Boris' suggestion:
>  - Change patch log and code comments.
>  - Add 'CONFIG_EARLY_PARSE_RSDP' to make code easy to read
>  - Put strtoull() to misc.c
> Follow Masa's suggestion:
>  - Remove the detection for 'movable_node'
>  - Change the code logical about cmdline_find_option()
> 
> Any comments will be welcome.
> 
> 
> Chao Fan (5):
>   x86/boot: Add get_acpi_rsdp() to parse RSDP in cmdline from KEXEC
>   x86/boot: Add efi_get_rsdp_addr() to find RSDP from EFI table
>   x86/boot: Add bios_get_rsdp_addr() to search RSDP in memory
>   x86/boot: Parse SRAT table from RSDP and store immovable memory
>   x86/boot/KASLR: Limit KASLR to extracting kernel in immovable memory
> 
>  arch/x86/Kconfig  |  10 +
>  arch/x86/boot/compressed/Makefile |   2 +
>  arch/x86/boot/compressed/acpitb.c | 322 ++
>  arch/x86/boot/compressed/kaslr.c  |  79 ++--
>  arch/x86/boot/compressed/misc.c   |   5 +
>  arch/x86/boot/compressed/misc.h   |  24 +++
>  lib/kstrtox.c |   5 +
>  7 files changed, 432 insertions(+), 15 deletions(-)
>  create mode 100644 arch/x86/boot/compressed/acpitb.c
> 
> -- 
> 2.19.1
> 
> 
> 


Re: [PATCH v12 1/5] x86/boot: Add get_acpi_rsdp() to parse RSDP in cmdline from KEXEC

2018-11-29 Thread Masayoshi Mizuma
On Thu, Nov 29, 2018 at 04:16:27PM +0800, Chao Fan wrote:
> To fix the conflict between KASLR and memory-hotremove, memory
> information in SRAT table is necessary.
> 
> ACPI SRAT (System/Static Resource Affinity Table) can show the details
> about memory ranges, including ranges of memory provided by hot-added
> memory devices. SRAT table must be introduced by RSDP pointer (Root
> System Description Pointer). So RSDP should be found firstly.
> 
> When booting form KEXEC/EFI/BIOS, the methods to find RSDP pointer
> are different. When booting from KEXEC, 'acpi_rsdp' may have been
> added to cmdline, so parse the cmdline and find the RSDP pointer.
> 
> Signed-off-by: Chao Fan 
> ---
>  arch/x86/boot/compressed/acpitb.c | 33 +++
>  arch/x86/boot/compressed/misc.c   |  5 +
>  arch/x86/boot/compressed/misc.h   |  4 
>  lib/kstrtox.c |  5 +
>  4 files changed, 47 insertions(+)
>  create mode 100644 arch/x86/boot/compressed/acpitb.c
> 
> diff --git a/arch/x86/boot/compressed/acpitb.c 
> b/arch/x86/boot/compressed/acpitb.c
> new file mode 100644
> index ..614c45655cff
> --- /dev/null
> +++ b/arch/x86/boot/compressed/acpitb.c
> @@ -0,0 +1,33 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define BOOT_CTYPE_H
> +#include "misc.h"
> +#include "error.h"
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#define STATIC
> +#include 
> +
> +/* Store the immovable memory regions. */
> +struct mem_vector immovable_mem[MAX_NUMNODES*2];
> +#endif
> +
> +static acpi_physical_address get_acpi_rsdp(void)
> +{
> +#ifdef CONFIG_KEXEC
> + unsigned long long res;
> + int len = 0;

> + char *val;
> +
> + val = malloc(19);
> + len = cmdline_find_option("acpi_rsdp", val, 19);
> + if (len > 0) {
> + val[len] = 0;

val[len] = '\0';

> + return (acpi_physical_address)kstrtoull(val, 16, );
> + }

I think free() is needed here. Or why don't you use the stack instead?

char val[19];

len = cmdline_find_option("acpi_rsdp", val, sizeof(val));
if (len > 0) {
val[len] = '\0';
return (acpi_physical_address)kstrtoull(val, 16, );
}

Thanks,
Masa

> + return 0;
> +#endif
> +}
> diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
> index 8dd1d5ccae58..e51713fe3add 100644
> --- a/arch/x86/boot/compressed/misc.c
> +++ b/arch/x86/boot/compressed/misc.c
> @@ -12,6 +12,7 @@
>   * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
>   */
>  
> +#define BOOT_CTYPE_H
>  #include "misc.h"
>  #include "error.h"
>  #include "pgtable.h"
> @@ -426,3 +427,7 @@ void fortify_panic(const char *name)
>  {
>   error("detected buffer overflow");
>  }
> +
> +#ifdef BOOT_STRING
> +#include "../../../../lib/kstrtox.c"
> +#endif
> diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
> index a1d5918765f3..809c31effa4b 100644
> --- a/arch/x86/boot/compressed/misc.h
> +++ b/arch/x86/boot/compressed/misc.h
> @@ -116,3 +116,7 @@ static inline void console_init(void)
>  void set_sev_encryption_mask(void);
>  
>  #endif
> +
> +/* acpitb.c */
> +#define BOOT_STRING
> +extern int kstrtoull(const char *s, unsigned int base, unsigned long long 
> *res);
> diff --git a/lib/kstrtox.c b/lib/kstrtox.c
> index 1006bf70bf74..a0ac1b2257b8 100644
> --- a/lib/kstrtox.c
> +++ b/lib/kstrtox.c
> @@ -126,6 +126,9 @@ int kstrtoull(const char *s, unsigned int base, unsigned 
> long long *res)
>  }
>  EXPORT_SYMBOL(kstrtoull);
>  
> +/* Make compressed period code be able to use kstrtoull(). */
> +#ifndef BOOT_STRING
> +
>  /**
>   * kstrtoll - convert a string to a long long
>   * @s: The start of the string. The string must be null-terminated, and may 
> also
> @@ -408,3 +411,5 @@ kstrto_from_user(kstrtou16_from_user, kstrtou16,  
> u16);
>  kstrto_from_user(kstrtos16_from_user,kstrtos16,  s16);
>  kstrto_from_user(kstrtou8_from_user, kstrtou8,   u8);
>  kstrto_from_user(kstrtos8_from_user, kstrtos8,   s8);
> +
> +#endif /* BOOT_STRING */
> -- 
> 2.19.1
> 
> 
> 


Re: [PATCH v12 1/5] x86/boot: Add get_acpi_rsdp() to parse RSDP in cmdline from KEXEC

2018-11-29 Thread Masayoshi Mizuma
On Thu, Nov 29, 2018 at 04:16:27PM +0800, Chao Fan wrote:
> To fix the conflict between KASLR and memory-hotremove, memory
> information in SRAT table is necessary.
> 
> ACPI SRAT (System/Static Resource Affinity Table) can show the details
> about memory ranges, including ranges of memory provided by hot-added
> memory devices. SRAT table must be introduced by RSDP pointer (Root
> System Description Pointer). So RSDP should be found firstly.
> 
> When booting form KEXEC/EFI/BIOS, the methods to find RSDP pointer
> are different. When booting from KEXEC, 'acpi_rsdp' may have been
> added to cmdline, so parse the cmdline and find the RSDP pointer.
> 
> Signed-off-by: Chao Fan 
> ---
>  arch/x86/boot/compressed/acpitb.c | 33 +++
>  arch/x86/boot/compressed/misc.c   |  5 +
>  arch/x86/boot/compressed/misc.h   |  4 
>  lib/kstrtox.c |  5 +
>  4 files changed, 47 insertions(+)
>  create mode 100644 arch/x86/boot/compressed/acpitb.c
> 
> diff --git a/arch/x86/boot/compressed/acpitb.c 
> b/arch/x86/boot/compressed/acpitb.c
> new file mode 100644
> index ..614c45655cff
> --- /dev/null
> +++ b/arch/x86/boot/compressed/acpitb.c
> @@ -0,0 +1,33 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#define BOOT_CTYPE_H
> +#include "misc.h"
> +#include "error.h"
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#define STATIC
> +#include 
> +
> +/* Store the immovable memory regions. */
> +struct mem_vector immovable_mem[MAX_NUMNODES*2];
> +#endif
> +
> +static acpi_physical_address get_acpi_rsdp(void)
> +{
> +#ifdef CONFIG_KEXEC
> + unsigned long long res;
> + int len = 0;

> + char *val;
> +
> + val = malloc(19);
> + len = cmdline_find_option("acpi_rsdp", val, 19);
> + if (len > 0) {
> + val[len] = 0;

val[len] = '\0';

> + return (acpi_physical_address)kstrtoull(val, 16, );
> + }

I think free() is needed here. Or why don't you use the stack instead?

char val[19];

len = cmdline_find_option("acpi_rsdp", val, sizeof(val));
if (len > 0) {
val[len] = '\0';
return (acpi_physical_address)kstrtoull(val, 16, );
}

Thanks,
Masa

> + return 0;
> +#endif
> +}
> diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
> index 8dd1d5ccae58..e51713fe3add 100644
> --- a/arch/x86/boot/compressed/misc.c
> +++ b/arch/x86/boot/compressed/misc.c
> @@ -12,6 +12,7 @@
>   * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
>   */
>  
> +#define BOOT_CTYPE_H
>  #include "misc.h"
>  #include "error.h"
>  #include "pgtable.h"
> @@ -426,3 +427,7 @@ void fortify_panic(const char *name)
>  {
>   error("detected buffer overflow");
>  }
> +
> +#ifdef BOOT_STRING
> +#include "../../../../lib/kstrtox.c"
> +#endif
> diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
> index a1d5918765f3..809c31effa4b 100644
> --- a/arch/x86/boot/compressed/misc.h
> +++ b/arch/x86/boot/compressed/misc.h
> @@ -116,3 +116,7 @@ static inline void console_init(void)
>  void set_sev_encryption_mask(void);
>  
>  #endif
> +
> +/* acpitb.c */
> +#define BOOT_STRING
> +extern int kstrtoull(const char *s, unsigned int base, unsigned long long 
> *res);
> diff --git a/lib/kstrtox.c b/lib/kstrtox.c
> index 1006bf70bf74..a0ac1b2257b8 100644
> --- a/lib/kstrtox.c
> +++ b/lib/kstrtox.c
> @@ -126,6 +126,9 @@ int kstrtoull(const char *s, unsigned int base, unsigned 
> long long *res)
>  }
>  EXPORT_SYMBOL(kstrtoull);
>  
> +/* Make compressed period code be able to use kstrtoull(). */
> +#ifndef BOOT_STRING
> +
>  /**
>   * kstrtoll - convert a string to a long long
>   * @s: The start of the string. The string must be null-terminated, and may 
> also
> @@ -408,3 +411,5 @@ kstrto_from_user(kstrtou16_from_user, kstrtou16,  
> u16);
>  kstrto_from_user(kstrtos16_from_user,kstrtos16,  s16);
>  kstrto_from_user(kstrtou8_from_user, kstrtou8,   u8);
>  kstrto_from_user(kstrtos8_from_user, kstrtos8,   s8);
> +
> +#endif /* BOOT_STRING */
> -- 
> 2.19.1
> 
> 
> 


Re: [PATCH v3 0/3] mm: Fix for movable_node boot option

2018-10-09 Thread Masayoshi Mizuma
Ping...

On Tue, Oct 02, 2018 at 10:38:18AM -0400, Masayoshi Mizuma wrote:
> This patch series are the fix for movable_node boot option
> issue which was introduced by commit 124049decbb1 ("x86/e820:
> put !E820_TYPE_RAM regions into memblock.reserved").
> 
> The commit breaks the option because it changed the memory
> gap range to reserved memblock. So, the node is marked as
> Normal zone even if the SRAT has Hot pluggable affinity.
> 
> First and second patch fix the original issue which the commit
> tried to fix, then revert the commit.
> 
> Changelog from v2:
>  - Change the patch order. The revert patch is moved to the last.
> 
> Masayoshi Mizuma (1):
>   Revert "x86/e820: put !E820_TYPE_RAM regions into memblock.reserved"
> 
> Naoya Horiguchi (1):
>   mm: zero remaining unavailable struct pages
> 
> Pavel Tatashin (1):
>   mm: return zero_resv_unavail optimization
> 
>  arch/x86/kernel/e820.c   | 15 +++
>  include/linux/memblock.h | 15 ---
>  mm/page_alloc.c  | 54 +++-
>  3 files changed, 40 insertions(+), 44 deletions(-)
> 
> -- 
> 2.18.0
> 


Re: [PATCH v3 0/3] mm: Fix for movable_node boot option

2018-10-09 Thread Masayoshi Mizuma
Ping...

On Tue, Oct 02, 2018 at 10:38:18AM -0400, Masayoshi Mizuma wrote:
> This patch series are the fix for movable_node boot option
> issue which was introduced by commit 124049decbb1 ("x86/e820:
> put !E820_TYPE_RAM regions into memblock.reserved").
> 
> The commit breaks the option because it changed the memory
> gap range to reserved memblock. So, the node is marked as
> Normal zone even if the SRAT has Hot pluggable affinity.
> 
> First and second patch fix the original issue which the commit
> tried to fix, then revert the commit.
> 
> Changelog from v2:
>  - Change the patch order. The revert patch is moved to the last.
> 
> Masayoshi Mizuma (1):
>   Revert "x86/e820: put !E820_TYPE_RAM regions into memblock.reserved"
> 
> Naoya Horiguchi (1):
>   mm: zero remaining unavailable struct pages
> 
> Pavel Tatashin (1):
>   mm: return zero_resv_unavail optimization
> 
>  arch/x86/kernel/e820.c   | 15 +++
>  include/linux/memblock.h | 15 ---
>  mm/page_alloc.c  | 54 +++-
>  3 files changed, 40 insertions(+), 44 deletions(-)
> 
> -- 
> 2.18.0
> 


Re: [tip:x86/boot] ACPI/NUMA: Add warning message if the padding size for KASLR is not enough

2018-10-03 Thread Masayoshi Mizuma
On Wed, Oct 03, 2018 at 02:48:14PM +0200, Borislav Petkov wrote:
> On Wed, Oct 03, 2018 at 02:34:02PM +0200, Peter Zijlstra wrote:
> > 
> > Subject: ACPI/NUMA: Fix KASLR build error
> > 
> > There is no point in trying to compile KASLR specific code when there is
> > no KASLR.
> > 
> > Signed-off-by: Peter Zijlstra (Intel) 
> > ---
> 
> Yeah, Peter and I were just talking on IRC and he gave me a much better
> idea how to fix this, see below. I'll run this through the *config builder and
> commit it if no complaints.

Hi Boris and Peter,

Thank you for the fix. It is great!

- Masa

> 
> ---
> From: "Peter Zijlstra (Intel)" 
> Date: Wed, 3 Oct 2018 14:41:27 +0200
> Subject: [PATCH] ACPI/NUMA: Fix KASLR build error
> 
> There is no point in trying to compile KASLR specific code when there is
> no KASLR.
> 
>  [ bp: Move the whole crap into kaslr.c and make
>rand_mem_physical_padding static. ]
> 
> Signed-off-by: Peter Zijlstra (Intel) 
> Signed-off-by: Borislav Petkov 
> Cc: 
> Cc: 
> Cc: 
> Cc: 
> Cc: 
> Cc: 
> Link: 
> http://lkml.kernel.org/r/20181003123402.ga15...@hirez.programming.kicks-ass.net
> ---
>  arch/x86/include/asm/kaslr.h |  2 ++
>  arch/x86/include/asm/setup.h |  2 --
>  arch/x86/mm/kaslr.c  | 18 +-
>  drivers/acpi/numa.c  | 15 +++
>  4 files changed, 22 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h
> index db7ba2feb947..95ef3fc01d12 100644
> --- a/arch/x86/include/asm/kaslr.h
> +++ b/arch/x86/include/asm/kaslr.h
> @@ -6,8 +6,10 @@ unsigned long kaslr_get_random_long(const char *purpose);
>  
>  #ifdef CONFIG_RANDOMIZE_MEMORY
>  void kernel_randomize_memory(void);
> +void kaslr_check_padding(void);
>  #else
>  static inline void kernel_randomize_memory(void) { }
> +static inline void kaslr_check_padding(void) { }
>  #endif /* CONFIG_RANDOMIZE_MEMORY */
>  
>  #endif
> diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
> index 65a5bf8f6aba..ae13bc974416 100644
> --- a/arch/x86/include/asm/setup.h
> +++ b/arch/x86/include/asm/setup.h
> @@ -80,8 +80,6 @@ static inline unsigned long kaslr_offset(void)
>   return (unsigned long)&_text - __START_KERNEL;
>  }
>  
> -extern int rand_mem_physical_padding;
> -
>  /*
>   * Do NOT EVER look at the BIOS memory size location.
>   * It does not work on many machines.
> diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
> index 00cf4cae38f5..d58b7da0d55c 100644
> --- a/arch/x86/mm/kaslr.c
> +++ b/arch/x86/mm/kaslr.c
> @@ -40,7 +40,7 @@
>   */
>  static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
>  
> -int __initdata rand_mem_physical_padding = 
> CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
> +static int __initdata rand_mem_physical_padding = 
> CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
>  /*
>   * Memory regions randomized by KASLR (except modules that use a separate 
> logic
>   * earlier during boot). The list is ordered based on virtual addresses. This
> @@ -70,6 +70,22 @@ static inline bool kaslr_memory_enabled(void)
>   return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN);
>  }
>  
> +/*
> + * Check the padding size for KASLR is enough.
> + */
> +void kaslr_check_padding(void)
> +{
> + u64 max_possible_phys, max_actual_phys, threshold;
> +
> + max_actual_phys = roundup(PFN_PHYS(max_pfn), 1ULL << 40);
> + max_possible_phys = roundup(PFN_PHYS(max_possible_pfn), 1ULL << 40);
> + threshold = max_actual_phys + ((u64)rand_mem_physical_padding << 40);
> +
> + if (max_possible_phys > threshold)
> + pr_warn("Set 'rand_mem_physical_padding=%llu' to avoid memory 
> hotadd failure.\n",
> + (max_possible_phys - max_actual_phys) >> 40);
> +}
> +
>  static int __init rand_mem_physical_padding_setup(char *str)
>  {
>   int max_padding = (1 << (MAX_PHYSMEM_BITS - TB_SHIFT)) - 1;
> diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
> index 3d69834c692f..4408e37600ef 100644
> --- a/drivers/acpi/numa.c
> +++ b/drivers/acpi/numa.c
> @@ -32,7 +32,7 @@
>  #include 
>  #include 
>  #include 
> -#include 
> +#include 
>  
>  static nodemask_t nodes_found_map = NODE_MASK_NONE;
>  
> @@ -436,7 +436,6 @@ acpi_table_parse_srat(enum acpi_srat_type id,
>  int __init acpi_numa_init(void)
>  {
>   int cnt = 0;
> - u64 max_possible_phys, max_actual_phys, threshold;
>  
>   if (acpi_disabled)
>   return -EINVAL;
> @@ -466,17 +465,9 @@ int __init acpi_numa_init(void)
>   cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
>   acpi_parse_memory_affinity, 0);
>  
> - /* check the padding size for KASLR is enough. */
> - if (parsed_numa_memblks && kaslr_enabled()) {
> - max_actual_phys = roundup(PFN_PHYS(max_pfn), 1ULL << 
> 40);
> - max_possible_phys = roundup(PFN_PHYS(max_possible_pfn), 
> 1ULL << 40);
> - threshold 

Re: [tip:x86/boot] ACPI/NUMA: Add warning message if the padding size for KASLR is not enough

2018-10-03 Thread Masayoshi Mizuma
On Wed, Oct 03, 2018 at 02:48:14PM +0200, Borislav Petkov wrote:
> On Wed, Oct 03, 2018 at 02:34:02PM +0200, Peter Zijlstra wrote:
> > 
> > Subject: ACPI/NUMA: Fix KASLR build error
> > 
> > There is no point in trying to compile KASLR specific code when there is
> > no KASLR.
> > 
> > Signed-off-by: Peter Zijlstra (Intel) 
> > ---
> 
> Yeah, Peter and I were just talking on IRC and he gave me a much better
> idea how to fix this, see below. I'll run this through the *config builder and
> commit it if no complaints.

Hi Boris and Peter,

Thank you for the fix. It is great!

- Masa

> 
> ---
> From: "Peter Zijlstra (Intel)" 
> Date: Wed, 3 Oct 2018 14:41:27 +0200
> Subject: [PATCH] ACPI/NUMA: Fix KASLR build error
> 
> There is no point in trying to compile KASLR specific code when there is
> no KASLR.
> 
>  [ bp: Move the whole crap into kaslr.c and make
>rand_mem_physical_padding static. ]
> 
> Signed-off-by: Peter Zijlstra (Intel) 
> Signed-off-by: Borislav Petkov 
> Cc: 
> Cc: 
> Cc: 
> Cc: 
> Cc: 
> Cc: 
> Link: 
> http://lkml.kernel.org/r/20181003123402.ga15...@hirez.programming.kicks-ass.net
> ---
>  arch/x86/include/asm/kaslr.h |  2 ++
>  arch/x86/include/asm/setup.h |  2 --
>  arch/x86/mm/kaslr.c  | 18 +-
>  drivers/acpi/numa.c  | 15 +++
>  4 files changed, 22 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h
> index db7ba2feb947..95ef3fc01d12 100644
> --- a/arch/x86/include/asm/kaslr.h
> +++ b/arch/x86/include/asm/kaslr.h
> @@ -6,8 +6,10 @@ unsigned long kaslr_get_random_long(const char *purpose);
>  
>  #ifdef CONFIG_RANDOMIZE_MEMORY
>  void kernel_randomize_memory(void);
> +void kaslr_check_padding(void);
>  #else
>  static inline void kernel_randomize_memory(void) { }
> +static inline void kaslr_check_padding(void) { }
>  #endif /* CONFIG_RANDOMIZE_MEMORY */
>  
>  #endif
> diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
> index 65a5bf8f6aba..ae13bc974416 100644
> --- a/arch/x86/include/asm/setup.h
> +++ b/arch/x86/include/asm/setup.h
> @@ -80,8 +80,6 @@ static inline unsigned long kaslr_offset(void)
>   return (unsigned long)&_text - __START_KERNEL;
>  }
>  
> -extern int rand_mem_physical_padding;
> -
>  /*
>   * Do NOT EVER look at the BIOS memory size location.
>   * It does not work on many machines.
> diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
> index 00cf4cae38f5..d58b7da0d55c 100644
> --- a/arch/x86/mm/kaslr.c
> +++ b/arch/x86/mm/kaslr.c
> @@ -40,7 +40,7 @@
>   */
>  static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
>  
> -int __initdata rand_mem_physical_padding = 
> CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
> +static int __initdata rand_mem_physical_padding = 
> CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
>  /*
>   * Memory regions randomized by KASLR (except modules that use a separate 
> logic
>   * earlier during boot). The list is ordered based on virtual addresses. This
> @@ -70,6 +70,22 @@ static inline bool kaslr_memory_enabled(void)
>   return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN);
>  }
>  
> +/*
> + * Check the padding size for KASLR is enough.
> + */
> +void kaslr_check_padding(void)
> +{
> + u64 max_possible_phys, max_actual_phys, threshold;
> +
> + max_actual_phys = roundup(PFN_PHYS(max_pfn), 1ULL << 40);
> + max_possible_phys = roundup(PFN_PHYS(max_possible_pfn), 1ULL << 40);
> + threshold = max_actual_phys + ((u64)rand_mem_physical_padding << 40);
> +
> + if (max_possible_phys > threshold)
> + pr_warn("Set 'rand_mem_physical_padding=%llu' to avoid memory 
> hotadd failure.\n",
> + (max_possible_phys - max_actual_phys) >> 40);
> +}
> +
>  static int __init rand_mem_physical_padding_setup(char *str)
>  {
>   int max_padding = (1 << (MAX_PHYSMEM_BITS - TB_SHIFT)) - 1;
> diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
> index 3d69834c692f..4408e37600ef 100644
> --- a/drivers/acpi/numa.c
> +++ b/drivers/acpi/numa.c
> @@ -32,7 +32,7 @@
>  #include 
>  #include 
>  #include 
> -#include 
> +#include 
>  
>  static nodemask_t nodes_found_map = NODE_MASK_NONE;
>  
> @@ -436,7 +436,6 @@ acpi_table_parse_srat(enum acpi_srat_type id,
>  int __init acpi_numa_init(void)
>  {
>   int cnt = 0;
> - u64 max_possible_phys, max_actual_phys, threshold;
>  
>   if (acpi_disabled)
>   return -EINVAL;
> @@ -466,17 +465,9 @@ int __init acpi_numa_init(void)
>   cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
>   acpi_parse_memory_affinity, 0);
>  
> - /* check the padding size for KASLR is enough. */
> - if (parsed_numa_memblks && kaslr_enabled()) {
> - max_actual_phys = roundup(PFN_PHYS(max_pfn), 1ULL << 
> 40);
> - max_possible_phys = roundup(PFN_PHYS(max_possible_pfn), 
> 1ULL << 40);
> - threshold 

Re: [PATCH v6 0/3] Add a kernel parameter to change the padding size for KASLR

2018-10-03 Thread Masayoshi Mizuma
On Wed, Oct 03, 2018 at 08:52:19AM +0200, Ingo Molnar wrote:
> 
> * Masayoshi Mizuma  wrote:
> 
> > This patch series are adding an kernel parameter to change
> > the padding size used for KASLR. It is useful for memory hotplug
> > capable system. User can adjust the padding size to use it.
> > 
> > It is better if the padding size is calculated automatically,
> > however, ACPI SRAT is not available at the KASLR initialization
> > time. So, I add a message for user to tell the suitable padding
> > size. User can set it on next reboot.
> > 
> > This patch series don't change the current default padding size.
> > 
> > Change log from v5:
> >  - Fix build error if CONFIG_RANDOMIZE_MEMORY is not defined.
> 
> Please send a delta patch on top of tip:x86/boot with the 
> get_rand_mem_physical_padding() fix, 
> when doing that you'll see that I did some minor cleanups when applying the 
> patches which your 
> v6 series undoes.

I have sent the delta patch. Thanks!

- Masa


Re: [PATCH v6 0/3] Add a kernel parameter to change the padding size for KASLR

2018-10-03 Thread Masayoshi Mizuma
On Wed, Oct 03, 2018 at 08:52:19AM +0200, Ingo Molnar wrote:
> 
> * Masayoshi Mizuma  wrote:
> 
> > This patch series are adding an kernel parameter to change
> > the padding size used for KASLR. It is useful for memory hotplug
> > capable system. User can adjust the padding size to use it.
> > 
> > It is better if the padding size is calculated automatically,
> > however, ACPI SRAT is not available at the KASLR initialization
> > time. So, I add a message for user to tell the suitable padding
> > size. User can set it on next reboot.
> > 
> > This patch series don't change the current default padding size.
> > 
> > Change log from v5:
> >  - Fix build error if CONFIG_RANDOMIZE_MEMORY is not defined.
> 
> Please send a delta patch on top of tip:x86/boot with the 
> get_rand_mem_physical_padding() fix, 
> when doing that you'll see that I did some minor cleanups when applying the 
> patches which your 
> v6 series undoes.

I have sent the delta patch. Thanks!

- Masa


Re: [tip:x86/boot] ACPI/NUMA: Add warning message if the padding size for KASLR is not enough

2018-10-03 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 
Subject: [PATCH] Fix for build error if CONFIG_RANDOMIZE_BASE is not defined.

Fix the following build error.

  ld: drivers/acpi/numa.o: in function `acpi_numa_init':
  drivers/acpi/numa.c:473: undefined reference to `rand_mem_physical_padding'
  make: *** [Makefile:1030: vmlinux] Error 1

- Add get_rand_mem_physical_padding() which returns
  rand_mem_physical_padding or 0 if
  CONFIG_RANDOMIZE_MEMORY is not defined.
- Make rand_mem_physical_padding static

Signed-off-by: Masayoshi Mizuma 
---
 arch/x86/include/asm/setup.h | 9 -
 arch/x86/mm/kaslr.c  | 9 +++--
 drivers/acpi/numa.c  | 3 ++-
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 65a5bf8..1765a15 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -80,7 +80,14 @@ static inline unsigned long kaslr_offset(void)
return (unsigned long)&_text - __START_KERNEL;
 }

-extern int rand_mem_physical_padding;
+#ifdef CONFIG_RANDOMIZE_MEMORY
+extern inline int __init get_rand_mem_physical_padding(void);
+#else
+static inline int __init get_rand_mem_physical_padding(void)
+{
+   return 0;
+}
+#endif

 /*
  * Do NOT EVER look at the BIOS memory size location.
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 00cf4ca..eb47f05 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -40,7 +40,7 @@
  */
 static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;

-int __initdata rand_mem_physical_padding = 
CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
+static int rand_mem_physical_padding __initdata = 
CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
 /*
  * Memory regions randomized by KASLR (except modules that use a separate logic
  * earlier during boot). The list is ordered based on virtual addresses. This
@@ -70,6 +70,11 @@ static inline bool kaslr_memory_enabled(void)
return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN);
 }

+inline int __init get_rand_mem_physical_padding(void)
+{
+   return rand_mem_physical_padding;
+}
+
 static int __init rand_mem_physical_padding_setup(char *str)
 {
int max_padding = (1 << (MAX_PHYSMEM_BITS - TB_SHIFT)) - 1;
@@ -117,7 +122,7 @@ void __init kernel_randomize_memory(void)
 */
BUG_ON(kaslr_regions[0].base != _offset_base);
memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
-   rand_mem_physical_padding;
+   get_rand_mem_physical_padding();

/* Adapt phyiscal memory region size based on available memory */
if (memory_tb < kaslr_regions[0].size_tb)
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 3d69834..303b024 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -470,7 +470,8 @@ int __init acpi_numa_init(void)
if (parsed_numa_memblks && kaslr_enabled()) {
max_actual_phys = roundup(PFN_PHYS(max_pfn), 1ULL << 
40);
max_possible_phys = roundup(PFN_PHYS(max_possible_pfn), 
1ULL << 40);
-   threshold = max_actual_phys + 
((u64)rand_mem_physical_padding << 40);
+   threshold = max_actual_phys +
+   ((u64)get_rand_mem_physical_padding() << 40);

if (max_possible_phys > threshold) {
pr_warn("Set 'rand_mem_physical_padding=%llu' 
to avoid memory hotadd failure.\n",
--
2.18.0


On Tue, Oct 02, 2018 at 03:18:41AM -0700, tip-bot for Masayoshi Mizuma wrote:
> Commit-ID:  3b054ca88c4f4dd5f516a12d4b6d6bd0ae826f41
> Gitweb: 
> https://git.kernel.org/tip/3b054ca88c4f4dd5f516a12d4b6d6bd0ae826f41
> Author: Masayoshi Mizuma 
> AuthorDate: Mon, 1 Oct 2018 10:08:42 -0400
> Committer:  Ingo Molnar 
> CommitDate: Tue, 2 Oct 2018 11:47:21 +0200
> 
> ACPI/NUMA: Add warning message if the padding size for KASLR is not enough
> 
> Add warning message if the padding size for KASLR,
> rand_mem_physical_padding, is not enough. The message also
> says the suitable padding size.
> 
> Signed-off-by: Masayoshi Mizuma 
> Cc: Baoquan He 
> Cc: Borislav Petkov 
> Cc: Linus Torvalds 
> Cc: Masayoshi Mizuma 
> Cc: Peter Zijlstra 
> Cc: Thomas Gleixner 
> Link: http://lkml.kernel.org/r/20181001140843.26137-3-msys.miz...@gmail.com
> Signed-off-by: Ingo Molnar 
> ---
>  arch/x86/include/asm/setup.h |  2 ++
>  drivers/acpi/numa.c  | 14 ++
>  2 files changed, 16 insertions(+)
> 
> diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
> index ae13bc974416..65a5bf8f6aba 100644
> --- a/arch/x86/include/asm/setup.h
> +++ b/arch/x86/include/asm/setup.h
> @@ -80,6 +80,8 @@ static inline unsigned long kaslr_offset(void)
>   return (unsigned long)&_text - __

Re: [tip:x86/boot] ACPI/NUMA: Add warning message if the padding size for KASLR is not enough

2018-10-03 Thread Masayoshi Mizuma
From: Masayoshi Mizuma 
Subject: [PATCH] Fix for build error if CONFIG_RANDOMIZE_BASE is not defined.

Fix the following build error.

  ld: drivers/acpi/numa.o: in function `acpi_numa_init':
  drivers/acpi/numa.c:473: undefined reference to `rand_mem_physical_padding'
  make: *** [Makefile:1030: vmlinux] Error 1

- Add get_rand_mem_physical_padding() which returns
  rand_mem_physical_padding or 0 if
  CONFIG_RANDOMIZE_MEMORY is not defined.
- Make rand_mem_physical_padding static

Signed-off-by: Masayoshi Mizuma 
---
 arch/x86/include/asm/setup.h | 9 -
 arch/x86/mm/kaslr.c  | 9 +++--
 drivers/acpi/numa.c  | 3 ++-
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 65a5bf8..1765a15 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -80,7 +80,14 @@ static inline unsigned long kaslr_offset(void)
return (unsigned long)&_text - __START_KERNEL;
 }

-extern int rand_mem_physical_padding;
+#ifdef CONFIG_RANDOMIZE_MEMORY
+extern inline int __init get_rand_mem_physical_padding(void);
+#else
+static inline int __init get_rand_mem_physical_padding(void)
+{
+   return 0;
+}
+#endif

 /*
  * Do NOT EVER look at the BIOS memory size location.
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 00cf4ca..eb47f05 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -40,7 +40,7 @@
  */
 static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;

-int __initdata rand_mem_physical_padding = 
CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
+static int rand_mem_physical_padding __initdata = 
CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
 /*
  * Memory regions randomized by KASLR (except modules that use a separate logic
  * earlier during boot). The list is ordered based on virtual addresses. This
@@ -70,6 +70,11 @@ static inline bool kaslr_memory_enabled(void)
return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN);
 }

+inline int __init get_rand_mem_physical_padding(void)
+{
+   return rand_mem_physical_padding;
+}
+
 static int __init rand_mem_physical_padding_setup(char *str)
 {
int max_padding = (1 << (MAX_PHYSMEM_BITS - TB_SHIFT)) - 1;
@@ -117,7 +122,7 @@ void __init kernel_randomize_memory(void)
 */
BUG_ON(kaslr_regions[0].base != _offset_base);
memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
-   rand_mem_physical_padding;
+   get_rand_mem_physical_padding();

/* Adapt phyiscal memory region size based on available memory */
if (memory_tb < kaslr_regions[0].size_tb)
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 3d69834..303b024 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -470,7 +470,8 @@ int __init acpi_numa_init(void)
if (parsed_numa_memblks && kaslr_enabled()) {
max_actual_phys = roundup(PFN_PHYS(max_pfn), 1ULL << 
40);
max_possible_phys = roundup(PFN_PHYS(max_possible_pfn), 
1ULL << 40);
-   threshold = max_actual_phys + 
((u64)rand_mem_physical_padding << 40);
+   threshold = max_actual_phys +
+   ((u64)get_rand_mem_physical_padding() << 40);

if (max_possible_phys > threshold) {
pr_warn("Set 'rand_mem_physical_padding=%llu' 
to avoid memory hotadd failure.\n",
--
2.18.0


On Tue, Oct 02, 2018 at 03:18:41AM -0700, tip-bot for Masayoshi Mizuma wrote:
> Commit-ID:  3b054ca88c4f4dd5f516a12d4b6d6bd0ae826f41
> Gitweb: 
> https://git.kernel.org/tip/3b054ca88c4f4dd5f516a12d4b6d6bd0ae826f41
> Author: Masayoshi Mizuma 
> AuthorDate: Mon, 1 Oct 2018 10:08:42 -0400
> Committer:  Ingo Molnar 
> CommitDate: Tue, 2 Oct 2018 11:47:21 +0200
> 
> ACPI/NUMA: Add warning message if the padding size for KASLR is not enough
> 
> Add warning message if the padding size for KASLR,
> rand_mem_physical_padding, is not enough. The message also
> says the suitable padding size.
> 
> Signed-off-by: Masayoshi Mizuma 
> Cc: Baoquan He 
> Cc: Borislav Petkov 
> Cc: Linus Torvalds 
> Cc: Masayoshi Mizuma 
> Cc: Peter Zijlstra 
> Cc: Thomas Gleixner 
> Link: http://lkml.kernel.org/r/20181001140843.26137-3-msys.miz...@gmail.com
> Signed-off-by: Ingo Molnar 
> ---
>  arch/x86/include/asm/setup.h |  2 ++
>  drivers/acpi/numa.c  | 14 ++
>  2 files changed, 16 insertions(+)
> 
> diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
> index ae13bc974416..65a5bf8f6aba 100644
> --- a/arch/x86/include/asm/setup.h
> +++ b/arch/x86/include/asm/setup.h
> @@ -80,6 +80,8 @@ static inline unsigned long kaslr_offset(void)
>   return (unsigned long)&_text - __

[PATCH v6 3/3] Documentation/kernel-parameters.txt: Document rand_mem_physical_padding=

2018-10-02 Thread Masayoshi Mizuma
This kernel parameter allows the modification of the padding used
for the physical memory mapping section when KASLR memory is enabled.

For memory hotplug capable systems, the default padding size,
CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING, may not be enough.

The option is useful to adjust the padding size.

Signed-off-by: Masayoshi Mizuma 
---
 .../admin-guide/kernel-parameters.txt | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 92eb1f4..f0930e3 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3529,6 +3529,25 @@
fully seed the kernel's CRNG. Default is controlled
by CONFIG_RANDOM_TRUST_CPU.
 
+   rand_mem_physical_padding=
+   [KNL] Define the padding size in terabytes
+   used for the physical memory mapping section
+   when KASLR is enabled.
+   If the padding size is not enough, you can see
+   'Set rand_mem_physical_padding=XX ...' in system
+   boot message, so set the parameter as the message
+   suggests.
+
+   This parameter is useful for memory hot-add capable
+   systems. Such systems may have more space than
+   actual memory size to hot-add memory. If the
+   padding size is not enough and memory is hot-added,
+   the hot-adding will fail because it destroys the
+   system memory map. So, the padding size needs to be
+   adjusted in such a system.
+   The default value is the value of
+   CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING.
+
ras=option[,option,...] [KNL] RAS-specific options
 
cec_disable [X86]
-- 
2.18.0



[PATCH v6 3/3] Documentation/kernel-parameters.txt: Document rand_mem_physical_padding=

2018-10-02 Thread Masayoshi Mizuma
This kernel parameter allows the modification of the padding used
for the physical memory mapping section when KASLR memory is enabled.

For memory hotplug capable systems, the default padding size,
CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING, may not be enough.

The option is useful to adjust the padding size.

Signed-off-by: Masayoshi Mizuma 
---
 .../admin-guide/kernel-parameters.txt | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 92eb1f4..f0930e3 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3529,6 +3529,25 @@
fully seed the kernel's CRNG. Default is controlled
by CONFIG_RANDOM_TRUST_CPU.
 
+   rand_mem_physical_padding=
+   [KNL] Define the padding size in terabytes
+   used for the physical memory mapping section
+   when KASLR is enabled.
+   If the padding size is not enough, you can see
+   'Set rand_mem_physical_padding=XX ...' in system
+   boot message, so set the parameter as the message
+   suggests.
+
+   This parameter is useful for memory hot-add capable
+   systems. Such systems may have more space than
+   actual memory size to hot-add memory. If the
+   padding size is not enough and memory is hot-added,
+   the hot-adding will fail because it destroys the
+   system memory map. So, the padding size needs to be
+   adjusted in such a system.
+   The default value is the value of
+   CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING.
+
ras=option[,option,...] [KNL] RAS-specific options
 
cec_disable [X86]
-- 
2.18.0



[PATCH v6 1/3] x86/mm: Add a kernel parameter to change the padding used for the physical memory mapping

2018-10-02 Thread Masayoshi Mizuma
If each node of physical memory layout has huge space for hotplug,
the padding used for the physical memory mapping section is not enough.
For example of the layout:

  SRAT: Node 6 PXM 4 [mem 0x1000-0x13ff] hotplug
  SRAT: Node 7 PXM 5 [mem 0x1400-0x17ff] hotplug
  SRAT: Node 2 PXM 6 [mem 0x1800-0x1bff] hotplug
  SRAT: Node 3 PXM 7 [mem 0x1c00-0x1fff] hotplug

We can increase the padding via CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING,
however, the needed padding size depends on the system environment.
The kernel option is better than changing the config.

Signed-off-by: Masayoshi Mizuma 
Reviewed-by: Baoquan He 
---
 arch/x86/include/asm/setup.h |  9 +
 arch/x86/mm/kaslr.c  | 22 +-
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index ae13bc9..1765a15 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -80,6 +80,15 @@ static inline unsigned long kaslr_offset(void)
return (unsigned long)&_text - __START_KERNEL;
 }
 
+#ifdef CONFIG_RANDOMIZE_MEMORY
+extern inline int __init get_rand_mem_physical_padding(void);
+#else
+static inline int __init get_rand_mem_physical_padding(void)
+{
+   return 0;
+}
+#endif
+
 /*
  * Do NOT EVER look at the BIOS memory size location.
  * It does not work on many machines.
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 61db77b..eb47f05 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -40,6 +40,7 @@
  */
 static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
 
+static int rand_mem_physical_padding __initdata = 
CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
 /*
  * Memory regions randomized by KASLR (except modules that use a separate logic
  * earlier during boot). The list is ordered based on virtual addresses. This
@@ -69,6 +70,25 @@ static inline bool kaslr_memory_enabled(void)
return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN);
 }
 
+inline int __init get_rand_mem_physical_padding(void)
+{
+   return rand_mem_physical_padding;
+}
+
+static int __init rand_mem_physical_padding_setup(char *str)
+{
+   int max_padding = (1 << (MAX_PHYSMEM_BITS - TB_SHIFT)) - 1;
+
+   get_option(, _mem_physical_padding);
+   if (rand_mem_physical_padding < 0)
+   rand_mem_physical_padding = 0;
+   else if (rand_mem_physical_padding > max_padding)
+   rand_mem_physical_padding = max_padding;
+
+   return 0;
+}
+early_param("rand_mem_physical_padding", rand_mem_physical_padding_setup);
+
 /* Initialize base and padding for each memory region randomized with KASLR */
 void __init kernel_randomize_memory(void)
 {
@@ -102,7 +122,7 @@ void __init kernel_randomize_memory(void)
 */
	BUG_ON(kaslr_regions[0].base != &page_offset_base);
memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
-   CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
+   get_rand_mem_physical_padding();
 
/* Adapt phyiscal memory region size based on available memory */
if (memory_tb < kaslr_regions[0].size_tb)
-- 
2.18.0



[PATCH v6 1/3] x86/mm: Add a kernel parameter to change the padding used for the physical memory mapping

2018-10-02 Thread Masayoshi Mizuma
If each node of physical memory layout has huge space for hotplug,
the padding used for the physical memory mapping section is not enough.
For exapmle of the layout:

  SRAT: Node 6 PXM 4 [mem 0x1000-0x13ff] hotplug
  SRAT: Node 7 PXM 5 [mem 0x1400-0x17ff] hotplug
  SRAT: Node 2 PXM 6 [mem 0x1800-0x1bff] hotplug
  SRAT: Node 3 PXM 7 [mem 0x1c00-0x1fff] hotplug

We can increase the padding via CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING,
however, the needed padding size depends on the system environment.
The kernel option is better than changing the config.

Signed-off-by: Masayoshi Mizuma 
Reviewed-by: Baoquan He 
---
 arch/x86/include/asm/setup.h |  9 +
 arch/x86/mm/kaslr.c  | 22 +-
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index ae13bc9..1765a15 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -80,6 +80,15 @@ static inline unsigned long kaslr_offset(void)
return (unsigned long)&_text - __START_KERNEL;
 }
 
+#ifdef CONFIG_RANDOMIZE_MEMORY
+extern inline int __init get_rand_mem_physical_padding(void);
+#else
+static inline int __init get_rand_mem_physical_padding(void)
+{
+   return 0;
+}
+#endif
+
 /*
  * Do NOT EVER look at the BIOS memory size location.
  * It does not work on many machines.
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 61db77b..eb47f05 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -40,6 +40,7 @@
  */
 static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
 
+static int rand_mem_physical_padding __initdata = CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
 /*
  * Memory regions randomized by KASLR (except modules that use a separate logic
  * earlier during boot). The list is ordered based on virtual addresses. This
@@ -69,6 +70,25 @@ static inline bool kaslr_memory_enabled(void)
return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN);
 }
 
+inline int __init get_rand_mem_physical_padding(void)
+{
+   return rand_mem_physical_padding;
+}
+
+static int __init rand_mem_physical_padding_setup(char *str)
+{
+   int max_padding = (1 << (MAX_PHYSMEM_BITS - TB_SHIFT)) - 1;
+
+   get_option(&str, &rand_mem_physical_padding);
+   if (rand_mem_physical_padding < 0)
+   rand_mem_physical_padding = 0;
+   else if (rand_mem_physical_padding > max_padding)
+   rand_mem_physical_padding = max_padding;
+
+   return 0;
+}
+early_param("rand_mem_physical_padding", rand_mem_physical_padding_setup);
+
 /* Initialize base and padding for each memory region randomized with KASLR */
 void __init kernel_randomize_memory(void)
 {
@@ -102,7 +122,7 @@ void __init kernel_randomize_memory(void)
 */
	BUG_ON(kaslr_regions[0].base != &page_offset_base);
memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
-   CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
+   get_rand_mem_physical_padding();
 
/* Adapt phyiscal memory region size based on available memory */
if (memory_tb < kaslr_regions[0].size_tb)
-- 
2.18.0



[PATCH v6 0/3] Add a kernel parameter to change the padding size for KASLR

2018-10-02 Thread Masayoshi Mizuma
This patch series are adding an kernel parameter to change
the padding size used for KASLR. It is useful for memory hotplug
capable system. User can adjust the padding size to use it.

It is better if the padding size is calculated automatically,
however, ACPI SRAT is not available at the KASLR initialization
time. So, I add a message for user to tell the suitable padding
size. User can set it on next reboot.

This patch series don't change the current default padding size.

Change log from v5:
 - Fix build error if CONFIG_RANDOMIZE_MEMORY is not defined.

Change log from v4:
 - Fix the padding size check (2nd patch)
 - Add explanation for the parameter in the document. (3rd patch)

Change log from v3:
 - Add a warning message if the padding size for KASLR is not enough.
   And it says the suitable padding size to user.

Change log from v2:
 - Simplify the description. As Baoquan said, this is similar SGI UV issue,
   but a little different. Remove SGI UV description.

Masayoshi Mizuma (3):
  x86/mm: Add a kernel parameter to change the padding used for the
physical memory mapping
  ACPI/NUMA: Add warning message if the padding size for KASLR is not
enough
  Documentation/kernel-parameters.txt: Document
rand_mem_physical_padding=

 .../admin-guide/kernel-parameters.txt | 19 
 arch/x86/include/asm/setup.h  |  9 
 arch/x86/mm/kaslr.c   | 22 ++-
 drivers/acpi/numa.c   | 16 ++
 4 files changed, 65 insertions(+), 1 deletion(-)

-- 
2.18.0



[PATCH v6 2/3] ACPI/NUMA: Add warning message if the padding size for KASLR is not enough

2018-10-02 Thread Masayoshi Mizuma
Add warning message if the padding size for KASLR,
rand_mem_physical_padding, is not enough. The message also
says the suitable padding size.

Signed-off-by: Masayoshi Mizuma 
---
 drivers/acpi/numa.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 8516760..420ed2c 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -32,6 +32,7 @@
 #include 
 #include 
 #include 
+#include <asm/setup.h>   /* NOTE(review): header name stripped by archive; presumably <asm/setup.h> for get_rand_mem_physical_padding() — confirm against original patch */
 
 static nodemask_t nodes_found_map = NODE_MASK_NONE;
 
@@ -435,6 +436,7 @@ acpi_table_parse_srat(enum acpi_srat_type id,
 int __init acpi_numa_init(void)
 {
int cnt = 0;
+   u64 max_possible_phys, max_actual_phys, threshold;
 
if (acpi_disabled)
return -EINVAL;
@@ -463,6 +465,20 @@ int __init acpi_numa_init(void)
 
cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
acpi_parse_memory_affinity, 0);
+
+   /* check the padding size for KASLR is enough. */
+   if (parsed_numa_memblks && kaslr_enabled()) {
+   max_actual_phys = roundup(PFN_PHYS(max_pfn),
+   1ULL << 40);
+   max_possible_phys = roundup(PFN_PHYS(max_possible_pfn),
+   1ULL << 40);
+   threshold = max_actual_phys +
+   ((u64)get_rand_mem_physical_padding() << 40);
+
+   if (max_possible_phys > threshold)
+   pr_warn("Set 'rand_mem_physical_padding=%llu' to avoid memory hotadd failure.\n",
+   (max_possible_phys - max_actual_phys) >> 40);
+   }
}
 
/* SLIT: System Locality Information Table */
-- 
2.18.0



  1   2   3   >