[PATCH v4] mm: slub: move sysfs slab alloc/free interfaces to debugfs

2021-04-16 Thread Faiyaz Mohammed
The alloc_calls and free_calls implementation in sysfs has two issues:
one is the PAGE_SIZE limitation of sysfs and the other is that it does
not adhere to the "one value per file" rule.

To overcome these issues, move the alloc_calls and free_calls
implementation to debugfs.

Signed-off-by: Faiyaz Mohammed 
---
 include/linux/slub_def.h |  10 +++
 mm/slab_common.c |   9 +++
 mm/slub.c| 202 ++-
 3 files changed, 200 insertions(+), 21 deletions(-)

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index dcde82a..f8c268d 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -110,6 +110,9 @@ struct kmem_cache {
 #ifdef CONFIG_SYSFS
struct kobject kobj;/* For sysfs */
 #endif
+#ifdef CONFIG_SLUB_DEBUG
+   struct dentry *slab_cache_dentry;
+#endif
 #ifdef CONFIG_SLAB_FREELIST_HARDENED
unsigned long random;
 #endif
@@ -159,6 +162,13 @@ static inline void sysfs_slab_release(struct kmem_cache *s)
 }
 #endif
 
+#ifdef CONFIG_DEBUG_FS
+void debugfs_slab_release(struct kmem_cache *);
+#else
+static inline void debugfs_slab_release(struct kmem_cache *s)
+{
+}
+#endif
 void object_err(struct kmem_cache *s, struct page *page,
u8 *object, char *reason);
 
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 88e8339..fb28328 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -437,6 +437,9 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
 #else
slab_kmem_cache_release(s);
 #endif
+#ifdef CONFIG_DEBUG_FS
+   debugfs_slab_release(s);
+#endif
}
 }
 
@@ -454,6 +457,9 @@ static int shutdown_cache(struct kmem_cache *s)
 #ifdef SLAB_SUPPORTS_SYSFS
sysfs_slab_unlink(s);
 #endif
+#ifdef CONFIG_DEBUG_FS
+   debugfs_slab_release(s);
+#endif
list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
schedule_work(&slab_caches_to_rcu_destroy_work);
} else {
@@ -464,6 +470,9 @@ static int shutdown_cache(struct kmem_cache *s)
 #else
slab_kmem_cache_release(s);
 #endif
+#ifdef CONFIG_DEBUG_FS
+   debugfs_slab_release(s);
+#endif
}
 
return 0;
diff --git a/mm/slub.c b/mm/slub.c
index 3021ce9..ab7a0d3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 
+#include 
 #include 
 
 #include "internal.h"
@@ -225,6 +226,15 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
{ return 0; }
 #endif
 
+#ifdef CONFIG_DEBUG_FS
+static void debugfs_slab_add(struct kmem_cache *);
+static int debugfs_slab_alias(struct kmem_cache *, const char *);
+#else
+static inline void debugfs_slab_add(struct kmem_cache *s) { }
+static inline int debugfs_slab_alias(struct kmem_cache *s, const char *p)
+   { return 0; }
+#endif
+
 static inline void stat(const struct kmem_cache *s, enum stat_item si)
 {
 #ifdef CONFIG_SLUB_STATS
@@ -4521,6 +4531,8 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
s->refcount--;
s = NULL;
}
+
+   debugfs_slab_alias(s, name);
}
 
return s;
@@ -4542,6 +4554,8 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
if (err)
__kmem_cache_release(s);
 
+   debugfs_slab_add(s);
+
return err;
 }
 
@@ -4682,6 +4696,8 @@ static long validate_slab_cache(struct kmem_cache *s)
 
return count;
 }
+
+#ifdef CONFIG_DEBUG_FS
 /*
  * Generate lists of code addresses where slabcache objects are allocated
  * and freed.
@@ -4705,6 +4721,8 @@ struct loc_track {
struct location *loc;
 };
 
+static struct dentry *slab_debugfs_root;
+
 static void free_loc_track(struct loc_track *t)
 {
if (t->max)
@@ -4822,10 +4840,9 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
put_map(map);
 }
 
-static int list_locations(struct kmem_cache *s, char *buf,
+static int list_locations(struct seq_file *seq, struct kmem_cache *s,
  enum track_item alloc)
 {
-   int len = 0;
unsigned long i;
struct loc_track t = { 0, 0, NULL };
int node;
@@ -4833,7 +4850,8 @@ static int list_locations(struct kmem_cache *s, char *buf,
 
	if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
 GFP_KERNEL)) {
-   return sysfs_emit(buf, "Out of memory\n");
+   seq_puts(seq, "Out of memory\n");
+   return -ENOMEM;
}
/* Push back cpu slabs */
flush_all(s);
@@ -4856,46 +4874,46 @@ static int list_locations(struct kmem_cache *s, char *buf,
for (i = 0; i < t.count; i++) {
	struct location *l = &t.loc[i];
 
-   

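The rest of the v4 diff is truncated in this archive. For reference, a minimal
sketch of how a per-cache debugfs file can be wired up through seq_file is shown
below; the names slab_debugfs_root, "alloc_traces" and the use of
DEFINE_SHOW_ATTRIBUTE() are illustrative assumptions rather than the exact patch
code, but list_locations() follows the converted signature from the hunk above:

static int alloc_traces_show(struct seq_file *seq, void *v)
{
	struct kmem_cache *s = seq->private;

	if (!(s->flags & SLAB_STORE_USER))
		return -EOPNOTSUPP;

	/* Reuse the seq_file based list_locations() from this patch. */
	return list_locations(seq, s, TRACK_ALLOC);
}
DEFINE_SHOW_ATTRIBUTE(alloc_traces);

static void debugfs_slab_add(struct kmem_cache *s)
{
	struct dentry *dir;

	if (unlikely(!slab_debugfs_root))
		return;

	/* One directory per cache under the slab debugfs root. */
	dir = debugfs_create_dir(s->name, slab_debugfs_root);
	s->slab_cache_dentry = dir;

	/* Pass the cache as i_private so the show callback can find it. */
	debugfs_create_file("alloc_traces", 0400, dir, s, &alloc_traces_fops);
}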
Re: [PATCH] mm: slub: move sysfs slab alloc/free interfaces to debugfs

2021-04-06 Thread Faiyaz Mohammed
Please ignore this patch!

Thanks and regards,
Mohammed Faiyaz.

On 4/6/2021 5:55 PM, Faiyaz Mohammed wrote:
> The alloc_calls and free_calls implementation in sysfs has two issues:
> one is the PAGE_SIZE limitation of sysfs and the other is that it does
> not adhere to the "one value per file" rule.
> 
> To overcome these issues, move the alloc_calls and free_calls
> implementation to debugfs.
> 
> Signed-off-by: Faiyaz Mohammed 
> ---
>  mm/slub.c | 518 
> ++
>  1 file changed, 286 insertions(+), 232 deletions(-)
> 
> diff --git a/mm/slub.c b/mm/slub.c
> index 3021ce9..4d20ee0 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -36,6 +36,7 @@
>  #include 
>  #include 
>  
> +#include 
>  #include 
>  
>  #include "internal.h"
> @@ -225,6 +226,12 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, 
> const char *p)
>   { return 0; }
>  #endif
>  
> +#ifdef CONFIG_DEBUG_FS
> +static void debugfs_slab_add(struct kmem_cache *);
> +#else
> +static inline void debugfs_slab_add(struct kmem_cache *) { }
> +#endif
> +
>  static inline void stat(const struct kmem_cache *s, enum stat_item si)
>  {
>  #ifdef CONFIG_SLUB_STATS
> @@ -4542,6 +4549,8 @@ int __kmem_cache_create(struct kmem_cache *s, 
> slab_flags_t flags)
>   if (err)
>   __kmem_cache_release(s);
>  
> + debugfs_slab_add(s);
> +
>   return err;
>  }
>  
> @@ -4682,221 +4691,6 @@ static long validate_slab_cache(struct kmem_cache *s)
>  
>   return count;
>  }
> -/*
> - * Generate lists of code addresses where slabcache objects are allocated
> - * and freed.
> - */
> -
> -struct location {
> - unsigned long count;
> - unsigned long addr;
> - long long sum_time;
> - long min_time;
> - long max_time;
> - long min_pid;
> - long max_pid;
> - DECLARE_BITMAP(cpus, NR_CPUS);
> - nodemask_t nodes;
> -};
> -
> -struct loc_track {
> - unsigned long max;
> - unsigned long count;
> - struct location *loc;
> -};
> -
> -static void free_loc_track(struct loc_track *t)
> -{
> - if (t->max)
> - free_pages((unsigned long)t->loc,
> - get_order(sizeof(struct location) * t->max));
> -}
> -
> -static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t 
> flags)
> -{
> - struct location *l;
> - int order;
> -
> - order = get_order(sizeof(struct location) * max);
> -
> - l = (void *)__get_free_pages(flags, order);
> - if (!l)
> - return 0;
> -
> - if (t->count) {
> - memcpy(l, t->loc, sizeof(struct location) * t->count);
> - free_loc_track(t);
> - }
> - t->max = max;
> - t->loc = l;
> - return 1;
> -}
> -
> -static int add_location(struct loc_track *t, struct kmem_cache *s,
> - const struct track *track)
> -{
> - long start, end, pos;
> - struct location *l;
> - unsigned long caddr;
> - unsigned long age = jiffies - track->when;
> -
> - start = -1;
> - end = t->count;
> -
> - for ( ; ; ) {
> - pos = start + (end - start + 1) / 2;
> -
> - /*
> -  * There is nothing at "end". If we end up there
> -  * we need to add something to before end.
> -  */
> - if (pos == end)
> - break;
> -
> - caddr = t->loc[pos].addr;
> - if (track->addr == caddr) {
> -
> - l = >loc[pos];
> - l->count++;
> - if (track->when) {
> - l->sum_time += age;
> - if (age < l->min_time)
> - l->min_time = age;
> - if (age > l->max_time)
> - l->max_time = age;
> -
> - if (track->pid < l->min_pid)
> - l->min_pid = track->pid;
> - if (track->pid > l->max_pid)
> - l->max_pid = track->pid;
> -
> - cpumask_set_cpu(track->cpu,
> - to_cpumask(l->cpus));
> - }
> - node_set(page_to_nid(virt_to_page(track)), l->nodes);
> - return 1;

[PATCH v3] mm: slub: move sysfs slab alloc/free interfaces to debugfs

2021-04-06 Thread Faiyaz Mohammed
The alloc_calls and free_calls implementation in sysfs has two issues:
one is the PAGE_SIZE limitation of sysfs and the other is that it does
not adhere to the "one value per file" rule.

To overcome these issues, move the alloc_calls and free_calls
implementation to debugfs.

Signed-off-by: Faiyaz Mohammed 
---
 mm/slub.c | 518 ++
 1 file changed, 286 insertions(+), 232 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 3021ce9..4d20ee0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 
+#include 
 #include 
 
 #include "internal.h"
@@ -225,6 +226,12 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, 
const char *p)
{ return 0; }
 #endif
 
+#ifdef CONFIG_DEBUG_FS
+static void debugfs_slab_add(struct kmem_cache *);
+#else
+static inline void debugfs_slab_add(struct kmem_cache *) { }
+#endif
+
 static inline void stat(const struct kmem_cache *s, enum stat_item si)
 {
 #ifdef CONFIG_SLUB_STATS
@@ -4542,6 +4549,8 @@ int __kmem_cache_create(struct kmem_cache *s, 
slab_flags_t flags)
if (err)
__kmem_cache_release(s);
 
+   debugfs_slab_add(s);
+
return err;
 }
 
@@ -4682,221 +4691,6 @@ static long validate_slab_cache(struct kmem_cache *s)
 
return count;
 }
-/*
- * Generate lists of code addresses where slabcache objects are allocated
- * and freed.
- */
-
-struct location {
-   unsigned long count;
-   unsigned long addr;
-   long long sum_time;
-   long min_time;
-   long max_time;
-   long min_pid;
-   long max_pid;
-   DECLARE_BITMAP(cpus, NR_CPUS);
-   nodemask_t nodes;
-};
-
-struct loc_track {
-   unsigned long max;
-   unsigned long count;
-   struct location *loc;
-};
-
-static void free_loc_track(struct loc_track *t)
-{
-   if (t->max)
-   free_pages((unsigned long)t->loc,
-   get_order(sizeof(struct location) * t->max));
-}
-
-static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
-{
-   struct location *l;
-   int order;
-
-   order = get_order(sizeof(struct location) * max);
-
-   l = (void *)__get_free_pages(flags, order);
-   if (!l)
-   return 0;
-
-   if (t->count) {
-   memcpy(l, t->loc, sizeof(struct location) * t->count);
-   free_loc_track(t);
-   }
-   t->max = max;
-   t->loc = l;
-   return 1;
-}
-
-static int add_location(struct loc_track *t, struct kmem_cache *s,
-   const struct track *track)
-{
-   long start, end, pos;
-   struct location *l;
-   unsigned long caddr;
-   unsigned long age = jiffies - track->when;
-
-   start = -1;
-   end = t->count;
-
-   for ( ; ; ) {
-   pos = start + (end - start + 1) / 2;
-
-   /*
-* There is nothing at "end". If we end up there
-* we need to add something to before end.
-*/
-   if (pos == end)
-   break;
-
-   caddr = t->loc[pos].addr;
-   if (track->addr == caddr) {
-
-   l = &t->loc[pos];
-   l->count++;
-   if (track->when) {
-   l->sum_time += age;
-   if (age < l->min_time)
-   l->min_time = age;
-   if (age > l->max_time)
-   l->max_time = age;
-
-   if (track->pid < l->min_pid)
-   l->min_pid = track->pid;
-   if (track->pid > l->max_pid)
-   l->max_pid = track->pid;
-
-   cpumask_set_cpu(track->cpu,
-   to_cpumask(l->cpus));
-   }
-   node_set(page_to_nid(virt_to_page(track)), l->nodes);
-   return 1;
-   }
-
-   if (track->addr < caddr)
-   end = pos;
-   else
-   start = pos;
-   }
-
-   /*
-* Not found. Insert new tracking element.
-*/
-   if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
-   return 0;
-
-   l = t->loc + pos;
-   if (pos < t->count)
-   memmove(l + 1, l,
-   (t->count - pos) * sizeof(struct location));
-   t->count++;
-   l->count = 1;
-   l->addr = track->addr;
-   l->sum_time = age;
-   l->min_time = age;
-   l->max_time 

[PATCH] mm: slub: move sysfs slab alloc/free interfaces to debugfs

2021-04-06 Thread Faiyaz Mohammed
The alloc_calls and free_calls implementation in sysfs has two issues:
one is the PAGE_SIZE limitation of sysfs and the other is that it does
not adhere to the "one value per file" rule.

To overcome these issues, move the alloc_calls and free_calls
implementation to debugfs.

Signed-off-by: Faiyaz Mohammed 
---
 mm/slub.c | 518 ++
 1 file changed, 286 insertions(+), 232 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 3021ce9..4d20ee0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 
+#include 
 #include 
 
 #include "internal.h"
@@ -225,6 +226,12 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, 
const char *p)
{ return 0; }
 #endif
 
+#ifdef CONFIG_DEBUG_FS
+static void debugfs_slab_add(struct kmem_cache *);
+#else
+static inline void debugfs_slab_add(struct kmem_cache *) { }
+#endif
+
 static inline void stat(const struct kmem_cache *s, enum stat_item si)
 {
 #ifdef CONFIG_SLUB_STATS
@@ -4542,6 +4549,8 @@ int __kmem_cache_create(struct kmem_cache *s, 
slab_flags_t flags)
if (err)
__kmem_cache_release(s);
 
+   debugfs_slab_add(s);
+
return err;
 }
 
@@ -4682,221 +4691,6 @@ static long validate_slab_cache(struct kmem_cache *s)
 
return count;
 }
-/*
- * Generate lists of code addresses where slabcache objects are allocated
- * and freed.
- */
-
-struct location {
-   unsigned long count;
-   unsigned long addr;
-   long long sum_time;
-   long min_time;
-   long max_time;
-   long min_pid;
-   long max_pid;
-   DECLARE_BITMAP(cpus, NR_CPUS);
-   nodemask_t nodes;
-};
-
-struct loc_track {
-   unsigned long max;
-   unsigned long count;
-   struct location *loc;
-};
-
-static void free_loc_track(struct loc_track *t)
-{
-   if (t->max)
-   free_pages((unsigned long)t->loc,
-   get_order(sizeof(struct location) * t->max));
-}
-
-static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags)
-{
-   struct location *l;
-   int order;
-
-   order = get_order(sizeof(struct location) * max);
-
-   l = (void *)__get_free_pages(flags, order);
-   if (!l)
-   return 0;
-
-   if (t->count) {
-   memcpy(l, t->loc, sizeof(struct location) * t->count);
-   free_loc_track(t);
-   }
-   t->max = max;
-   t->loc = l;
-   return 1;
-}
-
-static int add_location(struct loc_track *t, struct kmem_cache *s,
-   const struct track *track)
-{
-   long start, end, pos;
-   struct location *l;
-   unsigned long caddr;
-   unsigned long age = jiffies - track->when;
-
-   start = -1;
-   end = t->count;
-
-   for ( ; ; ) {
-   pos = start + (end - start + 1) / 2;
-
-   /*
-* There is nothing at "end". If we end up there
-* we need to add something to before end.
-*/
-   if (pos == end)
-   break;
-
-   caddr = t->loc[pos].addr;
-   if (track->addr == caddr) {
-
-   l = &t->loc[pos];
-   l->count++;
-   if (track->when) {
-   l->sum_time += age;
-   if (age < l->min_time)
-   l->min_time = age;
-   if (age > l->max_time)
-   l->max_time = age;
-
-   if (track->pid < l->min_pid)
-   l->min_pid = track->pid;
-   if (track->pid > l->max_pid)
-   l->max_pid = track->pid;
-
-   cpumask_set_cpu(track->cpu,
-   to_cpumask(l->cpus));
-   }
-   node_set(page_to_nid(virt_to_page(track)), l->nodes);
-   return 1;
-   }
-
-   if (track->addr < caddr)
-   end = pos;
-   else
-   start = pos;
-   }
-
-   /*
-* Not found. Insert new tracking element.
-*/
-   if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max, GFP_ATOMIC))
-   return 0;
-
-   l = t->loc + pos;
-   if (pos < t->count)
-   memmove(l + 1, l,
-   (t->count - pos) * sizeof(struct location));
-   t->count++;
-   l->count = 1;
-   l->addr = track->addr;
-   l->sum_time = age;
-   l->min_time = age;
-   l->max_time 

Re: [PATCH v2] mm: slub: Convert sys slab alloc_calls, free_calls to bin attribute

2021-02-17 Thread Faiyaz Mohammed
+linux-mm, linux-kernel.

On 2/17/2021 12:01 PM, Faiyaz Mohammed wrote:
> Reading the sys slab alloc_calls, free_calls returns the available object
> owners, but the size of this file is limited to PAGE_SIZE
> because of the limitation of sysfs attributes, it is returning the
> partial owner info, which is not sufficient to debug/account the slab
> memory and alloc_calls output is not matching with /proc/slabinfo.
> 
> To remove the PAGE_SIZE limitation converted the sys slab
> alloc_calls, free_calls to bin attribute.
> 
> Signed-off-by: Faiyaz Mohammed 
> ---
>  mm/slub.c | 84 
> +++
>  1 file changed, 63 insertions(+), 21 deletions(-)
> 
> diff --git a/mm/slub.c b/mm/slub.c
> index b22a4b1..71cfe3b 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -37,6 +37,9 @@
>  
>  #include 
>  
> +#define TRACE_ENTRY_MAX 80
> +#define TRACKS_PER_PAGE  ((PAGE_SIZE - KSYM_SYMBOL_LEN - 100) / 
> TRACE_ENTRY_MAX)
> +
>  #include "internal.h"
>  
>  /*
> @@ -4748,6 +4751,7 @@ static int list_locations(struct kmem_cache *s, char 
> *buf,
>   struct loc_track t = { 0, 0, NULL };
>   int node;
>   struct kmem_cache_node *n;
> + unsigned int previous_read_count = 0;
>  
>   if (!alloc_loc_track(, PAGE_SIZE / sizeof(struct location),
>GFP_KERNEL)) {
> @@ -4756,6 +4760,11 @@ static int list_locations(struct kmem_cache *s, char 
> *buf,
>   /* Push back cpu slabs */
>   flush_all(s);
>  
> + if (offset != 0)
> + previous_read_count = (offset / TRACE_ENTRY_MAX);
> +
> + memset(buf, 0, PAGE_SIZE);
> +
>   for_each_kmem_cache_node(s, node, n) {
>   unsigned long flags;
>   struct page *page;
> @@ -4771,48 +4780,62 @@ static int list_locations(struct kmem_cache *s, char 
> *buf,
>   spin_unlock_irqrestore(>list_lock, flags);
>   }
>  
> - for (i = 0; i < t.count; i++) {
> + for (i = previous_read_count; i < t.count; i++) {
>   struct location *l = [i];
> + unsigned int cur_len = 0;
>  
> - len += sysfs_emit_at(buf, len, "%7ld ", l->count);
> + cur_len += sysfs_emit_at(buf, cur_len + len, "%7ld ", l->count);
>  
>   if (l->addr)
> - len += sysfs_emit_at(buf, len, "%pS", (void *)l->addr);
> + cur_len += sysfs_emit_at(buf, cur_len + len, "%pS", 
> (void *)l->addr);
>   else
> - len += sysfs_emit_at(buf, len, "");
> + cur_len += sysfs_emit_at(buf, cur_len + len, 
> "");
>  
>   if (l->sum_time != l->min_time)
> - len += sysfs_emit_at(buf, len, " age=%ld/%ld/%ld",
> + cur_len += sysfs_emit_at(buf, cur_len + len, " 
> age=%ld/%ld/%ld",
>l->min_time,
>(long)div_u64(l->sum_time,
>  l->count),
>l->max_time);
>   else
> - len += sysfs_emit_at(buf, len, " age=%ld", l->min_time);
> + cur_len += sysfs_emit_at(buf, cur_len + len, " 
> age=%ld", l->min_time);
>  
>   if (l->min_pid != l->max_pid)
> - len += sysfs_emit_at(buf, len, " pid=%ld-%ld",
> + cur_len += sysfs_emit_at(buf, cur_len + len, " 
> pid=%ld-%ld",
>l->min_pid, l->max_pid);
>   else
> - len += sysfs_emit_at(buf, len, " pid=%ld",
> + cur_len += sysfs_emit_at(buf, cur_len + len, " pid=%ld",
>l->min_pid);
>  
>   if (num_online_cpus() > 1 &&
>   !cpumask_empty(to_cpumask(l->cpus)))
> - len += sysfs_emit_at(buf, len, " cpus=%*pbl",
> + cur_len += sysfs_emit_at(buf, cur_len + len, " 
> cpus=%*pbl",
>
> cpumask_pr_args(to_cpumask(l->cpus)));
>  
>   if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
> - len += sysfs_emit_at(buf, len, " nodes=%*pbl",
> + cur_len += sysfs_emit_at(buf, cur_len + len, " 
> nodes=%*pbl",
&g

Re: [PATCH] mm: slub: Convert sys slab alloc_calls, free_calls to bin attribute

2021-02-16 Thread Faiyaz Mohammed
Hi Vlastimil,

On 1/13/2021 9:35 PM, Vlastimil Babka wrote:
> On 1/12/21 10:21 AM, Faiyaz Mohammed wrote:
>> Reading the sys slab alloc_calls, free_calls returns the available object
>> owners, but the size of this file is limited to PAGE_SIZE
>> because of the limitation of sysfs attributes, it is returning the
>> partial owner info, which is not sufficient to debug/account the slab
>> memory and alloc_calls output is not matching with /proc/slabinfo.
>>
>> To remove the PAGE_SIZE limitation converted the sys slab
>> alloc_calls, free_calls to bin attribute.
>>
>> Signed-off-by: Faiyaz Mohammed 
>> ---
>>  mm/slub.c | 61 +++--
>>  1 file changed, 47 insertions(+), 14 deletions(-)
>>
>> diff --git a/mm/slub.c b/mm/slub.c
>> index b52384e..8744e5ec 100644
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>> @@ -4710,13 +4710,14 @@ static void process_slab(struct loc_track *t, struct 
>> kmem_cache *s,
>>  }
>>  
>>  static int list_locations(struct kmem_cache *s, char *buf,
>> -enum track_item alloc)
>> +loff_t offset, enum track_item alloc)
>>  {
>>  int len = 0;
>>  unsigned long i;
>>  struct loc_track t = { 0, 0, NULL };
>>  int node;
>>  struct kmem_cache_node *n;
>> +static unsigned int previous_read_count;
> 
> Hmm static? What about parallel reads from different files? I guess you'll 
> have
> to somehow employ the offset parameter here and it won't be pretty, because 
> you
> are still printing free text and not some fixed-size binary chunks where 
> seeking
> is simple.
To avoid the static variable, I used a small scheme that fixes the maximum
track entry length and the number of tracks per page. Please find patch v2.

> Also it's wasteful to to repeat the data gathering for each pritned page, 
> you'd
> need a mechanism that allows holding private data between printing out the
> pages. If bin_attribute doesn't have that, you'd need e.g. seq_file which we 
> use
> for /proc/pid/(s)maps etc.>I think seq_file implementation is not feasible 
> with sysfs and I didn't
find any example in kernel. If we want to use seq_file than I guess we
have to move slab in debugfs.


>>  unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
> 
> This line doesn't exist since 90e9f6a66c78f in v5.6-rc1, is the patch based on
> an old kernel?
> 
Updated the patch v2.

>>  if (!map || !alloc_loc_track(, PAGE_SIZE / sizeof(struct location),
>> @@ -4742,11 +4743,9 @@ static int list_locations(struct kmem_cache *s, char 
>> *buf,
>>  spin_unlock_irqrestore(>list_lock, flags);
>>  }
>>  
>> -for (i = 0; i < t.count; i++) {
>> +for (i = previous_read_count; i < t.count; i++) {
>>  struct location *l = [i];
>>  
>> -if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
>> -break;
>>  len += sprintf(buf + len, "%7ld ", l->count);
>>  
>>  if (l->addr)
>> @@ -4784,12 +4783,20 @@ static int list_locations(struct kmem_cache *s, char 
>> *buf,
>>   nodemask_pr_args(>nodes));
>>  
>>  len += sprintf(buf + len, "\n");
>> +
>> +if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100) {
>> +previous_read_count = i + 1;
>> +break;
>> +}
>>  }
>>  
>> +if ((offset != 0) && ((i >= t.count) || (previous_read_count > 
>> t.count))) {
>> +previous_read_count = 0;
>> +len = 0;
>> +} else if (!t.count)
>> +len += sprintf(buf, "No data\n");
>>  free_loc_track();
>>  bitmap_free(map);
>> -if (!t.count)
>> -len += sprintf(buf, "No data\n");
>>  return len;
>>  }
>>  
>> @@ -5180,6 +5187,7 @@ static int any_slab_objects(struct kmem_cache *s)
>>  
>>  struct slab_attribute {
>>  struct attribute attr;
>> +struct bin_attribute bin_attr;
>>  ssize_t (*show)(struct kmem_cache *s, char *buf);
>>  ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
>>  };
>> @@ -5192,6 +5200,12 @@ struct slab_attribute {
>>  static struct slab_attribute _name##_attr =  \
>>  __ATTR(_name, 0600, _name##_show, _name##_store)
>>  
>> +#define SLAB_BIN_ATTR_RO(_name) \
>> +static struct slab_attribute _name##_attr = 

Re: [PATCH] mm: slub: Convert sys slab alloc_calls, free_calls to bin attribute

2021-02-16 Thread Faiyaz Mohammed
Hi Matthew,

On 1/12/2021 5:52 PM, Matthew Wilcox wrote:
> On Tue, Jan 12, 2021 at 02:51:27PM +0530, Faiyaz Mohammed wrote:
>> @@ -5180,6 +5187,7 @@ static int any_slab_objects(struct kmem_cache *s)
>>  
>>  struct slab_attribute {
>>  struct attribute attr;
>> +struct bin_attribute bin_attr;
>>  ssize_t (*show)(struct kmem_cache *s, char *buf);
>>  ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
>>  };
> 
> I'd rather you added a struct slab_bin_attribute.  If that's even
> needed ..  I think you could just use the bin_attribute directly instead
> of embedding it in this struct.
> 
Yes, we can use bin_attribute directly. Please find patch v2.
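
For reference, a minimal sketch of what using a bin_attribute directly could
look like; the read callback body mirrors the earlier patch, while BIN_ATTR_RO()
and the bin_attrs array wiring are illustrative assumptions rather than the
exact v2 code:

static ssize_t alloc_calls_read(struct file *filp, struct kobject *kobj,
				struct bin_attribute *bin_attr, char *buf,
				loff_t offset, size_t count)
{
	struct kmem_cache *s = to_slab(kobj);

	if (!(s->flags & SLAB_STORE_USER))
		return -ENOSYS;

	return list_locations(s, buf, offset, TRACK_ALLOC);
}
/* BIN_ATTR_RO(alloc_calls, 0) defines bin_attr_alloc_calls and hooks up
 * alloc_calls_read() as its ->read callback; size 0 means sysfs does not
 * clamp reads to a fixed file size. */
static BIN_ATTR_RO(alloc_calls, 0);

static struct bin_attribute *slab_bin_attrs[] = {
	&bin_attr_alloc_calls,
	NULL,
};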



Re: [PATCH] mm: slub: Convert sys slab alloc_calls, free_calls to bin attribute

2021-01-24 Thread Faiyaz Mohammed



On 1/13/2021 9:35 PM, Vlastimil Babka wrote:
> On 1/12/21 10:21 AM, Faiyaz Mohammed wrote:
>> Reading the sys slab alloc_calls, free_calls returns the available object
>> owners, but the size of this file is limited to PAGE_SIZE
>> because of the limitation of sysfs attributes, it is returning the
>> partial owner info, which is not sufficient to debug/account the slab
>> memory and alloc_calls output is not matching with /proc/slabinfo.
>>
>> To remove the PAGE_SIZE limitation converted the sys slab
>> alloc_calls, free_calls to bin attribute.
>>
>> Signed-off-by: Faiyaz Mohammed 
>> ---
>>  mm/slub.c | 61 +++--
>>  1 file changed, 47 insertions(+), 14 deletions(-)
>>
>> diff --git a/mm/slub.c b/mm/slub.c
>> index b52384e..8744e5ec 100644
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>> @@ -4710,13 +4710,14 @@ static void process_slab(struct loc_track *t, struct 
>> kmem_cache *s,
>>  }
>>  
>>  static int list_locations(struct kmem_cache *s, char *buf,
>> -enum track_item alloc)
>> +loff_t offset, enum track_item alloc)
>>  {
>>  int len = 0;
>>  unsigned long i;
>>  struct loc_track t = { 0, 0, NULL };
>>  int node;
>>  struct kmem_cache_node *n;
>> +static unsigned int previous_read_count;
> 
> Hmm static? What about parallel reads from different files? I guess you'll 
> have
> to somehow employ the offset parameter here and it won't be pretty, because 
> you
> are still printing free text and not some fixed-size binary chunks where 
> seeking
> is simple.
> Also it's wasteful to to repeat the data gathering for each pritned page, 
> you'd
> need a mechanism that allows holding private data between printing out the
> pages. If bin_attribute doesn't have that, you'd need e.g. seq_file which we 
> use
> for /proc/pid/(s)maps etc.
> 

Sorry for the delayed response, I was on vacation. I will get back to you
on seq_file feasibility.

>>  unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
> 
> This line doesn't exist since 90e9f6a66c78f in v5.6-rc1, is the patch based on
> an old kernel?
> 
>>  if (!map || !alloc_loc_track(, PAGE_SIZE / sizeof(struct location),
>> @@ -4742,11 +4743,9 @@ static int list_locations(struct kmem_cache *s, char 
>> *buf,
>>  spin_unlock_irqrestore(>list_lock, flags);
>>  }
>>  
>> -for (i = 0; i < t.count; i++) {
>> +for (i = previous_read_count; i < t.count; i++) {
>>  struct location *l = [i];
>>  
>> -if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
>> -break;
>>  len += sprintf(buf + len, "%7ld ", l->count);
>>  
>>  if (l->addr)
>> @@ -4784,12 +4783,20 @@ static int list_locations(struct kmem_cache *s, char 
>> *buf,
>>   nodemask_pr_args(>nodes));
>>  
>>  len += sprintf(buf + len, "\n");
>> +
>> +if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100) {
>> +previous_read_count = i + 1;
>> +break;
>> +}
>>  }
>>  
>> +if ((offset != 0) && ((i >= t.count) || (previous_read_count > 
>> t.count))) {
>> +previous_read_count = 0;
>> +len = 0;
>> +} else if (!t.count)
>> +len += sprintf(buf, "No data\n");
>>  free_loc_track();
>>  bitmap_free(map);
>> -if (!t.count)
>> -len += sprintf(buf, "No data\n");
>>  return len;
>>  }
>>  
>> @@ -5180,6 +5187,7 @@ static int any_slab_objects(struct kmem_cache *s)
>>  
>>  struct slab_attribute {
>>  struct attribute attr;
>> +struct bin_attribute bin_attr;
>>  ssize_t (*show)(struct kmem_cache *s, char *buf);
>>  ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
>>  };
>> @@ -5192,6 +5200,12 @@ struct slab_attribute {
>>  static struct slab_attribute _name##_attr =  \
>>  __ATTR(_name, 0600, _name##_show, _name##_store)
>>  
>> +#define SLAB_BIN_ATTR_RO(_name) \
>> +static struct slab_attribute _name##_attr = { \
>> +.bin_attr = \
>> +__BIN_ATTR_RO(_name, 0) \
>> +} \
>> +
>>  static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
>>  {
>>  return sprintf(buf, 


Re: [PATCH] KVM: x86: VMX: Make smaller physical guest address space support user-configurable

2021-01-18 Thread Mohammed Gamal
On Fri, 2021-01-15 at 16:08 -0800, Jim Mattson wrote:
> On Thu, Sep 3, 2020 at 7:12 AM Mohammed Gamal 
> wrote:
> > 
> > This patch exposes allow_smaller_maxphyaddr to the user as a module
> > parameter.
> > 
> > Since smaller physical address spaces are only supported on VMX,
> > the parameter
> > is only exposed in the kvm_intel module.
> > Modifications to VMX page fault and EPT violation handling will
> > depend on whether
> > that parameter is enabled.
> > 
> > Also disable support by default, and let the user decide if they
> > want to enable
> > it.
> > 
> > Signed-off-by: Mohammed Gamal 
> > ---
> >  arch/x86/kvm/vmx/vmx.c | 15 ++-
> >  arch/x86/kvm/vmx/vmx.h |  3 +++
> >  arch/x86/kvm/x86.c |  2 +-
> >  3 files changed, 10 insertions(+), 10 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index 819c185adf09..dc778c7b5a06 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -129,6 +129,9 @@ static bool __read_mostly
> > enable_preemption_timer = 1;
> >  module_param_named(preemption_timer, enable_preemption_timer,
> > bool, S_IRUGO);
> >  #endif
> > 
> > +extern bool __read_mostly allow_smaller_maxphyaddr;
> 
> Since this variable is in the kvm module rather than the kvm_intel
> module, its current setting is preserved across "rmmod kvm_intel;
> modprobe kvm_intel." That is, if set to true, it doesn't revert to
> false after "rmmod kvm_intel." Is that the intended behavior?
> 

IIRC, this is because this setting was indeed not intended to be just
VMX-specific, but since AMD has an issue with PTE accessed-bits being
set by hardware and thus we can't yet enable this feature on it, it
might make sense to move the variable to the kvm_intel module for now.

Paolo, what do you think?
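
Moving the knob into kvm_intel would roughly mean defining the variable in
vmx.c instead of declaring it extern there; a minimal, untested sketch of that
direction (ignoring the uses in x86.c that would need to be reworked):

/* In arch/x86/kvm/vmx/vmx.c: own the variable in the kvm_intel module so
 * it is reset to its default whenever kvm_intel is reloaded. */
static bool __read_mostly allow_smaller_maxphyaddr;
module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);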





Re: [PATCH] mm: slub: Convert sys slab alloc_calls, free_calls to bin attribute

2021-01-12 Thread Faiyaz Mohammed



On 1/12/2021 5:52 PM, Matthew Wilcox wrote:
> On Tue, Jan 12, 2021 at 02:51:27PM +0530, Faiyaz Mohammed wrote:
>> @@ -5180,6 +5187,7 @@ static int any_slab_objects(struct kmem_cache *s)
>>  
>>  struct slab_attribute {
>>  struct attribute attr;
>> +struct bin_attribute bin_attr;
>>  ssize_t (*show)(struct kmem_cache *s, char *buf);
>>  ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
>>  };
> 
> I'd rather you added a struct slab_bin_attribute.  If that's even
> needed ..  I think you could just use the bin_attribute directly instead
> of embedding it in this struct.
> 

Okay, I will use bin_attribute directly. I had added bin_attribute inside
slab_attribute to keep the code similar to the other sysfs attributes,
since all of the slab attributes embed struct slab_attribute.



[PATCH] mm: slub: Convert sys slab alloc_calls, free_calls to bin attribute

2021-01-12 Thread Faiyaz Mohammed
Reading the sysfs slab alloc_calls and free_calls files returns the
available object owners, but the size of these files is limited to
PAGE_SIZE because of the limitation of sysfs attributes. As a result only
partial owner info is returned, which is not sufficient to debug/account
slab memory, and the alloc_calls output does not match /proc/slabinfo.

To remove the PAGE_SIZE limitation, convert the sysfs slab
alloc_calls and free_calls files to bin attributes.

Signed-off-by: Faiyaz Mohammed 
---
 mm/slub.c | 61 +++--
 1 file changed, 47 insertions(+), 14 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index b52384e..8744e5ec 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4710,13 +4710,14 @@ static void process_slab(struct loc_track *t, struct 
kmem_cache *s,
 }
 
 static int list_locations(struct kmem_cache *s, char *buf,
-   enum track_item alloc)
+   loff_t offset, enum track_item alloc)
 {
int len = 0;
unsigned long i;
struct loc_track t = { 0, 0, NULL };
int node;
struct kmem_cache_node *n;
+   static unsigned int previous_read_count;
unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
 
	if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
@@ -4742,11 +4743,9 @@ static int list_locations(struct kmem_cache *s, char 
*buf,
	spin_unlock_irqrestore(&n->list_lock, flags);
}
 
-   for (i = 0; i < t.count; i++) {
+   for (i = previous_read_count; i < t.count; i++) {
	struct location *l = &t.loc[i];
 
-   if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
-   break;
len += sprintf(buf + len, "%7ld ", l->count);
 
if (l->addr)
@@ -4784,12 +4783,20 @@ static int list_locations(struct kmem_cache *s, char 
*buf,
	nodemask_pr_args(&l->nodes));
 
len += sprintf(buf + len, "\n");
+
+   if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100) {
+   previous_read_count = i + 1;
+   break;
+   }
}
 
+   if ((offset != 0) && ((i >= t.count) || (previous_read_count > t.count))) {
+   previous_read_count = 0;
+   len = 0;
+   } else if (!t.count)
+   len += sprintf(buf, "No data\n");
	free_loc_track(&t);
bitmap_free(map);
-   if (!t.count)
-   len += sprintf(buf, "No data\n");
return len;
 }
 
@@ -5180,6 +5187,7 @@ static int any_slab_objects(struct kmem_cache *s)
 
 struct slab_attribute {
struct attribute attr;
+   struct bin_attribute bin_attr;
ssize_t (*show)(struct kmem_cache *s, char *buf);
ssize_t (*store)(struct kmem_cache *s, const char *x, size_t count);
 };
@@ -5192,6 +5200,12 @@ struct slab_attribute {
static struct slab_attribute _name##_attr =  \
__ATTR(_name, 0600, _name##_show, _name##_store)
 
+#define SLAB_BIN_ATTR_RO(_name) \
+   static struct slab_attribute _name##_attr = { \
+   .bin_attr = \
+   __BIN_ATTR_RO(_name, 0) \
+   } \
+
 static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
 {
return sprintf(buf, "%u\n", s->size);
@@ -5535,21 +5549,33 @@ static ssize_t validate_store(struct kmem_cache *s,
 }
 SLAB_ATTR(validate);
 
-static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
+static ssize_t alloc_calls_read(struct file *filp, struct kobject *kobj,
+   struct bin_attribute *bin_attr, char *buf,
+   loff_t offset, size_t count)
 {
+   struct kmem_cache *s;
+
+   s = to_slab(kobj);
if (!(s->flags & SLAB_STORE_USER))
return -ENOSYS;
-   return list_locations(s, buf, TRACK_ALLOC);
+
+   return list_locations(s, buf, offset, TRACK_ALLOC);
 }
-SLAB_ATTR_RO(alloc_calls);
+SLAB_BIN_ATTR_RO(alloc_calls);
 
-static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
+static ssize_t free_calls_read(struct file *filp, struct kobject *kobj,
+   struct bin_attribute *bin_attr, char *buf,
+   loff_t offset, size_t count)
 {
+   struct kmem_cache *s;
+
+   s = to_slab(kobj);
if (!(s->flags & SLAB_STORE_USER))
return -ENOSYS;
-   return list_locations(s, buf, TRACK_FREE);
+
+   return list_locations(s, buf, offset, TRACK_FREE);
 }
-SLAB_ATTR_RO(free_calls);
+SLAB_BIN_ATTR_RO(free_calls);
 #endif /* CONFIG_SLUB_DEBUG */
 
 #ifdef CONFIG_FAILSLAB
@@ -5694,6 +5720,14 @@ STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
 STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
 #endif /* CONFIG_SLUB_STATS */
 
+
+static struct bin_attribute *slab_bin_attrs[] = {
+#ifde

[PATCH v2] mm: memblock: drop __init from memblock functions to make it inline

2020-11-16 Thread Faiyaz Mohammed
__init is used together with inline, due to which the memblock wrapper
functions are not getting inlined.
for example:
[0.00] memblock_alloc_try_nid: 1490 bytes align=0x40 nid=-1 
from=0x max_addr=0x memblock_alloc+0x20/0x2c
[0.00] memblock_reserve: [0x00023f09a3c0-0x00023f09a991] 
memblock_alloc_range_nid+0xc0/0x188

Drop __init from the memblock wrapper functions to make them inline, which
increases debuggability.
After:
[0.00] memblock_alloc_try_nid: 1490 bytes align=0x40 nid=-1 
from=0x max_addr=0x start_kernel+0xa4/0x568
[0.00] memblock_reserve: [0x00023f09a3c0-0x00023f09a991] 
memblock_alloc_range_nid+0xc0/0x188

Signed-off-by: Faiyaz Mohammed 
---
 include/linux/memblock.h | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index ef13125..f78113f 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -404,13 +404,13 @@ void *memblock_alloc_try_nid(phys_addr_t size, 
phys_addr_t align,
 phys_addr_t min_addr, phys_addr_t max_addr,
 int nid);
 
-static inline void * __init memblock_alloc(phys_addr_t size,  phys_addr_t 
align)
+static inline void *memblock_alloc(phys_addr_t size,  phys_addr_t align)
 {
return memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT,
  MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE);
 }
 
-static inline void * __init memblock_alloc_raw(phys_addr_t size,
+static inline void *memblock_alloc_raw(phys_addr_t size,
   phys_addr_t align)
 {
return memblock_alloc_try_nid_raw(size, align, MEMBLOCK_LOW_LIMIT,
@@ -418,7 +418,7 @@ static inline void * __init memblock_alloc_raw(phys_addr_t 
size,
  NUMA_NO_NODE);
 }
 
-static inline void * __init memblock_alloc_from(phys_addr_t size,
+static inline void *memblock_alloc_from(phys_addr_t size,
phys_addr_t align,
phys_addr_t min_addr)
 {
@@ -426,33 +426,33 @@ static inline void * __init 
memblock_alloc_from(phys_addr_t size,
  MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE);
 }
 
-static inline void * __init memblock_alloc_low(phys_addr_t size,
+static inline void *memblock_alloc_low(phys_addr_t size,
   phys_addr_t align)
 {
return memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT,
  ARCH_LOW_ADDRESS_LIMIT, NUMA_NO_NODE);
 }
 
-static inline void * __init memblock_alloc_node(phys_addr_t size,
+static inline void *memblock_alloc_node(phys_addr_t size,
phys_addr_t align, int nid)
 {
return memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT,
  MEMBLOCK_ALLOC_ACCESSIBLE, nid);
 }
 
-static inline void __init memblock_free_early(phys_addr_t base,
+static inline void memblock_free_early(phys_addr_t base,
  phys_addr_t size)
 {
memblock_free(base, size);
 }
 
-static inline void __init memblock_free_early_nid(phys_addr_t base,
+static inline void memblock_free_early_nid(phys_addr_t base,
  phys_addr_t size, int nid)
 {
memblock_free(base, size);
 }
 
-static inline void __init memblock_free_late(phys_addr_t base, phys_addr_t 
size)
+static inline void memblock_free_late(phys_addr_t base, phys_addr_t size)
 {
__memblock_free_late(base, size);
 }
@@ -460,7 +460,7 @@ static inline void __init memblock_free_late(phys_addr_t 
base, phys_addr_t size)
 /*
  * Set the allocation direction to bottom-up or top-down.
  */
-static inline void __init memblock_set_bottom_up(bool enable)
+static inline void memblock_set_bottom_up(bool enable)
 {
memblock.bottom_up = enable;
 }
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a
member of the Code Aurora Forum, hosted by The Linux Foundation

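To illustrate why the wrapper has to be truly inlined for the logged caller to
be meaningful: _RET_IP_ is essentially the return address of the current
function, so when memblock_alloc() exists out of line, memblock_alloc_try_nid()
records an address inside memblock_alloc() rather than the real caller. A small
standalone C sketch (userspace, illustrative only):

#include <stdio.h>

static void core_alloc(void)
{
	/* Prints the address this call will return to, i.e. the direct caller;
	 * this mirrors what _RET_IP_ captures in memblock_alloc_try_nid(). */
	printf("caller: %p\n", __builtin_return_address(0));
}

/* Without always_inline, core_alloc() would report an address inside
 * wrapper_alloc(); with it, the report points into the real caller. */
static inline __attribute__((always_inline)) void wrapper_alloc(void)
{
	core_alloc();
}

int main(void)
{
	wrapper_alloc();	/* reports an address inside main() */
	return 0;
}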


[PATCH v2] mm: memblock: add more debug logs

2020-11-15 Thread Faiyaz Mohammed
It is useful to know the exact caller of memblock_phys_alloc_range() to
track early memory reservations during development.

Currently, when memblock debugging is enabled, the allocations done with
memblock_phys_alloc_range() are only reported at memblock_reserve():

[0.00] memblock_reserve: [0x00023fc6b000-0x00023fc6bfff] 
memblock_alloc_range_nid+0xc0/0x188

Add memblock_dbg() to memblock_phys_alloc_range() to get details about
its usage.

For example:

[0.00] memblock_phys_alloc_range: 4096 bytes align=0x1000 
from=0x max_addr=0x 
early_pgtable_alloc+0x24/0x178
[0.00] memblock_reserve: [0x00023fc6b000-0x00023fc6bfff] 
memblock_alloc_range_nid+0xc0/0x188

Signed-off-by: Faiyaz Mohammed 
---
 mm/memblock.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/memblock.c b/mm/memblock.c
index 049df41..f65af9f 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1419,6 +1419,9 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t 
size,
 phys_addr_t start,
 phys_addr_t end)
 {
+   memblock_dbg("%s: %llu bytes align=0x%llx from=%pa max_addr=%pa %pS\n",
+   __func__, (u64)size, (u64)align, &start, &end,
+   (void *)_RET_IP_);
return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
false);
 }
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a
member of the Code Aurora Forum, hosted by The Linux Foundation

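For context, these lines are only printed when booting with memblock=debug;
memblock_dbg() in mm/memblock.c is roughly the following (paraphrased, details
vary between kernel versions):

static int memblock_debug __initdata_memblock;

#define memblock_dbg(fmt, ...)						\
	do {								\
		if (memblock_debug)					\
			pr_info(fmt, ##__VA_ARGS__);			\
	} while (0)

/* memblock_debug is switched on by the early "memblock=debug" parameter. */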


Re: [PATCH] mm: memblock: always inline memblock_alloc

2020-11-11 Thread Faiyaz Mohammed



On 11/11/2020 12:11 AM, Mike Rapoport wrote:

Hi,

On Tue, Nov 10, 2020 at 05:50:17PM +0530, Faiyaz Mohammed wrote:

Since memblock_alloc is not getting inlined, memblock_reserve owner info
is lost. Below information is not enough for memory accounting.
for example:
[0.00] memblock_alloc_try_nid: 1490 bytes align=0x40 nid=-1 
from=0x max_addr=0x memblock_alloc+0x20/0x2c
[0.00] memblock_reserve: [0x00023f09a3c0-0x00023f09a991] 
memblock_alloc_range_nid+0xc0/0x188

Add "__always_inline" to make sure it gets inlined and to get the exact
owner of the memblock_reserve.
After adding __always_inline:
[0.00] memblock_alloc_try_nid: 1490 bytes align=0x40 nid=-1 
from=0x max_addr=0x start_kernel+0xa4/0x568
[0.00] memblock_reserve: [0x00023f09a3c0-0x00023f09a991] 
memblock_alloc_range_nid+0xc0/0x188
  
I agree that making memblock_alloc() inline as well as other similar
wrappers would improve the debugability.
Still, it has nothing to do with memory accounting and owner tracking.
Please update the patch description to better explain what it actually
improves.


As described in the other thread, we do memblock reserved accounting to
track owners, i.e. to know the size of memory allocated by different
drivers/owners through memblock_reserve(), which helps in comparing
different kernel versions and in optimizations.



Signed-off-by: Faiyaz Mohammed 
---
  include/linux/memblock.h | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index ef13125..54f9544 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -404,7 +404,7 @@ void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t 
align,
 phys_addr_t min_addr, phys_addr_t max_addr,
 int nid);
  
-static inline void * __init memblock_alloc(phys_addr_t size,  phys_addr_t align)

+static __always_inline void * __init memblock_alloc(phys_addr_t size,  
phys_addr_t align)

I think simply dropping __init here will make memblock_alloc() inline.
There are also several more convenience wrappers marked __init, do you
mind removing the __init annotation for them as well?
Yes, if we drop __init, memblock_alloc will get inlined, but would it not
increase the kernel footprint, as the function will no longer be released
after kernel init?
init?



  {
return memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT,
  MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE);


Re: [PATCH] mm: memblock: add more debug logs

2020-11-11 Thread Faiyaz Mohammed



On 11/10/2020 11:46 PM, Mike Rapoport wrote:

Hi,

On Tue, Nov 10, 2020 at 05:49:58PM +0530, Faiyaz Mohammed wrote:

Exact caller of memblock_phys_alloc_range is not available with
"memblock=debug". Below information is not enough for memory accounting.
for example:
[0.00] memblock_reserve: [0x00023fc6b000-0x00023fc6bfff] 
memblock_alloc_range_nid+0xc0/0x188

To enhance the memblock_dbg information or to get the exact owner of the
memblock_reserve, add debug logs in memblock_phys_alloc_range function.

Why would you want to use memblock=debug for memory accounting or for
tracking of the owners of the reserved memory?

We do memblock reserved accounting to track owners, i.e. to know the size
of memory allocated by different drivers/owners through memblock_reserve(),
which helps in comparing different kernel versions and in optimizations.




After adding logs:
[0.00] memblock_phys_alloc_range: 4096 bytes align=0x1000 
from=0x max_addr=0x 
early_pgtable_alloc+0x24/0x178
[0.00] memblock_reserve: [0x00023fc6b000-0x00023fc6bfff] 
memblock_alloc_range_nid+0xc0/0x188

Signed-off-by: Faiyaz Mohammed 
---
  mm/memblock.c | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/mm/memblock.c b/mm/memblock.c
index 049df41..f65af9f 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1419,6 +1419,9 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t 
size,
 phys_addr_t start,
 phys_addr_t end)
  {
+   memblock_dbg("%s: %llu bytes align=0x%llx from=%pa max_addr=%pa %pS\n",
+   __func__, (u64)size, (u64)align, , ,
+   (void *)_RET_IP_);
return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
false);
  }
--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a
member of the Code Aurora Forum, hosted by The Linux Foundation



[PATCH] mm: memblock: always inline memblock_alloc

2020-11-10 Thread Faiyaz Mohammed
Since memblock_alloc is not getting inlined, memblock_reserve owner info
is lost. Below information is not enough for memory accounting.
for example:
[0.00] memblock_alloc_try_nid: 1490 bytes align=0x40 nid=-1 
from=0x max_addr=0x memblock_alloc+0x20/0x2c
[0.00] memblock_reserve: [0x00023f09a3c0-0x00023f09a991] 
memblock_alloc_range_nid+0xc0/0x188

Add "__always_inline" to make sure it gets inlined and to get the exact
owner of the memblock_reserve.
After adding __always_inline:
[0.00] memblock_alloc_try_nid: 1490 bytes align=0x40 nid=-1 
from=0x max_addr=0x start_kernel+0xa4/0x568
[0.00] memblock_reserve: [0x00023f09a3c0-0x00023f09a991] 
memblock_alloc_range_nid+0xc0/0x188

Signed-off-by: Faiyaz Mohammed 
---
 include/linux/memblock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index ef13125..54f9544 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -404,7 +404,7 @@ void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t 
align,
 phys_addr_t min_addr, phys_addr_t max_addr,
 int nid);
 
-static inline void * __init memblock_alloc(phys_addr_t size,  phys_addr_t 
align)
+static __always_inline void * __init memblock_alloc(phys_addr_t size,  
phys_addr_t align)
 {
return memblock_alloc_try_nid(size, align, MEMBLOCK_LOW_LIMIT,
  MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE);
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a
member of the Code Aurora Forum, hosted by The Linux Foundation



[PATCH] mm: memblock: add more debug logs

2020-11-10 Thread Faiyaz Mohammed
Exact caller of memblock_phys_alloc_range is not available with
"memblock=debug". Below information is not enough for memory accounting.
for example:
[0.00] memblock_reserve: [0x00023fc6b000-0x00023fc6bfff] 
memblock_alloc_range_nid+0xc0/0x188

To enhance the memblock_dbg information or to get the exact owner of the
memblock_reserve, add debug logs in memblock_phys_alloc_range function.
After adding logs:
[0.00] memblock_phys_alloc_range: 4096 bytes align=0x1000 
from=0x max_addr=0x 
early_pgtable_alloc+0x24/0x178
[0.00] memblock_reserve: [0x00023fc6b000-0x00023fc6bfff] 
memblock_alloc_range_nid+0xc0/0x188

Signed-off-by: Faiyaz Mohammed 
---
 mm/memblock.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/memblock.c b/mm/memblock.c
index 049df41..f65af9f 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1419,6 +1419,9 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t 
size,
 phys_addr_t start,
 phys_addr_t end)
 {
+   memblock_dbg("%s: %llu bytes align=0x%llx from=%pa max_addr=%pa %pS\n",
+   __func__, (u64)size, (u64)align, &start, &end,
+   (void *)_RET_IP_);
return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
false);
 }
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a
member of the Code Aurora Forum, hosted by The Linux Foundation



Re: [PATCH RESEND] hv: clocksource: Add notrace attribute to read_hv_sched_clock_*() functions

2020-10-05 Thread Mohammed Gamal
On Mon, 2020-10-05 at 13:47 +0200, Mohammed Gamal wrote:
> When selecting function_graph tracer with the command:
>  # echo function_graph > /sys/kernel/debug/tracing/current_tracer
> 
> The kernel crashes with the following stack trace:
> 
> [69703.122389] BUG: stack guard page was hit at 1056545c
> (stack is fa3f8fed..05d39503)
> [69703.122403] kernel stack overflow (double-fault):  [#1] SMP
> PTI
> [69703.122413] CPU: 0 PID: 16982 Comm: bash Kdump: loaded Not tainted
> 4.18.0-236.el8.x86_64 #1
> [69703.122420] Hardware name: Microsoft Corporation Virtual
> Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.0 12/17/2019
> [69703.122433] RIP: 0010:prepare_ftrace_return+0xa/0x110
> [69703.122458] Code: 05 00 0f 0b 48 c7 c7 10 ca 69 ae 0f b6 f0 e8 4b
> 52 0c 00 31 c0 eb ca 66 0f 1f 84 00 00 00 00 00 55 48 89 e5 41 56 41
> 55 41 54 <53> 48 83 ec 18 65 48 8b 04 25 28 00 00 00 48 89 45 d8 31
> c0 48 85
> [69703.122467] RSP: 0018:bd6d01118000 EFLAGS: 00010086
> [69703.122476] RAX:  RBX:  RCX:
> 0003
> [69703.122484] RDX:  RSI: bd6d011180d8 RDI:
> adce7550
> [69703.122491] RBP: bd6d01118018 R08:  R09:
> 9d4b09266000
> [69703.122498] R10: 9d4b0fc04540 R11: 9d4b0fc20a00 R12:
> 9d4b6e42aa90
> [69703.122506] R13: 9d4b0fc20ab8 R14: 03e8 R15:
> bd6d0111837c
> [69703.122514] FS:  7fd5f2588740() GS:9d4b6e40()
> knlGS:
> [69703.122521] CS:  0010 DS:  ES:  CR0: 80050033
> [69703.122528] CR2: bd6d01117ff8 CR3: 565d8001 CR4:
> 003606f0
> [69703.122538] DR0:  DR1:  DR2:
> 
> [69703.122545] DR3:  DR6: fffe0ff0 DR7:
> 0400
> [69703.122552] Call Trace:
> [69703.122568]  ftrace_graph_caller+0x6b/0xa0
> [69703.122589]  ? read_hv_sched_clock_tsc+0x5/0x20
> [69703.122599]  read_hv_sched_clock_tsc+0x5/0x20
> [69703.122611]  sched_clock+0x5/0x10
> [69703.122621]  sched_clock_local+0x12/0x80
> [69703.122631]  sched_clock_cpu+0x8c/0xb0
> [69703.122644]  trace_clock_global+0x21/0x90
> [69703.122655]  ring_buffer_lock_reserve+0x100/0x3c0
> [69703.122671]  trace_buffer_lock_reserve+0x16/0x50
> [69703.122683]  __trace_graph_entry+0x28/0x90
> [69703.122695]  trace_graph_entry+0xfd/0x1a0
> [69703.122705]  ? read_hv_clock_tsc_cs+0x10/0x10
> [69703.122714]  ? sched_clock+0x5/0x10
> [69703.122723]  prepare_ftrace_return+0x99/0x110
> [69703.122734]  ? read_hv_clock_tsc_cs+0x10/0x10
> [69703.122743]  ? sched_clock+0x5/0x10
> [...]
> 
> Setting the notrace attribute for read_hv_sched_clock_msr() and
> read_hv_sched_clock_tsc() fixes it
> 
> Fixes: bd00cd52d5be ("clocksource/drivers/hyperv: Add Hyper-V
> specific
> sched clock function")
> Suggested-by: Vitaly Kuznetsov 
> Signed-off-by: Mohammed Gamal 
> ---
>  drivers/clocksource/hyperv_timer.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/clocksource/hyperv_timer.c
> b/drivers/clocksource/hyperv_timer.c
> index 09aa44cb8a91d..ba04cb381cd3f 100644
> --- a/drivers/clocksource/hyperv_timer.c
> +++ b/drivers/clocksource/hyperv_timer.c
> @@ -341,7 +341,7 @@ static u64 notrace read_hv_clock_tsc_cs(struct
> clocksource *arg)
>   return read_hv_clock_tsc();
>  }
>  
> -static u64 read_hv_sched_clock_tsc(void)
> +static u64 notrace read_hv_sched_clock_tsc(void)
>  {
>   return (read_hv_clock_tsc() - hv_sched_clock_offset) *
>   (NSEC_PER_SEC / HV_CLOCK_HZ);
> @@ -404,7 +404,7 @@ static u64 notrace read_hv_clock_msr_cs(struct
> clocksource *arg)
>   return read_hv_clock_msr();
>  }
>  
> -static u64 read_hv_sched_clock_msr(void)
> +static u64 notrace read_hv_sched_clock_msr(void)
>  {
>   return (read_hv_clock_msr() - hv_sched_clock_offset) *
>   (NSEC_PER_SEC / HV_CLOCK_HZ);

Please ignore the patch. Somehow I missed Wei's reply on it. It's
already applied to hyperv-next.

Thanks



[PATCH RESEND] hv: clocksource: Add notrace attribute to read_hv_sched_clock_*() functions

2020-10-05 Thread Mohammed Gamal
When selecting function_graph tracer with the command:
 # echo function_graph > /sys/kernel/debug/tracing/current_tracer

The kernel crashes with the following stack trace:

[69703.122389] BUG: stack guard page was hit at 1056545c (stack is 
fa3f8fed..05d39503)
[69703.122403] kernel stack overflow (double-fault):  [#1] SMP PTI
[69703.122413] CPU: 0 PID: 16982 Comm: bash Kdump: loaded Not tainted 
4.18.0-236.el8.x86_64 #1
[69703.122420] Hardware name: Microsoft Corporation Virtual Machine/Virtual 
Machine, BIOS Hyper-V UEFI Release v4.0 12/17/2019
[69703.122433] RIP: 0010:prepare_ftrace_return+0xa/0x110
[69703.122458] Code: 05 00 0f 0b 48 c7 c7 10 ca 69 ae 0f b6 f0 e8 4b 52 0c 00 
31 c0 eb ca 66 0f 1f 84 00 00 00 00 00 55 48 89 e5 41 56 41 55 41 54 <53> 48 83 
ec 18 65 48 8b 04 25 28 00 00 00 48 89 45 d8 31 c0 48 85
[69703.122467] RSP: 0018:bd6d01118000 EFLAGS: 00010086
[69703.122476] RAX:  RBX:  RCX: 0003
[69703.122484] RDX:  RSI: bd6d011180d8 RDI: adce7550
[69703.122491] RBP: bd6d01118018 R08:  R09: 9d4b09266000
[69703.122498] R10: 9d4b0fc04540 R11: 9d4b0fc20a00 R12: 9d4b6e42aa90
[69703.122506] R13: 9d4b0fc20ab8 R14: 03e8 R15: bd6d0111837c
[69703.122514] FS:  7fd5f2588740() GS:9d4b6e40() 
knlGS:
[69703.122521] CS:  0010 DS:  ES:  CR0: 80050033
[69703.122528] CR2: bd6d01117ff8 CR3: 565d8001 CR4: 003606f0
[69703.122538] DR0:  DR1:  DR2: 
[69703.122545] DR3:  DR6: fffe0ff0 DR7: 0400
[69703.122552] Call Trace:
[69703.122568]  ftrace_graph_caller+0x6b/0xa0
[69703.122589]  ? read_hv_sched_clock_tsc+0x5/0x20
[69703.122599]  read_hv_sched_clock_tsc+0x5/0x20
[69703.122611]  sched_clock+0x5/0x10
[69703.122621]  sched_clock_local+0x12/0x80
[69703.122631]  sched_clock_cpu+0x8c/0xb0
[69703.122644]  trace_clock_global+0x21/0x90
[69703.122655]  ring_buffer_lock_reserve+0x100/0x3c0
[69703.122671]  trace_buffer_lock_reserve+0x16/0x50
[69703.122683]  __trace_graph_entry+0x28/0x90
[69703.122695]  trace_graph_entry+0xfd/0x1a0
[69703.122705]  ? read_hv_clock_tsc_cs+0x10/0x10
[69703.122714]  ? sched_clock+0x5/0x10
[69703.122723]  prepare_ftrace_return+0x99/0x110
[69703.122734]  ? read_hv_clock_tsc_cs+0x10/0x10
[69703.122743]  ? sched_clock+0x5/0x10
[...]

Setting the notrace attribute for read_hv_sched_clock_msr() and
read_hv_sched_clock_tsc() fixes it.

Fixes: bd00cd52d5be ("clocksource/drivers/hyperv: Add Hyper-V specific
sched clock function")
Suggested-by: Vitaly Kuznetsov 
Signed-off-by: Mohammed Gamal 
---
 drivers/clocksource/hyperv_timer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/clocksource/hyperv_timer.c 
b/drivers/clocksource/hyperv_timer.c
index 09aa44cb8a91d..ba04cb381cd3f 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -341,7 +341,7 @@ static u64 notrace read_hv_clock_tsc_cs(struct clocksource 
*arg)
return read_hv_clock_tsc();
 }
 
-static u64 read_hv_sched_clock_tsc(void)
+static u64 notrace read_hv_sched_clock_tsc(void)
 {
return (read_hv_clock_tsc() - hv_sched_clock_offset) *
(NSEC_PER_SEC / HV_CLOCK_HZ);
@@ -404,7 +404,7 @@ static u64 notrace read_hv_clock_msr_cs(struct clocksource 
*arg)
return read_hv_clock_msr();
 }
 
-static u64 read_hv_sched_clock_msr(void)
+static u64 notrace read_hv_sched_clock_msr(void)
 {
return (read_hv_clock_msr() - hv_sched_clock_offset) *
(NSEC_PER_SEC / HV_CLOCK_HZ);
-- 
2.26.2



[PATCH] hv: clocksource: Add notrace attribute to read_hv_sched_clock_*() functions

2020-09-24 Thread Mohammed Gamal
When selecting function_graph tracer with the command:
 # echo function_graph > /sys/kernel/debug/tracing/current_tracer

The kernel crashes with the following stack trace:

[69703.122389] BUG: stack guard page was hit at 1056545c (stack is 
fa3f8fed..05d39503)
[69703.122403] kernel stack overflow (double-fault):  [#1] SMP PTI
[69703.122413] CPU: 0 PID: 16982 Comm: bash Kdump: loaded Not tainted 
4.18.0-236.el8.x86_64 #1
[69703.122420] Hardware name: Microsoft Corporation Virtual Machine/Virtual 
Machine, BIOS Hyper-V UEFI Release v4.0 12/17/2019
[69703.122433] RIP: 0010:prepare_ftrace_return+0xa/0x110
[69703.122458] Code: 05 00 0f 0b 48 c7 c7 10 ca 69 ae 0f b6 f0 e8 4b 52 0c 00 
31 c0 eb ca 66 0f 1f 84 00 00 00 00 00 55 48 89 e5 41 56 41 55 41 54 <53> 48 83 
ec 18 65 48 8b 04 25 28 00 00 00 48 89 45 d8 31 c0 48 85
[69703.122467] RSP: 0018:bd6d01118000 EFLAGS: 00010086
[69703.122476] RAX:  RBX:  RCX: 0003
[69703.122484] RDX:  RSI: bd6d011180d8 RDI: adce7550
[69703.122491] RBP: bd6d01118018 R08:  R09: 9d4b09266000
[69703.122498] R10: 9d4b0fc04540 R11: 9d4b0fc20a00 R12: 9d4b6e42aa90
[69703.122506] R13: 9d4b0fc20ab8 R14: 03e8 R15: bd6d0111837c
[69703.122514] FS:  7fd5f2588740() GS:9d4b6e40() 
knlGS:
[69703.122521] CS:  0010 DS:  ES:  CR0: 80050033
[69703.122528] CR2: bd6d01117ff8 CR3: 565d8001 CR4: 003606f0
[69703.122538] DR0:  DR1:  DR2: 
[69703.122545] DR3:  DR6: fffe0ff0 DR7: 0400
[69703.122552] Call Trace:
[69703.122568]  ftrace_graph_caller+0x6b/0xa0
[69703.122589]  ? read_hv_sched_clock_tsc+0x5/0x20
[69703.122599]  read_hv_sched_clock_tsc+0x5/0x20
[69703.122611]  sched_clock+0x5/0x10
[69703.122621]  sched_clock_local+0x12/0x80
[69703.122631]  sched_clock_cpu+0x8c/0xb0
[69703.122644]  trace_clock_global+0x21/0x90
[69703.122655]  ring_buffer_lock_reserve+0x100/0x3c0
[69703.122671]  trace_buffer_lock_reserve+0x16/0x50
[69703.122683]  __trace_graph_entry+0x28/0x90
[69703.122695]  trace_graph_entry+0xfd/0x1a0
[69703.122705]  ? read_hv_clock_tsc_cs+0x10/0x10
[69703.122714]  ? sched_clock+0x5/0x10
[69703.122723]  prepare_ftrace_return+0x99/0x110
[69703.122734]  ? read_hv_clock_tsc_cs+0x10/0x10
[69703.122743]  ? sched_clock+0x5/0x10
[69703.122752]  ftrace_graph_caller+0x6b/0xa0
[69703.122768]  ? read_hv_clock_tsc_cs+0x10/0x10
[69703.122777]  ? sched_clock+0x5/0x10
[69703.122786]  ? read_hv_sched_clock_tsc+0x5/0x20
[69703.122796]  ? ring_buffer_unlock_commit+0x1d/0xa0
[69703.122805]  read_hv_sched_clock_tsc+0x5/0x20
[69703.122814]  ftrace_graph_caller+0xa0/0xa0
[69703.122823]  ? trace_clock_local+0x5/0x10
[69703.122831]  ? ftrace_push_return_trace+0x5d/0x120
[69703.122842]  ? read_hv_clock_tsc_cs+0x10/0x10
[69703.122850]  ? sched_clock+0x5/0x10
[69703.122860]  ? prepare_ftrace_return+0xd5/0x110
[69703.122871]  ? read_hv_clock_tsc_cs+0x10/0x10
[69703.122879]  ? sched_clock+0x5/0x10
[69703.122889]  ? ftrace_graph_caller+0x6b/0xa0
[69703.122904]  ? read_hv_clock_tsc_cs+0x10/0x10
[69703.122912]  ? sched_clock+0x5/0x10
[69703.122922]  ? read_hv_sched_clock_tsc+0x5/0x20
[69703.122931]  ? ring_buffer_unlock_commit+0x1d/0xa0
[69703.122940]  ? read_hv_sched_clock_tsc+0x5/0x20
[69703.122966]  ? ftrace_graph_caller+0xa0/0xa0
[69703.122975]  ? trace_clock_local+0x5/0x10
[69703.122984]  ? ftrace_push_return_trace+0x5d/0x120
[69703.122995]  ? read_hv_clock_tsc_cs+0x10/0x10
[69703.123006]  ? sched_clock+0x5/0x10
[69703.123016]  ? prepare_ftrace_return+0xd5/0x110
[69703.123026]  ? read_hv_clock_tsc_cs+0x10/0x10
[69703.123035]  ? sched_clock+0x5/0x10
[69703.123044]  ? ftrace_graph_caller+0x6b/0xa0
[69703.123059]  ? read_hv_clock_tsc_cs+0x10/0x10
[69703.123068]  ? sched_clock+0x5/0x10

Setting the notrace attribute for read_hv_sched_clock_msr() and
read_hv_sched_clock_tsc() fixes it.

Fixes: bd00cd52d5be ("clocksource/drivers/hyperv: Add Hyper-V specific
sched clock function")
Signed-off-by: Vitaly Kuznetsov 
Signed-off-by: Mohammed Gamal 
---
 drivers/clocksource/hyperv_timer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/clocksource/hyperv_timer.c 
b/drivers/clocksource/hyperv_timer.c
index 09aa44cb8a91d..ba04cb381cd3f 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -341,7 +341,7 @@ static u64 notrace read_hv_clock_tsc_cs(struct clocksource 
*arg)
return read_hv_clock_tsc();
 }
 
-static u64 read_hv_sched_clock_tsc(void)
+static u64 notrace read_hv_sched_clock_tsc(void)
 {
return (read_hv_clock_tsc() - hv_sched_clock_offset) *
(NSEC_PER_SEC / HV_CLOCK_HZ);
@@ -404,7 +404,7 @@ static u64 notrace read_hv_clock_msr_cs(struct clocksource 
*arg)
return read_hv_clock_msr(

[PATCH] KVM: x86: VMX: Make smaller physical guest address space support user-configurable

2020-09-03 Thread Mohammed Gamal
This patch exposes allow_smaller_maxphyaddr to the user as a module parameter.

Since smaller physical address spaces are only supported on VMX, the parameter
is only exposed in the kvm_intel module.
Modifications to VMX page fault and EPT violation handling will depend on
whether that parameter is enabled.

Also disable support by default, and let the user decide if they want to enable
it.
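
As an illustration only (not part of the patch): since the parameter is
registered with module_param(), it shows up under the standard
/sys/module/kvm_intel/parameters/ directory once kvm_intel is loaded. A
hypothetical userspace check could look like this:

/* Hypothetical example: read the kvm_intel parameter added above.  The
 * path follows the usual /sys/module/<module>/parameters/<name> layout. */
#include <stdio.h>

int main(void)
{
	char buf[4] = "";
	FILE *f = fopen("/sys/module/kvm_intel/parameters/allow_smaller_maxphyaddr", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("allow_smaller_maxphyaddr = %s", buf);
	fclose(f);
	return 0;
}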

Signed-off-by: Mohammed Gamal 
---
 arch/x86/kvm/vmx/vmx.c | 15 ++-
 arch/x86/kvm/vmx/vmx.h |  3 +++
 arch/x86/kvm/x86.c |  2 +-
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 819c185adf09..dc778c7b5a06 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -129,6 +129,9 @@ static bool __read_mostly enable_preemption_timer = 1;
 module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
 #endif
 
+extern bool __read_mostly allow_smaller_maxphyaddr;
+module_param(allow_smaller_maxphyaddr, bool, S_IRUGO | S_IWUSR);
+
 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
 #define KVM_VM_CR0_ALWAYS_ON   \
@@ -4798,7 +4801,8 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
 
if (is_page_fault(intr_info)) {
cr2 = vmx_get_exit_qual(vcpu);
-   if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
+   if (enable_ept && !vcpu->arch.apf.host_apf_flags
+   && allow_smaller_maxphyaddr) {
/*
 * EPT will cause page fault only if we need to
 * detect illegal GPAs.
@@ -5331,7 +5335,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 * would also use advanced VM-exit information for EPT violations to
 * reconstruct the page fault error code.
 */
-   if (unlikely(kvm_mmu_is_illegal_gpa(vcpu, gpa)))
+   if (unlikely(kvm_mmu_is_illegal_gpa(vcpu, gpa)) && 
allow_smaller_maxphyaddr)
return kvm_emulate_instruction(vcpu, 0);
 
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
@@ -8303,13 +8307,6 @@ static int __init vmx_init(void)
 #endif
vmx_check_vmcs12_offsets();
 
-   /*
-* Intel processors don't have problems with
-* GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable
-* it for VMX by default
-*/
-   allow_smaller_maxphyaddr = true;
-
return 0;
 }
 module_init(vmx_init);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 26175a4759fa..b859435efa2e 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -551,6 +551,9 @@ static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
 
 static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
 {
+   if (!allow_smaller_maxphyaddr)
+   return false;
+
return !enable_ept || cpuid_maxphyaddr(vcpu) < 
boot_cpu_data.x86_phys_bits;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d39d6cf1d473..982f1d73a884 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -188,7 +188,7 @@ static struct kvm_shared_msrs __percpu *shared_msrs;
 u64 __read_mostly host_efer;
 EXPORT_SYMBOL_GPL(host_efer);
 
-bool __read_mostly allow_smaller_maxphyaddr;
+bool __read_mostly allow_smaller_maxphyaddr = 0;
 EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
 
 static u64 __read_mostly host_xss;
-- 
2.26.2



[PATCH] staging: rtl8723bs: os_dep: fix function-name print using __func__

2020-08-12 Thread Mohammed Rushad
This patch to os_intfs.c fixes the printing of function names to use
the preferred '"%s...", __func__' style, and fixes alignment issues
pointed out by the checkpatch.pl tool.
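
For reference, a minimal standalone sketch of why '%s' with __func__ is
preferred (illustrative only; the driver itself uses RT_TRACE rather than
printf, and the function name below is made up for the demo):

#include <stdio.h>

/* __func__ expands to the enclosing function's name, so the message stays
 * correct if the function is ever renamed. */
static void rtw_start_drv_threads_demo(void)
{
	printf("+%s\n", __func__);	/* prints "+rtw_start_drv_threads_demo" */
}

int main(void)
{
	rtw_start_drv_threads_demo();
	return 0;
}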

Signed-off-by: Mohammed Rushad 
---
 drivers/staging/rtl8723bs/os_dep/os_intfs.c | 56 +++--
 1 file changed, 29 insertions(+), 27 deletions(-)

diff --git a/drivers/staging/rtl8723bs/os_dep/os_intfs.c 
b/drivers/staging/rtl8723bs/os_dep/os_intfs.c
index 27f990a01a23..0460db4ae660 100644
--- a/drivers/staging/rtl8723bs/os_dep/os_intfs.c
+++ b/drivers/staging/rtl8723bs/os_dep/os_intfs.c
@@ -400,17 +400,17 @@ u16 rtw_recv_select_queue(struct sk_buff *skb)
memcpy(_type, pdata + (ETH_ALEN << 1), 2);
 
switch (be16_to_cpu(eth_type)) {
-   case ETH_P_IP:
+   case ETH_P_IP:
 
-   piphdr = (struct iphdr *)(pdata + ETH_HLEN);
+   piphdr = (struct iphdr *)(pdata + ETH_HLEN);
 
-   dscp = piphdr->tos & 0xfc;
+   dscp = piphdr->tos & 0xfc;
 
-   priority = dscp >> 5;
+   priority = dscp >> 5;
 
-   break;
-   default:
-   priority = 0;
+   break;
+   default:
+   priority = 0;
}
 
return rtw_1d_to_queue[priority];
@@ -539,7 +539,7 @@ u32 rtw_start_drv_threads(struct adapter *padapter)
 {
u32 _status = _SUCCESS;
 
-   RT_TRACE(_module_os_intfs_c_, _drv_info_, ("+rtw_start_drv_threads\n"));
+   RT_TRACE(_module_os_intfs_c_, _drv_info_, ("+%s\n", __func__));
padapter->xmitThread = kthread_run(rtw_xmit_thread, padapter, 
"RTW_XMIT_THREAD");
if (IS_ERR(padapter->xmitThread))
_status = _FAIL;
@@ -556,7 +556,7 @@ u32 rtw_start_drv_threads(struct adapter *padapter)
 
 void rtw_stop_drv_threads(struct adapter *padapter)
 {
-   RT_TRACE(_module_os_intfs_c_, _drv_info_, ("+rtw_stop_drv_threads\n"));
+   RT_TRACE(_module_os_intfs_c_, _drv_info_, ("+%s\n", __func__));
 
rtw_stop_cmd_thread(padapter);
 
@@ -710,7 +710,7 @@ u8 rtw_init_drv_sw(struct adapter *padapter)
 {
u8 ret8 = _SUCCESS;
 
-   RT_TRACE(_module_os_intfs_c_, _drv_info_, ("+rtw_init_drv_sw\n"));
+   RT_TRACE(_module_os_intfs_c_, _drv_info_, ("+%s\n", __func__));
 
rtw_init_default_value(padapter);
 
@@ -773,29 +773,29 @@ u8 rtw_init_drv_sw(struct adapter *padapter)
 
 exit:
 
-   RT_TRACE(_module_os_intfs_c_, _drv_info_, ("-rtw_init_drv_sw\n"));
+   RT_TRACE(_module_os_intfs_c_, _drv_info_, ("-%s\n", __func__));
 
return ret8;
 }
 
 void rtw_cancel_all_timer(struct adapter *padapter)
 {
-   RT_TRACE(_module_os_intfs_c_, _drv_info_, ("+rtw_cancel_all_timer\n"));
+   RT_TRACE(_module_os_intfs_c_, _drv_info_, ("+%s\n", __func__));
 
del_timer_sync(>mlmepriv.assoc_timer);
-   RT_TRACE(_module_os_intfs_c_, _drv_info_, ("rtw_cancel_all_timer:cancel 
association timer complete!\n"));
+   RT_TRACE(_module_os_intfs_c_, _drv_info_, ("%s:cancel association timer 
complete!\n", __func__));
 
del_timer_sync(>mlmepriv.scan_to_timer);
-   RT_TRACE(_module_os_intfs_c_, _drv_info_, ("rtw_cancel_all_timer:cancel 
scan_to_timer!\n"));
+   RT_TRACE(_module_os_intfs_c_, _drv_info_, ("%s:cancel 
scan_to_timer!\n", __func__));
 
del_timer_sync(>mlmepriv.dynamic_chk_timer);
-   RT_TRACE(_module_os_intfs_c_, _drv_info_, ("rtw_cancel_all_timer:cancel 
dynamic_chk_timer!\n"));
+   RT_TRACE(_module_os_intfs_c_, _drv_info_, ("%s:cancel 
dynamic_chk_timer!\n", __func__));
 
del_timer_sync(&(adapter_to_pwrctl(padapter)->pwr_state_check_timer));
 
del_timer_sync(>mlmepriv.set_scan_deny_timer);
rtw_clear_scan_deny(padapter);
-   RT_TRACE(_module_os_intfs_c_, _drv_info_, ("rtw_cancel_all_timer:cancel 
set_scan_deny_timer!\n"));
+   RT_TRACE(_module_os_intfs_c_, _drv_info_, ("%s:cancel 
set_scan_deny_timer!\n", __func__));
 
del_timer_sync(>recvpriv.signal_stat_timer);
 
@@ -805,7 +805,7 @@ void rtw_cancel_all_timer(struct adapter *padapter)
 
 u8 rtw_free_drv_sw(struct adapter *padapter)
 {
-   RT_TRACE(_module_os_intfs_c_, _drv_info_, ("==>rtw_free_drv_sw"));
+   RT_TRACE(_module_os_intfs_c_, _drv_info_, ("==>%s", __func__));
 
free_mlme_ext_priv(>mlmeextpriv);
 
@@ -829,7 +829,7 @@ u8 rtw_free_drv_sw(struct adapter *padapter)
 
rtw_hal_free_data(padapter);
 
-   RT_TRACE(_module_os_intfs_c_, _drv_info_, ("<==rtw_free_drv_sw\n"));
+   RT_TRACE(_module_os_intfs_c_, _drv_info_, ("<==%s\n", __func__));
 
/* free the old_pnetdev */
if



[PATCH] staging: rtl8723bs: os_dep: fix brace coding style issue in sdio_intf.c

2020-08-02 Thread Mohammed Rushad
This is a patch to the sdio_intf.c file that fixes up an incorrectly
placed brace found by the checkpatch.pl tool.

Signed-off-by: Mohammed Rushad 
---
 drivers/staging/rtl8723bs/os_dep/sdio_intf.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/staging/rtl8723bs/os_dep/sdio_intf.c 
b/drivers/staging/rtl8723bs/os_dep/sdio_intf.c
index 5b1392deb0a7..f1e2829a19a7 100644
--- a/drivers/staging/rtl8723bs/os_dep/sdio_intf.c
+++ b/drivers/staging/rtl8723bs/os_dep/sdio_intf.c
@@ -15,8 +15,7 @@
 #define dev_to_sdio_func(d) container_of(d, struct sdio_func, dev)
 #endif
 
-static const struct sdio_device_id sdio_ids[] =
-{
+static const struct sdio_device_id sdio_ids[] = {
{ SDIO_DEVICE(0x024c, 0x0523), },
{ SDIO_DEVICE(0x024c, 0x0525), },
{ SDIO_DEVICE(0x024c, 0x0623), },
-- 
2.17.1



[PATCH] staging: rtl8723bs: os_dep: fix coding style issue in xmit_linux.c

2020-08-02 Thread Mohammed Rushad
This is a patch to the xmit_linux.c file that fixes a brace warning and
a missing blank line warning found by the checkpatch.pl tool.

Signed-off-by: Mohammed Rushad 
---
 drivers/staging/rtl8723bs/os_dep/xmit_linux.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/rtl8723bs/os_dep/xmit_linux.c 
b/drivers/staging/rtl8723bs/os_dep/xmit_linux.c
index fec8a8caaa46..b199d355e568 100644
--- a/drivers/staging/rtl8723bs/os_dep/xmit_linux.c
+++ b/drivers/staging/rtl8723bs/os_dep/xmit_linux.c
@@ -148,13 +148,13 @@ static int rtw_mlcst2unicst(struct adapter *padapter, 
struct sk_buff *skb)
/* free sta asoc_queue */
while (phead != plist) {
int stainfo_offset;
+
psta = LIST_CONTAINOR(plist, struct sta_info, asoc_list);
plist = get_next(plist);
 
stainfo_offset = rtw_stainfo_offset(pstapriv, psta);
-   if (stainfo_offset_valid(stainfo_offset)) {
+   if (stainfo_offset_valid(stainfo_offset))
chk_alive_list[chk_alive_num++] = stainfo_offset;
-   }
}
spin_unlock_bh(>asoc_list_lock);
 
-- 
2.17.1



[PATCH] Staging: rtl8192e: fix indent coding style issue in rtllib_tx.c

2020-08-02 Thread Mohammed Rushad
This is a patch to the rtllib_tx.c file that fixes an improper indent
found by the checkpatch.pl tool.

Signed-off-by: Mohammed Rushad 
---
 drivers/staging/rtl8192e/rtllib_tx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/staging/rtl8192e/rtllib_tx.c 
b/drivers/staging/rtl8192e/rtllib_tx.c
index 79d7ad7c0a4a..e0d79daca24a 100644
--- a/drivers/staging/rtl8192e/rtllib_tx.c
+++ b/drivers/staging/rtl8192e/rtllib_tx.c
@@ -859,7 +859,7 @@ static int rtllib_xmit_inter(struct sk_buff *skb, struct 
net_device *dev)
if (ieee->seq_ctrl[0] == 0xFFF)
ieee->seq_ctrl[0] = 0;
else
-   ieee->seq_ctrl[0]++;
+   ieee->seq_ctrl[0]++;
}
} else {
if (unlikely(skb->len < sizeof(struct rtllib_hdr_3addr))) {
-- 
2.17.1



[PATCH v3 6/9] KVM: VMX: introduce vmx_need_pf_intercept

2020-07-10 Thread Mohammed Gamal
From: Paolo Bonzini 

Signed-off-by: Paolo Bonzini 
---
 arch/x86/kvm/vmx/nested.c | 28 +---
 arch/x86/kvm/vmx/vmx.c|  2 +-
 arch/x86/kvm/vmx/vmx.h|  5 +
 3 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index b26655104d4a..1aea9e3b8c43 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2433,22 +2433,28 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, 
struct vmcs12 *vmcs12)
 
/*
 * Whether page-faults are trapped is determined by a combination of
-* 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
-* If enable_ept, L0 doesn't care about page faults and we should
-* set all of these to L1's desires. However, if !enable_ept, L0 does
-* care about (at least some) page faults, and because it is not easy
-* (if at all possible?) to merge L0 and L1's desires, we simply ask
-* to exit on each and every L2 page fault. This is done by setting
-* MASK=MATCH=0 and (see below) EB.PF=1.
+* 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.  If L0
+* doesn't care about page faults then we should set all of these to
+* L1's desires. However, if L0 does care about (some) page faults, it
+* is not easy (if at all possible?) to merge L0 and L1's desires, we
+* simply ask to exit on each and every L2 page fault. This is done by
+* setting MASK=MATCH=0 and (see below) EB.PF=1.
 * Note that below we don't need special code to set EB.PF beyond the
 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
 */
-   vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
-   enable_ept ? vmcs12->page_fault_error_code_mask : 0);
-   vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
-   enable_ept ? vmcs12->page_fault_error_code_match : 0);
+   if (vmx_need_pf_intercept(>vcpu)) {
+   /*
+* TODO: if both L0 and L1 need the same MASK and MATCH,
+* go ahead and use it?
+*/
+   vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
+   vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
+   } else {
+   vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 
vmcs12->page_fault_error_code_mask);
+   vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 
vmcs12->page_fault_error_code_match);
+   }
 
if (cpu_has_vmx_apicv()) {
vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 178ee92551a9..770b090969fb 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -780,7 +780,7 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu)
eb |= 1u << BP_VECTOR;
if (to_vmx(vcpu)->rmode.vm86_active)
eb = ~0;
-   if (enable_ept)
+   if (!vmx_need_pf_intercept(vcpu))
eb &= ~(1u << PF_VECTOR);
 
/* When we are running a nested L2 guest and L1 specified for it a
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 639798e4a6ca..b0e5e210f1c1 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -550,6 +550,11 @@ static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
 }
 
+static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
+{
+   return !enable_ept;
+}
+
 void dump_vmcs(void);
 
 #endif /* __KVM_X86_VMX_H */
-- 
2.26.2



[PATCH v3 9/9] KVM: x86: SVM: VMX: Make GUEST_MAXPHYADDR < HOST_MAXPHYADDR support configurable

2020-07-10 Thread Mohammed Gamal
The reason for including this patch is unexpected behaviour we see
with NPT vmexit handling on AMD processors.

With the previous patch ("KVM: SVM: Add guest physical address check in
NPF/PF interception") we see the following error multiple times in
the 'access' test in kvm-unit-tests:

test pte.p pte.36 pde.p: FAIL: pte 221 expected 201
Dump mapping: address: 0x1234
--L4: 24c3027
--L3: 24c4027
--L2: 24c5021
--L1: 100221

This shows that the PTE's accessed bit is apparently being set by
the CPU hardware before the NPF vmexit. This is completely handled by
hardware and cannot be fixed in software.

This patch introduces a workaround: we add a boolean variable,
'allow_smaller_maxphyaddr', which is set individually by the VMX and SVM
init routines. On VMX it's always set to true; on SVM it's only set to
true when NPT is not enabled.

We also add a new capability KVM_CAP_SMALLER_MAXPHYADDR which
allows userspace to query if the underlying architecture would
support GUEST_MAXPHYADDR < HOST_MAXPHYADDR and hence act accordingly
(e.g. qemu can decide if it would ignore the -cpu ..,phys-bits=X)
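
For illustration, a hypothetical userspace probe for the new capability
(it assumes a uapi <linux/kvm.h>; KVM_CAP_SMALLER_MAXPHYADDR is defined
as 184 by this series, so it is defined locally in case the installed
header predates it):

/* Hypothetical example: ask KVM whether GUEST_MAXPHYADDR < HOST_MAXPHYADDR
 * is supported, using the standard KVM_CHECK_EXTENSION ioctl. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#ifndef KVM_CAP_SMALLER_MAXPHYADDR
#define KVM_CAP_SMALLER_MAXPHYADDR 184	/* value added by this series */
#endif

int main(void)
{
	int kvm = open("/dev/kvm", O_RDONLY);

	if (kvm < 0) {
		perror("open /dev/kvm");
		return 1;
	}
	/* KVM_CHECK_EXTENSION returns 1 if the capability is supported. */
	printf("smaller guest MAXPHYADDR supported: %d\n",
	       ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_SMALLER_MAXPHYADDR));
	return 0;
}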

CC: Tom Lendacky 
CC: Babu Moger 
Signed-off-by: Mohammed Gamal 
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/svm/svm.c  | 15 +++
 arch/x86/kvm/vmx/vmx.c  |  7 +++
 arch/x86/kvm/x86.c  |  6 ++
 include/uapi/linux/kvm.h|  1 +
 5 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bb4044ffb7b7..26002e1b47f7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1304,7 +1304,7 @@ struct kvm_arch_async_pf {
 };
 
 extern u64 __read_mostly host_efer;
-
+extern bool __read_mostly allow_smaller_maxphyaddr;
 extern struct kvm_x86_ops kvm_x86_ops;
 
 #define __KVM_HAVE_ARCH_VM_ALLOC
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 79c33b3539f0..f3d7ae26875c 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -924,6 +924,21 @@ static __init int svm_hardware_setup(void)
 
svm_set_cpu_caps();
 
+   /*
+* It seems that on AMD processors PTE's accessed bit is
+* being set by the CPU hardware before the NPF vmexit.
+* This is not expected behaviour and our tests fail because
+* of it.
+* A workaround here is to disable support for
+* GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
+* In this case userspace can know if there is support using
+* KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
+* it
+* If future AMD CPU models change the behaviour described above,
+* this variable can be changed accordingly
+*/
+   allow_smaller_maxphyaddr = !npt_enabled;
+
return 0;
 
 err:
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 0cebc4832805..8a8e85e6c529 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -8294,6 +8294,13 @@ static int __init vmx_init(void)
 #endif
vmx_check_vmcs12_offsets();
 
+   /*
+* Intel processors don't have problems with
+* GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable
+* it for VMX by default
+*/
+   allow_smaller_maxphyaddr = true;
+
return 0;
 }
 module_init(vmx_init);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 03c401963062..167becd6a634 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -187,6 +187,9 @@ static struct kvm_shared_msrs __percpu *shared_msrs;
 u64 __read_mostly host_efer;
 EXPORT_SYMBOL_GPL(host_efer);
 
+bool __read_mostly allow_smaller_maxphyaddr;
+EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
+
 static u64 __read_mostly host_xss;
 u64 __read_mostly supported_xss;
 EXPORT_SYMBOL_GPL(supported_xss);
@@ -3538,6 +3541,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long 
ext)
case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
break;
+   case KVM_CAP_SMALLER_MAXPHYADDR:
+   r = (int) allow_smaller_maxphyaddr;
+   break;
default:
break;
}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 4fdf30316582..68cd3a0af9bb 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1031,6 +1031,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_PPC_SECURE_GUEST 181
 #define KVM_CAP_HALT_POLL 182
 #define KVM_CAP_ASYNC_PF_INT 183
+#define KVM_CAP_SMALLER_MAXPHYADDR 184
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.26.2



[PATCH v3 4/9] KVM: x86: rename update_bp_intercept to update_exception_bitmap

2020-07-10 Thread Mohammed Gamal
From: Paolo Bonzini 

We would like to introduce a callback to update the #PF intercept
when CPUID changes.  Just reuse update_bp_intercept since VMX is
already using update_exception_bitmap instead of a bespoke function.

While at it, remove an unnecessary assignment in the SVM version,
which is already done in the caller (kvm_arch_vcpu_ioctl_set_guest_debug)
and has nothing to do with the exception bitmap.

Signed-off-by: Paolo Bonzini 
---
 arch/x86/include/asm/kvm_host.h | 2 +-
 arch/x86/kvm/svm/svm.c  | 7 +++
 arch/x86/kvm/vmx/vmx.c  | 2 +-
 arch/x86/kvm/x86.c  | 2 +-
 4 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 62373cc06c72..bb4044ffb7b7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1098,7 +1098,7 @@ struct kvm_x86_ops {
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
void (*vcpu_put)(struct kvm_vcpu *vcpu);
 
-   void (*update_bp_intercept)(struct kvm_vcpu *vcpu);
+   void (*update_exception_bitmap)(struct kvm_vcpu *vcpu);
int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index c0da4dd78ac5..79c33b3539f0 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1627,7 +1627,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
mark_dirty(svm->vmcb, VMCB_SEG);
 }
 
-static void update_bp_intercept(struct kvm_vcpu *vcpu)
+static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 {
struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -1636,8 +1636,7 @@ static void update_bp_intercept(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
set_exception_intercept(svm, BP_VECTOR);
-   } else
-   vcpu->guest_debug = 0;
+   }
 }
 
 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
@@ -3989,7 +3988,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vcpu_blocking = svm_vcpu_blocking,
.vcpu_unblocking = svm_vcpu_unblocking,
 
-   .update_bp_intercept = update_bp_intercept,
+   .update_exception_bitmap = update_exception_bitmap,
.get_msr_feature = svm_get_msr_feature,
.get_msr = svm_get_msr,
.set_msr = svm_set_msr,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 13745f2a5ecd..178ee92551a9 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7859,7 +7859,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
 
-   .update_bp_intercept = update_exception_bitmap,
+   .update_exception_bitmap = update_exception_bitmap,
.get_msr_feature = vmx_get_msr_feature,
.get_msr = vmx_get_msr,
.set_msr = vmx_set_msr,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1f5f4074fc59..03c401963062 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9281,7 +9281,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu 
*vcpu,
 */
kvm_set_rflags(vcpu, rflags);
 
-   kvm_x86_ops.update_bp_intercept(vcpu);
+   kvm_x86_ops.update_exception_bitmap(vcpu);
 
r = 0;
 
-- 
2.26.2



[PATCH v3 8/9] KVM: VMX: optimize #PF injection when MAXPHYADDR does not match

2020-07-10 Thread Mohammed Gamal
From: Paolo Bonzini 

Ignore non-present page faults, since those cannot have reserved
bits set.

When running access.flat with "-cpu Haswell,phys-bits=36", the
number of trapped page faults goes down from 8872644 to 3978948.

Signed-off-by: Paolo Bonzini 
---
 arch/x86/kvm/vmx/vmx.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index de3f436b2d32..0cebc4832805 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4355,6 +4355,16 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmx->pt_desc.guest.output_mask = 0x7F;
vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
}
+
+   /*
+* If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched
+* between guest and host.  In that case we only care about present
+* faults.
+*/
+   if (enable_ept) {
+   vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, PFERR_PRESENT_MASK);
+   vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, PFERR_PRESENT_MASK);
+   }
 }
 
 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
-- 
2.26.2



[PATCH v3 7/9] KVM: VMX: Add guest physical address check in EPT violation and misconfig

2020-07-10 Thread Mohammed Gamal
Check the guest physical address against the guest's maximum physical
address. If the guest's physical address exceeds the maximum (i.e. has
reserved bits set), inject a guest page fault with PFERR_RSVD_MASK set.

This has to be done both in the EPT violation and page fault paths, as
there are complications in both cases with respect to the computation
of the correct error code.

For EPT violations, unfortunately the only possibility is to emulate,
because the access type in the exit qualification might refer to an
access to a paging structure, rather than to the access performed by
the program.

Trapping page faults instead is needed in order to correct the error code,
but the access type can be obtained from the original error code and
passed to gva_to_gpa.  The corrections required in the error code are
subtle. For example, imagine that a PTE for a supervisor page has a reserved
bit set.  On a supervisor-mode access, the EPT violation path would trigger.
However, on a user-mode access, the processor will not notice the reserved
bit and not include PFERR_RSVD_MASK in the error code.

Co-developed-by: Mohammed Gamal 
Signed-off-by: Paolo Bonzini 
---
 arch/x86/kvm/vmx/vmx.c | 24 +---
 arch/x86/kvm/vmx/vmx.h |  3 ++-
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 770b090969fb..de3f436b2d32 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4790,9 +4790,15 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
 
if (is_page_fault(intr_info)) {
cr2 = vmx_get_exit_qual(vcpu);
-   /* EPT won't cause page fault directly */
-   WARN_ON_ONCE(!vcpu->arch.apf.host_apf_flags && enable_ept);
-   return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
+   if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
+   /*
+* EPT will cause page fault only if we need to
+* detect illegal GPAs.
+*/
+   kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
+   return 1;
+   } else
+   return kvm_handle_page_fault(vcpu, error_code, cr2, 
NULL, 0);
}
 
ex_no = intr_info & INTR_INFO_VECTOR_MASK;
@@ -5308,6 +5314,18 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
   PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
 
vcpu->arch.exit_qualification = exit_qualification;
+
+   /*
+* Check that the GPA doesn't exceed physical memory limits, as that is
+* a guest page fault.  We have to emulate the instruction here, because
+* if the illegal address is that of a paging structure, then
+* EPT_VIOLATION_ACC_WRITE bit is set.  Alternatively, if supported we
+* would also use advanced VM-exit information for EPT violations to
+* reconstruct the page fault error code.
+*/
+   if (unlikely(kvm_mmu_is_illegal_gpa(vcpu, gpa)))
+   return kvm_emulate_instruction(vcpu, 0);
+
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }
 
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index b0e5e210f1c1..0d06951e607c 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -11,6 +11,7 @@
 #include "kvm_cache_regs.h"
 #include "ops.h"
 #include "vmcs.h"
+#include "cpuid.h"
 
 extern const u32 vmx_msr_index[];
 
@@ -552,7 +553,7 @@ static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
 
 static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
 {
-   return !enable_ept;
+   return !enable_ept || cpuid_maxphyaddr(vcpu) < 
boot_cpu_data.x86_phys_bits;
 }
 
 void dump_vmcs(void);
-- 
2.26.2



[PATCH v3 3/9] KVM: x86: mmu: Add guest physical address check in translate_gpa()

2020-07-10 Thread Mohammed Gamal
When running a guest with 4-level page tables on a 5-level page table
host, the guest might have a physical address with reserved bits set,
but the host won't see that and trap it.

Hence, we need to check page faults' physical addresses against the guest's
maximum physical memory and if it's exceeded, we need to add
the PFERR_RSVD_MASK bits to the PF's error code.

Also make sure the error code isn't overwritten by the page table walker.

Signed-off-by: Mohammed Gamal 
Signed-off-by: Paolo Bonzini 
---
 arch/x86/kvm/mmu/mmu.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f8b3c5181466..e03e85b21cda 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -518,6 +518,12 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 
spte)
 static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
   struct x86_exception *exception)
 {
+   /* Check if guest physical address doesn't exceed guest maximum */
+   if (kvm_mmu_is_illegal_gpa(vcpu, gpa)) {
+   exception->error_code |= PFERR_RSVD_MASK;
+   return UNMAPPED_GVA;
+   }
+
 return gpa;
 }
 
-- 
2.26.2



[PATCH v3 1/9] KVM: x86: Add helper functions for illegal GPA checking and page fault injection

2020-07-10 Thread Mohammed Gamal
This patch adds two helper functions that will be used to support virtualizing
MAXPHYADDR in both kvm-intel.ko and kvm.ko.

kvm_fixup_and_inject_pf_error() injects a page fault for a user-specified GVA,
while kvm_mmu_is_illegal_gpa() checks whether a GPA exceeds vCPU address limits.

Signed-off-by: Mohammed Gamal 
Signed-off-by: Paolo Bonzini 
---
 arch/x86/kvm/mmu.h |  6 ++
 arch/x86/kvm/x86.c | 21 +
 arch/x86/kvm/x86.h |  1 +
 3 files changed, 28 insertions(+)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 444bb9c54548..59930231d5d5 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -4,6 +4,7 @@
 
 #include 
 #include "kvm_cache_regs.h"
+#include "cpuid.h"
 
 #define PT64_PT_BITS 9
 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
@@ -158,6 +159,11 @@ static inline bool is_write_protection(struct kvm_vcpu 
*vcpu)
return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
 }
 
+static inline bool kvm_mmu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu)));
+}
+
 /*
  * Check if a given access (described through the I/D, W/R and U/S bits of a
  * page fault error code pfec) causes a permission fault with the given PTE
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 88c593f83b28..1f5f4074fc59 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10699,6 +10699,27 @@ u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_spec_ctrl_valid_bits);
 
+void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 
error_code)
+{
+   struct x86_exception fault;
+
+   if (!(error_code & PFERR_PRESENT_MASK) ||
+   vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, error_code, ) != 
UNMAPPED_GVA) {
+   /*
+* If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
+* tables probably do not match the TLB.  Just proceed
+* with the error code that the processor gave.
+*/
+   fault.vector = PF_VECTOR;
+   fault.error_code_valid = true;
+   fault.error_code = error_code;
+   fault.nested_page_fault = false;
+   fault.address = gva;
+   }
+   vcpu->arch.walk_mmu->inject_page_fault(vcpu, );
+}
+EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 6eb62e97e59f..239ae0f3e40b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -272,6 +272,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 
*pdata);
 bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
  int page_num);
 bool kvm_vector_hashing_enabled(void);
+void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 
error_code);
 int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type, void *insn, int insn_len);
 fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
-- 
2.26.2



[PATCH v3 0/9] KVM: Support guest MAXPHYADDR < host MAXPHYADDR

2020-07-10 Thread Mohammed Gamal
When EPT is enabled, KVM does not really look at guest physical
address size. Address bits above maximum physical memory size are reserved.
Because KVM does not look at these guest physical addresses, it currently
effectively supports guest physical address sizes equal to the host.

This can be a problem when having a mixed setup of machines with 5-level page
tables and machines with 4-level page tables, as live migration can change
MAXPHYADDR while the guest runs, which can theoretically introduce bugs.

In this patch series we add checks on guest physical addresses in EPT
violation/misconfig and NPF vmexits and if needed inject the proper
page faults in the guest.

A more subtle issue is when the host MAXPHYADDR is larger than that of the
guest. Page faults caused by reserved bits on the guest won't cause an EPT
violation/NPF and hence we also check guest MAXPHYADDR and add PFERR_RSVD_MASK
error code to the page fault if needed.
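
For illustration only, the condition the series checks for boils down to
the following sketch (the real helper, kvm_mmu_is_illegal_gpa(), is added
in patch 1; the function name here is just for the example):

/* Sketch: a guest physical address has reserved bits set if it is at or
 * above 1 << guest MAXPHYADDR.  E.g. with a guest MAXPHYADDR of 36, any
 * GPA >= (1ULL << 36) must be reported as a reserved-bit page fault. */
static inline bool gpa_exceeds_guest_maxphyaddr(u64 gpa, int guest_maxphyaddr)
{
	return gpa >= (1ULL << guest_maxphyaddr);
}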



Changes from v2:
- Drop support for this feature on AMD processors after discussion with AMD


Mohammed Gamal (5):
  KVM: x86: Add helper functions for illegal GPA checking and page fault
injection
  KVM: x86: mmu: Move translate_gpa() to mmu.c
  KVM: x86: mmu: Add guest physical address check in translate_gpa()
  KVM: VMX: Add guest physical address check in EPT violation and
misconfig
  KVM: x86: SVM: VMX: Make GUEST_MAXPHYADDR < HOST_MAXPHYADDR support
configurable

Paolo Bonzini (4):
  KVM: x86: rename update_bp_intercept to update_exception_bitmap
  KVM: x86: update exception bitmap on CPUID changes
  KVM: VMX: introduce vmx_need_pf_intercept
  KVM: VMX: optimize #PF injection when MAXPHYADDR does not match

 arch/x86/include/asm/kvm_host.h | 10 ++--
 arch/x86/kvm/cpuid.c|  2 ++
 arch/x86/kvm/mmu.h  |  6 +
 arch/x86/kvm/mmu/mmu.c  | 12 +
 arch/x86/kvm/svm/svm.c  | 22 +---
 arch/x86/kvm/vmx/nested.c   | 28 
 arch/x86/kvm/vmx/vmx.c  | 45 +
 arch/x86/kvm/vmx/vmx.h  |  6 +
 arch/x86/kvm/x86.c  | 29 -
 arch/x86/kvm/x86.h  |  1 +
 include/uapi/linux/kvm.h|  1 +
 11 files changed, 133 insertions(+), 29 deletions(-)

-- 
2.26.2



[PATCH v3 5/9] KVM: x86: update exception bitmap on CPUID changes

2020-07-10 Thread Mohammed Gamal
From: Paolo Bonzini 

Allow vendor code to observe changes to MAXPHYADDR and start/stop
intercepting page faults.

Signed-off-by: Paolo Bonzini 
---
 arch/x86/kvm/cpuid.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 8a294f9747aa..ea5bbf2153bb 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -128,6 +128,8 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
kvm_mmu_reset_context(vcpu);
 
kvm_pmu_refresh(vcpu);
+   kvm_x86_ops.update_exception_bitmap(vcpu);
+
return 0;
 }
 
-- 
2.26.2



[PATCH v3 2/9] KVM: x86: mmu: Move translate_gpa() to mmu.c

2020-07-10 Thread Mohammed Gamal
There is also no point in it being inline since it's always called
through function pointers, so remove that.

Signed-off-by: Mohammed Gamal 
Signed-off-by: Paolo Bonzini 
---
 arch/x86/include/asm/kvm_host.h | 6 --
 arch/x86/kvm/mmu/mmu.c  | 6 ++
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index be5363b21540..62373cc06c72 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1551,12 +1551,6 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t 
new_pgd, bool skip_tlb_flush,
 
 void kvm_configure_mmu(bool enable_tdp, int tdp_page_level);
 
-static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
- struct x86_exception *exception)
-{
-   return gpa;
-}
-
 static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
 {
struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6d6a0ae7800c..f8b3c5181466 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -515,6 +515,12 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 
spte)
return likely(kvm_gen == spte_gen);
 }
 
+static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+  struct x86_exception *exception)
+{
+return gpa;
+}
+
 /*
  * Sets the shadow PTE masks used by the MMU.
  *
-- 
2.26.2





Re: [PATCH v2 00/11] KVM: Support guest MAXPHYADDR < host MAXPHYADDR

2020-06-22 Thread Mohammed Gamal
On Fri, 2020-06-19 at 16:52 -0500, Tom Lendacky wrote:
> On 6/19/20 10:39 AM, Mohammed Gamal wrote:
> > When EPT/NPT is enabled, KVM does not really look at guest physical
> > address size. Address bits above maximum physical memory size are
> > reserved.
> > Because KVM does not look at these guest physical addresses, it
> > currently
> > effectively supports guest physical address sizes equal to the
> > host.
> > 
> > This can be problem when having a mixed setup of machines with 5-
> > level page
> > tables and machines with 4-level page tables, as live migration can
> > change
> > MAXPHYADDR while the guest runs, which can theoretically introduce
> > bugs.
> > 
> > In this patch series we add checks on guest physical addresses in
> > EPT
> > violation/misconfig and NPF vmexits and if needed inject the proper
> > page faults in the guest.
> > 
> > A more subtle issue is when the host MAXPHYADDR is larger than that
> > of the
> > guest. Page faults caused by reserved bits on the guest won't cause
> > an EPT
> > violation/NPF and hence we also check guest MAXPHYADDR and add
> > PFERR_RSVD_MASK
> > error code to the page fault if needed.
> 
> I'm probably missing something here, but I'm confused by this
> statement. 
> Is this for a case where a page has been marked not present and the
> guest 
> has also set what it believes are reserved bits? Then when the page
> is 
> accessed, the guest sees a page fault without the error code for
> reserved 
> bits? If so, my understanding is that is architecturally correct. P=0
> is 
> considered higher priority than other page faults, at least on AMD.
> So if 
> you have a P=0 and other issues exist within the PTE, AMD will report
> the 
> P=0 fault and that's it.
> 
> The priority of other page fault conditions when P=1 is not defined
> and I 
> don't think we guarantee that you would get all error codes on
> fault. 
> Software is always expected to address the page fault and retry, and
> it 
> may get another page fault when it does, with a different error
> code. 
> Assuming the other errors are addressed, eventually the reserved
> bits 
> would cause an NPF and that could be detected by the HV and handled 
> appropriately.
> 
> > The last 3 patches (i.e. SVM bits and patch 11) are not intended
> > for
> > immediate inclusion and probably need more discussion.
> > We've been noticing some unexpected behavior in handling NPF
> > vmexits
> > on AMD CPUs (see individual patches for details), and thus we are
> > proposing a workaround (see last patch) that adds a capability that
> > userspace can use to decide who to deal with hosts that might have
> > issues supprting guest MAXPHYADDR < host MAXPHYADDR.
> 
> Also, something to consider. On AMD, when memory encryption is
> enabled 
> (via the SYS_CFG MSR), a guest can actually have a larger MAXPHYADDR
> than 
> the host. How do these patches all play into that?

Well the patches definitely don't address that case. It's assumed a
guest VM's MAXPHYADDR <= host MAXPHYADDR, and hence we handle the case
where a guest's physical address space is smaller and try to trap
faults that may go unnoticed by the host.

My question is in the case of guest MAXPHYADDR > host MAXPHYADDR, do we
expect somehow that there might be guest physical addresses that
contain what the host could see as reserved bits? And how'd the host
handle that?

Thanks,
Mohammed

> 
> Thanks,
> Tom
> 
> > 
> > Mohammed Gamal (7):
> >KVM: x86: Add helper functions for illegal GPA checking and page
> > fault
> >  injection
> >KVM: x86: mmu: Move translate_gpa() to mmu.c
> >KVM: x86: mmu: Add guest physical address check in
> > translate_gpa()
> >KVM: VMX: Add guest physical address check in EPT violation and
> >  misconfig
> >KVM: SVM: introduce svm_need_pf_intercept
> >KVM: SVM: Add guest physical address check in NPF/PF
> > interception
> >KVM: x86: SVM: VMX: Make GUEST_MAXPHYADDR < HOST_MAXPHYADDR
> > support
> >  configurable
> > 
> > Paolo Bonzini (4):
> >KVM: x86: rename update_bp_intercept to update_exception_bitmap
> >KVM: x86: update exception bitmap on CPUID changes
> >KVM: VMX: introduce vmx_need_pf_intercept
> >KVM: VMX: optimize #PF injection when MAXPHYADDR does not match
> > 
> >   arch/x86/include/asm/kvm_host.h | 10 ++--
> >   arch/x86/kvm/cpuid.c|  2 ++
> >   arch/x86/kvm/mmu.h  |  6 +
> >   arch/x86/kvm/mmu/mmu.c  | 12 +
> >

Re: [PATCH v2 01/11] KVM: x86: Add helper functions for illegal GPA checking and page fault injection

2020-06-22 Thread Mohammed Gamal
On Mon, 2020-06-22 at 12:44 +0800, Yuan Yao wrote:
> On Fri, Jun 19, 2020 at 05:39:15PM +0200, Mohammed Gamal wrote:
> > This patch adds two helper functions that will be used to support
> > virtualizing
> > MAXPHYADDR in both kvm-intel.ko and kvm.ko.
> > 
> > kvm_fixup_and_inject_pf_error() injects a page fault for a user-
> > specified GVA,
> > while kvm_mmu_is_illegal_gpa() checks whether a GPA exceeds vCPU
> > address limits.
> > 
> > Signed-off-by: Mohammed Gamal 
> > Signed-off-by: Paolo Bonzini 
> > ---
> >  arch/x86/kvm/mmu.h |  6 ++
> >  arch/x86/kvm/x86.c | 21 +
> >  arch/x86/kvm/x86.h |  1 +
> >  3 files changed, 28 insertions(+)
> > 
> > diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> > index 0ad06bfe2c2c..555237dfb91c 100644
> > --- a/arch/x86/kvm/mmu.h
> > +++ b/arch/x86/kvm/mmu.h
> > @@ -4,6 +4,7 @@
> >  
> >  #include 
> >  #include "kvm_cache_regs.h"
> > +#include "cpuid.h"
> >  
> >  #define PT64_PT_BITS 9
> >  #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
> > @@ -158,6 +159,11 @@ static inline bool is_write_protection(struct
> > kvm_vcpu *vcpu)
> > return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
> >  }
> >  
> > +static inline bool kvm_mmu_is_illegal_gpa(struct kvm_vcpu *vcpu,
> > gpa_t gpa)
> > +{
> > +return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu)));
> > +}
> > +
> >  /*
> >   * Check if a given access (described through the I/D, W/R and U/S
> > bits of a
> >   * page fault error code pfec) causes a permission fault with the
> > given PTE
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 00c88c2f34e4..ac8642e890b1 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -10693,6 +10693,27 @@ u64 kvm_spec_ctrl_valid_bits(struct
> > kvm_vcpu *vcpu)
> >  }
> >  EXPORT_SYMBOL_GPL(kvm_spec_ctrl_valid_bits);
> >  
> > +void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t
> > gva, u16 error_code)
> > +{
> > +   struct x86_exception fault;
> > +
> > +   if (!(error_code & PFERR_PRESENT_MASK) ||
> > +   vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, error_code,
> > ) != UNMAPPED_GVA) {
> > +   /*
> > +* If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the
> > page
> > +* tables probably do not match the TLB.  Just proceed
> > +* with the error code that the processor gave.
> > +*/
> > +   fault.vector = PF_VECTOR;
> > +   fault.error_code_valid = true;
> > +   fault.error_code = error_code;
> > +   fault.nested_page_fault = false;
> > +   fault.address = gva;
> > +   }
> > +   vcpu->arch.walk_mmu->inject_page_fault(vcpu, );
> 
> Should this "vcpu->arch.walk_mmu->inject_page_fault(vcpu, )"
> inside the last brace?
> Otherwise an uninitialized fault variable will be passed to the
> walk_mmu->inject_page_fault.

Good catch. You're right. Will fix it in v3

> 
> > +}
> > +EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
> > +
> >  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
> >  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
> >  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
> > diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> > index 6eb62e97e59f..239ae0f3e40b 100644
> > --- a/arch/x86/kvm/x86.h
> > +++ b/arch/x86/kvm/x86.h
> > @@ -272,6 +272,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32
> > msr, u64 *pdata);
> >  bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu,
> > gfn_t gfn,
> >   int page_num);
> >  bool kvm_vector_hashing_enabled(void);
> > +void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t
> > gva, u16 error_code);
> >  int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t
> > cr2_or_gpa,
> > int emulation_type, void *insn, int
> > insn_len);
> >  fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
> > -- 
> > 2.26.2
> > 



[PATCH v2 00/11] KVM: Support guest MAXPHYADDR < host MAXPHYADDR

2020-06-19 Thread Mohammed Gamal
When EPT/NPT is enabled, KVM does not really look at guest physical
address size. Address bits above maximum physical memory size are reserved.
Because KVM does not look at these guest physical addresses, it currently
effectively supports guest physical address sizes equal to the host.

This can be a problem when having a mixed setup of machines with 5-level page
tables and machines with 4-level page tables, as live migration can change
MAXPHYADDR while the guest runs, which can theoretically introduce bugs.

In this patch series we add checks on guest physical addresses in EPT
violation/misconfig and NPF vmexits and if needed inject the proper
page faults in the guest.

A more subtle issue is when the host MAXPHYADDR is larger than that of the
guest. Page faults caused by reserved bits on the guest won't cause an EPT
violation/NPF and hence we also check guest MAXPHYADDR and add PFERR_RSVD_MASK
error code to the page fault if needed.

The last 3 patches (i.e. SVM bits and patch 11) are not intended for
immediate inclusion and probably need more discussion.
We've been noticing some unexpected behavior in handling NPF vmexits
on AMD CPUs (see individual patches for details), and thus we are
proposing a workaround (see last patch) that adds a capability that
userspace can use to decide how to deal with hosts that might have
issues supporting guest MAXPHYADDR < host MAXPHYADDR.


Mohammed Gamal (7):
  KVM: x86: Add helper functions for illegal GPA checking and page fault
injection
  KVM: x86: mmu: Move translate_gpa() to mmu.c
  KVM: x86: mmu: Add guest physical address check in translate_gpa()
  KVM: VMX: Add guest physical address check in EPT violation and
misconfig
  KVM: SVM: introduce svm_need_pf_intercept
  KVM: SVM: Add guest physical address check in NPF/PF interception
  KVM: x86: SVM: VMX: Make GUEST_MAXPHYADDR < HOST_MAXPHYADDR support
configurable

Paolo Bonzini (4):
  KVM: x86: rename update_bp_intercept to update_exception_bitmap
  KVM: x86: update exception bitmap on CPUID changes
  KVM: VMX: introduce vmx_need_pf_intercept
  KVM: VMX: optimize #PF injection when MAXPHYADDR does not match

 arch/x86/include/asm/kvm_host.h | 10 ++--
 arch/x86/kvm/cpuid.c|  2 ++
 arch/x86/kvm/mmu.h  |  6 +
 arch/x86/kvm/mmu/mmu.c  | 12 +
 arch/x86/kvm/svm/svm.c  | 41 +++---
 arch/x86/kvm/svm/svm.h  |  6 +
 arch/x86/kvm/vmx/nested.c   | 28 
 arch/x86/kvm/vmx/vmx.c  | 45 +
 arch/x86/kvm/vmx/vmx.h  |  6 +
 arch/x86/kvm/x86.c  | 29 -
 arch/x86/kvm/x86.h  |  1 +
 include/uapi/linux/kvm.h|  1 +
 12 files changed, 158 insertions(+), 29 deletions(-)

-- 
2.26.2



[PATCH v2 01/11] KVM: x86: Add helper functions for illegal GPA checking and page fault injection

2020-06-19 Thread Mohammed Gamal
This patch adds two helper functions that will be used to support virtualizing
MAXPHYADDR in both kvm-intel.ko and kvm.ko.

kvm_fixup_and_inject_pf_error() injects a page fault for a user-specified GVA,
while kvm_mmu_is_illegal_gpa() checks whether a GPA exceeds vCPU address limits.

Signed-off-by: Mohammed Gamal 
Signed-off-by: Paolo Bonzini 
---
 arch/x86/kvm/mmu.h |  6 ++
 arch/x86/kvm/x86.c | 21 +
 arch/x86/kvm/x86.h |  1 +
 3 files changed, 28 insertions(+)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 0ad06bfe2c2c..555237dfb91c 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -4,6 +4,7 @@
 
 #include 
 #include "kvm_cache_regs.h"
+#include "cpuid.h"
 
 #define PT64_PT_BITS 9
 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
@@ -158,6 +159,11 @@ static inline bool is_write_protection(struct kvm_vcpu 
*vcpu)
return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
 }
 
+static inline bool kvm_mmu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu)));
+}
+
 /*
  * Check if a given access (described through the I/D, W/R and U/S bits of a
  * page fault error code pfec) causes a permission fault with the given PTE
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 00c88c2f34e4..ac8642e890b1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10693,6 +10693,27 @@ u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_spec_ctrl_valid_bits);
 
+void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 
error_code)
+{
+   struct x86_exception fault;
+
+   if (!(error_code & PFERR_PRESENT_MASK) ||
+   vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, error_code, ) != 
UNMAPPED_GVA) {
+   /*
+* If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
+* tables probably do not match the TLB.  Just proceed
+* with the error code that the processor gave.
+*/
+   fault.vector = PF_VECTOR;
+   fault.error_code_valid = true;
+   fault.error_code = error_code;
+   fault.nested_page_fault = false;
+   fault.address = gva;
+   }
+   vcpu->arch.walk_mmu->inject_page_fault(vcpu, );
+}
+EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 6eb62e97e59f..239ae0f3e40b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -272,6 +272,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 
*pdata);
 bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
  int page_num);
 bool kvm_vector_hashing_enabled(void);
+void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 
error_code);
 int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type, void *insn, int insn_len);
 fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
-- 
2.26.2



[PATCH v2 03/11] KVM: x86: mmu: Add guest physical address check in translate_gpa()

2020-06-19 Thread Mohammed Gamal
In case of running a guest with 4-level page tables on a 5-level page
table host, it might happen that the guest uses a physical address with
reserved bits set, but the host won't see that and trap it.

Hence, we need to check page faults' physical addresses against the
guest's maximum physical address and, if it is exceeded, add the
PFERR_RSVD_MASK bit to the page fault's error code.

Also make sure the error code isn't overwritten by the page table walker.

Signed-off-by: Mohammed Gamal 
Signed-off-by: Paolo Bonzini 
---
 arch/x86/kvm/mmu/mmu.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index ee113fc1f1bf..10409b76b2d8 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -518,6 +518,12 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 
spte)
 static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
   struct x86_exception *exception)
 {
+   /* Check if guest physical address doesn't exceed guest maximum */
+   if (kvm_mmu_is_illegal_gpa(vcpu, gpa)) {
+   exception->error_code |= PFERR_RSVD_MASK;
+   return UNMAPPED_GVA;
+   }
+
 return gpa;
 }
 
-- 
2.26.2



[PATCH v2 04/11] KVM: x86: rename update_bp_intercept to update_exception_bitmap

2020-06-19 Thread Mohammed Gamal
From: Paolo Bonzini 

We would like to introduce a callback to update the #PF intercept
when CPUID changes.  Just reuse update_bp_intercept since VMX is
already using update_exception_bitmap instead of a bespoke function.

While at it, remove an unnecessary assignment in the SVM version,
which is already done in the caller (kvm_arch_vcpu_ioctl_set_guest_debug)
and has nothing to do with the exception bitmap.

Signed-off-by: Paolo Bonzini 
---
 arch/x86/include/asm/kvm_host.h | 2 +-
 arch/x86/kvm/svm/svm.c  | 7 +++
 arch/x86/kvm/vmx/vmx.c  | 2 +-
 arch/x86/kvm/x86.c  | 2 +-
 4 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bc0fb116cc5c..7ebdb43632e0 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1098,7 +1098,7 @@ struct kvm_x86_ops {
void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
void (*vcpu_put)(struct kvm_vcpu *vcpu);
 
-   void (*update_bp_intercept)(struct kvm_vcpu *vcpu);
+   void (*update_exception_bitmap)(struct kvm_vcpu *vcpu);
int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 8ccfa4197d9c..94108e6cc6da 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1627,7 +1627,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
mark_dirty(svm->vmcb, VMCB_SEG);
 }
 
-static void update_bp_intercept(struct kvm_vcpu *vcpu)
+static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 {
struct vcpu_svm *svm = to_svm(vcpu);
 
@@ -1636,8 +1636,7 @@ static void update_bp_intercept(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
set_exception_intercept(svm, BP_VECTOR);
-   } else
-   vcpu->guest_debug = 0;
+   }
 }
 
 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
@@ -3989,7 +3988,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vcpu_blocking = svm_vcpu_blocking,
.vcpu_unblocking = svm_vcpu_unblocking,
 
-   .update_bp_intercept = update_bp_intercept,
+   .update_exception_bitmap = update_exception_bitmap,
.get_msr_feature = svm_get_msr_feature,
.get_msr = svm_get_msr,
.set_msr = svm_set_msr,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 36c771728c8c..f82c42ac87f9 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7881,7 +7881,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
 
-   .update_bp_intercept = update_exception_bitmap,
+   .update_exception_bitmap = update_exception_bitmap,
.get_msr_feature = vmx_get_msr_feature,
.get_msr = vmx_get_msr,
.set_msr = vmx_set_msr,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ac8642e890b1..84f1f0084d2e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9275,7 +9275,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu 
*vcpu,
 */
kvm_set_rflags(vcpu, rflags);
 
-   kvm_x86_ops.update_bp_intercept(vcpu);
+   kvm_x86_ops.update_exception_bitmap(vcpu);
 
r = 0;
 
-- 
2.26.2



[PATCH v2 02/11] KVM: x86: mmu: Move translate_gpa() to mmu.c

2020-06-19 Thread Mohammed Gamal
There is also no point in it being inline, since it is always called
through function pointers. So remove that.

Signed-off-by: Mohammed Gamal 
Signed-off-by: Paolo Bonzini 
---
 arch/x86/include/asm/kvm_host.h | 6 --
 arch/x86/kvm/mmu/mmu.c  | 6 ++
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f8998e97457f..bc0fb116cc5c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1551,12 +1551,6 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t 
new_pgd, bool skip_tlb_flush,
 
 void kvm_configure_mmu(bool enable_tdp, int tdp_page_level);
 
-static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
- struct x86_exception *exception)
-{
-   return gpa;
-}
-
 static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
 {
struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index fdd05c233308..ee113fc1f1bf 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -515,6 +515,12 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 
spte)
return likely(kvm_gen == spte_gen);
 }
 
+static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
+  struct x86_exception *exception)
+{
+return gpa;
+}
+
 /*
  * Sets the shadow PTE masks used by the MMU.
  *
-- 
2.26.2



[PATCH v2 06/11] KVM: VMX: introduce vmx_need_pf_intercept

2020-06-19 Thread Mohammed Gamal
From: Paolo Bonzini 

Signed-off-by: Paolo Bonzini 
---
 arch/x86/kvm/vmx/nested.c | 28 +---
 arch/x86/kvm/vmx/vmx.c|  2 +-
 arch/x86/kvm/vmx/vmx.h|  5 +
 3 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index d1af20b050a8..328411919518 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2433,22 +2433,28 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, 
struct vmcs12 *vmcs12)
 
/*
 * Whether page-faults are trapped is determined by a combination of
-* 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
-* If enable_ept, L0 doesn't care about page faults and we should
-* set all of these to L1's desires. However, if !enable_ept, L0 does
-* care about (at least some) page faults, and because it is not easy
-* (if at all possible?) to merge L0 and L1's desires, we simply ask
-* to exit on each and every L2 page fault. This is done by setting
-* MASK=MATCH=0 and (see below) EB.PF=1.
+* 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.  If L0
+* doesn't care about page faults then we should set all of these to
+* L1's desires. However, if L0 does care about (some) page faults, it
+* is not easy (if at all possible?) to merge L0 and L1's desires, we
+* simply ask to exit on each and every L2 page fault. This is done by
+* setting MASK=MATCH=0 and (see below) EB.PF=1.
 * Note that below we don't need special code to set EB.PF beyond the
 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
 */
-   vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
-   enable_ept ? vmcs12->page_fault_error_code_mask : 0);
-   vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
-   enable_ept ? vmcs12->page_fault_error_code_match : 0);
+   if (vmx_need_pf_intercept(>vcpu)) {
+   /*
+* TODO: if both L0 and L1 need the same MASK and MATCH,
+* go ahead and use it?
+*/
+   vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
+   vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
+   } else {
+   vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 
vmcs12->page_fault_error_code_mask);
+   vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 
vmcs12->page_fault_error_code_match);
+   }
 
if (cpu_has_vmx_apicv()) {
vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f82c42ac87f9..46d522ee5cb1 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -783,7 +783,7 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu)
eb |= 1u << BP_VECTOR;
if (to_vmx(vcpu)->rmode.vm86_active)
eb = ~0;
-   if (enable_ept)
+   if (!vmx_need_pf_intercept(vcpu))
eb &= ~(1u << PF_VECTOR);
 
/* When we are running a nested L2 guest and L1 specified for it a
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 8a83b5edc820..5e2da15fe94f 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -552,6 +552,11 @@ static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
 }
 
+static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
+{
+   return !enable_ept;
+}
+
 void dump_vmcs(void);
 
 #endif /* __KVM_X86_VMX_H */
-- 
2.26.2



[PATCH v2 09/11] KVM: SVM: introduce svm_need_pf_intercept

2020-06-19 Thread Mohammed Gamal
CC: Tom Lendacky 
CC: Babu Moger 
Signed-off-by: Mohammed Gamal 
---
 arch/x86/kvm/svm/svm.c | 8 
 arch/x86/kvm/svm/svm.h | 6 ++
 2 files changed, 14 insertions(+)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 94108e6cc6da..05412818027d 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1087,6 +1087,9 @@ static void init_vmcb(struct vcpu_svm *svm)
}
svm->asid_generation = 0;
 
+   if (svm_need_pf_intercept(svm))
+   set_exception_intercept(svm, PF_VECTOR);
+
svm->nested.vmcb = 0;
svm->vcpu.arch.hflags = 0;
 
@@ -1633,6 +1636,11 @@ static void update_exception_bitmap(struct kvm_vcpu 
*vcpu)
 
clr_exception_intercept(svm, BP_VECTOR);
 
+   if (svm_need_pf_intercept(svm))
+   set_exception_intercept(svm, PF_VECTOR);
+   else
+   clr_exception_intercept(svm, PF_VECTOR);
+
if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
set_exception_intercept(svm, BP_VECTOR);
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 6ac4c00a5d82..2b7469f3db0e 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -19,6 +19,7 @@
 #include 
 
 #include 
+#include "cpuid.h"
 
 static const u32 host_save_user_msrs[] = {
 #ifdef CONFIG_X86_64
@@ -345,6 +346,11 @@ static inline bool gif_set(struct vcpu_svm *svm)
return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
 }
 
+static inline bool svm_need_pf_intercept(struct vcpu_svm *svm)
+{
+return !npt_enabled;
+}
+
 /* svm.c */
 #define MSR_INVALID 0xffffffffU
 
-- 
2.26.2



[PATCH v2 11/11] KVM: x86: SVM: VMX: Make GUEST_MAXPHYADDR < HOST_MAXPHYADDR support configurable

2020-06-19 Thread Mohammed Gamal
The reason behind including this patch is unexpected behaviour we see
with NPT vmexit handling on AMD processors.

With the previous patch ("KVM: SVM: Add guest physical address check in
NPF/PF interception") we see the following error multiple times in
the 'access' test in kvm-unit-tests:

test pte.p pte.36 pde.p: FAIL: pte 221 expected 201
Dump mapping: address: 0x1234
--L4: 24c3027
--L3: 24c4027
--L2: 24c5021
--L1: 100221

This shows that the PTE's accessed bit is apparently being set by
the CPU hardware before the NPF vmexit. This is completely handled by
hardware and cannot be fixed in software.

This patch introduces a workaround. We add a boolean variable
'allow_smaller_maxphyaddr' which is set individually by the VMX and SVM
init routines. On VMX it's always set to true; on SVM it's only set to
true when NPT is not enabled.

We also add a new capability, KVM_CAP_SMALLER_MAXPHYADDR, which allows
userspace to query whether the underlying architecture would support
GUEST_MAXPHYADDR < HOST_MAXPHYADDR and hence act accordingly (e.g. QEMU
can decide whether to ignore the -cpu ..,phys-bits=X option).
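
For illustration, a minimal userspace query of the new capability could
look roughly like this (sketch; error handling omitted, and it assumes a
kernel with this series applied so that linux/kvm.h defines the new cap):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
            int kvm_fd = open("/dev/kvm", O_RDWR);
            /* > 0 means KVM can emulate GUEST_MAXPHYADDR < HOST_MAXPHYADDR */
            int r = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_SMALLER_MAXPHYADDR);

            printf("smaller guest MAXPHYADDR %ssupported\n", r > 0 ? "" : "not ");
            return 0;
    }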

CC: Tom Lendacky 
CC: Babu Moger 
Signed-off-by: Mohammed Gamal 
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/svm/svm.c  | 15 +++
 arch/x86/kvm/vmx/vmx.c  |  7 +++
 arch/x86/kvm/x86.c  |  6 ++
 include/uapi/linux/kvm.h|  1 +
 5 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7ebdb43632e0..b25f7497307d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1304,7 +1304,7 @@ struct kvm_arch_async_pf {
 };
 
 extern u64 __read_mostly host_efer;
-
+extern bool __read_mostly allow_smaller_maxphyaddr;
 extern struct kvm_x86_ops kvm_x86_ops;
 
 #define __KVM_HAVE_ARCH_VM_ALLOC
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index ec3224a2e7c2..1b8880b89e9f 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -924,6 +924,21 @@ static __init int svm_hardware_setup(void)
 
svm_set_cpu_caps();
 
+   /*
+* It seems that on AMD processors PTE's accessed bit is
+* being set by the CPU hardware before the NPF vmexit.
+* This is not expected behaviour and our tests fail because
+* of it.
+* A workaround here is to disable support for
+* GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
+* In this case userspace can know if there is support using
+* KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
+* it
+* If future AMD CPU models change the behaviour described above,
+* this variable can be changed accordingly
+*/
+   allow_smaller_maxphyaddr = !npt_enabled;
+
return 0;
 
 err:
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 8daf78b2d4cb..fe0ca39c0887 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -8316,6 +8316,13 @@ static int __init vmx_init(void)
 #endif
vmx_check_vmcs12_offsets();
 
+   /*
+* Intel processors don't have problems with
+* GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable
+* it for VMX by default
+*/
+   allow_smaller_maxphyaddr = true;
+
return 0;
 }
 module_init(vmx_init);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 84f1f0084d2e..5bca6d6d24e9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -187,6 +187,9 @@ static struct kvm_shared_msrs __percpu *shared_msrs;
 u64 __read_mostly host_efer;
 EXPORT_SYMBOL_GPL(host_efer);
 
+bool __read_mostly allow_smaller_maxphyaddr;
+EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
+
 static u64 __read_mostly host_xss;
 u64 __read_mostly supported_xss;
 EXPORT_SYMBOL_GPL(supported_xss);
@@ -3533,6 +3536,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long 
ext)
case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
break;
+   case KVM_CAP_SMALLER_MAXPHYADDR:
+   r = (int) allow_smaller_maxphyaddr;
+   break;
default:
break;
}
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 4fdf30316582..68cd3a0af9bb 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1031,6 +1031,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_PPC_SECURE_GUEST 181
 #define KVM_CAP_HALT_POLL 182
 #define KVM_CAP_ASYNC_PF_INT 183
+#define KVM_CAP_SMALLER_MAXPHYADDR 184
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.26.2



[PATCH v2 10/11] KVM: SVM: Add guest physical address check in NPF/PF interception

2020-06-19 Thread Mohammed Gamal
Check the guest physical address against the guest's maximum physical
address. If the guest's physical address exceeds the maximum (i.e. has
reserved bits set), inject a guest page fault with PFERR_RSVD_MASK set.

Similar to VMX, this has to be done both in the NPF and page fault
interceptions, as there are complications in both cases with respect to
the computation of the correct error code.

For NPF interceptions, unfortunately the only possibility is to emulate,
because the access type in the exit qualification might refer to an
access to a paging structure, rather than to the access performed by
the program.

Trapping page faults instead is needed in order to correct the error code,
but the access type can be obtained from the original error code and
passed to gva_to_gpa.  The corrections required in the error code are
subtle. For example, imagine that a PTE for a supervisor page has a reserved
bit set.  On a supervisor-mode access, the EPT violation path would trigger.
However, on a user-mode access, the processor will not notice the reserved
bit and not include PFERR_RSVD_MASK in the error code.

CC: Tom Lendacky 
CC: Babu Moger 
Signed-off-by: Mohammed Gamal 
---
 arch/x86/kvm/svm/svm.c | 11 +++
 arch/x86/kvm/svm/svm.h |  2 +-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 05412818027d..ec3224a2e7c2 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1702,6 +1702,12 @@ static int pf_interception(struct vcpu_svm *svm)
u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
u64 error_code = svm->vmcb->control.exit_info_1;
 
+   if (npt_enabled && !svm->vcpu.arch.apf.host_apf_flags) {
+   kvm_fixup_and_inject_pf_error(&svm->vcpu,
+   fault_address, error_code);
+   return 1;
+   }
+
return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
svm->vmcb->control.insn_bytes : NULL,
@@ -1714,6 +1720,11 @@ static int npf_interception(struct vcpu_svm *svm)
u64 error_code = svm->vmcb->control.exit_info_1;
 
trace_kvm_page_fault(fault_address, error_code);
+
+   /* Check if guest gpa doesn't exceed physical memory limits */
+   if (unlikely(kvm_mmu_is_illegal_gpa(&svm->vcpu, fault_address)))
+   return kvm_emulate_instruction(&svm->vcpu, 0);
+
return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
svm->vmcb->control.insn_bytes : NULL,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 2b7469f3db0e..12b502e36dbd 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -348,7 +348,7 @@ static inline bool gif_set(struct vcpu_svm *svm)
 
 static inline bool svm_need_pf_intercept(struct vcpu_svm *svm)
 {
-return !npt_enabled;
+return !npt_enabled || cpuid_maxphyaddr(>vcpu) < 
boot_cpu_data.x86_phys_bits;
 }
 
 /* svm.c */
-- 
2.26.2



[PATCH v2 07/11] KVM: VMX: Add guest physical address check in EPT violation and misconfig

2020-06-19 Thread Mohammed Gamal
Check the guest physical address against the guest's maximum physical
address. If the guest's physical address exceeds the maximum (i.e. has
reserved bits set), inject a guest page fault with PFERR_RSVD_MASK set.

This has to be done both in the EPT violation and page fault paths, as
there are complications in both cases with respect to the computation
of the correct error code.

For EPT violations, unfortunately the only possibility is to emulate,
because the access type in the exit qualification might refer to an
access to a paging structure, rather than to the access performed by
the program.

Trapping page faults instead is needed in order to correct the error code,
but the access type can be obtained from the original error code and
passed to gva_to_gpa.  The corrections required in the error code are
subtle. For example, imagine that a PTE for a supervisor page has a reserved
bit set.  On a supervisor-mode access, the EPT violation path would trigger.
However, on a user-mode access, the processor will not notice the reserved
bit and not include PFERR_RSVD_MASK in the error code.

Co-developed-by: Mohammed Gamal 
Signed-off-by: Paolo Bonzini 
---
 arch/x86/kvm/vmx/vmx.c | 24 +---
 arch/x86/kvm/vmx/vmx.h |  3 ++-
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 46d522ee5cb1..f38cbadcb3a5 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4793,9 +4793,15 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
 
if (is_page_fault(intr_info)) {
cr2 = vmx_get_exit_qual(vcpu);
-   /* EPT won't cause page fault directly */
-   WARN_ON_ONCE(!vcpu->arch.apf.host_apf_flags && enable_ept);
-   return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
+   if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
+   /*
+* EPT will cause page fault only if we need to
+* detect illegal GPAs.
+*/
+   kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
+   return 1;
+   } else
+   return kvm_handle_page_fault(vcpu, error_code, cr2, 
NULL, 0);
}
 
ex_no = intr_info & INTR_INFO_VECTOR_MASK;
@@ -5311,6 +5317,18 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
   PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
 
vcpu->arch.exit_qualification = exit_qualification;
+
+   /*
+* Check that the GPA doesn't exceed physical memory limits, as that is
+* a guest page fault.  We have to emulate the instruction here, because
+* if the illegal address is that of a paging structure, then
+* EPT_VIOLATION_ACC_WRITE bit is set.  Alternatively, if supported we
+* would also use advanced VM-exit information for EPT violations to
+* reconstruct the page fault error code.
+*/
+   if (unlikely(kvm_mmu_is_illegal_gpa(vcpu, gpa)))
+   return kvm_emulate_instruction(vcpu, 0);
+
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
 }
 
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 5e2da15fe94f..950ecf237558 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -11,6 +11,7 @@
 #include "kvm_cache_regs.h"
 #include "ops.h"
 #include "vmcs.h"
+#include "cpuid.h"
 
 extern const u32 vmx_msr_index[];
 
@@ -554,7 +555,7 @@ static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
 
 static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
 {
-   return !enable_ept;
+   return !enable_ept || cpuid_maxphyaddr(vcpu) < 
boot_cpu_data.x86_phys_bits;
 }
 
 void dump_vmcs(void);
-- 
2.26.2



[PATCH v2 05/11] KVM: x86: update exception bitmap on CPUID changes

2020-06-19 Thread Mohammed Gamal
From: Paolo Bonzini 

Allow vendor code to observe changes to MAXPHYADDR and start/stop
intercepting page faults.

Signed-off-by: Paolo Bonzini 
---
 arch/x86/kvm/cpuid.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 8a294f9747aa..ea5bbf2153bb 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -128,6 +128,8 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
kvm_mmu_reset_context(vcpu);
 
kvm_pmu_refresh(vcpu);
+   kvm_x86_ops.update_exception_bitmap(vcpu);
+
return 0;
 }
 
-- 
2.26.2



[PATCH v2 08/11] KVM: VMX: optimize #PF injection when MAXPHYADDR does not match

2020-06-19 Thread Mohammed Gamal
From: Paolo Bonzini 

Ignore non-present page faults, since those cannot have reserved
bits set.

When running access.flat with "-cpu Haswell,phys-bits=36", the
number of trapped page faults goes down from 8872644 to 3978948.

Signed-off-by: Paolo Bonzini 
---
 arch/x86/kvm/vmx/vmx.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f38cbadcb3a5..8daf78b2d4cb 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -4358,6 +4358,16 @@ static void init_vmcs(struct vcpu_vmx *vmx)
vmx->pt_desc.guest.output_mask = 0x7F;
vmcs_write64(GUEST_IA32_RTIT_CTL, 0);
}
+
+   /*
+* If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched
+* between guest and host.  In that case we only care about present
+* faults.
+*/
+   if (enable_ept) {
+   vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, PFERR_PRESENT_MASK);
+   vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, PFERR_PRESENT_MASK);
+   }
 }
 
 static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
-- 
2.26.2



Re: ARM: vmsplit 4g/4g

2020-06-15 Thread afzal mohammed
Hi Linus,

On Mon, Jun 15, 2020 at 11:11:04AM +0200, Linus Walleij wrote:

> OK I would be very happy to look at it so I can learn a bit about the
> hands-on and general approach here. Just send it to this address
> directly and I will look!

Have sent it

> > For the next 3 weeks, right now, i cannot say whether i would be able
> > to spend time on it, perhaps might be possible, but only during that
> > time i will know.
> 
> I'm going for vacation the next 2 weeks or so, but then it'd be great if
> we can start looking at this in-depth!

Yes for me too

Regards
afzal


Re: [RFC 1/3] lib: copy_{from,to}_user using gup & kmap_atomic()

2020-06-14 Thread afzal mohammed
Hi,

On Sun, Jun 14, 2020 at 06:51:43PM +0530, afzal mohammed wrote:

> It is MB/s for copying one file to another via user space buffer, i.e.
> the value coreutils 'dd' shows w/ status=progress (here it was busybox
> 'dd', so instead it was enabling a compile time option)

Just for correctness, status=progress is not required, it's there in
the default 3rd line of coreutils 'dd' o/p

Regards
afzal


Re: [RFC 1/3] lib: copy_{from,to}_user using gup & kmap_atomic()

2020-06-14 Thread afzal mohammed
Hi,

On Sat, Jun 13, 2020 at 10:45:33PM +0200, Arnd Bergmann wrote:

> 4% boot time increase sounds like a lot, especially if that is only for
> copy_from_user/copy_to_user. In the end it really depends on how well
> get_user()/put_user() and small copies can be optimized in the end.

i mentioned the worst case (it happened only once); normally it was in
the range of 2-3%

> From the numbers you
> measured, it seems the beaglebone currently needs an extra ~6µs or
> 3µs per copy_to/from_user() call with your patch, depending on what
> your benchmark was (MB/s for just reading or writing vs MB/s for
> copying from one file to another through a user space buffer).

It is MB/s for copying one file to another via user space buffer, i.e.
the value coreutils 'dd' shows w/ status=progress (here it was busybox
'dd', so instead it was enabling a compile time option)

> but if you want to test what the overhead is, you could try changing
> /dev/zero (or a different chardev like it) to use a series of
> put_user(0, u32uptr++) in place of whatever it has, and then replace the
> 'str' instruction with dummy writes to ttbr0 using the value it already
> has, like:
> 
>   mcr p15, 0, %0, c2, c0, 0  /* set_ttbr0() */
>   isb  /* prevent speculative access to kernel table */
>   str%1, [%2],0 /* write 32 bit to user space */
>   mcr p15, 0, %0, c2, c0, 0  /* set_ttbr0() */
>   isb  /* prevent speculative access to user table */

> It would be interesting to compare it to the overhead of a
> get_user_page_fast() based implementation.

i have to relocate & be in quarantine for a couple of weeks, so i will
temporarily stop here, otherwise i might end up on the roadside.

Reading the feedback from everyone, some of it i could grasp only in
bits & pieces; familiarizing myself more w/ mm & vfs would help me add
value better to the goal/discussion. Linus Walleij, if you wish to
explore things, feel free; right now i don't know how my connectivity
would be for the next 3 weeks.

Regards
afzal


Re: [RFC 1/3] lib: copy_{from,to}_user using gup & kmap_atomic()

2020-06-14 Thread afzal mohammed
Hi,

On Sat, Jun 13, 2020 at 02:15:52PM +0100, Russell King - ARM Linux admin wrote:
> On Sat, Jun 13, 2020 at 05:34:32PM +0530, afzal mohammed wrote:

> > i think C
> > library cuts any size read, write to page size (if it exceeds) &
> > invokes the system call.

> You can't make that assumption about read(2).  stdio in the C library
> may read a page size of data at a time, but programs are allowed to
> call read(2) directly, and the C library will pass such a call straight
> through to the kernel.  So, if userspace requests a 16k read via
> read(2), then read(2) will be invoked covering 16k.
> 
> As an extreme case, for example:
> 
> $ strace -e read dd if=/dev/zero of=/dev/null bs=1048576 count=1
> read(0, 
> "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 
> 1048576) = 1048576

Okay. Yes, observed that dd is passing whatever 'bs' is to the Kernel,
and from the 'dd' sources (of busybox), it is invoking the read system
call directly passing 'bs'; so it is the tmpfs read that is splitting
it to page size, as mentioned by Arnd.

Regards
afzal


Re: [RFC 1/3] lib: copy_{from,to}_user using gup & kmap_atomic()

2020-06-13 Thread afzal mohammed
Hi,

On Sat, Jun 13, 2020 at 01:56:15PM +0100, Al Viro wrote:

> Incidentally, what about get_user()/put_user()?  _That_ is where it's
> going to really hurt...

All other uaccess routines are also planned to be added, posting only
copy_{from,to}_user() was to get early feedback (mentioned in the
cover letter)

Regards
afzal


Re: [RFC 1/3] lib: copy_{from,to}_user using gup & kmap_atomic()

2020-06-13 Thread afzal mohammed
Hi,

On Sat, Jun 13, 2020 at 02:08:11PM +0300, Andy Shevchenko wrote:
> On Fri, Jun 12, 2020 at 1:20 PM afzal mohammed  
> wrote:

> > +// Started from arch/um/kernel/skas/uaccess.c
> 
> Does it mean you will deduplicate it there?

What i meant was, that file was taken as a template & nothing more; at
the same time i wanted to give credit to that file. i will explicitly
mention it next time.

It is not meant to deduplicate it. Functionality here is completely
different.

In the case here, there would be a different virtual address mapping
that the CPU will see once in the Kernel as compared to user mode.

Here a facility is provided to access the user page, when the
current virtual address mapping of the CPU excludes it. This
is for providing full 4G virtual address to both user & kernel on
32bit ARM to avoid using highmem or reduce the impact of highmem,
i.e. so that Kernel can address till 4GiB (almost) as lowmem.

Here the assumption is that the user mapping is not a subset of the
virtual addresses mapped by the CPU, but a separate one. Upon Kernel
entry ttbr0 is changed to Kernel lowmem, while upon Kernel exit it is
changed back to user pages (ttbrx in ARM is, iiuc, equivalent to cr3
in x86)

Now i realize that i am unable to coherently put across the problem
being attempted to be solved here to a person not familiar w/ the issue
w/o taking considerable time. If the above explanation is not enough,
i will try to explain later in a better way.

> > +#include 
> > +#include 
> > +#include 
> > +#include 
> 
> Perhaps ordered?

will take care

> > +static int do_op_one_page(unsigned long addr, int len,
> > +int (*op)(unsigned long addr, int len, void *arg), void 
> > *arg,
> > +struct page *page)
> 
> Maybe typedef for the func() ?

will take care

> > +{
> > +   int n;
> > +
> > +   addr = (unsigned long) kmap_atomic(page) + (addr & ~PAGE_MASK);
> 
> I don't remember about this one...

i am not following you here; for my case, the !CONFIG_64BIT code in
that file was what was required, hence only that was picked (or rather,
not deleted)

> > +   size = min(PAGE_ALIGN(addr) - addr, (unsigned long) len);
> 
> ...but here seems to me you can use helpers (offset_in_page() or how
> it's called).

i was not aware of it, will use it as required.

> 
> Also consider to use macros like PFN_DOWN(), PFN_UP(), etc in your code.

Okay

> 
> > +   remain = len;
> > +   if (size == 0)
> > +   goto page_boundary;
> > +
> > +   n = do_op_one_page(addr, size, op, arg, *pages);
> > +   if (n != 0) {
> 
> > +   remain = (n < 0 ? remain : 0);
> 
> Why duplicate three times (!) this line, if you can move it to under 'out'?

yes better to move there

> 
> > +   goto out;
> > +   }
> > +
> > +   pages++;
> > +   addr += size;
> > +   remain -= size;
> > +
> > +page_boundary:
> > +   if (remain == 0)
> > +   goto out;
> > +   while (addr < ((addr + remain) & PAGE_MASK)) {
> > +   n = do_op_one_page(addr, PAGE_SIZE, op, arg, *pages);
> > +   if (n != 0) {
> > +   remain = (n < 0 ? remain : 0);
> > +   goto out;
> > +   }
> > +
> > +   pages++;
> > +   addr += PAGE_SIZE;
> > +   remain -= PAGE_SIZE;
> > +   }
> 
> Sounds like this can be refactored to iterate over pages rather than 
> addresses.

Okay, i will check
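
Something roughly like this is what i will try (untested sketch, keeping
the existing do_op_one_page() helper and using the suggested
min_t()/offset_in_page() helpers):

	static long buffer_op(unsigned long addr, int len,
			      int (*op)(unsigned long, int, void *), void *arg,
			      struct page **pages)
	{
		long remain = len;

		while (remain > 0) {
			/* copy only up to the end of the current page */
			long chunk = min_t(long, PAGE_SIZE - offset_in_page(addr), remain);
			long n = do_op_one_page(addr, chunk, op, arg, *pages++);

			if (n != 0)
				return n < 0 ? remain : 0;

			addr += chunk;
			remain -= chunk;
		}

		return 0;
	}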

> > +static int copy_chunk_from_user(unsigned long from, int len, void *arg)
> > +{
> > +   unsigned long *to_ptr = arg, to = *to_ptr;
> > +
> > +   memcpy((void *) to, (void *) from, len);
> 
> What is the point in the casting to void *?

The reason it was there was because of copy-paste :), passing unsigned
long as 'void *' or 'const void *' requires casting right ?, or you
meant something else ?

now i checked removing the cast, compiler is abusing me :), says
'makes pointer from integer without a cast'

> > +   num_pages = DIV_ROUND_UP((unsigned long)from + n, PAGE_SIZE) -
> > +(unsigned long)from / PAGE_SIZE;
> 
> PFN_UP() ?

Okay
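
i.e. roughly (sketch, using the helpers from include/linux/pfn.h):

	num_pages = PFN_UP((unsigned long)from + n) -
		    PFN_DOWN((unsigned long)from);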

> I think you can clean up the code a bit after you will get the main
> functionality working.

Yes, surely, the intention was to post a proof-of-concept ASAP; perhaps
the contents will change drastically in the next version so that any
resemblance to arch/um/kernel/skas/uaccess.c might not be there.

Regards
afzal


Re: [RFC 1/3] lib: copy_{from,to}_user using gup & kmap_atomic()

2020-06-13 Thread afzal mohammed
Hi,

On Fri, Jun 12, 2020 at 10:07:28PM +0200, Arnd Bergmann wrote:

> I think a lot
> of usercopy calls are only for a few bytes, though this is of course
> highly workload dependent and you might only care about the large
> ones.

The observation is that the max. number of pages reaching
copy_{from,to}_user() is 2, with the observed maximum of n (number of
bytes) being 1 page size. i think the C library cuts any read/write
size to page size (if it exceeds that) & invokes the system call. Max.
pages reaching 2 happens when 'n' crosses a page boundary; this has
been observed w/ small size requests as well as w/ ones of exactly
page size (but not page aligned).

Even w/ dd of various size >4K, never is the number of pages required
to be mapped going greater than 2 (even w/ 'dd' 'bs=1M')

i have a worry (don't know whether it is an unnecessary one): even
if we improve performance w/ large copy sizes, it might end up in
sluggishness w.r.t user experience due to most (hence a high amount)
of user copy calls being a few bytes & the penalty there being higher.
And a benchmark would not be able to detect anything abnormal since
usercopy is being tested on large sizes.

Quickly comparing boot-time on BeagleBone White, boot time increases
by only 4%; perhaps this worry is irrelevant, but i just thought i
would put it across.

> There is also still hope of optimizing small aligned copies like
> 
> set_ttbr0(user_ttbr);
> ldm();
> set_ttbr0(kernel_ttbr);
> stm();

Hmm, more needs to be done to be in a position to test it.

Regards
afzal


Re: [RFC 0/3] ARM: copy_{from,to}_user() for vmsplit 4g/4g

2020-06-12 Thread afzal mohammed
Hi,

On Fri, Jun 12, 2020 at 09:31:12PM +0530, afzal mohammed wrote:

>                   512   1K    4K   16K   32K   64K   1M
> 
> normal            30    46    89   95    90    85    65
> 
> uaccess_w_memcpy  28.5  45    85   92    91    85    65
> 
> w/ series         22    36    72   79    78    75    61

For the sake of completeness: all values are in MB/s, w/ various 'dd'
'bs' sizes.

Regards
afzal


Re: [RFC 0/3] ARM: copy_{from,to}_user() for vmsplit 4g/4g

2020-06-12 Thread afzal mohammed
Hi,

On Fri, Jun 12, 2020 at 11:19:23AM -0400, Nicolas Pitre wrote:
> On Fri, 12 Jun 2020, afzal mohammed wrote:

> > Performance wise, results are not encouraging, 'dd' on tmpfs results,

> Could you compare with CONFIG_UACCESS_WITH_MEMCPY as well?

                  512   1K    4K   16K   32K   64K   1M

normal            30    46    89   95    90    85    65

uaccess_w_memcpy  28.5  45    85   92    91    85    65

w/ series         22    36    72   79    78    75    61

There are variations in the range of +/-2 in some readings when
repeated; they are not put above, to keep the comparison simple.

Regards
afzal


Re: [RFC 1/3] lib: copy_{from,to}_user using gup & kmap_atomic()

2020-06-12 Thread afzal mohammed
Hi,

On Fri, Jun 12, 2020 at 02:02:13PM +0200, Arnd Bergmann wrote:
> On Fri, Jun 12, 2020 at 12:18 PM afzal mohammed  
> wrote:

> > Roughly a one-third drop in performance. Disabling highmem improves
> > performance only slightly.

> There are probably some things that can be done to optimize it,
> but I guess most of the overhead is from the page table operations
> and cannot be avoided.

Ingo's series did a follow_page() first, and only as a fallback invoked
get_user_pages(); i will try that way as well.

Yes, i too feel the get_user_pages_fast() path is the most time
consuming, will instrument & check.

> What was the exact 'dd' command you used, in particular the block size?
> Note that by default, 'dd' will request 512 bytes at a time, so you usually
> only access a single page. It would be interesting to see the overhead with
> other typical or extreme block sizes, e.g. '1', '64', '4K', '64K' or '1M'.

It was the default(512), more test results follows (in MB/s),

512 1K  4K  16K 32K 64K 1M

w/o series  30  46  89  95  90  85  65

w/ series   22  36  72  79  78  75  61

perf drop   26% 21% 19% 16% 13% 12%6%

Hmm, results ain't that bad :)

> If you want to drill down into where exactly the overhead is (i.e.
> get_user_pages or kmap_atomic, or something different), using
> 'perf record dd ..', and 'perf report' may be helpful.

Let me dig deeper & try to find out where the major overhead is and try
to figure out ways to reduce it.

One reason to disable highmem & test (results mentioned earlier) was
to make kmap_atomic() very lightweight; that was not making much
difference, only around 3%.

> > +static int copy_chunk_from_user(unsigned long from, int len, void *arg)
> > +{
> > +   unsigned long *to_ptr = arg, to = *to_ptr;
> > +
> > +   memcpy((void *) to, (void *) from, len);
> > +   *to_ptr += len;
> > +   return 0;
> > +}
> > +
> > +static int copy_chunk_to_user(unsigned long to, int len, void *arg)
> > +{
> > +   unsigned long *from_ptr = arg, from = *from_ptr;
> > +
> > +   memcpy((void *) to, (void *) from, len);
> > +   *from_ptr += len;
> > +   return 0;
> > +}
> 
> Will gcc optimize away the indirect function call and inline everything?
> If not, that would be a small part of the overhead.

i think not, based on objdump; i will make these (& wherever else
possible) inline & see the difference.

> > +   num_pages = DIV_ROUND_UP((unsigned long)from + n, PAGE_SIZE) -
> > +(unsigned long)from / PAGE_SIZE;
> 
> Make sure this doesn't turn into actual division operations but uses shifts.
> It might even be clearer here to open-code the shift operation so readers
> can see what this is meant to compile into.

Okay

> 
> > +   pages = kmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL | 
> > __GFP_ZERO);
> > +   if (!pages)
> > +   goto end;
> 
> Another micro-optimization may be to avoid the kmalloc for the common case,
> e.g. anything with "num_pages <= 64", using an array on the stack.

Okay

> > +   ret = get_user_pages_fast((unsigned long)from, num_pages, 0, pages);
> > +   if (ret < 0)
> > +   goto free_pages;
> > +
> > +   if (ret != num_pages) {
> > +   num_pages = ret;
> > +   goto put_pages;
> > +   }
> 
> I think this is technically incorrect: if get_user_pages_fast() only
> gets some of the
> pages, you should continue with the short buffer and return the number
> of remaining
> bytes rather than not copying anything. I think you did that correctly
> for a failed
> kmap_atomic(), but this has to use the same logic.

yes, will fix that.


Regards
afzal


Re: ARM: vmsplit 4g/4g

2020-06-12 Thread afzal mohammed
Hi,

On Wed, Jun 10, 2020 at 12:10:21PM +0200, Linus Walleij wrote:
> On Mon, Jun 8, 2020 at 1:09 PM afzal mohammed  wrote:

> > Not yet. Yes, i will do the performance evaluation.
> >
> > i am also worried about the impact on performance as these
> > [ get_user_pages() or friends, kmap_atomic() ] are additionally
> > invoked in the copy_{from,to}_user() path now.
> 
> I am happy to help!

Thanks Linus

> I am anyway working on MMU-related code (KASan) so I need to be on
> top of this stuff.

i earlier went thr' KASAN series secretly & did learn a thing or two
from that!

> What test is appropriate for this? I would intuitively think hackbench?

'dd', i think, as you mentioned 'hackbench' i will use that as well.

> > Note that this was done on a topic branch for user copy. Changes for
> > kernel static mapping to vmalloc has not been merged with these.
> > Also having kernel lowmem w/ a separate asid & switching at kernel
> > entry/exit b/n user & kernel lowmem by changing ttbr0 is yet to be
> > done. Quite a few things remaining to be done to achieve vmsplit 4g/4g
> 
> I will be very excited to look at patches or a git branch once you have
> something you want to show. Also to just understand how you go about
> this.

Don't put too much expectation on me, this is more of a learning
exercise for me. For user copy, the baby steps have been posted (To'ed
you). On the static kernel mapping in vmalloc front, i do not want to
post the patches in their current shape; though git-ized, posting them
would result in me getting mercilessly thrashed in public :). Many of
the other platforms would fail and it is not multi-platform friendly.
i do not yet have a public git branch; i can send you the (ugly)
patches separately, just let me know.

> I have several elder systems under my roof

i have only a few low RAM & CPU systems, so that is certainly helpful.

> so my contribution could hopefully be to help and debug any issues

If you would like, we can work together, at the same time keep in mind
that me spending time on it would be intermittent & erratic (though i
am trying to keep a consistent, but slow pace) perhaps making it
difficult to coordinate. Or else i will continue the same way & request
your help when required.

For the next 3 weeks, right now, i cannot say whether i would be able
to spend time on it, perhaps might be possible, but only during that
time i will know.

Regards
afzal


[RFC 3/3] ARM: provide CONFIG_VMSPLIT_4G_DEV for development

2020-06-12 Thread afzal mohammed
Select UACCESS_GUP_KMAP_MEMCPY initially.

Signed-off-by: afzal mohammed 
---
 arch/arm/Kconfig | 9 +
 1 file changed, 9 insertions(+)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index c77c93c485a08..ae2687679d7c8 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1326,6 +1326,15 @@ config PAGE_OFFSET
default 0xB0000000 if VMSPLIT_3G_OPT
default 0xC0000000
 
+config VMSPLIT_4G_DEV
+   bool "Experimental changes for 4G/4G user/kernel split"
+   depends on ARM_LPAE
+   select UACCESS_GUP_KMAP_MEMCPY
+   help
+ Experimental changes during 4G/4G user/kernel split development.
+ Existing vmsplit config option is used, once development is done,
+ this would be put as a new choice & _DEV suffix removed.
+
 config NR_CPUS
int "Maximum number of CPUs (2-32)"
range 2 32
-- 
2.26.2



[RFC 1/3] lib: copy_{from,to}_user using gup & kmap_atomic()

2020-06-12 Thread afzal mohammed
copy_{from,to}_user() uaccess helpers are implemented by user page
pinning, followed by temporary kernel mapping & then memcpy(). This
helps to achieve user page copy when current virtual address mapping
of the CPU excludes user pages.

Performance wise, results are not encouraging, 'dd' on tmpfs results,

ARM Cortex-A8, BeagleBone White (256MiB RAM):
w/o series - ~29.5 MB/s
w/ series - ~20.5 MB/s
w/ series & highmem disabled - ~21.2 MB/s

On Cortex-A15(2GiB RAM) in QEMU:
w/o series - ~4 MB/s
w/ series - ~2.6 MB/s

Roughly a one-third drop in performance. Disabling highmem improves
performance only slightly.

'hackbench' also showed a similar pattern.

uaccess routines using page pinning & temporary kernel mapping is not
something new, it has been done long long ago by Ingo [1] as part of
4G/4G user/kernel mapping implementation on x86, though not merged in
mainline.

[1] 
https://lore.kernel.org/lkml/Pine.LNX.4.44.0307082332450.17252-10@localhost.localdomain/

Signed-off-by: afzal mohammed 
---
 lib/Kconfig   |   4 +
 lib/Makefile  |   3 +
 lib/uaccess_gup_kmap_memcpy.c | 162 ++
 3 files changed, 169 insertions(+)
 create mode 100644 lib/uaccess_gup_kmap_memcpy.c

diff --git a/lib/Kconfig b/lib/Kconfig
index 5d53f9609c252..dadf4f6cc391d 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -622,6 +622,10 @@ config ARCH_HAS_MEMREMAP_COMPAT_ALIGN
 config UACCESS_MEMCPY
bool
 
+# pin page + kmap_atomic + memcpy for user copies, intended for vmsplit 4g/4g
+config UACCESS_GUP_KMAP_MEMCPY
+   bool
+
 config ARCH_HAS_UACCESS_FLUSHCACHE
bool
 
diff --git a/lib/Makefile b/lib/Makefile
index 685aee60de1d5..bc457f85e391a 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -309,3 +309,6 @@ obj-$(CONFIG_OBJAGG) += objagg.o
 
 # KUnit tests
 obj-$(CONFIG_LIST_KUNIT_TEST) += list-test.o
+
+# uaccess
+obj-$(CONFIG_UACCESS_GUP_KMAP_MEMCPY) += uaccess_gup_kmap_memcpy.o
diff --git a/lib/uaccess_gup_kmap_memcpy.c b/lib/uaccess_gup_kmap_memcpy.c
new file mode 100644
index 0..1536762df1fd5
--- /dev/null
+++ b/lib/uaccess_gup_kmap_memcpy.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0
+// Started from arch/um/kernel/skas/uaccess.c
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+static int do_op_one_page(unsigned long addr, int len,
+int (*op)(unsigned long addr, int len, void *arg), void *arg,
+struct page *page)
+{
+   int n;
+
+   addr = (unsigned long) kmap_atomic(page) + (addr & ~PAGE_MASK);
+   n = (*op)(addr, len, arg);
+   kunmap_atomic((void *)addr);
+
+   return n;
+}
+
+static long buffer_op(unsigned long addr, int len,
+ int (*op)(unsigned long, int, void *), void *arg,
+ struct page **pages)
+{
+   long size, remain, n;
+
+   size = min(PAGE_ALIGN(addr) - addr, (unsigned long) len);
+   remain = len;
+   if (size == 0)
+   goto page_boundary;
+
+   n = do_op_one_page(addr, size, op, arg, *pages);
+   if (n != 0) {
+   remain = (n < 0 ? remain : 0);
+   goto out;
+   }
+
+   pages++;
+   addr += size;
+   remain -= size;
+
+page_boundary:
+   if (remain == 0)
+   goto out;
+   while (addr < ((addr + remain) & PAGE_MASK)) {
+   n = do_op_one_page(addr, PAGE_SIZE, op, arg, *pages);
+   if (n != 0) {
+   remain = (n < 0 ? remain : 0);
+   goto out;
+   }
+
+   pages++;
+   addr += PAGE_SIZE;
+   remain -= PAGE_SIZE;
+   }
+   if (remain == 0)
+   goto out;
+
+   n = do_op_one_page(addr, remain, op, arg, *pages);
+   if (n != 0) {
+   remain = (n < 0 ? remain : 0);
+   goto out;
+   }
+
+   return 0;
+out:
+   return remain;
+}
+
+static int copy_chunk_from_user(unsigned long from, int len, void *arg)
+{
+   unsigned long *to_ptr = arg, to = *to_ptr;
+
+   memcpy((void *) to, (void *) from, len);
+   *to_ptr += len;
+   return 0;
+}
+
+static int copy_chunk_to_user(unsigned long to, int len, void *arg)
+{
+   unsigned long *from_ptr = arg, from = *from_ptr;
+
+   memcpy((void *) to, (void *) from, len);
+   *from_ptr += len;
+   return 0;
+}
+
+unsigned long gup_kmap_copy_from_user(void *to, const void __user *from, 
unsigned long n)
+{
+   struct page **pages;
+   int num_pages, ret, i;
+
+   if (uaccess_kernel()) {
+   memcpy(to, (__force void *)from, n);
+   return 0;
+   }
+
+   num_pages = DIV_ROUND_UP((unsigned long)from + n, PAGE_SIZE) -
+(unsigned long)from / PAGE_SIZE;
+   pages = kmalloc_array(num_pages, sizeof(*pages), GFP_KERNEL | 
__GFP_ZERO);
+   if (!pages)
+  

[RFC 2/3] ARM: uaccess: let UACCESS_GUP_KMAP_MEMCPY enabling

2020-06-12 Thread afzal mohammed
Turn off existing raw_copy_{from,to}_user() using
arm_copy_{from,to}_user() when CONFIG_UACCESS_GUP_KMAP_MEMCPY is
enabled.

Signed-off-by: afzal mohammed 
---
 arch/arm/include/asm/uaccess.h | 20 
 arch/arm/kernel/armksyms.c |  2 ++
 arch/arm/lib/Makefile  |  7 +--
 3 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index 98c6b91be4a8a..4a16ae52d4978 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -512,6 +512,15 @@ do {   
\
 extern unsigned long __must_check
 arm_copy_from_user(void *to, const void __user *from, unsigned long n);
 
+#ifdef CONFIG_UACCESS_GUP_KMAP_MEMCPY
+extern unsigned long __must_check
+gup_kmap_copy_from_user(void *to, const void __user *from, unsigned long n);
+static inline __must_check unsigned long
+raw_copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+   return gup_kmap_copy_from_user(to, from, n);
+}
+#else
 static inline unsigned long __must_check
 raw_copy_from_user(void *to, const void __user *from, unsigned long n)
 {
@@ -522,12 +531,22 @@ raw_copy_from_user(void *to, const void __user *from, 
unsigned long n)
uaccess_restore(__ua_flags);
return n;
 }
+#endif
 
 extern unsigned long __must_check
 arm_copy_to_user(void __user *to, const void *from, unsigned long n);
 extern unsigned long __must_check
 __copy_to_user_std(void __user *to, const void *from, unsigned long n);
 
+#ifdef CONFIG_UACCESS_GUP_KMAP_MEMCPY
+extern unsigned long __must_check
+gup_kmap_copy_to_user(void __user *to, const void *from, unsigned long n);
+static inline __must_check unsigned long
+raw_copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+   return gup_kmap_copy_to_user(to, from, n);
+}
+#else
 static inline unsigned long __must_check
 raw_copy_to_user(void __user *to, const void *from, unsigned long n)
 {
@@ -541,6 +560,7 @@ raw_copy_to_user(void __user *to, const void *from, 
unsigned long n)
return arm_copy_to_user(to, from, n);
 #endif
 }
+#endif
 
 extern unsigned long __must_check
 arm_clear_user(void __user *addr, unsigned long n);
diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
index 98bdea51089d5..8c92fe30d1559 100644
--- a/arch/arm/kernel/armksyms.c
+++ b/arch/arm/kernel/armksyms.c
@@ -96,8 +96,10 @@ EXPORT_SYMBOL(mmiocpy);
 #ifdef CONFIG_MMU
 EXPORT_SYMBOL(copy_page);
 
+#ifndef CONFIG_UACCESS_GUP_KMAP_MEMCPY
 EXPORT_SYMBOL(arm_copy_from_user);
 EXPORT_SYMBOL(arm_copy_to_user);
+#endif
 EXPORT_SYMBOL(arm_clear_user);
 
 EXPORT_SYMBOL(__get_user_1);
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 6d2ba454f25b6..1aeff2cd7b4b3 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -16,8 +16,11 @@ lib-y:= changebit.o csumipv6.o csumpartial.o 
  \
   io-readsb.o io-writesb.o io-readsl.o io-writesl.o  \
   call_with_stack.o bswapsdi2.o
 
-mmu-y  := clear_user.o copy_page.o getuser.o putuser.o   \
-  copy_from_user.o copy_to_user.o
+mmu-y  := clear_user.o copy_page.o getuser.o putuser.o
+
+ifndef CONFIG_UACCESS_GUP_KMAP_MEMCPY
+  mmu-y+= copy_from_user.o copy_to_user.o
+endif
 
 ifdef CONFIG_CC_IS_CLANG
   lib-y+= backtrace-clang.o
-- 
2.26.2



[RFC 0/3] ARM: copy_{from,to}_user() for vmsplit 4g/4g

2020-06-12 Thread afzal mohammed
Hi,

copy_{from,to}_user() uaccess helpers are implemented by user page
pinning, followed by temporary kernel mapping & then memcpy(). This
helps to achieve user page copy when current virtual address mapping
of the CPU excludes user pages.

Other uaccess routines are also planned to be modified to make use of
pinning plus kmap_atomic() based on the feedback here.

This is done as one of the initial steps to achieve 4G virtual
address mapping for user as well as Kernel on ARMv7 w/ LPAE.

Motive behind this is to enable Kernel access till 4GiB (almost) as
lowmem, thus helping in removing highmem support for platforms having
upto 4GiB RAM. In the case of platforms having >4GiB, highmem is still
required for the Kernel to be able to access whole RAM.

Performance wise, results are not encouraging, 'dd' on tmpfs results,

ARM Cortex-A8, BeagleBone White (256MiB RAM):
w/o series - ~29.5 MB/s
w/ series - ~20.5 MB/s
w/ series & highmem disabled - ~21.2 MB/s

On Cortex-A15(2GiB RAM) in QEMU:
w/o series - ~4 MB/s
w/ series - ~2.6 MB/s

Roughly a one-third drop in performance. Disabling highmem improves
performance only slightly.

'hackbench' also showed a similar pattern.

Ways to improve the performance has to be explored, if any one has
thoughts on it, please share.

uaccess routines using page pinning & temporary kernel mapping is not
something new, it has been done by Ingo long long ago [1] as part of
4G/4G user/kernel mapping implementation on x86, though not merged in
mainline.

Arnd has outlined the basic design for vmsplit 4g/4g; uaccess routines
using user page pinning plus kmap_atomic() are one part of that.

[1] 
https://lore.kernel.org/lkml/Pine.LNX.4.44.0307082332450.17252-10@localhost.localdomain/

Last 2 patches are only meant for testing first patch.

Regards
afzal

afzal mohammed (3):
  lib: copy_{from,to}_user using gup & kmap_atomic()
  ARM: uaccess: let UACCESS_GUP_KMAP_MEMCPY enabling
  ARM: provide CONFIG_VMSPLIT_4G_DEV for development

 arch/arm/Kconfig   |   9 ++
 arch/arm/include/asm/uaccess.h |  20 
 arch/arm/kernel/armksyms.c |   2 +
 arch/arm/lib/Makefile  |   7 +-
 lib/Kconfig|   4 +
 lib/Makefile   |   3 +
 lib/uaccess_gup_kmap_memcpy.c  | 162 +
 7 files changed, 205 insertions(+), 2 deletions(-)
 create mode 100644 lib/uaccess_gup_kmap_memcpy.c

-- 
2.26.2



Re: ARM: vmsplit 4g/4g

2020-06-09 Thread afzal mohammed
Hi,

On Mon, Jun 08, 2020 at 08:47:27PM +0530, afzal mohammed wrote:
> On Mon, Jun 08, 2020 at 04:43:57PM +0200, Arnd Bergmann wrote:

> > There is another difference: get_user_pages_fast() does not return
> > a  vm_area_struct pointer, which is where you would check the access
> > permissions. I suppose those pointers could not be returned to callers
> > that don't already hold the mmap_sem.
> 
> Ok, thanks for the details, i need to familiarize better with mm.

i was & am now more confused w.r.t checking access permission using
vm_area_struct to deny writes to a read-only user page.

i have been using get_user_pages_fast() w/ FOLL_WRITE in copy_to_user.
Isn't that sufficient ?, afaiu, get_user_pages_fast() will ensure that
w/ FOLL_WRITE, pte has write permission, else no struct page * is
handed back to the caller.

One of the simplified paths, which could be relevant in the majority of
cases, that i figured out is as follows,

 get_user_pages_fast
  internal_user_pages_fast
   gup_pgd_range [ no mmap_sem acquire path]
gup_p4d_range 
 gup_pud_range
  gup_pmd_range
   gup_pte_range
if (!pte_access_permitted(pte, flags & FOLL_WRITE))
 [ causes to return NULL page if access violation ]

   __gup_longterm_unlocked [ mmap_sem acquire path]
get_user_pages_unlocked
 __get_user_pages_locked
  __get_user_pages
   follow_page_mask
follow_p4d_mask
 follow_pud_mask
  follow_pmd_mask
   follow_page_pte
if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags))
 [ causes to return NULL page if access violation ]

As far as i could see none of the get_user_pages() caller are passing
struct vm_area_struct ** to get it populated.

And Ingo's series eons ago didn't pass it or check permission using it
either (it was passing a 'write' argument, which i believe corresponds
to FOLL_WRITE)

Am i missing something or wrong in the analysis ?

Regards
afzal


Re: ARM: vmsplit 4g/4g

2020-06-08 Thread afzal mohammed
Hi,

On Mon, Jun 08, 2020 at 04:43:57PM +0200, Arnd Bergmann wrote:

> There is another difference: get_user_pages_fast() does not return
> a  vm_area_struct pointer, which is where you would check the access
> permissions. I suppose those pointers could not be returned to callers
> that don't already hold the mmap_sem.

Ok, thanks for the details, i need to familiarize better with mm.

Regards
afzal


Re: ARM: vmsplit 4g/4g

2020-06-08 Thread afzal mohammed
Hi,

On Sun, Jun 07, 2020 at 09:26:26PM +0200, Arnd Bergmann wrote:

> I think you have to use get_user_pages() though instead of
> get_user_pages_fast(),
> in order to be able to check the permission bits to prevent doing a
> copy_to_user()
> into read-only mappings.

i was not aware of this, is it documented somewhere ?, afaiu,
difference b/n get_user_pages_fast() & get_user_pages() is that fast
version will try to pin pages w/o acquiring mmap_sem if possible.

> Do you want me to review the uaccess patch to look for any missing
> corner cases, or do you want to do the whole set of user access helpers
> first?

i will cleanup and probably post RFC initially for the changes
handling copy_{from,to}_user() to get feedback.

Regards
afzal


Re: ARM: vmsplit 4g/4g

2020-06-08 Thread afzal mohammed
Hi,

[ my previous mail did not make it into the linux-arm-kernel mailing
 list, got a mail saying it has a suspicious header and that it is
 awaiting moderator approval ]

On Sun, Jun 07, 2020 at 05:11:16PM +0100, Russell King - ARM Linux admin wrote:
> On Sun, Jun 07, 2020 at 06:29:32PM +0530, afzal mohammed wrote:

> > get_user_pages_fast() followed by kmap_atomic() & then memcpy() seems
> > to work in principle for user copy.
> 
> Have you done any performance evaluation of the changes yet? I think
> it would be a good idea to keep that in the picture. If there's any
> significant regression, then that will need addressing.

Not yet. Yes, i will do the performance evaluation.

i am also worried about the impact on performance as these
[ get_user_pages() or friends, kmap_atomic() ] are additionally
invoked in the copy_{from,to}_user() path now.

Note that this was done on a topic branch for user copy. The changes
for moving the kernel static mapping to vmalloc space have not been
merged with these. Also, having kernel lowmem w/ a separate asid &
switching ttbr0 at kernel entry/exit b/n user & kernel lowmem is yet to
be done. Quite a few things remain to be done to achieve vmsplit 4g/4g.

Regards
afzal


ARM: vmsplit 4g/4g

2020-06-07 Thread afzal mohammed
Hi,

On Sat, May 16, 2020 at 09:35:57AM +0200, Arnd Bergmann wrote:
> On Sat, May 16, 2020 at 8:06 AM afzal mohammed  
> wrote:

> > Okay, so the conclusion i take is,
> > 1. VMSPLIT 4G/4G have to live alongside highmem
> > 2. For user space copy, do pinning followed by kmap

> Right, though kmap_atomic() should be sufficient here
> because it is always a short-lived mapping.

get_user_pages_fast() followed by kmap_atomic() & then memcpy() seems
to work in principle for user copy.
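
In principle such a copy would look roughly like the sketch below; this
is only an illustration of the idea, not the actual topic-branch code
(the helper name, the return convention and the skipped
access_ok()/get_fs() handling are all made up):

#include <linux/highmem.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>

/* Illustration only: page-at-a-time copy to user space built from
 * get_user_pages_fast(FOLL_WRITE) + kmap_atomic() + memcpy().
 */
static unsigned long gup_kmap_copy_to_user(void __user *to, const void *from,
					   unsigned long n)
{
	unsigned long uaddr = (unsigned long)to;
	const char *src = from;

	while (n) {
		unsigned long offset = uaddr & ~PAGE_MASK;
		unsigned long chunk = min(n, PAGE_SIZE - offset);
		struct page *page;
		void *kaddr;

		/* FOLL_WRITE makes gup refuse read-only user mappings */
		if (get_user_pages_fast(uaddr & PAGE_MASK, 1, FOLL_WRITE,
					&page) != 1)
			break;

		/* map the pinned page & copy through the kernel mapping */
		kaddr = kmap_atomic(page);
		memcpy((char *)kaddr + offset, src, chunk);
		kunmap_atomic(kaddr);

		set_page_dirty_lock(page);
		put_page(page);

		uaddr += chunk;
		src += chunk;
		n -= chunk;
	}

	return n;	/* bytes not copied, mirroring raw_copy_to_user() */
}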

Verified in a crude way by pointing TTBR0, upon entry to
copy_to_user(), to a location that has the user pgd's cleared,
restoring TTBR0 to its earlier value after the user copy was done, and
ensuring the system still boots.

Meanwhile, more testing w/ the kernel static mapping in vmalloc space
revealed a major issue: w/ LPAE it was not booting. There were issues
related to pmd handling; w/ !LPAE those issues were not present as the
pmd is in effect equivalent to the pgd. The issues have been fixed, and
LPAE now boots, but it feels a bit fragile, so i will probably have to
revisit it.

Regards
afzal


Re: ARM: static kernel in vmalloc space

2020-05-16 Thread afzal mohammed
Hi,

On Thu, May 14, 2020 at 05:32:41PM +0200, Arnd Bergmann wrote:

> Typical distros currently offer two kernels, with and without LPAE,
> and they probably don't want to add a third one for LPAE with
> either highmem or vmsplit-4g-4g. Having extra user address
> space and more lowmem is both going to help users that
> still have 8GB configurations.

Okay, so the conclusion i take is,

1. VMSPLIT 4G/4G have to live alongside highmem
2. For user space copy, do pinning followed by kmap

Regards
afzal


Re: ARM: static kernel in vmalloc space

2020-05-14 Thread afzal mohammed
Hi,

On Thu, May 14, 2020 at 07:05:45PM +0530, afzal mohammed wrote:

> So if we make VMSPLIT_4G_4G, depends on !HIGH_MEMORY (w/ mention of
> caveat in Kconfig help that this is meant for platforms w/ <=4GB), then
> we can do copy_{from,to}_user the same way currently do, and no need to
> do the user page pinning & kmap, right ?

i think user page pinning is still required, but kmap can be avoided
by using the lowmem mapping corresponding to that page, right ?, or am
i completely wrong ?

Regards
afzal


Re: ARM: static kernel in vmalloc space

2020-05-14 Thread afzal mohammed
Hi,

On Thu, May 14, 2020 at 02:41:11PM +0200, Arnd Bergmann wrote:
> On Thu, May 14, 2020 at 1:18 PM afzal mohammed  
> wrote:

> > 1. SoC w/ LPAE
> > 2. TTBR1 (top 256MB) for static kernel, modules, io mappings, vmalloc,
> > kmap, fixmap & vectors

> Right, these kind of go together because pre-LPAE cannot do the
> same TTBR1 split, and they more frequently have conflicting
> static mappings.
> 
> It's clearly possible to do something very similar for older chips
> (v6 or v7 without LPAE, possibly even v5), it just gets harder
> while providing less benefit.

Yes, let's have it only for LPAE.

> > 3. TTBR0 (low 3768MB) for user space & lowmem (kernel lowmem to have

> hardcoded 3840/256 split is likely the best compromise of all the

hmm, i swallowed 72MB ;)

> > 4. for user space to/from copy
> >  a. pin user pages
> >  b. kmap user page (can't corresponding lowmem be used instead ?)

> - In the long run, there is no need for kmap()/kmap_atomic() after
>   highmem gets removed from the kernel, but for the next few years
>   we should still assume that highmem can be used, in order to support
>   systems like the 8GB highbank, armadaxp, keystone2 or virtual
>   machines. For lowmem pages (i.e. all pages when highmem is
>   disabled), kmap_atomic() falls back to page_address() anyway,
>   so there is no much overhead.

Here i have some confusion - iiuc, VMSPLIT_4G_4G is meant to help
platforms having RAM > 768M and <= 4GB disable high memory and still
be able to access the full RAM, so high memory shouldn't come into the
picture, right ?. And the platforms mentioned above (w/ 8GB) can
continue with the current VMSPLIT option (the default 3G/1G), no ?, as
VMSPLIT_4G_4G can't make the complete 8G accessible from lowmem.

So if we make VMSPLIT_4G_4G depend on !HIGH_MEMORY (w/ a mention of the
caveat in the Kconfig help that this is meant for platforms w/ <=4GB),
then we can do copy_{from,to}_user() the same way we currently do, and
there is no need to do the user page pinning & kmap, right ?

The only problem i see is that a kernel compiled w/ VMSPLIT_4G_4G is
not suitable for >4GB machines, but anyway, iiuc, it was not meant for
those machines. And it is not going to affect our current multiplatform
setup as LPAE is not enabled in multi_v7.

Regards
afzal


Re: ARM: static kernel in vmalloc space

2020-05-14 Thread afzal mohammed
Hi,

On Tue, May 12, 2020 at 09:49:59PM +0200, Arnd Bergmann wrote:

> Any idea which bit you want to try next?

My plan has been to next post patches for the static kernel migration
to vmalloc space (currently the code is rigid, taking the easy route
wherever possible & not of high quality), as that feature has an
independent existence & adds value by itself, and then to start working
on the other steps towards VMSPLIT_4G_4G.

Now that you have mentioned the other things, i will slowly start on
those as well.

> Creating a raw_copy_{from,to}_user()
> based on get_user_pages()/kmap_atomic()/memcpy() is probably a good
> next thing to do. I think it can be done one page at a time with only
> checking for
> get_fs(), access_ok(), and page permissions, while get_user()/put_user()
> need to handle a few more corner cases.
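
( a rough sketch of my understanding of where those checks would sit -
purely illustrative, the helper names and the exact placement of
uaccess_kernel()/access_ok() are assumptions on my part: )

#include <linux/mm.h>
#include <linux/string.h>
#include <linux/uaccess.h>

/* the page-at-a-time gup + kmap_atomic + memcpy helper sketched earlier
 * in this digest (illustrative name, not real code)
 */
unsigned long gup_kmap_copy_to_user(void __user *to, const void *from,
				    unsigned long n);

static unsigned long sketch_raw_copy_to_user(void __user *to, const void *from,
					     unsigned long n)
{
	/*
	 * set_fs(KERNEL_DS) callers: the "user" pointer is really a kernel
	 * one (uaccess_kernel() as it existed before the set_fs() removal)
	 */
	if (uaccess_kernel()) {
		memcpy((void __force *)to, from, n);
		return 0;
	}

	/* reject obviously bad user ranges before pinning anything */
	if (!access_ok(to, n))
		return n;

	return gup_kmap_copy_to_user(to, from, n);
}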

Before starting w/ the other things, i would like to align on the high
level design.

My understanding (mostly based on your comments) is as follows
(i currently do not have a firm grip on these things, but hope to have
it once i have started w/ the implementation),

1. SoC w/ LPAE 
2. TTBR1 (top 256MB) for static kernel, modules, io mappings, vmalloc,
kmap, fixmap & vectors
3. TTBR0 (low 3768MB) for user space & lowmem (kernel lowmem to have
separate ASID)
4. for user space to/from copy
 a. pin user pages
 b. kmap user page (can't corresponding lowmem be used instead ?)
 c. copy

Main points are as above, right ?, anything missed ?, or anything more
you want to add ?, let me know your opinion.

Regards
afzal


Re: ARM: static kernel in vmalloc space

2020-05-12 Thread afzal mohammed
Hi,

On Mon, May 11, 2020 at 05:29:29PM +0200, Arnd Bergmann wrote:

> What do you currently do with the module address space?

In the current setup, the module address space was untouched, i.e. the
virtual address difference b/n the text & module space is far greater
than 32MB, at least > (2+768+16)MB, and modules can't be loaded unless
ARM_MODULE_PLTS is enabled (this was verified just now).

> easiest way is to just always put modules into vmalloc space, as we already
> do with CONFIG_ARM_MODULE_PLTS when the special area gets full,
> but that could be optimized once the rest works.

Okay

Regards
afzal


ARM: static kernel in vmalloc space (was Re: [PATCH 0/3] Highmem support for 32-bit RISC-V)

2020-05-11 Thread afzal mohammed
Hi,

Kernel now boots to prompt w/ static kernel mapping moved to vmalloc
space.

The changes done so far have a couple of platform specific things; this
has to be modified to make it multiplatform friendly (the
ARM_PATCH_PHYS_VIRT case also needs to be taken care of). The module
address space has to be taken care of as well.

Logs follow.

Regards
afzal

[0.00] Booting Linux on physical CPU 0x0
[0.00] Linux version 5.7.0-rc1-00043-ge8ffd99475b9c (afzal@afzalpc) 
(gcc version 8.2.0 (GCC_MA), GNU ld (GCC_MA) 2.31.1) #277 SMP Mon May 11 
18:16:51 IST 2020
[0.00] CPU: ARMv7 Processor [412fc0f1] revision 1 (ARMv7), cr=10c5387d
[0.00] CPU: div instructions available: patching division code
[0.00] CPU: PIPT / VIPT nonaliasing data cache, PIPT instruction cache
[0.00] OF: fdt: Machine model: V2P-CA15
[0.00] printk: bootconsole [earlycon0] enabled
[0.00] Memory policy: Data cache writealloc
[0.00] efi: UEFI not found.
[0.00] Reserved memory: created DMA memory pool at 0x1800, size 8 
MiB
[0.00] OF: reserved mem: initialized node vram@1800, compatible id 
shared-dma-pool
[0.00] percpu: Embedded 20 pages/cpu s49164 r8192 d24564 u81920
[0.00] Built 1 zonelists, mobility grouping on.  Total pages: 522751
[0.00] Kernel command line: console=ttyAMA0,115200 rootwait 
root=/dev/mmcblk0 earlyprintk
[0.00] Dentry cache hash table entries: 131072 (order: 7, 524288 bytes, 
linear)
[0.00] Inode-cache hash table entries: 65536 (order: 6, 262144 bytes, 
linear)
[0.00] mem auto-init: stack:off, heap alloc:off, heap free:off
[0.00] Memory: 2057032K/2097148K available (12288K kernel code, 1785K 
rwdata, 5188K rodata, 2048K init, 403K bss, 40116K reserved, 0K cma-reserved, 
1310716K highmem)
[0.00] Virtual kernel memory layout:
[0.00] vector  : 0x - 0x1000   (   4 kB)
[0.00] fixmap  : 0xffc0 - 0xfff0   (3072 kB)
[0.00] vmalloc : 0xf100 - 0xff80   ( 232 MB)
[0.00] lowmem  : 0xc000 - 0xf000   ( 768 MB)
[0.00] pkmap   : 0xbfe0 - 0xc000   (   2 MB)
[0.00] modules : 0xbf00 - 0xbfe0   (  14 MB)
[0.00]   .text : 0xf1208000 - 0xf1f0   (13280 kB)
[0.00]   .init : 0xf250 - 0xf270   (2048 kB)
[0.00]   .data : 0xf270 - 0xf28be558   (1786 kB)
[0.00].bss : 0xf28be558 - 0xf29231a8   ( 404 kB)
[0.00] SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=2, Nodes=1
[0.00] rcu: Hierarchical RCU implementation.
[0.00] rcu: RCU event tracing is enabled.
[0.00] rcu: RCU restricting CPUs from NR_CPUS=16 to nr_cpu_ids=2.
[0.00] rcu: RCU calculated value of scheduler-enlistment delay is 10 
jiffies.
[0.00] rcu: Adjusting geometry for rcu_fanout_leaf=16, nr_cpu_ids=2
[0.00] NR_IRQS: 16, nr_irqs: 16, preallocated irqs: 16
[0.00] random: get_random_bytes called from start_kernel+0x304/0x49c 
with crng_init=0
[0.000311] sched_clock: 32 bits at 24MHz, resolution 41ns, wraps every 
89478484971ns
[0.006788] clocksource: arm,sp804: mask: 0x max_cycles: 0x, 
max_idle_ns: 1911260446275 ns
[0.008479] Failed to initialize 
'/bus@800/motherboard/iofpga@3,/timer@12': -22
[0.013414] arch_timer: cp15 timer(s) running at 62.50MHz (virt).
[0.013875] clocksource: arch_sys_counter: mask: 0xff 
max_cycles: 0x1cd42e208c, max_idle_ns: 881590405314 ns
[0.014610] sched_clock: 56 bits at 62MHz, resolution 16ns, wraps every 
4398046511096ns
[0.015199] Switching to timer-based delay loop, resolution 16ns
[0.020168] Console: colour dummy device 80x30
[0.022219] Calibrating delay loop (skipped), value calculated using timer 
frequency.. 125.00 BogoMIPS (lpj=625000)
[0.026998] pid_max: default: 32768 minimum: 301
[0.028835] Mount-cache hash table entries: 2048 (order: 1, 8192 bytes, 
linear)
[0.029319] Mountpoint-cache hash table entries: 2048 (order: 1, 8192 bytes, 
linear)
[0.044484] CPU: Testing write buffer coherency: ok
[0.045452] CPU0: Spectre v2: firmware did not set auxiliary control 
register IBE bit, system vulnerable
[0.057536] /cpus/cpu@0 missing clock-frequency property
[0.058065] /cpus/cpu@1 missing clock-frequency property
[0.058538] CPU0: thread -1, cpu 0, socket 0, mpidr 8000
[0.066972] Setting up static identity map for 0x8030 - 0x803000ac
[0.074772] rcu: Hierarchical SRCU implementation.
[0.083336] EFI services will not be available.
[0.085605] smp: Bringing up secondary CPUs ...
[0.090454] CPU1: thread -1, cpu 1, socket 0, mpidr 8001
[0.090560] CPU1: Spectre v2: firmware did not set auxiliary control 
register IBE bit, system vulnerable
[0.096711] smp: Brought up 1 node, 2 CPUs
[0.097132] SMP: Total of 2 processors 

Re: [PATCH] ARM: omap1: fix irq setup

2020-05-05 Thread afzal mohammed
Hi,

On Tue, May 05, 2020 at 04:13:48PM +0200, Arnd Bergmann wrote:

> A recent cleanup introduced a bug on any omap1 machine that has
> no wakeup IRQ, i.e. omap15xx:

> Move this code into a separate function to deal with it cleanly.
> 
> Fixes: b75ca5217743 ("ARM: OMAP: replace setup_irq() by request_irq()")
> Signed-off-by: Arnd Bergmann 

Sorry for the mistake and thanks for the fix,

Acked-by: afzal mohammed 

Regards
afzal


Re: [PATCH 0/3] Highmem support for 32-bit RISC-V

2020-05-04 Thread afzal mohammed
[ +linux-arm-kernel

  Context: This is regarding VMSPLIT_4G_4G support for 32-bit ARM as a
  possible replacement for highmem. For that, the initial attempt is to
  move the static kernel mapping from lowmem to vmalloc space.

  in the next reply, i will drop everyone/lists not related to ARM ]

Hi,

On Sun, May 03, 2020 at 10:20:39PM +0200, Arnd Bergmann wrote:

> Which SoC platform are you running this on? Just making
> sure that this won't conflict with static mappings later.

Versatile Express V2P-CA15 on qemu, qemu options include --smp 2 &
2GB memory.

BTW, i could not convince myself why static io mappings are used at
all, except for DEBUG_LL.

> 
> One problem I see immediately in arm_memblock_init()

Earlier it went past arm_memblock_init(); the issue was clearing the
page tables from VMALLOC_START in devicemaps_init() (via paging_init()),
which was like cutting off the branch we were sitting on.

Now it is crashing at debug_ll_io_init() in devicemaps_init(), and
printascii/earlycon was & is being used to debug :). Things go wrong
when it tries to create the mapping for DEBUG_LL. It looks like a
conflict with a static mapping, which you mentioned above; at the same
time i am not seeing the kernel static mapping at that virtual
address, so i need to dig deeper.

i also tried removing DEBUG_LL, and there is a deafening silence on
the console ;)
> is that it uses
> __pa() to convert from virtual address in the linear map to physical,
> but now you actually pass an address that is in vmalloc rather than
> the linear map.

__virt_to_phys_nodebug(), which does the actual work on a __pa()
invocation, has been modified to handle that case (ideas lifted from
arm64's implementation), though currently it is a hack as below (and
applicable only to the ARM_PATCH_PHYS_VIRT disabled case); the other
hacks are VMALLOC_OFFSET set to 0 and adjusting the vmalloc size.

static inline phys_addr_t __virt_to_phys_nodebug(unsigned long x)
{
	phys_addr_t __x = (phys_addr_t)x;

	if (__x >= 0xf000)
		return __x - KIMAGE_OFFSET + PHYS_OFFSET;
	else
		return __x - PAGE_OFFSET + PHYS_OFFSET;
}

Regards
afzal


Re: [PATCH 0/3] Highmem support for 32-bit RISC-V

2020-05-03 Thread afzal mohammed
Hi Arnd,

> On Tue, Apr 14, 2020 at 09:29:46PM +0200, Arnd Bergmann wrote:

> > Another thing to try early is to move the vmlinux virtual address
> > from the linear mapping into vmalloc space. This does not require
> > LPAE either, but it only works on relatively modern platforms that
> > don't have conflicting fixed mappings there.

i have started by attempting to move the static kernel mapping from
lowmem to vmalloc space. At boot, execution so far has gone past
assembly & reached C, to be specific arm_memblock_init() [in
setup_arch()]; i am currently debugging the hang that happens after
that point. To make things easier in the beginning, ARM_PATCH_PHYS_VIRT
is disabled & a platform specific PHYS_OFFSET is fed in; this is
planned to be fixed once it boots.

[ i will probably start a new thread or hopefully RFC on LAKML ]

Regards
afzal


Re: [PATCH] hyper-v: Check for ring buffer in hv_get_bytes_to_read/write

2019-03-13 Thread Mohammed Gamal
On Tue, 2019-03-12 at 18:02 +, Haiyang Zhang wrote:
>  
>  
> > -Original Message-
> > From: Mohammed Gamal 
> > Sent: Thursday, March 7, 2019 1:32 PM
> > To: Michael Kelley ; linux-hyp...@vger.kernel.org;
> > kimbrownkd 
> > Cc: Sasha Levin ; Dexuan Cui ; Stephen Hemminger ;
> > Long Li ; KY Srinivasan ; Haiyang Zhang ; vkuznets ;
> > linux-ker...@vger.kernel.org
> > Subject: Re: [PATCH] hyper-v: Check for ring buffer in
> > hv_get_bytes_to_read/write
> >
> > On Thu, 2019-03-07 at 17:33 +, Michael Kelley wrote:
> > > From: Mohammed Gamal  Sent: Thursday, March 7,
> > > 2019 8:36 AM
> > > >
> > > > This patch adds a check for the presence of the ring buffer in
> > > > hv_get_bytes_to_read/write() to avoid possible NULL pointer
> > > > dereferences.
> > > > If the ring buffer is not yet allocated, return 0 bytes to be
> > > > read/written.
> > > >
> > > > The root cause is that code that accesses the ring buffer including
> > > > hv_get_bytes_to_read/write() could be vulnerable to the race
> > > > condition discussed in https://lkml.org/lkml/2018/10/18/779
> > > >
> > > > This race is being addressed by the patch series by Kimberly Brown
> > > > in https://lkml.org/lkml/2019/2/21/1236 which is not final yet
> > > >
> > > > Signed-off-by: Mohammed Gamal 
> > >
> > > Could you elaborate on the code paths where
> > > hv_get_bytes_to_read/write() could be called when the ring buffer
> > > isn't yet allocated?  My sense is that Kim Brown's patch will address
> > > all of the code paths that involved sysfs access from outside the
> > > driver.  And within a driver, the ring buffer should never be accessed
> > > unless it is already allocated.  Is there another code path we're not
> > > aware of?  I'm wondering if these changes are really needed once Kim
> > > Brown's patch is finished.
> > >
> > > Michael
> >
> > I've seen one instance of the race in the netvsc driver when running
> > traffic through it with iperf3 while continuously changing the channel
> > settings.
> >
> > The following code path deallocates the ring buffer:
> > netvsc_set_channels() -> netvsc_detach() ->
> > rndis_filter_device_remove() -> netvsc_device_remove() -> vmbus_close()
> > -> vmbus_free_ring() -> hv_ringbuffer_cleanup().
> >
> > netvsc_send_pkt() -> hv_get_bytes_to_write() might get called
> > concurrently after vmbus_close() and before vmbus_open() returns and
> > sets up the new ring buffer.
> >
> > The race is fairly hard to reproduce on recent upstream kernels, but I
> > still managed to reproduce it.
>  
> Looking at the code from netvsc_detach() –
>  netif_tx_disable(ndev) is called before
> rndis_filter_device_remove(hdev, nvdev).
> So there should be no call to netvsc_send_pkt() after detaching.
> What’s the crash stack trace?
>  
> static int netvsc_detach(struct net_device *ndev,
>  struct netvsc_device *nvdev)
> {
>     struct net_device_context *ndev_ctx = netdev_priv(ndev);
>     struct hv_device *hdev = ndev_ctx->device_ctx;
>     int ret;
>  
>     /* Don't try continuing to try and setup sub channels */
>     if (cancel_work_sync(&nvdev->subchan_work))
>     nvdev->num_chn = 1;
>  
>     /* If device was up (receiving) then shutdown */
>     if (netif_running(ndev)) {
>     netif_tx_disable(ndev);
>  
>     ret = rndis_filter_close(nvdev);
>     if (re

Re: [PATCH] hyper-v: Check for ring buffer in hv_get_bytes_to_read/write

2019-03-07 Thread Mohammed Gamal
On Thu, 2019-03-07 at 17:33 +, Michael Kelley wrote:
> From: Mohammed Gamal  Sent: Thursday, March 7,
> 2019 8:36 AM
> > 
> > This patch adds a check for the presence of the ring buffer in
> > hv_get_bytes_to_read/write() to avoid possible NULL pointer
> > dereferences.
> > If the ring buffer is not yet allocated, return 0 bytes to be
> > read/written.
> > 
> > The root cause is that code that accesses the ring buffer including
> > hv_get_bytes_to_read/write() could be vulnerable to the race
> > condition
> > discussed in https://lkml.org/lkml/2018/10/18/779>;
> > 
> > This race is being addressed by the patch series by Kimberly Brown
> > in
> > https://lkml.org/lkml/2019/2/21/1236 which is not final yet
> > 
> > Signed-off-by: Mohammed Gamal 
> 
> Could you elaborate on the code paths where
> hv_get_bytes_to_read/write() could be called when the ring buffer
> isn't yet allocated?  My sense is that Kim Brown's patch will address
> all of the code paths that involved sysfs access from outside the
> driver.  And within a driver, the ring buffer should never be
> accessed
> unless it is already allocated.  Is there another code path we're not
> aware of?  I'm wondering if these changes are really needed once
> Kim Brown's patch is finished.
> 
> Michael

I've seen one instance of the race in the netvsc driver when running
traffic through it with iperf3 while continuously changing the channel
settings.

The following code path deallocates the ring buffer:
netvsc_set_channels() -> netvsc_detach() ->
rndis_filter_device_remove() -> netvsc_device_remove() -> vmbus_close()
-> vmbus_free_ring() -> hv_ringbuffer_cleanup().

netvsc_send_pkt() -> hv_get_bytes_to_write() might get called
concurrently after vmbus_close() and before vmbus_open() returns and
sets up the new ring buffer. 

The race is fairly hard to reproduce on recent upstream kernels, but I
still managed to reproduce it.


[PATCH] hyper-v: Check for ring buffer in hv_get_bytes_to_read/write

2019-03-07 Thread Mohammed Gamal
This patch adds a check for the presence of the ring buffer in
hv_get_bytes_to_read/write() to avoid possible NULL pointer dereferences.
If the ring buffer is not yet allocated, return 0 bytes to be read/written.

The root cause is that code that accesses the ring buffer including
hv_get_bytes_to_read/write() could be vulnerable to the race condition
discussed in https://lkml.org/lkml/2018/10/18/779

This race is being addressed by the patch series by Kimberly Brown in
https://lkml.org/lkml/2019/2/21/1236 which is not final yet

Signed-off-by: Mohammed Gamal 
---
 include/linux/hyperv.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 64698ec8f2ac..7b2f566250b2 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -148,6 +148,9 @@ static inline u32 hv_get_bytes_to_read(const struct hv_ring_buffer_info *rbi)
 {
u32 read_loc, write_loc, dsize, read;
 
+   if (!rbi->ring_buffer)
+   return 0;
+
dsize = rbi->ring_datasize;
read_loc = rbi->ring_buffer->read_index;
write_loc = READ_ONCE(rbi->ring_buffer->write_index);
@@ -162,6 +165,9 @@ static inline u32 hv_get_bytes_to_write(const struct hv_ring_buffer_info *rbi)
 {
u32 read_loc, write_loc, dsize, write;
 
+   if (!rbi->ring_buffer)
+   return 0;
+
dsize = rbi->ring_datasize;
read_loc = READ_ONCE(rbi->ring_buffer->read_index);
write_loc = rbi->ring_buffer->write_index;
-- 
2.18.1


