[RFC v5 3/3] cpuidle-powernv : Recompute the idle-state timeouts when state usage is enabled/disabled

2019-10-02 Thread Abhishek Goel
The disable callback can be used to recompute the timeouts for the other
states whenever a state is enabled or disabled. We store the computed timeout
in the "timeout" field added to the cpuidle state structure, so the timeout is
computed only when some state is enabled or disabled and not on every entry
into the fast idle path.
We also use the computed timeout for snooze, which lets us get rid of
get_snooze_timeout() in the snooze loop.

Signed-off-by: Abhishek Goel 
---
 drivers/cpuidle/cpuidle-powernv.c | 35 +++
 include/linux/cpuidle.h   |  1 +
 2 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index d7686ce6e..a75226f52 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -45,7 +45,6 @@ struct stop_psscr_table {
 static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] 
__read_mostly;
 
 static u64 default_snooze_timeout __read_mostly;
-static bool snooze_timeout_en __read_mostly;
 
 static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
 struct cpuidle_driver *drv,
@@ -67,26 +66,13 @@ static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
return 0;
 }
 
-static u64 get_snooze_timeout(struct cpuidle_device *dev,
- struct cpuidle_driver *drv,
- int index)
+static void pnv_disable_callback(struct cpuidle_device *dev,
+struct cpuidle_driver *drv)
 {
int i;
 
-   if (unlikely(!snooze_timeout_en))
-   return default_snooze_timeout;
-
-   for (i = index + 1; i < drv->state_count; i++) {
-   struct cpuidle_state *s = &drv->states[i];
-   struct cpuidle_state_usage *su = &dev->states_usage[i];
-
-   if (s->disabled || su->disable)
-   continue;
-
-   return s->target_residency * tb_ticks_per_usec;
-   }
-
-   return default_snooze_timeout;
+   for (i = 0; i < drv->state_count; i++)
+   drv->states[i].timeout = forced_wakeup_timeout(dev, drv, i);
 }
 
 static int snooze_loop(struct cpuidle_device *dev,
@@ -94,16 +80,20 @@ static int snooze_loop(struct cpuidle_device *dev,
int index)
 {
u64 snooze_exit_time;
+   u64 snooze_timeout = drv->states[index].timeout;
+
+   if (!snooze_timeout)
+   snooze_timeout = default_snooze_timeout;
 
set_thread_flag(TIF_POLLING_NRFLAG);
 
local_irq_enable();
 
-   snooze_exit_time = get_tb() + get_snooze_timeout(dev, drv, index);
+   snooze_exit_time = get_tb() + snooze_timeout;
ppc64_runlatch_off();
HMT_very_low();
while (!need_resched()) {
-   if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) {
+   if (get_tb() > snooze_exit_time) {
/*
 * Task has not woken up but we are exiting the polling
 * loop anyway. Require a barrier after polling is
@@ -168,7 +158,7 @@ static int stop_loop(struct cpuidle_device *dev,
u64 timeout_tb;
bool forced_wakeup = false;
 
-   timeout_tb = forced_wakeup_timeout(dev, drv, index);
+   timeout_tb = drv->states[index].timeout;
 
if (timeout_tb) {
/* Ensure that the timeout is at least one microsecond
@@ -263,6 +253,7 @@ static int powernv_cpuidle_driver_init(void)
 */
 
drv->cpumask = (struct cpumask *)cpu_present_mask;
+   drv->disable_callback = pnv_disable_callback;
 
return 0;
 }
@@ -422,8 +413,6 @@ static int powernv_idle_probe(void)
/* Device tree can indicate more idle states */
max_idle_state = powernv_add_idle_states();
default_snooze_timeout = TICK_USEC * tb_ticks_per_usec;
-   if (max_idle_state > 1)
-   snooze_timeout_en = true;
} else
return -ENODEV;
 
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 1729a497b..64195861b 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -50,6 +50,7 @@ struct cpuidle_state {
int power_usage; /* in mW */
unsigned inttarget_residency; /* in US */
booldisabled; /* disabled on all CPUs */
+   unsigned long long timeout; /* timeout for exiting out of a state */
 
int (*enter)(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
-- 
2.17.1



[RFC v5 2/3] cpuidle : Add callback whenever a state usage is enabled/disabled

2019-10-02 Thread Abhishek Goel
To force-wake a CPU, the timeout currently has to be computed in the fast idle
path, because a state may be enabled or disabled at any time and there was no
feedback to the driver when that happened.
This patch adds a callback that is invoked whenever a state_usage records a
store to its disable attribute.

Signed-off-by: Abhishek Goel 
---
 drivers/cpuidle/sysfs.c | 15 ++-
 include/linux/cpuidle.h |  3 +++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index 2bb2683b4..6c9bf2f7b 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -418,8 +418,21 @@ static ssize_t cpuidle_state_store(struct kobject *kobj, 
struct attribute *attr,
struct cpuidle_state_attr *cattr = attr_to_stateattr(attr);
struct cpuidle_device *dev = kobj_to_device(kobj);
 
-   if (cattr->store)
+   if (cattr->store) {
ret = cattr->store(state, state_usage, buf, size);
+   if (ret == size &&
+   !strncmp(cattr->attr.name, "disable",
+   strlen("disable"))) {
+   struct kobject *cpuidle_kobj = kobj->parent;
+   struct cpuidle_device *dev =
+   to_cpuidle_device(cpuidle_kobj);
+   struct cpuidle_driver *drv =
+   cpuidle_get_cpu_driver(dev);
+
+   if (drv->disable_callback)
+   drv->disable_callback(dev, drv);
+   }
+   }
 
/* reset poll time cache */
dev->poll_limit_ns = 0;
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 4b6b5bea8..1729a497b 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -122,6 +122,9 @@ struct cpuidle_driver {
/* the driver handles the cpus in cpumask */
struct cpumask  *cpumask;
 
+   void (*disable_callback)(struct cpuidle_device *dev,
+struct cpuidle_driver *drv);
+
/* preferred governor to switch at register time */
const char  *governor;
 };
-- 
2.17.1



[PATCH v5 1/3] cpuidle-powernv : forced wakeup for stop states

2019-10-02 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter based on heuristics that depend on the idle history of
that CPU. Given that no predictive heuristic is perfect, there are cases
where the governor predicts a shallow idle state, hoping that the CPU will
be busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU may end up staying in the shallow state.

This is problematic when the predicted state in the aforementioned
scenario is a shallow stop state on a tickless system, as we might get
stuck in shallow states for hours in the absence of ticks or interrupts.

To address this, we forcefully wake up the CPU by setting the
decrementer. The decrementer is set to a value that corresponds to the
residency of the next available state, thus arming a timer that will
forcefully wake up the CPU. A few such iterations will essentially train the
governor to select a deeper state for that CPU, as the timer here
corresponds to the next available cpuidle state's residency. Thus, the CPU will
eventually end up in the deepest possible state.
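
For context, here is a rough sketch (an assumption, not the exact patch code)
of how the stop loop is expected to use the helpers introduced below; a
timeout of 0 means the deepest enabled state was chosen, in which case the
decrementer is left untouched:

/* Sketch only: expected usage of the decrementer helpers in stop_loop(). */
static int stop_loop(struct cpuidle_device *dev,
		     struct cpuidle_driver *drv,
		     int index)
{
	u64 timeout_tb;
	bool forced_wakeup = false;

	timeout_tb = forced_wakeup_timeout(dev, drv, index);
	if (timeout_tb)
		forced_wakeup = try_set_dec_before_idle(timeout_tb);

	power9_idle_type(stop_psscr_table[index].val,
			 stop_psscr_table[index].mask);

	if (forced_wakeup)
		try_reset_dec_after_idle();

	return index;
}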

Signed-off-by: Abhishek Goel 
---

Auto-promotion
  v1 : Started as auto-promotion logic for cpuidle states in the generic
driver
  v2 : Removed timeout_needed and rebased the code to the upstream kernel
Forced-wakeup
  v1 : New patch started under the name "forced wakeup"
  v2 : Extended the forced-wakeup logic to all states. Set the
decrementer instead of queuing up an hrtimer to implement the logic.
  v3 : Cleanly handle setting/resetting of the decrementer so as to not break
irq work
  v4 : Changed type and name of the set/reset decrementer functions
       Handled irq_work_pending in try_set_dec_before_idle
  v5 : Removed forced wakeup for the last stop state by changing the
       checking condition of timeout_tb

 arch/powerpc/include/asm/time.h   |  2 ++
 arch/powerpc/kernel/time.c| 43 +++
 drivers/cpuidle/cpuidle-powernv.c | 40 
 3 files changed, 85 insertions(+)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 08dbe3e68..06a6a2314 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -184,6 +184,8 @@ static inline unsigned long tb_ticks_since(unsigned long 
tstamp)
 extern u64 mulhdu(u64, u64);
 #endif
 
+extern bool try_set_dec_before_idle(u64 timeout);
+extern void try_reset_dec_after_idle(void);
 extern void div128_by_32(u64 dividend_high, u64 dividend_low,
 unsigned divisor, struct div_result *dr);
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 694522308..d004c0d8e 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -576,6 +576,49 @@ void arch_irq_work_raise(void)
 
 #endif /* CONFIG_IRQ_WORK */
 
+/*
+ * This function tries setting decrementer before entering into idle.
+ * Returns true if we have reprogrammed the decrementer for idle.
+ * Returns false if the decrementer is unchanged.
+ */
+bool try_set_dec_before_idle(u64 timeout)
+{
+   u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
+   u64 now = get_tb_or_rtc();
+
+   if (now + timeout > *next_tb)
+   return false;
+
+   set_dec(timeout);
+   if (test_irq_work_pending())
+   set_dec(1);
+
+   return true;
+}
+
+/*
+ * This function gets called if we have set decrementer before
+ * entering into idle. It tries to reset/restore the decrementer
+ * to its original value.
+ */
+void try_reset_dec_after_idle(void)
+{
+   u64 now;
+   u64 *next_tb;
+
+   if (test_irq_work_pending())
+   return;
+
+   now = get_tb_or_rtc();
+   next_tb = this_cpu_ptr(&decrementers_next_tb);
+   if (now >= *next_tb)
+   return;
+
+   set_dec(*next_tb - now);
+   if (test_irq_work_pending())
+   set_dec(1);
+}
+
 /*
  * timer_interrupt - gets called when the decrementer overflows,
  * with interrupts disabled.
diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index 84b1ebe21..d7686ce6e 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Expose only those Hardware idle states via the cpuidle framework
@@ -46,6 +47,26 @@ static struct stop_psscr_table 
stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly
 static u64 default_snooze_timeout __read_mostly;
 static bool snooze_timeout_en __read_mostly;
 
+static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
+struct cpuidle_driver *drv,
+int index)
+{
+   int i;
+
+   for (i = index + 1; i < drv->state_count; i++) {
+   struct cpuidle_state *s = &drv->states[i];
+   struct cpuidle_state_usage *su = &dev->states_usage[i];
+
+   if (s->disabled || su->disable)
+   c

[PATCH v5 0/3] Forced-wakeup for stop states on Powernv

2019-10-02 Thread Abhishek Goel
Currently, the cpuidle governors determine what idle state an idling CPU
should enter based on heuristics that depend on the idle history of
that CPU. Given that no predictive heuristic is perfect, there are cases
where the governor predicts a shallow idle state, hoping that the CPU will
be busy soon. However, if no new workload is scheduled on that CPU in the
near future, the CPU will end up staying in the shallow state.

Motivation
--
In the case of POWER, this is problematic when the predicted state in the
aforementioned scenario is a shallow stop state on a tickless system, as we
might get stuck in shallow states for hours in the absence of ticks or
interrupts.

To address this, we forcefully wake up the CPU by setting the decrementer.
The decrementer is set to a value that corresponds to the residency of
the next available state, thus arming a timer that will forcefully
wake up the CPU. A few such iterations will essentially train the governor to
select a deeper state for that CPU, as the timer here corresponds to the
next available cpuidle state's residency. Thus, the CPU will eventually end up
in the deepest possible state and we won't get stuck in a shallow state
for a long duration.

Experiment
--
For earlier versions, when this feature was meant to be only for shallow lite
states, I performed experiments for three scenarios to collect some data.

case 1 :
Without this patch and without the tick retained, i.e. in an upstream kernel,
it could take even more than a second to get out of stop0_lite.

case 2 : With the tick retained in an upstream kernel -

Generally, we have a sched tick every 4 ms (CONFIG_HZ = 250). Ideally I
expected it to take 8 sched ticks to get out of stop0_lite. Experimentally,
the observation was:

=============================================
sample  min    max    99th percentile
20      4ms    12ms   4ms
=============================================

It takes at least one sched tick to get out of stop0_lite.

case 3 : With this patch (not stopping the tick, but explicitly queuing a
  timer)

=============================================
sample  min     max     99th percentile
20      144us   192us   144us
=============================================

Description of current implementation
-

We calculate the timeout for the current idle state as the residency value
of the next available idle state. If the decrementer is set to fire later
than this timeout, we update the decrementer value with the residency of the
next available idle state. This essentially trains the governor to select the
next available deeper state until we reach the deepest state. Hence, we won't
get stuck unnecessarily in shallow states for long durations.
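
For reference, a minimal sketch of the timeout computation described above
(reconstructed from the get_snooze_timeout() body removed in patch 3; treat
the details as an approximation): the timeout is the residency of the next
enabled state, converted to timebase ticks, or 0 when we are already in the
deepest enabled state.

static u64 forced_wakeup_timeout(struct cpuidle_device *dev,
				 struct cpuidle_driver *drv,
				 int index)
{
	int i;

	/* Pick the residency of the next enabled (not disabled) state. */
	for (i = index + 1; i < drv->state_count; i++) {
		struct cpuidle_state *s = &drv->states[i];
		struct cpuidle_state_usage *su = &dev->states_usage[i];

		if (s->disabled || su->disable)
			continue;

		return s->target_residency * tb_ticks_per_usec;
	}

	/* Deepest enabled state: no forced wakeup. */
	return 0;
}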


v1 of auto-promotion : https://lkml.org/lkml/2019/3/22/58 This patch was
implemented only for the shallow lite state in the generic cpuidle driver.

v2 : Removed timeout_needed and rebased to current
upstream kernel

Then,
v1 of forced-wakeup : Moved the code to cpuidle powernv driver and started
as forced wakeup instead of auto-promotion

v2 : Extended the forced wakeup logic for all states.
Setting the decrementer instead of queuing up an hrtimer to implement the
logic.

v3 : 1) Cleanly handle setting the decrementer after exiting out of stop
   states.
 2) Added a disable_callback feature to compute the timeout whenever a
state is enabled or disabled, instead of computing it every time in the fast
idle path.
 3) Use disable callback to recompute timeout whenever state usage
is changed for a state. Also, cleaned up the get_snooze_timeout
function.

v4 : Changed the type and name of the set/reset decrementer functions.
Handled irq work pending in try_set_dec_before_idle.
No change in patches 2 and 3.

v5 : Removed forced wakeup for the last state. We don't want to wake up
unnecessarily when already in the deepest state. It was a mistake in the
previous patches that was found out in recent experiments.
No change in patches 2 and 3.

Abhishek Goel (3):
  cpuidle-powernv : forced wakeup for stop states
  cpuidle : Add callback whenever a state usage is enabled/disabled
  cpuidle-powernv : Recompute the idle-state timeouts when state usage
is enabled/disabled

 arch/powerpc/include/asm/time.h   |  2 ++
 arch/powerpc/kernel/time.c| 43 
 drivers/cpuidle/cpuidle-powernv.c | 55 +++
 drivers/cpuidle/sysfs.c   | 15 -
 include/linux/cpuidle.h   |  4 +++
 5 files changed, 105 insertions(+), 14 deletions(-)

-- 
2.17.1



Re: [PATCH] powerpc/powernv/prd: Validate whether address to be mapped is part of system RAM

2019-10-02 Thread Vasant Hegde

On 10/3/19 10:26 AM, Jeremy Kerr wrote:

Hi Vasant,


Correct. We will have `ibm,prd-label` property. That's not the issue.


It sure sounds like the issue - someone has represented a range that
should be mapped by HBRT, but isn't appropriate for mapping by HBRT.


Here the issue is that HBRT is loaded into NVDIMM memory.


OK. How about we just don't do that?


Yes. Hostboot will fix that. It will make sure that HBRT is loaded into regular 
memory.




It sounds like we're just trying to work around an invalid
representation of the mappings.


It's not a workaround, it's an additional check, so that we don't mmap() if
HBRT is not in system RAM and we can throw a proper error. That way debugging
becomes easier.

-Vasant



Re: [PATCH] powerpc/powernv/prd: Validate whether address to be mapped is part of system RAM

2019-10-02 Thread Jeremy Kerr
Hi Vasant,

> Correct. We will have `ibm,prd-label` property. That's not the issue. 

It sure sounds like the issue - someone has represented a range that
should be mapped by HBRT, but isn't appropriate for mapping by HBRT.

> Here the issue is that HBRT is loaded into NVDIMM memory.

OK. How about we just don't do that?

It sounds like we're just trying to work around an invalid
representation of the mappings.

Cheers,


Jeremy




Re: [PATCH] powerpc/powernv/prd: Validate whether address to be mapped is part of system RAM

2019-10-02 Thread Vasant Hegde

On 10/3/19 7:17 AM, Jeremy Kerr wrote:

Hi Vasant,


Jeremy,




Add check to validate whether requested page is part of system RAM
or not before mmap() and error out if its not part of system RAM.


opal_prd_range_is_valid() will return false if the reserved memory range
does not have an ibm,prd-label property. If, despite this, you're getting
invalid memory mapped through the PRD interface, that means the device tree is
incorrectly describing those ranges.


Correct. We will have the `ibm,prd-label` property. That's not the issue. Here
the issue is that HBRT is loaded into NVDIMM memory.


Copy-pasting Vaidy's explanation from internal bugzilla here:

--
The root cause of the problem seems to be HBRT using NVDIMM areas/addresses
for firmware operation.


Linux maps the required addresses for HBRT to read, write and execute. This
all works fine for normal RAM addresses. However, NVDIMM is not normal RAM; it
is device memory which can be used as RAM or through other layers and
subsystems.


The Linux kernel memory manager sets the page table attributes to 0b10
(non-idempotent I/O) instead of normal RAM (0b00), since this is a special
type of device memory initialized and used by a firmware device driver. This
prevented instruction execution from that mapped page. Since instructions
could not be fetched, the opal-prd application could not complete init and
start.


--

Hostboot should detect NVDIMM areas and avoid using those areas for any firmware 
purposes including HBRT. Hostboot will fix this issue.


In this patch we are adding an additional check to make sure mmap() fails
gracefully and we log a proper error. That way opal-prd will fail to start
instead of looping forever.
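
For illustration, a minimal sketch of the kind of check being described; the
helper name and the use of page_is_ram() are assumptions for this example, not
necessarily what the actual patch does:

/*
 * Sketch only (assumed helper): reject mmap() requests whose pages are not
 * system RAM, e.g. NVDIMM-backed ranges, so opal-prd gets a clean error.
 */
static bool opal_prd_range_is_ram(uint64_t addr, uint64_t size)
{
	uint64_t off;

	for (off = 0; off < size; off += PAGE_SIZE)
		if (!page_is_ram((addr + off) >> PAGE_SHIFT))
			return false;

	return true;
}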


-Vasant



Re: [PATCH v5 10/11] mm/Kconfig: Adds config option to track lockless pagetable walks

2019-10-02 Thread Qian Cai



> On Oct 2, 2019, at 9:36 PM, Leonardo Bras  wrote:
> 
> Adds config option LOCKLESS_PAGE_TABLE_WALK_TRACKING to make possible
> enabling tracking lockless pagetable walks directly from kernel config.

Can’t this name and all those new *lockless* function names be shorter? There
are many functions named with *_locked, so how about dropping lockless
altogether, i.e., PAGE_TABLE_WALK_TRACKING blah blah?

> 
> Signed-off-by: Leonardo Bras 
> ---
> mm/Kconfig | 11 +++
> 1 file changed, 11 insertions(+)
> 
> diff --git a/mm/Kconfig b/mm/Kconfig
> index a5dae9a7eb51..00f487a0122f 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -736,4 +736,15 @@ config ARCH_HAS_PTE_SPECIAL
> config ARCH_HAS_HUGEPD
>bool
> 
> +config LOCKLESS_PAGE_TABLE_WALK_TRACKING
> +bool "Track (and optimize) lockless page table walks"
> +default n
> +
> +help
> +  Maintain a reference count of active lockless page table
> +  walkers. This adds 4 bytes to struct mm size, and two atomic
> +  operations to calls such as get_user_pages_fast(). Some
> +  architectures can optimize lockless page table operations if
> +  this is enabled.
> +
> endmenu
> -- 
> 2.20.1


Re: [PATCH] powerpc/powernv/prd: Validate whether address to be mapped is part of system RAM

2019-10-02 Thread Jeremy Kerr
Hi Vasant,

> Add check to validate whether requested page is part of system RAM
> or not before mmap() and error out if its not part of system RAM.

opal_prd_range_is_valid() will return false if the reserved memory range
does not have an ibm,prd-label property. If, despite this, you're getting
invalid memory mapped through the PRD interface, that means the device tree is
incorrectly describing those ranges.

Or am I missing something?

Cheers,


Jeremy




[PATCH v5 10/11] mm/Kconfig: Adds config option to track lockless pagetable walks

2019-10-02 Thread Leonardo Bras
Adds the config option LOCKLESS_PAGE_TABLE_WALK_TRACKING to make it possible
to enable tracking of lockless pagetable walks directly from the kernel
config.

Signed-off-by: Leonardo Bras 
---
 mm/Kconfig | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/mm/Kconfig b/mm/Kconfig
index a5dae9a7eb51..00f487a0122f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -736,4 +736,15 @@ config ARCH_HAS_PTE_SPECIAL
 config ARCH_HAS_HUGEPD
bool
 
+config LOCKLESS_PAGE_TABLE_WALK_TRACKING
+   bool "Track (and optimize) lockless page table walks"
+   default n
+
+   help
+ Maintain a reference count of active lockless page table
+ walkers. This adds 4 bytes to struct mm size, and two atomic
+ operations to calls such as get_user_pages_fast(). Some
+ architectures can optimize lockless page table operations if
+ this is enabled.
+
 endmenu
-- 
2.20.1



[PATCH v5 05/11] powerpc/perf: Applies counting method to monitor lockless pgtbl walks

2019-10-02 Thread Leonardo Bras
Applies the counting-based method for monitoring lockless pgtable walks on
read_user_stack_slow.

local_irq_{save,restore} is already inside {begin,end}_lockless_pgtbl_walk,
so there is no need to repeat it here.

The variable that saves the irq mask was renamed from flags to irq_mask so it
doesn't lose meaning now that it's not directly passed to the local_irq_*
functions.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/perf/callchain.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index c84bbd4298a0..43e49d8be344 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -116,14 +116,14 @@ static int read_user_stack_slow(void __user *ptr, void 
*buf, int nb)
unsigned shift;
unsigned long addr = (unsigned long) ptr;
unsigned long offset;
-   unsigned long pfn, flags;
+   unsigned long pfn, irq_mask;
void *kaddr;
 
pgdir = current->mm->pgd;
if (!pgdir)
return -EFAULT;
 
-   local_irq_save(flags);
+   irq_mask = begin_lockless_pgtbl_walk(current->mm);
ptep = find_current_mm_pte(pgdir, addr, NULL, &shift);
if (!ptep)
goto err_out;
@@ -145,7 +145,7 @@ static int read_user_stack_slow(void __user *ptr, void 
*buf, int nb)
memcpy(buf, kaddr + offset, nb);
ret = 0;
 err_out:
-   local_irq_restore(flags);
+   end_lockless_pgtbl_walk(current->mm, irq_mask);
return ret;
 }
 
-- 
2.20.1



[PATCH v5 11/11] powerpc/mm/book3s64/pgtable: Uses counting method to skip serializing

2019-10-02 Thread Leonardo Bras
Skips the slow part of serialize_against_pte_lookup() if there is no lockless
pagetable walk running.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/mm/book3s64/pgtable.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/book3s64/pgtable.c 
b/arch/powerpc/mm/book3s64/pgtable.c
index ae557fdce9a3..0fef9400f210 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -95,7 +95,8 @@ static void do_nothing(void *unused)
 void serialize_against_pte_lookup(struct mm_struct *mm)
 {
smp_mb();
-   smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
+   if (running_lockless_pgtbl_walk(mm))
+   smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
 }
 
 /*
-- 
2.20.1



[PATCH v5 09/11] powerpc/kvm/book3s_64: Applies counting method to monitor lockless pgtbl walks

2019-10-02 Thread Leonardo Bras
Applies the counting-based method for monitoring all book3s_64-related
functions that do lockless pagetable walks.

Adds comments explaining that some lockless pagetable walks don't need
protection due to guest pgd not being a target of THP collapse/split, or
due to being called from Realmode + MSR_EE = 0.

Given that some of these functions are always called in realmode, we use
__{begin,end}_lockless_pgtbl_walk so we can decide when to disable
interrupts.

local_irq_{save,restore} is already inside {begin,end}_lockless_pgtbl_walk,
so there is no need to repeat it here.

The variable that saves the irq mask was renamed from flags to irq_mask so it
doesn't lose meaning now that it's not directly passed to the local_irq_*
functions.

There is also a function that uses local_irq_{en,dis}able, so the return
value of begin_lockless_pgtbl_walk() is ignored and we pass IRQS_ENABLED to
end_lockless_pgtbl_walk() to mimic the effect of local_irq_enable().

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c|  6 ++---
 arch/powerpc/kvm/book3s_64_mmu_radix.c | 34 +++---
 arch/powerpc/kvm/book3s_64_vio_hv.c|  7 +-
 3 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c 
b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 9a75f0e1933b..d8f374c979f5 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -615,12 +615,12 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, 
struct kvm_vcpu *vcpu,
/* if the guest wants write access, see if that is OK */
if (!writing && hpte_is_writable(r)) {
pte_t *ptep, pte;
-   unsigned long flags;
+   unsigned long irq_mask;
/*
 * We need to protect against page table destruction
 * hugepage split and collapse.
 */
-   local_irq_save(flags);
+   irq_mask = begin_lockless_pgtbl_walk(kvm->mm);
ptep = find_current_mm_pte(current->mm->pgd,
   hva, NULL, NULL);
if (ptep) {
@@ -628,7 +628,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
if (__pte_write(pte))
write_ok = 1;
}
-   local_irq_restore(flags);
+   end_lockless_pgtbl_walk(kvm->mm, irq_mask);
}
}
 
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c 
b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 2d415c36a61d..8ba9742e2fc8 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -813,20 +813,20 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
 * Read the PTE from the process' radix tree and use that
 * so we get the shift and attribute bits.
 */
-   local_irq_disable();
+   begin_lockless_pgtbl_walk(kvm->mm);
ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
/*
 * If the PTE disappeared temporarily due to a THP
 * collapse, just return and let the guest try again.
 */
if (!ptep) {
-   local_irq_enable();
+   end_lockless_pgtbl_walk(kvm->mm, IRQS_ENABLED);
if (page)
put_page(page);
return RESUME_GUEST;
}
pte = *ptep;
-   local_irq_enable();
+   end_lockless_pgtbl_walk(kvm->mm, IRQS_ENABLED);
 
/* If we're logging dirty pages, always map single pages */
large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);
@@ -972,10 +972,16 @@ int kvm_unmap_radix(struct kvm *kvm, struct 
kvm_memory_slot *memslot,
unsigned long gpa = gfn << PAGE_SHIFT;
unsigned int shift;
 
+   /*
+* We are walking the secondary (partition-scoped) page table here.
+* We can do this without disabling irq because the Linux MM
+* subsystem doesn't do THP splits and collapses on this tree.
+*/
ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
if (ptep && pte_present(*ptep))
kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
 kvm->arch.lpid);
+
return 0;   
 }
 
@@ -989,6 +995,11 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot 
*memslot,
int ref = 0;
unsigned long old, *rmapp;
 
+   /*
+* We are walking the secondary (partition-scoped) page table here.
+* We can do this without disabling irq because the Linux MM
+* subsystem doesn't do THP splits and collapses on this tree.
+*/
ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
i

[PATCH v5 08/11] powerpc/kvm/book3s_hv: Applies counting method to monitor lockless pgtbl walks

2019-10-02 Thread Leonardo Bras
Applies the counting-based method for monitoring all book3s_hv related
functions that do lockless pagetable walks.

Adds comments explaining that some lockless pagetable walks don't need
protection due to the guest pgd not being a target of THP collapse/split, or
due to being called from Realmode + MSR_EE = 0.

kvmppc_do_h_enter: Fixes where local_irq_restore() must be placed (after
the last usage of ptep).

Given that some of these functions can be called in real mode, and others
always are, we use __{begin,end}_lockless_pgtbl_walk so we can decide when
to disable interrupts.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/kvm/book3s_hv_nested.c | 22 ++--
 arch/powerpc/kvm/book3s_hv_rm_mmu.c | 32 -
 2 files changed, 38 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_nested.c 
b/arch/powerpc/kvm/book3s_hv_nested.c
index cdf30c6eaf54..89944c699fd6 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -803,7 +803,11 @@ static void kvmhv_update_nest_rmap_rc(struct kvm *kvm, u64 
n_rmap,
if (!gp)
return;
 
-   /* Find the pte */
+   /* Find the pte:
+* We are walking the nested guest (partition-scoped) page table here.
+* We can do this without disabling irq because the Linux MM
+* subsystem doesn't do THP splits and collapses on this tree.
+*/
ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
/*
 * If the pte is present and the pfn is still the same, update the pte.
@@ -853,7 +857,11 @@ static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 
n_rmap,
if (!gp)
return;
 
-   /* Find and invalidate the pte */
+   /* Find and invalidate the pte:
+* We are walking the nested guest (partition-scoped) page table here.
+* We can do this without disabling irq because the Linux MM
+* subsystem doesn't do THP splits and collapses on this tree.
+*/
ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
/* Don't spuriously invalidate ptes if the pfn has changed */
if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa))
@@ -921,6 +929,11 @@ static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu 
*vcpu,
int shift;
 
spin_lock(&kvm->mmu_lock);
+   /*
+* We are walking the nested guest (partition-scoped) page table here.
+* We can do this without disabling irq because the Linux MM
+* subsystem doesn't do THP splits and collapses on this tree.
+*/
ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
if (!shift)
shift = PAGE_SHIFT;
@@ -1362,6 +1375,11 @@ static long int __kvmhv_nested_page_fault(struct kvm_run 
*run,
/* See if can find translation in our partition scoped tables for L1 */
pte = __pte(0);
spin_lock(&kvm->mmu_lock);
+   /*
+* We are walking the secondary (partition-scoped) page table here.
+* We can do this without disabling irq because the Linux MM
+* subsystem doesn't do THP splits and collapses on this tree.
+*/
pte_p = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
if (!shift)
shift = PAGE_SHIFT;
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 220305454c23..a8be42f5be1e 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -210,7 +210,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
pte_t *ptep;
unsigned int writing;
unsigned long mmu_seq;
-   unsigned long rcbits, irq_flags = 0;
+   unsigned long rcbits, irq_mask = 0;
 
if (kvm_is_radix(kvm))
return H_FUNCTION;
@@ -252,12 +252,8 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long 
flags,
 * If we had a page table table change after lookup, we would
 * retry via mmu_notifier_retry.
 */
-   if (!realmode)
-   local_irq_save(irq_flags);
-   /*
-* If called in real mode we have MSR_EE = 0. Otherwise
-* we disable irq above.
-*/
+   irq_mask = __begin_lockless_pgtbl_walk(kvm->mm, !realmode);
+
ptep = __find_linux_pte(pgdir, hva, NULL, &hpage_shift);
if (ptep) {
pte_t pte;
@@ -272,8 +268,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 * to <= host page size, if host is using hugepage
 */
if (host_pte_size < psize) {
-   if (!realmode)
-   local_irq_restore(flags);
+   __end_lockless_pgtbl_walk(kvm->mm, irq_mask, !realmode);
return H_PARAMETER;
}
pte = kvmppc_read_update_linux_pte(ptep, writing);
@@ -287,8 +282,6 

[PATCH v5 07/11] powerpc/kvm/e500: Applies counting method to monitor lockless pgtbl walks

2019-10-02 Thread Leonardo Bras
Applies the counting-based method for monitoring lockless pgtable walks on
kvmppc_e500_shadow_map().

Fixes the place where local_irq_restore() is called: previously, if ptep
was NULL, local_irq_restore() would never be called.

local_irq_{save,restore} is already inside {begin,end}_lockless_pgtbl_walk,
so there is no need to repeat it here.

The variable that saves the irq mask was renamed from flags to irq_mask so it
doesn't lose meaning now that it's not directly passed to the local_irq_*
functions.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/kvm/e500_mmu_host.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 321db0fdb9db..36f07c6a5f10 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -336,7 +336,7 @@ static inline int kvmppc_e500_shadow_map(struct 
kvmppc_vcpu_e500 *vcpu_e500,
pte_t *ptep;
unsigned int wimg = 0;
pgd_t *pgdir;
-   unsigned long flags;
+   unsigned long irq_mask;
 
/* used to check for invalidations in progress */
mmu_seq = kvm->mmu_notifier_seq;
@@ -473,7 +473,7 @@ static inline int kvmppc_e500_shadow_map(struct 
kvmppc_vcpu_e500 *vcpu_e500,
 * We are holding kvm->mmu_lock so a notifier invalidate
 * can't run hence pfn won't change.
 */
-   local_irq_save(flags);
+   irq_mask = begin_lockless_pgtbl_walk(kvm->mm);
ptep = find_linux_pte(pgdir, hva, NULL, NULL);
if (ptep) {
pte_t pte = READ_ONCE(*ptep);
@@ -481,15 +481,16 @@ static inline int kvmppc_e500_shadow_map(struct 
kvmppc_vcpu_e500 *vcpu_e500,
if (pte_present(pte)) {
wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) &
MAS2_WIMGE_MASK;
-   local_irq_restore(flags);
} else {
-   local_irq_restore(flags);
+   end_lockless_pgtbl_walk(kvm->mm, irq_mask);
pr_err_ratelimited("%s: pte not present: gfn %lx,pfn 
%lx\n",
   __func__, (long)gfn, pfn);
ret = -EINVAL;
goto out;
}
}
+   end_lockless_pgtbl_walk(kvm->mm, irq_mask);
+
kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg);
 
kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
-- 
2.20.1



[PATCH v5 06/11] powerpc/mm/book3s64/hash: Applies counting method to monitor lockless pgtbl walks

2019-10-02 Thread Leonardo Bras
Applies the counting-based method for monitoring all hash-related functions
that do lockless pagetable walks.

hash_page_mm: Adds a comment explaining that there is no need for
local_irq_disable/save, given that it is only called from the DataAccess
interrupt, so interrupts are already disabled.

local_irq_{save,restore} is already inside {begin,end}_lockless_pgtbl_walk,
so there is no need to repeat it here.

The variable that saves the irq mask was renamed from flags to irq_mask so it
doesn't lose meaning now that it's not directly passed to the local_irq_*
functions.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/mm/book3s64/hash_tlb.c   |  6 +++---
 arch/powerpc/mm/book3s64/hash_utils.c | 27 +--
 2 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c 
b/arch/powerpc/mm/book3s64/hash_tlb.c
index 4a70d8dd39cd..b0ef67d8c88a 100644
--- a/arch/powerpc/mm/book3s64/hash_tlb.c
+++ b/arch/powerpc/mm/book3s64/hash_tlb.c
@@ -194,7 +194,7 @@ void __flush_hash_table_range(struct mm_struct *mm, 
unsigned long start,
 {
bool is_thp;
int hugepage_shift;
-   unsigned long flags;
+   unsigned long irq_mask;
 
start = _ALIGN_DOWN(start, PAGE_SIZE);
end = _ALIGN_UP(end, PAGE_SIZE);
@@ -209,7 +209,7 @@ void __flush_hash_table_range(struct mm_struct *mm, 
unsigned long start,
 * to being hashed). This is not the most performance oriented
 * way to do things but is fine for our needs here.
 */
-   local_irq_save(flags);
+   irq_mask = begin_lockless_pgtbl_walk(mm);
arch_enter_lazy_mmu_mode();
for (; start < end; start += PAGE_SIZE) {
pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp,
@@ -229,7 +229,7 @@ void __flush_hash_table_range(struct mm_struct *mm, 
unsigned long start,
hpte_need_flush(mm, start, ptep, pte, hugepage_shift);
}
arch_leave_lazy_mmu_mode();
-   local_irq_restore(flags);
+   end_lockless_pgtbl_walk(mm, irq_mask);
 }
 
 void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 6c123760164e..7a01a12a19bb 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1313,12 +1313,16 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
 #endif /* CONFIG_PPC_64K_PAGES */
 
-   /* Get PTE and page size from page tables */
+   /* Get PTE and page size from page tables :
+* Called in from DataAccess interrupt (data_access_common: 0x300),
+* interrupts are disabled here.
+*/
+   __begin_lockless_pgtbl_walk(mm, false);
ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift);
if (ptep == NULL || !pte_present(*ptep)) {
DBG_LOW(" no PTE !\n");
rc = 1;
-   goto bail;
+   goto bail_pgtbl_walk;
}
 
/* Add _PAGE_PRESENT to the required access perm */
@@ -1331,7 +1335,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
if (!check_pte_access(access, pte_val(*ptep))) {
DBG_LOW(" no access !\n");
rc = 1;
-   goto bail;
+   goto bail_pgtbl_walk;
}
 
if (hugeshift) {
@@ -1355,7 +1359,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
if (current->mm == mm)
check_paca_psize(ea, mm, psize, user_region);
 
-   goto bail;
+   goto bail_pgtbl_walk;
}
 
 #ifndef CONFIG_PPC_64K_PAGES
@@ -1429,6 +1433,8 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
 #endif
DBG_LOW(" -> rc=%d\n", rc);
 
+bail_pgtbl_walk:
+   __end_lockless_pgtbl_walk(mm, 0, false);
 bail:
exception_exit(prev_state);
return rc;
@@ -1517,7 +1523,7 @@ static void hash_preload(struct mm_struct *mm, unsigned 
long ea,
unsigned long vsid;
pgd_t *pgdir;
pte_t *ptep;
-   unsigned long flags;
+   unsigned long irq_mask;
int rc, ssize, update_flags = 0;
unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? 
_PAGE_EXEC : 0);
 
@@ -1539,11 +1545,12 @@ static void hash_preload(struct mm_struct *mm, unsigned 
long ea,
vsid = get_user_vsid(&mm->context, ea, ssize);
if (!vsid)
return;
+
/*
 * Hash doesn't like irqs. Walking linux page table with irq disabled
 * saves us from holding multiple locks.
 */
-   local_irq_save(flags);
+   irq_mask = begin_lockless_pgtbl_walk(mm);
 
/*
 * THP pages use update_mmu_cache_pmd. We don't do
@@ -1588,7 +1595,7 @@ static void hash_preload(struct mm_struct *mm, unsigned 
long ea,
   mm_ctx_user_psize(&mm->context)

[PATCH v5 04/11] powerpc/mce_power: Applies counting method to monitor lockless pgtbl walks

2019-10-02 Thread Leonardo Bras
Applies the counting-based method for monitoring lockless pgtable walks on
addr_to_pfn().

local_irq_{save,restore} is already inside {begin,end}_lockless_pgtbl_walk,
so there is no need to repeat it here.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/kernel/mce_power.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 1cbf7f1a4e3d..7f888fb6f65c 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -29,7 +29,7 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long 
addr)
 {
pte_t *ptep;
unsigned int shift;
-   unsigned long pfn, flags;
+   unsigned long pfn, irq_mask;
struct mm_struct *mm;
 
if (user_mode(regs))
@@ -37,7 +37,7 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long 
addr)
else
mm = &init_mm;
 
-   local_irq_save(flags);
+   irq_mask = begin_lockless_pgtbl_walk(mm);
ptep = __find_linux_pte(mm->pgd, addr, NULL, &shift);
 
if (!ptep || pte_special(*ptep)) {
@@ -53,7 +53,7 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long 
addr)
}
 
 out:
-   local_irq_restore(flags);
+   end_lockless_pgtbl_walk(mm, irq_mask);
return pfn;
 }
 
-- 
2.20.1



[PATCH v5 01/11] asm-generic/pgtable: Adds generic functions to monitor lockless pgtable walks

2019-10-02 Thread Leonardo Bras
It's necessary to monitor lockless pagetable walks, in order to avoid doing
THP splitting/collapsing during them.

Some methods rely on irq enable/disable, but that can be slow in cases where a
lot of CPUs are used by the process, given that all these CPUs have to run an
IPI.

In order to speed up some cases, I propose a refcount-based approach that
counts the number of lockless pagetable walks happening in the process. If
this count is zero, the irq-oriented method is skipped.

Given that there are lockless pagetable walks in generic code, it's necessary
to create documented generic functions that may be enough for most archs but
are left open to arch-specific implementations.

This method does not exclude the current irq-oriented method. It works as a
complement to skip unnecessary waiting.

begin_lockless_pgtbl_walk(mm)
Insert before starting any lockless pgtable walk
end_lockless_pgtbl_walk(mm)
Insert after the end of any lockless pgtable walk
(Mostly after the ptep is last used)
running_lockless_pgtbl_walk(mm)
Returns the number of lockless pgtable walks running
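
A typical caller then follows a pattern along the lines of the addr_to_pfn()
conversion in patch 4 of this series (the pagetable-walk helper itself is
powerpc-specific and shown here only for illustration):

	unsigned long irq_mask;
	pte_t *ptep;

	irq_mask = begin_lockless_pgtbl_walk(mm);
	ptep = __find_linux_pte(mm->pgd, addr, NULL, &shift);
	if (ptep && pte_present(*ptep))
		pfn = pte_pfn(*ptep);	/* use the pte while THP split/collapse is held off */
	end_lockless_pgtbl_walk(mm, irq_mask);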

While the config option is not set, the method is disabled and these functions
only do what was already needed for lockless pagetable walks (disabling
interrupts). A memory barrier was also added just to make sure there is no
speculative read outside the interrupt-disabled area.

Signed-off-by: Leonardo Bras 
---
 include/asm-generic/pgtable.h | 58 +++
 include/linux/mm_types.h  | 11 +++
 kernel/fork.c |  3 ++
 3 files changed, 72 insertions(+)

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 818691846c90..3043ea9812d5 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1171,6 +1171,64 @@ static inline bool arch_has_pfn_modify_check(void)
 #endif
 #endif
 
+#ifndef __HAVE_ARCH_LOCKLESS_PGTBL_WALK_CONTROL
+static inline unsigned long begin_lockless_pgtbl_walk(struct mm_struct *mm)
+{
+   unsigned long irq_mask;
+
+   if (IS_ENABLED(CONFIG_LOCKLESS_PAGE_TABLE_WALK_TRACKING))
+   atomic_inc(&mm->lockless_pgtbl_walkers);
+
+   /*
+* Interrupts must be disabled during the lockless page table walk.
+* That's because the deleting or splitting involves flushing TLBs,
+* which in turn issues interrupts, that will block when disabled.
+*/
+   local_irq_save(irq_mask);
+
+   /*
+* This memory barrier pairs with any code that is either trying to
+* delete page tables, or split huge pages. Without this barrier,
+* the page tables could be read speculatively outside of interrupt
+* disabling.
+*/
+   smp_mb();
+
+   return irq_mask;
+}
+
+static inline void end_lockless_pgtbl_walk(struct mm_struct *mm,
+  unsigned long irq_mask)
+{
+   /*
+* This memory barrier pairs with any code that is either trying to
+* delete page tables, or split huge pages. Without this barrier,
+* the page tables could be read speculatively outside of interrupt
+* disabling.
+*/
+   smp_mb();
+
+   /*
+* Interrupts must be disabled during the lockless page table walk.
+* That's because the deleting or splitting involves flushing TLBs,
+* which in turn issues interrupts, that will block when disabled.
+*/
+   local_irq_restore(irq_mask);
+
+   if (IS_ENABLED(CONFIG_LOCKLESS_PAGE_TABLE_WALK_TRACKING))
+   atomic_dec(&mm->lockless_pgtbl_walkers);
+}
+
+static inline int running_lockless_pgtbl_walk(struct mm_struct *mm)
+{
+   if (IS_ENABLED(CONFIG_LOCKLESS_PAGE_TABLE_WALK_TRACKING))
+   return atomic_read(&mm->lockless_pgtbl_walkers);
+
+   /* If disabled, must return > 0, so it falls back to sync method */
+   return 1;
+}
+#endif
+
 /*
  * On some architectures it depends on the mm if the p4d/pud or pmd
  * layer of the page table hierarchy is folded or not.
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index fa795284..277462f0b4fd 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -521,6 +521,17 @@ struct mm_struct {
struct work_struct async_put_work;
} __randomize_layout;
 
+#ifdef CONFIG_LOCKLESS_PAGE_TABLE_WALK_TRACKING
+   /*
+* Number of callers who are doing a lockless walk of the
+* page tables. Typically arches might enable this in order to
+* help optimize performance, by possibly avoiding expensive
+* IPIs at the wrong times.
+*/
+   atomic_t lockless_pgtbl_walkers;
+
+#endif
+
/*
 * The mm_cpumask needs to be at the end of mm_struct, because it
 * is dynamically sized based on nr_cpu_ids.
diff --git a/kernel/fork.c b/kernel/fork.c
index f9572f416126..2cbca867f5a5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -102

[PATCH v5 00/11] Introduces new count-based method for tracking lockless pagetable walks

2019-10-02 Thread Leonardo Bras
If a process (qemu) with a lot of CPUs (128) tries to munmap() a large
chunk of memory (496GB) mapped with THP, it takes an average of 275
seconds, which can cause a lot of problems for the workload (in the qemu
case, the guest will lock up for this time).

Trying to find the source of this bug, I found out that most of this time is
spent on serialize_against_pte_lookup(). This function will take a lot
of time in smp_call_function_many() if there are more than a couple of CPUs
running the user process. Since it has to happen for every mapped THP, it
will take a very long time for large amounts of memory.

According to the docs, serialize_against_pte_lookup() is needed in order to
prevent the pmd_t to pte_t casting inside find_current_mm_pte(), or any
lockless pagetable walk, from happening concurrently with THP
splitting/collapsing.

It does so by calling do_nothing() on each CPU in mm->cpu_bitmap[],
after interrupts are re-enabled.
Since interrupts are (usually) disabled during a lockless pagetable
walk, and serialize_against_pte_lookup() will only return after
interrupts are enabled, the walk is protected.

From what I could understand, if there is no lockless pagetable walk
running, there is no need to call serialize_against_pte_lookup().

So, to avoid the cost of running serialize_against_pte_lookup(), I
propose a counter that keeps track of how many find_current_mm_pte()
walks are currently running, and if there are none, just skip
smp_call_function_many() (see the sketch after the function list below).

The related functions are:
begin_lockless_pgtbl_walk()
Insert before starting any lockless pgtable walk
end_lockless_pgtbl_walk()
Insert after the end of any lockless pgtable walk
(Mostly after the ptep is last used)
running_lockless_pgtbl_walk()
Returns the number of lockless pgtable walks running
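
The resulting fast path is the change made in patch 11 of this series,
reproduced here for context:

void serialize_against_pte_lookup(struct mm_struct *mm)
{
	smp_mb();
	if (running_lockless_pgtbl_walk(mm))
		smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
}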

On my workload (qemu), I could see munmap's time drop from 275
seconds to 418 ms.

Also, I documented some lockless pagetable walks in which it's not
necessary to keep track, given they work on init_mm or guest pgd.

The patchset works by focusing all steps needed to begin/end lockless
pagetable walks on the above functions, and then adding the config option
to enable the tracking of these functions using the counting method.

Changes since v4:
 Rebased on top of v5.4-rc1
 Declared real generic functions instead of dummies
 start_lockless_pgtbl_walk renamed to begin_lockless_pgtbl_walk
 Interrupt {dis,en}able is now inside of {begin,end}_lockless_pgtbl_walk
 Power implementation has option to not {dis,en}able interrupt
 More documentation inside the functions.
 Some irq mask variables renamed
 Removed some proxy mm_structs
 Few typos fixed

Changes since v3:
 Explain (comments) why some lockless pgtbl walks don't need
local_irq_disable (real mode + MSR_EE=0)
 Explain (comments) places where counting method is not needed (guest pgd,
which is not touched by THP)
 Fixes some misplaced local_irq_restore()
 Link: http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=132417

Changes since v2:
 Rebased to v5.3
 Adds support on __get_user_pages_fast
 Adds usage decription to *_lockless_pgtbl_walk()
 Better style to dummy functions
 Link: http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=131839

Changes since v1:
 Isolated atomic operations in functions *_lockless_pgtbl_walk()
 Fixed behavior of decrementing before last ptep was used
 Link: http://patchwork.ozlabs.org/patch/1163093/


Leonardo Bras (11):
  asm-generic/pgtable: Adds generic functions to monitor lockless
pgtable walks
  powerpc/mm: Adds counting method to monitor lockless pgtable walks
  mm/gup: Applies counting method to monitor gup_pgd_range
  powerpc/mce_power: Applies counting method to monitor lockless pgtbl
walks
  powerpc/perf: Applies counting method to monitor lockless pgtbl walks
  powerpc/mm/book3s64/hash: Applies counting method to monitor lockless
pgtbl walks
  powerpc/kvm/e500: Applies counting method to monitor lockless pgtbl
walks
  powerpc/kvm/book3s_hv: Applies counting method to monitor lockless
pgtbl walks
  powerpc/kvm/book3s_64: Applies counting method to monitor lockless
pgtbl walks
  mm/Kconfig: Adds config option to track lockless pagetable walks
  powerpc/mm/book3s64/pgtable: Uses counting method to skip serializing

 arch/powerpc/include/asm/book3s/64/pgtable.h |   9 ++
 arch/powerpc/kernel/mce_power.c  |   6 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c  |   6 +-
 arch/powerpc/kvm/book3s_64_mmu_radix.c   |  34 +-
 arch/powerpc/kvm/book3s_64_vio_hv.c  |   7 +-
 arch/powerpc/kvm/book3s_hv_nested.c  |  22 +++-
 arch/powerpc/kvm/book3s_hv_rm_mmu.c  |  32 ++---
 arch/powerpc/kvm/e500_mmu_host.c |   9 +-
 arch/powerpc/mm/book3s64/hash_tlb.c  |   6 +-
 arch/powerpc/mm/book3s64/hash_utils.c|  27 +++--
 arch/powerpc/mm/book3s64/pgtable.c   | 120 ++-
 arch/powerpc/perf/callchain.c|   6 +-
 include/asm-generic/pgtable.h 

[PATCH v5 03/11] mm/gup: Applies counting method to monitor gup_pgd_range

2019-10-02 Thread Leonardo Bras
As described, gup_pgd_range() is a lockless pagetable walk. So, in order to
protect it against THP split/collapse with the counting method, it's necessary
to bracket it with {begin,end}_lockless_pgtbl_walk.

local_irq_{save,restore} is already inside {begin,end}_lockless_pgtbl_walk,
so there is no need to repeat it here.

Signed-off-by: Leonardo Bras 
---
 mm/gup.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index 23a9f9c9d377..52e53b4f39d8 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2319,7 +2319,7 @@ int __get_user_pages_fast(unsigned long start, int 
nr_pages, int write,
  struct page **pages)
 {
unsigned long len, end;
-   unsigned long flags;
+   unsigned long irq_mask;
int nr = 0;
 
start = untagged_addr(start) & PAGE_MASK;
@@ -2345,9 +2345,9 @@ int __get_user_pages_fast(unsigned long start, int 
nr_pages, int write,
 
if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
gup_fast_permitted(start, end)) {
-   local_irq_save(flags);
+   irq_mask = begin_lockless_pgtbl_walk(current->mm);
gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr);
-   local_irq_restore(flags);
+   end_lockless_pgtbl_walk(current->mm, irq_mask);
}
 
return nr;
@@ -2414,9 +2414,9 @@ int get_user_pages_fast(unsigned long start, int nr_pages,
 
if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
gup_fast_permitted(start, end)) {
-   local_irq_disable();
+   begin_lockless_pgtbl_walk(current->mm);
gup_pgd_range(addr, end, gup_flags, pages, &nr);
-   local_irq_enable();
+   end_lockless_pgtbl_walk(current->mm, IRQS_ENABLED);
ret = nr;
}
 
-- 
2.20.1



[PATCH v5 02/11] powerpc/mm: Adds counting method to monitor lockless pgtable walks

2019-10-02 Thread Leonardo Bras
It's necessary to monitor lockless pagetable walks, in order to avoid doing
THP splitting/collapsing during them.

On powerpc, we need to do some lockless pagetable walks from functions
that have already disabled interrupts, especially from real mode with
MSR[EE]=0.

In these contexts, disabling/enabling interrupts can be very troublesome.

So, this arch-specific implementation features functions with an extra
argument that allows interrupt enable/disable to be skipped:
__begin_lockless_pgtbl_walk() and __end_lockless_pgtbl_walk().

Functions matching the generic interface are also exported; they simply call
the above functions with the *able_irq parameter set to true.
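
For example, a caller that may run in real mode (the kvmppc_do_h_enter()
conversion in patch 8 of this series) passes !realmode, so interrupts are only
toggled when they are not already off:

	irq_mask = __begin_lockless_pgtbl_walk(kvm->mm, !realmode);
	ptep = __find_linux_pte(pgdir, hva, NULL, &hpage_shift);
	/* ... read and use the pte ... */
	__end_lockless_pgtbl_walk(kvm->mm, irq_mask, !realmode);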

While the config option is not set, the method is disabled and these functions
only do what was already needed for lockless pagetable walks (disabling
interrupts). A memory barrier was also added just to make sure there is no
speculative read outside the interrupt-disabled area.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h |   9 ++
 arch/powerpc/mm/book3s64/pgtable.c   | 117 +++
 2 files changed, 126 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index b01624e5c467..8330b35cd28d 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1372,5 +1372,14 @@ static inline bool pgd_is_leaf(pgd_t pgd)
return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PTE));
 }
 
+#define __HAVE_ARCH_LOCKLESS_PGTBL_WALK_CONTROL
+unsigned long begin_lockless_pgtbl_walk(struct mm_struct *mm);
+unsigned long __begin_lockless_pgtbl_walk(struct mm_struct *mm,
+ bool disable_irq);
+void end_lockless_pgtbl_walk(struct mm_struct *mm, unsigned long irq_mask);
+void __end_lockless_pgtbl_walk(struct mm_struct *mm, unsigned long irq_mask,
+  bool enable_irq);
+int running_lockless_pgtbl_walk(struct mm_struct *mm);
+
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */
diff --git a/arch/powerpc/mm/book3s64/pgtable.c 
b/arch/powerpc/mm/book3s64/pgtable.c
index 75483b40fcb1..ae557fdce9a3 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -98,6 +98,123 @@ void serialize_against_pte_lookup(struct mm_struct *mm)
smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
 }
 
+/*
+ * Counting method to monitor lockless pagetable walks:
+ * Uses begin_lockless_pgtbl_walk and end_lockless_pgtbl_walk to track the
+ * number of lockless pgtable walks happening, and
+ * running_lockless_pgtbl_walk to return this value.
+ */
+
+/* begin_lockless_pgtbl_walk: Must be inserted before a function call that does
+ *   lockless pagetable walks, such as __find_linux_pte().
+ * This version allows setting disable_irq=false, so irqs are not touched, 
which
+ *   is quite useful for running when ints are already disabled (like 
real-mode)
+ */
+
+inline unsigned long __begin_lockless_pgtbl_walk(struct mm_struct *mm,
+bool disable_irq)
+{
+   unsigned long irq_mask = 0;
+
+   if (IS_ENABLED(CONFIG_LOCKLESS_PAGE_TABLE_WALK_TRACKING))
+   atomic_inc(&mm->lockless_pgtbl_walkers);
+
+   /*
+* Interrupts must be disabled during the lockless page table walk.
+* That's because the deleting or splitting involves flushing TLBs,
+* which in turn issues interrupts, that will block when disabled.
+*
+* When this function is called from realmode with MSR[EE=0],
+* it's not needed to touch irq, since it's already disabled.
+*/
+   if (disable_irq)
+   local_irq_save(irq_mask);
+
+   /*
+* This memory barrier pairs with any code that is either trying to
+* delete page tables, or split huge pages. Without this barrier,
+* the page tables could be read speculatively outside of interrupt
+* disabling or reference counting.
+*/
+   smp_mb();
+
+   return irq_mask;
+}
+EXPORT_SYMBOL(__begin_lockless_pgtbl_walk);
+
+/* begin_lockless_pgtbl_walk: Must be inserted before a function call that does
+ *   lockless pagetable walks, such as __find_linux_pte().
+ * This version is used by generic code, and always assume irqs being disabled
+ */
+unsigned long begin_lockless_pgtbl_walk(struct mm_struct *mm)
+{
+   return __begin_lockless_pgtbl_walk(mm, true);
+}
+EXPORT_SYMBOL(begin_lockless_pgtbl_walk);
+
+/*
+ * __end_lockless_pgtbl_walk: Must be inserted after the last use of a pointer
+ *   returned by a lockless pagetable walk, such as __find_linux_pte()
+ * This version allows setting enable_irq=false, so irqs are not touched, which
+ *   is quite useful for running when ints are already disabled (like 
real-mode)
+ */
+inline void __end_lockless_pgtbl_walk(struct mm_struct *mm,
+ unsi

Re: [PATCH v6 6/9] ima: make process_buffer_measurement() non static

2019-10-02 Thread Mimi Zohar
[Cc'ing Prakhar]

On Fri, 2019-09-27 at 10:25 -0400, Nayna Jain wrote:
> To add the support for checking against blacklist, it would be needed
> to add an additional measurement record that identifies the record
> as blacklisted.
> 
> This patch modifies the process_buffer_measurement() and makes it
> non static to be used by blacklist functionality. It modifies the
> function to handle more than just the KEXEC_CMDLINE.
> 
> Signed-off-by: Nayna Jain 

Making process_buffer_measurement() non static is the end result, not
the reason for the change.  The reason for changing
process_buffer_measurement() is to make it more generic.  The
blacklist measurement record is the use case.

Please rewrite the patch description.

thanks,

Mimi



Re: [PATCH v6 3/9] powerpc: add support to initialize ima policy rules

2019-10-02 Thread Mimi Zohar
On Tue, 2019-10-01 at 12:07 -0400, Nayna wrote:
> 
> On 09/30/2019 09:04 PM, Thiago Jung Bauermann wrote:
> > Hello,
> 
> Hi,
> 
> >
> >> diff --git a/arch/powerpc/kernel/ima_arch.c 
> >> b/arch/powerpc/kernel/ima_arch.c
> >> new file mode 100644
> >> index ..39401b67f19e
> >> --- /dev/null
> >> +++ b/arch/powerpc/kernel/ima_arch.c
> >> @@ -0,0 +1,33 @@
> >> +// SPDX-License-Identifier: GPL-2.0
> >> +/*
> >> + * Copyright (C) 2019 IBM Corporation
> >> + * Author: Nayna Jain
> >> + */
> >> +
> >> +#include 
> >> +#include 
> >> +
> >> +bool arch_ima_get_secureboot(void)
> >> +{
> >> +  return is_powerpc_os_secureboot_enabled();
> >> +}
> >> +
> >> +/* Defines IMA appraise rules for secureboot */
> >> +static const char *const arch_rules[] = {
> >> +  "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig",
> >> +#if !IS_ENABLED(CONFIG_MODULE_SIG)
> >> +  "appraise func=MODULE_CHECK appraise_type=imasig|modsig",
> >> +#endif
> >> +  NULL
> >> +};
> >> +
> >> +/*
> >> + * Returns the relevant IMA arch policies based on the system secureboot 
> >> state.
> >> + */
> >> +const char *const *arch_get_ima_policy(void)
> >> +{
> >> +  if (is_powerpc_os_secureboot_enabled())
> >> +  return arch_rules;
> >> +
> >> +  return NULL;
> >> +}
> > If CONFIG_MODULE_SIG is enabled but module signatures aren't enforced,
> > then IMA won't enforce module signature either. x86's
> > arch_get_ima_policy() calls set_module_sig_enforced(). Doesn't the
> > powerpc version need to do that as well?
> >
> > On the flip side, if module signatures are enforced by the module
> > subsystem then IMA will verify the signature a second time since there's
> > no sharing of signature verification results between the module
> > subsystem and IMA (this was observed by Mimi).
> >
> > IMHO this is a minor issue, since module loading isn't a hot path and
> > the duplicate work shouldn't impact anything. But it could be avoided by
> > having a NULL entry in arch_rules, which arch_get_ima_policy() would
> > dynamically update with the "appraise func=MODULE_CHECK" rule if
> > is_module_sig_enforced() is true.
> 
> Thanks Thiago for reviewing.  I am wondering whether this will give two 
> meanings to NULL. Can we do something like the below? There are possibly 
> two options:
> 
> 1. Set IMA_APPRAISED in the iint->flags if is_module_sig_enforced().
> 
> OR
> 
> 2. Let ima_get_action() check for is_module_sig_enforced() when policy 
> is appraise and func is MODULE_CHECK.

I'm a bit hesitant about mixing the module subsystem signature
verification method with the IMA measure "template=ima-modsig" rules.
 Does it actually work?

We can at least limit verifying the same appended signature twice to
when "module.sig_enforce" is specified on the boot command line, by
changing "!IS_ENABLED(CONFIG_MODULE_SIG)" to test
"CONFIG_MODULE_SIG_FORCE".

Mimi



[PATCH] macintosh/ams-input: switch to using input device polling mode

2019-10-02 Thread Dmitry Torokhov
Now that instances of input_dev support polling mode natively,
we no longer need to create a separate input_polled_dev instance.

Signed-off-by: Dmitry Torokhov 
---
 drivers/macintosh/Kconfig |  1 -
 drivers/macintosh/ams/ams-input.c | 37 +++
 drivers/macintosh/ams/ams.h   |  4 ++--
 3 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig
index 574e122ae105..da6a943ad746 100644
--- a/drivers/macintosh/Kconfig
+++ b/drivers/macintosh/Kconfig
@@ -247,7 +247,6 @@ config PMAC_RACKMETER
 config SENSORS_AMS
tristate "Apple Motion Sensor driver"
	depends on PPC_PMAC && !PPC64 && INPUT && ((ADB_PMU && I2C = y) || (ADB_PMU && !I2C) || I2C)
-   select INPUT_POLLDEV
help
  Support for the motion sensor included in PowerBooks. Includes
  implementations for PMU and I2C.
diff --git a/drivers/macintosh/ams/ams-input.c 
b/drivers/macintosh/ams/ams-input.c
index 06a96b3f11de..0da493d449b2 100644
--- a/drivers/macintosh/ams/ams-input.c
+++ b/drivers/macintosh/ams/ams-input.c
@@ -25,9 +25,8 @@ MODULE_PARM_DESC(invert, "Invert input data on X and Y axis");
 
 static DEFINE_MUTEX(ams_input_mutex);
 
-static void ams_idev_poll(struct input_polled_dev *dev)
+static void ams_idev_poll(struct input_dev *idev)
 {
-   struct input_dev *idev = dev->input;
s8 x, y, z;
 
mutex_lock(&ams_info.lock);
@@ -59,14 +58,10 @@ static int ams_input_enable(void)
ams_info.ycalib = y;
ams_info.zcalib = z;
 
-   ams_info.idev = input_allocate_polled_device();
-   if (!ams_info.idev)
+   input = input_allocate_device();
+   if (!input)
return -ENOMEM;
 
-   ams_info.idev->poll = ams_idev_poll;
-   ams_info.idev->poll_interval = 25;
-
-   input = ams_info.idev->input;
input->name = "Apple Motion Sensor";
input->id.bustype = ams_info.bustype;
input->id.vendor = 0;
@@ -75,28 +70,32 @@ static int ams_input_enable(void)
input_set_abs_params(input, ABS_X, -50, 50, 3, 0);
input_set_abs_params(input, ABS_Y, -50, 50, 3, 0);
input_set_abs_params(input, ABS_Z, -50, 50, 3, 0);
+   input_set_capability(input, EV_KEY, BTN_TOUCH);
 
-   set_bit(EV_ABS, input->evbit);
-   set_bit(EV_KEY, input->evbit);
-   set_bit(BTN_TOUCH, input->keybit);
+   error = input_setup_polling(input, ams_idev_poll);
+   if (error)
+   goto err_free_input;
 
-   error = input_register_polled_device(ams_info.idev);
-   if (error) {
-   input_free_polled_device(ams_info.idev);
-   ams_info.idev = NULL;
-   return error;
-   }
+   input_set_poll_interval(input, 25);
 
+   error = input_register_device(input);
+   if (error)
+   goto err_free_input;
+
+   ams_info.idev = input;
joystick = true;
 
return 0;
+
+err_free_input:
+   input_free_device(input);
+   return error;
 }
 
 static void ams_input_disable(void)
 {
if (ams_info.idev) {
-   input_unregister_polled_device(ams_info.idev);
-   input_free_polled_device(ams_info.idev);
+   input_unregister_device(ams_info.idev);
ams_info.idev = NULL;
}
 
diff --git a/drivers/macintosh/ams/ams.h b/drivers/macintosh/ams/ams.h
index fe8d596f9845..935bdd9cd9a6 100644
--- a/drivers/macintosh/ams/ams.h
+++ b/drivers/macintosh/ams/ams.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include 
-#include 
+#include 
 #include 
 #include 
 #include 
@@ -51,7 +51,7 @@ struct ams {
 #endif
 
/* Joystick emulation */
-   struct input_polled_dev *idev;
+   struct input_dev *idev;
__u16 bustype;
 
/* calibrated null values */
-- 
2.23.0.444.g18eeb5a265-goog


-- 
Dmitry


Re: [PATCH v6 8/9] ima: deprecate permit_directio, instead use appraise_flag

2019-10-02 Thread Mimi Zohar
Hi Nayna,

On Fri, 2019-09-27 at 10:25 -0400, Nayna Jain wrote:
> This patch deprecates the existing permit_directio flag and instead adds
> it as a possible value of the appraise_flag parameter.
> For example:
> appraise_flag=permit_directio

Defining a generic "appraise_flag=", which supports different options,
is the right direction.  I would really like to deprecate the
"permit_directio" flag, not just change the policy syntax.  For now,
let's drop this change.

Mimi



Re: [PATCH v6 7/9] ima: check against blacklisted hashes for files with modsig

2019-10-02 Thread Mimi Zohar
On Fri, 2019-09-27 at 10:25 -0400, Nayna Jain wrote:
> Asymmetric private keys are used to sign multiple files. The kernel
> currently supports checking against blacklisted keys. However, if the
> public key is blacklisted, any file signed by the blacklisted key will
> automatically fail signature verification. We might not want to blacklist
> all the files signed by a particular key, but just a single file. 
> Blacklisting the public key is not fine enough granularity.
> 
> This patch adds support for blacklisting binaries with appended signatures,
> based on the IMA policy.  Defined is a new policy option
> "appraise_flag=check_blacklist".
> 
> Signed-off-by: Nayna Jain 
> ---
>  Documentation/ABI/testing/ima_policy  |  1 +
>  security/integrity/ima/ima.h  | 12 +
>  security/integrity/ima/ima_appraise.c | 35 +++
>  security/integrity/ima/ima_main.c |  8 --
>  security/integrity/ima/ima_policy.c   | 10 ++--
>  security/integrity/integrity.h|  1 +
>  6 files changed, 63 insertions(+), 4 deletions(-)
> 
> diff --git a/Documentation/ABI/testing/ima_policy 
> b/Documentation/ABI/testing/ima_policy
> index 29ebe9afdac4..4c97afcc0f3c 100644
> --- a/Documentation/ABI/testing/ima_policy
> +++ b/Documentation/ABI/testing/ima_policy
> @@ -25,6 +25,7 @@ Description:
>   lsm:[[subj_user=] [subj_role=] [subj_type=]
>[obj_user=] [obj_role=] [obj_type=]]
>   option: [[appraise_type=]] [template=] [permit_directio]
> + [appraise_flag=[check_blacklist]]
>   base:   func:= 
> [BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK]
>   [FIRMWARE_CHECK]
>   [KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK]
> diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
> index 9bf509217e8e..2c034728b239 100644
> --- a/security/integrity/ima/ima.h
> +++ b/security/integrity/ima/ima.h
> @@ -254,6 +254,9 @@ int ima_policy_show(struct seq_file *m, void *v);
>  #define IMA_APPRAISE_KEXEC   0x40
>  
>  #ifdef CONFIG_IMA_APPRAISE
> +int ima_blacklist_measurement(struct integrity_iint_cache *iint,
> +   const struct modsig *modsig, int action,
> +   int pcr, struct ima_template_desc *template_desc);
>  int ima_appraise_measurement(enum ima_hooks func,
>struct integrity_iint_cache *iint,
>struct file *file, const unsigned char *filename,
> @@ -269,6 +272,15 @@ int ima_read_xattr(struct dentry *dentry,
>  struct evm_ima_xattr_data **xattr_value);
>  
>  #else
> +static inline int ima_blacklist_measurement(struct integrity_iint_cache 
> *iint,
> + const struct modsig *modsig,
> + int action, int pcr,
> + struct ima_template_desc
> + *template_desc)
> +{
> + return 0;
> +}
> +
>  static inline int ima_appraise_measurement(enum ima_hooks func,
>  struct integrity_iint_cache *iint,
>  struct file *file,
> diff --git a/security/integrity/ima/ima_appraise.c 
> b/security/integrity/ima/ima_appraise.c
> index 136ae4e0ee92..a5a82e870e24 100644
> --- a/security/integrity/ima/ima_appraise.c
> +++ b/security/integrity/ima/ima_appraise.c
> @@ -12,6 +12,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include "ima.h"
>  
> @@ -303,6 +304,40 @@ static int modsig_verify(enum ima_hooks func, const 
> struct modsig *modsig,
>   return rc;
>  }
>  
> +/*
> + * ima_blacklist_measurement - checks if the file measurement is blacklisted
> + *
> + * Returns -EKEYREJECTED if the hash is blacklisted.
> + */


In the function comment, please mention that blacklisted binaries are
included in the IMA measurement list.

> +int ima_blacklist_measurement(struct integrity_iint_cache *iint,
> +   const struct modsig *modsig, int action, int pcr,
> +   struct ima_template_desc *template_desc)
> +{
> + enum hash_algo hash_algo;
> + const u8 *digest = NULL;
> + u32 digestsize = 0;
> + u32 secid;
> + int rc = 0;
> +
> + if (!(iint->flags & IMA_CHECK_BLACKLIST))
> + return 0;
> +
> + if (iint->flags & IMA_MODSIG_ALLOWED) {
> + security_task_getsecid(current, &secid);
> + ima_get_modsig_digest(modsig, &hash_algo, &digest, &digestsize);
> +
> + rc = is_hash_blacklisted(digest, digestsize, "bin");
> +
> + /* Returns -EKEYREJECTED on blacklisted hash found */

is_hash_blacklisted() returning -EKEYREJECTED makes sense if the key
is blacklisted, not so much for a binary.  It would make more sense to
define is_binary_blacklis

[PATCH 3.16 76/87] perf/ioctl: Add check for the sample_period value

2019-10-02 Thread Ben Hutchings
3.16.75-rc1 review patch.  If anyone has any objections, please let me know.

--

From: Ravi Bangoria 

commit 913a90bc5a3a06b1f04c337320e9aeee2328dd77 upstream.

perf_event_open() limits the sample_period to 63 bits. See:

  0819b2e30ccb ("perf: Limit perf_event_attr::sample_period to 63 bits")

Make ioctl() consistent with it.

Also on PowerPC, a negative sample_period could cause recursive
PMIs, leading to a hang (reported when running perf-fuzzer).

Signed-off-by: Ravi Bangoria 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Alexander Shishkin 
Cc: Arnaldo Carvalho de Melo 
Cc: Jiri Olsa 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Stephane Eranian 
Cc: Thomas Gleixner 
Cc: Vince Weaver 
Cc: a...@kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: ma...@linux.vnet.ibm.com
Cc: m...@ellerman.id.au
Fixes: 0819b2e30ccb ("perf: Limit perf_event_attr::sample_period to 63 bits")
Link: https://lkml.kernel.org/r/20190604042953.914-1-ravi.bango...@linux.ibm.com
Signed-off-by: Ingo Molnar 
[bwh: Backported to 3.16: adjust context]
Signed-off-by: Ben Hutchings 
---
 kernel/events/core.c | 3 +++
 1 file changed, 3 insertions(+)

--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3823,6 +3823,9 @@ static int perf_event_period(struct perf
if (perf_event_check_period(event, value))
return -EINVAL;
 
+   if (!event->attr.freq && (value & (1ULL << 63)))
+   return -EINVAL;
+
task = ctx->task;
pe.value = value;
 



Re: [PATCH v4 0/5] Early node associativity

2019-10-02 Thread Nathan Lynch
Hi Srikar,

Srikar Dronamraju  writes:
> Abdul reported  a warning on a shared lpar.
> "WARNING: workqueue cpumask: online intersect > possible intersect".
> This is because per node workqueue possible mask is set very early in the
> boot process even before the system was querying the home node
> associativity. However per node workqueue online cpumask gets updated
> dynamically. Hence there is a chance when per node workqueue online cpumask
> is a superset of per node workqueue possible mask.

Sorry for the delay in following up on these. The series looks good to
me, and I've given it a little testing with LPM and DLPAR. I've also
verified that the cpu assignments occur early as intended on an LPAR
where that workqueue warning had been triggered.


Re: [PATCH v4 5/5] powerpc/numa: Remove late request for home node associativity

2019-10-02 Thread Nathan Lynch
Srikar Dronamraju  writes:

> With the commit ("powerpc/numa: Early request for home node associativity"),
> commit 2ea626306810 ("powerpc/topology: Get topology for shared
> processors at boot"), which requested home node associativity,
> becomes redundant.
>
> Hence remove the late request for home node associativity.

Reviewed-by: Nathan Lynch 


Re: [PATCH v4 4/5] powerpc/numa: Early request for home node associativity

2019-10-02 Thread Nathan Lynch
Srikar Dronamraju  writes:

> Currently the kernel detects if it's running on a shared lpar platform
> and requests home node associativity before the scheduler sched_domains
> are set up. However, between the time NUMA setup is initialized and the
> request for home node associativity, the workqueue initializes its per node
> cpumask. The per node workqueue possible cpumask may become invalid
> after home node associativity is obtained, resulting in weird situations like
> the workqueue possible cpumask being a subset of the workqueue online cpumask.
>
> This can be fixed by requesting home node associativity earlier just
> before NUMA setup. However, at NUMA setup time, the kernel may not be in
> a position to detect if it's running on a shared lpar platform. So
> request home node associativity, and if the request fails, fall back
> on the device tree property.

Reviewed-by: Nathan Lynch 


Re: [PATCH v4 3/5] powerpc/numa: Use cpu node map of first sibling thread

2019-10-02 Thread Nathan Lynch
Srikar Dronamraju  writes:

> All the sibling threads of a core have to be part of the same node.
> To ensure that all the sibling threads map to the same node, always
> lookup/update the cpu-to-node map of the first thread in the core.

Reviewed-by: Nathan Lynch 


Re: [PATCH v4 2/5] powerpc/numa: Handle extra hcall_vphn error cases

2019-10-02 Thread Nathan Lynch
Srikar Dronamraju  writes:
> Currently the code handles the H_FUNCTION, H_SUCCESS, and H_HARDWARE return codes.
> However, hcall_vphn can return other return codes. Now it also handles the
> H_PARAMETER return code, and the remaining return codes are handled under the
> default case.

Reviewed-by: Nathan Lynch 

However:

> + case H_PARAMETER:
> + pr_err_ratelimited("hcall_vphn() was passed an invalid 
> parameter. "
> + "Disabling polling...\n");

Consider including the bad parameter value in the message?


Re: [PATCH v4 1/5] powerpc/vphn: Check for error from hcall_vphn

2019-10-02 Thread Nathan Lynch
Srikar Dronamraju  writes:

> There is no value in unpacking associativity, if
> H_HOME_NODE_ASSOCIATIVITY hcall has returned an error.

Reviewed-by: Nathan Lynch 


Re: [PATCH v8 1/5] kasan: support backing vmalloc space with real shadow memory

2019-10-02 Thread Uladzislau Rezki
On Wed, Oct 02, 2019 at 11:23:06AM +1000, Daniel Axtens wrote:
> Hi,
> 
> >>/*
> >> * Find a place in the tree where VA potentially will be
> >> * inserted, unless it is merged with its sibling/siblings.
> >> @@ -741,6 +752,10 @@ merge_or_add_vmap_area(struct vmap_area *va,
> >>if (sibling->va_end == va->va_start) {
> >>sibling->va_end = va->va_end;
> >>  
> >> +  kasan_release_vmalloc(orig_start, orig_end,
> >> +sibling->va_start,
> >> +sibling->va_end);
> >> +
> > The same.
> 
> The call to kasan_release_vmalloc() is a static inline no-op if
> CONFIG_KASAN_VMALLOC is not defined, which I thought was the preferred
> way to do things rather than sprinkling the code with ifdefs?
> 
I agree that is totally correct.

> The complier should be smart enough to eliminate all the
> orig_state/orig_end stuff at compile time because it can see that it's
> not used, so there's no cost in the binary.
> 
It should. I was thinking more about whether those two variables could be
considered unused, resulting in a compile warning like "set but not used".
But that is theory, and if there is any warning the test robot will
notify us about it anyway.

So, I am totally fine with that if the compiler does not complain. If so,
please ignore my comments :)
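
For readers following along, the pattern being discussed is the usual
header stub, roughly (a sketch; argument names assumed):

#ifdef CONFIG_KASAN_VMALLOC
void kasan_release_vmalloc(unsigned long start, unsigned long end,
			   unsigned long free_region_start,
			   unsigned long free_region_end);
#else
/* No-op when KASAN_VMALLOC is off, so callers need no #ifdef of their own. */
static inline void kasan_release_vmalloc(unsigned long start,
					 unsigned long end,
					 unsigned long free_region_start,
					 unsigned long free_region_end) { }
#endif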

--
Vlad Rezki


Re: [PATCH v2 00/21] Refine memblock API

2019-10-02 Thread Adam Ford
On Wed, Oct 2, 2019 at 2:36 AM Mike Rapoport  wrote:
>
> Hi Adam,
>
> On Tue, Oct 01, 2019 at 07:14:13PM -0500, Adam Ford wrote:
> > On Sun, Sep 29, 2019 at 8:33 AM Adam Ford  wrote:
> > >
> > > I am attaching two logs.  I know the mailing lists will be unhappy, but
> > > I don't want to try and spam a bunch of logs through the mailing list.
> > > The two logs show the differences between the working and non-working
> > > imx6q 3D accelerator when trying to run a simple glmark2-es2-drm demo.
> > >
> > > The only change between them is the 2 line code change you suggested.
> > >
> > > In both cases, I have cma=128M set in my bootargs.  Historically this
> > > has been sufficient, but cma=256M has not made a difference.
> > >
> >
> > Mike any suggestions on how to move forward?
> > I was hoping to get the fixes tested and pushed before 5.4 is released
> > if at all possible
>
> I have a fix (below) that kinda restores the original behaviour, but I
> still would like to double check to make sure it's not a band aid and I
> haven't missed the actual root cause.
>
> Can you please send me your device tree definition and the output of
>
> cat /sys/kernel/debug/memblock/memory
>
> and
>
> cat /sys/kernel/debug/memblock/reserved
>
> Thanks!
>

Before the patch:

# cat /sys/kernel/debug/memblock/memory
   0: 0x1000..0x8fff
# cat /sys/kernel/debug/memblock/reserved
   0: 0x10004000..0x10007fff
   1: 0x1010..0x11ab141f
   2: 0x1fff1000..0x1fffcfff
   3: 0x2ee4..0x2ef53fff
   4: 0x2ef56940..0x2ef56c43
   5: 0x2ef56c48..0x2fffefff
   6: 0x20c0..0x24d8
   7: 0x2500..0x255f
   8: 0x2580..0x2703
   9: 0x2740..0x2918
  10: 0x2940..0x29cf
  11: 0x2a00..0x2a0f
  12: 0x2a40..0x2a43
  13: 0x2a80..0x2ad5
  14: 0x2b00..0x2b55
  15: 0x2b80..0x2bd5
  16: 0x2c00..0x2c4e
  17: 0x2c50..0x2c6a
  18: 0x2c6c..0x2ce6
  19: 0x2ce8..0x2d02
  20: 0x2d04..0x2d1e
  21: 0x2d20..0x2d3a
  22: 0x2d3c..0x2d56
  23: 0x2d58..0x2e30
  24: 0x2e34..0x2e4c
  25: 0x2e50..0x2e68
  26: 0x2e6c..0x2e84
  27: 0x2e88..0x2ea0
  28: 0x2ea4..0x2ebc
  29: 0x2ec0..0x2edf
  30: 0x2ee4..0x2efc
  31: 0x2f00..0x2f13
  32: 0x2f28..0x2f4b
  33: 0x2f50..0x2f84
  34: 0x2f88..0x3fff


After the patch:
# cat /sys/kernel/debug/memblock/memory
   0: 0x1000..0x8fff
# cat /sys/kernel/debug/memblock/reserved
   0: 0x10004000..0x10007fff
   1: 0x1010..0x11ab141f
   2: 0x1fff1000..0x1fffcfff
   3: 0x3eec..0x3efd3fff
   4: 0x3efd6940..0x3efd6c43
   5: 0x3efd6c48..0x3fffbfff
   6: 0x3fffc0c0..0x3fffc4d8
   7: 0x3fffc500..0x3fffc55f
   8: 0x3fffc580..0x3fffc703
   9: 0x3fffc740..0x3fffc918
  10: 0x3fffc940..0x3fffc9cf
  11: 0x3fffca00..0x3fffca0f
  12: 0x3fffca40..0x3fffca43
  13: 0x3fffca80..0x3fffca83
  14: 0x3fffcac0..0x3fffcb15
  15: 0x3fffcb40..0x3fffcb95
  16: 0x3fffcbc0..0x3fffcc15
  17: 0x3fffcc28..0x3fffcc72
  18: 0x3fffcc74..0x3fffcc8e
  19: 0x3fffcc90..0x3fffcd0a
  20: 0x3fffcd0c..0x3fffcd26
  21: 0x3fffcd28..0x3fffcd42
  22: 0x3fffcd44..0x3fffcd5e
  23: 0x3fffcd60..0x3fffcd7a
  24: 0x3fffcd7c..0x3fffce54
  25: 0x3fffce58..0x3fffce70
  26: 0x3fffce74..0x3fffce8c
  27: 0x3fffce90..0x3fffcea8
  28: 0x3fffceac..0x3fffcec4
  29: 0x3fffcec8..0x3fffcee0
  30: 0x3fffcee4..0x3fffcefc
  31: 0x3fffcf00..0x3fffcf1f
  32: 0x3fffcf28..0x3fffcf53
  33: 0x3fffcf68..0x3fffcf8b
  34: 0x3fffcf90..0x3fffcfac
  35: 0x3fffcfb0..0x3fff
  36: 0x8000..0x8fff

> From 06529f861772b7dea2912fc2245debe4690139b8 Mon Sep 17 00:00:00 2001
> From: Mike Rapoport 
> Date: Wed, 2 Oct 2019 10:14:17 +0300
> Subject: [PATCH] mm: memblock: do not enforce current limit for memblock_phys*
>  family
>
> Until commit 92d12f9544b7 ("memblock: refactor internal allocation
> functions") the maximal address for memblock allocations was forced to
> memblock.current_limit only for the allocation functions returning virtual
> address. The changes introduced by that commit moved the limit enforcement
> into the allocation core and as a result the allocation functions returning
> physical address also started to limit allocations to
> memblock.current_limit.
>
> This caused breakage of etnaviv GPU driver:
>
> [3.682347] etnaviv etnaviv: bound 13.gpu (ops gpu_ops)
> [3.688669] etnaviv etnaviv: bound 134000.gpu (ops gpu_ops)
> [3.695099] etnaviv etnaviv: bound 2204000.gpu (ops gpu_ops)
> [3.700800] etnaviv-gpu 13.gpu: model: GC2000, revision: 5108
> [3.723013] etnaviv-gpu 13.gpu: command buffer outside valid
> memory window
> [3.731308] etnaviv-gpu 134000.gpu: model: GC320, revision: 5007
> [3.752437] etnaviv-gpu 134000.gpu: command buffer outside valid
> memory window
> [3.760583] etnaviv-gpu 2204000.gpu: model: GC355, revision: 1215
> [3.766766] etnaviv-gpu 2204000.gpu: Ignoring GPU with VG and FE2.0
>
> Resto

Re: [PATCH v1 1/2] PCI/AER: Use for_each_set_bit()

2019-10-02 Thread David Howells
Andy Shevchenko  wrote:

> > but I confess to being a little ambivalent.  It's
> > arguably a little easier to read,
> 
> I have another opinion here. Instead of parsing the body of a for-loop, the name of
> the function tells you exactly what it does. Besides, reading
> and parsing two lines, with zero conditionals, is faster.
> 
> > but it's not nearly as efficient
> > (not a great concern here)
> 
> David, do you know why for_each_set_bit() has no optimization for the cases
> when nbits <= BITS_PER_LONG? (Actually find_*bit() family of functions)

I've not had anything to do with for_each_set_bit() itself.

By 'nbits', I presume you mean the size parameter - max in the sample bit of
code.

It would need per-arch optimisation.  Some arches have an instruction to find
the next bit and some don't.

Using for_each_set_bit() like this is definitely suboptimal, since
find_first_bit() and find_next_bit() may well be out of line.

It should probably be using something like __ffs() if size <= BITS_PER_LONG.
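
A minimal sketch of that idea (illustrative only; handle_bit() is a
stand-in for whatever per-bit work the caller does):

/* Iterate the set bits of a single word with __ffs(), avoiding the
 * out-of-line find_first_bit()/find_next_bit() calls.
 */
static void for_each_set_bit_in_word(unsigned long word,
				     void (*handle_bit)(unsigned int bit))
{
	while (word) {
		unsigned int bit = __ffs(word);

		handle_bit(bit);
		word &= word - 1;	/* clear the lowest set bit */
	}
}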

David


Re: [PATCH] powerpc/powernv/prd: Validate whether address to be mapped is part of system RAM

2019-10-02 Thread Vaidyanathan Srinivasan
* Vasant Hegde  [2019-10-02 13:18:56]:

> Add a check to validate whether the requested page is part of system RAM
> before mmap(), and error out if it is not.
> 
> cat /proc/iomem:
> -
> -27 : System RAM
> 28-2f : namespace0.0
> 2000-2027 : System RAM
> 2028-202f : namespace1.0
> 6-6003fbfff : pciex@600c3c000
> 60040-6007f7fff : pciex@600c3c010
> 
> 
> 
> Sample dmesg output with this fix:
> --
> [  160.371911] opal-prd: mmap: Requested page is not part of system RAM (addr 
> : 0x202ffcfc, size : 0x0057)
> [  160.665366] opal-prd: mmap: Requested page is not part of system RAM (addr 
> : 0x202ffcfc, size : 0x0057)
> [  160.914627] opal-prd: mmap: Requested page is not part of system RAM (addr 
> : 0x202ffcfc, size : 0x0057)
> [  161.165253] opal-prd: mmap: Requested page is not part of system RAM (addr 
> : 0x202ffcfc, size : 0x0057)
> [  161.414604] opal-prd: mmap: Requested page is not part of system RAM (addr 
> : 0x202ffcfc, size : 0x0057)

Thanks Vasant for the patch.  HBRT pages landing in NVDIMM area caused
the opal-prd failure.  Good debug and root-cause.

Thanks to Aneesh for the tip to look into ATT bits of the PTE mapping.

> CC: Aneesh Kumar K.V 
> CC: Jeremy Kerr 
> CC: Vaidyanathan Srinivasan 
> Signed-off-by: Vasant Hegde 

Signed-off-by: Vaidyanathan Srinivasan 

> ---
>  arch/powerpc/platforms/powernv/opal-prd.c | 16 +++-
>  1 file changed, 15 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/opal-prd.c 
> b/arch/powerpc/platforms/powernv/opal-prd.c
> index 45f4223a790f..0f88752302a2 100644
> --- a/arch/powerpc/platforms/powernv/opal-prd.c
> +++ b/arch/powerpc/platforms/powernv/opal-prd.c
> @@ -3,7 +3,7 @@
>   * OPAL Runtime Diagnostics interface driver
>   * Supported on POWERNV platform
>   *
> - * Copyright IBM Corporation 2015
> + * Copyright IBM Corporation 2015-2019
>   */
> 
>  #define pr_fmt(fmt) "opal-prd: " fmt
> @@ -47,6 +47,20 @@ static bool opal_prd_range_is_valid(uint64_t addr, 
> uint64_t size)
>   if (addr + size < addr)
>   return false;
> 
> + /*
> +  * Check if region is in system RAM and error out if the address
> +  * belongs to special devices like NVDIMM. phys_mem_access_prot()
> +  * routine will change ATT bits to non-cacheable if the page is not in
> +  * RAM, causing HBRT to not fetch and execute in the mapped memory
> +  * and fail. Page permissions can be left at default since all
> +  * firmware components should be in system RAM only.
> +  */
> + if (!page_is_ram(addr >> PAGE_SHIFT)) {
> + pr_warn("mmap: Requested page is not part of system RAM "
> + "(addr : 0x%016llx, size : 0x%016llx)\n", addr, size);
> + return false;
> + }
> +
>   parent = of_find_node_by_path("/reserved-memory");
>   if (!parent)
>   return false;
> -- 
> 2.21.0
> 



Re: [RFC PATCH 23/27] powerpc/64: system call implement the bulk of the logic in C

2019-10-02 Thread Michal Suchánek
On Sun, Sep 15, 2019 at 11:28:09AM +1000, Nicholas Piggin wrote:
> System call entry and particularly exit code is beyond the limit of what
> is reasonable to implement in asm.
> 
> This conversion moves all conditional branches out of the asm code,
> except for the case that all GPRs should be restored at exit.
> 
> Null syscall test is about 5% faster after this patch, because the exit
> work is handled under local_irq_disable, and the hard mask and pending
> interrupt replay is handled after that, which avoids games with MSR.
> 
> Signed-off-by: Nicholas Piggin 
> 
> v3:
> - Fix !KUAP build [mpe]
> - Fix BookE build/boot [mpe]
> - Don't trace irqs with MSR[RI]=0
> - Don't allow syscall_exit_prepare to be ftraced, because function
>   graph tracing which traces exits barfs after the IRQ state is
>   prepared for kernel exit.
> - Fix BE syscall table to use normal function descriptors now that they
>   are called from C.
> - Comment syscall_exit_prepare.
...
> -#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC_SPLPAR)
> -BEGIN_FW_FTR_SECTION
> - /* see if there are any DTL entries to process */
> - ld  r10,PACALPPACAPTR(r13)  /* get ptr to VPA */
> - ld  r11,PACA_DTL_RIDX(r13)  /* get log read index */
> - addir10,r10,LPPACA_DTLIDX
> - LDX_BE  r10,0,r10   /* get log write index */
> - cmpdr11,r10
> - beq+33f
> - bl  accumulate_stolen_time
> - REST_GPR(0,r1)
> - REST_4GPRS(3,r1)
> - REST_2GPRS(7,r1)
> - addir9,r1,STACK_FRAME_OVERHEAD
> -33:
> -END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
> -#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE && CONFIG_PPC_SPLPAR */
...
> diff --git a/arch/powerpc/kernel/syscall_64.c 
> b/arch/powerpc/kernel/syscall_64.c
> new file mode 100644
> index ..1d2529824588
> --- /dev/null
> +++ b/arch/powerpc/kernel/syscall_64.c
> @@ -0,0 +1,195 @@
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +extern void __noreturn tabort_syscall(void);
> +
> +typedef long (*syscall_fn)(long, long, long, long, long, long);
> +
> +long system_call_exception(long r3, long r4, long r5, long r6, long r7, long 
> r8, unsigned long r0, struct pt_regs *regs)
> +{
> + unsigned long ti_flags;
> + syscall_fn f;
> +
> + BUG_ON(!(regs->msr & MSR_PR));
> +
> + if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
> + unlikely(regs->msr & MSR_TS_T))
> + tabort_syscall();
> +
> + account_cpu_user_entry();
> +
> +#ifdef CONFIG_PPC_SPLPAR
> + if (IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) &&
> + firmware_has_feature(FW_FEATURE_SPLPAR)) {
> + struct lppaca *lp = get_lppaca();
> +
> + if (unlikely(local_paca->dtl_ridx != lp->dtl_idx))

sparse complains about this, and in time.c it is converted like this:
if (unlikely(local_paca->dtl_ridx != be64_to_cpu(lp->dtl_idx)))
The removed code has a LDX_BE there so there should be some conversion
involved, right?
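
Presumably the C version needs the same conversion as time.c, i.e.
something like this (sketch only):

	if (unlikely(local_paca->dtl_ridx != be64_to_cpu(lp->dtl_idx)))
		accumulate_stolen_time();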

Thanks

Michal


Re: [RFC PATCH 27/27] powerpc/64s: system call support for scv/rfscv instructions

2019-10-02 Thread Michal Suchánek
On Sun, Sep 15, 2019 at 11:28:13AM +1000, Nicholas Piggin wrote:
> Add support for the scv instruction on POWER9 and later CPUs.
> 
> For now this implements the zeroth scv vector 'scv 0', as identical
> to 'sc' system calls, with the exception that lr is not preserved, and
> it is 64-bit only. There may yet be changes made to this ABI, so it's
> for testing only.
> 
> This also doesn't yet properly handle PR KVM, or the case where a guest
> is denied AIL=3 mode. I haven't added real mode entry points, so scv
> must not be used when AIL=0, but I haven't cleared the FSCR in this
> case.
> 
> This also implements a strange hack to handle the non-implemented
> vectors, scheduling a decrementer and expecting it to interrupt and
> replay pending soft masked interrupts. This is unlikely to be a good
> idea, and needs to actually do a proper handler and replay in case an
> interrupt is pending.
> 
> It may also require some errata handling before it can be safely used
> on all POWER9 CPUs, I have to look that up.
> 
> rfscv is implemented to return from scv type system calls. It can not
> be used to return from sc system calls because those are defined to
> preserve lr.
> 
> In a comparison of getpid syscall, the test program had scv taking
> about 3 more cycles in user mode (92 vs 89 for sc), due to lr handling.
> Total cycles taken for a getpid system call on POWER9 are improved from
> 436 to 345 (26.3% faster), mostly due to reducing mtmsr and mtspr.
...
> diff --git a/arch/powerpc/kernel/syscall_64.c 
> b/arch/powerpc/kernel/syscall_64.c
> index 034b52d3d78c..3e8aa5ae8ec8 100644
> --- a/arch/powerpc/kernel/syscall_64.c
> +++ b/arch/powerpc/kernel/syscall_64.c
> @@ -15,6 +15,77 @@ extern void __noreturn tabort_syscall(void);
>  
>  typedef long (*syscall_fn)(long, long, long, long, long, long);
>  
> +#ifdef CONFIG_PPC_BOOK3S
> +long system_call_vectored_exception(long r3, long r4, long r5, long r6, long 
> r7, long r8, unsigned long r0, struct pt_regs *regs)
> +{
> + unsigned long ti_flags;
> + syscall_fn f;
> +
> + BUG_ON(!(regs->msr & MSR_RI));
> + BUG_ON(!(regs->msr & MSR_PR));
> + BUG_ON(!FULL_REGS(regs));
> + BUG_ON(regs->softe != IRQS_ENABLED);
> +
> + if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
> + unlikely(regs->msr & MSR_TS_T))
> + tabort_syscall();
> +
> + account_cpu_user_entry();
> +
> +#ifdef CONFIG_PPC_SPLPAR
> + if (IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) &&
> + firmware_has_feature(FW_FEATURE_SPLPAR)) {
> + struct lppaca *lp = get_lppaca();
> +
> + if (unlikely(local_paca->dtl_ridx != lp->dtl_idx))
This adds another instance of the same missing endian conversion.
> + accumulate_stolen_time();
> + }
> +#endif

Thanks

Michal


Re: [RFC PATCH 24/27] powerpc/64s: interrupt return in C

2019-10-02 Thread Michal Suchánek
On Sun, Sep 15, 2019 at 11:28:10AM +1000, Nicholas Piggin wrote:
> Implement the bulk of interrupt return logic in C. The asm return code
> must handle a few cases: restoring full GPRs, and emulating stack store.
> 
> The asm return code is moved into 64e for now. The new logic has made
> allowance for 64e, but I don't have a full environment that works well
> to test it, and even booting in emulated qemu is not great for stress
> testing. 64e shouldn't be too far off working with this, given a bit
> more testing and auditing of the logic.
> 
> This is slightly faster on a POWER9 (page fault speed increases about
> 1.1%), probably due to reduced mtmsrd.
> 
> Signed-off-by: Nicholas Piggin 
...
> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> index 24621e7e5033..45c1524b6c9e 100644
> --- a/arch/powerpc/kernel/process.c
> +++ b/arch/powerpc/kernel/process.c
> @@ -524,6 +524,7 @@ void giveup_all(struct task_struct *tsk)
>  }
>  EXPORT_SYMBOL(giveup_all);
>  
> +#ifdef CONFIG_PPC_BOOK3S_64
This fails to build on !BOOK3S_64 because restore_fpu and restore_altivec
are used exclusively from restore_math, which is now BOOK3S_64 only.
>  /*
>   * The exception exit path calls restore_math() with interrupts hard disabled
>   * but the soft irq state not "reconciled". ftrace code that calls
> @@ -564,6 +565,7 @@ void notrace restore_math(struct pt_regs *regs)
>  
>   regs->msr = msr;
>  }
> +#endif
>  
>  static void save_all(struct task_struct *tsk)
>  {

Thanks

Michal


Re: [RFC PATCH 00/27] current interrupt series plus scv syscall

2019-10-02 Thread Michal Suchánek
Hello,

can you mark the individual patches with RFC rather than the whole
series?

Thanks

Michal

On Sun, Sep 15, 2019 at 11:27:46AM +1000, Nicholas Piggin wrote:
> My interrupt entry patches have finally collided with syscall and
> interrupt exit patches, so I'll merge the series. Most patches have
> been seen already, however there have been a number of small changes
> and fixes throughout the series.
> 
> The final two patches add support for 'scv' and 'rfscv' instructions.
> 
> I'm posting this out now so we can start considering ABI and userspace
> support. We have the PPC_FEATURE2_SCV hwcap for this.
> 
> Thanks,
> Nick
> 
> Nicholas Piggin (27):
>   powerpc/64s/exception: Introduce INT_DEFINE parameter block for code
> generation
>   powerpc/64s/exception: Add GEN_COMMON macro that uses INT_DEFINE
> parameters
>   powerpc/64s/exception: Add GEN_KVM macro that uses INT_DEFINE
> parameters
>   powerpc/64s/exception: Expand EXC_COMMON and EXC_COMMON_ASYNC macros
>   powerpc/64s/exception: Move all interrupt handlers to new style code
> gen macros
>   powerpc/64s/exception: Remove old INT_ENTRY macro
>   powerpc/64s/exception: Remove old INT_COMMON macro
>   powerpc/64s/exception: Remove old INT_KVM_HANDLER
>   powerpc/64s/exception: Add ISIDE option
>   powerpc/64s/exception: move real->virt switch into the common handler
>   powerpc/64s/exception: move soft-mask test to common code
>   powerpc/64s/exception: move KVM test to common code
>   powerpc/64s/exception: remove confusing IEARLY option
>   powerpc/64s/exception: remove the SPR saving patch code macros
>   powerpc/64s/exception: trim unused arguments from KVMTEST macro
>   powerpc/64s/exception: hdecrementer avoid touching the stack
>   powerpc/64s/exception: re-inline some handlers
>   powerpc/64s/exception: Clean up SRR specifiers
>   powerpc/64s/exception: add more comments for interrupt handlers
>   powerpc/64s/exception: only test KVM in SRR interrupts when PR KVM is
> supported
>   powerpc/64s/exception: soft nmi interrupt should not use
> ret_from_except
>   powerpc/64: system call remove non-volatile GPR save optimisation
>   powerpc/64: system call implement the bulk of the logic in C
>   powerpc/64s: interrupt return in C
>   powerpc/64s/exception: remove lite interrupt return
>   powerpc/64s/exception: treat NIA below __end_interrupts as soft-masked
>   powerpc/64s: system call support for scv/rfscv instructions
> 
>  arch/powerpc/include/asm/asm-prototypes.h |   11 -
>  .../powerpc/include/asm/book3s/64/kup-radix.h |   24 +-
>  arch/powerpc/include/asm/cputime.h|   24 +
>  arch/powerpc/include/asm/exception-64s.h  |4 -
>  arch/powerpc/include/asm/head-64.h|2 +-
>  arch/powerpc/include/asm/hw_irq.h |4 +
>  arch/powerpc/include/asm/ppc_asm.h|2 +
>  arch/powerpc/include/asm/processor.h  |2 +-
>  arch/powerpc/include/asm/ptrace.h |3 +
>  arch/powerpc/include/asm/signal.h |3 +
>  arch/powerpc/include/asm/switch_to.h  |   11 +
>  arch/powerpc/include/asm/time.h   |4 +-
>  arch/powerpc/kernel/Makefile  |3 +-
>  arch/powerpc/kernel/cpu_setup_power.S |2 +-
>  arch/powerpc/kernel/dt_cpu_ftrs.c |1 +
>  arch/powerpc/kernel/entry_64.S|  964 ++--
>  arch/powerpc/kernel/exceptions-64e.S  |  254 +-
>  arch/powerpc/kernel/exceptions-64s.S  | 2046 -
>  arch/powerpc/kernel/process.c |2 +
>  arch/powerpc/kernel/signal.h  |2 -
>  arch/powerpc/kernel/syscall_64.c  |  422 
>  arch/powerpc/kernel/syscalls/syscall.tbl  |   22 +-
>  arch/powerpc/kernel/systbl.S  |9 +-
>  arch/powerpc/kernel/time.c|9 -
>  arch/powerpc/kernel/vector.S  |2 +-
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S   |   11 -
>  arch/powerpc/kvm/book3s_segment.S |7 -
>  27 files changed, 2458 insertions(+), 1392 deletions(-)
>  create mode 100644 arch/powerpc/kernel/syscall_64.c
> 
> -- 
> 2.23.0
> 


[PATCH] powerpc/powernv/prd: Validate whether address to be mapped is part of system RAM

2019-10-02 Thread Vasant Hegde
Add a check to validate whether the requested page is part of system RAM
before mmap(), and error out if it is not.

cat /proc/iomem:
-
-27 : System RAM
28-2f : namespace0.0
2000-2027 : System RAM
2028-202f : namespace1.0
6-6003fbfff : pciex@600c3c000
60040-6007f7fff : pciex@600c3c010



Sample dmesg output with this fix:
--
[  160.371911] opal-prd: mmap: Requested page is not part of system RAM (addr : 
0x202ffcfc, size : 0x0057)
[  160.665366] opal-prd: mmap: Requested page is not part of system RAM (addr : 
0x202ffcfc, size : 0x0057)
[  160.914627] opal-prd: mmap: Requested page is not part of system RAM (addr : 
0x202ffcfc, size : 0x0057)
[  161.165253] opal-prd: mmap: Requested page is not part of system RAM (addr : 
0x202ffcfc, size : 0x0057)
[  161.414604] opal-prd: mmap: Requested page is not part of system RAM (addr : 
0x202ffcfc, size : 0x0057)

CC: Aneesh Kumar K.V 
CC: Jeremy Kerr 
CC: Vaidyanathan Srinivasan 
Signed-off-by: Vasant Hegde 
---
 arch/powerpc/platforms/powernv/opal-prd.c | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/opal-prd.c 
b/arch/powerpc/platforms/powernv/opal-prd.c
index 45f4223a790f..0f88752302a2 100644
--- a/arch/powerpc/platforms/powernv/opal-prd.c
+++ b/arch/powerpc/platforms/powernv/opal-prd.c
@@ -3,7 +3,7 @@
  * OPAL Runtime Diagnostics interface driver
  * Supported on POWERNV platform
  *
- * Copyright IBM Corporation 2015
+ * Copyright IBM Corporation 2015-2019
  */
 
 #define pr_fmt(fmt) "opal-prd: " fmt
@@ -47,6 +47,20 @@ static bool opal_prd_range_is_valid(uint64_t addr, uint64_t 
size)
if (addr + size < addr)
return false;
 
+   /*
+* Check if region is in system RAM and error out if the address
+* belongs to special devices like NVDIMM. phys_mem_access_prot()
+* routine will change ATT bits to non-cacheable if the page is not in
+* RAM, causing HBRT to not fetch and execute in the mapped memory
+* and fail. Page permissions can be left at default since all
+* firmware components should be in system RAM only.
+*/
+   if (!page_is_ram(addr >> PAGE_SHIFT)) {
+   pr_warn("mmap: Requested page is not part of system RAM "
+   "(addr : 0x%016llx, size : 0x%016llx)\n", addr, size);
+   return false;
+   }
+
parent = of_find_node_by_path("/reserved-memory");
if (!parent)
return false;
-- 
2.21.0



Re: [PATCH v2 00/21] Refine memblock API

2019-10-02 Thread Mike Rapoport
Hi Adam,

On Tue, Oct 01, 2019 at 07:14:13PM -0500, Adam Ford wrote:
> On Sun, Sep 29, 2019 at 8:33 AM Adam Ford  wrote:
> >
> > I am attaching two logs.  I know the mailing lists will be unhappy, but
> > I don't want to try and spam a bunch of logs through the mailing list.
> > The two logs show the differences between the working and non-working
> > imx6q 3D accelerator when trying to run a simple glmark2-es2-drm demo.
> >
> > The only change between them is the 2 line code change you suggested.
> >
> > In both cases, I have cma=128M set in my bootargs.  Historically this
> > has been sufficient, but cma=256M has not made a difference.
> >
> 
> Mike any suggestions on how to move forward?
> I was hoping to get the fixes tested and pushed before 5.4 is released
> if at all possible

I have a fix (below) that kinda restores the original behaviour, but I
still would like to double check to make sure it's not a band aid and I
haven't missed the actual root cause.

Can you please send me your device tree definition and the output of 

cat /sys/kernel/debug/memblock/memory

and 

cat /sys/kernel/debug/memblock/reserved

Thanks!

>From 06529f861772b7dea2912fc2245debe4690139b8 Mon Sep 17 00:00:00 2001
From: Mike Rapoport 
Date: Wed, 2 Oct 2019 10:14:17 +0300
Subject: [PATCH] mm: memblock: do not enforce current limit for memblock_phys*
 family

Until commit 92d12f9544b7 ("memblock: refactor internal allocation
functions") the maximal address for memblock allocations was forced to
memblock.current_limit only for the allocation functions returning virtual
address. The changes introduced by that commit moved the limit enforcement
into the allocation core and as a result the allocation functions returning
physical address also started to limit allocations to
memblock.current_limit.

This caused breakage of etnaviv GPU driver:

[3.682347] etnaviv etnaviv: bound 13.gpu (ops gpu_ops)
[3.688669] etnaviv etnaviv: bound 134000.gpu (ops gpu_ops)
[3.695099] etnaviv etnaviv: bound 2204000.gpu (ops gpu_ops)
[3.700800] etnaviv-gpu 13.gpu: model: GC2000, revision: 5108
[3.723013] etnaviv-gpu 13.gpu: command buffer outside valid
memory window
[3.731308] etnaviv-gpu 134000.gpu: model: GC320, revision: 5007
[3.752437] etnaviv-gpu 134000.gpu: command buffer outside valid
memory window
[3.760583] etnaviv-gpu 2204000.gpu: model: GC355, revision: 1215
[3.766766] etnaviv-gpu 2204000.gpu: Ignoring GPU with VG and FE2.0

Restore the behaviour of memblock_phys* family so that these functions will
not enforce memblock.current_limit.

Fixes: 92d12f9544b7 ("memblock: refactor internal allocation functions")
Reported-by: Adam Ford 
Signed-off-by: Mike Rapoport 
---
 mm/memblock.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/memblock.c b/mm/memblock.c
index 7d4f61a..c4b16ca 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1356,9 +1356,6 @@ static phys_addr_t __init 
memblock_alloc_range_nid(phys_addr_t size,
align = SMP_CACHE_BYTES;
}
 
-   if (end > memblock.current_limit)
-   end = memblock.current_limit;
-
 again:
found = memblock_find_in_range_node(size, align, start, end, nid,
flags);
@@ -1469,6 +1466,9 @@ static void * __init memblock_alloc_internal(
if (WARN_ON_ONCE(slab_is_available()))
return kzalloc_node(size, GFP_NOWAIT, nid);
 
+   if (max_addr > memblock.current_limit)
+   max_addr = memblock.current_limit;
+
alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid);
 
/* retry allocation without lower limit */
-- 
2.7.4

 
> > adam
> >
> > On Sat, Sep 28, 2019 at 2:33 AM Mike Rapoport  wrote:
> > >
> > > On Thu, Sep 26, 2019 at 02:35:53PM -0500, Adam Ford wrote:
> > > > On Thu, Sep 26, 2019 at 11:04 AM Mike Rapoport  
> > > > wrote:
> > > > >
> > > > > Hi,
> > > > >
> > > > > On Thu, Sep 26, 2019 at 08:09:52AM -0500, Adam Ford wrote:
> > > > > > On Wed, Sep 25, 2019 at 10:17 AM Fabio Estevam  
> > > > > > wrote:
> > > > > > >
> > > > > > > On Wed, Sep 25, 2019 at 9:17 AM Adam Ford  
> > > > > > > wrote:
> > > > > > >
> > > > > > > > I tried cma=256M and noticed the cma dump at the beginning 
> > > > > > > > didn't
> > > > > > > > change.  Do we need to setup a reserved-memory node like
> > > > > > > > imx6ul-ccimx6ulsom.dtsi did?
> > > > > > >
> > > > > > > I don't think so.
> > > > > > >
> > > > > > > Were you able to identify what was the exact commit that caused 
> > > > > > > such regression?
> > > > > >
> > > > > > I was able to narrow it down the 92d12f9544b7 ("memblock: refactor
> > > > > > internal allocation functions") that caused the regression with
> > > > > > Etnaviv.
> > > > >
> > > > >
> > > > > Can you please test with this change:
> > > > >
> > > >
> > > > That appears to have fixed my issue.  I am not sure what the impact
> > > > is, but is this a safe option?
> > >
> > > It'

Re: [PATCH v8 1/5] kasan: support backing vmalloc space with real shadow memory

2019-10-02 Thread Christophe Leroy

Daniel Axtens wrote:


Hi,


/*
 * Find a place in the tree where VA potentially will be
 * inserted, unless it is merged with its sibling/siblings.
@@ -741,6 +752,10 @@ merge_or_add_vmap_area(struct vmap_area *va,
if (sibling->va_end == va->va_start) {
sibling->va_end = va->va_end;

+   kasan_release_vmalloc(orig_start, orig_end,
+ sibling->va_start,
+ sibling->va_end);
+

The same.


The call to kasan_release_vmalloc() is a static inline no-op if
CONFIG_KASAN_VMALLOC is not defined, which I thought was the preferred
way to do things rather than sprinkling the code with ifdefs?

The complier should be smart enough to eliminate all the
orig_state/orig_end stuff at compile time because it can see that it's
not used, so there's no cost in the binary.




Daniel,

You are entirely right in your way to do it; that's fully in line with the
Linux kernel coding style:
https://www.kernel.org/doc/html/latest/process/coding-style.html#conditional-compilation


Christophe



Re: [PATCH v5 04/10] mm/memory_hotplug: Don't access uninitialized memmaps in shrink_zone_span()

2019-10-02 Thread David Hildenbrand
On 02.10.19 02:06, kbuild test robot wrote:
> Hi David,
> 
> I love your patch! Perhaps something to improve:
> 
> [auto build test WARNING on mmotm/master]
> 
> url:
> https://github.com/0day-ci/linux/commits/David-Hildenbrand/mm-memory_hotplug-Shrink-zones-before-removing-memory/20191002-054310
> base:   git://git.cmpxchg.org/linux-mmotm.git master
> config: x86_64-randconfig-b002-201939 (attached as .config)
> compiler: gcc-7 (Debian 7.4.0-13) 7.4.0
> reproduce:
> # save the attached .config to linux build tree
> make ARCH=x86_64 
> 
> If you fix the issue, kindly add following tag
> Reported-by: kbuild test robot 
> 
> All warnings (new ones prefixed by >>):
> 
>In file included from include/asm-generic/bug.h:5:0,
> from arch/x86/include/asm/bug.h:83,
> from include/linux/bug.h:5,
> from include/linux/mmdebug.h:5,
> from include/linux/mm.h:9,
> from mm/memory_hotplug.c:9:
>mm/memory_hotplug.c: In function '__remove_zone':
>mm/memory_hotplug.c:471:24: error: 'ZONE_DEVICE' undeclared (first use in 
> this function); did you mean 'ZONE_MOVABLE'?
>  if (zone_idx(zone) == ZONE_DEVICE)
>^
>include/linux/compiler.h:58:52: note: in definition of macro 
> '__trace_if_var'
> #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : 
> __trace_if_value(cond))
>^~~~
>>> mm/memory_hotplug.c:471:2: note: in expansion of macro 'if'
>  if (zone_idx(zone) == ZONE_DEVICE)
>  ^~
>mm/memory_hotplug.c:471:24: note: each undeclared identifier is reported 
> only once for each function it appears in
>  if (zone_idx(zone) == ZONE_DEVICE)
>^
>include/linux/compiler.h:58:52: note: in definition of macro 
> '__trace_if_var'
> #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : 
> __trace_if_value(cond))
>^~~~
>>> mm/memory_hotplug.c:471:2: note: in expansion of macro 'if'
>  if (zone_idx(zone) == ZONE_DEVICE)
>  ^~
> 
> vim +/if +471 mm/memory_hotplug.c
> 
>459
>460static void __remove_zone(struct zone *zone, unsigned long 
> start_pfn,
>461unsigned long nr_pages)
>462{
>463struct pglist_data *pgdat = zone->zone_pgdat;
>464unsigned long flags;
>465
>466/*
>467 * Zone shrinking code cannot properly deal with 
> ZONE_DEVICE. So
>468 * we will not try to shrink the zones - which is okay 
> as
>469 * set_zone_contiguous() cannot deal with ZONE_DEVICE 
> either way.
>470 */
>  > 471if (zone_idx(zone) == ZONE_DEVICE)
>472return;
>473
>474pgdat_resize_lock(zone->zone_pgdat, &flags);
>475shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
>476update_pgdat_span(pgdat);
>477pgdat_resize_unlock(zone->zone_pgdat, &flags);
>478}
>479
> 
> ---
> 0-DAY kernel test infrastructureOpen Source Technology Center
> https://lists.01.org/pipermail/kbuild-all   Intel Corporation
> 

That should be easy to fix with some ifdef-ery :)
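
For instance, a minimal version of that fix could look like this (a
sketch built from the code quoted above; whether this is the final form
is up to the patch author):

static void __remove_zone(struct zone *zone, unsigned long start_pfn,
			  unsigned long nr_pages)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long flags;

#ifdef CONFIG_ZONE_DEVICE
	/*
	 * Zone shrinking code cannot properly deal with ZONE_DEVICE. So
	 * we will not try to shrink the zones - which is okay as
	 * set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
	 */
	if (zone_idx(zone) == ZONE_DEVICE)
		return;
#endif

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
	update_pgdat_span(pgdat);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
}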

-- 

Thanks,

David / dhildenb