[PATCH] powerpc/eeh: fix pr_debug()s in eeh_cache.c

2016-06-23 Thread Andrew Donnellan
eeh_cache.c doesn't build cleanly with -DDEBUG when
CONFIG_PHYS_ADDR_T_64BIT is set, as a couple of pr_debug()s use "%lx" for
resource_size_t parameters.

Use "%pap" instead, as it's the correct format specifier for types deriving
from phys_addr_t.
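
A quick illustration of the difference (editor's sketch, not part of the
patch): %pap consumes a *pointer* to a phys_addr_t/resource_size_t, so it
prints the full value regardless of the type's width.

	resource_size_t start = 0x100000000ULL;	/* needs more than 32 bits */

	pr_debug("start = %pap\n", &start);	/* correct everywhere */
	pr_debug("start = %lx\n",
		 (unsigned long)start);	/* truncated on a 32-bit kernel
					 * with 64-bit phys_addr_t */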

Signed-off-by: Andrew Donnellan 
Cc: Russell Currey 
Cc: Gavin Shan 
---
 arch/powerpc/kernel/eeh_cache.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
index ddbcfab..ceb81a4 100644
--- a/arch/powerpc/kernel/eeh_cache.c
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -114,9 +114,9 @@ static void eeh_addr_cache_print(struct pci_io_addr_cache *cache)
while (n) {
struct pci_io_addr_range *piar;
piar = rb_entry(n, struct pci_io_addr_range, rb_node);
-   pr_debug("PCI: %s addr range %d [%lx-%lx]: %s\n",
+   pr_debug("PCI: %s addr range %d [%pap-%pap]: %s\n",
   (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt,
-  piar->addr_lo, piar->addr_hi, pci_name(piar->pcidev));
+  &piar->addr_lo, &piar->addr_hi, pci_name(piar->pcidev));
cnt++;
n = rb_next(n);
}
@@ -159,8 +159,8 @@ eeh_addr_cache_insert(struct pci_dev *dev, resource_size_t alo,
piar->flags = flags;
 
 #ifdef DEBUG
-   pr_debug("PIAR: insert range=[%lx:%lx] dev=%s\n",
- alo, ahi, pci_name(dev));
+   pr_debug("PIAR: insert range=[%pap:%pap] dev=%s\n",
+ &alo, &ahi, pci_name(dev));
 #endif
 
rb_link_node(&piar->rb_node, parent, p);
-- 
Andrew Donnellan  OzLabs, ADL Canberra
andrew.donnel...@au1.ibm.com  IBM Australia Limited


Re: [PATCH] powerpc/pseries: Auto online hotplugged memory

2016-06-23 Thread Michael Ellerman
On Mon, 2016-06-20 at 21:14 -0500, Nathan Fontenot wrote:
> On 06/20/2016 07:57 PM, Michael Ellerman wrote:
> > On Mon, 2016-06-20 at 08:51 -0500, Nathan Fontenot wrote:
> > 
> > > Auto online hotplugged memory
> > > 
> > > A recent update (commit id 31bc3858ea3) to the core mm hotplug code
> > > introduced the memhp_auto_online variable to allow for automatically
> > > onlining memory that is added.
> > > 
> > > This patch updates the pseries memory hotplug code to enable this so that
> > > any memory DLPAR added to the system is automatically onlined. The code
> > > to add the memory block for memory added from add_memory() is removed as
> > > it is not needed; the memory_add code does this.
> > 
> > Is this a bug fix, or just a cleanup?
> 
> Hmmm.. some cleanup and some new feature. The removal of the memblock_add()
> call is a cleanup and the setting of the memhp_auto_online variable is
> taking advantage of a feature I was not previously aware of.

OK. Looking at usage of memhp_auto_online it's not clear to me that you're
supposed to be setting it in arch code.

eg. if I build my kernel with CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=n, I will
expect it to not be onlined by default.

Similarly if I boot with memhp_default_state=offline on the kernel command line.

But this patch would then mean it is onlined by default. So that seems kind of
confusing for users.

I think instead we should be merging the bulk of this patch, but without the
forced assignment to memhp_auto_online?
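
For context, the core-mm side added by 31bc3858ea3 looks roughly like this
(abridged sketch of mm/memory_hotplug.c, so treat the details as indicative):
an unconditional assignment in arch code silently overrides both the Kconfig
default and this command-line handler.

	#ifdef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
	bool memhp_auto_online = true;
	#else
	bool memhp_auto_online;
	#endif

	static int __init cmdline_parse_memhp_default_state(char *str)
	{
		if (!strcmp(str, "online"))
			memhp_auto_online = true;
		else if (!strcmp(str, "offline"))
			memhp_auto_online = false;

		return 1;
	}
	__setup("memhp_default_state=", cmdline_parse_memhp_default_state);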

cheers


Re: [v2,1/2] refactor code parsing size based on memory range

2016-06-23 Thread Michael Ellerman
On Wed, 2016-22-06 at 19:25:26 UTC, Hari Bathini wrote:
> Currently, crashkernel parameter supports the below syntax to parse size
> based on memory range:
> 
>   crashkernel=<range1>:<size1>[,<range2>:<size2>,...]
> 
> While such parsing is currently implemented only for the crashkernel
> parameter, it can also apply to other parameters with similar syntax. So,
> move this code to a more generic place for code reuse.
> 
> Cc: Eric Biederman 
> Cc: Vivek Goyal 
> Cc: Rusty Russell 
> Cc: ke...@lists.infradead.org
> Signed-off-by: Hari Bathini 

Hari, it's not immediately clear that this makes no change to the logic in the
kexec code. Can you reply with a longer change log explaining why the old & new
logic is the same for kexec.
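
For reference, a typical instance of the range-based syntax (the example
used in the kdump documentation):

	crashkernel=512M-2G:64M,2G-:128M

i.e. reserve 64M of crash kernel memory when total RAM is between 512M and
2G, and 128M when it is 2G or more.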

cheers


> diff --git a/include/linux/kernel.h b/include/linux/kernel.h
> index 94aa10f..72f55e5 100644
> --- a/include/linux/kernel.h
> +++ b/include/linux/kernel.h
> @@ -436,6 +436,11 @@ extern char *get_options(const char *str, int nints, int *ints);
>  extern unsigned long long memparse(const char *ptr, char **retptr);
>  extern bool parse_option_str(const char *str, const char *option);
>  
> +extern bool __init is_param_range_based(const char *cmdline);
> +extern unsigned long long __init parse_mem_range_size(const char *param,
> +   char **str,
> +   unsigned long long system_ram);
> +
>  extern int core_kernel_text(unsigned long addr);
>  extern int core_kernel_data(unsigned long addr);
>  extern int __kernel_text_address(unsigned long addr);
> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
> index 56b3ed0..d43f5cc 100644
> --- a/kernel/kexec_core.c
> +++ b/kernel/kexec_core.c
> @@ -1083,59 +1083,9 @@ static int __init parse_crashkernel_mem(char *cmdline,
>   char *cur = cmdline, *tmp;
>  
>   /* for each entry of the comma-separated list */
> - do {
> - unsigned long long start, end = ULLONG_MAX, size;
> -
> - /* get the start of the range */
> - start = memparse(cur, &tmp);
> - if (cur == tmp) {
> - pr_warn("crashkernel: Memory value expected\n");
> - return -EINVAL;
> - }
> - cur = tmp;
> - if (*cur != '-') {
> - pr_warn("crashkernel: '-' expected\n");
> - return -EINVAL;
> - }
> - cur++;
> -
> - /* if no ':' is here, than we read the end */
> - if (*cur != ':') {
> - end = memparse(cur, &tmp);
> - if (cur == tmp) {
> - pr_warn("crashkernel: Memory value expected\n");
> - return -EINVAL;
> - }
> - cur = tmp;
> - if (end <= start) {
> - pr_warn("crashkernel: end <= start\n");
> - return -EINVAL;
> - }
> - }
> -
> - if (*cur != ':') {
> - pr_warn("crashkernel: ':' expected\n");
> - return -EINVAL;
> - }
> - cur++;
> -
> - size = memparse(cur, &tmp);
> - if (cur == tmp) {
> - pr_warn("Memory value expected\n");
> - return -EINVAL;
> - }
> - cur = tmp;
> - if (size >= system_ram) {
> - pr_warn("crashkernel: invalid size\n");
> - return -EINVAL;
> - }
> -
> - /* match ? */
> - if (system_ram >= start && system_ram < end) {
> - *crash_size = size;
> - break;
> - }
> - } while (*cur++ == ',');
> + *crash_size = parse_mem_range_size("crashkernel", &cur, system_ram);
> + if (cur == cmdline)
> + return -EINVAL;
>  
>   if (*crash_size > 0) {
>   while (*cur && *cur != ' ' && *cur != '@')
> @@ -1272,7 +1222,6 @@ static int __init __parse_crashkernel(char *cmdline,
>const char *name,
>const char *suffix)
>  {
> - char*first_colon, *first_space;
>   char*ck_cmdline;
>  
>   BUG_ON(!crash_size || !crash_base);
> @@ -1290,12 +1239,10 @@ static int __init __parse_crashkernel(char *cmdline,
>   return parse_crashkernel_suffix(ck_cmdline, crash_size,
>   suffix);
>   /*
> -  * if the commandline contains a ':', then that's the extended
> +  * if the parameter is range based, then that's the extended
>* syntax -- if not, it must be the classic syntax
>*/
> - first_colon = strchr(ck_cmdline, ':');
> - first_space = strchr(ck_cmdline, ' ');
> - if (first_colon && (!first_space || first_colon < first_space))

[PATCH] powerpc/eeh: Fix wrong argument passed to eeh_rmv_device()

2016-06-23 Thread Gavin Shan
When calling eeh_rmv_device() in eeh_reset_device() for partial
hotplug case, @rmv_data instead of its address is the proper
argument. Otherwise, the stack frame is corrupted when writing
to @rmv_data (actually its address) in eeh_rmv_device(). It
results in kernel crash as observed.

This fixes the issue by passing @rmv_data, not its address to
eeh_rmv_device() in eeh_reset_device().

Cc: sta...@vger.kernel.org # v4.6+
Fixes: 67086e32b564 ("powerpc/eeh: powerpc/eeh: Support error recovery for VF PE")
Reported-by: Pridhiviraj Paidipeddi 
Signed-off-by: Gavin Shan 
---
 arch/powerpc/kernel/eeh_driver.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 389b0d3..5c0429f 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -648,7 +648,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
pci_unlock_rescan_remove();
}
} else if (frozen_bus) {
-   eeh_pe_dev_traverse(pe, eeh_rmv_device, &rmv_data);
+   eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data);
}
 
/*
-- 
2.1.0


Re: [PATCH] powerpc/mm: update arch_{add,remove}_memory() for radix

2016-06-23 Thread Balbir Singh


On 24/06/16 03:17, Aneesh Kumar K.V wrote:
> Reza Arbab  writes:
> 
>> These functions are making direct calls to the hash table APIs,
>> leading to a BUG() on systems using radix.
>>
>> Switch them to the vmemmap_{create,remove}_mapping() wrappers, and
>> move to the __meminit section.
> 
> 
> They are really not the same. They can possibly end up using different
> base page size. Also vmemmap is available only with SPARSEMEM_VMEMMAP
> enabled. Does hotplug depend on sparsemem vmemmap ?

# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on ARCH_ENABLE_MEMORY_HOTPLUG

We depend on sparsemem for sure. vmemmap is just a way of getting the memory
virtually mapped. From the patch perspective, I think we need the equivalent
of just mapping the pages in the kernel. The address may differ based on
whether vmemmap is used or not, and of course on the page size.

Balbir Singh

[PATCH v5] powerpc/timer - large decrementer support

2016-06-23 Thread Oliver O'Halloran
Power ISAv3 adds a large decrementer (LD) mode which increases the size
of the decrementer register. The size of the enlarged decrementer
register is between 32 and 64 bits with the exact size being dependent
on the implementation. When in LD mode, reads are sign extended to 64
bits and a decrementer exception is raised when the high bit is set (i.e.
the value goes below zero). Writes however are truncated to the physical
register width so some care needs to be taken to ensure that the high
bit is not set when reloading the decrementer. This patch adds support
for using the LD inside the host kernel on processors that support it.

When LD mode is supported firmware will supply the ibm,dec-bits property
for CPU nodes to allow the kernel to determine the maximum decrementer
value. Enabling LD mode is a hypervisor privileged operation so the
kernel can only enable it manually when running in hypervisor mode.
Guest kernels that support LD mode can request it using the
"ibm,client-architecture-support" firmware call or some other platform
specific method. If this property is not supplied then the traditional
decrementer width of 32 bits is assumed and LD mode will not be enabled.
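
A sketch of how the maximum follows from the property (an assumption for
illustration, not necessarily the patch's exact code): with ibm,dec-bits = n,
the largest reload value that keeps the implementation's high (sign) bit
clear is 2^(n-1) - 1.

	static void __init set_decrementer_max(unsigned int dec_bits)
	{
		decrementer_max = (1ull << (dec_bits - 1)) - 1;
	}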

This patch was based on initial work by Jack Miller.

Signed-off-by: Oliver O'Halloran 
Signed-off-by: Balbir Singh 
Cc: Michael Neuling 
Cc: Jack Miller 
---
 arch/powerpc/include/asm/reg.h  |   1 +
 arch/powerpc/include/asm/time.h |   6 +--
 arch/powerpc/kernel/time.c  | 104 
 3 files changed, 100 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index a0948f40bc7b..12d970d64bb3 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -332,6 +332,7 @@
 #define   LPCR_AIL_0   0x00000000  /* MMU off exception offset 0x0 */
 #define   LPCR_AIL_3   0x01800000  /* MMU on exception offset 0xc00...4xxx */
 #define   LPCR_ONL     0x00040000  /* online - PURR/SPURR count */
+#define   LPCR_LD      0x00020000  /* large decrementer */
 #define   LPCR_PECE    0x0001f000  /* powersave exit cause enable */
 #define     LPCR_PECEDP 0x00010000 /* directed priv dbells cause exit */
 #define     LPCR_PECEDH 0x00008000 /* directed hyp dbells cause exit */
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 1092fdd7e737..09211640a0e0 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -146,7 +146,7 @@ static inline void set_tb(unsigned int upper, unsigned int lower)
  * in auto-reload mode.  The problem is PIT stops counting when it
  * hits zero.  If it would wrap, we could use it just like a decrementer.
  */
-static inline unsigned int get_dec(void)
+static inline u64 get_dec(void)
 {
 #if defined(CONFIG_40x)
return (mfspr(SPRN_PIT));
@@ -160,10 +160,10 @@ static inline unsigned int get_dec(void)
  * in when the decrementer generates its interrupt: on the 1 to 0
  * transition for Book E/4xx, but on the 0 to -1 transition for others.
  */
-static inline void set_dec(int val)
+static inline void set_dec(u64 val)
 {
 #if defined(CONFIG_40x)
-   mtspr(SPRN_PIT, val);
+   mtspr(SPRN_PIT, (u32) val);
 #else
 #ifndef CONFIG_BOOKE
--val;
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 7a482a7f4d8d..efebe52133ef 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -97,7 +97,8 @@ static struct clocksource clocksource_timebase = {
.read = timebase_read,
 };
 
-#define DECREMENTER_MAX 0x7fffffff
+#define DECREMENTER_DEFAULT_MAX 0x7FFFFFFF
+u64 decrementer_max = DECREMENTER_DEFAULT_MAX;
 
 static int decrementer_set_next_event(unsigned long evt,
  struct clock_event_device *dev);
@@ -505,8 +506,8 @@ static void __timer_interrupt(void)
__this_cpu_inc(irq_stat.timer_irqs_event);
} else {
now = *next_tb - now;
-   if (now <= DECREMENTER_MAX)
-   set_dec((int)now);
+   if (now <= decrementer_max)
+   set_dec(now);
/* We may have raced with new irq work */
if (test_irq_work_pending())
set_dec(1);
@@ -536,7 +537,7 @@ void timer_interrupt(struct pt_regs * regs)
/* Ensure a positive value is written to the decrementer, or else
 * some CPUs will continue to take decrementer exceptions.
 */
-   set_dec(DECREMENTER_MAX);
+   set_dec(decrementer_max);
 
/* Some implementations of hotplug will get timer interrupts while
 * offline, just ignore these and we also need to set
@@ -584,9 +585,9 @@ static void generic_suspend_disable_irqs(void)
 * with suspending.
 */
 
-   set_dec(DECREMENTER_MAX);
+   set_dec(decrementer_max);

Re: [PATCH v3 0/9] kexec_file_load implementation for PowerPC

2016-06-23 Thread Thiago Jung Bauermann
On Friday, 24 June 2016, 08:33:24, Balbir Singh wrote:
> On 24/06/16 02:44, Thiago Jung Bauermann wrote:
> > Sorry, I still don't understand your concern. What kind of cheating?
> > Which values? If it's the values in the event log, there's no need to
> > trust the old kernel. The new kernel knows that the old kernel didn't
> > pass wrong measurement values in the event log because it can
> > recalculate the PCR extend operations recorded in the log and compare
> > the results of the replay with the current PCR values stored in the TPM
> > device. If they match, then the event log is guaranteed to be correct.
> > If they don't match, either the memory was corrupted somehow during the
> > kexec process, or the old kernel tried to pass a falsified event log.
> 
> Yep, get it/got it. My concern was that anything using the passed-on values
> should compare the results with the current PCR values.
> 
> BTW, what do we gain by passing the values if we are relying on the PCR
> registers anyway? Can't we directly read them off from there? Aren't we
> going to read them anyway to compare, so what does passing the values gain?

The PCR values themselves change for reasons that the application/user may 
not care about. For example, just changing the order in which measurements 
are made changes the final value of the PCR, even if all the measurements 
themselves don't change. And in current multi-processor machines this order 
does change at each boot, so you can't rely on two boots of the same machine 
with the same software to have the same PCR values.

Also, you may want to verify only the measurement of one of the components 
and not care about the other components.

With an event log, you can verify the checksum of each measured component 
individually, and the PCR value serves to confirm that the event log is 
correct. Just having the final PCR value without the event log, you don't 
know which measurements were made.
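
As a concrete illustration of the replay described above, a minimal
userspace sketch (toy_hash is a stand-in so the sketch compiles; a real
check uses SHA-1 or SHA-256):

	#include <string.h>

	#define DIGEST_LEN 20	/* SHA-1 sized */

	/* toy stand-in for the real hash function */
	static void toy_hash(const unsigned char *buf, size_t len,
			     unsigned char *out)
	{
		unsigned char acc = 0;
		size_t i;

		for (i = 0; i < len; i++)
			acc ^= buf[i];
		memset(out, acc, DIGEST_LEN);
	}

	/* replay pcr' = H(pcr || event digest) over the whole log */
	static int replay_matches(const unsigned char events[][DIGEST_LEN],
				  size_t n, const unsigned char *live_pcr)
	{
		unsigned char pcr[DIGEST_LEN] = { 0 };
		unsigned char buf[2 * DIGEST_LEN];
		size_t i;

		for (i = 0; i < n; i++) {
			memcpy(buf, pcr, DIGEST_LEN);
			memcpy(buf + DIGEST_LEN, events[i], DIGEST_LEN);
			toy_hash(buf, sizeof(buf), pcr);
		}
		return memcmp(pcr, live_pcr, DIGEST_LEN) == 0;
	}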

> >> and
> >> 
> >> How do we know the new kernel is safe to load - I guess via a signature
> >> that the new kernel is signed with (assuming it is present in the key
> >> ring).
> > 
> > Correct. That goal is met by signature verification, not by integrity
> > assurance.
> > 
> > I'll note that even with both of my patch series there's still code
> > missing for kernel signature verification in PowerPC. I believe there's
> > not a file format defined yet for how to store a signature in a PowerPC
> > kernel image.
> > 
> > Integrity assurance doesn't depend on kernel signature verification
> > though. There's value in both my patch series even without kernel
> > signature verification support. They're complementary features.
> 
> Thanks for clarifying

Thank you for your interest.

-- 
[]'s
Thiago Jung Bauermann
IBM Linux Technology Center


Re: [PATCH v3 0/9] kexec_file_load implementation for PowerPC

2016-06-23 Thread Balbir Singh


On 24/06/16 02:44, Thiago Jung Bauermann wrote:
> On Thursday, 23 June 2016, 09:57:51, Balbir Singh wrote:
>> On 23/06/16 03:02, Thiago Jung Bauermann wrote:
> 3. have IMA pass-on its event log (where integrity measurements are
>
>registered) accross kexec to the second kernel, so that the event
>history is preserved.

 OK.. and this is safe? Do both the kernels need to be signed by the
 same certificate?
>>>
>>> They don't. The integrity of the event log (assuming that is what you
>>> mean by "this" in "this is safe") is guaranteed by the TPM device. Each
>>> event in the measurement list extends a PCR and records its PCR value.
>>> It is cryptographically guaranteed that if you replay the PCR extends
>>> recorded in the event log and in the end of the process they match the
>>> current PCR values in the TPM device, then that event log is correct.
>>
>> What I meant was how does the new kernel know that the old kernel did not
>> cheat while passing on the values? I presume because we trust that kernel
>> via a signature.
> 
> Sorry, I still don't understand your concern. What kind of cheating? Which 
> values? If it's the values in the event log, there's no need to trust the 
> old kernel. The new kernel knows that the old kernel didn't pass wrong 
> measurement values in the event log because it can recalculate the PCR 
> extend operations recorded in the log and compare the results of the replay 
> with the current PCR values stored in the TPM device. If they match, then 
> the event log is guaranteed to be correct. If they don't match, either the 
> memory was corrupted somehow during the kexec process, or the old kernel 
> tried to pass a falsified event log.
> 

Yep, get it/got it. My concern was that anything using the passed-on values
should compare the results with the current PCR values.

BTW, what do we gain by passing the values if we are relying on the PCR
registers anyway? Can't we directly read them off from there? Aren't we
going to read them anyway to compare, so what does passing the values gain?

[snip]

>> and
>>
>> How do we know the new kernel is safe to load - I guess via a signature
>> that the new kernel is signed with (assuming it is present in the key
>> ring).
> 
> Correct. That goal is met by signature verification, not by integrity 
> assurance.
> 
> I'll note that even with both of my patch series there's still code missing 
> for kernel signature verification in PowerPC. I believe there's not a file 
> format defined yet for how to store a signature in a PowerPC kernel image.
> 
> Integrity assurance doesn't depend on kernel signature verification though. 
> There's value in both my patch series even without kernel signature 
> verification support. They're complementary features.
>  

Thanks for clarifying

Balbir Singh.

Re: [RFC PATCH v2 05/18] sched: add task flag for preempt IRQ tracking

2016-06-23 Thread Andy Lutomirski
On Thu, Jun 23, 2016 at 1:40 PM, Josh Poimboeuf  wrote:
> On Thu, Jun 23, 2016 at 01:31:32PM -0500, Josh Poimboeuf wrote:
>> On Thu, Jun 23, 2016 at 09:35:29AM -0700, Andy Lutomirski wrote:
>> > > So which is the least-bad option?  To summarize:
>> > >
>> > >   1) task flag(s) for preemption and page faults
>> > >
>> > >   2) turn pt_regs into a stack frame
>> > >
>> > >   3) annotate all calls from entry code in a table
>> > >
>> > >   4) encode rbp on entry
>> > >
>> > > They all have their issues, though I'm partial to #2.
>> > >
>> > > Any more hare-brained ideas? :-)
>> >
>> > I'll try to take a closer look at #2 and see just how much I dislike
>> > all the stack frame munging.
>>
>> Ok.
>>
>> > Also, in principle, it's only the
>> > sleeping calls and the calls that make it into real (non-entry) kernel
>> > code that really want to be unwindable through this mechanism.
>>
>> Yeah, that's true.  We could modify options 2 or 3 to be less absolute.
>> Though I think that makes them more prone to future breakage.
>>
>> > FWIW, I don't care that much about preserving gdb's partial ability to
>> > unwind through pt_regs, especially because gdb really ought to be able
>> > to use DWARF, too.
>>
>> Hm, that's a good point.  I really don't know if there are any other
>> external tools out there that would care.  Maybe we could try option 4
>> and then see if anybody complains.
>
> I'm starting to think hare-brained option 4 is the way to go.  Any
> external tooling should really be relying on DWARF anyway.
>
> Here's a sneak preview.  If this general approach looks ok to you, I'll
> go ahead and port all the in-tree unwinders and post a proper patch.
>
> Instead of using xor -1 on the pt_regs pointer, I just cleared the
> high-order bit.  That makes the unwinding experience much more pleasant
> for a human stack walker, and also ensures that anybody trying to
> dereference it gets slapped with an oops, at least in the 48-bit address
> space era.
>
> diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
> index 9a9e588..bf397426 100644
> --- a/arch/x86/entry/calling.h
> +++ b/arch/x86/entry/calling.h
> @@ -201,6 +201,23 @@ For 32-bit we have the following conventions - kernel is built with
> .byte 0xf1
> .endm
>
> +   /*
> +* This is a sneaky trick to help the unwinder find pt_regs on the
> +* stack.  The frame pointer is replaced with an encoded pointer to
> +* pt_regs.  The encoding is just a clearing of the highest-order bit,
> +* which makes it an invalid address and is also a signal to the
> +* unwinder that it's a pt_regs pointer in disguise.
> +*
> +* NOTE: This must be called *after* SAVE_EXTRA_REGS because it
> +* corrupts rbp.
> +*/
> +   .macro ENCODE_FRAME_POINTER ptregs_offset=0
> +#ifdef CONFIG_FRAME_POINTER
> +   leaq \ptregs_offset(%rsp), %rbp
> +   btr $63, %rbp
> +#endif
> +   .endm
> +

Maybe optimize slightly:

.ifeq \ptregs_offset
mov %rsp, %rbp
.else
leaq \ptregs_offset(%rsp), %rbp
.endif
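
For completeness, a sketch of the consumer side: how an unwinder might
recognize and decode such an encoded %rbp (names here are illustrative, not
from the patch). Kernel addresses have bit 63 set, so a cleared top bit
marks the encoded case, and setting it back recovers the pt_regs address.

	static struct pt_regs *decode_frame_pointer(unsigned long bp)
	{
		if (bp & (1UL << 63))
			return NULL;	/* ordinary frame pointer */

		return (struct pt_regs *)(bp | (1UL << 63));
	}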

Re: [RFC PATCH v2 05/18] sched: add task flag for preempt IRQ tracking

2016-06-23 Thread Josh Poimboeuf
On Thu, Jun 23, 2016 at 01:31:32PM -0500, Josh Poimboeuf wrote:
> On Thu, Jun 23, 2016 at 09:35:29AM -0700, Andy Lutomirski wrote:
> > > So which is the least-bad option?  To summarize:
> > >
> > >   1) task flag(s) for preemption and page faults
> > >
> > >   2) turn pt_regs into a stack frame
> > >
> > >   3) annotate all calls from entry code in a table
> > >
> > >   4) encode rbp on entry
> > >
> > > They all have their issues, though I'm partial to #2.
> > >
> > > Any more hare-brained ideas? :-)
> > 
> > I'll try to take a closer look at #2 and see just how much I dislike
> > all the stack frame munging.
> 
> Ok.
> 
> > Also, in principle, it's only the
> > sleeping calls and the calls that make it into real (non-entry) kernel
> > code that really want to be unwindable through this mechanism.
> 
> Yeah, that's true.  We could modify options 2 or 3 to be less absolute.
> Though I think that makes them more prone to future breakage.
> 
> > FWIW, I don't care that much about preserving gdb's partial ability to
> > unwind through pt_regs, especially because gdb really ought to be able
> > to use DWARF, too.
> 
> Hm, that's a good point.  I really don't know if there are any other
> external tools out there that would care.  Maybe we could try option 4
> and then see if anybody complains.

I'm starting to think hare-brained option 4 is the way to go.  Any
external tooling should really be relying on DWARF anyway.

Here's a sneak preview.  If this general approach looks ok to you, I'll
go ahead and port all the in-tree unwinders and post a proper patch.

Instead of using xor -1 on the pt_regs pointer, I just cleared the
high-order bit.  That makes the unwinding experience much more pleasant
for a human stack walker, and also ensures that anybody trying to
dereference it gets slapped with an oops, at least in the 48-bit address
space era.

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 9a9e588..bf397426 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -201,6 +201,23 @@ For 32-bit we have the following conventions - kernel is built with
.byte 0xf1
.endm
 
+   /*
+* This is a sneaky trick to help the unwinder find pt_regs on the
+* stack.  The frame pointer is replaced with an encoded pointer to
+* pt_regs.  The encoding is just a clearing of the highest-order bit,
+* which makes it an invalid address and is also a signal to the
+* unwinder that it's a pt_regs pointer in disguise.
+*
+* NOTE: This must be called *after* SAVE_EXTRA_REGS because it
+* corrupts rbp.
+*/
+   .macro ENCODE_FRAME_POINTER ptregs_offset=0
+#ifdef CONFIG_FRAME_POINTER
+   leaq \ptregs_offset(%rsp), %rbp
+   btr $63, %rbp
+#endif
+   .endm
+
 #endif /* CONFIG_X86_64 */
 
 /*
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9ee0da1..eb79652 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -431,6 +431,7 @@ END(irq_entries_start)
ALLOC_PT_GPREGS_ON_STACK
SAVE_C_REGS
SAVE_EXTRA_REGS
+   ENCODE_FRAME_POINTER
 
testb   $3, CS(%rsp)
jz  1f
@@ -893,6 +894,7 @@ ENTRY(xen_failsafe_callback)
ALLOC_PT_GPREGS_ON_STACK
SAVE_C_REGS
SAVE_EXTRA_REGS
+   ENCODE_FRAME_POINTER
jmp error_exit
 END(xen_failsafe_callback)
 
@@ -936,6 +938,7 @@ ENTRY(paranoid_entry)
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
+   ENCODE_FRAME_POINTER 8
movl$1, %ebx
movl$MSR_GS_BASE, %ecx
rdmsr
@@ -983,6 +986,7 @@ ENTRY(error_entry)
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
+   ENCODE_FRAME_POINTER 8
xorl%ebx, %ebx
testb   $3, CS+8(%rsp)
jz  .Lerror_kernelspace
@@ -1165,6 +1169,7 @@ ENTRY(nmi)
pushq   %r13/* pt_regs->r13 */
pushq   %r14/* pt_regs->r14 */
pushq   %r15/* pt_regs->r15 */
+   ENCODE_FRAME_POINTER
 
/*
 * At this point we no longer need to worry about stack damage
@@ -1182,7 +1187,7 @@ ENTRY(nmi)
 * do_nmi doesn't modify pt_regs.
 */
SWAPGS
-   jmp restore_c_regs_and_iret
+   jmp restore_regs_and_iret
 
 .Lnmi_from_kernel:
/*

Re: [PATCH] powerpc/mm: update arch_{add,remove}_memory() for radix

2016-06-23 Thread Reza Arbab

On Thu, Jun 23, 2016 at 02:37:39PM -0500, Reza Arbab wrote:
> Could it be that the functions just need to be renamed
> hash__create_mapping()/radix__create_mapping() and moved out of #ifdef
> SPARSEMEM_VMEMMAP?

Or vice-versa, maybe the callers should have been wrapped in the first
place:


arch_add_memory() {
...
if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
vmemmap_create_mapping()
...
}

arch_remove_memory() {
...
if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
vmemmap_remove_mapping()
...
}

--
Reza Arbab


Re: [PATCH] powerpc/mm: update arch_{add,remove}_memory() for radix

2016-06-23 Thread Reza Arbab

On Thu, Jun 23, 2016 at 10:47:20PM +0530, Aneesh Kumar K.V wrote:
> Reza Arbab  writes:
>
> > These functions are making direct calls to the hash table APIs,
> > leading to a BUG() on systems using radix.
> >
> > Switch them to the vmemmap_{create,remove}_mapping() wrappers, and
> > move to the __meminit section.
>
> They are really not the same. They can possibly end up using different
> base page size. Also vmemmap is available only with SPARSEMEM_VMEMMAP
> enabled. Does hotplug depend on sparsemem vmemmap ?

I'm not sure. Maybe it's best if I back up a step and explain what led
me to this patch. During hotplug, you get

	...
	arch_add_memory
		create_section_mapping
			htab_bolt_mapping
				BUG_ON(!ppc_md.hpte_insert);

So it seemed to me that I needed a radix equivalent of
create_section_mapping().

After some digging, I found hash__vmemmap_create_mapping() and
radix__vmemmap_create_mapping() did what I needed. I did not notice the
#ifdef SPARSEMEM_VMEMMAP around them.

Hotplug/remove are now working for me as far as I can tell, so I think
the functional change in itself was correct. Hotplug may not depend on
sparsemem vmemmap, but we seem to need a radix create_section_mapping()
regardless.

Could it be that the functions just need to be renamed
hash__create_mapping()/radix__create_mapping() and moved out of #ifdef
SPARSEMEM_VMEMMAP?

--
Reza Arbab


Re: [RFC PATCH v2 05/18] sched: add task flag for preempt IRQ tracking

2016-06-23 Thread Josh Poimboeuf
On Thu, Jun 23, 2016 at 09:35:29AM -0700, Andy Lutomirski wrote:
> > So which is the least-bad option?  To summarize:
> >
> >   1) task flag(s) for preemption and page faults
> >
> >   2) turn pt_regs into a stack frame
> >
> >   3) annotate all calls from entry code in a table
> >
> >   4) encode rbp on entry
> >
> > They all have their issues, though I'm partial to #2.
> >
> > Any more hare-brained ideas? :-)
> 
> I'll try to take a closer look at #2 and see just how much I dislike
> all the stack frame munging.

Ok.

> Also, in principle, it's only the
> sleeping calls and the calls that make it into real (non-entry) kernel
> code that really want to be unwindable through this mechanism.

Yeah, that's true.  We could modify options 2 or 3 to be less absolute.
Though I think that makes them more prone to future breakage.

> FWIW, I don't care that much about preserving gdb's partial ability to
> unwind through pt_regs, especially because gdb really ought to be able
> to use DWARF, too.

Hm, that's a good point.  I really don't know if there are any other
external tools out there that would care.  Maybe we could try option 4
and then see if anybody complains.

-- 
Josh

Re: [PATCH] powerpc/mm: update arch_{add,remove}_memory() for radix

2016-06-23 Thread Nathan Fontenot
On 06/23/2016 12:17 PM, Aneesh Kumar K.V wrote:
> Reza Arbab  writes:
> 
>> These functions are making direct calls to the hash table APIs,
>> leading to a BUG() on systems using radix.
>>
>> Switch them to the vmemmap_{create,remove}_mapping() wrappers, and
>> move to the __meminit section.
> 
> 
> They are really not the same. They can possibly end up using different
> base page size. Also vmemmap is available only with SPARSEMEM_VMEMMAP
> enabled. Does hotplug depend on sparsemem vmemmap ?

Sparse vmemmap is supported, and I think enabled by default for pseries,
but hotplug does not depend on sparse vmemmep.

-Nathan

> 
>>
>> Signed-off-by: Reza Arbab 
>> ---
>>  arch/powerpc/mm/mem.c | 8 
>>  1 file changed, 4 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
>> index 2fd57fa..80c6ee7 100644
>> --- a/arch/powerpc/mm/mem.c
>> +++ b/arch/powerpc/mm/mem.c
>> @@ -116,7 +116,7 @@ int memory_add_physaddr_to_nid(u64 start)
>>  }
>>  #endif
>>
>> -int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
>> +int __meminit arch_add_memory(int nid, u64 start, u64 size, bool for_device)
>>  {
>>  struct pglist_data *pgdata;
>>  struct zone *zone;
>> @@ -127,7 +127,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
>>  pgdata = NODE_DATA(nid);
>>
>>  start = (unsigned long)__va(start);
>> -rc = create_section_mapping(start, start + size);
>> +rc = vmemmap_create_mapping(start, size, __pa(start));
>>  if (rc) {
>>  pr_warning(
>>  "Unable to create mapping for hot added memory 
>> 0x%llx..0x%llx: %d\n",
>> @@ -143,7 +143,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
>>  }
>>
>>  #ifdef CONFIG_MEMORY_HOTREMOVE
>> -int arch_remove_memory(u64 start, u64 size)
>> +int __meminit arch_remove_memory(u64 start, u64 size)
>>  {
>>  unsigned long start_pfn = start >> PAGE_SHIFT;
>>  unsigned long nr_pages = size >> PAGE_SHIFT;
>> @@ -157,7 +157,7 @@ int arch_remove_memory(u64 start, u64 size)
>>
>>  /* Remove htab bolted mappings for this section of memory */
>>  start = (unsigned long)__va(start);
>> -ret = remove_section_mapping(start, start + size);
>> +vmemmap_remove_mapping(start, size);
>>
>>  /* Ensure all vmalloc mappings are flushed in case they also
>>   * hit that section of memory
>> -- 
>> 1.8.3.1
> 
> We really want to look at memory hotplug to fully understand the
> radix impact. The unplug operation needs to do some additional freeing.
> 
> I did put in comments around that with the idea of coming back to that
> later
> 
> #ifdef CONFIG_MEMORY_HOTPLUG
> void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
> {
>   /* FIXME!! intel does more. We should free page tables mapping vmemmap ? */
> }
> #endif
> 
> -aneesh
> 
> 


Re: [PATCH] powerpc/mm: update arch_{add,remove}_memory() for radix

2016-06-23 Thread Aneesh Kumar K.V
Reza Arbab  writes:

> These functions are making direct calls to the hash table APIs,
> leading to a BUG() on systems using radix.
>
> Switch them to the vmemmap_{create,remove}_mapping() wrappers, and
> move to the __meminit section.


They are really not the same. They can possibly end up using different
base page size. Also vmemmap is available only with SPARSEMEM_VMEMMAP
enabled. Does hotplug depend on sparsemem vmemmap ?

>
> Signed-off-by: Reza Arbab 
> ---
>  arch/powerpc/mm/mem.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index 2fd57fa..80c6ee7 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -116,7 +116,7 @@ int memory_add_physaddr_to_nid(u64 start)
>  }
>  #endif
>
> -int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
> +int __meminit arch_add_memory(int nid, u64 start, u64 size, bool for_device)
>  {
>   struct pglist_data *pgdata;
>   struct zone *zone;
> @@ -127,7 +127,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
>   pgdata = NODE_DATA(nid);
>
>   start = (unsigned long)__va(start);
> - rc = create_section_mapping(start, start + size);
> + rc = vmemmap_create_mapping(start, size, __pa(start));
>   if (rc) {
>   pr_warning(
>   "Unable to create mapping for hot added memory 
> 0x%llx..0x%llx: %d\n",
> @@ -143,7 +143,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
>  }
>
>  #ifdef CONFIG_MEMORY_HOTREMOVE
> -int arch_remove_memory(u64 start, u64 size)
> +int __meminit arch_remove_memory(u64 start, u64 size)
>  {
>   unsigned long start_pfn = start >> PAGE_SHIFT;
>   unsigned long nr_pages = size >> PAGE_SHIFT;
> @@ -157,7 +157,7 @@ int arch_remove_memory(u64 start, u64 size)
>
>   /* Remove htab bolted mappings for this section of memory */
>   start = (unsigned long)__va(start);
> - ret = remove_section_mapping(start, start + size);
> + vmemmap_remove_mapping(start, size);
>
>   /* Ensure all vmalloc mappings are flushed in case they also
>* hit that section of memory
> -- 
> 1.8.3.1

We really want to look at memory hotplug to fully understand the
radix impact. The unplug operation needs to do some additional freeing.

I did put in comments around that with the idea of coming back to that
later

#ifdef CONFIG_MEMORY_HOTPLUG
void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
	/* FIXME!! intel does more. We should free page tables mapping vmemmap ? */
}
#endif

-aneesh


[PATCH] powerpc/powernv: spelling mistake: "Retrived" -> "Retrieved"

2016-06-23 Thread Colin King
From: Colin Ian King 

trivial fix to spelling mistake in pr_debug message

Signed-off-by: Colin Ian King 
---
 arch/powerpc/platforms/powernv/opal-memory-errors.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/opal-memory-errors.c b/arch/powerpc/platforms/powernv/opal-memory-errors.c
index 00a2943..4495f42 100644
--- a/arch/powerpc/platforms/powernv/opal-memory-errors.c
+++ b/arch/powerpc/platforms/powernv/opal-memory-errors.c
@@ -44,7 +44,7 @@ static void handle_memory_error_event(struct OpalMemoryErrorData *merr_evt)
 {
uint64_t paddr_start, paddr_end;
 
-   pr_debug("%s: Retrived memory error event, type: 0x%x\n",
+   pr_debug("%s: Retrieved memory error event, type: 0x%x\n",
  __func__, merr_evt->type);
switch (merr_evt->type) {
case OPAL_MEM_ERR_TYPE_RESILIENCE:
-- 
2.8.1


Re: [PATCH v3 0/9] kexec_file_load implementation for PowerPC

2016-06-23 Thread Thiago Jung Bauermann
On Thursday, 23 June 2016, 09:57:51, Balbir Singh wrote:
> On 23/06/16 03:02, Thiago Jung Bauermann wrote:
> >>> 3. have IMA pass-on its event log (where integrity measurements are
> >>> 
> >>>registered) accross kexec to the second kernel, so that the event
> >>>history is preserved.
> >> 
> >> OK.. and this is safe? Do both the kernels need to be signed by the
> >> same certificate?
> > 
> > They don't. The integrity of the event log (assuming that is what you
> > mean by "this" in "this is safe") is guaranteed by the TPM device. Each
> > event in the measurement list extends a PCR and records its PCR value.
> > It is cryptographically guaranteed that if you replay the PCR extends
> > recorded in the event log and in the end of the process they match the
> > current PCR values in the TPM device, then that event log is correct.
> 
> What I meant was how does the new kernel know that the old kernel did not
> cheat while passing on the values? I presume because we trust that kernel
> via a signature.

Sorry, I still don't understand your concern. What kind of cheating? Which 
values? If it's the values in the event log, there's no need to trust the 
old kernel. The new kernel knows that the old kernel didn't pass wrong 
measurement values in the event log because it can recalculate the PCR 
extend operations recorded in the log and compare the results of the replay 
with the current PCR values stored in the TPM device. If they match, then 
the event log is guaranteed to be correct. If they don't match, either the 
memory was corrupted somehow during the kexec process, or the old kernel 
tried to pass a falsified event log.

There's no known way to construct an alternative series of PCR extend 
operations that will result in the same final value in the PCR register of 
the TPM device. If you can do that, you discovered a hash collision attack 
on the SHA-1 or SHA-256 algorithms (depending on which algorithm is being 
used by IMA in the event log). Or a bug in the TPM device implementation.

> and
> 
> How do we know the new kernel is safe to load - I guess via a signature
> that the new kernel is signed with (assuming it is present in the key
> ring).

Correct. That goal is met by signature verification, not by integrity 
assurance.

I'll note that even with both of my patch series there's still code missing 
for kernel signature verification in PowerPC. I believe there's not a file 
format defined yet for how to store a signature in a PowerPC kernel image.

Integrity assurance doesn't depend on kernel signature verification though. 
There's value in both my patch series even without kernel signature 
verification support. They're complementary features.
 
[]'s
Thiago Jung Bauermann
IBM Linux Technology Center


[PATCH] powerpc/mm: update arch_{add,remove}_memory() for radix

2016-06-23 Thread Reza Arbab
These functions are making direct calls to the hash table APIs,
leading to a BUG() on systems using radix.

Switch them to the vmemmap_{create,remove}_mapping() wrappers, and
move to the __meminit section.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/mm/mem.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 2fd57fa..80c6ee7 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -116,7 +116,7 @@ int memory_add_physaddr_to_nid(u64 start)
 }
 #endif
 
-int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
+int __meminit arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
struct pglist_data *pgdata;
struct zone *zone;
@@ -127,7 +127,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
pgdata = NODE_DATA(nid);
 
start = (unsigned long)__va(start);
-   rc = create_section_mapping(start, start + size);
+   rc = vmemmap_create_mapping(start, size, __pa(start));
if (rc) {
pr_warning(
"Unable to create mapping for hot added memory 
0x%llx..0x%llx: %d\n",
@@ -143,7 +143,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-int arch_remove_memory(u64 start, u64 size)
+int __meminit arch_remove_memory(u64 start, u64 size)
 {
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -157,7 +157,7 @@ int arch_remove_memory(u64 start, u64 size)
 
/* Remove htab bolted mappings for this section of memory */
start = (unsigned long)__va(start);
-   ret = remove_section_mapping(start, start + size);
+   vmemmap_remove_mapping(start, size);
 
/* Ensure all vmalloc mappings are flushed in case they also
 * hit that section of memory
-- 
1.8.3.1


Re: [RFC PATCH v2 05/18] sched: add task flag for preempt IRQ tracking

2016-06-23 Thread Andy Lutomirski
On Thu, Jun 23, 2016 at 9:19 AM, Josh Poimboeuf  wrote:
> On Wed, Jun 22, 2016 at 12:17:25PM -0700, Andy Lutomirski wrote:
>> On Wed, Jun 22, 2016 at 11:40 AM, Josh Poimboeuf  wrote:
>> > On Wed, Jun 22, 2016 at 11:26:21AM -0700, Andy Lutomirski wrote:
>> >> >
>> >> > So are you suggesting something like:
>> >> >
>> >> > .macro ENTRY_CALL func pt_regs_offset=0
>> >> > call \func
>> >> > 1:  .pushsection .entry_calls, "a"
>> >> > .long 1b - .
>> >> > .long \pt_regs_offset
>> >> > .popsection
>> >> > .endm
>> >> >
>> >> > and then change every call in the entry code to ENTRY_CALL?
>> >>
>> >> Yes, exactly, modulo whether the section name is good.  hpa is
>> >> probably the authority on that.
>> >
>> > Well, as you probably know, I don't really like peppering ENTRY_CALL
>> > everywhere. :-/
>>
>> Me neither.  But at least it's less constraining on the
>> already-fairly-hairy code.
>>
>> >
>> > Also I wonder how we could annotate the hypercalls, for example
>> > DISABLE_INTERRUPTS actually wraps the call in a push/pop pair.
>>
>> Oh, yuck.  But forcing all the DISABLE_INTERRUPTS and
>> ENABLE_INTERRUPTS invocations to be in frame pointer regions isn't so
>> great either.
>
> Hm, I don't follow this statement.  Why not?  The more frame pointer
> coverage, the better, especially if it doesn't add any additional
> overhead.

Less flexibility, and it's IMO annoying to make the Xen case have
extra constraints.  It also makes it very awkward or impossible to
take advantage of the sti interrupt window, although admittedly that
doesn't work on Xen either, so maybe that's moot.

>
>> DWARF solves this problem completely and IMO fairly cleanly.  Maybe we
>> should add your task flag and then consider removing it again when
>> DWARF happens.
>
> I tend to doubt we'd be able to remove it later.  As you said before,
> many embedded platforms probably won't be able to switch to DWARF, and
> they'll want to do live patching too.
>
> So which is the least-bad option?  To summarize:
>
>   1) task flag(s) for preemption and page faults
>
>   2) turn pt_regs into a stack frame
>
>   3) annotate all calls from entry code in a table
>
>   4) encode rbp on entry
>
> They all have their issues, though I'm partial to #2.
>
> Any more hare-brained ideas? :-)

I'll try to take a closer look at #2 and see just how much I dislike
all the stack frame munging.  Also, in principle, it's only the
sleeping calls and the calls that make it into real (non-entry) kernel
code that really want to be unwindable through this mechanism.

FWIW, I don't care that much about preserving gdb's partial ability to
unwind through pt_regs, especially because gdb really ought to be able
to use DWARF, too.

--Andy

Re: [RFC PATCH v2 05/18] sched: add task flag for preempt IRQ tracking

2016-06-23 Thread Josh Poimboeuf
On Wed, Jun 22, 2016 at 12:17:25PM -0700, Andy Lutomirski wrote:
> On Wed, Jun 22, 2016 at 11:40 AM, Josh Poimboeuf  wrote:
> > On Wed, Jun 22, 2016 at 11:26:21AM -0700, Andy Lutomirski wrote:
> >> >
> >> > So are you suggesting something like:
> >> >
> >> > .macro ENTRY_CALL func pt_regs_offset=0
> >> > call \func
> >> > 1:  .pushsection .entry_calls, "a"
> >> > .long 1b - .
> >> > .long \pt_regs_offset
> >> > .popsection
> >> > .endm
> >> >
> >> > and then change every call in the entry code to ENTRY_CALL?
> >>
> >> Yes, exactly, modulo whether the section name is good.  hpa is
> >> probably the authority on that.
> >
> > Well, as you probably know, I don't really like peppering ENTRY_CALL
> > everywhere. :-/
> 
> Me neither.  But at least it's less constraining on the
> already-fairly-hairy code.
> 
> >
> > Also I wonder how we could annotate the hypercalls, for example
> > DISABLE_INTERRUPTS actually wraps the call in a push/pop pair.
> 
> Oh, yuck.  But forcing all the DISABLE_INTERRUPTS and
> ENABLE_INTERRUPTS invocations to be in frame pointer regions isn't so
> great either.

Hm, I don't follow this statement.  Why not?  The more frame pointer
coverage, the better, especially if it doesn't add any additional
overhead.

> DWARF solves this problem completely and IMO fairly cleanly.  Maybe we
> should add your task flag and then consider removing it again when
> DWARF happens.

I tend to doubt we'd be able to remove it later.  As you said before,
many embedded platforms probably won't be able to switch to DWARF, and
they'll want to do live patching too.

So which is the least-bad option?  To summarize:

  1) task flag(s) for preemption and page faults

  2) turn pt_regs into a stack frame

  3) annotate all calls from entry code in a table

  4) encode rbp on entry

They all have their issues, though I'm partial to #2.

Any more hare-brained ideas? :-)

-- 
Josh

Re: [RFC PATCH v2 05/18] sched: add task flag for preempt IRQ tracking

2016-06-23 Thread Josh Poimboeuf
On Wed, Jun 22, 2016 at 05:09:11PM -0700, Andy Lutomirski wrote:
> On Wed, Jun 22, 2016 at 9:30 AM, Josh Poimboeuf  wrote:
> > Andy,
> >
> > So I got a chance to look at this some more.  I'm thinking that to make
> > this feature more consistently useful, we shouldn't only annotate
> > pt_regs frames for calls to handlers; other calls should be annotated as
> > well: preempt_schedule_irq, CALL_enter_from_user_mode,
> > prepare_exit_to_usermode, SWAPGS, TRACE_IRQS_OFF, DISABLE_INTERRUPTS,
> > etc.  That way, the unwinder will always be able to find pt_regs from an
> > interrupt/exception, even if starting from one of these other calls.
> >
> > But then, things get ugly.  You have to either setup and tear down the
> > frame for every possible call, or do a higher-level setup/teardown
> > across multiple calls, which invalidates several assumptions in the
> > entry code about the location of pt_regs on the stack.
> >
> 
> Here's yet another harebrained idea.  Maybe it works better than my
> previous harebrained ideas :)
> 
> Your patch is already creating a somewhat nonstandard stack frame:
> 
> +   movq%rbp,   0*8(%rsp)
> +   movq$entry_frame_ret,   1*8(%rsp)
> +   movq%rsp, %rbp
> 
> It's kind of a normal stack frame, but rbp points at something odd,
> and to unwind it fully correctly, the unwinder needs to know about it.
> 
> What if we made it even more special, along the lines of:
> 
> leaq offset_to_ptregs(%rsp), %rbp
> xorq $-1, %rbp
> 
> IOW, don't write anything to the stack at all, and just put a special
> value into RBP that says "the next frame is pt_regs at such-and-such
> address".  Do this once on entry and make sure to restore RBP (from
> pt_regs) on exit.  Now the unwinder can notice that RBP has the high
> bits clear *and* that the negation of it points to the stack, and it
> can figure out what's going on.
> 
> What do you think?  Am I nuts or could this work?
> 
> It had better not have much risk of breaking things worse than they
> currently are, given that current kernel allow user code to stick any
> value it likes into the very last element of the RBP chain.

I think it's a good idea, and it could work... BUT it would break
external unwinders like gdb for the in-kernel entry case.

For interrupts and exceptions in kernel mode, rbp *is* valid.  Sure, it
doesn't tell you the interrupted function, but it does tell you its
caller.  A generic frame pointer unwinder skips the interrupted
function, but at least it keeps going.  If we encoded rbp on entry, that
would break.

-- 
Josh

Re: [PATCH v3 3/9] kexec_file: Factor out kexec_locate_mem_hole from kexec_add_buffer.

2016-06-23 Thread Thiago Jung Bauermann
On Thursday, 23 June 2016, 01:44:07, Dave Young wrote:
> Hmm, hold on. For declaring a struct in a header file, the comment should
> be just after each field, like below; your format is for a function instead:
> struct pci_slot {
> 	struct pci_bus *bus;		/* The bus this slot is on */
> 	struct list_head list;		/* node in list of slots on this bus */
> 	struct hotplug_slot *hotplug;	/* Hotplug info (migrate over time) */
> 	unsigned char number;		/* PCI_SLOT(pci_dev->devfn) */
> 	struct kobject kobj;
> };

The comment style you mention above is not extractable documentation. The 
style I used is what is described in section "kernel-doc for structs, 
unions, enums, and typedefs" in Documentation/kernel-doc-nano-HOWTO.txt.
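
For illustration, the extractable style being referred to (with a
hypothetical, abridged field set):

	/**
	 * struct kexec_buf - parameters for locating a buffer
	 * @image:	kexec image the buffer is being added to
	 * @memsz:	size of the wanted memory region
	 * @top_down:	search for the highest free hole, not the lowest
	 */
	struct kexec_buf {
		struct kimage *image;
		unsigned long memsz;
		bool top_down;
	};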
 
> BTW, what is @size? there's no size field in kexec_buf. I think it is not
> necessary to add these comment, they are easy to understand. If you really
> want, please rewrite them correctly, for example "image" description is
> wrong. It is not only for searching memory only, top_down description is
> also bad.

Sorry, I moved these comments from kexec_locate_mem_hole but forgot to 
rename the parameters to what they are called in struct kexec_buf. @size 
should have been @memsz (other fields also have wrong names, I'll fix them 
as well). The image description is correct in the context of where struct 
kexec_buf is used and explains what it will be used for in the function 
taking kexec_buf as an argument. It is not meant as a general description of 
the purpose of struct kimage. What is bad about the description of top_down?

I decided to add these comments because struct kexec_buf is now part of the 
kernel API for kexec. kernel-doc-nano-HOWTO.txt says:

> We definitely need kernel-doc formatted documentation for functions
> that are exported to loadable modules using EXPORT_SYMBOL.
> 
> We also look to provide kernel-doc formatted documentation for
> functions externally visible to other kernel files (not marked
> "static").
> 
> We also recommend providing kernel-doc formatted documentation
> for private (file "static") routines, for consistency of kernel
> source code layout.  But this is lower priority and at the
> discretion of the MAINTAINER of that kernel source file.

If you think they are not necessary or just add clutter I can leave them 
out.

-- 
[]'s
Thiago Jung Bauermann
IBM Linux Technology Center


Re: [PATCH] cpuidle/powernv: Fix snooze timeout

2016-06-23 Thread Daniel Lezcano

On 06/23/2016 03:35 PM, Shreyas B Prabhu wrote:
> On 06/23/2016 03:31 PM, Daniel Lezcano wrote:
>> On 06/23/2016 11:28 AM, Balbir Singh wrote:
>>
>> [ ... ]
>>
>>>> cpuidle_enter_state()
>>>> {
>>>> 	[...]
>>>> 	time_start = local_clock();
>>>> 	[enter idle state]
>>>> 	time_end = local_clock();
>>>> 	/*
>>>> 	 * local_clock() returns the time in nanosecond, let's shift
>>>> 	 * by 10 (divide by 1024) to have microsecond based time.
>>>> 	 */
>>>> 	diff = (time_end - time_start) >> 10;
>>>> 	[...]
>>>> 	dev->last_residency = (int) diff;
>>>> }
>>>>
>>>> Because of >>10 as opposed to /1000, last_residency is lesser by 2.3%
>>
>> I am surprised the last_residency is 2.3% exactly less. The difference
>> between >>10 and /1000 is 2.34%.
>>
>> What is the next target residency value ?
>>
> Target residency of the next idle state is 100 microseconds.
> When snooze times out after 100 microseconds, last_residency value
> calculated is typically 97 or 98 microseconds.

I see, the snooze exit is very fast.

>> Does it solve the issue if you replace >>10 by /1000 ?

> Yes it does.

Ok. IMO, it would be cleaner to fix this in the core code.

  -- Daniel



Re: [PATCH] cpuidle/powernv: Fix snooze timeout

2016-06-23 Thread Shreyas B Prabhu


On 06/23/2016 03:31 PM, Daniel Lezcano wrote:
> On 06/23/2016 11:28 AM, Balbir Singh wrote:
> 
> [ ... ]
> 
>>> cpuidle_enter_state()
>>> {
>>> [...]
>>> time_start = local_clock();
>>> [enter idle state]
>>> time_end = local_clock();
>>> /*
>>>   * local_clock() returns the time in nanosecond, let's shift
>>>   * by 10 (divide by 1024) to have microsecond based time.
>>>   */
>>>  diff = (time_end - time_start) >> 10;
>>> [...]
>>> dev->last_residency = (int) diff;
>>> }
>>>
>>> Because of >>10 as opposed to /1000, last_residency is lesser by 2.3%
> 
> I am surprised the last_residency is 2.3% exactly less. The difference
> between >>10 and /1000 is 2.34%.
> 
> What is the next target residency value ?
> 
Target residency of the next idle state is 100 microseconds.
When snooze times out after 100 microseconds, last_residency value
calculated is typically 97 or 98 microseconds.
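
The arithmetic checks out (a quick userspace sketch, not kernel code):

	#include <stdio.h>

	int main(void)
	{
		long long ns = 100000;	/* a 100 us snooze, in nanoseconds */

		printf("%lld\n", ns >> 10);	/* 97  -- >>10 divides by 1024 */
		printf("%lld\n", ns / 1000);	/* 100 -- exact microseconds   */
		/* under-reported by 1 - 1000/1024 = 24/1024, i.e. ~2.34% */
		return 0;
	}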

> Does it solve the issue if you replace >>10 by /1000 ?
> 

Yes it does.

--Shreyas


[v6,2/2] cxl: Add set and get private data to context struct

2016-06-23 Thread Philippe Bergheaud
From: Michael Neuling 

This provides AFU drivers a means to associate private data with a cxl
context. This is particularly intended to make the new callbacks for
driver specific events easier for AFU drivers to use, as they can easily
get back to any private data structures they may use.
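
A sketch of the intended usage from a hypothetical AFU driver (my_afu_data,
my_afu_probe and my_afu_event are illustrative names, not from any real
driver):

	#include <misc/cxl.h>

	struct my_afu_data {
		int instance;
	};

	static struct my_afu_data adata;

	static int my_afu_probe(struct pci_dev *dev)
	{
		struct cxl_context *ctx = cxl_dev_context_init(dev);

		if (IS_ERR(ctx))
			return PTR_ERR(ctx);

		cxl_set_priv(ctx, &adata);	/* stash the back pointer */
		return 0;
	}

	/* later, e.g. from an event callback */
	static void my_afu_event(struct cxl_context *ctx)
	{
		struct my_afu_data *d = cxl_get_priv(ctx);

		if (!IS_ERR_OR_NULL(d))
			d->instance++;
	}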

Signed-off-by: Michael Neuling 
Signed-off-by: Ian Munsie 
Signed-off-by: Philippe Bergheaud 
---
No changes since v1. Added Matt Ochs reviewed-by tag.

 drivers/misc/cxl/api.c | 21 +
 drivers/misc/cxl/cxl.h |  3 +++
 include/misc/cxl.h |  7 +++
 3 files changed, 31 insertions(+)

diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c
index dd1988e..271bf77 100644
--- a/drivers/misc/cxl/api.c
+++ b/drivers/misc/cxl/api.c
@@ -94,6 +94,27 @@ static irq_hw_number_t cxl_find_afu_irq(struct cxl_context *ctx, int num)
return 0;
 }
 
+
+int cxl_set_priv(struct cxl_context *ctx, void *priv)
+{
+   if (!ctx)
+   return -EINVAL;
+
+   ctx->priv = priv;
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(cxl_set_priv);
+
+void *cxl_get_priv(struct cxl_context *ctx)
+{
+   if (!ctx)
+   return ERR_PTR(-EINVAL);
+
+   return ctx->priv;
+}
+EXPORT_SYMBOL_GPL(cxl_get_priv);
+
 int cxl_allocate_afu_irqs(struct cxl_context *ctx, int num)
 {
int res;
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index b0027e6..1e56304 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -478,6 +478,9 @@ struct cxl_context {
/* Only used in PR mode */
u64 process_token;
 
+   /* driver private data */
+   void *priv;
+
unsigned long *irq_bitmap; /* Accessed from IRQ context */
struct cxl_irq_ranges irqs;
struct list_head irq_names;
diff --git a/include/misc/cxl.h b/include/misc/cxl.h
index 17419f6..b6d040f 100644
--- a/include/misc/cxl.h
+++ b/include/misc/cxl.h
@@ -86,6 +86,13 @@ struct cxl_context *cxl_dev_context_init(struct pci_dev *dev);
 int cxl_release_context(struct cxl_context *ctx);
 
 /*
+ * Set and get private data associated with a context. Allows drivers to have a
+ * back pointer to some useful structure.
+ */
+int cxl_set_priv(struct cxl_context *ctx, void *priv);
+void *cxl_get_priv(struct cxl_context *ctx);
+
+/*
  * Allocate AFU interrupts for this context. num=0 will allocate the default
  * for this AFU as given in the AFU descriptor. This number doesn't include the
  * interrupt 0 (CAIA defines AFU IRQ 0 for page faults). Each interrupt to be
-- 
2.8.0
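
For context, a usage illustration (hypothetical AFU driver code; the
my_afu_* names are invented, only cxl_set_priv()/cxl_get_priv() come
from this patch):

	struct my_afu_ctx {
		int event_count;
	};

	static int my_afu_init_context(struct cxl_context *ctx)
	{
		struct my_afu_ctx *my = kzalloc(sizeof(*my), GFP_KERNEL);

		if (!my)
			return -ENOMEM;
		/* stash the back pointer for later callbacks */
		return cxl_set_priv(ctx, my);
	}

	static void my_afu_on_event(struct cxl_context *ctx)
	{
		struct my_afu_ctx *my = cxl_get_priv(ctx);

		if (!IS_ERR_OR_NULL(my))
			my->event_count++;
	}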


[v7, 1/2] cxl: Add mechanism for delivering AFU driver specific events

2016-06-23 Thread Philippe Bergheaud
This adds an afu_driver_ops structure with fetch_event() and
event_delivered() callbacks. An AFU driver such as cxlflash can fill
this out and associate it with a context to enable passing custom AFU
specific events to userspace.

This also adds a new kernel API function cxl_context_pending_events(),
that the AFU driver can use to notify the cxl driver that new specific
events are ready to be delivered, and wake up anyone waiting on the
context wait queue.

The current count of AFU driver specific events is stored in the field
afu_driver_events of the context structure.

The cxl driver checks the afu_driver_events count during poll, select,
read, etc. calls to check if an AFU driver specific event is pending,
and calls fetch_event() to obtain and deliver that event. This way, the
cxl driver takes care of all the usual locking semantics around these
calls and handles all the generic cxl events, so that the AFU driver
only needs to worry about its own events.

fetch_event() returns a struct cxl_event_afu_driver_reserved, allocated
by the AFU driver, and filled in with the specific event information and
size. Total event size (header + data) should not be greater than
CXL_READ_MIN_SIZE (4K).

The cxl driver prepends an appropriate cxl event header, copies the event
to userspace, and finally calls event_delivered() to return the status of
the operation to the AFU driver. The event is identified by the context
and cxl_event_afu_driver_reserved pointers.

Since AFU drivers provide their own means for userspace to obtain the
AFU file descriptor (i.e. cxlflash uses an ioctl on their scsi file
descriptor to obtain the AFU file descriptor) and the generic cxl driver
will never use this event, the ABI of the event is up to each individual
AFU driver.

Signed-off-by: Philippe Bergheaud 
---
Changes since v6:
- Dropped cxl_unset_driver_ops
  (passing NULL to cxl_set_driver_ops will do)
- Dropped CXL_MAX_EVENT_DATA_SIZE
  (event size now limited to CXL_READ_MIN_SIZE)

Changes since v5:
- s/deliver_event/fetch_event/
- Fixed the handling of fetch_event errors
- Documented the return codes of event_delivered

Changes since v4:
- Addressed comments from Vaibhav:
  - Changed struct cxl_event_afu_driver_reserved from
{ __u64 reserved[4]; } to { size_t data_size; u8 data[]; }
  - Modified deliver_event to return a struct cxl_event_afu_driver_reserved
  - Added new callback event_delivered
  - Added static function afu_driver_event_copy

Changes since v3:
- Removed driver ops callback ctx_event_pending
- Created cxl function cxl_context_pending_events
- Created cxl function cxl_unset_driver_ops
- Added atomic event counter afu_driver_events

Changes since v2:
- Fixed some typos spotted by Matt Ochs

Changes since v1:
- Rebased on upstream
- Bumped cxl api version to 3
- Addressed comments from mpe:
  - Clarified commit message & some comments
  - Mentioned 'cxlflash' as a possible user of this event
  - Check driver ops on registration and warn if missing calls
  - Remove redundant checks where driver ops is used
  - Simplified ctx_event_pending and removed underscore version
  - Changed deliver_event to take the context as the first argument

 drivers/misc/cxl/Kconfig |  5 
 drivers/misc/cxl/api.c   | 17 +
 drivers/misc/cxl/cxl.h   |  7 +-
 drivers/misc/cxl/file.c  | 64 ++--
 include/misc/cxl.h   | 48 
 include/uapi/misc/cxl.h  | 17 +
 6 files changed, 149 insertions(+), 9 deletions(-)

diff --git a/drivers/misc/cxl/Kconfig b/drivers/misc/cxl/Kconfig
index 8756d06..560412c 100644
--- a/drivers/misc/cxl/Kconfig
+++ b/drivers/misc/cxl/Kconfig
@@ -15,12 +15,17 @@ config CXL_EEH
bool
default n
 
+config CXL_AFU_DRIVER_OPS
+   bool
+   default n
+
 config CXL
tristate "Support for IBM Coherent Accelerators (CXL)"
depends on PPC_POWERNV && PCI_MSI && EEH
select CXL_BASE
select CXL_KERNEL_API
select CXL_EEH
+   select CXL_AFU_DRIVER_OPS
default m
help
  Select this option to enable driver support for IBM Coherent
diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c
index 6d228cc..dd1988e 100644
--- a/drivers/misc/cxl/api.c
+++ b/drivers/misc/cxl/api.c
@@ -323,6 +323,23 @@ struct cxl_context *cxl_fops_get_context(struct file *file)
 }
 EXPORT_SYMBOL_GPL(cxl_fops_get_context);
 
+void cxl_set_driver_ops(struct cxl_context *ctx,
+   struct cxl_afu_driver_ops *ops)
+{
+   WARN_ON(!ops->fetch_event || !ops->event_delivered);
+   atomic_set(&ctx->afu_driver_events, 0);
+   ctx->afu_driver_ops = ops;
+}
+EXPORT_SYMBOL_GPL(cxl_set_driver_ops);
+
+void cxl_context_events_pending(struct cxl_context *ctx,
+   unsigned int new_events)
+{
+   atomic_add(new_events, &ctx->afu_driver_events);
+   wake_up_all(&ctx->wq);
+}
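
To sketch the flow end to end (hypothetical AFU driver code: the
my_afu_* names are invented, and the event_delivered() signature is
inferred from the description above rather than copied from the diff):

	static struct cxl_event_afu_driver_reserved *
	my_afu_fetch_event(struct cxl_context *ctx)
	{
		/* hand the cxl core one queued event; header + data must
		 * stay below CXL_READ_MIN_SIZE (4K) */
		return my_afu_dequeue_event(ctx);	/* invented helper */
	}

	static void my_afu_event_delivered(struct cxl_context *ctx,
				struct cxl_event_afu_driver_reserved *event,
				int rc)
	{
		/* rc reports whether the copy to userspace worked */
		kfree(event);
	}

	static struct cxl_afu_driver_ops my_afu_ops = {
		.fetch_event	 = my_afu_fetch_event,
		.event_delivered = my_afu_event_delivered,
	};

	/* at context setup time, then whenever new events are queued: */
	cxl_set_driver_ops(ctx, &my_afu_ops);
	cxl_context_events_pending(ctx, 1);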

Re: [PATCH] cpuidle/powernv: Fix snooze timeout

2016-06-23 Thread Daniel Lezcano

On 06/23/2016 11:28 AM, Balbir Singh wrote:

[ ... ]


cpuidle_enter_state()
{
[...]
time_start = local_clock();
[enter idle state]
time_end = local_clock();
/*
  * local_clock() returns the time in nanosecond, let's shift
  * by 10 (divide by 1024) to have microsecond based time.
  */
 diff = (time_end - time_start) >> 10;
[...]
dev->last_residency = (int) diff;
}

Because of >>10 as opposed to /1000, last_residency is lesser by 2.3%


I am surprised the last_residency is exactly 2.3% less. The difference
between >>10 and /1000 is 2.34%.


What is the next target residency value ?

Does it solve the issue if you replace >>10 by /1000 ?





Re: [PATCH] cpuidle/powernv: Fix snooze timeout

2016-06-23 Thread Balbir Singh
>> This is still a rounding error but at a different site. I see we saved
>> a division by doing a >> 10, but we added it right back by doing a /20
>> later in the platform code.
>
> While a >> 10 is done at every idle exit, div by 20 is done once during
> boot, so this doesn't negate the previous optimization.
>

Yes, fair point

>> Shouldn't the rounding affect other
>> platforms as well? Can't we fix it in cpuidle_enter_state().
>
> This does affect all platforms, but I'm guessing no other place relied
> on the precision of last_residency calculations.
> Daniel can perhaps comment on this.
>

Re: [PATCH] cpuidle/powernv: Fix snooze timeout

2016-06-23 Thread Shreyas B Prabhu


On 06/23/2016 02:58 PM, Balbir Singh wrote:
> 
> 
> On 23/06/16 14:58, Shreyas B Prabhu wrote:
>>
>>
>> On 06/23/2016 05:18 AM, Balbir Singh wrote:
>>>
>>>
>>> On 23/06/16 05:36, Shreyas B. Prabhu wrote:
 Snooze is a poll idle state in powernv and pseries platforms. Snooze
 has a timeout so that if a cpu stays in snooze for more than target
 residency of the next available idle state, then it would exit thereby
 giving a chance to the cpuidle governor to re-evaluate and
 promote the cpu to a deeper idle state. Therefore whenever snooze exits
 due to this timeout, its last_residency will be target_residency of next
 deeper state.

 commit e93e59ce5b85 ("cpuidle: Replace ktime_get() with local_clock()")
 changed the math around last_residency calculation. Specifically, while
 converting last_residency value from nanoseconds to microseconds it does
 right shift by 10. Due to this, in snooze timeout exit scenarios
 last_residency calculated is roughly 2.3% less than target_residency of
 next available state. This pattern is picked up by get_typical_interval()
 in the menu governor and therefore expected_interval in menu_select() is
 frequently less than the target_residency of any state but snooze.

 Due to this we are entering snooze at a higher rate, thereby affecting
 the single thread performance.
 Since the math around last_residency is not meant to be precise, fix this
 issue by setting the snooze timeout to 105% of the target_residency of the next
 available idle state.

 This also adds a comment explaining why the snooze timeout is necessary.

 Reported-by: Anton Blanchard 
 Signed-off-by: Shreyas B. Prabhu 
 ---
  drivers/cpuidle/cpuidle-powernv.c | 14 ++
  drivers/cpuidle/cpuidle-pseries.c | 13 +
  2 files changed, 27 insertions(+)

 diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
 index e12dc30..5835491 100644
 --- a/drivers/cpuidle/cpuidle-powernv.c
 +++ b/drivers/cpuidle/cpuidle-powernv.c
 @@ -268,10 +268,24 @@ static int powernv_idle_probe(void)
cpuidle_state_table = powernv_states;
/* Device tree can indicate more idle states */
max_idle_state = powernv_add_idle_states();
 +
 +  /*
 +   * Staying in snooze for a long period can degrade the
 +   * performance of the sibling cpus. Set timeout for snooze such
 +   * that if the cpu stays in snooze longer than target residency
 +   * of the next available idle state then exit from snooze. This
 +   * gives a chance to the cpuidle governor to re-evaluate and
 +   * promote it to deeper idle states.
 +   */
if (max_idle_state > 1) {
snooze_timeout_en = true;
snooze_timeout = powernv_states[1].target_residency *
 tb_ticks_per_usec;
 +  /*
 +   * Give a 5% margin since target residency related math
 +   * is not precise in cpuidle core.
 +   */
>>>
>>> Is this due to the microsecond conversion mentioned above? It would be nice 
>>> to
>>> have it in the comment. Does
>>>
>>> (powernv_states[1].target_residency + tb_ticks_per_usec) / 
>>> tb_ticks_per_usec solve
>>> your rounding issues, assuming the issue is really rounding or maybe it is 
>>> due
>>> to the shift by 10, could you please elaborate on what related math is not
>>> precise? That would explain to me why I missed understanding your changes.
>>>
 +  snooze_timeout += snooze_timeout / 20;
>>>
>>> For now 5% is sufficient, but do you want to check to assert to check if
>>>
>>> snooze_timeout (in microseconds) / tb_ticks_per_usec > 
>>> powernv_states[i].target_residency?
>>>
>>
>> This is not a rounding issue. As I mentioned in the commit message, this
>> is because of the last_residency calculation in cpuidle.c.
>> To elaborate, last residency calculation is done in the following way
>> after commit e93e59ce5b85 ("cpuidle: Replace ktime_get() with
>> local_clock()") -
>>
>> cpuidle_enter_state()
>> {
>>  [...]
>>  time_start = local_clock();
>>  [enter idle state]
>>  time_end = local_clock();
>>  /*
>>  * local_clock() returns the time in nanosecond, let's shift
>>  * by 10 (divide by 1024) to have microsecond based time.
>>  */
>> diff = (time_end - time_start) >> 10;
>>  [...]
>>  dev->last_residency = (int) diff;
>> }
>>
>> Because of >>10 as opposed to /1000, last_residency is lesser by 2.3%
> 
> 
> This is still a rounding error but at a different site. I see we saved
> a division by doing a >> 10, but we added it right back by doing a /20
> later in the platform code.
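
The arithmetic is easy to reproduce (illustrative userspace snippet,
not kernel code):

	#include <stdio.h>

	int main(void)
	{
		long long ns = 100 * 1000;	/* a 100 us residency, in ns */

		printf("%lld\n", ns >> 10);	/* 97: under-reports by ~2.3% */
		ns += ns / 20;			/* the proposed 105% timeout */
		printf("%lld\n", ns >> 10);	/* 102: clears the 100 us bar */
		return 0;
	}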

Re: [PATCH] cpuidle/powernv: Fix snooze timeout

2016-06-23 Thread Balbir Singh


On 23/06/16 14:58, Shreyas B Prabhu wrote:
> 
> 
> On 06/23/2016 05:18 AM, Balbir Singh wrote:
>>
>>
>> On 23/06/16 05:36, Shreyas B. Prabhu wrote:
>>> Snooze is a poll idle state in powernv and pseries platforms. Snooze
>>> has a timeout so that if a cpu stays in snooze for more than target
>>> residency of the next available idle state, then it would exit thereby
>>> giving a chance to the cpuidle governor to re-evaluate and
>>> promote the cpu to a deeper idle state. Therefore whenever snooze exits
>>> due to this timeout, its last_residency will be target_residency of next
>>> deeper state.
>>>
>>> commit e93e59ce5b85 ("cpuidle: Replace ktime_get() with local_clock()")
>>> changed the math around last_residency calculation. Specifically, while
>>> converting last_residency value from nanoseconds to microseconds it does
>>> right shift by 10. Due to this, in snooze timeout exit scenarios
>>> last_residency calculated is roughly 2.3% less than target_residency of
>>> next available state. This pattern is picked up by get_typical_interval()
>>> in the menu governor and therefore expected_interval in menu_select() is
>>> frequently less than the target_residency of any state but snooze.
>>>
>>> Due to this we are entering snooze at a higher rate, thereby affecting
>>> the single thread performance.
>>> Since the math around last_residency is not meant to be precise, fix this
>>> issue by setting the snooze timeout to 105% of the target_residency of the next
>>> available idle state.
>>>
>>> This also adds a comment explaining why the snooze timeout is necessary.
>>>
>>> Reported-by: Anton Blanchard 
>>> Signed-off-by: Shreyas B. Prabhu 
>>> ---
>>>  drivers/cpuidle/cpuidle-powernv.c | 14 ++
>>>  drivers/cpuidle/cpuidle-pseries.c | 13 +
>>>  2 files changed, 27 insertions(+)
>>>
>>> diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
>>> index e12dc30..5835491 100644
>>> --- a/drivers/cpuidle/cpuidle-powernv.c
>>> +++ b/drivers/cpuidle/cpuidle-powernv.c
>>> @@ -268,10 +268,24 @@ static int powernv_idle_probe(void)
>>> cpuidle_state_table = powernv_states;
>>> /* Device tree can indicate more idle states */
>>> max_idle_state = powernv_add_idle_states();
>>> +
>>> +   /*
>>> +* Staying in snooze for a long period can degrade the
>>> +* performance of the sibling cpus. Set timeout for snooze such
>>> +* that if the cpu stays in snooze longer than target residency
>>> +* of the next available idle state then exit from snooze. This
>>> +* gives a chance to the cpuidle governor to re-evaluate and
>>> +* promote it to deeper idle states.
>>> +*/
>>> if (max_idle_state > 1) {
>>> snooze_timeout_en = true;
>>> snooze_timeout = powernv_states[1].target_residency *
>>>  tb_ticks_per_usec;
>>> +   /*
>>> +* Give a 5% margin since target residency related math
>>> +* is not precise in cpuidle core.
>>> +*/
>>
>> Is this due to the microsecond conversion mentioned above? It would be nice 
>> to
>> have it in the comment. Does
>>
>> (powernv_states[1].target_residency + tb_ticks_per_usec) / tb_ticks_per_usec 
>> solve
>> your rounding issues, assuming the issue is really rounding or maybe it is 
>> due
>> to the shift by 10, could you please elaborate on what related math is not
>> precise? That would explain to me why I missed understanding your changes.
>>
>>> +   snooze_timeout += snooze_timeout / 20;
>>
>> For now 5% is sufficient, but do you want to check to assert to check if
>>
>> snooze_timeout (in microseconds) / tb_ticks_per_usec > 
>> powernv_states[i].target_residency?
>>
> 
> This is not a rounding issue. As I mentioned in the commit message, this
> is because of the last_residency calculation in cpuidle.c.
> To elaborate, last residency calculation is done in the following way
> after commit e93e59ce5b85 ("cpuidle: Replace ktime_get() with
> local_clock()") -
> 
> cpuidle_enter_state()
> {
>   [...]
>   time_start = local_clock();
>   [enter idle state]
>   time_end = local_clock();
>   /*
>  * local_clock() returns the time in nanosecond, let's shift
>  * by 10 (divide by 1024) to have microsecond based time.
>  */
> diff = (time_end - time_start) >> 10;
>   [...]
>   dev->last_residency = (int) diff;
> }
> 
> Because of >>10 as opposed to /1000, last_residency is lesser by 2.3%


This is still a rounding error but at a different site. I see we saved
a division by doing a >> 10, but we added it right back by doing a /20
later in the platform code. Shouldn't the rounding affect other
platforms as well? Can't we fix it in cpuidle_enter_state(). Division
by 1000 can be 

Re: [V2] powerpc/mm/radix: Update Radix tree size as per ISA 3.0

2016-06-23 Thread Michael Ellerman
On Fri, 2016-17-06 at 06:10:36 UTC, "Aneesh Kumar K.V" wrote:
> ISA 3.0 updated it to be encoded as Radix tree size = 2^(RTS + 31). We
> have it encoded as 2^(RTS + 28). Add a helper with the correct encoding
> and use it instead of opencoding.
> 
> Fixes commit 2bfd65e45e87 ("powerpc/mm/radix: Add radix callbacks for
> early init routine ")
> 
> Reviewed-by: Balbir Singh 
> Signed-off-by: Aneesh Kumar K.V 

Applied to powerpc fixes, thanks.

https://git.kernel.org/powerpc/c/b23d9c5b9c83c05e013aa52460

cheers
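
As a worked example of the two encodings (illustrative numbers): with
RTS = 21, the ISA 3.0 rule gives a radix tree size of 2^(21 + 31) =
2^52 bytes, while the old 2^(RTS + 28) reading would have yielded only
2^49, an 8x discrepancy; keeping the encoding behind one helper confines
that mistake to a single place.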

Re: powerpc: Fix faults caused by radix patching of SLB miss handler

2016-06-23 Thread Michael Ellerman
On Tue, 2016-21-06 at 10:36:19 UTC, Michael Ellerman wrote:
> As part of the Radix MMU support we added some feature sections in the
> SLB miss handler. These are intended to catch the case that we
> incorrectly take an SLB miss when Radix is enabled, and instead of
> crashing weirdly they bail out to a well defined exit path and trigger
> an oops.
> 
> However the way they were written meant the bailout case was enabled by
> default until we did CPU feature patching.
> 
> On powermacs the early debug prints in setup_system() can cause an SLB
> miss, which happens before code patching, and so the SLB miss handler
> would incorrectly bailout and crash during boot.
> 
> Fix it by inverting the sense of the feature section, so that the code
> which is in place at boot is correct for the hash case. Once we
> determine we are using Radix - which will never happen on a powermac -
> only then do we patch in the bailout case which unconditionally jumps.
> 
> Fixes: caca285e5ab4 ("powerpc/mm/radix: Use STD_MMU_64 to properly isolate 
> hash related code")
> Reported-by: Denis Kirjanov 
> Tested-by: Denis Kirjanov 
> Signed-off-by: Michael Ellerman 
> Reviewed-by: Aneesh Kumar K.V 

Applied to powerpc fixes.

https://git.kernel.org/powerpc/c/6e914ee629c411d9c6d160399c

cheers

Re: [PATCHv2,1/7] ppc bpf/jit: Disable classic BPF JIT on ppc64le

2016-06-23 Thread Michael Ellerman
On Wed, 2016-22-06 at 16:25:01 UTC, "Naveen N. Rao" wrote:
> Classic BPF JIT was never ported completely to work on little endian
> powerpc. However, it can be enabled and will crash the system when used.
> As such, disable use of BPF JIT on ppc64le.
> 
> Reported-by: Thadeu Lima de Souza Cascardo 
> Signed-off-by: Naveen N. Rao 
> Acked-by: Thadeu Lima de Souza Cascardo 

Applied to powerpc fixes, thanks.

https://git.kernel.org/powerpc/c/844e3be47693f92a108cb1fb3b

cheers

Re: powerpc/eeh: Fix invalid cached PE primary bus

2016-06-23 Thread Michael Ellerman
On Fri, 2016-17-06 at 03:05:11 UTC, Gavin Shan wrote:
> The PE primary bus cannot be retrieved from its child devices when
> full hotplug is used in error recovery. The PE primary bus is cached, which
> is done in commit <05ba75f84864> ("powerpc/eeh: Fix stale cached primary
> bus"). In eeh_reset_device(), the flag (EEH_PE_PRI_BUS) is cleared
> before the PCI hot remove. eeh_pe_bus_get() then returns NULL as the
> PE primary bus in pnv_eeh_reset() and it crashes the kernel eventually.
> 
> This fixes the issue by clearing the flag (EEH_PE_PRI_BUS) before the
> PCI hot add. With it, the PowerNV EEH reset backend (pnv_eeh_reset())
> can get valid PE primary bus through eeh_pe_bus_get().
> 
> Fixes: 67086e32b564 ("powerpc/eeh: powerpc/eeh: Support error recovery for VF 
> PE")
> Reported-by: Pridhiviraj Paidipeddi 
> Signed-off-by: Gavin Shan 

Applied to powerpc fixes, thanks.

https://git.kernel.org/powerpc/c/a3aa256b7258b3d19f8b44557c

cheers
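
The ordering described above, as a rough sketch (the function names are
assumed from the surrounding EEH code, not quoted from the patch):

	/* keep the cached primary bus usable across the reset; clear the
	 * flag only once we are about to re-add the devices */
	eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
	pci_hp_add_devices(bus);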

Re: powerpc/mm/hash: Don't add memory coherence if cache inhibited is set

2016-06-23 Thread Michael Ellerman
On Fri, 2016-17-06 at 06:02:00 UTC, "Aneesh Kumar K.V" wrote:
> H_ENTER hcall handling in qemu had assumptions that a cache inhibited
> hpte entry won't have memory coherence set. Also older kernels
> mentioned that some versions of pHyp required this (the code removed
> by the below commit says:
> 
> /* Make pHyp happy */
> if ((rflags & _PAGE_NO_CACHE) && !(rflags & _PAGE_WRITETHRU))
> hpte_r &= ~HPTE_R_M;
> 
> But with older kernels we had some inconsistent memory coherence
> mapping. We always enabled memory coherence in the page fault path and
> removed memory coherence if _PAGE_NO_CACHE was set when we mapped the
> page via htab_bolt_mapping. The commit mentioned below tried to
> consolidate that by always enabling memory coherence. But as mentioned
> above that breaks Qemu H_ENTER handling.
> 
> This patch updates this such that we enable memory coherence only if
> cache inhibited is not set and brings fault handling, lpar and bolt
> mapping in sync.
> 
> Fixes: commit 30bda41aba4e("powerpc/mm: Drop WIMG in favour of new constant")
> 
> Signed-off-by: Aneesh Kumar K.V 

Applied to powerpc fixes, thanks.

https://git.kernel.org/powerpc/c/e568006b9d828403397668864d

cheers
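
The rule being described reduces to something like the following sketch
(illustrative; cache_inhibited() stands in for whatever test on the pte
flags the real code uses):

	/* enable memory coherence only for cacheable mappings */
	if (!cache_inhibited(pteflags))
		rflags |= HPTE_R_M;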

Re: [PATCHv4] powerpc/timer - large decrementer support

2016-06-23 Thread oliver
On Thu, Jun 23, 2016 at 6:02 PM, Denis Kirjanov  wrote:
> On 6/23/16, Oliver O'Halloran  wrote:
>> +static void __init set_decrementer_max(void)
>> +{
>> + struct device_node *cpu;
>> + const __be32 *fp;
>> + u64 bits = 32;
>> +
>> + /* Prior to ISAv3 the decrementer is always 32 bit */
>> + if (!cpu_has_feature(CPU_FTR_ARCH_300))
>> + return;
>> +
>> + cpu = of_find_node_by_type(NULL, "cpu");
>> + if (cpu)
>> + fp = of_get_property(cpu, "ibm,dec-bits", NULL);
>> +
>> + if (cpu && fp) {
>> + bits = of_read_number(fp, 1);
>> +
>> + if (bits > 64 || bits < 32) {
>> + pr_warn("time_init: firmware supplied invalid ibm,dec-bits");
>> + bits = 32;
>> + }
>> +
>> +
>> + /* calculate the signed maximum given this many bits */
>> + decrementer_max = (1ul << (bits - 1)) - 1;
>> + }
>> +
>> + pr_info("time_init: %llu bit decrementer (max: %llx)\n",
>> + bits, decrementer_max);
>> +}
> the call to of_node_put(cpu) is missing in the function.

Well spotted, thanks.
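
For reference, the fixed-up shape would be something like this (a
sketch of the expected follow-up, not a posted patch):

	cpu = of_find_node_by_type(NULL, "cpu");
	if (cpu) {
		fp = of_get_property(cpu, "ibm,dec-bits", NULL);
		if (fp)
			bits = of_read_number(fp, 1);
		/* drop the reference taken by of_find_node_by_type() */
		of_node_put(cpu);
	}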

Re: [PATCHv4] powerpc/timer - large decrementer support

2016-06-23 Thread Denis Kirjanov
On 6/23/16, Oliver O'Halloran  wrote:
> Power ISAv3 adds a large decrementer (LD) mode which increases the size
> of the decrementer register. The size of the enlarged decrementer
> register is between 32 and 64 bits with the exact size being dependent
> on the implementation. When in LD mode, reads are sign extended to 64
> bits and a decrementer exception is raised when the high bit is set (i.e.
> the value goes below zero). Writes however are truncated to the physical
> register width so some care needs to be taken to ensure that the high
> bit is not set when reloading the decrementer. This patch adds support
> for using the LD inside the host kernel on processors that support it.
>
> When LD mode is supported firmware will supply the ibm,dec-bits property
> for CPU nodes to allow the kernel to determine the maximum decrementer
> value. Enabling LD mode is a hypervisor privileged operation so the
> kernel can only enable it manually when running in hypervisor mode.
> Guest kernels that support LD mode can request it using the
> "ibm,client-architecture-support" firmware call or some other platform
> specific method. If this property is not supplied then the traditional
> decrementer width of 32 bit is assumed and LD mode will not be enabled.
>
> This patch was based on initial work by Jack Miller.
>
> Signed-off-by: Oliver O'Halloran 
> Signed-off-by: Balbir Singh 
> Cc: Michael Neuling 
> Cc: Jack Miller 
> ---
>  arch/powerpc/include/asm/reg.h  |   1 +
>  arch/powerpc/include/asm/time.h |   6 +--
>  arch/powerpc/kernel/time.c  | 102
> 
>  3 files changed, 98 insertions(+), 11 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
> index a0948f40bc7b..12d970d64bb3 100644
> --- a/arch/powerpc/include/asm/reg.h
> +++ b/arch/powerpc/include/asm/reg.h
> @@ -332,6 +332,7 @@
>  #define   LPCR_AIL_0 0x00000000  /* MMU off exception offset 0x0 */
>  #define   LPCR_AIL_3 0x01800000  /* MMU on exception offset 0xc00...4xxx */
>  #define   LPCR_ONL   0x00040000  /* online - PURR/SPURR count */
> +#define   LPCR_LD    0x00020000  /* large decrementer */
>  #define   LPCR_PECE  0x0001f000  /* powersave exit cause enable */
>  #define     LPCR_PECEDP  0x00010000  /* directed priv dbells cause exit */
>  #define     LPCR_PECEDH  0x00008000  /* directed hyp dbells cause exit */
> diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
> index 1092fdd7e737..09211640a0e0 100644
> --- a/arch/powerpc/include/asm/time.h
> +++ b/arch/powerpc/include/asm/time.h
> @@ -146,7 +146,7 @@ static inline void set_tb(unsigned int upper, unsigned int lower)
>   * in auto-reload mode.  The problem is PIT stops counting when it
>   * hits zero.  If it would wrap, we could use it just like a decrementer.
>   */
> -static inline unsigned int get_dec(void)
> +static inline u64 get_dec(void)
>  {
>  #if defined(CONFIG_40x)
>   return (mfspr(SPRN_PIT));
> @@ -160,10 +160,10 @@ static inline unsigned int get_dec(void)
>   * in when the decrementer generates its interrupt: on the 1 to 0
>   * transition for Book E/4xx, but on the 0 to -1 transition for others.
>   */
> -static inline void set_dec(int val)
> +static inline void set_dec(u64 val)
>  {
>  #if defined(CONFIG_40x)
> - mtspr(SPRN_PIT, val);
> + mtspr(SPRN_PIT, (u32) val);
>  #else
>  #ifndef CONFIG_BOOKE
>   --val;
> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
> index 7a482a7f4d8d..aa6d399d939b 100644
> --- a/arch/powerpc/kernel/time.c
> +++ b/arch/powerpc/kernel/time.c
> @@ -97,7 +97,8 @@ static struct clocksource clocksource_timebase = {
>   .read = timebase_read,
>  };
>
> -#define DECREMENTER_MAX  0x7fffffff
> +#define DECREMENTER_DEFAULT_MAX 0x7FFFFFFF
> +u64 decrementer_max = DECREMENTER_DEFAULT_MAX;
>
>  static int decrementer_set_next_event(unsigned long evt,
> struct clock_event_device *dev);
> @@ -505,8 +506,8 @@ static void __timer_interrupt(void)
>   __this_cpu_inc(irq_stat.timer_irqs_event);
>   } else {
>   now = *next_tb - now;
> - if (now <= DECREMENTER_MAX)
> - set_dec((int)now);
> + if (now <= decrementer_max)
> + set_dec(now);
>   /* We may have raced with new irq work */
>   if (test_irq_work_pending())
>   set_dec(1);
> @@ -536,7 +537,7 @@ void timer_interrupt(struct pt_regs * regs)
>   /* Ensure a positive value is written to the decrementer, or else
>* some CPUs will continue to take decrementer exceptions.
>*/
> - set_dec(DECREMENTER_MAX);
> + set_dec(decrementer_max);
>
>   /* Some implementations of hotplug will get timer interrupts while
>* 

[PATCHv4] powerpc/timer - large decrementer support

2016-06-23 Thread Oliver O'Halloran
Power ISAv3 adds a large decrementer (LD) mode which increases the size
of the decrementer register. The size of the enlarged decrementer
register is between 32 and 64 bits with the exact size being dependent
on the implementation. When in LD mode, reads are sign extended to 64
bits and a decrementer exception is raised when the high bit is set (i.e.
the value goes below zero). Writes however are truncated to the physical
register width so some care needs to be taken to ensure that the high
bit is not set when reloading the decrementer. This patch adds support
for using the LD inside the host kernel on processors that support it.

When LD mode is supported firmware will supply the ibm,dec-bits property
for CPU nodes to allow the kernel to determine the maximum decrementer
value. Enabling LD mode is a hypervisor privileged operation so the
kernel can only enable it manually when running in hypervisor mode.
Guest kernels that support LD mode can request it using the
"ibm,client-architecture-support" firmware call or some other platform
specific method. If this property is not supplied then the traditional
decrementer width of 32 bit is assumed and LD mode will not be enabled.

This patch was based on initial work by Jack Miller.

Signed-off-by: Oliver O'Halloran 
Signed-off-by: Balbir Singh 
Cc: Michael Neuling 
Cc: Jack Miller 
---
 arch/powerpc/include/asm/reg.h  |   1 +
 arch/powerpc/include/asm/time.h |   6 +--
 arch/powerpc/kernel/time.c  | 102 
 3 files changed, 98 insertions(+), 11 deletions(-)
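
As a worked example (illustrative value, not from the patch): a CPU
node with ibm,dec-bits = 56 would give decrementer_max = (1ul << 55) - 1
= 0x7fffffffffffff, and the reload paths below compare against that
before writing, so the implemented high bit is never set on a write.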

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index a0948f40bc7b..12d970d64bb3 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -332,6 +332,7 @@
 #define   LPCR_AIL_0   0x00000000  /* MMU off exception offset 0x0 */
 #define   LPCR_AIL_3   0x01800000  /* MMU on exception offset 0xc00...4xxx */
 #define   LPCR_ONL     0x00040000  /* online - PURR/SPURR count */
+#define   LPCR_LD      0x00020000  /* large decrementer */
 #define   LPCR_PECE    0x0001f000  /* powersave exit cause enable */
 #define     LPCR_PECEDP  0x00010000  /* directed priv dbells cause exit */
 #define     LPCR_PECEDH  0x00008000  /* directed hyp dbells cause exit */
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 1092fdd7e737..09211640a0e0 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -146,7 +146,7 @@ static inline void set_tb(unsigned int upper, unsigned int lower)
  * in auto-reload mode.  The problem is PIT stops counting when it
  * hits zero.  If it would wrap, we could use it just like a decrementer.
  */
-static inline unsigned int get_dec(void)
+static inline u64 get_dec(void)
 {
 #if defined(CONFIG_40x)
return (mfspr(SPRN_PIT));
@@ -160,10 +160,10 @@ static inline unsigned int get_dec(void)
  * in when the decrementer generates its interrupt: on the 1 to 0
  * transition for Book E/4xx, but on the 0 to -1 transition for others.
  */
-static inline void set_dec(int val)
+static inline void set_dec(u64 val)
 {
 #if defined(CONFIG_40x)
-   mtspr(SPRN_PIT, val);
+   mtspr(SPRN_PIT, (u32) val);
 #else
 #ifndef CONFIG_BOOKE
--val;
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 7a482a7f4d8d..aa6d399d939b 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -97,7 +97,8 @@ static struct clocksource clocksource_timebase = {
.read = timebase_read,
 };
 
-#define DECREMENTER_MAX 0x7fffffff
+#define DECREMENTER_DEFAULT_MAX 0x7FFFFFFF
+u64 decrementer_max = DECREMENTER_DEFAULT_MAX;
 
 static int decrementer_set_next_event(unsigned long evt,
  struct clock_event_device *dev);
@@ -505,8 +506,8 @@ static void __timer_interrupt(void)
__this_cpu_inc(irq_stat.timer_irqs_event);
} else {
now = *next_tb - now;
-   if (now <= DECREMENTER_MAX)
-   set_dec((int)now);
+   if (now <= decrementer_max)
+   set_dec(now);
/* We may have raced with new irq work */
if (test_irq_work_pending())
set_dec(1);
@@ -536,7 +537,7 @@ void timer_interrupt(struct pt_regs * regs)
/* Ensure a positive value is written to the decrementer, or else
 * some CPUs will continue to take decrementer exceptions.
 */
-   set_dec(DECREMENTER_MAX);
+   set_dec(decrementer_max);
 
/* Some implementations of hotplug will get timer interrupts while
 * offline, just ignore these and we also need to set
@@ -584,9 +585,9 @@ static void generic_suspend_disable_irqs(void)
 * with suspending.
 */
 
-   set_dec(DECREMENTER_MAX);
+   set_dec(decrementer_max);