Re: [PATCH 1/2] powerpc: rewrite LOAD_REG_IMMEDIATE() as an intelligent macro

2019-08-14 Thread Segher Boessenkool
Hi Christophe,

On Tue, Aug 13, 2019 at 09:59:35AM +0000, Christophe Leroy wrote:
> + rldicr  \r, \r, 32, 31

Could you please write this as
sldi\r, \r, 32
?  It's much easier to read, imo (it's the exact same instruction).

You can do a lot cheaper sequences if you have a temporary reg, as well
(longest path of 3 insns instead of 5):
lis rt,A
ori rt,rt,B
lis rd,C
ori rd,rd,D
rldimi rd,rt,32,0
to load ABCD.


Segher


Re: [PATCH 1/2] powerpc: rewrite LOAD_REG_IMMEDIATE() as an intelligent macro

2019-08-14 Thread Christophe Leroy




Le 14/08/2019 à 04:08, Paul Mackerras a écrit :

On Tue, Aug 13, 2019 at 09:59:35AM +0000, Christophe Leroy wrote:

[snip]


+.macro __LOAD_REG_IMMEDIATE r, x
+   .if \x & ~0xffffffff != 0
+   __LOAD_REG_IMMEDIATE_32 \r, (\x) >> 32
+   rldicr  \r, \r, 32, 31
+   .if (\x) & 0xffff0000 != 0
+   oris \r, \r, (\x)@__AS_ATHIGH
+   .endif
+   .if (\x) & 0xffff != 0
+   ori \r, \r, (\x)@l
+   .endif
+   .else
+   __LOAD_REG_IMMEDIATE_32 \r, \x
+   .endif
+.endm


Doesn't this force all negative constants, even small ones, to use
the long sequence?  For example, __LOAD_REG_IMMEDIATE r3, -1 will
generate (as far as I can see):

li  r3, -1
rldicr  r3, r3, 32, 31
oris    r3, r3, 0xffff
ori     r3, r3, 0xffff

which seems suboptimal.


Ah yes, thanks. And it is also buggy when \x is over 0x80000000 because
lis is a signed op


I'll send v2

Christophe


Re: [PATCH 1/2] powerpc: rewrite LOAD_REG_IMMEDIATE() as an intelligent macro

2019-08-13 Thread Paul Mackerras
On Tue, Aug 13, 2019 at 09:59:35AM +0000, Christophe Leroy wrote:

[snip]

> +.macro __LOAD_REG_IMMEDIATE r, x
> + .if \x & ~0xffffffff != 0
> + __LOAD_REG_IMMEDIATE_32 \r, (\x) >> 32
> + rldicr  \r, \r, 32, 31
> + .if (\x) & 0xffff0000 != 0
> + oris \r, \r, (\x)@__AS_ATHIGH
> + .endif
> + .if (\x) & 0xffff != 0
> + ori \r, \r, (\x)@l
> + .endif
> + .else
> + __LOAD_REG_IMMEDIATE_32 \r, \x
> + .endif
> +.endm

Doesn't this force all negative constants, even small ones, to use
the long sequence?  For example, __LOAD_REG_IMMEDIATE r3, -1 will
generate (as far as I can see):

li  r3, -1
rldicr  r3, r3, 32, 31
oris    r3, r3, 0xffff
ori     r3, r3, 0xffff

which seems suboptimal.

Paul.


[PATCH 1/2] powerpc: rewrite LOAD_REG_IMMEDIATE() as an intelligent macro

2019-08-13 Thread Christophe Leroy
Today LOAD_REG_IMMEDIATE() is a basic #define which loads all
parts of a value into a register, including the parts that are NULs.

This means always 2 instructions on PPC32 and always 5 instructions
on PPC64. And those instructions cannot run in parallel as they are
updating the same register.

Ex: LOAD_REG_IMMEDIATE(r1,THREAD_SIZE) in head_64.S results in:

3c 20 00 00 lis r1,0
60 21 00 00 ori r1,r1,0
78 21 07 c6 rldicr  r1,r1,32,31
64 21 00 00 oris r1,r1,0
60 21 40 00 ori r1,r1,16384

Rewrite LOAD_REG_IMMEDIATE() with GAS macro in order to skip
the parts that are NUL.

Rename existing LOAD_REG_IMMEDIATE() as LOAD_REG_IMMEDIATE_SYM()
and use that one for loading value of symbols which are not known
at compile time.

Now LOAD_REG_IMMEDIATE(r1,THREAD_SIZE) in head_64.S results in:

38 20 40 00 li  r1,16384

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/ppc_asm.h   | 42 +++-
 arch/powerpc/kernel/exceptions-64e.S | 10 -
 arch/powerpc/kernel/head_64.S|  2 +-
 3 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc_asm.h 
b/arch/powerpc/include/asm/ppc_asm.h
index e0637730a8e7..9a7c2ca9b714 100644
--- a/arch/powerpc/include/asm/ppc_asm.h
+++ b/arch/powerpc/include/asm/ppc_asm.h
@@ -311,13 +311,43 @@ GLUE(.,name):
addis   reg,reg,(name - 0b)@ha; \
addireg,reg,(name - 0b)@l;
 
-#ifdef __powerpc64__
-#ifdef HAVE_AS_ATHIGH
+#if defined(__powerpc64__) && defined(HAVE_AS_ATHIGH)
 #define __AS_ATHIGH high
 #else
 #define __AS_ATHIGH h
 #endif
-#define LOAD_REG_IMMEDIATE(reg,expr)   \
+
+.macro __LOAD_REG_IMMEDIATE_32 r, x
+   .if (\x) >= 0x8000 || (\x) < -0x8000
+   lis \r, (\x)@__AS_ATHIGH
+   .if (\x) & 0xffff != 0
+   ori \r, \r, (\x)@l
+   .endif
+   .else
+   li \r, (\x)@l
+   .endif
+.endm
+
+.macro __LOAD_REG_IMMEDIATE r, x
+   .if \x & ~0xffffffff != 0
+   __LOAD_REG_IMMEDIATE_32 \r, (\x) >> 32
+   rldicr  \r, \r, 32, 31
+   .if (\x) & 0xffff0000 != 0
+   oris \r, \r, (\x)@__AS_ATHIGH
+   .endif
+   .if (\x) & 0xffff != 0
+   ori \r, \r, (\x)@l
+   .endif
+   .else
+   __LOAD_REG_IMMEDIATE_32 \r, \x
+   .endif
+.endm
+
+#ifdef __powerpc64__
+
+#define LOAD_REG_IMMEDIATE(reg, expr) __LOAD_REG_IMMEDIATE reg, expr
+
+#define LOAD_REG_IMMEDIATE_SYM(reg,expr)   \
lis reg,(expr)@highest; \
ori reg,reg,(expr)@higher;  \
rldicr  reg,reg,32,31;  \
@@ -335,11 +365,13 @@ GLUE(.,name):
 
 #else /* 32-bit */
 
-#define LOAD_REG_IMMEDIATE(reg,expr)   \
+#define LOAD_REG_IMMEDIATE(reg, expr) __LOAD_REG_IMMEDIATE_32 reg, expr
+
+#define LOAD_REG_IMMEDIATE_SYM(reg,expr)   \
lis reg,(expr)@ha;  \
addireg,reg,(expr)@l;
 
-#define LOAD_REG_ADDR(reg,name)LOAD_REG_IMMEDIATE(reg, name)
+#define LOAD_REG_ADDR(reg,name)LOAD_REG_IMMEDIATE_SYM(reg, name)
 
 #define LOAD_REG_ADDRBASE(reg, name)   lis reg,name@ha
 #define ADDROFF(name)  name@l
diff --git a/arch/powerpc/kernel/exceptions-64e.S 
b/arch/powerpc/kernel/exceptions-64e.S
index 1cfb3da4a84a..898aae6da167 100644
--- a/arch/powerpc/kernel/exceptions-64e.S
+++ b/arch/powerpc/kernel/exceptions-64e.S
@@ -751,8 +751,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
ld  r14,interrupt_base_book3e@got(r15)
ld  r15,__end_interrupts@got(r15)
 #else
-   LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
-   LOAD_REG_IMMEDIATE(r15,__end_interrupts)
+   LOAD_REG_IMMEDIATE_SYM(r14,interrupt_base_book3e)
+   LOAD_REG_IMMEDIATE_SYM(r15,__end_interrupts)
 #endif
cmpld   cr0,r10,r14
cmpld   cr1,r10,r15
@@ -821,8 +821,8 @@ kernel_dbg_exc:
ld  r14,interrupt_base_book3e@got(r15)
ld  r15,__end_interrupts@got(r15)
 #else
-   LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e)
-   LOAD_REG_IMMEDIATE(r15,__end_interrupts)
+   LOAD_REG_IMMEDIATE_SYM(r14,interrupt_base_book3e)
+   LOAD_REG_IMMEDIATE_SYM(r15,__end_interrupts)
 #endif
cmpld   cr0,r10,r14
cmpld   cr1,r10,r15
@@ -1449,7 +1449,7 @@ a2_tlbinit_code_start:
 a2_tlbinit_after_linear_map:
 
/* Now we branch the new virtual address mapped by this entry */
-   LOAD_REG_IMMEDIATE(r3,1f)
+   LOAD_REG_IMMEDIATE_SYM(r3,1f)
mtctr   r3
bctr
 
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 91d297e696dd..1fd44761e997 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -635,7 +635,7 @@ __after_prom_start:
sub r5,r5,r11
 #else
/* just copy interrupts */
-   LOAD_REG_IMMEDIATE(r5, FIXED_SYMBOL_ABS_ADD