Re: Optimised memset64/memset32 for powerpc

2017-03-27 Thread Naveen N. Rao
On 2017/03/22 12:30PM, Matthew Wilcox wrote:
> On Wed, Mar 22, 2017 at 06:18:05AM -0700, Matthew Wilcox wrote:
> > There's one other potential user I've been wondering about: the various
> > console drivers.  They use 'memsetw' to blank the entire console
> > or lines of the console when scrolling, but the only architecture which
> > ever bothered implementing an optimised version of it was Alpha.
> > 
> > Might be worth it on powerpc actually ... better than a loop calling
> > cpu_to_le16() on each iteration.  That'd complete the set with a
> > memset16().
> 
> All hail plane rides ... This would need to be resplit and merged properly,
> but I think it makes life a little saner.

... not to forget train rides :)

Here's a straightforward implementation for powerpc64, along with one
other user in bpf. It is obviously non-critical, but given that we have
64K pages on powerpc64, it does help speed up the BPF JIT.
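
For reference, the bpf change boils down to pre-filling the JIT pages
with trap instructions via memset32() rather than an open-coded loop.
A sketch of the shape of it (0x7fe00008 is "tw 31,0,0", the
unconditional trap encoding; treat the exact constant and function name
as illustrative rather than the final patch):

static void bpf_jit_fill_ill_insns(void *area, unsigned int size)
{
	/* pre-fill with traps so stray execution faults immediately */
	memset32(area, 0x7fe00008, size / 4);
}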

- Naveen

Naveen N. Rao (2):
  powerpc: string: implement optimized memset variants
  powerpc: bpf: use memset32() to pre-fill traps in BPF page(s)

 arch/powerpc/include/asm/string.h | 24 ++++++++++++++++++++++++
 arch/powerpc/lib/mem_64.S         | 19 ++++++++++++++-----
 arch/powerpc/net/bpf_jit_comp64.c |  6 +++++-
 3 files changed, 43 insertions(+), 6 deletions(-)

-- 
2.11.1



Re: Optimised memset64/memset32 for powerpc

2017-03-22 Thread Matthew Wilcox
On Wed, Mar 22, 2017 at 06:18:05AM -0700, Matthew Wilcox wrote:
> There's one other potential user I've been wondering about: the various
> console drivers.  They use 'memsetw' to blank the entire console
> or lines of the console when scrolling, but the only architecture which
> ever bothered implementing an optimised version of it was Alpha.
> 
> Might be worth it on powerpc actually ... better than a loop calling
> cpu_to_le16() on each iteration.  That'd complete the set with a
> memset16().

All hail plane rides ... This would need to be resplit and merged properly,
but I think it makes life a little saner.

I make no claims that the ARM assembly in here is correct.  The single
x86 instruction that I wrote^W copied and pasted appears to be correct by
my understanding of the instruction set.
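
For the curious, that x86 instruction is "rep stosl": pattern in %eax,
count in %ecx, destination in %edi.  A sketch of the wrapper (the
constraint details here are illustrative and may not match the actual
patch exactly):

static inline void *memset32(uint32_t *s, uint32_t v, size_t n)
{
	long d0, d1;

	/* store %eax to (%edi), %ecx times, advancing %edi by 4 each time */
	asm volatile("rep stosl"
		     : "=&c" (d0), "=&D" (d1)
		     : "a" (v), "1" (s), "0" (n)
		     : "memory");
	return s;
}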


diff --git a/arch/alpha/include/asm/string.h b/arch/alpha/include/asm/string.h
index c2911f591704..74c0a693b76b 100644
--- a/arch/alpha/include/asm/string.h
+++ b/arch/alpha/include/asm/string.h
@@ -65,13 +65,14 @@ extern void * memchr(const void *, int, size_t);
aligned values.  The DEST and COUNT parameters must be even for 
correct operation.  */
 
-#define __HAVE_ARCH_MEMSETW
-extern void * __memsetw(void *dest, unsigned short, size_t count);
-
-#define memsetw(s, c, n)\
-(__builtin_constant_p(c)\
- ? __constant_c_memset((s),0x0001000100010001UL*(unsigned short)(c),(n)) \
- : __memsetw((s),(c),(n)))
+#define __HAVE_ARCH_MEMSET16
+extern void * __memset16(void *dest, unsigned short, size_t count);
+static inline void *memset16(uint16_t *p, uint16_t v, size_t n)
+{
+	if (__builtin_constant_p(v))
+		return __constant_c_memset(p, 0x0001000100010001UL * v, n * 2);
+	return __memset16(p, v, n * 2);
+}
 
 #endif /* __KERNEL__ */
 
diff --git a/arch/alpha/include/asm/vga.h b/arch/alpha/include/asm/vga.h
index c00106bac521..3c1c2b6128e7 100644
--- a/arch/alpha/include/asm/vga.h
+++ b/arch/alpha/include/asm/vga.h
@@ -34,7 +34,7 @@ static inline void scr_memsetw(u16 *s, u16 c, unsigned int count)
if (__is_ioaddr(s))
memsetw_io((u16 __iomem *) s, c, count);
else
-   memsetw(s, c, count);
+   memset16(s, c, count / 2);
 }
 
 /* Do not trust that the usage will be correct; analyze the arguments.  */
diff --git a/arch/alpha/lib/memset.S b/arch/alpha/lib/memset.S
index 89a26f5e89de..f824969e9e77 100644
--- a/arch/alpha/lib/memset.S
+++ b/arch/alpha/lib/memset.S
@@ -20,7 +20,7 @@
.globl memset
.globl __memset
.globl ___memset
-   .globl __memsetw
+   .globl __memset16
.globl __constant_c_memset
 
.ent ___memset
@@ -110,8 +110,8 @@ EXPORT_SYMBOL(___memset)
 EXPORT_SYMBOL(__constant_c_memset)
 
.align 5
-   .ent __memsetw
-__memsetw:
+   .ent __memset16
+__memset16:
.prologue 0
 
inswl $17,0,$1  /* E0 */
@@ -123,8 +123,8 @@ __memsetw:
or $1,$4,$17  /* E0 */
br __constant_c_memset  /* .. E1 */
 
-   .end __memsetw
-EXPORT_SYMBOL(__memsetw)
+   .end __memset16
+EXPORT_SYMBOL(__memset16)
 
 memset = ___memset
 __memset = ___memset
diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h
index da88299f758b..bc7a1be7a76a 100644
--- a/arch/arm/include/asm/string.h
+++ b/arch/arm/include/asm/string.h
@@ -24,15 +24,22 @@ extern void * memchr(const void *, int, __kernel_size_t);
 #define __HAVE_ARCH_MEMSET
 extern void * memset(void *, int, __kernel_size_t);
 
-#define __HAVE_ARCH_MEMSET_PLUS
-extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t);
-extern void *__memset64(uint64_t *, uint32_t low, __kernel_size_t, uint32_t hi);
+#define __HAVE_ARCH_MEMSET16
+extern void *__memset16(uint16_t *, uint16_t v, __kernel_size_t);
+static inline void *memset16(uint16_t *p, uint16_t v, __kernel_size_t n)
+{
+   return __memset16(p, v, n * 2);
+}
 
+#define __HAVE_ARCH_MEMSET32
+extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t);
 static inline void *memset32(uint32_t *p, uint32_t v, __kernel_size_t n)
 {
return __memset32(p, v, n * 4);
 }
 
+#define __HAVE_ARCH_MEMSET64
+extern void *__memset64(uint64_t *, uint32_t low, __kernel_size_t, uint32_t hi);
 static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n)
 {
return __memset64(p, v, n * 8, v >> 32);
diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S
index a835ff9ed30c..0b6cbaa25b33 100644
--- a/arch/arm/lib/memset.S
+++ b/arch/arm/lib/memset.S
@@ -21,12 +21,12 @@ ENTRY(memset)
 UNWIND( .fnstart )
 	ands	r3, r0, #3	@ 1 unaligned?
 	mov	ip, r0		@ preserve r0 as return value
+	orr	r1, r1, r1, lsl #8
 	bne	6f		@ 1
 /*
  * we know that the pointer in ip is aligned to a word boundary.
  */
-1: orr 

Re: Optimised memset64/memset32 for powerpc

2017-03-22 Thread Matthew Wilcox
On Wed, Mar 22, 2017 at 08:26:12AM +1100, Benjamin Herrenschmidt wrote:
> On Tue, 2017-03-21 at 06:29 -0700, Matthew Wilcox wrote:
> > 
> > Well, those are the generic versions in the first patch:
> > 
> > http://git.infradead.org/users/willy/linux-dax.git/commitdiff/538b9776ac925199969bd5af4e994da776d461e7
> > 
> > so if those are good enough for you guys, there's no need for you to
> > do anything.
> > 
> > Thanks for your time!
> 
> I suspect on ppc64 we can do much better, if nothing else by moving
> 64 bits at a time. Matthew, what are the main use cases of these?

I've only converted two users so far -- zram was the initial inspiration
for this.  It notices when a page has a pattern in it which is
representable as a repetition of an 'unsigned long' (this seems to be
a relatively common thing for userspace to do -- not as common as an
entirely zero page, but common enough to be worth optimising for).  So it
may be doing an entire page worth of this to handle a page fault, or if
there's an I/O to such a page, it will be doing a multiple of 512 bytes.
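
For context, the check zram performs is along these lines (a sketch of
the idea rather than zram's exact code; the names are illustrative):

static bool page_same_filled(void *ptr, unsigned long *element)
{
	unsigned long *page = ptr;
	unsigned long val = page[0];
	unsigned int pos;

	/* is the whole page just one repeated unsigned long? */
	for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++)
		if (page[pos] != val)
			return false;

	*element = val;	/* remember the pattern instead of the page data */
	return true;
}

Reconstituting such a page is then a single memset64()/memset32() of
'val' across the page, which is where the optimised routines pay off.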

The other user is sym53c8xx_2; it's an initialisation path thing, and
it saves a few bytes in the driver to call the optimised routine rather
than have its own loop to initialise the array.

I suspect we have additional places in the kernel that could use
memset32/memset64 -- look for loops which store a value which is not
dependent on the loop counter.  They're probably not performance path
though; I'd focus on zram as being the case to optimise for.
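
Concretely, the pattern to grep for looks like this (hypothetical
example; 'buf' and 'pattern' are illustrative names):

	/* before: open-coded fill with a loop-invariant value */
	for (i = 0; i < n; i++)
		buf[i] = pattern;	/* buf is uint32_t *, pattern fixed */

	/* after: one call the architecture can optimise */
	memset32(buf, pattern, n);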

There's one other potential user I've been wondering about: the various
console drivers.  They use 'memsetw' to blank the entire console
or lines of the console when scrolling, but the only architecture which
ever bothered implementing an optimised version of it was Alpha.

Might be worth it on powerpc actually ... better than a loop calling
cpu_to_le16() on each iteration.  That'd complete the set with a
memset16().
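
Roughly, today's blanking is the hypothetical loop below ('p' and
'blank' are illustrative names, not actual console-driver identifiers):

	for (i = 0; i < count; i++)
		p[i] = cpu_to_le16(blank);

whereas memset16() lets the swap happen once, outside the fill:

	memset16(p, cpu_to_le16(blank), count);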


Re: Optimised memset64/memset32 for powerpc

2017-03-21 Thread Benjamin Herrenschmidt
On Tue, 2017-03-21 at 06:29 -0700, Matthew Wilcox wrote:
> 
> Well, those are the generic versions in the first patch:
> 
> http://git.infradead.org/users/willy/linux-dax.git/commitdiff/538b9776ac925199969bd5af4e994da776d461e7
> 
> so if those are good enough for you guys, there's no need for you to
> do anything.
> 
> Thanks for your time!

I suspect on ppc64 we can do much better, if nothing else by moving
64 bits at a time. Matthew, what are the main use cases of these?

Cheers,
Ben.



Re: Optimised memset64/memset32 for powerpc

2017-03-21 Thread Segher Boessenkool
On Tue, Mar 21, 2017 at 06:29:10AM -0700, Matthew Wilcox wrote:
> > Unrolling the loop could help a bit on old powerpc32s that don't have a
> > branch unit, but on those processors the main cost is the actual write
> > to memory, and the extra operations needed to unroll the loop are not
> > worth the one cycle the branch adds.
> > 
> > On more modern powerpc32s, the branch unit means that branches are
> > effectively free.
> 
> Fair enough.  I'm just surprised it was worth unrolling the loop on
> powerpc64 and not on powerpc32 -- see mem_64.S.

On modern, bigger CPUs we can do at most one loop iteration per cycle,
but multiple stores per cycle.  Many old or small CPUs, on the other
hand, have only one load/store unit.  There are other issues, but that
is the biggest difference.
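
To illustrate: an unrolled fill like the sketch below issues four stores
per bdnz, so a wide core can keep its store units busy every cycle,
while a one-store loop is capped at one iteration -- and one store --
per cycle.  (Sketch only, not from any posted patch; r6 = dest - 8,
r4 = the 64-bit pattern, ctr = the number of 32-byte chunks.)

1:	std	r4,8(r6)
	std	r4,16(r6)
	std	r4,24(r6)
	stdu	r4,32(r6)	/* last store also advances r6 by 32 */
	bdnz	1b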


Segher


Re: Optimised memset64/memset32 for powerpc

2017-03-21 Thread Matthew Wilcox
On Tue, Mar 21, 2017 at 01:23:36PM +0100, Christophe LEROY wrote:
> > It doesn't look free for you as you only store one register each time
> > around the loop in the 32-bit memset implementation:
> > 
> > 1:	stwu	r4,4(r6)
> > 	bdnz	1b
> > 
> > (wouldn't you get better performance on 32-bit powerpc by unrolling that
> > loop like you do on 64-bit?)
> 
> In arch/powerpc/lib/copy_32.S, the implementation of memset() is optimised
> for the case where the value to be set is zero. It makes use of the 'dcbz'
> instruction, which zeroes a complete cache line.
> 
> Not much effort has been put into optimising non-zero memset() because
> there are almost none.

Yes, bzero() is much more common than setting a non-zero 8-bit pattern.
And setting an 8-bit pattern is almost certainly more common than setting
a 32- or 64-bit pattern.

> Unrolling the loop could help a bit on old powerpc32s that don't have a
> branch unit, but on those processors the main cost is the actual write
> to memory, and the extra operations needed to unroll the loop are not
> worth the one cycle the branch adds.
> 
> On more modern powerpc32s, the branch unit means that branches are
> effectively free.

Fair enough.  I'm just surprised it was worth unrolling the loop on
powerpc64 and not on powerpc32 -- see mem_64.S.

> A simple static inline C function would probably do the job, based on what I
> get below:
> 
> void memset32(int *p, int v, unsigned int c)
> {
>   int i;
> 
>   for (i = 0; i < c; i++)
>   *p++ = v;
> }
> 
> void memset64(long long *p, long long v, unsigned int c)
> {
>   int i;
> 
>   for (i = 0; i < c; i++)
>   *p++ = v;
> }

Well, those are the generic versions in the first patch:

http://git.infradead.org/users/willy/linux-dax.git/commitdiff/538b9776ac925199969bd5af4e994da776d461e7

so if those are good enough for you guys, there's no need for you to
do anything.

Thanks for your time!


Re: Optimised memset64/memset32 for powerpc

2017-03-21 Thread Christophe LEROY

Hi Matthew

On 20/03/2017 at 22:14, Matthew Wilcox wrote:

> I recently introduced memset32() / memset64().  I've done implementations
> for x86 & ARM; akpm has agreed to take the patchset through his tree.
> Do you fancy doing a powerpc version?  Minchan Kim got a 7% performance
> increase with zram from switching to the optimised version on x86.
> 
> Here's the development git tree:
> http://git.infradead.org/users/willy/linux-dax.git/shortlog/refs/heads/memfill
> (most recent 7 commits)
> 
> ARM probably offers the best model for you to work from; it's basically
> just a case of jumping into the middle of your existing memset loop.
> It was only three instructions to add support to ARM, but I don't know
> PowerPC well enough to understand how your existing memset works.
> I'd start with something like this ... note that you don't have to
> implement memset64 on 32-bit; I only did it on ARM because it was free.
> It doesn't look free for you as you only store one register each time
> around the loop in the 32-bit memset implementation:
> 
> 1:	stwu	r4,4(r6)
> 	bdnz	1b
> 
> (wouldn't you get better performance on 32-bit powerpc by unrolling that
> loop like you do on 64-bit?)


In arch/powerpc/lib/copy_32.S, the implementation of memset() is
optimised for the case where the value to be set is zero. It makes use
of the 'dcbz' instruction, which zeroes a complete cache line.


Not much effort has been put into optimising non-zero memset() because
there are almost none.


Unrolling the loop could help a bit on old powerpc32s that don't have a
branch unit, but on those processors the main cost is the actual write
to memory, and the extra operations needed to unroll the loop are not
worth the one cycle the branch adds.


On more modern powerpc32s, the branch unit means that branches are
effectively free.


A simple static inline C function would probably do the job, based on 
what I get below:


void memset32(int *p, int v, unsigned int c)
{
int i;

for (i = 0; i < c; i++)
*p++ = v;
}

void memset64(long long *p, long long v, unsigned int c)
{
int i;

for (i = 0; i < c; i++)
*p++ = v;
}

test.o: file format elf32-powerpc


Disassembly of section .text:

00000000 <memset32>:
   0:	2c 05 00 00 	cmpwi   r5,0
   4:	38 63 ff fc 	addi    r3,r3,-4
   8:	4d 82 00 20 	beqlr
   c:	7c a9 03 a6 	mtctr   r5
  10:	94 83 00 04 	stwu    r4,4(r3)
  14:	42 00 ff fc 	bdnz    10 <memset32+0x10>
  18:	4e 80 00 20 	blr

0000001c <memset64>:
  1c:	2c 07 00 00 	cmpwi   r7,0
  20:	7c cb 33 78 	mr      r11,r6
  24:	7c aa 2b 78 	mr      r10,r5
  28:	38 63 ff f8 	addi    r3,r3,-8
  2c:	4d 82 00 20 	beqlr
  30:	7c e9 03 a6 	mtctr   r7
  34:	95 43 00 08 	stwu    r10,8(r3)
  38:	91 63 00 04 	stw     r11,4(r3)
  3c:	42 00 ff f8 	bdnz    34 <memset64+0x18>
  40:	4e 80 00 20 	blr



Christophe



> diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
> index da3cdffca440..c02392fced98 100644
> --- a/arch/powerpc/include/asm/string.h
> +++ b/arch/powerpc/include/asm/string.h
> @@ -6,6 +6,7 @@
>  #define __HAVE_ARCH_STRNCPY
>  #define __HAVE_ARCH_STRNCMP
>  #define __HAVE_ARCH_MEMSET
> +#define __HAVE_ARCH_MEMSET_PLUS
>  #define __HAVE_ARCH_MEMCPY
>  #define __HAVE_ARCH_MEMMOVE
>  #define __HAVE_ARCH_MEMCMP
> @@ -23,6 +24,18 @@ extern void * memmove(void *,const void *,__kernel_size_t);
>  extern int memcmp(const void *,const void *,__kernel_size_t);
>  extern void * memchr(const void *,int,__kernel_size_t);
> 
> +extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t);
> +static inline void *memset32(uint32_t *p, uint32_t v, __kernel_size_t n)
> +{
> +	return __memset32(p, v, n * 4);
> +}
> +
> +extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t);
> +static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n)
> +{
> +	return __memset64(p, v, n * 8);
> +}
> +
>  #endif /* __KERNEL__ */
> 
>  #endif /* _ASM_POWERPC_STRING_H */



Re: Optimised memset64/memset32 for powerpc

2017-03-20 Thread Benjamin Herrenschmidt
On Mon, 2017-03-20 at 14:14 -0700, Matthew Wilcox wrote:
> I recently introduced memset32() / memset64().  I've done implementations
> for x86 & ARM; akpm has agreed to take the patchset through his tree.
> Do you fancy doing a powerpc version?  Minchan Kim got a 7% performance
> increase with zram from switching to the optimised version on x86.

+Anton

Thanks, Matthew!

> Here's the development git tree:
> http://git.infradead.org/users/willy/linux-dax.git/shortlog/refs/heads/memfill
> (most recent 7 commits)
> 
> ARM probably offers the best model for you to work from; it's basically
> just a case of jumping into the middle of your existing memset loop.
> It was only three instructions to add support to ARM, but I don't know
> PowerPC well enough to understand how your existing memset works.
> I'd start with something like this ... note that you don't have to
> implement memset64 on 32-bit; I only did it on ARM because it was free.
> It doesn't look free for you as you only store one register each time
> around the loop in the 32-bit memset implementation:
> 
> 1:	stwu	r4,4(r6)
> 	bdnz	1b
> 
> (wouldn't you get better performance on 32-bit powerpc by unrolling that
> loop like you do on 64-bit?)
> 
> diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
> index da3cdffca440..c02392fced98 100644
> --- a/arch/powerpc/include/asm/string.h
> +++ b/arch/powerpc/include/asm/string.h
> @@ -6,6 +6,7 @@
>  #define __HAVE_ARCH_STRNCPY
>  #define __HAVE_ARCH_STRNCMP
>  #define __HAVE_ARCH_MEMSET
> +#define __HAVE_ARCH_MEMSET_PLUS
>  #define __HAVE_ARCH_MEMCPY
>  #define __HAVE_ARCH_MEMMOVE
>  #define __HAVE_ARCH_MEMCMP
> @@ -23,6 +24,18 @@ extern void * memmove(void *,const void *,__kernel_size_t);
>  extern int memcmp(const void *,const void *,__kernel_size_t);
>  extern void * memchr(const void *,int,__kernel_size_t);
>  
> +extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t);
> +static inline void *memset32(uint32_t *p, uint32_t v, __kernel_size_t n)
> +{
> +   return __memset32(p, v, n * 4);
> +}
> +
> +extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t);
> +static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n)
> +{
> +   return __memset64(p, v, n * 8);
> +}
> +
>  #endif /* __KERNEL__ */
>  
>  #endif /* _ASM_POWERPC_STRING_H */


Optimised memset64/memset32 for powerpc

2017-03-20 Thread Matthew Wilcox
I recently introduced memset32() / memset64().  I've done implementations
for x86 & ARM; akpm has agreed to take the patchset through his tree.
Do you fancy doing a powerpc version?  Minchan Kim got a 7% performance
increase with zram from switching to the optimised version on x86.

Here's the development git tree:
http://git.infradead.org/users/willy/linux-dax.git/shortlog/refs/heads/memfill
(most recent 7 commits)

ARM probably offers the best model for you to work from; it's basically
just a case of jumping into the middle of your existing memset loop.
It was only three instructions to add support to ARM, but I don't know
PowerPC well enough to understand how your existing memset works.
I'd start with something like this ... note that you don't have to
implement memset64 on 32-bit; I only did it on ARM because it was free.
It doesn't look free for you as you only store one register each time
around the loop in the 32-bit memset implementation:

1:	stwu	r4,4(r6)
	bdnz	1b

(wouldn't you get better performance on 32-bit powerpc by unrolling that
loop like you do on 64-bit?)

diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h
index da3cdffca440..c02392fced98 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -6,6 +6,7 @@
 #define __HAVE_ARCH_STRNCPY
 #define __HAVE_ARCH_STRNCMP
 #define __HAVE_ARCH_MEMSET
+#define __HAVE_ARCH_MEMSET_PLUS
 #define __HAVE_ARCH_MEMCPY
 #define __HAVE_ARCH_MEMMOVE
 #define __HAVE_ARCH_MEMCMP
@@ -23,6 +24,18 @@ extern void * memmove(void *,const void *,__kernel_size_t);
 extern int memcmp(const void *,const void *,__kernel_size_t);
 extern void * memchr(const void *,int,__kernel_size_t);
 
+extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t);
+static inline void *memset32(uint32_t *p, uint32_t v, __kernel_size_t n)
+{
+   return __memset32(p, v, n * 4);
+}
+
+extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t);
+static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n)
+{
+   return __memset64(p, v, n * 8);
+}
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_POWERPC_STRING_H */
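
On the asm side, I'd guess (with the same caveat as my ARM attempt --
this is untested and quite possibly wrong) that the powerpc64 entry
points end up as little more than replicating the value and falling
into a 64-bit store loop, following memset's register conventions
(r3 = dest, r4 = value, r5 = byte count):

_GLOBAL(__memset32)
	rldimi	r4,r4,32,0	/* replicate the low word into the high word */
	/* fall through */
_GLOBAL(__memset64)
	srdi.	r0,r5,3		/* byte count -> doubleword count */
	beqlr			/* nothing to do for a zero count */
	mtctr	r0
	addi	r6,r3,-8
1:	stdu	r4,8(r6)	/* simple one-store loop; the real memset */
	bdnz	1b		/* loop is unrolled beyond this sketch */
	blr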