Re: Optimised memset64/memset32 for powerpc
On 2017/03/22 12:30PM, Matthew Wilcox wrote: > On Wed, Mar 22, 2017 at 06:18:05AM -0700, Matthew Wilcox wrote: > > There's one other potential user I've been wondering about, which are the > > various console drivers. They use 'memsetw' to blank the entire console > > or lines of the console when scrolling, but the only architecture which > > ever bothered implementing an optimised version of it was Alpha. > > > > Might be worth it on powerpc actually ... better than a loop calling > > cpu_to_le16() on each iteration. That'd complete the set with a > > memset16(). > > All hail plane rides ... This would need to be resplit and merged properly, > but I think it makes life a little saner. ... not to forget train rides :) Here's a straight-forward implementation for powerpc64, along with one other user in bpf. It is obviously non-critical, but given that we have 64K pages on powerpc64, it does help to speed up the BPF JIT. - Naveen Naveen N. Rao (2): powerpc: string: implement optimized memset variants powerpc: bpf: use memset32() to pre-fill traps in BPF page(s) arch/powerpc/include/asm/string.h | 24 arch/powerpc/lib/mem_64.S | 19 ++- arch/powerpc/net/bpf_jit_comp64.c | 6 +- 3 files changed, 43 insertions(+), 6 deletions(-) -- 2.11.1
Re: Optimised memset64/memset32 for powerpc
On Wed, Mar 22, 2017 at 06:18:05AM -0700, Matthew Wilcox wrote: > There's one other potential user I've been wondering about, which are the > various console drivers. They use 'memsetw' to blank the entire console > or lines of the console when scrolling, but the only architecture which > ever bothered implementing an optimised version of it was Alpha. > > Might be worth it on powerpc actually ... better than a loop calling > cpu_to_le16() on each iteration. That'd complete the set with a > memset16(). All hail plane rides ... This would need to be resplit and merged properly, but I think it makes life a little saner. I make no claims that the ARM assembly in here is correct. The single x86 instruction that I wrote^W copied and pasted appears to be correct by my understanding of the instruction set. diff --git a/arch/alpha/include/asm/string.h b/arch/alpha/include/asm/string.h index c2911f591704..74c0a693b76b 100644 --- a/arch/alpha/include/asm/string.h +++ b/arch/alpha/include/asm/string.h @@ -65,13 +65,14 @@ extern void * memchr(const void *, int, size_t); aligned values. The DEST and COUNT parameters must be even for correct operation. */ -#define __HAVE_ARCH_MEMSETW -extern void * __memsetw(void *dest, unsigned short, size_t count); - -#define memsetw(s, c, n)\ -(__builtin_constant_p(c)\ - ? 
__constant_c_memset((s),0x0001000100010001UL*(unsigned short)(c),(n)) \ - : __memsetw((s),(c),(n))) +#define __HAVE_ARCH_MEMSET16 +extern void * __memset16(void *dest, unsigned short, size_t count); +static inline void *memset16(uint16_t *p, uint16_t v, size_t n) +{ + if (__builtin_constant_p(v)) + return __constant_c_memset(p, 0x0001000100010001UL * v, n * 2); + return __memset16(p, v, n * 2); +} #endif /* __KERNEL__ */ diff --git a/arch/alpha/include/asm/vga.h b/arch/alpha/include/asm/vga.h index c00106bac521..3c1c2b6128e7 100644 --- a/arch/alpha/include/asm/vga.h +++ b/arch/alpha/include/asm/vga.h @@ -34,7 +34,7 @@ static inline void scr_memsetw(u16 *s, u16 c, unsigned int count) if (__is_ioaddr(s)) memsetw_io((u16 __iomem *) s, c, count); else - memsetw(s, c, count); + memset16(s, c, count / 2); } /* Do not trust that the usage will be correct; analyze the arguments. */ diff --git a/arch/alpha/lib/memset.S b/arch/alpha/lib/memset.S index 89a26f5e89de..f824969e9e77 100644 --- a/arch/alpha/lib/memset.S +++ b/arch/alpha/lib/memset.S @@ -20,7 +20,7 @@ .globl memset .globl __memset .globl ___memset - .globl __memsetw + .globl __memset16 .globl __constant_c_memset .ent ___memset @@ -110,8 +110,8 @@ EXPORT_SYMBOL(___memset) EXPORT_SYMBOL(__constant_c_memset) .align 5 - .ent __memsetw -__memsetw: + .ent __memset16 +__memset16: .prologue 0 inswl $17,0,$1 /* E0 */ @@ -123,8 +123,8 @@ __memsetw: or $1,$4,$17 /* E0 */ br __constant_c_memset /* .. 
E1 */ - .end __memsetw -EXPORT_SYMBOL(__memsetw) + .end __memset16 +EXPORT_SYMBOL(__memset16) memset = ___memset __memset = ___memset diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h index da88299f758b..bc7a1be7a76a 100644 --- a/arch/arm/include/asm/string.h +++ b/arch/arm/include/asm/string.h @@ -24,15 +24,22 @@ extern void * memchr(const void *, int, __kernel_size_t); #define __HAVE_ARCH_MEMSET extern void * memset(void *, int, __kernel_size_t); -#define __HAVE_ARCH_MEMSET_PLUS -extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t); -extern void *__memset64(uint64_t *, uint32_t low, __kernel_size_t, uint32_t hi); +#define __HAVE_ARCH_MEMSET16 +extern void *__memset16(uint16_t *, uint16_t v, __kernel_size_t); +static inline void *memset16(uint16_t *p, uint16_t v, __kernel_size_t n) +{ + return __memset16(p, v, n * 2); +} +#define __HAVE_ARCH_MEMSET32 +extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t); static inline void *memset32(uint32_t *p, uint32_t v, __kernel_size_t n) { return __memset32(p, v, n * 4); } +#define __HAVE_ARCH_MEMSET64 +extern void *__memset64(uint64_t *, uint32_t low, __kernel_size_t, uint32_t hi); static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n) { return __memset64(p, v, n * 8, v >> 32); diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S index a835ff9ed30c..0b6cbaa25b33 100644 --- a/arch/arm/lib/memset.S +++ b/arch/arm/lib/memset.S @@ -21,12 +21,12 @@ ENTRY(memset) UNWIND( .fnstart ) andsr3, r0, #3 @ 1 unaligned? mov ip, r0 @ preserve r0 as return value + orr r1, r1, r1, lsl #8 bne 6f @ 1 /* * we know that the pointer in ip is aligned to a word boundary. */ -1: orr
Re: Optimised memset64/memset32 for powerpc
On Wed, Mar 22, 2017 at 08:26:12AM +1100, Benjamin Herrenschmidt wrote: > On Tue, 2017-03-21 at 06:29 -0700, Matthew Wilcox wrote: > > > > Well, those are the generic versions in the first patch: > > > > http://git.infradead.org/users/willy/linux-dax.git/commitdiff/538b977 > > 6ac925199969bd5af4e994da776d461e7 > > > > so if those are good enough for you guys, there's no need for you to > > do anything. > > > > Thanks for your time! > > I suspect on ppc64 we can do much better, if anything moving 64-bit at > a time. Matthew, what are the main use cases of these ? I've only converted two users so far -- zram was the initial inspiration for this. It notices when a page has a pattern in it which is representable as a repetition of an 'unsigned long' (this seems to be a relatively common thing for userspace to do -- not as common as an entirely zero page, but common enough to be worth optimising for). So it may be doing an entire page worth of this to handle a page fault, or if there's an I/O to such a page, it will be doing a multiple of 512 bytes. The other user is sym53c8xx_2; it's an initialisation path thing, and it saves a few bytes in the driver to call the optimised routine rather than have its own loop to initialise the array. I suspect we have additional places in the kernel that could use memset32/memset64 -- look for loops which store a value which is not dependent on the loop counter. They're probably not performance path though; I'd focus on zram as being the case to optimise for. There's one other potential user I've been wondering about, which are the various console drivers. They use 'memsetw' to blank the entire console or lines of the console when scrolling, but the only architecture which ever bothered implementing an optimised version of it was Alpha. Might be worth it on powerpc actually ... better than a loop calling cpu_to_le16() on each iteration. That'd complete the set with a memset16().
Re: Optimised memset64/memset32 for powerpc
On Tue, 2017-03-21 at 06:29 -0700, Matthew Wilcox wrote: > > Well, those are the generic versions in the first patch: > > http://git.infradead.org/users/willy/linux-dax.git/commitdiff/538b977 > 6ac925199969bd5af4e994da776d461e7 > > so if those are good enough for you guys, there's no need for you to > do anything. > > Thanks for your time! I suspect on ppc64 we can do much better, if anything moving 64-bit at a time. Matthew, what are the main use cases of these ? Cheers, Ben.
Re: Optimised memset64/memset32 for powerpc
On Tue, Mar 21, 2017 at 06:29:10AM -0700, Matthew Wilcox wrote: > > Unrolling the loop could help a bit on old powerpc32s that don't have branch > > units, but on those processors the main driver is the time spent to do the > > effective write to memory, and the operations necessary to unroll the loop > > are not worth the cycle added by the branch. > > > > On more modern powerpc32s, the branch unit implies that branches have a zero > > cost. > > Fair enough. I'm just surprised it was worth unrolling the loop on > powerpc64 and not on powerpc32 -- see mem_64.S. We can do at most one loop iteration per cycle, but we can do multiple stores per cycle, on modern, bigger CPUs. Many old or small CPUs have only one load/store unit on the other hand. There are other issues, but that is the biggest difference. Segher
Re: Optimised memset64/memset32 for powerpc
On Tue, Mar 21, 2017 at 01:23:36PM +0100, Christophe LEROY wrote: > > It doesn't look free for you as you only store one register each time > > around the loop in the 32-bit memset implementation: > > > > 1: stwur4,4(r6) > > bdnz1b > > > > (wouldn't you get better performance on 32-bit powerpc by unrolling that > > loop like you do on 64-bit?) > > In arch/powerpc/lib/copy_32.S, the implementation of memset() is optimised > when the value to be set is zero. It makes use of the 'dcbz' instruction > which zeroizes a complete cache line. > > Not much effort has been put on optimising non-zero memset() because there > are almost none. Yes, bzero() is much more common than setting an 8-bit pattern. And setting an 8-bit pattern is almost certainly more common than setting a 32 or 64 bit pattern. > Unrolling the loop could help a bit on old powerpc32s that don't have branch > units, but on those processors the main driver is the time spent to do the > effective write to memory, and the operations necessary to unroll the loop > are not worth the cycle added by the branch. > > On more modern powerpc32s, the branch unit implies that branches have a zero > cost. Fair enough. I'm just surprised it was worth unrolling the loop on powerpc64 and not on powerpc32 -- see mem_64.S. > A simple static inline C function would probably do the job, based on what I > get below: > > void memset32(int *p, int v, unsigned int c) > { > int i; > > for (i = 0; i < c; i++) > *p++ = v; > } > > void memset64(long long *p, long long v, unsigned int c) > { > int i; > > for (i = 0; i < c; i++) > *p++ = v; > } Well, those are the generic versions in the first patch: http://git.infradead.org/users/willy/linux-dax.git/commitdiff/538b9776ac925199969bd5af4e994da776d461e7 so if those are good enough for you guys, there's no need for you to do anything. Thanks for your time!
Re: Optimised memset64/memset32 for powerpc
Hi Matthew Le 20/03/2017 à 22:14, Matthew Wilcox a écrit : I recently introduced memset32() / memset64(). I've done implementations for x86 & ARM; akpm has agreed to take the patchset through his tree. Do you fancy doing a powerpc version? Minchan Kim got a 7% performance increase with zram from switching to the optimised version on x86. Here's the development git tree: http://git.infradead.org/users/willy/linux-dax.git/shortlog/refs/heads/memfill (most recent 7 commits) ARM probably offers the best model for you to work from; it's basically just a case of jumping into the middle of your existing memset loop. It was only three instructions to add support to ARM, but I don't know PowerPC well enough to understand how your existing memset works. I'd start with something like this ... note that you don't have to implement memset64 on 32-bit; I only did it on ARM because it was free. It doesn't look free for you as you only store one register each time around the loop in the 32-bit memset implementation: 1: stwur4,4(r6) bdnz1b (wouldn't you get better performance on 32-bit powerpc by unrolling that loop like you do on 64-bit?) In arch/powerpc/lib/copy_32.S, the implementation of memset() is optimised when the value to be set is zero. It makes use of the 'dcbz' instruction which zeroizes a complete cache line. Not much effort has been put on optimising non-zero memset() because there are almost none. Unrolling the loop could help a bit on old powerpc32s that don't have branch units, but on those processors the main driver is the time spent to do the effective write to memory, and the operations necessary to unroll the loop are not worth the cycle added by the branch. On more modern powerpc32s, the branch unit implies that branches have a zero cost. 
A simple static inline C function would probably do the job, based on what I get below: void memset32(int *p, int v, unsigned int c) { int i; for (i = 0; i < c; i++) *p++ = v; } void memset64(long long *p, long long v, unsigned int c) { int i; for (i = 0; i < c; i++) *p++ = v; } test.o: file format elf32-powerpc Disassembly of section .text: : 0: 2c 05 00 00 cmpwi r5,0 4: 38 63 ff fc addir3,r3,-4 8: 4d 82 00 20 beqlr c: 7c a9 03 a6 mtctr r5 10: 94 83 00 04 stwur4,4(r3) 14: 42 00 ff fc bdnz1018: 4e 80 00 20 blr 001c : 1c: 2c 07 00 00 cmpwi r7,0 20: 7c cb 33 78 mr r11,r6 24: 7c aa 2b 78 mr r10,r5 28: 38 63 ff f8 addir3,r3,-8 2c: 4d 82 00 20 beqlr 30: 7c e9 03 a6 mtctr r7 34: 95 43 00 08 stwur10,8(r3) 38: 91 63 00 04 stw r11,4(r3) 3c: 42 00 ff f8 bdnz34 40: 4e 80 00 20 blr Christophe diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h index da3cdffca440..c02392fced98 100644 --- a/arch/powerpc/include/asm/string.h +++ b/arch/powerpc/include/asm/string.h @@ -6,6 +6,7 @@ #define __HAVE_ARCH_STRNCPY #define __HAVE_ARCH_STRNCMP #define __HAVE_ARCH_MEMSET +#define __HAVE_ARCH_MEMSET_PLUS #define __HAVE_ARCH_MEMCPY #define __HAVE_ARCH_MEMMOVE #define __HAVE_ARCH_MEMCMP @@ -23,6 +24,18 @@ extern void * memmove(void *,const void *,__kernel_size_t); extern int memcmp(const void *,const void *,__kernel_size_t); extern void * memchr(const void *,int,__kernel_size_t); +extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t); +static inline void *memset32(uint32_t *p, uint32_t v, __kernel_size_t n) +{ + return __memset32(p, v, n * 4); +} + +extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t); +static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n) +{ + return __memset64(p, v, n * 8); +} + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_STRING_H */
Re: Optimised memset64/memset32 for powerpc
On Mon, 2017-03-20 at 14:14 -0700, Matthew Wilcox wrote: > I recently introduced memset32() / memset64(). I've done implementations > for x86 & ARM; akpm has agreed to take the patchset through his tree. > Do you fancy doing a powerpc version? Minchan Kim got a 7% performance > increase with zram from switching to the optimised version on x86. +Anton Thanks Matthew ! > Here's the development git tree: > http://git.infradead.org/users/willy/linux-dax.git/shortlog/refs/heads/memfill > (most recent 7 commits) > > ARM probably offers the best model for you to work from; it's basically > just a case of jumping into the middle of your existing memset loop. > It was only three instructions to add support to ARM, but I don't know > PowerPC well enough to understand how your existing memset works. > I'd start with something like this ... note that you don't have to > implement memset64 on 32-bit; I only did it on ARM because it was free. > It doesn't look free for you as you only store one register each time > around the loop in the 32-bit memset implementation: > > 1: stwu r4,4(r6) > bdnz 1b > > (wouldn't you get better performance on 32-bit powerpc by unrolling that > loop like you do on 64-bit?) 
> > diff --git a/arch/powerpc/include/asm/string.h > b/arch/powerpc/include/asm/string.h > index da3cdffca440..c02392fced98 100644 > --- a/arch/powerpc/include/asm/string.h > +++ b/arch/powerpc/include/asm/string.h > @@ -6,6 +6,7 @@ > #define __HAVE_ARCH_STRNCPY > #define __HAVE_ARCH_STRNCMP > #define __HAVE_ARCH_MEMSET > +#define __HAVE_ARCH_MEMSET_PLUS > #define __HAVE_ARCH_MEMCPY > #define __HAVE_ARCH_MEMMOVE > #define __HAVE_ARCH_MEMCMP > @@ -23,6 +24,18 @@ extern void * memmove(void *,const void *,__kernel_size_t); > extern int memcmp(const void *,const void *,__kernel_size_t); > extern void * memchr(const void *,int,__kernel_size_t); > > +extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t); > +static inline void *memset32(uint32_t *p, uint32_t v, __kernel_size_t n) > +{ > > + return __memset32(p, v, n * 4); > +} > + > +extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t); > +static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n) > +{ > > + return __memset64(p, v, n * 8); > +} > + > #endif /* __KERNEL__ */ > > > #endif /* _ASM_POWERPC_STRING_H */
Optimised memset64/memset32 for powerpc
I recently introduced memset32() / memset64(). I've done implementations for x86 & ARM; akpm has agreed to take the patchset through his tree. Do you fancy doing a powerpc version? Minchan Kim got a 7% performance increase with zram from switching to the optimised version on x86. Here's the development git tree: http://git.infradead.org/users/willy/linux-dax.git/shortlog/refs/heads/memfill (most recent 7 commits) ARM probably offers the best model for you to work from; it's basically just a case of jumping into the middle of your existing memset loop. It was only three instructions to add support to ARM, but I don't know PowerPC well enough to understand how your existing memset works. I'd start with something like this ... note that you don't have to implement memset64 on 32-bit; I only did it on ARM because it was free. It doesn't look free for you as you only store one register each time around the loop in the 32-bit memset implementation: 1: stwu r4,4(r6) bdnz 1b (wouldn't you get better performance on 32-bit powerpc by unrolling that loop like you do on 64-bit?) 
diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h index da3cdffca440..c02392fced98 100644 --- a/arch/powerpc/include/asm/string.h +++ b/arch/powerpc/include/asm/string.h @@ -6,6 +6,7 @@ #define __HAVE_ARCH_STRNCPY #define __HAVE_ARCH_STRNCMP #define __HAVE_ARCH_MEMSET +#define __HAVE_ARCH_MEMSET_PLUS #define __HAVE_ARCH_MEMCPY #define __HAVE_ARCH_MEMMOVE #define __HAVE_ARCH_MEMCMP @@ -23,6 +24,18 @@ extern void * memmove(void *,const void *,__kernel_size_t); extern int memcmp(const void *,const void *,__kernel_size_t); extern void * memchr(const void *,int,__kernel_size_t); +extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t); +static inline void *memset32(uint32_t *p, uint32_t v, __kernel_size_t n) +{ + return __memset32(p, v, n * 4); +} + +extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t); +static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n) +{ + return __memset64(p, v, n * 8); +} + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_STRING_H */