svn commit: r346588 - head/lib/libc/powerpc64/string

2019-09-03 Thread Justin Hibbits
Author: jhibbits
Date: Tue Apr 23 02:53:53 2019
New Revision: 346588
URL: https://svnweb.freebsd.org/changeset/base/346588

Log:
  powerpc64: Rewrite strcmp in asm to take advantage of word size
  
  Summary:
  Optimize strcmp for powerpc64.
  Data is loaded by double words and the cmpb instruction is used to find '\0'.
  
  Some performance gain rates between the current and the optimized solution:
  
  String size (bytes)   Gain rate
  <=8                   0.59%
  <=16                  1.92%
  32                    3.02%
  64                    5.60%
  128                   10.16%
  256                   18.05%
  512                   30.18%
  1024                  42.82%
  
  Submitted by: alexandre.yamashita_eldorado.org.br,
leonardo.bianconi_eldorado.org.br
  Differential Revision: https://reviews.freebsd.org/D15220

Added:
  head/lib/libc/powerpc64/string/
  head/lib/libc/powerpc64/string/Makefile.inc   (contents, props changed)
  head/lib/libc/powerpc64/string/strcmp.S   (contents, props changed)

Added: head/lib/libc/powerpc64/string/Makefile.inc
==
--- /dev/null   00:00:00 1970   (empty, because file is newly added)
+++ head/lib/libc/powerpc64/string/Makefile.inc Tue Apr 23 02:53:53 2019
(r346588)
@@ -0,0 +1,4 @@
+# $FreeBSD$
+
+MDSRCS+= \
+   strcmp.S

Added: head/lib/libc/powerpc64/string/strcmp.S
==
--- /dev/null   00:00:00 1970   (empty, because file is newly added)
+++ head/lib/libc/powerpc64/string/strcmp.S Tue Apr 23 02:53:53 2019
(r346588)
@@ -0,0 +1,207 @@
+/*-
+ * Copyright (c) 2018 Instituto de Pesquisas Eldorado
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *notice, this list of conditions and the following disclaimer in the
+ *documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include 
+__FBSDID("$FreeBSD$");
+
+#if 0
+   RCSID("$NetBSD: strcmp.S,v 1.0 2018/05/10 12:33:02 alexandre Exp $")
+#endif
+
+/* Alignment mask. */
+#define STRCMP_MULTI_ALIGNMENT_BYTES 8
+#define STRCMP_MULTI_ALIGNMENT_MASK (STRCMP_MULTI_ALIGNMENT_BYTES - 1)
+
+ENTRY(strcmp)
+   /* Starting alignment even if aligned, avoiding performance
+* degradation for short strings.
+*/
+   lbz %r5,0(%r3)  /* Load chars. */
+   lbz %r6,0(%r4)
+   cmpd%r5,%r6 /* Check if chars are different. */
+   bne .Lstrcmp_end
+   cmpdi   %r5,0   /* Check if char is zero. */
+   beq .Lstrcmp_end
+
+   /* Checking if addresses can be aligned, otherwise copy by byte */
+   xor %r7,%r3,%r4
+   andi.   %r7,%r7,STRCMP_MULTI_ALIGNMENT_MASK
+   bne .Lstrcmp_compare_by_byte_loop
+
+.Lstrcmp_param1_align_loop:
+   lbzu%r5,1(%r3)  /* Load chars. */
+   lbzu%r6,1(%r4)
+   cmpd%r5,%r6 /* Check if chars are different. */
+   bne .Lstrcmp_end
+   cmpdi   %r5,0   /* Check if char is zero. */
+   beq .Lstrcmp_end
+   andi.   %r7,%r3,STRCMP_MULTI_ALIGNMENT_MASK /* Check alignment. */
+   bne .Lstrcmp_param1_align_loop
+
+.Lstrcmp_param1_aligned:
+   /* If parameter 2 is aligned compare by qword/word,
+* else compare by byte. */
+   andi.   %r7,%r4,STRCMP_MULTI_ALIGNMENT_MASK
+   beq .Lstrcmp_compare_by_word
+   lbz %r5,0(%r3)  /* Load chars. */
+   lbz %r6,0(%r4)
+   cmpd%r5,%r6 /* Check if chars are different. */
+   bne+.Lstrcmp_end
+   cmpdi   %r5,0   /* Check if char is zero. */
+   beq+.Lstrcmp_end
+
+.Lstrcmp_compare_by

Re: svn commit: r346588 - head/lib/libc/powerpc64/string

2019-09-03 Thread Alexey Dokuchaev
On Tue, Apr 23, 2019 at 02:53:53AM +0000, Justin Hibbits wrote:
> New Revision: 346588
> URL: https://svnweb.freebsd.org/changeset/base/346588
> 
> Log:
>   powerpc64: Rewrite strcmp in asm to take advantage of word size
> ...
>   Some performance gain rates between the current and the optimized
>   solution:
> 
>   String size (bytes)   Gain rate
>   <=8                   0.59%
>   <=16                  1.92%
>   32                    3.02%
>   64                    5.60%
>   128                   10.16%
>   256                   18.05%
>   512                   30.18%
>   1024                  42.82%

Nice!  This should help to speed up buildworld quite a bit.  Would it
be feasible to patch ppc32 in a similar fashion?  Thanks,

./danfe


___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r346588 - head/lib/libc/powerpc64/string

2019-05-02 Thread Mark Millard via svn-src-head
[I did not deal with translating register usage correctly.]

> On 2019-Apr-27, at 01:50, Mark Millard  wrote:
> 
> Justin Hibbits jhibbits at FreeBSD.org wrote on
> Fri Apr 26 16:21:47 UTC 2019 :
> 
>> This actually uses 'cmpb' which is only available on PowerISA 2.05+, so
>> I'll need to pull it out for now, and re-enable it once we have
>> ifuncs.  As it stands, this commit broke the G5 and POWER4/POWER5.
> 
> As I understand the code like:
> 
>   xor %r8,%r8,%r8 /* %r8 <- Zero. */
>   xor %r0,%r5,%r6 /* Check if double words are different. */
>   cmpb %r7,%r5,%r8 /* Check if double words contain zero. */
> 
>   /*
>* If double words are different or contain zero,
>* find what byte is different or contains zero,
>* else load next double words.
>*/
>   or. %r9,%r7,%r0
>   bne .Lstrcmp_check_zeros_differences
> 
> (and similarly for the loop. . .):
> 
> A) Each byte of %r5 that is non-zero needs that byte of %r7 to be zero.
> B) Each byte of %r5 that is zero needs that byte of %r7 to be non-zero.
> 
> (cmpb assigns 0xff for non-zero as I understand, but even one non-zero
> bit is sufficient for the overall code structure.)
> 
> If I've got that much correct, then the following might be an
> alternative to cmpb for now. I'll explain the code via commented
> c/c++-ish code and then show the assembler notation:
> 
> unsigned long ul_has_zero_byte(unsigned long b)
> {
>unsigned long constexpr low_7bits_of_bytes{0x7f7f7f7f'7f7f7f7ful};
> 
>    // Illustrating byte transformations:
>    unsigned long const x= b & low_7bits_of_bytes;     // 0x00->0x00, 0x80->0x00, other->ms-bit-in-byte==0
>    unsigned long const y= x + low_7bits_of_bytes;     // ->0x7f, ->0x7f, ->ms-bit-in-byte==1
>    unsigned long const z= b | y | low_7bits_of_bytes; // ->0x7f, ->0xff, ->0xff
>    return ~z;                                         // ->0x80, ->0x00, ->0x00
> }
> 
> (used in a powerpc64 context, so unsigned long being 64 bits).
> 
> So, not using %r8 as zero but for a different value,
> each cmpb can be replaced by:
> 
> # Only once to set up the value in %r8 (Note: 32639=0x7f7f):
> lis r8,32639
> ori r8,r8,32639
> rldimi  r8,r8,32,0
> 
> # each "cmpb %r7,%r5,%r8" replaced by:
> and r7,r5,r8
> add r7,r7,r8
> nor r5,r7,r5
> andc    r5,r5,r8

The above 4 lines are an incorrect match to the context's
register usage: only r7 of the 3 registers r5, r7, r8
should have been changed. It looks like another temporary
register (for the stage) is required to make a match:

and  %r9,%r5,%r8
add  %r9,%r9,%r8
nor  %r7,%r9,%r5
andc %r7,%r7,%r8

(%r9 later being replaced via: or. %r9,%r7,%r0)

> (The code is from compiler output, but with registers adjusted
> to match the context.)
> 
> 
> The c/c++-ish code came from thinking about material from Hacker's
> Delight Second Edition and the specific criteria needed here: it
> uses part of Figure 6-2 "Find First 0-Byte, branch-free code",
> adjusted for width and for returning something sufficient here.
> 



===
Mark Millard
marklmi at yahoo.com
( dsl-only.net went
away in early 2018-Mar)

___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r346588 - head/lib/libc/powerpc64/string

2019-04-27 Thread Mark Millard via svn-src-head
Justin Hibbits jhibbits at FreeBSD.org wrote on
Fri Apr 26 16:21:47 UTC 2019 :

> This actually uses 'cmpb' which is only available on PowerISA 2.05+, so
> I'll need to pull it out for now, and re-enable it once we have
> ifuncs.  As it stands, this commit broke the G5 and POWER4/POWER5.

As I understand the code like:

xor %r8,%r8,%r8 /* %r8 <- Zero. */
xor %r0,%r5,%r6 /* Check if double words are different. */
cmpb %r7,%r5,%r8 /* Check if double words contain zero. */

/*
 * If double words are different or contain zero,
 * find what byte is different or contains zero,
 * else load next double words.
 */
or. %r9,%r7,%r0
bne .Lstrcmp_check_zeros_differences

(and similarly for the loop. . .):

A) Each byte of %r5 that is non-zero needs that byte of %r7 to be zero.
B) Each byte of %r5 that is zero needs that byte of %r7 to be non-zero.

(cmpb assigns 0xff for non-zero as I understand, but even one non-zero
bit is sufficient for the overall code structure.)

If I've got that much correct, then the following might be an
alternative to cmpb for now. I'll explain the code via commented
c/c++-ish code and then show the assembler notation:

unsigned long ul_has_zero_byte(unsigned long b)
{
unsigned long constexpr low_7bits_of_bytes{0x7f7f7f7f'7f7f7f7ful};

    // Illustrating byte transformations:
    unsigned long const x= b & low_7bits_of_bytes;     // 0x00->0x00, 0x80->0x00, other->ms-bit-in-byte==0
    unsigned long const y= x + low_7bits_of_bytes;     // ->0x7f, ->0x7f, ->ms-bit-in-byte==1
    unsigned long const z= b | y | low_7bits_of_bytes; // ->0x7f, ->0xff, ->0xff
    return ~z;                                         // ->0x80, ->0x00, ->0x00
}

(used in a powerpc64 context, so unsigned long being 64 bits).

So, not using %r8 as zero but for a different value,
each cmpb can be replaced by:

# Only once to set up the value in %r8 (Note: 32639=0x7f7f):
lis r8,32639
ori r8,r8,32639
rldimi  r8,r8,32,0

# each "cmpb %r7,%r5,%r8" replaced by:
and r7,r5,r8
add r7,r7,r8
nor r5,r7,r5
andc    r5,r5,r8

(The code is from compiler output, but with registers adjusted
to match the context.)


The c/c++-ish code came from thinking about material from Hacker's
Delight Second Edition and the specific criteria needed here: it
uses part of Figure 6-2 "Find First 0-Byte, branch-free code",
adjusted for width and for returning something sufficient here.

===
Mark Millard
marklmi at yahoo.com
( dsl-only.net went
away in early 2018-Mar)

___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r346588 - head/lib/libc/powerpc64/string

2019-04-26 Thread Justin Hibbits
On Tue, 23 Apr 2019 10:24:42 +0000
Alexey Dokuchaev  wrote:

> On Tue, Apr 23, 2019 at 02:53:53AM +0000, Justin Hibbits wrote:
> > New Revision: 346588
> > URL: https://svnweb.freebsd.org/changeset/base/346588
> > 
> > Log:
> >   powerpc64: Rewrite strcmp in asm to take advantage of word size
> > ...
> >   Some performance gain rates between the current and the optimized
> >   solution:
> > 
> >   String size (bytes)   Gain rate
> >   <=8                   0.59%
> >   <=16                  1.92%
> >   32                    3.02%
> >   64                    5.60%
> >   128                   10.16%
> >   256                   18.05%
> >   512                   30.18%
> >   1024                  42.82%
> 
> Nice!  This should help to speed up buildworld quite a bit.  Would it
> be feasible to patch ppc32 in a similar fashion?  Thanks,
> 
> ./danfe

This actually uses 'cmpb' which is only available on PowerISA 2.05+, so
I'll need to pull it out for now, and re-enable it once we have
ifuncs.  As it stands, this commit broke the G5 and POWER4/POWER5.

- Justin
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


Re: svn commit: r346588 - head/lib/libc/powerpc64/string

2019-04-23 Thread Alexey Dokuchaev
On Tue, Apr 23, 2019 at 02:53:53AM +0000, Justin Hibbits wrote:
> New Revision: 346588
> URL: https://svnweb.freebsd.org/changeset/base/346588
> 
> Log:
>   powerpc64: Rewrite strcmp in asm to take advantage of word size
> ...
>   Some performance gain rates between the current and the optimized
>   solution:
> 
>   String size (bytes)   Gain rate
>   <=8                   0.59%
>   <=16                  1.92%
>   32                    3.02%
>   64                    5.60%
>   128                   10.16%
>   256                   18.05%
>   512                   30.18%
>   1024                  42.82%

Nice!  This should help to speed up buildworld quite a bit.  Would it
be feasible to patch ppc32 in a similar fashion?  Thanks,

./danfe
___
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"


svn commit: r346588 - head/lib/libc/powerpc64/string

2019-04-22 Thread Justin Hibbits
Author: jhibbits
Date: Tue Apr 23 02:53:53 2019
New Revision: 346588
URL: https://svnweb.freebsd.org/changeset/base/346588

Log:
  powerpc64: Rewrite strcmp in asm to take advantage of word size
  
  Summary:
  Optimize strcmp for powerpc64.
  Data is loaded by double words and the cmpb instruction is used to find '\0'.
  
  Some performance gain rates between the current and the optimized solution:
  
  String size (bytes)   Gain rate
  <=8                   0.59%
  <=16                  1.92%
  32                    3.02%
  64                    5.60%
  128                   10.16%
  256                   18.05%
  512                   30.18%
  1024                  42.82%
  
  Submitted by: alexandre.yamashita_eldorado.org.br,
leonardo.bianconi_eldorado.org.br
  Differential Revision: https://reviews.freebsd.org/D15220

Added:
  head/lib/libc/powerpc64/string/
  head/lib/libc/powerpc64/string/Makefile.inc   (contents, props changed)
  head/lib/libc/powerpc64/string/strcmp.S   (contents, props changed)

Added: head/lib/libc/powerpc64/string/Makefile.inc
==
--- /dev/null   00:00:00 1970   (empty, because file is newly added)
+++ head/lib/libc/powerpc64/string/Makefile.inc Tue Apr 23 02:53:53 2019
(r346588)
@@ -0,0 +1,4 @@
+# $FreeBSD$
+
+MDSRCS+= \
+   strcmp.S

Added: head/lib/libc/powerpc64/string/strcmp.S
==
--- /dev/null   00:00:00 1970   (empty, because file is newly added)
+++ head/lib/libc/powerpc64/string/strcmp.S Tue Apr 23 02:53:53 2019
(r346588)
@@ -0,0 +1,207 @@
+/*-
+ * Copyright (c) 2018 Instituto de Pesquisas Eldorado
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *notice, this list of conditions and the following disclaimer in the
+ *documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include 
+__FBSDID("$FreeBSD$");
+
+#if 0
+   RCSID("$NetBSD: strcmp.S,v 1.0 2018/05/10 12:33:02 alexandre Exp $")
+#endif
+
+/* Alignment mask. */
+#define STRCMP_MULTI_ALIGNMENT_BYTES 8
+#define STRCMP_MULTI_ALIGNMENT_MASK (STRCMP_MULTI_ALIGNMENT_BYTES - 1)
+
+ENTRY(strcmp)
+   /* Starting alignment even if aligned, avoiding performance
+* degradation for short strings.
+*/
+   lbz %r5,0(%r3)  /* Load chars. */
+   lbz %r6,0(%r4)
+   cmpd%r5,%r6 /* Check if chars are different. */
+   bne .Lstrcmp_end
+   cmpdi   %r5,0   /* Check if char is zero. */
+   beq .Lstrcmp_end
+
+   /* Checking if addresses can be aligned, otherwise copy by byte */
+   xor %r7,%r3,%r4
+   andi.   %r7,%r7,STRCMP_MULTI_ALIGNMENT_MASK
+   bne .Lstrcmp_compare_by_byte_loop
+
+.Lstrcmp_param1_align_loop:
+   lbzu%r5,1(%r3)  /* Load chars. */
+   lbzu%r6,1(%r4)
+   cmpd%r5,%r6 /* Check if chars are different. */
+   bne .Lstrcmp_end
+   cmpdi   %r5,0   /* Check if char is zero. */
+   beq .Lstrcmp_end
+   andi.   %r7,%r3,STRCMP_MULTI_ALIGNMENT_MASK /* Check alignment. */
+   bne .Lstrcmp_param1_align_loop
+
+.Lstrcmp_param1_aligned:
+   /* If parameter 2 is aligned compare by qword/word,
+* else compare by byte. */
+   andi.   %r7,%r4,STRCMP_MULTI_ALIGNMENT_MASK
+   beq .Lstrcmp_compare_by_word
+   lbz %r5,0(%r3)  /* Load chars. */
+   lbz %r6,0(%r4)
+   cmpd%r5,%r6 /* Check if chars are different. */
+   bne+.Lstrcmp_end
+   cmpdi   %r5,0   /* Check if char is zero. */
+   beq+.Lstrcmp_end
+
+.Lstrcmp_compare_by