https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117568
Bug ID: 117568
Summary: z13: Use vector instructions for fixed length memcmp
Product: gcc
Version: 13.2.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: jens.seifert at de dot ibm.com
Target Milestone: ---
#include <memory.h>
#include <vecintrin.h>
For fixed lengths of up to 16 bytes, consider using vector instructions for memcmp.
This is not required for lengths of 1, 2, 4, or 8 bytes, but it is for the remaining lengths.
For general memcmp.
memcmp == 0:
Use lochi to get result of clc
otherwise
maybe use 2xlochi instead of ipm+2*shift after clc, but I am not 100% sure if
lochi is faster.
/*
 * Scalar baseline: report whether the first 15 bytes at a and b are
 * identical, using a fixed-length memcmp.
 *
 * Returns true when all 15 bytes match, false otherwise.
 */
bool eq15(const unsigned char *a, const unsigned char *b)
{
    const int diff = memcmp(a, b, 15);

    return !diff;
}
bool eq15_vec(const unsigned char *a, const unsigned char *b)
{
const int len = 15;
vector unsigned char va = vec_load_len(a, len-1);
vector unsigned char vb = vec_load_len(b, len-1);
return vec_all_eq(va, vb);
}
eq15(unsigned char const*, unsigned char const*):
clc 0(15,%r3),0(%r2)
ipm %r2
sll %r2,2
sra %r2,30
lpr %r0,%r2
ahi %r0,-1
risbgn %r2,%r0,64-1,128+63,32+1
br %r14
eq15_vec(unsigned char const*, unsigned char const*):
lhi %r1,14
vll %v0,%r1,0(%r2)
vll %v2,%r1,0(%r3)
lghi %r2,0
vceqbs %v0,%v0,%v2
locghie %r2,1
br %r14
/*
 * Scalar baseline: three-way comparison of the first 15 bytes at a and b
 * using a fixed-length memcmp.
 *
 * Returns the memcmp result unchanged: negative, zero, or positive.
 */
int compare15(const unsigned char *a, const unsigned char *b)
{
    const int order = memcmp(a, b, 15);

    return order;
}
int compare15_vec(const unsigned char *a, const unsigned char *b)
{
const int len = 15;
vector unsigned char va = vec_load_len(a, len-1);
vector unsigned char vb = vec_load_len(b, len-1);
vector int le = (vector int)vec_subc_u128(va, vb);
vector int ge = (vector int)vec_subc_u128(vb, va);
return vec_extract(le - ge, 3);
}
compare15(unsigned char const*, unsigned char const*):
clc 0(15,%r3),0(%r2)
ipm %r2
sllg %r0,%r2,34
srag %r2,%r0,62
br %r14
compare15_vec(unsigned char const*, unsigned char const*):
lhi %r1,14
vll %v4,%r1,0(%r2)
vll %v0,%r1,0(%r3)
vscbiq %v2,%v4,%v0
vscbiq %v6,%v0,%v4
vsf %v1,%v2,%v6
vlgvf %r2,%v1,3
lgfr %r2,%r2
br %r14
=> For general memcmp, vectorization does not pay off.