https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109605
Wojciech Mula <wojciech_mula at poczta dot onet.pl> changed:
What |Removed |Added
----------------------------------------------------------------------------
CC| |wojciech_mula at poczta dot onet.pl
--- Comment #3 from Wojciech Mula <wojciech_mula at poczta dot onet.pl> ---
This is somewhat related. I needed to generate one particular procedure without
any vector instructions (the surrounding code is free to use RVV instructions).
But when the code uses a builtin function such as `memcpy` (as the sample below
does), GCC still emits some vector instructions. The cure is to pass
`-fno-builtin`, because the pragma does not accept that option.
The attached sample code comes from the simdutf project (src/scalar/utf.f); a
godbolt link is provided for convenience: https://godbolt.org/z/Ya91he99v.
---no-vector.cpp--
#include <cstdlib>
#include <cstdint>
#include <cstring>
// Turn off GCC's auto-vectorization passes for the code that follows.
#pragma GCC optimize ("no-tree-vectorize")
#pragma GCC optimize ("no-tree-loop-vectorize")
#pragma GCC optimize ("no-tree-slp-vectorize")
// Attempt to also disable builtin expansion (e.g. of memcpy); GCC does not
// accept this option when it is given through the pragma.
#pragma GCC optimize ("no-builtin") // not accepted by the compiler
/// @brief Scalar check that a byte buffer is well-formed UTF-8.
///
/// Walks the buffer one code point at a time. Blocks of 16 ASCII bytes are
/// skipped at once; every multi-byte sequence is checked for truncation,
/// valid continuation bytes, overlong encodings, UTF-16 surrogates and the
/// U+10FFFF upper bound.
///
/// @param buf  Pointer to the first byte to validate; at least @p len bytes
///             are read from it.
/// @param len  Number of bytes in @p buf.
/// @return true if the whole buffer is valid UTF-8 (an empty buffer is
///         valid), false otherwise.
bool validate(const char *buf, size_t len) noexcept {
  const uint8_t *data = reinterpret_cast<const uint8_t *>(buf);
  uint64_t pos = 0;
  uint32_t code_point = 0;
  while (pos < len) {
    // Check if the next 16 bytes are ASCII.
    uint64_t next_pos = pos + 16;
    // If it is safe to read 16 more bytes, check that they are ASCII.
    if (next_pos <= len) {
      // memcpy is used for the unaligned 8-byte loads; these two calls are
      // the builtins the compiler may expand into vector instructions.
      uint64_t v1;
      std::memcpy(&v1, data + pos, sizeof(uint64_t));
      uint64_t v2;
      std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t));
      uint64_t v{v1 | v2};
      // No byte has its high bit set => all 16 bytes are ASCII.
      if ((v & 0x8080808080808080) == 0) {
        pos = next_pos;
        continue;
      }
    }
    // Skip ASCII bytes one at a time until a lead byte (or the end) is hit.
    unsigned char byte = data[pos];
    while (byte < 0b10000000) {
      if (++pos == len) {
        return true;
      }
      byte = data[pos];
    }
    if ((byte & 0b11100000) == 0b11000000) {
      // Two-byte sequence: 110xxxxx 10xxxxxx.
      next_pos = pos + 2;
      if (next_pos > len) {
        return false;
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return false;
      }
      // Range check: rejects overlong encodings of code points below 0x80.
      code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111);
      if ((code_point < 0x80) || (0x7ff < code_point)) {
        return false;
      }
    } else if ((byte & 0b11110000) == 0b11100000) {
      // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
      next_pos = pos + 3;
      if (next_pos > len) {
        return false;
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return false;
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return false;
      }
      // Range check: rejects overlong encodings and the surrogate range
      // 0xd800..0xdfff.
      code_point = (byte & 0b00001111) << 12 |
                   (data[pos + 1] & 0b00111111) << 6 |
                   (data[pos + 2] & 0b00111111);
      if ((code_point < 0x800) || (0xffff < code_point) ||
          (0xd7ff < code_point && code_point < 0xe000)) {
        return false;
      }
    } else if ((byte & 0b11111000) == 0b11110000) {
      // Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
      next_pos = pos + 4;
      if (next_pos > len) {
        return false;
      }
      if ((data[pos + 1] & 0b11000000) != 0b10000000) {
        return false;
      }
      if ((data[pos + 2] & 0b11000000) != 0b10000000) {
        return false;
      }
      if ((data[pos + 3] & 0b11000000) != 0b10000000) {
        return false;
      }
      // Range check: rejects overlong encodings and values above 0x10ffff.
      code_point =
          (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 |
          (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111);
      if (code_point <= 0xffff || 0x10ffff < code_point) {
        return false;
      }
    } else {
      // A stray continuation byte or an invalid lead byte (0xf8..0xff).
      return false;
    }
    pos = next_pos;
  }
  return true;
}
---eof---
The head of generated asm:
---
validate(char const*, unsigned long):
beq a1,zero,.L32
li a4,2139062272
addi a4,a4,-129
slli a2,a4,32
addi sp,sp,-16
add a2,a2,a4
li a5,0
xori a2,a2,-1
addi a7,sp,8
vsetivli zero,8,e8,mf2,ta,ma ###### here
.L2:
addi a3,a5,16
add t1,a0,a5
bltu a1,a3,.L36
vle8.v v1,0(t1) #####
addi a4,a5,8
add a4,a0,a4
vse8.v v1,0(sp) #####
vle8.v v1,0(a4) #####
ld a4,0(sp)
vse8.v v1,0(a7) #####
ld a6,8(sp)
or a4,a4,a6
and a4,a4,a2
bne a4,zero,.L36
mv a5,a3
.L6:
---