This is an automated email from the ASF dual-hosted git repository. nickva pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/couchdb-jiffy.git
commit a81f9b716d1c56daae6e702708df3e6990fcd488 Author: Nick Vatamaniuc <[email protected]> AuthorDate: Mon Apr 20 00:55:21 2026 -0400 Speed up UTF8 validation by about 25% Use a pseudo-simd (SWAR-like) trick to validate UTF8 faster. There are essentially two improvements here: 1) Use a table similar to `hexvals` to quicky get the length and knock out some of the invalid cases 2) check continuation bytes in parallel by packing them into an int and then applying a mask on all of them All in all, we get a nice perf boost on UTF8-heavy benchmarks and it's not just stats padding since some users have data that is UTF8-heavy so it seems worthy at the expense of making the code a tiny bit more complicated. The idea here is not new, and originally was presented by Keiser and Lemire in "Validating UTF-8 In Less Than One Instruction Per Byte" (https://arxiv.org/pdf/2010.03090) and a practical C implementation of loading all bytes into an unsigned is form [yyjson](https://github.com/ibireme/yyjson). --- c_src/jiffy_utf8.h | 125 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 76 insertions(+), 49 deletions(-) diff --git a/c_src/jiffy_utf8.h b/c_src/jiffy_utf8.h index b75b9ad..f6891e9 100644 --- a/c_src/jiffy_utf8.h +++ b/c_src/jiffy_utf8.h @@ -145,68 +145,95 @@ utf8_to_unicode(unsigned char* buf, size_t size) return ret; } +// Lead-byte length table +// 0 : invalid lead byte, len 2 overlongs, > U+10FFFF [F5..FF] +// 1 : ASCII +// 2,3,4 : lead byte of that sequence +// +// Note: we mark some overlongs here right away (2 char ones and greater than +// F4). Below we'll only need continuation byte checks and two boundary checks +// (b0 and b1) for overlong 3 and 4, surrogate 3 and over-long 4. +// +// The idea is originally from https://arxiv.org/pdf/2010.03090 "Validating +// UTF-8 In Less Than One Instruction Per Byte" with an accompanying blog +// https://lemire.me/blog/2020/10/20/ridiculously-fast-unicode-utf-8-validation +// +static const unsigned char utf8_seq_len[256] = { + /* 0x00 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + /* 0x10 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + /* 0x20 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + /* 0x30 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + /* 0x40 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + /* 0x50 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + /* 0x60 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + /* 0x70 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, + /* 0x80 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + /* 0x90 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + /* 0xA0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + /* 0xB0 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + /* 0xC0 */ 0,0,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, + /* 0xD0 */ 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, + /* 0xE0 */ 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, + /* 0xF0 */ 4,4,4,4,4,0,0,0, 0,0,0,0,0,0,0,0 + /* 0 1 2 3 4 5 6 7 8 9 A B C D E F */ +}; + static inline size_t -utf8_validate(unsigned char* data, size_t size) +utf8_validate(const unsigned char* JIFFY_RESTRICT data, size_t size) { - size_t ulen = 0; - int ui; - size_t i; - - if((data[0] & 0x80) == 0x00) { - ulen = 1; - } else if((data[0] & 0xE0) == 0xC0) { - ulen = 2; - } else if((data[0] & 0xF0) == 0xE0) { - ulen = 3; - } else if((data[0] & 0xF8) == 0xF0) { - ulen = 4; - } - if(ulen == 0 || ulen > size) { + unsigned int b0 = data[0]; + unsigned int len = utf8_seq_len[b0]; + if(len == 0 || len > size) { return 0; } - - // Check each continuation byte. - for(i = 1; i < ulen; i++) { - if((data[i] & 0xC0) != 0x80) return 0; + if(len == 1) { + // We should never get here from dec_string since we pre-filter bytes < 0x80. + return 1; // LCOV_EXCL_LINE } - // Wikipedia says I have to check that a UTF-8 encoding - // uses as few bits as possible. This means that we - // can't do things like encode 't' in three bytes. - // To check this all we need to ensure is that for each - // of the following bit patterns that there is at least - // one 1 bit in any of the x's - // 1: 0yyyyyyy - // 2: 110xxxxy 10yyyyyy - // 3: 1110xxxx 10xyyyyy 10yyyyyy - // 4: 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy + // This is the tricky bit: pack the 1 to 3 continuation bytes into an + // unsigned int then do masked compares on all at the same time. I saw this + // the first time in yyjson library. + unsigned int cont = (unsigned int)data[1]; + if(len >= 3) { + cont |= (unsigned int)data[2] << 8; + } + if(len == 4) { + cont |= (unsigned int)data[3] << 16; + } + static const unsigned int CONT_MASK[5] = {0, 0, 0x0000C0, 0x00C0C0, 0xC0C0C0}; + static const unsigned int CONT_EXPECT[5] = {0, 0, 0x000080, 0x008080, 0x808080}; + if((cont & CONT_MASK[len]) != CONT_EXPECT[len]) { + return 0; + } - // ulen == 1 passes by definition - if(ulen == 2) { - if((data[0] & 0x1E) == 0) - return 0; - } else if(ulen == 3) { - if((data[0] & 0x0F) + (data[1] & 0x20) == 0) - return 0; - } else if(ulen == 4) { - if((data[0] & 0x07) + (data[1] & 0x30) == 0) - return 0; + if(len == 2) { + return 2; } - // Lastly we need to check some miscellaneous ranges for - // some of the larger code point values. - if(ulen >= 3) { - ui = utf8_to_unicode(data, ulen); - if(ui < 0) { + // Boundary checks based on b0 and b1. The length table has already + // filtered overlong 2-byte leads and leads > F4. (See table note) + unsigned int b1 = cont & 0xFF; + if(len == 3) { + // E0 80..9F is overlong 3-byte + if((b0 == 0xE0) & (b1 < 0xA0)) { return 0; - } else if(ui >= 0xD800 && ui <= 0xDFFF) { - return 0; - } else if(ui > 0x10FFFF) { + } + // ED A0..BF is a surrogate. + if((b0 == 0xED) & (b1 >= 0xA0)) { return 0; } + return 3; } - - return ulen; + // F0 80..8F overlong 4-byte + if((b0 == 0xF0) & (b1 < 0x90)) { + return 0; + } + // F4 90..BF these are invalid + if((b0 == 0xF4) & (b1 >= 0x90)) { + return 0; + } + return 4; } static inline int
